Repository: dragonflydb/dragonfly
Branch: main
Commit: 9dc515b4c7fa
Files: 1014
Total size: 8.9 MB

Directory structure:
gitextract_74218km6/

├── .agent/
│   └── rules/
│       └── ANTIGRAVITY_INSTRUCTIONS.md
├── .circleci/
│   └── config.yml
├── .clang-format
├── .clang-tidy
├── .clangd
├── .claude/
│   ├── hooks/
│   │   └── format-after-edit.sh
│   ├── settings.json
│   └── skills/
│       └── reproduce-fuzz-crash/
│           └── SKILL.md
├── .ct.yaml
├── .cursorrules
├── .devcontainer/
│   ├── alpine/
│   │   ├── devcontainer.json
│   │   └── post-create.sh
│   ├── fedora/
│   │   └── devcontainer.json
│   ├── fedora41/
│   │   └── devcontainer.json
│   ├── ubuntu20/
│   │   ├── cmake-tools-kits.json
│   │   ├── devcontainer.json
│   │   └── post-create.sh
│   ├── ubuntu20-gcc14/
│   │   └── devcontainer.json
│   ├── ubuntu22/
│   │   ├── devcontainer.json
│   │   └── post-create.sh
│   └── ubuntu24/
│       └── devcontainer.json
├── .dockerignore
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── actions/
│   │   ├── builder/
│   │   │   └── action.yml
│   │   ├── fuzzing/
│   │   │   └── action.yml
│   │   ├── lint-test-chart/
│   │   │   └── action.yml
│   │   ├── multi-registry-docker-login/
│   │   │   └── action.yml
│   │   ├── regression-tests/
│   │   │   └── action.yml
│   │   ├── repeat/
│   │   │   └── action.yml
│   │   ├── sync-valkey-tests/
│   │   │   └── action.yml
│   │   └── test-docker/
│   │       └── action.yml
│   ├── bullmq-skipped-tests.txt
│   ├── copilot-instructions.md
│   ├── dependabot.yml
│   ├── instructions/
│   │   └── code-review.instructions.md
│   └── workflows/
│       ├── benchmark.yml
│       ├── bullmq-tests.yml
│       ├── ci.yml
│       ├── copilot-setup-steps.yml
│       ├── cov.yml
│       ├── daily-builds.yml
│       ├── docker-dev-release.yml
│       ├── docker-release2.yml
│       ├── epoll-regression-tests.yml
│       ├── fuzz-long.yml
│       ├── fuzz-pr.yml
│       ├── generate-osrepo-site.yml
│       ├── heavy-tests.yml
│       ├── ioloop-v2-regtests.yml
│       ├── mastodon-ruby-tests.yml
│       ├── package-install.yml
│       ├── regression-tests.yml
│       ├── release.yml
│       ├── repeat-tests.yml
│       └── test-fakeredis.yml
├── .gitignore
├── .gitmodules
├── .gitorderfile
├── .nvmrc
├── .pre-commit-config.yaml
├── .pre-commit-hooks.yaml
├── .snyk
├── .vscode/
│   └── c_cpp_properties.json
├── AGENTS.md
├── CLA.txt
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── LICENSE.md
├── Makefile
├── README.ja-JP.md
├── README.ko-KR.md
├── README.md
├── README.pt-BR.md
├── README.zh-CN.md
├── TODO.md
├── contrib/
│   ├── charts/
│   │   └── dragonfly/
│   │       ├── .helmignore
│   │       ├── Chart.yaml
│   │       ├── README.md
│   │       ├── ci/
│   │       │   ├── affinity-values.golden.yaml
│   │       │   ├── affinity-values.yaml
│   │       │   ├── command_extraargs-values.golden.yaml
│   │       │   ├── command_extraargs-values.yaml
│   │       │   ├── commonlabels-values.golden.yaml
│   │       │   ├── commonlabels-values.yaml
│   │       │   ├── extracontainer-string-values.golden.yaml
│   │       │   ├── extracontainer-string-values.yaml
│   │       │   ├── extracontainer-tpl-values.golden.yaml
│   │       │   ├── extracontainer-tpl-values.yaml
│   │       │   ├── extraenv-and-passwordSecret-values.golden.yaml
│   │       │   ├── extraenv-and-passwordSecret-values.yaml
│   │       │   ├── extraenv-values.golden.yaml
│   │       │   ├── extraenv-values.yaml
│   │       │   ├── extravolumes-values.golden.yaml
│   │       │   ├── extravolumes-values.yaml
│   │       │   ├── initcontainer-string-values.golden.yaml
│   │       │   ├── initcontainer-string-values.yaml
│   │       │   ├── initcontainer-tpl-values.golden.yaml
│   │       │   ├── initcontainer-tpl-values.yaml
│   │       │   ├── password-old-env-values.golden.yaml
│   │       │   ├── password-old-env-values.yaml
│   │       │   ├── passwordsecret-values.golden.yaml
│   │       │   ├── passwordsecret-values.tpl.golden.yaml
│   │       │   ├── passwordsecret-values.tpl.yaml
│   │       │   ├── passwordsecret-values.yaml
│   │       │   ├── persistence-and-existing-secret.golden.yaml
│   │       │   ├── persistence-and-existing-secret.yaml
│   │       │   ├── persistent-values.golden.yaml
│   │       │   ├── persistent-values.yaml
│   │       │   ├── priorityclassname-values.golden.yaml
│   │       │   ├── priorityclassname-values.yaml
│   │       │   ├── prometheusrules-values.golden.yaml
│   │       │   ├── prometheusrules-values.yaml
│   │       │   ├── resources-values.golden.yaml
│   │       │   ├── resources-values.yaml
│   │       │   ├── securitycontext-values.golden.yaml
│   │       │   ├── securitycontext-values.yaml
│   │       │   ├── service-loadbalancer-ip.golden.yaml
│   │       │   ├── service-loadbalancer-ip.yaml
│   │       │   ├── service-monitor-values.golden.yaml
│   │       │   ├── service-monitor-values.yaml
│   │       │   ├── taints-tolerations-values.golden.yaml
│   │       │   ├── taints-tolerations-values.yaml
│   │       │   ├── tls-values.golden.yaml
│   │       │   ├── tls-values.yaml
│   │       │   ├── tolerations-values.golden.yaml
│   │       │   └── tolerations-values.yaml
│   │       ├── go.mod
│   │       ├── go.sum
│   │       ├── golden_test.go
│   │       ├── templates/
│   │       │   ├── NOTES.txt
│   │       │   ├── _helpers.tpl
│   │       │   ├── _pod.tpl
│   │       │   ├── certificate.yaml
│   │       │   ├── deployment.yaml
│   │       │   ├── extra-manifests.yaml
│   │       │   ├── metrics-service.yaml
│   │       │   ├── prometheusrule.yaml
│   │       │   ├── service.yaml
│   │       │   ├── serviceaccount.yaml
│   │       │   ├── servicemonitor.yaml
│   │       │   ├── statefulset.yaml
│   │       │   └── tls-secret.yaml
│   │       └── values.yaml
│   ├── docker/
│   │   ├── README.md
│   │   └── docker-compose.yml
│   └── scripts/
│       ├── conventional-commits
│       └── signed-commit
├── docs/
│   ├── README.md
│   ├── async-tiering.md
│   ├── cluster-node-health.md
│   ├── coordinator.excalidraw
│   ├── dashtable.md
│   ├── dense_set.excalidraw
│   ├── dense_set.md
│   ├── df-share-nothing.md
│   ├── differences.md
│   ├── faq.md
│   ├── memcached_benchmark.md
│   ├── memory_bgsave.tsv
│   ├── namespaces.md
│   ├── quick-start/
│   │   └── README.md
│   ├── rdbsave.excalidraw
│   ├── rdbsave.md
│   ├── shard-serialization.md
│   ├── thread-per-core.excalidraw
│   └── transaction.md
├── fuzz/
│   ├── FUZZING.md
│   ├── dict/
│   │   ├── memcache.dict
│   │   └── resp.dict
│   ├── generate_targeted_seeds.py
│   ├── memcache_mutator.py
│   ├── package_crash.sh
│   ├── replay_crash.py
│   ├── resp_mutator.py
│   ├── run_fuzzer.sh
│   ├── seeds/
│   │   ├── memcache/
│   │   │   ├── add_replace.mc
│   │   │   ├── append_prepend.mc
│   │   │   ├── cas.mc
│   │   │   ├── delete.mc
│   │   │   ├── expiry.mc
│   │   │   ├── flags.mc
│   │   │   ├── flush.mc
│   │   │   ├── gat.mc
│   │   │   ├── incr_decr.mc
│   │   │   ├── large_value.mc
│   │   │   ├── meta_commands.mc
│   │   │   ├── multiget.mc
│   │   │   ├── noreply.mc
│   │   │   ├── set_get.mc
│   │   │   └── stats_version.mc
│   │   └── resp/
│   │       ├── acl.resp
│   │       ├── acl_ops.resp
│   │       ├── acl_ops2.resp
│   │       ├── bf_add.resp
│   │       ├── bitfield.resp
│   │       ├── bitfield_ops.resp
│   │       ├── bitops.resp
│   │       ├── bloom_ops.resp
│   │       ├── client.resp
│   │       ├── config.resp
│   │       ├── copy.resp
│   │       ├── del.resp
│   │       ├── eval.resp
│   │       ├── expire_ops.resp
│   │       ├── function.resp
│   │       ├── function_ops.resp
│   │       ├── generic_ops.resp
│   │       ├── generic_ops2.resp
│   │       ├── geo_ops.resp
│   │       ├── geo_ops2.resp
│   │       ├── geoadd.resp
│   │       ├── get.resp
│   │       ├── getdel.resp
│   │       ├── hash_ops.resp
│   │       ├── hash_ops2.resp
│   │       ├── hll_ops.resp
│   │       ├── hset.resp
│   │       ├── json.resp
│   │       ├── json_ops.resp
│   │       ├── json_ops2.resp
│   │       ├── list_blocking.resp
│   │       ├── list_ops.resp
│   │       ├── lmpop.resp
│   │       ├── lpos.resp
│   │       ├── lpush.resp
│   │       ├── memory.resp
│   │       ├── monitor.resp
│   │       ├── mset.resp
│   │       ├── multi_type_pipeline.resp
│   │       ├── object.resp
│   │       ├── pfadd.resp
│   │       ├── ping.resp
│   │       ├── pipeline.resp
│   │       ├── pubsub_ops.resp
│   │       ├── pubsub_ops2.resp
│   │       ├── rename.resp
│   │       ├── rpoplpush.resp
│   │       ├── sadd.resp
│   │       ├── scan_hscan.resp
│   │       ├── script_ops.resp
│   │       ├── script_ops2.resp
│   │       ├── sdiffstore.resp
│   │       ├── search_ops.resp
│   │       ├── search_ops2.resp
│   │       ├── server_ops.resp
│   │       ├── server_ops2.resp
│   │       ├── set.resp
│   │       ├── set_ops.resp
│   │       ├── set_ops2.resp
│   │       ├── smove.resp
│   │       ├── sort.resp
│   │       ├── srandmember.resp
│   │       ├── stream_ops.resp
│   │       ├── stream_ops2.resp
│   │       ├── string_ops.resp
│   │       ├── string_ops2.resp
│   │       ├── subscribe.resp
│   │       ├── throttle.resp
│   │       ├── transaction.resp
│   │       ├── transaction_ops2.resp
│   │       ├── watch.resp
│   │       ├── watch_multi.resp
│   │       ├── xadd.resp
│   │       ├── xread.resp
│   │       ├── zadd.resp
│   │       ├── zmpop.resp
│   │       ├── zrangebyscore.resp
│   │       ├── zset_ops.resp
│   │       └── zset_ops2.resp
│   └── triage_crashes.sh
├── go.work
├── go.work.sum
├── patches/
│   └── mimalloc-v2.2.4/
│       ├── 0_base.patch
│       ├── 1_add_stat_type.patch
│       ├── 2_return_stat.patch
│       ├── 3_track_full_size.patch
│       └── 4_fix_heap_collect.patch
├── pyproject.toml
├── src/
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── GetGitRevisionDescription.cmake
│   ├── GetGitRevisionDescription.cmake.in
│   ├── common/
│   │   ├── arg_range.h
│   │   ├── backed_args.h
│   │   ├── heap_size.h
│   │   └── string_or_view.h
│   ├── core/
│   │   ├── CMakeLists.txt
│   │   ├── allocation_tracker.cc
│   │   ├── allocation_tracker.h
│   │   ├── allocation_tracker_test.cc
│   │   ├── bloom.cc
│   │   ├── bloom.h
│   │   ├── bloom_test.cc
│   │   ├── bptree_set.h
│   │   ├── bptree_set_test.cc
│   │   ├── cms.cc
│   │   ├── cms.h
│   │   ├── cms_test.cc
│   │   ├── collection_entry.h
│   │   ├── compact_object.cc
│   │   ├── compact_object.h
│   │   ├── compact_object_test.cc
│   │   ├── dash.h
│   │   ├── dash_bench.cc
│   │   ├── dash_internal.h
│   │   ├── dash_test.cc
│   │   ├── dense_set.cc
│   │   ├── dense_set.h
│   │   ├── detail/
│   │   │   ├── bitpacking.cc
│   │   │   ├── bitpacking.h
│   │   │   ├── bptree_internal.h
│   │   │   ├── gen_utils.h
│   │   │   ├── listpack.cc
│   │   │   ├── listpack.h
│   │   │   ├── listpack_wrap.cc
│   │   │   ├── listpack_wrap.h
│   │   │   └── stateless_allocator.h
│   │   ├── dfly_core_test.cc
│   │   ├── dict_builder.cc
│   │   ├── dict_builder.h
│   │   ├── dict_builder_test.cc
│   │   ├── dragonfly_core.cc
│   │   ├── expire_period.h
│   │   ├── extent_tree.cc
│   │   ├── extent_tree.h
│   │   ├── extent_tree_test.cc
│   │   ├── flatbuffers.h
│   │   ├── flatbuffers_test.cc
│   │   ├── generate_bin_sizes.py
│   │   ├── glob_matcher.cc
│   │   ├── glob_matcher.h
│   │   ├── huff_coder.cc
│   │   ├── huff_coder.h
│   │   ├── intent_lock.h
│   │   ├── interpreter.cc
│   │   ├── interpreter.h
│   │   ├── interpreter_polyfill.h
│   │   ├── interpreter_test.cc
│   │   ├── json/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── detail/
│   │   │   │   ├── common.h
│   │   │   │   ├── flat_dfs.cc
│   │   │   │   ├── flat_dfs.h
│   │   │   │   ├── interned_blob.cc
│   │   │   │   ├── interned_blob.h
│   │   │   │   ├── interned_string.cc
│   │   │   │   ├── interned_string.h
│   │   │   │   ├── jsoncons_dfs.cc
│   │   │   │   └── jsoncons_dfs.h
│   │   │   ├── driver.cc
│   │   │   ├── driver.h
│   │   │   ├── interned_blob_test.cc
│   │   │   ├── json_object.cc
│   │   │   ├── json_object.h
│   │   │   ├── json_test.cc
│   │   │   ├── jsonpath_grammar.y
│   │   │   ├── jsonpath_lexer.lex
│   │   │   ├── jsonpath_test.cc
│   │   │   ├── lexer_impl.cc
│   │   │   ├── lexer_impl.h
│   │   │   ├── path.cc
│   │   │   └── path.h
│   │   ├── linear_search_map.h
│   │   ├── linear_search_map_test.cc
│   │   ├── listpack_test.cc
│   │   ├── memory_test.cc
│   │   ├── mi_memory_resource.cc
│   │   ├── mi_memory_resource.h
│   │   ├── oah_entry.cc
│   │   ├── oah_entry.h
│   │   ├── oah_set.h
│   │   ├── oah_set_test.cc
│   │   ├── overloaded.h
│   │   ├── page_usage/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── page_usage_stats.cc
│   │   │   └── page_usage_stats.h
│   │   ├── page_usage_stats_test.cc
│   │   ├── qlist.cc
│   │   ├── qlist.h
│   │   ├── qlist_test.cc
│   │   ├── score_map.cc
│   │   ├── score_map.h
│   │   ├── score_map_test.cc
│   │   ├── sds_utils.cc
│   │   ├── sds_utils.h
│   │   ├── search/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── ast_expr.cc
│   │   │   ├── ast_expr.h
│   │   │   ├── base.cc
│   │   │   ├── base.h
│   │   │   ├── block_list.cc
│   │   │   ├── block_list.h
│   │   │   ├── block_list_test.cc
│   │   │   ├── compressed_sorted_set.cc
│   │   │   ├── compressed_sorted_set.h
│   │   │   ├── compressed_sorted_set_test.cc
│   │   │   ├── hnsw_alg.h
│   │   │   ├── hnsw_index.cc
│   │   │   ├── hnsw_index.h
│   │   │   ├── index_result.h
│   │   │   ├── indices.cc
│   │   │   ├── indices.h
│   │   │   ├── lexer.lex
│   │   │   ├── mrmw_mutex.h
│   │   │   ├── mrmw_mutex_test.cc
│   │   │   ├── parser.y
│   │   │   ├── query_driver.cc
│   │   │   ├── query_driver.h
│   │   │   ├── range_tree.cc
│   │   │   ├── range_tree.h
│   │   │   ├── range_tree_test.cc
│   │   │   ├── rax_tree.h
│   │   │   ├── rax_tree_test.cc
│   │   │   ├── renewable_quota.cc
│   │   │   ├── renewable_quota.h
│   │   │   ├── scanner.h
│   │   │   ├── search.cc
│   │   │   ├── search.h
│   │   │   ├── search_parser_test.cc
│   │   │   ├── search_test.cc
│   │   │   ├── sort_indices.cc
│   │   │   ├── sort_indices.h
│   │   │   ├── stateless_allocator.h
│   │   │   ├── synonyms.cc
│   │   │   ├── synonyms.h
│   │   │   ├── tag_types.h
│   │   │   ├── vector_utils.cc
│   │   │   └── vector_utils.h
│   │   ├── segment_allocator.cc
│   │   ├── segment_allocator.h
│   │   ├── size_tracking_channel.h
│   │   ├── small_string.cc
│   │   ├── small_string.h
│   │   ├── sorted_map.cc
│   │   ├── sorted_map.h
│   │   ├── sorted_map_test.cc
│   │   ├── sse_port.h
│   │   ├── string_map.cc
│   │   ├── string_map.h
│   │   ├── string_map_test.cc
│   │   ├── string_set.cc
│   │   ├── string_set.h
│   │   ├── string_set_test.cc
│   │   ├── task_queue.cc
│   │   ├── task_queue.h
│   │   ├── testdata/
│   │   │   ├── ids.txt.zst
│   │   │   └── list.txt.zst
│   │   ├── tiering_types.cc
│   │   ├── tiering_types.h
│   │   ├── top_keys.cc
│   │   ├── top_keys.h
│   │   ├── top_keys_test.cc
│   │   ├── topk.cc
│   │   ├── topk.h
│   │   ├── topk_test.cc
│   │   ├── tx_queue.cc
│   │   ├── tx_queue.h
│   │   └── zstd_test.cc
│   ├── external_libs.cmake
│   ├── facade/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── cmd_arg_parser.cc
│   │   ├── cmd_arg_parser.h
│   │   ├── cmd_arg_parser_test.cc
│   │   ├── command_id.h
│   │   ├── conn_context.h
│   │   ├── connection_ref.h
│   │   ├── disk_backed_queue.cc
│   │   ├── disk_backed_queue.h
│   │   ├── disk_backed_queue_test.cc
│   │   ├── dragonfly_connection.cc
│   │   ├── dragonfly_connection.h
│   │   ├── dragonfly_listener.cc
│   │   ├── dragonfly_listener.h
│   │   ├── error.h
│   │   ├── facade.cc
│   │   ├── facade_stats.h
│   │   ├── facade_test.cc
│   │   ├── facade_test.h
│   │   ├── facade_types.h
│   │   ├── memcache_parser.cc
│   │   ├── memcache_parser.h
│   │   ├── memcache_parser_test.cc
│   │   ├── ok_main.cc
│   │   ├── op_status.cc
│   │   ├── op_status.h
│   │   ├── parsed_command.cc
│   │   ├── parsed_command.h
│   │   ├── redis_parser.cc
│   │   ├── redis_parser.h
│   │   ├── redis_parser_test.cc
│   │   ├── reply_builder.cc
│   │   ├── reply_builder.h
│   │   ├── reply_builder_test.cc
│   │   ├── reply_capture.cc
│   │   ├── reply_capture.h
│   │   ├── reply_mode.h
│   │   ├── reply_payload.h
│   │   ├── resp_expr.cc
│   │   ├── resp_expr.h
│   │   ├── resp_expr_test_utils.cc
│   │   ├── resp_expr_test_utils.h
│   │   ├── resp_parser.cc
│   │   ├── resp_parser.h
│   │   ├── resp_parser_test.cc
│   │   ├── resp_srv_parser.cc
│   │   ├── resp_srv_parser.h
│   │   ├── resp_srv_parser_test.cc
│   │   ├── resp_validator.cc
│   │   ├── service_interface.cc
│   │   ├── service_interface.h
│   │   ├── socket_utils.cc
│   │   ├── socket_utils.h
│   │   ├── tls_helpers.cc
│   │   └── tls_helpers.h
│   ├── huff/
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── hist.h
│   │   ├── huf.h
│   │   └── mem.h
│   ├── redis/
│   │   ├── CMakeLists.txt
│   │   ├── LICENSE.redis
│   │   ├── config.h
│   │   ├── crc16.c
│   │   ├── crc16.h
│   │   ├── crc64.c
│   │   ├── crc64.h
│   │   ├── crcspeed.c
│   │   ├── crcspeed.h
│   │   ├── debug.c
│   │   ├── dict.c
│   │   ├── dict.h
│   │   ├── endianconv.h
│   │   ├── geo.c
│   │   ├── geo.h
│   │   ├── geohash.c
│   │   ├── geohash.h
│   │   ├── geohash_helper.c
│   │   ├── geohash_helper.h
│   │   ├── hiredis.c
│   │   ├── hiredis.h
│   │   ├── hyperloglog.c
│   │   ├── hyperloglog.h
│   │   ├── intset.c
│   │   ├── intset.h
│   │   ├── listpack.c
│   │   ├── listpack.h
│   │   ├── lua/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── bit/
│   │   │   │   └── bit.c
│   │   │   ├── cjson/
│   │   │   │   ├── fpconv.c
│   │   │   │   ├── fpconv.h
│   │   │   │   ├── lua_cjson.c
│   │   │   │   ├── strbuf.c
│   │   │   │   └── strbuf.h
│   │   │   ├── cmsgpack/
│   │   │   │   └── lua_cmsgpack.c
│   │   │   └── struct/
│   │   │       └── lua_struct.c
│   │   ├── lzf.h
│   │   ├── lzfP.h
│   │   ├── lzf_c.c
│   │   ├── lzf_d.c
│   │   ├── rax.c
│   │   ├── rax.h
│   │   ├── rax_malloc.h
│   │   ├── rdb.h
│   │   ├── read.c
│   │   ├── read.h
│   │   ├── redis_aux.c
│   │   ├── redis_aux.h
│   │   ├── sds.c
│   │   ├── sds.h
│   │   ├── sdsalloc.h
│   │   ├── siphash.c
│   │   ├── stream.h
│   │   ├── t_stream.c
│   │   ├── util.c
│   │   ├── util.h
│   │   ├── ziplist.c
│   │   ├── ziplist.h
│   │   ├── zmalloc.c
│   │   ├── zmalloc.h
│   │   └── zmalloc_mi.c
│   └── server/
│       ├── CMakeLists.txt
│       ├── acl/
│       │   ├── acl_commands_def.h
│       │   ├── acl_family.cc
│       │   ├── acl_family.h
│       │   ├── acl_family_test.cc
│       │   ├── acl_log.cc
│       │   ├── acl_log.h
│       │   ├── user.cc
│       │   ├── user.h
│       │   ├── user_registry.cc
│       │   ├── user_registry.h
│       │   ├── validator.cc
│       │   └── validator.h
│       ├── bitops_family.cc
│       ├── bitops_family_test.cc
│       ├── blocking_controller.cc
│       ├── blocking_controller.h
│       ├── blocking_controller_test.cc
│       ├── bloom_family.cc
│       ├── bloom_family_test.cc
│       ├── channel_store.cc
│       ├── channel_store.h
│       ├── cluster/
│       │   ├── CMakeLists.txt
│       │   ├── cluster_config.cc
│       │   ├── cluster_config.h
│       │   ├── cluster_config_test.cc
│       │   ├── cluster_defs.cc
│       │   ├── cluster_defs.h
│       │   ├── cluster_family.cc
│       │   ├── cluster_family.h
│       │   ├── cluster_family_test.cc
│       │   ├── cluster_utility.cc
│       │   ├── cluster_utility.h
│       │   ├── coordinator.cc
│       │   ├── coordinator.h
│       │   ├── incoming_slot_migration.cc
│       │   ├── incoming_slot_migration.h
│       │   ├── outgoing_slot_migration.cc
│       │   ├── outgoing_slot_migration.h
│       │   └── slot_set.h
│       ├── cluster_support.cc
│       ├── cluster_support.h
│       ├── cmd_support.cc
│       ├── cmd_support.h
│       ├── cms_family.cc
│       ├── cms_family_test.cc
│       ├── collection_family_fallback.cc
│       ├── command_families.h
│       ├── command_registry.cc
│       ├── command_registry.h
│       ├── common.cc
│       ├── common.h
│       ├── common_types.h
│       ├── config_registry.cc
│       ├── config_registry.h
│       ├── conn_context.cc
│       ├── conn_context.h
│       ├── container_utils.cc
│       ├── container_utils.h
│       ├── db_slice.cc
│       ├── db_slice.h
│       ├── debugcmd.cc
│       ├── debugcmd.h
│       ├── detail/
│       │   ├── compressor.cc
│       │   ├── compressor.h
│       │   ├── decompress.cc
│       │   ├── decompress.h
│       │   ├── save_stages_controller.cc
│       │   ├── save_stages_controller.h
│       │   ├── snapshot_storage.cc
│       │   ├── snapshot_storage.h
│       │   ├── table.h
│       │   └── wrapped_json_path.h
│       ├── dfly_bench.cc
│       ├── dfly_main.cc
│       ├── dflycmd.cc
│       ├── dflycmd.h
│       ├── dragonfly_test.cc
│       ├── engine_shard.cc
│       ├── engine_shard.h
│       ├── engine_shard_set.cc
│       ├── engine_shard_set.h
│       ├── engine_shard_set_test.cc
│       ├── error.cc
│       ├── error.h
│       ├── execution_state.cc
│       ├── execution_state.h
│       ├── family_utils.cc
│       ├── family_utils.h
│       ├── generic_family.cc
│       ├── generic_family.h
│       ├── generic_family_test.cc
│       ├── geo_family.cc
│       ├── geo_family_test.cc
│       ├── hll_family.cc
│       ├── hll_family_test.cc
│       ├── hset_family.cc
│       ├── hset_family.h
│       ├── hset_family_test.cc
│       ├── http_api.cc
│       ├── http_api.h
│       ├── journal/
│       │   ├── CMakeLists.txt
│       │   ├── cmd_serializer.cc
│       │   ├── cmd_serializer.h
│       │   ├── executor.cc
│       │   ├── executor.h
│       │   ├── journal.cc
│       │   ├── journal.h
│       │   ├── journal_slice.cc
│       │   ├── journal_slice.h
│       │   ├── journal_test.cc
│       │   ├── pending_buf.h
│       │   ├── serializer.cc
│       │   ├── serializer.h
│       │   ├── streamer.cc
│       │   ├── streamer.h
│       │   ├── tx_executor.cc
│       │   ├── tx_executor.h
│       │   ├── types.cc
│       │   └── types.h
│       ├── json_family.cc
│       ├── json_family_memory_test.cc
│       ├── json_family_test.cc
│       ├── list_family.cc
│       ├── list_family_test.cc
│       ├── main_service.cc
│       ├── main_service.h
│       ├── memory_cmd.cc
│       ├── memory_cmd.h
│       ├── multi_command_squasher.cc
│       ├── multi_command_squasher.h
│       ├── multi_test.cc
│       ├── namespaces.cc
│       ├── namespaces.h
│       ├── protocol_client.cc
│       ├── protocol_client.h
│       ├── rdb_extensions.h
│       ├── rdb_load.cc
│       ├── rdb_load.h
│       ├── rdb_load_context.cc
│       ├── rdb_load_context.h
│       ├── rdb_save.cc
│       ├── rdb_save.h
│       ├── rdb_test.cc
│       ├── replica.cc
│       ├── replica.h
│       ├── replica_types.h
│       ├── script_mgr.cc
│       ├── script_mgr.h
│       ├── search/
│       │   ├── CMakeLists.txt
│       │   ├── aggregator.cc
│       │   ├── aggregator.h
│       │   ├── aggregator_test.cc
│       │   ├── doc_accessors.cc
│       │   ├── doc_accessors.h
│       │   ├── doc_index.cc
│       │   ├── doc_index.h
│       │   ├── doc_index_fallback.cc
│       │   ├── global_hnsw_index.cc
│       │   ├── global_hnsw_index.h
│       │   ├── index_builder.cc
│       │   ├── index_builder.h
│       │   ├── index_join.cc
│       │   ├── index_join.h
│       │   ├── index_join_test.cc
│       │   ├── search_family.cc
│       │   ├── search_family.h
│       │   └── search_family_test.cc
│       ├── serializer_base.cc
│       ├── serializer_base.h
│       ├── serializer_base_test.cc
│       ├── serializer_commons.cc
│       ├── serializer_commons.h
│       ├── server_family.cc
│       ├── server_family.h
│       ├── server_family_test.cc
│       ├── server_state.cc
│       ├── server_state.h
│       ├── set_family.cc
│       ├── set_family.h
│       ├── set_family_test.cc
│       ├── sharding.cc
│       ├── sharding.h
│       ├── slowlog.cc
│       ├── slowlog.h
│       ├── snapshot.cc
│       ├── snapshot.h
│       ├── stats.cc
│       ├── stats.h
│       ├── stream_family.cc
│       ├── stream_family.h
│       ├── stream_family_test.cc
│       ├── string_family.cc
│       ├── string_family_test.cc
│       ├── string_stats.cc
│       ├── string_stats.h
│       ├── string_stats_test.cc
│       ├── synchronization.cc
│       ├── synchronization.h
│       ├── table.cc
│       ├── table.h
│       ├── test_utils.cc
│       ├── test_utils.h
│       ├── testdata/
│       │   ├── RDB_TYPE_STREAM_LISTPACKS_2.rdb
│       │   ├── RDB_TYPE_STREAM_LISTPACKS_3.rdb
│       │   ├── empty.rdb
│       │   ├── hll.rdb
│       │   ├── ignore_expiry.rdb
│       │   ├── redis6_small.rdb
│       │   ├── redis6_stream.rdb
│       │   ├── redis7_small.rdb
│       │   └── redis_json.rdb
│       ├── tiered_storage.cc
│       ├── tiered_storage.h
│       ├── tiered_storage_test.cc
│       ├── tiering/
│       │   ├── CMakeLists.txt
│       │   ├── common.h
│       │   ├── decoders.cc
│       │   ├── decoders.h
│       │   ├── disk_storage.cc
│       │   ├── disk_storage.h
│       │   ├── disk_storage_test.cc
│       │   ├── entry_map.h
│       │   ├── external_alloc.cc
│       │   ├── external_alloc.h
│       │   ├── external_alloc_test.cc
│       │   ├── op_manager.cc
│       │   ├── op_manager.h
│       │   ├── op_manager_test.cc
│       │   ├── serialized_map.cc
│       │   ├── serialized_map.h
│       │   ├── serialized_map_test.cc
│       │   ├── small_bins.cc
│       │   ├── small_bins.h
│       │   ├── small_bins_test.cc
│       │   └── test_common.h
│       ├── transaction.cc
│       ├── transaction.h
│       ├── tx_base.cc
│       ├── tx_base.h
│       ├── version.cc.in
│       ├── version.h
│       ├── version_monitor.cc
│       ├── version_monitor.h
│       ├── zset_family.cc
│       ├── zset_family.h
│       └── zset_family_test.cc
├── tests/
│   ├── README.md
│   ├── dragonfly/
│   │   ├── __init__.py
│   │   ├── acl_family_test.py
│   │   ├── bull_sidekiq_test.py
│   │   ├── celery_test.py
│   │   ├── cluster_mgr_test.py
│   │   ├── cluster_test.py
│   │   ├── config_test.py
│   │   ├── conftest.py
│   │   ├── connection_test.py
│   │   ├── eval_test.py
│   │   ├── generic_test.py
│   │   ├── http_conf_test.py
│   │   ├── instance.py
│   │   ├── json_test.py
│   │   ├── list_family_test.py
│   │   ├── management_test.py
│   │   ├── memcache_meta.py
│   │   ├── memory_test.py
│   │   ├── proxy.py
│   │   ├── pymemcached_test.py
│   │   ├── redis_replication_test.py
│   │   ├── replication_test.py
│   │   ├── requirements.txt
│   │   ├── search_benchmark_test.py
│   │   ├── search_benchmark_utils.py
│   │   ├── search_test.py
│   │   ├── seeder/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── script-generate.lua
│   │   │   ├── script-genlib.lua
│   │   │   ├── script-hash.lua
│   │   │   ├── script-hashlib.lua
│   │   │   └── script-utillib.lua
│   │   ├── seeder_test.py
│   │   ├── sentinel_test.py
│   │   ├── server_family_test.py
│   │   ├── set_test.py
│   │   ├── shutdown_test.py
│   │   ├── snapshot_test.py
│   │   ├── test_dash_gc.py
│   │   ├── tiering_test.py
│   │   ├── tls_conf_test.py
│   │   ├── utility.py
│   │   └── valkey_search/
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── conftest.py
│   │       ├── sync-valkey-search-tests.sh
│   │       ├── util.py
│   │       └── valkey_search_test_case_dragonfly.py
│   ├── fakeredis/
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── test/
│   │       ├── __init__.py
│   │       ├── conftest.py
│   │       ├── test_asyncredis.py
│   │       ├── test_hypotesis_joint/
│   │       │   ├── __init__.py
│   │       │   └── test_joint.py
│   │       ├── test_hypothesis/
│   │       │   ├── __init__.py
│   │       │   ├── _server_info.py
│   │       │   ├── base.py
│   │       │   ├── test_connection.py
│   │       │   ├── test_hash.py
│   │       │   ├── test_list.py
│   │       │   ├── test_server.py
│   │       │   ├── test_set.py
│   │       │   ├── test_string.py
│   │       │   ├── test_transaction.py
│   │       │   └── test_zset.py
│   │       ├── test_issues.py
│   │       ├── test_json/
│   │       │   ├── __init__.py
│   │       │   ├── test_json.py
│   │       │   ├── test_json_arr_commands.py
│   │       │   └── test_json_commands.py
│   │       ├── test_mixins/
│   │       │   ├── __init__.py
│   │       │   ├── test_bitmap_commands.py
│   │       │   ├── test_connection.py
│   │       │   ├── test_generic_commands.py
│   │       │   ├── test_geo_commands.py
│   │       │   ├── test_hash_commands.py
│   │       │   ├── test_list_commands.py
│   │       │   ├── test_pubsub_commands.py
│   │       │   ├── test_scan.py
│   │       │   ├── test_scripting.py
│   │       │   ├── test_server_commands.py
│   │       │   ├── test_set_commands.py
│   │       │   ├── test_sortedset_commands.py
│   │       │   ├── test_streams_commands.py
│   │       │   ├── test_string_commands.py
│   │       │   └── test_zadd.py
│   │       ├── test_stack/
│   │       │   ├── __init__.py
│   │       │   ├── test_bloomfilter.py
│   │       │   ├── test_cms.py
│   │       │   ├── test_cuckoofilter.py
│   │       │   ├── test_tdigest.py
│   │       │   └── test_topk.py
│   │       ├── test_transactions.py
│   │       └── testtools.py
│   ├── integration/
│   │   ├── .dockerignore
│   │   ├── .run_ioredis_valid_test.sh
│   │   ├── async.py
│   │   ├── gen_sets.sh
│   │   ├── generate_sets.py
│   │   ├── ioredis.Dockerfile
│   │   ├── jedis.Dockerfile
│   │   ├── node-redis.Dockerfile
│   │   ├── pascaldekloe.Dockerfile
│   │   ├── relay.Dockerfile
│   │   ├── run_ioredis_on_docker.sh
│   │   └── stress_shutdown.sh
│   └── pytest.ini
└── tools/
    ├── balls_bins.py
    ├── benchmark/
    │   ├── k8s-benchmark-job.yaml
    │   └── post_run_checks.py
    ├── cache_logs_player.py
    ├── cache_testing.py
    ├── cluster_mgr.py
    ├── defrag_db.py
    ├── defrag_mem_test.py
    ├── docker/
    │   ├── entrypoint.sh
    │   ├── fetch_release.sh
    │   └── healthcheck.sh
    ├── eviction/
    │   ├── fill_db.py
    │   ├── run_fill_db.sh
    │   └── stop_fill_db.sh
    ├── faulty_io.sh
    ├── generate-tls-files.sh
    ├── json_benchmark.py
    ├── local/
    │   ├── gen-test-certs.sh
    │   └── monitoring/
    │       ├── docker-compose.yml
    │       ├── grafana/
    │       │   ├── config.monitoring
    │       │   └── provisioning/
    │       │       ├── dashboards/
    │       │       │   ├── dashboard.yml
    │       │       │   ├── dragonfly.json
    │       │       │   ├── memcached.json
    │       │       │   ├── node-exporter.json
    │       │       │   └── redis.json
    │       │       └── datasources/
    │       │           └── datasource.yml
    │       └── prometheus/
    │           └── prometheus.yml
    ├── packaging/
    │   ├── Dockerfile.alpine-dev
    │   ├── Dockerfile.ubuntu-dev
    │   ├── Dockerfile.ubuntu-prod
    │   ├── README.md
    │   ├── debian/
    │   │   ├── compat
    │   │   ├── control
    │   │   ├── dragonfly.conf
    │   │   ├── dragonfly.install
    │   │   ├── dragonfly.logrotate
    │   │   ├── dragonfly.postinst
    │   │   ├── dragonfly.postrm
    │   │   ├── dragonfly.preinst
    │   │   ├── dragonfly.service
    │   │   └── rules
    │   ├── generate_changelog.sh
    │   ├── generate_debian_package.sh
    │   ├── osrepos/
    │   │   ├── README.md
    │   │   ├── dragonfly.repo
    │   │   ├── dragonfly.sources
    │   │   ├── pgp-key.public
    │   │   ├── reprepro-config/
    │   │   │   ├── distributions
    │   │   │   └── options
    │   │   ├── requirements.txt
    │   │   └── scripts/
    │   │       ├── fetch-releases.py
    │   │       ├── generate-apt-repo.sh
    │   │       ├── generate-index.py
    │   │       └── sign-rpms.sh
    │   └── rpm/
    │       ├── build_rpm.sh
    │       ├── dragonfly.service
    │       └── dragonfly.spec
    ├── parse_allocator_tracking_logs.py
    ├── plot_memtier_latency.py
    ├── release.sh
    ├── replay/
    │   ├── go.mod
    │   ├── go.sum
    │   ├── main.go
    │   ├── parsing.go
    │   └── workers.go
    ├── requirements.txt
    ├── run_master_replica.sh
    └── vector-benches/
        ├── README.md
        ├── go.mod
        ├── go.sum
        └── main.go

================================================
FILE CONTENTS
================================================

================================================
FILE: .agent/rules/ANTIGRAVITY_INSTRUCTIONS.md
================================================
# Antigravity Agent Instructions for Dragonfly

**READ [AGENTS.md](../../AGENTS.md)**

All project information, workflows, patterns, and guidelines are in `AGENTS.md`.


================================================
FILE: .circleci/config.yml
================================================
version: 2.1

machine: true

jobs:
  build-ubuntu:
      docker: 
        - image: ghcr.io/romange/ubuntu-dev:22
      steps:
        - checkout
        - run:
            name: Set up environment
            environment:
              BUILD_TYPE: Debug            
            command: | 
              git submodule update --init --recursive
              cmake -B build -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -GNinja -DCMAKE_CXX_COMPILER_LAUNCHER=ccache              
        - run:
            name: Build & Test
            command: | 
              cd build && pwd 
              ninja -j4 src/all
              ctest -V -L DFLY
          

# Orchestrate our job run sequence
workflows:
  build_and_test:
    jobs:
      - build-ubuntu


================================================
FILE: .clang-format
================================================
# ---
# We'll use defaults from the Google style, but with 2 columns indentation.
BasedOnStyle: Google
IndentWidth: 2
ColumnLimit: 100
---
Language: Cpp
AllowShortLoopsOnASingleLine: false
AllowShortFunctionsOnASingleLine: false
AllowShortIfStatementsOnASingleLine: false
AlwaysBreakTemplateDeclarations: false
PackConstructorInitializers: NextLine
DerivePointerAlignment: false
PointerAlignment: Left
BasedOnStyle: Google
ColumnLimit: 100
---
Language: Proto
BasedOnStyle: Google


================================================
FILE: .clang-tidy
================================================
---

Checks: >
  -abseil-no-namespace,
  bugprone*,
  # Sadly narrowing conversions is too noisy
  -bugprone-narrowing-conversions,
  -bugprone-easily-swappable-parameters,
  -bugprone-branch-clone,
  -bugprone-implicit-widening-of-multiplication-result,
  -bugprone-too-small-loop-variable,
  -bugprone-reserved-identifier,
  boost-use-to-string,
  performance*,
  -cert-err58-cpp,
  -cert-dcl58-cpp,  # Ignore std changes
  -cert-dcl51-cpp,  # bugprone-reserved-identifier
  # Doesn't work with abseil flags
  clang-analyzer*,
  google-*,
  -google-runtime-int,
  -google-readability-*,
  -google-build-using-namespace,
  misc-definitions-in-headers,
  misc-misleading*,
  misc-misplaced-const,
  misc-new-delete-overloads,
  misc-non-copyable-objects,
  misc-redundant-expression,
  misc-static-assert,
  misc-throw-by-value-catch-by-reference,
  misc-unconventional-assign-operator,
  misc-uniqueptr-reset-release,
  misc-unused-alias-decls,
  misc-unused-using-decls,
  modernize-deprecated-headers,
  modernize-macro-to-enum,
  modernize-make-shared,
  modernize-make-unique,
  modernize-pass-by-value,
  modernize-raw-string-literal,
  modernize-redundant-void-arg,
  modernize-replace-disallow-copy-and-assign-macro,
  modernize-return-braced-init-list,
  modernize-shrink-to-fit,
  modernize-unary-static-assert,
  modernize-use-emplace,
  modernize-use-equals-delete,
  modernize-use-noexcept,
  modernize-use-transparent-functors,
  modernize-use-uncaught-exceptions,
  modernize-use-using,
  readability-avoid-const-params-in-decls,
  readability-const-return-type,
  readability-container-contains,
  readability-container-size-empty,
  readability-delete-null-pointer,
  readability-duplicate-include,
  readability-function-size,
  readability-identifier-naming,
  -readability-inconsistent-declaration-parameter-name,
  readability-make-member-function-const,
  readability-misplaced-array-index,
  readability-named-parameter,
  readability-non-const-parameter,
  readability-redundant-access-specifiers,
  readability-redundant-control-flow,
  readability-redundant-declaration,
  readability-redundant-function-ptr-dereference,
  readability-redundant-member-init,
  readability-redundant-preprocessor,
  readability-redundant-smartptr-get,
  readability-redundant-string-cstr,
  readability-redundant-string-init,
  readability-simplify-subscript-expr,
  readability-static-definition-in-anonymous-namespace,
  readability-string-compare,
  readability-suspicious-call-argument,
  readability-uniqueptr-delete-release,
  readability-use-anyofallof


# Disabled because they're currently too disruptive, but one day might be nice to have:
# modernize-use-nullptr,
# modernize-use-equals-default,
# readability-qualified-auto,

CheckOptions:
  - key: bugprone-narrowing-conversions.WarnOnIntegerNarrowingConversion
    value: false
  - key: bugprone-narrowing-conversions.WarnOnEquivalentBitWidth
    value: false


================================================
FILE: .clangd
================================================
Diagnostics:
  UnusedIncludes: None
  MissingIncludes: None
  Includes:
    IgnoreHeader: base/*.h

CompileFlags:
  CompilationDatabase: build-dbg/       # Search for compile_commands.json


================================================
FILE: .claude/hooks/format-after-edit.sh
================================================
#!/bin/bash
# Hook to automatically format files after Edit/Write operations
# Filters out src/redis directory from formatting

# Read JSON input from stdin
INPUT=$(cat)
FILE_PATH=$(echo "$INPUT" | jq -r '.tool_input.file_path // empty')

# Skip if no file path
if [ -z "$FILE_PATH" ]; then
  exit 0
fi

# Skip if file is in src/redis directory
if [[ "$FILE_PATH" == */src/redis/* ]]; then
  echo "Skipping formatting for src/redis file: $FILE_PATH" >&2
  exit 0
fi

# Skip if file doesn't exist
if [ ! -f "$FILE_PATH" ]; then
  exit 0
fi

# Run pre-commit on the file
pre-commit run --files "$FILE_PATH" 2>&1

# Always exit 0 to not block the operation even if formatting fails
exit 0


================================================
FILE: .claude/settings.json
================================================
{
  "permissions": {
    "allow": [
      "Read($CLAUDE_PROJECT_DIR/**)",
      "Edit($CLAUDE_PROJECT_DIR/**)",
      "Write($CLAUDE_PROJECT_DIR/**)",
      "Bash(./*_test:*)",
      "Bash(ninja:*)",
      "Bash(git add:*)",
      "Bash(git reset:*)",
      "Bash(gh issue view:*)",
      "Bash(git log:*)",
      "Bash(git show:*)",
      "WebSearch",
      "Bash(grep:*)",
      "Bash(pre-commit run:*)",
      "Bash(clang-format:*)",
      "Bash(git checkout:*)",
      "Bash(tee:*)",
      "Bash(sort:*)",
      "Bash(git patch-id:*)"
    ]
  },
  "hooks": {
    "PostToolUse": [
      {
        "matcher": "Edit|Write",
        "hooks": [
          {
            "type": "command",
            "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/format-after-edit.sh",
            "timeout": 30,
            "statusMessage": "Formatting code..."
          }
        ]
      }
    ]
  }
}


================================================
FILE: .claude/skills/reproduce-fuzz-crash/SKILL.md
================================================
---
name: reproduce-fuzz-crash
description: >
  Reproduce AFL++ fuzz crashes from GitHub Actions. Use when user provides a
  GitHub Actions fuzz run URL and wants to reproduce and analyze the crash locally.
argument-hint: <github-actions-run-url>
allowed-tools: Bash, Read, Grep, Glob, Write
---

# Reproduce Fuzz Crash

Given a GitHub Actions fuzz run URL, download crash artifacts, triage them
with `fuzz/triage_crashes.sh`, and produce a crash analysis report.

**Input**: `$ARGUMENTS` — a GitHub Actions run URL like:
`https://github.com/dragonflydb/dragonfly/actions/runs/22906484769`
or with query params like `?pr=6855`.

## Workflow

### Step 1: Parse the URL

Extract `owner/repo` and `run_id` from the URL.

```
https://github.com/{owner}/{repo}/actions/runs/{run_id}[?...]
```

Strip any query parameters from `run_id`.

### Step 2: Download artifacts

List crash artifacts via the GitHub API, then download each as a `.zip` directly:

**IMPORTANT**: Run each command as a separate Bash tool call (no `&&` chaining)
to ensure auto-approval works with the user's permission patterns.

```bash
# List artifacts — filter for names containing "crash"
gh api repos/{owner}/{repo}/actions/runs/{run_id}/artifacts

# Create output directory
mkdir -p /tmp/fuzz-repro-{run_id}

# Download each crash artifact by ID (separate command)
gh api repos/{owner}/{repo}/actions/artifacts/{artifact_id}/zip > /tmp/fuzz-repro-{run_id}/<artifact-name>.zip
```

This gives real `.zip` files that the triage script can consume directly.

If no crash artifacts are found, report that the run has no crash artifacts and stop.

Note: there may be duplicate artifact names (same name, different IDs) from
retried jobs. Download the **most recent** one (highest artifact ID).

### Step 3: Determine mode

Infer the protocol mode from the artifact name:
- Contains "memcache" → `memcache`
- Otherwise → `resp`

### Step 4: Check Dragonfly binary

Check if the debug binary already exists and runs:

```bash
./build-dbg/dragonfly --version
```

Only build if the binary doesn't exist or fails to run:

```bash
ninja -C build-dbg dragonfly
```

If `build-dbg` doesn't exist, run `./helio/blaze.sh` first.

### Step 5: Run triage_crashes.sh

For each zip file, run:

```bash
./fuzz/triage_crashes.sh ./build-dbg/dragonfly <mode> /tmp/fuzz-repro-{run_id}/<artifact-name>.zip
```

Capture the full output.

### Step 6: Analyze and report

Parse the triage output for confirmed crashes. For each confirmed crash:

1. **Read the source** at the crash location — use the stack trace to identify
   the source file and line number, then read that code.
2. **Provide analysis**: likely root cause, what to investigate.

Print a structured report:

```
## Fuzz Crash Report

**Run**: {url}
**Artifacts**: {number} crash(es) found

---

### Crash NNNNNN

**Reproduced**: Yes / No (false positive)
**Signal**: SIGABRT (6) / SIGSEGV (11) / etc.

**Stack trace**:
\```
<stack trace from triage output>
\```

**Analysis**:
<1-3 sentences explaining the likely root cause based on the stack trace,
the assertion message, and the crash input. Identify the source file and
line number. Suggest what to investigate.>
```

## Important Notes

- The triage script uses port **6379** (resp) or **11211** (memcache).
  Ensure no other Dragonfly or Redis instance is using these ports.
- The script adds `--rename_command` flags to avoid false positives from
  commands like DEBUG SLEEP that the fuzzer might generate.
- Some crashes are non-deterministic (thread timing). The script reports
  these as "FALSE POSITIVE" — note this clearly, it doesn't mean the bug
  is invalid, just that it didn't reproduce on this run.
- The script handles its own cleanup of Dragonfly processes.
- Do NOT delete `/tmp/fuzz-repro-{run_id}/` — the user may want to inspect it.
- If `gh run download` fails with permissions, suggest the user authenticate
  with `gh auth login`.


================================================
FILE: .ct.yaml
================================================
# See https://github.com/helm/chart-testing#configuration
remote: origin
target-branch: main
chart-dirs:
  - contrib/charts
helm-extra-args: --debug --timeout 60s
check-version-increment: false
validate-maintainers: false


================================================
FILE: .cursorrules
================================================
# Cursor AI Rules for Dragonfly

**READ `AGENTS.md`**

All project information, workflows, patterns, and guidelines are in `AGENTS.md`.


================================================
FILE: .devcontainer/alpine/devcontainer.json
================================================
{
  "name": "alpine-dev",
  "image": "ghcr.io/romange/alpine-dev",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true,
        "cmake.configureArgs": []
      }
    }
  },
  "mounts": [
    "source=alpine-vol,target=/build,type=volume"
  ],
  "postCreateCommand": ".devcontainer/alpine/post-create.sh ${containerWorkspaceFolder}"
}


================================================
FILE: .devcontainer/alpine/post-create.sh
================================================
#!/bin/bash

containerWorkspaceFolder=$1
git config --global --add safe.directory ${containerWorkspaceFolder}
git config --global --add safe.directory ${containerWorkspaceFolder}/helio
mkdir -p /root/.local/share/CMakeTools


================================================
FILE: .devcontainer/fedora/devcontainer.json
================================================
{
  "name": "fedora30",
  "image": "ghcr.io/romange/fedora:30",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=fedora-vol,target=/build,type=volume"
  ]
}


================================================
FILE: .devcontainer/fedora41/devcontainer.json
================================================
{
  "name": "fedora41",
  "image": "ghcr.io/romange/fedora:41",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=fedora41-vol,target=/build,type=volume"
  ]
}


================================================
FILE: .devcontainer/ubuntu20/cmake-tools-kits.json
================================================
[
  {
    "name": "GCC x86_64-linux-gnu",
    "compilers": {
      "C": "gcc",
      "CXX": "g++"
    },
    "isTrusted": true
  }
]


================================================
FILE: .devcontainer/ubuntu20/devcontainer.json
================================================
{
  "name": "ubuntu20",
  "image": "ghcr.io/romange/ubuntu-dev:20",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=ubuntu20-vol,target=/build,type=volume"
  ],
  "postCreateCommand": ".devcontainer/ubuntu20/post-create.sh ${containerWorkspaceFolder}"
}


================================================
FILE: .devcontainer/ubuntu20/post-create.sh
================================================
#!/bin/bash

containerWorkspaceFolder=$1
git config --global --add safe.directory '*'
mkdir -p /root/.local/share/CMakeTools
cp ${containerWorkspaceFolder}/.devcontainer/ubuntu20/cmake-tools-kits.json /root/.local/share/CMakeTools/


================================================
FILE: .devcontainer/ubuntu20-gcc14/devcontainer.json
================================================
{
  "name": "ubuntu20-gcc14",
  "image": "ghcr.io/romange/ubuntu-dev:20-gcc14",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake",
        "mk12.better-git-line-blame"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "cmake.configureArgs": [
          "-DWITH_AWS=OFF",
          "-DWITH_GCP=OFF",
          "-DWITH_GPERF=OFF"
        ],
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=ubuntu20-gcc14-vol,target=/build,type=volume"
  ],
  "postCreateCommand": ".devcontainer/ubuntu20/post-create.sh ${containerWorkspaceFolder}"
}


================================================
FILE: .devcontainer/ubuntu22/devcontainer.json
================================================
{
  "name": "ubuntu22",
  "image": "ghcr.io/romange/ubuntu-dev:22",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=ubuntu22-vol,target=/build,type=volume"
  ],
  "postCreateCommand": ".devcontainer/ubuntu22/post-create.sh ${containerWorkspaceFolder}"
}


================================================
FILE: .devcontainer/ubuntu22/post-create.sh
================================================
#!/bin/bash

containerWorkspaceFolder=$1
git config --global --add safe.directory ${containerWorkspaceFolder}
git config --global --add safe.directory ${containerWorkspaceFolder}/helio
mkdir -p /root/.local/share/CMakeTools


================================================
FILE: .devcontainer/ubuntu24/devcontainer.json
================================================
{
  "name": "ubuntu24",
  "image": "ghcr.io/romange/ubuntu-dev:24",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-vscode.cpptools",
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools-themes",
        "twxs.cmake"
      ],
      "settings": {
        "cmake.buildDirectory": "/build",
        "extensions.ignoreRecommendations": true
      }
    }
  },
  "mounts": [
    "source=ubuntu24-vol,target=/build,type=volume"
  ],
  "postCreateCommand": ".devcontainer/ubuntu24/post-create.sh ${containerWorkspaceFolder}"
}


================================================
FILE: .dockerignore
================================================
_deps/*
build-*
tools/packaging/*
.github/*

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help Dragonfly DB improve
title: ''
labels: 'bug'
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Insert records using `command`
2. Query records using `command`
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Environment (please complete the following information):**
 - OS: [ubuntu 20.04]
 - Kernel: # Command: `uname -a`
 - Containerized?: [Bare Metal, Docker, Docker Compose, Docker Swarm, Kubernetes, Other]
 - Dragonfly Version: [e.g. 0.3.0]

**Reproducible Code Snippet**
```
# Minimal code snippet to reproduce this bug
```

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: true

contact_links:
  - name: Dragonfly DB Discord Channel
    url: https://discord.gg/HsPjXGVH85
    about: Get help! Ask questions, get support, and share ideas.

  - name: GitHub Discussions
    url: https://github.com/dragonflydb/dragonfly/discussions
    about: Ask Questions. Benchmark Questions Belong here.

  - name: Twitter
    url: https://twitter.com/romanger
    about: Follow Roman on Twitter


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for Dragonfly DB
title: ''
labels: 'feature request'
assignees: ''

---
**Did you search GitHub Issues and GitHub Discussions First?**
Many users may find their feature is already being discussed. Help us keep duplicates to a minimum by searching for your feature first to see if it is already in progress.

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
<!--
**Commits Must Be Signed and Your PR title must conform to the conventional commit spec**
  * See: https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md
  * Please follow the section on `pre-commit hooks`, a linter will validate before you push

  Example PR Title: <type>(<scope>)!: <description>

  * `type` = bug, chore, feat, fix, docs, build, style, refactor, perf, test
  * `!` = OPTIONAL: signals a breaking change
  * `scope` = Optional when `type` is "chore" or "docs"
  * `description` = short description of the change

Examples:

  * chore(examples): Clarify `docker` usage #120
  * docs(readme): Fix Example Links #121
  * feat(ingest)!: Add new ingest #122
  * fix(ingest): Refactor for loop to list comprehension #123
-->


================================================
FILE: .github/actions/builder/action.yml
================================================
name: Build Dragonfly
description: "Build Dragonfly with configurable CMake options"

inputs:
  build-type:
    description: "CMake build type (Debug or Release)"
    required: false
    default: 'Debug'
    type: string
  build-dir:
    description: "Build directory name (relative to workspace root)"
    required: false
    default: 'build'
    type: string
  c-compiler:
    description: "C compiler to use"
    required: false
    default: ''
    type: string
  cxx-compiler:
    description: "C++ compiler to use"
    required: false
    default: ''
    type: string
  cxx-flags:
    description: "C++ compiler flags"
    required: false
    default: '-no-pie'
    type: string
  sanitizers:
    description: "Enable sanitizers (NoSanitizers or Sanitizers)"
    required: false
    default: 'NoSanitizers'
    type: string
  with-aws:
    description: "Build with AWS support"
    required: false
    default: 'ON'
    type: string
  targets:
    description: "Build targets to compile"
    required: false
    default: 'src/all'
    type: string

runs:
  using: "composite"
  steps:
    - name: Configure CMake
      shell: bash
      run: |
        # Set sanitizer flags
        ASAN="OFF"
        USAN="OFF"
        if [ '${{ inputs.sanitizers }}' = 'Sanitizers' ]; then
          echo "Enabling ASAN/USAN"
          ASAN="ON"
          USAN="ON"
        fi

        # Build cmake command array
        CMAKE_CMD=(cmake
          -B "${GITHUB_WORKSPACE}/${{ inputs.build-dir }}"
          -DCMAKE_BUILD_TYPE="${{ inputs.build-type }}"
          -GNinja
        )

        # Add optional compiler flags
        if [ -n "${{ inputs.c-compiler }}" ]; then
          CMAKE_CMD+=(-DCMAKE_C_COMPILER="${{ inputs.c-compiler }}")
        fi
        if [ -n "${{ inputs.cxx-compiler }}" ]; then
          CMAKE_CMD+=(-DCMAKE_CXX_COMPILER="${{ inputs.cxx-compiler }}")
        fi
        if [ -n "${{ inputs.cxx-flags }}" ]; then
          CMAKE_CMD+=(-DCMAKE_CXX_FLAGS="${{ inputs.cxx-flags }}")
        fi

        # Add fixed options
        CMAKE_CMD+=(
          -DPRINT_STACKTRACES_ON_SIGNAL=ON
          -DWITH_AWS="${{ inputs.with-aws }}"
          -DWITH_GCP=OFF
          -DWITH_UNWIND=OFF
          -DWITH_GPERF=OFF
          -DWITH_ASAN="${ASAN}"
          -DWITH_USAN="${USAN}"
        )

        # Execute CMake
        echo "Running: ${CMAKE_CMD[@]}"
        "${CMAKE_CMD[@]}"

    - name: Build
      shell: bash
      run: |
        cd ${GITHUB_WORKSPACE}/${{ inputs.build-dir }}
        echo "Building target: ${{ inputs.targets }}"
        ninja ${{ inputs.targets }}


================================================
FILE: .github/actions/fuzzing/action.yml
================================================
name: Run AFL++ Fuzzing
description: "Run AFL++ fuzzing campaign with configurable parameters"

inputs:
  mode:
    description: "Fuzzing mode: 'smoke' (stop on first crash) or 'long' (collect all crashes)"
    required: true
    type: string
  target:
    description: "Fuzz target: 'resp' or 'memcache'"
    required: false
    default: 'resp'
    type: string
  duration-minutes:
    description: "Fuzzing duration in minutes"
    required: true
    type: string
  run-number:
    description: "GitHub run number for artifact naming"
    required: true
    type: string
  extra-seeds-dir:
    description: "Directory with additional seed files (initial fuzzer inputs) to merge into the corpus"
    required: false
    default: ''
  focus-commands:
    description: "JSON list of command names for the mutator to prefer (~70% selection weight)"
    required: false
    default: ''
  build:
    description: "Build the binary before fuzzing. Set to 'false' when reusing a binary built by a previous action call in the same job — fails if the binary is missing."
    required: false
    default: 'true'

outputs:
  hang_count:
    description: "Number of unique hangs found during fuzzing"
    value: ${{ steps.analyze.outputs.hang_count }}
  crash_count:
    description: "Number of unique crashes found during fuzzing"
    value: ${{ steps.analyze.outputs.crash_count }}

runs:
  using: "composite"
  steps:
    - name: Verify AFL++ installation
      shell: bash
      run: |
        echo "Verifying AFL++ installation..."
        afl-fuzz -h | head -5 || true

        # Verify AFL++ compilers are available
        which afl-clang-fast
        which afl-clang-fast++
        afl-clang-fast --version

    - name: Configure system for fuzzing
      shell: bash
      run: |
        echo "Configuring system for AFL++ fuzzing..."
        afl-system-config || true
        echo core > /proc/sys/kernel/core_pattern || echo "Warning: Could not set core_pattern"
        echo "System configured"

    - name: Build Dragonfly with AFL++
      shell: bash
      run: |
        if [ "${{ inputs.build }}" = "false" ]; then
          if [ ! -f "./build-dbg/dragonfly" ]; then
            echo "::error::build=false but binary not found at ./build-dbg/dragonfly"
            exit 1
          fi
          echo "Skipping build, reusing existing binary"
          ls -lh ./build-dbg/dragonfly
        else
          echo "Building Dragonfly with AFL++ instrumentation..."
          ./helio/blaze.sh -DUSE_AFL:BOOL=ON
          cd ./build-dbg && ninja dragonfly && cd ..
          echo "Build complete"
          ls -lh ./build-dbg/dragonfly
        fi

    - name: Merge targeted seeds
      shell: bash
      if: ${{ inputs.extra-seeds-dir != '' }}
      run: |
        EXTRA_DIR="${{ inputs.extra-seeds-dir }}"
        SEEDS_DIR="fuzz/seeds/${{ inputs.target }}"

        # Copy only seed files, skip metadata like focus_commands.json
        COUNT=$(find "$EXTRA_DIR" -maxdepth 1 -type f ! -name '*.json' 2>/dev/null | wc -l)
        if [ "$COUNT" -gt 0 ]; then
          echo "Merging ${COUNT} targeted seeds into corpus"
          find "$EXTRA_DIR" -maxdepth 1 -type f ! -name '*.json' -exec cp -t "$SEEDS_DIR/" {} +
        else
          echo "No targeted seed files to merge"
        fi

    - name: Run AFL++ fuzzing
      shell: bash
      run: |
        MODE="${{ inputs.mode }}"
        DURATION_MINUTES="${{ inputs.duration-minutes }}"

        echo "Starting AFL++ fuzzing..."
        echo "Configuration:"
        echo "  Target: ${{ inputs.target }}"
        echo "  Mode: ${MODE}"
        echo "  Duration: ${DURATION_MINUTES} minutes"

        cd fuzz
        export BUILD_DIR="${GITHUB_WORKSPACE}/build-dbg"

        # Run fuzzer with timeout
        timeout ${DURATION_MINUTES}m ./run_fuzzer.sh "${{ inputs.target }}" || EXIT_CODE=$?

        # timeout returns 124 if it timed out (expected), 0 if finished naturally
        if [ "${EXIT_CODE:-0}" -eq 124 ]; then
          echo "Fuzzing completed (timeout reached)"
        elif [ "${EXIT_CODE:-0}" -eq 0 ]; then
          echo "Fuzzing completed normally"
        else
          echo "::error::Fuzzer failed with exit code ${EXIT_CODE}"
          exit 1
        fi
      env:
        # Mode-specific environment variables
        AFL_BENCH_UNTIL_CRASH: ${{ inputs.mode == 'smoke' && '1' || '' }}
        AFL_NO_UI: 1
        AFL_AUTORESUME: 1
        AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES: 1
        AFL_TESTCACHE_SIZE: ${{ inputs.mode == 'smoke' && '50' || '500' }}
        AFL_SKIP_CPUFREQ: 1
        AFL_FAST_CAL: ${{ inputs.mode == 'long' && '1' || '' }}
        AFL_PERSISTENT_RECORD: 1000
        AFL_CUSTOM_MUTATOR_ONLY: 1
        FUZZ_FOCUS_COMMANDS: ${{ inputs.focus-commands }}

    - name: Analyze fuzzing results
      shell: bash
      if: always()
      id: analyze
      run: |
        echo "Analyzing fuzzing results..."

        TARGET="${{ inputs.target }}"
        CRASHES_DIR="fuzz/artifacts/${TARGET}/default/crashes"
        HANGS_DIR="fuzz/artifacts/${TARGET}/default/hangs"
        QUEUE_DIR="fuzz/artifacts/${TARGET}/default/queue"

        # Count results
        CRASH_COUNT=0
        HANG_COUNT=0
        CORPUS_SIZE=0

        if [ -d "$CRASHES_DIR" ]; then
          CRASH_COUNT=$(find "$CRASHES_DIR" -maxdepth 1 -type f -name 'id:*' 2>/dev/null | wc -l)
        fi

        if [ -d "$HANGS_DIR" ]; then
          HANG_COUNT=$(find "$HANGS_DIR" -maxdepth 1 -type f -name 'id:*' ! -name 'RECORD:*' 2>/dev/null | wc -l)
        fi

        if [ -d "$QUEUE_DIR" ]; then
          CORPUS_SIZE=$(find "$QUEUE_DIR" -type f ! -name ".state" 2>/dev/null | wc -l)
        fi

        echo "Fuzzing Results:"
        echo "   Crashes: $CRASH_COUNT"
        echo "   Hangs: $HANG_COUNT"
        echo "   Corpus size: $CORPUS_SIZE"

        # Show statistics for long mode
        if [ "${{ inputs.mode }}" = "long" ]; then
          STATS_FILE="fuzz/artifacts/${TARGET}/default/fuzzer_stats"
          if [ -f "$STATS_FILE" ]; then
            echo ""
            echo "Key Statistics:"
            grep -E "execs_done|execs_per_sec|paths_total|corpus_count|unique_crashes|unique_hangs|last_crash|last_hang" "$STATS_FILE" || true
          fi
        fi

        echo "hang_count=${HANG_COUNT}" >> "$GITHUB_OUTPUT"
        echo "crash_count=${CRASH_COUNT}" >> "$GITHUB_OUTPUT"

        # Fail the job if crashes or hangs were found
        if [ "$CRASH_COUNT" -gt 0 ]; then
          echo "::error::Found $CRASH_COUNT crash(es)!"
          echo ""
          echo "Crash input files (excluding RECORD):"
          find "$CRASHES_DIR" -maxdepth 1 -name 'id:*' ! -name 'RECORD:*' -type f | sort || true
          exit 1
        fi

        if [ "$HANG_COUNT" -gt 0 ]; then
          echo "::error::Found $HANG_COUNT hang(s)!"
          echo ""
          echo "Hang input files (excluding RECORD):"
          find "$HANGS_DIR" -maxdepth 1 -name 'id:*' ! -name 'RECORD:*' -type f | sort || true
          exit 1
        fi

        if [ "$CORPUS_SIZE" -gt 0 ]; then
          echo "No crashes found - fuzzing test passed!"
        else
          echo "No fuzzing artifacts found (fuzzer may not have started)"
        fi

    - name: Package crash artifacts
      shell: bash
      if: failure() && steps.analyze.outputs.crash_count > 0
      run: |
        CRASHES_DIR="$(pwd)/fuzz/artifacts/${{ inputs.target }}/default/crashes"

        if [ ! -d "$CRASHES_DIR" ] || [ -z "$(ls -A "$CRASHES_DIR" 2>/dev/null)" ]; then
          echo "No crash artifacts to package"
          exit 0
        fi

        echo "Raw crash directory contents:"
        ls -la "$CRASHES_DIR"

        mkdir -p fuzz/packaged

        # Find crash input files (not RECORD files)
        find "$CRASHES_DIR" -maxdepth 1 -name 'id:*' ! -name 'RECORD:*' -type f | while read -r f; do
          CRASH_ID=$(basename "$f" | sed 's/^id:\([0-9]*\),.*/\1/')
          echo "Packaging crash ${CRASH_ID}..."
          if ( cd fuzz && ./package_crash.sh "$CRASH_ID" "$CRASHES_DIR" ); then
            mv "fuzz/crash-${CRASH_ID}.tar.gz" fuzz/packaged/ 2>/dev/null || true
          else
            echo "Warning: failed to package crash ${CRASH_ID}, continuing..."
          fi
        done

        echo "Packaged crashes:"
        ls -lh fuzz/packaged/ 2>/dev/null || echo "  (none)"

    - name: Upload crash artifacts
      if: failure() && steps.analyze.outputs.crash_count > 0
      uses: actions/upload-artifact@v4
      with:
        name: fuzz-${{ inputs.mode }}-${{ inputs.target }}-crashes-${{ inputs.run-number }}
        path: |
          fuzz/packaged/*.tar.gz
          fuzz/artifacts/${{ inputs.target }}/default/fuzzer_stats
        retention-days: 10
        if-no-files-found: ignore

    - name: Package hang artifacts
      shell: bash
      if: failure() && steps.analyze.outputs.hang_count > 0
      run: |
        HANGS_DIR="fuzz/artifacts/${{ inputs.target }}/default/hangs"

        if [ ! -d "$HANGS_DIR" ] || [ -z "$(ls -A "$HANGS_DIR" 2>/dev/null)" ]; then
          echo "No hang artifacts to package"
          exit 0
        fi

        mkdir -p fuzz/packaged_hangs
        tar -czf "fuzz/packaged_hangs/hangs-${{ inputs.target }}.tar.gz" \
          -C "$(dirname "$HANGS_DIR")" hangs/

        echo "Packaged hangs:"
        ls -lh fuzz/packaged_hangs/

    - name: Upload hang artifacts
      if: failure() && steps.analyze.outputs.hang_count > 0
      uses: actions/upload-artifact@v4
      with:
        name: fuzz-${{ inputs.mode }}-${{ inputs.target }}-hangs-${{ inputs.run-number }}
        path: |
          fuzz/packaged_hangs/*.tar.gz
          fuzz/artifacts/${{ inputs.target }}/default/fuzzer_stats
        retention-days: 10
        if-no-files-found: ignore


================================================
FILE: .github/actions/lint-test-chart/action.yml
================================================
name: Lint test chart
description: "Run lint test chart"

runs:
  using: "composite"
  steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.9"
          check-latest: true

      - name: Chart Rendering Tests
        shell: bash
        run: |
          go test -v ./contrib/charts/dragonfly/...

      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.6.1

      - name: Run chart-testing (list-changed)
        id: list-changed
        shell: bash
        run: |
          changed=$(ct list-changed --config .ct.yaml)
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> $GITHUB_OUTPUT
          fi

      - name: Run chart-testing (lint)
        shell: bash
        run: |
          ct \
            lint \
            --config .ct.yaml \
            ${{github.event_name == 'workflow_dispatch' && '--all'}} ;

      - name: Create kind cluster
        uses: helm/kind-action@v1

      - name: Install Dependencies
        shell: bash
        run: |
          curl -sL https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.73.0/bundle.yaml | kubectl create -f -

      - name: Getting cluster ready
        shell: bash
        run: |
          kubectl label nodes chart-testing-control-plane key/node-kind=high-memory

      - name: Run chart-testing (install)
        shell: bash
        run: |
          ct \
            install \
            --config .ct.yaml \
            --debug \
            --helm-extra-set-args "--set=image.repository=ghcr.io/${{ github.repository }},probes=null" \
            ${{github.event_name == 'workflow_dispatch' && '--all'}} ;


================================================
FILE: .github/actions/multi-registry-docker-login/action.yml
================================================
name: 'Multi-Registry Docker Login'
description: 'Authenticate with both GHCR and Google Artifact Registry'
inputs:
  GITHUB_TOKEN:
    description: 'GitHub token for GHCR'
    required: true
  GCP_SA_KEY:
    description: 'Google Service Account JSON key'
    required: true

runs:
  using: "composite"
  steps:
    - name: Login to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ inputs.GITHUB_TOKEN }}

    - name: Login to Google Artifact Registry
      uses: docker/login-action@v3
      with:
        registry: us-central1-docker.pkg.dev
        username: _json_key
        password: ${{ inputs.GCP_SA_KEY }}


================================================
FILE: .github/actions/regression-tests/action.yml
================================================
name: Regression Tests
description: "Run regression tests"

inputs:
  dfly-executable:
    required: true
    type: string
  gspace-secret:
    required: false
    type: string
  run-only-on-ubuntu-latest:
    # 'true' or 'false' cause boolean
    # is not supported in composite actions
    required: true
    type: string
  build-folder-name:
    required: true
    type: string
  filter:
    required: false
    type: string
  aws-access-key-id:
    required: false
    type: string
    description: "AWS access key ID (optional if using OIDC - credentials set by workflow)"
  aws-secret-access-key:
    required: false
    type: string
    description: "AWS secret access key (optional if using OIDC - credentials set by workflow)"
  s3-bucket:
    required: true
    type: string
  epoll:
    required: false
    type: string

runs:
  using: "composite"
  # bring back timeouts once composite actions start supporting them
  # timeout-minutes: 20
  steps:
    - name: Sync valkey-search tests
      uses: ./.github/actions/sync-valkey-tests

    - name: Free disk space
      if: contains(runner.labels, 'self-hosted') == false
      shell: bash
      run: |
        echo "===================Before freeing up space ============================================"
        df -h
        rm -rf /hostroot/usr/share/dotnet
        rm -rf /hostroot/usr/local/share/boost
        rm -rf /hostroot/usr/local/lib/android
        rm -rf /hostroot/opt/ghc
        echo "===================After freeing up space ============================================"
        df -h

    - name: Install Python test requirements
      shell: bash
      run: |
        cd ${GITHUB_WORKSPACE}/tests
        # https://peps.python.org/pep-0668/#keep-the-marker-file-in-container-images
        if compgen -G '/usr/lib/python3.*/EXTERNALLY-MANAGED' > /dev/null; then
          pip3 install --break-system-packages -r dragonfly/requirements.txt
        else
          pip3 install -r dragonfly/requirements.txt
        fi

    - name: Run S3 snapshot tests with MinIO
      if: inputs.s3-bucket != ''
      shell: bash
      run: |
        echo "=== Running S3 snapshot tests with local MinIO ==="
        cd ${GITHUB_WORKSPACE}/tests

        export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"

        # MinIO binary is downloaded and started by conftest.py when MINIO_S3_ENDPOINT is set
        MINIO_S3_ENDPOINT=http://localhost:9000 timeout 10m pytest -k "s3" --timeout=300 --color=yes dragonfly/snapshot_test.py --log-cli-level=INFO -v

    - name: Run PyTests
      id: main
      shell: bash
      run: |
        ls -l ${GITHUB_WORKSPACE}/
        cd ${GITHUB_WORKSPACE}/tests
        echo "Current commit is ${{github.sha}}"
        # used by PyTests
        export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"
        export ROOT_DIR="${GITHUB_WORKSPACE}/tests/dragonfly/valkey_search"
        export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1 # to crash on errors
        export FILTER="${{inputs.filter}}"

        # Exclude large tests unless explicitly requested
        if [[ "$FILTER" == "large" ]]; then
          : # keep as-is, run only large tests
        elif [[ -n "$FILTER" ]]; then
          FILTER="(not large) and ($FILTER)"
        else
          FILTER="not large"
        fi

        if [[ "${{inputs.epoll}}" == 'epoll' ]]; then
          FILTER="$FILTER and not exclude_epoll"
          # Run only replication tests with epoll
          timeout 80m pytest -m "$FILTER" --durations=10 --timeout=300 --color=yes --json-report --json-report-file=report.json dragonfly --df force_epoll=true --log-cli-level=INFO || code=$?
        else
          # Run only replication tests with iouring
          timeout 80m pytest -m "$FILTER" --durations=10 --timeout=300 --color=yes --json-report --json-report-file=report.json dragonfly --log-cli-level=INFO || code=$?
        fi

        # timeout returns 124 if we exceeded the timeout duration
        if [[ $code -eq 124 ]]; then
          # Add an extra new line here because when tests timeout the first line below continues from the test failure name
          echo "\n"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 TESTS TIMEDOUT 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          # Copy the last log file because we timedout and pytest did not copy it over
          # the /tmp/failed/ folder
          cat /tmp/last_test_log_dir.txt | xargs -I {} mv {}/ /tmp/failed/
          exit 1
        fi

        # when a test fails in pytest it returns 1 but there are other return codes as well so we just check if the code is non zero
        if [[ $code -ne 0 ]]; then
          exit 1
        fi
      env:
        # Add environment variables to enable the S3 snapshot test.
        # AWS credentials: if inputs provided, use them; otherwise rely on workflow OIDC auth
        DRAGONFLY_S3_BUCKET: ${{ inputs.s3-bucket }}
        AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id || env.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key || env.AWS_SECRET_ACCESS_KEY }}
        AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
        AWS_REGION: ${{ env.AWS_REGION || 'us-east-1' }}

    - name: Send notification on failure
      if: failure() && github.ref == 'refs/heads/main'
      shell: bash
      run: |
        get_failed_tests() {
          local report_file=$1
          echo $(jq -r '.tests[] | select(.outcome == "failed") | .nodeid' "$report_file")
        }
        cd ${GITHUB_WORKSPACE}/tests
        failed_tests=""
        if [ -f report.json ]; then
          failed_tests=$(get_failed_tests report.json)
        fi

        KIND="iouring"
        if [[ "${{inputs.epoll}}" == 'epoll' ]]; then
          KIND="epoll"
        fi

        job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
        message="Regression $KIND tests failed.\\n The commit is: ${{github.sha}}.\\n $failed_tests \\n Job Link: ${job_link}\\n"

        curl -s \
          -X POST \
          -H 'Content-Type: application/json' \
          '${{ inputs.gspace-secret }}' \
          -d '{"text": "'"${message}"'"}'
    - name: Copy binary on a self hosted runner
      if: failure() && contains(runner.labels, 'self-hosted')
      shell: bash
      run: |
        cd ${GITHUB_WORKSPACE}/build
        timestamp=$(date +%Y-%m-%d_%H:%M:%S)
        mv ./dragonfly /var/crash/dragonfly_${timestamp}


================================================
FILE: .github/actions/repeat/action.yml
================================================
name: Run Tests On Repeat
description: "Repeat specific tests"

inputs:
  dfly-executable:
    required: true
    type: string
  run-only-on-ubuntu-latest:
    required: true
    type: string
  build-folder-name:
    required: true
    type: string
  expression:
    required: false
    type: string
  aws-access-key-id:
    required: false
    type: string
    description: "AWS access key ID (optional if using OIDC - credentials set by workflow)"
  aws-secret-access-key:
    required: false
    type: string
    description: "AWS secret access key (optional if using OIDC - credentials set by workflow)"
  s3-bucket:
    required: true
    type: string
  count:
    required: true
    type: number
  timeout:
    required: true
    type: string
  epoll:
    required: true
    type: string
  vmodule_expression:
    required: true
    type: string

runs:
  using: "composite"
  steps:
    - name: Repeat pytests
      id: main
      shell: bash
      run: |
        ls -l ${GITHUB_WORKSPACE}/
        cd ${GITHUB_WORKSPACE}/tests
        echo "Current commit is ${{github.sha}}"
        pip3 install -r dragonfly/requirements.txt
        # used by PyTests
        export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}"
        export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1 # to crash on errors
        if [[ "${{ inputs.epoll }}" == "epoll" ]]; then
          FORCE_EPOLL="--df force_epoll=true"
        else
          FORCE_EPOLL=""
        fi
        if [[ $"{{ inputs.vmodule_expression }}" != "" ]]; then
          VMOD="--df vmodule=${{ inputs.vmodule_expression }}"
        else
          VMOD=""
        fi
        echo Running command: timeout ${{ inputs.timeout }} pytest ${{ inputs.expression }} --drop-data-after-each-test ${FORCE_EPOLL} ${VMOD} --color=yes --json-report --json-report-file=report.json --log-cli-level=DEBUG --count=${{ inputs.count }}
        timeout ${{ inputs.timeout }} pytest ${{ inputs.expression }} --drop-data-after-each-test ${FORCE_EPOLL} ${VMOD} --color=yes --json-report --json-report-file=report.json --log-cli-level=DEBUG --count=${{ inputs.count }} || code=$?
        # timeout returns 124 if we exceeded the timeout duration
        if [[ $code -eq 124 ]]; then
          # Add an extra new line here because when tests timeout the first line below continues from the test failure name
          echo "\n"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 TESTS TIMEDOUT 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          echo "🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑 🛑"
          # Copy the last log file because we timedout and pytest did not copy it over
          # the /tmp/failed/ folder
          cat /tmp/last_test_log_dir.txt | xargs -I {} mv {}/ /tmp/failed/
          exit 1
        fi

        # when a test fails in pytest it returns 1 but there are other return codes as well so we just check if the code is non zero
        if [[ $code -ne 0 ]]; then
          exit 1
        fi
      env:
        # Add environment variables to enable the S3 snapshot test.
        # AWS credentials: if inputs provided, use them; otherwise rely on workflow OIDC auth
        DRAGONFLY_S3_BUCKET: ${{ inputs.s3-bucket }}
        AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id || env.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key || env.AWS_SECRET_ACCESS_KEY }}
        AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
        AWS_REGION: ${{ env.AWS_REGION || 'us-east-1' }}


================================================
FILE: .github/actions/sync-valkey-tests/action.yml
================================================
name: Sync valkey-search tests
description: "Synchronizes valkey-search tests using a fixed revision"

runs:
  using: composite
  steps:
    - name: Sync valkey-search tests
      shell: bash
      run: |
        cd ${GITHUB_WORKSPACE}/tests/dragonfly/valkey_search
        # main branch revision
        ./sync-valkey-search-tests.sh 90124dc91756b24cb2e58e5c4eea5b8d53004ea6


================================================
FILE: .github/actions/test-docker/action.yml
================================================
name: Test Docker Image

inputs:
  image_id:
    required: true
    type: string
  name:
    required: true
    type: string

runs:
  using: "composite"
  steps:
    - name: Test Image
      shell: bash
      run: |
        echo "Testing ${{ inputs.name }} image"
        docker pull ${{inputs.image_id}}
        docker image inspect ${{inputs.image_id}}

        # docker run with port-forwarding
        docker run  --name test -d -p 6379:6379 ${{inputs.image_id}}
        until [ "`docker inspect -f {{.State.Health.Status}} test`"=="healthy" ]; do
          sleep 0.1;
        done;


================================================
FILE: .github/bullmq-skipped-tests.txt
================================================
# BullMQ tests excluded from CI runs against Dragonfly
#
# Format: one pattern per line (used as JS regex in mocha --grep --invert)
# Categories:
#   DRAGONFLY_BUG  - Dragonfly does not support this behaviour yet
#   FLAKY          - Test has race conditions / timing issues unrelated to Dragonfly

# ── DRAGONFLY BUG ────────────────────────────────────────────────────────────
# BullMQ Lua scripts access keys that are not declared in KEYS[].
# Dragonfly enforces strict Lua key declaration; allow-undeclared-keys causes
# global transaction mode and breaks other tests.
handle errors.*for flows
Flows - addBulk.*handle errors

# Job.finished: job hash persists after removeOnComplete instead of being deleted.
rejects with missing key for job message

# ── FLAKY ─────────────────────────────────────────────────────────────────────
# deduplication key removal races with the 'deduplicated' QueueEvents listener.
# XREAD from '$' is noted as unstable in upstream BullMQ code.
removes deduplication key

# QueueEvents 'waiting' event: XREAD from '$' is unstable on CI.
# Upstream comment: "additional delay since XREAD from '$' is unstable"
emits waiting when a job has been added

# getWorkers: race between worker 'ready' event and assertion.
gets all workers for this queue only

# getWorkers (shared connection): upstream test file has comment
# "Test is very flaky on CI, so we skip it for now."
gets all workers for a given queue

# Job Scheduler monthly repeat: sinon fake-timer races with real Redis async ops.
# The worker loop does not advance in time before the 200 s timeout expires.
should repeat 7:th day every month at 9:25


================================================
FILE: .github/copilot-instructions.md
================================================
---
description: 'Code review guidelines for GitHub copilot in this project'
applyTo: '**'
excludeAgent: ["coding-agent"]
---

# Code Review Instructions

Keep reviews high-signal and minimal. Only comment on real bugs with high confidence.

## Comment Only When
- The issue is a correctness, security, concurrency, or architecture problem.
- The impact is clear and non-trivial.
- You can point to concrete evidence in the diff (not speculation).

## Avoid
- Style, formatting, naming, or minor performance nits.
- Optional refactors or “nice to have” suggestions.
- Praise, restating the code, or long explanations.
- Duplicate comments for the same root cause.

## Review Style
- Be terse: 1-2 sentences per issue.
- Include file and line references when possible.
- If no issues are found, say “No issues found.”
- Provide concrete suggestions for fixes when possible, or examples to illustrate the problem.


================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

    open-pull-requests-limit: 1
    groups:
      actions:
        patterns:
          - "*"


  - package-ecosystem: "gomod"
    directories:
      - "/contrib/charts/dragonfly"
      - "/tools/replay"
    schedule:
      interval: "weekly"

    open-pull-requests-limit: 1
    #uncomment it to group dependency updates
    #groups:
      #go-mod:
        #patterns:
          #- "*"
    ignore:
      # Disable all updates except security updates
      #remove an item from ignore list to get dependency updates of that kind
      - dependency-name: "*"
        update-types:
          - "version-update:semver-major"
          - "version-update:semver-minor"
          - "version-update:semver-patch"


  - package-ecosystem: "pip"
    directories:
      - "/tests/dragonfly"
      - "/tools"
    schedule:
      interval: "weekly"

    #uncomment it to group dependency updates
    #groups:
      #py-dep:
        #patterns:
          #- "*"
    ignore:
      # Disable all updates except security updates
      #remove an item from ignore list to get dependency updates of that kind
      - dependency-name: "*"
        update-types:
          - "version-update:semver-major"
          - "version-update:semver-minor"
          - "version-update:semver-patch"


================================================
FILE: .github/instructions/code-review.instructions.md
================================================
---
description: 'Code review instructions for Dragonfly'
applyTo: '**'
excludeAgent: ["coding-agent"]
---

# Dragonfly Code Review Instructions

Dragonfly is a high-performance, Redis-compatible in-memory data store written in C++20 with a unique shared-nothing, fiber-based architecture. Code reviews must prioritize correctness, security, and architectural compliance specific to this threading model.

## Review Priorities

### 🔴 CRITICAL (Block merge immediately)

**Threading Model Violations** (causes deadlocks/crashes):
- ❌ **NEVER** use `std::thread`, `std::mutex`, `std::condition_variable`, or standard library threading primitives
- ✅ **ALWAYS** use fiber-aware equivalents: `util::fb2::Mutex`, `util::fb2::Fiber`, `util::fb2::CondVar` from `util/fibers/`

**Architecture Violations**:
- ❌ Cross-shard data access without proper synchronization
- ✅ Per-shard operations only (see `src/server/db_slice.cc` for patterns)

**Security Vulnerabilities**:
- Authentication/authorization bypass in ACL code (`src/server/acl/`)
- Exposed secrets, credentials in code or logs
- Buffer overflows, use-after-free, memory safety issues

**Correctness Issues**:
- Race conditions in fiber scheduling
- Logic errors in transaction handling (`src/server/transaction.cc`)
- Data corruption risks in DashTable operations (`src/core/dash.h`)

### 🟡 IMPORTANT (Requires discussion)

**Code Quality**:
- Missing error handling (should return `OpStatus` from `facade/op_status.h`)
- Obvious memory leaks (check ASAN reports)
- Performance bottlenecks in hot paths (unnecessary allocations, N+1 patterns)

**Test Coverage**:
- New features without tests (both C++ unit tests and Python integration tests)
- Changes to critical paths (transactions, replication, cluster) without test coverage
- Modified code that fails existing tests

**Style Violations** (severe only):
- Not following naming conventions: `snake_case` variables, `PascalCase` functions, `kPascalCase` constants
- Code that won't pass pre-commit hooks (clang-format, 100 char limit)

### 🟢 SUGGESTIONS (Non-blocking, comment only if obvious)

- Over-engineering: adding abstraction layers, feature flags, or configurability not requested
- Missing comments on complex fiber synchronization logic
- Premature optimization without profiling

## Dragonfly-Specific Patterns

### ✅ DO: Correct Patterns

**Threading & Synchronization**:
```cpp
// ✅ CORRECT: Fiber-aware mutex
util::fb2::Mutex mutex_;
std::lock_guard<util::fb2::Mutex> lock(mutex_);

// ✅ CORRECT: Fiber-aware operations
util::fb2::Fiber fb = util::fb2::Fiber("name", [&] { /* work */ });
```


**Per-Shard Design**:
```cpp
// ✅ CORRECT: Operate on shard-local data
void DbSlice::SomeOperation() {
  // Access only this shard's data
  auto& db_slice = cntx->ns->GetCurrentDbSlice();
}
```

### ❌ DON'T: Anti-Patterns

**Threading**:
```cpp
// ❌ WRONG: Standard library threading (causes deadlocks!)
std::mutex mutex_;
std::thread worker;
std::condition_variable cv_;
```

**Global State**:
```cpp
// ❌ WRONG: Global mutable state (breaks shared-nothing architecture)
static std::unordered_map<string, int> global_cache;
```

**Build Commands**:
- ❌ Don't suggest `./tools/docker/build.sh` or `make` for incremental builds
- ✅ Use `cd build-dbg && ninja <target>` instead

## Code Review Checklist

When reviewing Dragonfly code, verify:

1. **Architecture Compliance**:
   - [ ] No standard library threading primitives (`std::thread`, `std::mutex`)
   - [ ] No global mutable state
   - [ ] Fiber-aware synchronization used correctly
   - [ ] Follows per-shard, shared-nothing design

2. **Security**:
   - [ ] No OWASP vulnerabilities (injection, XSS, auth bypass)
   - [ ] No hardcoded secrets or credentials
   - [ ] Input validation on command arguments
   - [ ] Safe memory operations (no buffer overflows)

3. **Testing**:
   - [ ] New functionality has test coverage
   - [ ] Tests build and pass: `cd build-dbg && ninja <test> && ./<test>`
   - [ ] No test regressions

4. **Style & Formatting**:
   - [ ] Follows naming conventions (snake_case vars, PascalCase functions)
   - [ ] Will pass pre-commit checks (clang-format, 100 char limit)
   - [ ] Code compiles without warnings (CI uses `-Werror`)

5. **Helio Submodule**:
   - [ ] No direct edits to `helio/` directory (it's a git submodule)

## Common False Positives to Ignore

These are **NOT** issues in Dragonfly's design. Do not comment on:

1. **Single-threaded-looking code**: Per-shard operations intentionally avoid locks
2. **Custom allocators**: mimalloc is used intentionally for performance
3. **Manual memory management**: Required for performance-critical paths
4. **Complex template metaprogramming**: DashTable uses advanced C++20 features
5. **Missing const**: Not always applicable in high-performance code

## Review Style Guidelines

1. **Be specific**: Reference file:line, explain WHY it's wrong
2. **Show examples**: Demonstrate the correct pattern with code
3. **Prioritize**: Security and correctness over style
4. **Link to docs**: Reference `docs/df-share-nothing.md`, `docs/transaction.md`, etc.
5. **Be concise**: Dragonfly team values focused, actionable feedback

## Example Review Comments

**❌ BAD - Too noisy**:
> "Consider using auto here for type inference"

**✅ GOOD - Actionable and specific**:
> "🔴 CRITICAL: Line 42 uses `std::mutex`. This will cause fiber deadlocks. Replace with `util::fb2::Mutex` from helio/util/fibers/. See src/server/set_family.cc:123 for correct pattern."

**✅ GOOD - Security focused**:
> "🔴 SECURITY: Line 58 doesn't validate `user_input` before passing to eval(). Vulnerable to command injection. Add validation or use SafeEval()."

**✅ GOOD - Architecture violation**:
> "🟡 ARCHITECTURE: Line 91 accesses global `cache_map`. Dragonfly uses shared-nothing design - each shard must have its own cache. See docs/df-share-nothing.md"

---

**Key Files Reference**: See AGENTS.md for complete codebase structure, build commands, and testing procedures.


================================================
FILE: .github/workflows/benchmark.yml
================================================
name: benchmark-tests

on:
  schedule:
    - cron: "0 9 * * *" # run at 6 AM UTC
  workflow_dispatch:

permissions:
  contents: read

jobs:
  benchmark:
    if: github.repository == 'dragonflydb/dragonfly'
    strategy:
      matrix:
        config:
          - operator:
              apiVersion: "dragonflydb.io/v1alpha1"
              kind: "Dragonfly"
              metadata:
                labels:
                  app.kubernetes.io/name: "dragonfly"
                  app.kubernetes.io/instance: "dragonfly-sample"
                  app.kubernetes.io/part-of: "dragonfly-operator"
                  app.kubernetes.io/managed-by: "kustomize"
                  app.kubernetes.io/created-by: "dragonfly-operator"
                name: "dragonfly-sample"
              spec:
                image: "ghcr.io/dragonflydb/dragonfly:latest"
                args: ["--cache_mode"]
                replicas: 2
                resources:
                  requests:
                    cpu: "2"
                    memory: "2000Mi"
                  limits:
                    cpu: "2"
                    memory: "2000Mi"

    runs-on: ubuntu-latest

    container:
      image: ghcr.io/romange/benchmark-dev:latest
      options: --security-opt seccomp=unconfined

    permissions:
      id-token: write

    steps:
      - name: Setup namespace name
        id: setup
        run: echo "namespace=benchmark-$(date +"%Y-%m-%d-%s")" >> $GITHUB_OUTPUT

      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Update kube config
        run: aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION"
        env:
          AWS_REGION: ${{ vars.AWS_REGION }}
          EKS_CLUSTER_NAME: dev

      - name: Scale up
        run: |
          set -x
          aws autoscaling set-desired-capacity --auto-scaling-group-name "$AUTOSCALING_GROUP" --desired-capacity "$DESIRED_CAPACITY"
        env:
          AUTOSCALING_GROUP: ${{ vars.DEV_EKS_AS_GROUP }}
          DESIRED_CAPACITY: 1

      - name: Install the CRD and Operator
        run: |
          # Install the CRD and Operator
          kubectl apply -f https://raw.githubusercontent.com/dragonflydb/dragonfly-operator/main/manifests/dragonfly-operator.yaml

      - name: Apply Configuration
        run: |
          set -x
          kubectl create namespace ${{ steps.setup.outputs.namespace }} || true
          echo '${{ toJson(matrix.config.operator) }}' | kubectl apply -n ${{ steps.setup.outputs.namespace }} -f -

      - name: Wait For Service
        run: |
          set -x
          kubectl wait -n ${{ steps.setup.outputs.namespace }} dragonfly/dragonfly-sample --for=jsonpath='{.status.phase}'=ready --timeout=180s
          kubectl wait -n ${{ steps.setup.outputs.namespace }} pods --selector app=dragonfly-sample --for condition=Ready --timeout=120s
          kubectl describe -n ${{ steps.setup.outputs.namespace }} pod dragonfly-sample-0

      - name: Run Memtier Benchmark
        shell: bash
        run: |
          kubectl apply -n ${{ steps.setup.outputs.namespace }} -f tools/benchmark/k8s-benchmark-job.yaml

      - name: Version upgrade
        shell: bash
        run: |
          # benchmark is running, wait for 30 seconds before version upgrade
          sleep 30
          kubectl patch dragonfly dragonfly-sample -n ${{ steps.setup.outputs.namespace }}  --type merge -p '{"spec":{"image":"ghcr.io/dragonflydb/dragonfly-weekly:latest"}}'

      - name: Wait for Memtier Benchmark fail
        shell: bash
        run: |
          # Memtier benchmark run will fail at some point because old master shutdown on version upgrade
          kubectl wait --for=condition=failed --timeout=120s -n ${{ steps.setup.outputs.namespace }} jobs/memtier-benchmark 2>/dev/null
          kubectl logs -n ${{ steps.setup.outputs.namespace }} -f jobs/memtier-benchmark
          kubectl delete -n ${{ steps.setup.outputs.namespace }} jobs/memtier-benchmark

      - name: Run Memtier Benchmark again
        shell: bash
        run: |
          kubectl apply -n ${{ steps.setup.outputs.namespace }} -f tools/benchmark/k8s-benchmark-job.yaml

          while true; do
            if kubectl wait --for=condition=complete --timeout=0 -n ${{ steps.setup.outputs.namespace }} jobs/memtier-benchmark 2>/dev/null; then
              job_result=0
              break
            fi

            if kubectl wait --for=condition=failed --timeout=0 -n ${{ steps.setup.outputs.namespace }} jobs/memtier-benchmark 2>/dev/null; then
              job_result=1
              break
            fi

            sleep 3
          done

          kubectl logs -n ${{ steps.setup.outputs.namespace }} -f jobs/memtier-benchmark
          if [[ $job_result -eq 1 ]]; then
              exit 1
          fi

      - name: Server checks
        run: |
          nohup kubectl port-forward -n ${{ steps.setup.outputs.namespace }} service/dragonfly-sample 6379:6379 &
          pip install -r tools/requirements.txt
          python3 tools/benchmark/post_run_checks.py

      - name: Get Dragonfly logs
        uses: nick-fields/retry@v3
        if: always()
        with:
          timeout_minutes: 1
          max_attempts: 3
          command: |
            kubectl logs -n ${{ steps.setup.outputs.namespace }} dragonfly-sample-0

      - name: Get Dragonfly replica logs
        uses: nick-fields/retry@v3
        if: always()
        with:
          timeout_minutes: 1
          max_attempts: 3
          command: |
            kubectl logs -n ${{ steps.setup.outputs.namespace }} dragonfly-sample-1

      - name: Describe dragonflydb object
        uses: nick-fields/retry@v3
        if: always()
        with:
          timeout_minutes: 1
          max_attempts: 3
          command: |
            kubectl describe dragonflies.dragonflydb.io -n ${{ steps.setup.outputs.namespace }} dragonfly-sample

      - name: Scale down to zero
        if: always()
        run: |
          set -x
          aws autoscaling set-desired-capacity --auto-scaling-group-name "$AUTOSCALING_GROUP" --desired-capacity 0
        env:
          AUTOSCALING_GROUP: ${{ vars.DEV_EKS_AS_GROUP }}

      - name: Cleanup
        if: always()
        run: |
          set -x
          kubectl delete namespace ${{ steps.setup.outputs.namespace }}
          kubectl delete namespace dragonfly-operator-system

      - name: Send notification on failure
        if: failure() && github.ref == 'refs/heads/main'
        shell: bash
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Benchmark tests failed.\\n Job Link: ${job_link}\\n"

          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/bullmq-tests.yml
================================================
name: bullmq-tests

on:
  schedule:
    - cron: '0 7 * * *' # run at 7 AM daily
  workflow_dispatch:

permissions:
  contents: read

env:
  NODE_VERSION: "22.12.0"

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: ubuntu-latest
    name: Build
    timeout-minutes: 60

    container:
      image: ghcr.io/romange/ubuntu-dev:20-gcc14
      options: --security-opt seccomp=unconfined
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Build Dragonfly
        run: |
          cmake -B ${GITHUB_WORKSPACE}/build \
            -DCMAKE_BUILD_TYPE=Release \
            -DWITH_AWS=OFF \
            -DWITH_GCP=OFF \
            -DWITH_UNWIND=OFF \
            -DWITH_GPERF=OFF \
            -GNinja \
            -L
          cd ${GITHUB_WORKSPACE}/build && ninja dragonfly

      - name: Install Node.js
        run: |
          wget -q https://unofficial-builds.nodejs.org/download/release/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-x64-glibc-217.tar.xz
          tar -xf node-v${NODE_VERSION}-linux-x64-glibc-217.tar.xz
          cp -r node-v${NODE_VERSION}-linux-x64-glibc-217/* /usr/local/
          apt-get update && apt-get install -y jq redis-tools
          npm install -g yarn
          node --version
          yarn --version

      - name: Start Dragonfly
        run: |
          ${GITHUB_WORKSPACE}/build/dragonfly \
            --alsologtostderr \
            --cluster_mode=emulated \
            --lock_on_hashtags \
            --dbfilename= \
            --port 6379 &
          timeout 15s bash -c 'until redis-cli -p 6379 PING 2>/dev/null | grep -q PONG; do sleep 0.1; done'

      - name: Build BullMQ
        run: |
          cd ${GITHUB_WORKSPACE}
          git clone https://github.com/dragonflydb/bullmq
          cd bullmq
          yarn install
          yarn build

      - name: Run BullMQ tests
        run: |
          cd ${GITHUB_WORKSPACE}/bullmq
          SKIP_PATTERN=$(grep -v '^#' ${GITHUB_WORKSPACE}/.github/bullmq-skipped-tests.txt | grep -v '^[[:space:]]*$' | paste -sd '|' || true)
          if [ -n "${SKIP_PATTERN}" ]; then
            BULLMQ_TEST_PREFIX={b} yarn test --grep "${SKIP_PATTERN}" --invert
          else
            BULLMQ_TEST_PREFIX={b} yarn test
          fi

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: unit_logs
          path: /tmp/dragonfly.*

      - name: Send notification on failure
        if: failure() && github.ref == 'refs/heads/main'
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="BullMQ tests failed.\\n Commit: ${{github.sha}}\\n Job Link: ${job_link}\\n"

          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/ci.yml
================================================
name: ci-tests

on:
  # push:
  # branches: [ main ]
  pull_request:
    branches: [main]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  pre-commit:
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v6
        with:
          python-version: '3.12'
          cache: 'pip'
      - uses: actions/cache@v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
      - uses: pre-commit/action@v3.0.1
        with:
          extra_args: >-
            --show-diff-on-failure --color=always
            --from-ref ${{ github.event.pull_request.base.sha }}
            --to-ref ${{ github.event.pull_request.head.sha }}
  build:
    strategy:
      matrix:
        # Test of these containers
        container: ["ubuntu-dev:24", "alpine-dev:latest"]
        build-type: [Debug, Release]
        compiler: [{ cxx: g++, c: gcc }]
        # -no-pie to disable address randomization so we could symbolize stacktraces
        cxx_flags: ["-Werror -no-pie"]
        sanitizers: ["NoSanitizers"]
        include:
          - container: "alpine-dev:latest"
            build-type: Debug
            compiler: { cxx: clang++, c: clang }
            cxx_flags: ""
            sanitizers: "NoSanitizers"
          - container: "ubuntu-dev:24"
            build-type: Debug
            compiler: { cxx: clang++, c: clang }
            # https://maskray.me/blog/2023-08-25-clang-wunused-command-line-argument (search for compiler-rt)
            cxx_flags: "-Wno-error=unused-command-line-argument"
            sanitizers: "Sanitizers"

    runs-on: ubuntu-latest
    container:
      image: ghcr.io/romange/${{ matrix.container }}
      # Seems that docker by default prohibits running iouring syscalls
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /:/hostroot
        - /mnt:/mnt
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Prepare Environment
        run: |
          uname -a
          cmake --version
          mkdir -p ${GITHUB_WORKSPACE}/build
          mount

          echo "===================Before freeing up space ============================================"
          df -h
          rm -rf /hostroot/usr/share/dotnet
          rm -rf /hostroot/usr/local/share/boost
          rm -rf /hostroot/usr/local/lib/android
          rm -rf /hostroot/opt/ghc
          echo "===================After freeing up space ============================================"
          df -h
          touch /mnt/foo
          ls -la /mnt/foo

      - name: System diagnostics
        run: |
          echo "ulimit is"
          ulimit -s
          echo "-----------------------------"
          echo "disk space is:"
          df -h
          echo "-----------------------------"

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          c-compiler: ${{matrix.compiler.c}}
          cxx-compiler: ${{matrix.compiler.cxx}}
          cxx-flags: ${{matrix.cxx_flags}}
          sanitizers: ${{matrix.sanitizers}}
          with-aws: 'OFF'

      - name: PostFail
        if: failure()
        run: |
          echo "disk space is:"
          df -h

      - name: C++ Unit Tests - IoUring
        run: |
          cd ${GITHUB_WORKSPACE}/build
          echo Run ctest -V -L DFLY

          GLOG_alsologtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1,op_manager=1,op_manager_test=1 \
          FLAGS_fiber_safety_margin=4096 timeout 20m ctest -V -L DFLY -E allocation_tracker_test

          # Run allocation tracker test separately without alsologtostderr because it generates a TON of logs.
          FLAGS_fiber_safety_margin=4096 timeout 5m ./allocation_tracker_test

          timeout 5m ./dragonfly_test
          timeout 5m ./json_family_test --jsonpathv2=false
          timeout 5m ./tiered_storage_test --vmodule=db_slice=2 --logtostderr
          timeout 5m ./search_test --use_numeric_range_tree=false
          timeout 5m ./search_family_test --use_numeric_range_tree=false


      - name: C++ Unit Tests - Epoll
        run: |
          cd ${GITHUB_WORKSPACE}/build

          # Create a rule that automatically prints stacktrace upon segfault
          cat > ./init.gdb <<EOF
          catch signal SIGSEGV
          command
          bt
          end
          EOF

          gdb -ix ./init.gdb --batch -ex r --args ./dragonfly_test --force_epoll
          GLOG_alsologtostderr=1 FLAGS_fiber_safety_margin=4096 FLAGS_force_epoll=true GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 \
          timeout 20m ctest -V -L DFLY -E allocation_tracker_test

          FLAGS_fiber_safety_margin=4096 FLAGS_force_epoll=true timeout 5m ./allocation_tracker_test

      - name: C++ Unit Tests - IoUring with cluster mode
        run: |
          cd ${GITHUB_WORKSPACE}/build
          FLAGS_fiber_safety_margin=4096 FLAGS_cluster_mode=emulated timeout 20m ctest -V -L DFLY

      - name: C++ Unit Tests - IoUring with cluster mode and FLAGS_lock_on_hashtags
        run: |
          cd ${GITHUB_WORKSPACE}/build
          FLAGS_fiber_safety_margin=4096 FLAGS_cluster_mode=emulated FLAGS_lock_on_hashtags=true timeout 20m ctest -V -L DFLY

      - name: Upload unit logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: unit_logs
          path: /tmp/*INFO*

      - name: Run regression tests
        if: matrix.container == 'ubuntu-dev:24' && matrix.sanitizers == 'NoSanitizers'
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          run-only-on-ubuntu-latest: true
          build-folder-name: build
          # Non-release build will not run tests marked as opt_only
          # "not empty" string is needed for release build because pytest command can not get empty string for filter
          filter: ${{ matrix.build-type == 'Release' && 'not debug_only' || 'not opt_only' }}

      - name: Upload regression logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: regression_logs
          path: /tmp/failed/*

  lint-test-chart:
    runs-on: ubuntu-latest
    needs: [build]
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/lint-test-chart

  large-tests-arm:
    runs-on: CI-LARGE-ARM

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/ubuntu-dev:24
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash
        - /:/hostroot
        - /mnt:/mnt

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env
          lsblk -l

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: Release
          targets: 'dragonfly'

      - name: Authenticate to AWS
        # Runs if it's NOT a PR, OR if the PR originates from the same repository (not a fork)
        if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run large tests on ARM
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
          build-folder-name: build
          run-only-on-ubuntu-latest: true
          filter: large
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: large-tests-arm-logs
          path: /tmp/failed/*


================================================
FILE: .github/workflows/copilot-setup-steps.yml
================================================
# Copilot Build Environment Setup Steps
# This file contains steps to configure the Dragonfly build environment
# with AWS, GCP, GPERF, and UNWIND disabled for faster development builds

name: Copilot setup steps

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
  workflow_dispatch:
  push:
    paths:
      - .github/workflows/copilot-setup-steps.yml
  pull_request:
    paths:
      - .github/workflows/copilot-setup-steps.yml

jobs:
  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
  copilot-setup-steps:
    runs-on: ubuntu-latest
    permissions:
      contents: read

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Install required system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libboost-context-dev

      - name: Configure CMake build (Debug, no AWS/GCP)
        run: ./helio/blaze.sh -DWITH_AWS=OFF -DWITH_GCP=OFF -DWITH_GPERF=OFF

      - name: Install pre-commit
        run: pip3 install pre-commit

# Notes:
# - The build directory will be created at build-dbg/
# - Disabling AWS/GCP significantly speeds up compilation
# - WITH_GPERF=OFF disables Google Performance Tools
# - Use ninja for faster parallel builds


================================================
FILE: .github/workflows/cov.yml
================================================
name: Daily Coverage

on:
    schedule:
      - cron: '0 6 * * *' # run at 6 AM UTC
    workflow_dispatch:

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    # The CMake configure and build commands are platform agnostic and should work equally
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - container: "ubuntu-dev:24"
            build-type: Debug
            compiler: {cxx: g++, c: gcc}
            cxx_flags: "-fprofile-arcs -ftest-coverage"
    timeout-minutes: 300
    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /:/hostroot
        - /mnt:/mnt
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true
      - name: Delete Space
        run: |
          df -h
          rm -rf /hostroot/usr/share/dotnet
          rm -rf /hostroot/usr/local/share/boost
          rm -rf /hostroot/usr/local/lib/android
          rm -rf /hostroot/opt/ghc
          echo "***************After Deletion***************************"
          df -h
      - name: Install dependencies
        run: |
          uname -a
          cmake --version
          mkdir -p ${{github.workspace}}/build
          apt update && apt install -y lcov pip
      - name: Cache build deps
        id: cache-deps
        uses: actions/cache@v5
        with:
          path: |
            ~/.ccache
            ${{github.workspace}}/build/_deps
          key: ${{ runner.os }}-deps-${{ github.base_ref }}-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-deps-${{ github.base_ref }}-

      - name: Configure CMake
        run: |
          pip install -r tests/dragonfly/requirements.txt
          cmake -B build \
            -DCMAKE_BUILD_TYPE=${{matrix.build-type}} \
            -GNinja \
            -DCMAKE_C_COMPILER="${{matrix.compiler.c}}" \
            -DCMAKE_CXX_COMPILER="${{matrix.compiler.cxx}}" \
            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
            -DCMAKE_CXX_FLAGS="${{matrix.cxx_flags}}" \
            -L
          pwd
          cd build && pwd
      - name: Build
        run: |
          cd $GITHUB_WORKSPACE/build
          echo "-----------------------------"
          ninja src/all

      - name: Run C++ Unit Tests
        run: |
          cd $GITHUB_WORKSPACE/build
          ctest -V -L DFLY

      - name: Run Python Integration Tests
        run: |
          cd $GITHUB_WORKSPACE/build
          export DRAGONFLY_PATH=`pwd`/dragonfly
          pytest ../tests/dragonfly/ --durations=10 --timeout=300 --color=yes --log-cli-level=INFO

      - name: Generate Coverage Report
        run: |
          cd $GITHUB_WORKSPACE/build
          lcov -c -d . -o main_coverage.info
          lcov --remove main_coverage.info -o main_coverage.info '/usr/*' '*/_deps/*' '*/third_party/*'
          genhtml main_coverage.info --ignore-errors source --output-directory covout -p $GITHUB_WORKSPACE
          ls ./
          echo ls covout
          ls covout/
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          files: build/main_coverage.info
          fail_ci_if_error: true
          token: ${{ secrets.CODECOV_TOKEN }}
      - name: Upload coverage
        uses: actions/upload-artifact@v6
        with:
          name: coverage-report
          path: build/covout/
          if-no-files-found: error


================================================
FILE: .github/workflows/daily-builds.yml
================================================
name: daily-builds

on:
  schedule:
    - cron: '0 6 * * *' # run at 6 AM UTC
  workflow_dispatch:

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    # The CMake configure and build commands are platform agnostic and should work equally
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
    runs-on: ubuntu-latest
    name: Build ${{ matrix.name }}
    strategy:
      matrix:
        include:
          # Build with these flags
          - name: generic
            container: alpine-dev
            flags: "-DMARCH_OPT=-march=x86-64"
          - name: fedora
            container: fedora:30-gcc14

    timeout-minutes: 45

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Show compiler version
        run: |
          echo "=== Compiler Version ==="
          ${CXX:-g++} --version
          echo "=== CMake Version ==="
          cmake --version
          echo "=== glibc Version ==="
          ldd --version | head -1 || true
          mkdir -p $GITHUB_WORKSPACE/build

      - name: Configure & Build
        run: |
          cd $GITHUB_WORKSPACE/build
          cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja ${{ matrix.flags }}
          ninja src/all
      - name: Test
        run: |
            cd $GITHUB_WORKSPACE/build
            ctest -V -L DFLY

      - name: Send notification on failure
        if: failure() && github.ref == 'refs/heads/main'
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Daily build (${{ matrix.name }}) failed.\\n Commit: ${{github.sha}}\\n Job Link: ${job_link}\\n"

          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'

  build-macos:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: macos-15
    timeout-minutes: 45
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Install dependencies
        run: |

          # Remove Python3 symlinks in /usr/local/bin as workaround to brew update issues
          # https://github.com/actions/setup-python/issues/577
          rm /usr/local/bin/2to3* || :
          rm /usr/local/bin/idle3* || :
          rm /usr/local/bin/pydoc* || :
          rm /usr/local/bin/python3* || :
          brew update && brew install ninja boost automake zstd bison autoconf libtool

          mkdir -p $GITHUB_WORKSPACE/build

      - name: Configure & Build
        run: |
          cd $GITHUB_WORKSPACE/build

          export PATH=/opt/homebrew/bin:$PATH
          export PATH=/opt/homebrew/opt/bison/bin/:$PATH

          which bison
          bison --version

          # Check system clang version
          clang --version
          clang++ --version

          # Verify current macOS SDK
          xcrun --show-sdk-path

          autoconf --help
          autoreconf --help

          echo "*************************** START BUILDING **************************************"
          # Configure for using current macOS SDK
          export SDKROOT=$(xcrun --sdk macosx --show-sdk-path)
          echo "Using SDK: $SDKROOT"

          # Use system clang/clang++ with macOS SDK
          cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja \
            -DCMAKE_C_COMPILER=clang \
            -DCMAKE_CXX_COMPILER=clang++ \
            -DCMAKE_OSX_SYSROOT="$SDKROOT" \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=15.0

          ninja src/all

      - name: Test
        run: |
            cd $GITHUB_WORKSPACE/build
            ctest -V -L DFLY

      - name: Send notification on failure
        if: failure() && github.ref == 'refs/heads/main'
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Daily build (macOS) failed.\\n Commit: ${{github.sha}}\\n Job Link: ${job_link}\\n"

          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/docker-dev-release.yml
================================================
name: Development Docker Build

on:
  schedule:
    - cron: '15 0 * * *'
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  image: ghcr.io/dragonflydb/dragonfly-dev
  GCS_IMAGE: us-central1-docker.pkg.dev/dragonflydb-public/dragonfly-registry/dragonfly-dev

jobs:
  build_and_tag:
    if: github.repository == 'dragonflydb/dragonfly'
    name: Build and Push ${{matrix.flavor}} ${{ matrix.os.arch }} image
    strategy:
      matrix:
        flavor: [alpine,ubuntu]
        os:
          - image: ubuntu-24.04
            arch: amd64
          - image: ubuntu-24.04-arm
            arch: arm64

    runs-on: ${{ matrix.os.image }}
    permissions:
      contents: read
      packages: write
      id-token: write
    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 1
          submodules: true

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Registries
        uses: ./.github/actions/multi-registry-docker-login
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Get Build Information
        id: build_info
        run: |
          echo "short_sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT

      - name: Docker meta
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            ${{ env.image }}
            ${{ env.GCS_IMAGE }}
          tags: |
            type=sha,enable=true,prefix=${{ matrix.flavor}}-,suffix=-${{ matrix.os.arch }},format=short
          labels: |
            org.opencontainers.image.vendor=DragonflyDB LTD
            org.opencontainers.image.title=Dragonfly Development Image
            org.opencontainers.image.description=The fastest in-memory store
      - name: Build image
        id: build
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          provenance: false  # Prevent pushing a docker manifest
          tags: |
            ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
          file: tools/packaging/Dockerfile.${{ matrix.flavor }}-dev
          cache-from: type=gha,scope=tagged${{ matrix.flavor }}
          cache-to: type=gha,scope=tagged${{ matrix.flavor }},mode=max
          load: true  # Load the build images into the local docker.
      - name: Test Image
        run: |
          echo ${{ steps.build.outputs.digest }}
          image_tags=(${{ steps.metadata.outputs.tags }})

          # install redis-tools
          sudo apt-get install redis-tools -y

          for image_tag in "${image_tags[@]}"; do
            echo "Testing image: ${image_tag}"
            docker image inspect ${image_tag}
            echo "Testing ${{ matrix.flavor }} image"

            # docker run with port-forwarding
            docker run -d -p 6379:6379 ${image_tag}
            sleep 5
            redis-cli -h localhost ping | grep -q "PONG" || exit 1
            docker stop $(docker ps -q --filter ancestor=${image_tag})
          done

      - name: Extract and Upload Binaries
        if: matrix.flavor == 'ubuntu'  # Only run once per flavor
        run: |
          # Get the image tag
          image_tags=(${{ steps.metadata.outputs.tags }})
          image_tag=${image_tags[0]}

          # Extract version from the image
          echo "Extracting version from image..."
          VERSION=$(docker run --rm ${image_tag} dragonfly --version | sed -r "s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g" | head -n1 | cut -d' ' -f2 | cut -d'-' -f1)
          # Check if version starts with a release version (v*.*.*)
          if [[ ! $VERSION =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
            # Get the latest release version to use as prefix
            LATEST_RELEASE=$(curl -s https://api.github.com/repos/dragonflydb/dragonfly/releases/latest | jq -r .tag_name)
            VERSION="${LATEST_RELEASE}+${VERSION}"
          fi
          echo "Dragonfly version: $VERSION"

          echo "Extracting binary from ${image_tag} for ${{ matrix.os.arch }}"

          # Create a temporary container and copy the binary
          container_id=$(docker create ${image_tag})
          docker cp ${container_id}:/usr/local/bin/dragonfly ./dragonfly
          docker rm ${container_id}

          # Create a tar archive
          if [[ "${{ matrix.os.arch }}" == "arm64" ]]; then
            arch_name="aarch64"
          else
            arch_name="x86_64"
          fi
          tar_name="dragonfly-${arch_name}-dbgsym.tar.gz"
          tar czf ${tar_name} dragonfly

          # Upload to GCS
          echo "Uploading ${tar_name} to GCS"
          gcloud storage cp "$tar_name" "gs://${{ secrets.STAGING_BINARY_BUCKET }}/dragonfly/$VERSION/$tar_name"

          # Upload to AWS
          echo "Uploading ${tar_name} to AWS"
          aws s3 cp "$tar_name" "s3://${{ secrets.STAGING_BINARY_BUCKET }}/dragonfly/$VERSION/$tar_name"

          # Cleanup
          rm -f dragonfly ${tar_name}

    outputs:
      # matrix jobs outputs override each other, but we use the same sha
      # for all images, so we can use the same output name.
      sha: ${{ steps.build_info.outputs.short_sha }}

  merge_manifest:
    if: github.repository == 'dragonflydb/dragonfly'
    needs: [build_and_tag]
    runs-on: ubuntu-latest
    strategy:
      matrix:
        flavor: [alpine,ubuntu]
    steps:
      - name: checkout
        uses: actions/checkout@v6

      - name: Login to Registries
        uses: ./.github/actions/multi-registry-docker-login
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}

      - name: Merge and Push
        run: |
            # Function to create and push manifests for a given registry
            create_and_push_manifests() {
              local registry=$1
              local flavor=$2
              local sha=$3

              # Create and push the manifest like dragonfly-dev:alpine-<sha>
              local sha_tag="${registry}:${flavor}-${sha}"
              docker manifest create ${sha_tag} --amend ${sha_tag}-amd64 --amend ${sha_tag}-arm64
              docker manifest push ${sha_tag}

              # Create and push the manifest like dragonfly-dev:alpine
              local flavor_tag="${registry}:${flavor}"
              docker manifest create ${flavor_tag} --amend ${sha_tag}-amd64 --amend ${sha_tag}-arm64
              docker manifest push ${flavor_tag}
            }

            # GitHub Container Registry manifests
            create_and_push_manifests "${{ env.image }}" "${{ matrix.flavor }}" "${{ needs.build_and_tag.outputs.sha }}"

            # Google Artifact Registry manifests
            create_and_push_manifests "${{ env.GCS_IMAGE }}" "${{ matrix.flavor }}" "${{ needs.build_and_tag.outputs.sha }}"


================================================
FILE: .github/workflows/docker-release2.yml
================================================
name: Docker Release-v2

on:
  workflow_dispatch:
    inputs:
      TAG_NAME:
        description: 'Tag name that the major tag will point to'
        required: true
      PRERELEASE:
        description: 'Whether this is a prerelease'
        type: boolean
        required: true

  release:
    types: [published]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  TAG_NAME: ${{ github.event.inputs.TAG_NAME || github.event.release.tag_name }}
  IS_PRERELEASE: ${{ github.event.release.prerelease || github.event.inputs.PRERELEASE }}
  IMAGE: ghcr.io/dragonflydb/dragonfly
  GCS_IMAGE: us-central1-docker.pkg.dev/dragonflydb-public/dragonfly-registry/dragonfly

jobs:
  build_and_tag:
    name: Build and Push ${{matrix.flavor}} ${{ matrix.os.arch }} image
    strategy:
      matrix:
        flavor: [ubuntu]
        os:
          - image: ubuntu-24.04
            arch: amd64
          - image: ubuntu-24.04-arm
            arch: arm64

    runs-on: ${{ matrix.os.image }}
    permissions:
      contents: read
      packages: write
      id-token: write

    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          submodules: true
      - name: Set up Docker Build
        uses: docker/setup-buildx-action@v3

      - name: Login to Registries
        uses: ./.github/actions/multi-registry-docker-login
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}

      - name: Fetch release asset
        uses: dsaltares/fetch-gh-release-asset@1.1.2
        with:
          version: "tags/${{ env.TAG_NAME }}"
          regex: true
          file: "dragonfly-.*\\.tar\\.gz"
          target: 'releases/'
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract artifacts
        run: |
          echo "Event prerelease ${{ github.event.release.prerelease }}"
          echo "Input prerelease ${{ github.event.inputs.PRERELEASE }}"
          ls -l
          ls -l releases
          for f in releases/*.tar.gz; do tar xvfz $f -C releases; done
          rm releases/*.tar.gz

      - name: Docker meta
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            ${{ env.IMAGE }}
            ${{ env.GCS_IMAGE }}
          flavor: |
            latest=false
            prefix=${{ matrix.flavor}}-
            suffix=-${{ matrix.os.arch }}
          tags: |
            type=semver,pattern={{version}},enable=true,value=${{ env.TAG_NAME }}
            type=semver,pattern={{raw}},enable=true,value=${{ env.TAG_NAME }}
            type=ref,event=pr
          labels: |
            org.opencontainers.image.vendor=DragonflyDB LTD
            org.opencontainers.image.title=Dragonfly Production Image
            org.opencontainers.image.description=The fastest in-memory store
            org.opencontainers.image.version=${{ env.TAG_NAME }}

      - name: Build image
        id: build
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          provenance: false  # Prevent pushing a docker manifest
          tags: |
            ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
          file: tools/packaging/Dockerfile.${{ matrix.flavor }}-prod
          cache-from: type=gha,scope=prod-${{ matrix.flavor }}
          cache-to: type=gha,scope=prod-${{ matrix.flavor }},mode=max
          load: true  # Load the build images into the local docker.

      - name: Test Image
        uses: ./.github/actions/test-docker
        timeout-minutes: 1
        with:
          image_id: ${{ env.IMAGE }}@${{ steps.build.outputs.digest }}
          name: ${{ matrix.flavor }}-${{ matrix.os.arch }}

      - id: output-sha
        run: |
          echo "sha_${{ matrix.os.arch }}=${{ steps.build.outputs.digest }}" >> $GITHUB_OUTPUT
    outputs:
      sha_amd: ${{ steps.output-sha.outputs.sha_amd64 }}
      sha_arm: ${{ steps.output-sha.outputs.sha_arm64 }}

  merge_manifest:
    needs: [build_and_tag]
    runs-on: ubuntu-latest
    strategy:
      matrix:
        flavor: [ubuntu]
    steps:
      - name: checkout
        uses: actions/checkout@v6

      - name: Login to Registries
        uses: ./.github/actions/multi-registry-docker-login
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}

      - name: Merge and Push
        run: |
            # Function to create and push manifests for a given registry
            create_and_push_manifests() {
              local registry=$1
              local sha_amd=$2
              local sha_arm=$3
              local flavor=$4
              local tag_name=$5
              local is_prerelease=$6

              # Function for semantic version comparison
              # Returns true if current_version >= latest_version
              semver_cmp() {
                local current_version=$1
                local latest_version=$2
                local should_update=true

                # Extract major.minor.patch components
                IFS='.' read -ra CURRENT_PARTS <<< "$current_version"
                IFS='.' read -ra LATEST_PARTS <<< "$latest_version"

                # Pad arrays to same length for comparison
                while [ ${#CURRENT_PARTS[@]} -lt 3 ]; do CURRENT_PARTS+=(0); done
                while [ ${#LATEST_PARTS[@]} -lt 3 ]; do LATEST_PARTS+=(0); done

                # Compare major.minor.patch numerically
                if (( 10#${CURRENT_PARTS[0]} < 10#${LATEST_PARTS[0]} )); then
                  should_update=false
                elif (( 10#${CURRENT_PARTS[0]} == 10#${LATEST_PARTS[0]} )) && (( 10#${CURRENT_PARTS[1]} < 10#${LATEST_PARTS[1]} )); then
                  should_update=false
                elif (( 10#${CURRENT_PARTS[0]} == 10#${LATEST_PARTS[0]} )) && (( 10#${CURRENT_PARTS[1]} == 10#${LATEST_PARTS[1]} )) && (( 10#${CURRENT_PARTS[2]} < 10#${LATEST_PARTS[2]} )); then
                  should_update=false
                fi

                # Log debug info to stderr instead of stdout
                echo "Version comparison: current=${CURRENT_PARTS[0]}.${CURRENT_PARTS[1]}.${CURRENT_PARTS[2]} vs latest=${LATEST_PARTS[0]}.${LATEST_PARTS[1]}.${LATEST_PARTS[2]}" >&2

                # Return only the result
                echo $should_update
              }

              if [[ "$is_prerelease" == 'true' ]]; then
                # Create and push the manifest like dragonfly:alpha-ubuntu
                tag="${registry}:alpha-${flavor}"
                docker manifest create ${tag} --amend ${sha_amd} --amend ${sha_arm}
                docker manifest push ${tag}
              elif [[ "$flavor" == 'ubuntu' ]]; then
                # Checking if this version should be tagged as latest
                echo "Checking if ${tag_name} should be tagged as latest..."

                # Remove 'v' prefix if present for semantic comparison
                current_version=${tag_name#v}

                # Get the current latest version by running the latest image
                latest_version=""
                if docker pull ${registry}:latest &>/dev/null; then
                  echo "Found latest tag, checking its version..."

                  # First try to get version from image labels using docker inspect
                  echo "Method 1: Trying to get version from image labels..."
                  label_version=$(docker image inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${registry}:latest 2>/dev/null || echo "")

                  if [[ -n "$label_version" ]]; then
                    echo "Found version from image labels: $label_version"

                    # Extract version from format like "ubuntu-1.28.1-arm64"
                    if [[ $label_version == ubuntu-*-* ]]; then
                      # Extract the middle part (version) from ubuntu-VERSION-arch
                      latest_full_version=$(echo "$label_version" | cut -d'-' -f2)
                    else
                      # Use the label as is
                      latest_full_version=$label_version
                    fi

                    echo "Extracted version: $latest_full_version"
                  else
                    # Fallback to running the container if label inspect failed
                    echo "Method 2: Falling back to container execution..."
                    latest_full_version=$(docker run --rm --entrypoint /bin/sh ${registry}:latest -c "dragonfly --version | cut -d' ' -f2 | head -n 1")
                  fi

                  echo "Latest full version: ${latest_full_version}"

                  # Extract only the semantic version part (before any dash)
                  latest_version=$(echo "${latest_full_version}" | cut -d'-' -f1)
                  # Remove 'v' prefix if present
                  latest_version=${latest_version#v}
                  echo "Current latest version: ${latest_version}"
                else
                  echo "No latest tag found yet or couldn't pull it"
                fi

                # Compare versions only if we have a latest version
                should_update_latest=true
                if [[ -n "$latest_version" ]]; then
                  # Call our semver comparison function
                  should_update_latest=$(semver_cmp "$current_version" "$latest_version")
                fi

                if [[ "$should_update_latest" == true ]]; then
                  echo "Version ${tag_name} is newer than or equal to current latest, updating latest tag"
                  tag="${registry}:latest"
                  # Create and push the manifest like dragonfly:latest
                  docker manifest create ${tag} --amend ${sha_amd} --amend ${sha_arm}
                  docker manifest push ${tag}
                else
                  echo "Version ${tag_name} is older than current latest (${latest_version}), NOT updating latest tag"
                fi
              fi

              # Create and push the manifest like dragonfly:v1.26.4
              tag="${registry}:${tag_name}"
              docker manifest create ${tag} --amend ${sha_amd} --amend ${sha_arm}
              docker manifest push ${tag}
            }

            # GitHub Container Registry manifests
            ghcr_sha_amd=${{ env.IMAGE }}@${{ needs.build_and_tag.outputs.sha_amd }}
            ghcr_sha_arm=${{ env.IMAGE }}@${{ needs.build_and_tag.outputs.sha_arm }}
            create_and_push_manifests "${{ env.IMAGE }}" "$ghcr_sha_amd" "$ghcr_sha_arm" "${{ matrix.flavor }}" "${{ env.TAG_NAME }}" "${{ env.IS_PRERELEASE }}"

            # Google Artifact Registry manifests
            gar_sha_amd=${{ env.GCS_IMAGE }}@${{ needs.build_and_tag.outputs.sha_amd }}
            gar_sha_arm=${{ env.GCS_IMAGE }}@${{ needs.build_and_tag.outputs.sha_arm }}
            create_and_push_manifests "${{ env.GCS_IMAGE }}" "$gar_sha_amd" "$gar_sha_arm" "${{ matrix.flavor }}" "${{ env.TAG_NAME }}" "${{ env.IS_PRERELEASE }}"

  release_helm_and_notify:
    needs: [merge_manifest]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      packages: write
      pull-requests: write
    steps:
    - name: print_env
      run: env

    - name: checkout
      uses: actions/checkout@v6
      with:
        token: ${{ secrets.DRAGONFLY_TOKEN }}  # PAT to push to main
        fetch-depth: 0

    - name: Install helm
      uses: azure/setup-helm@v4

    - name: Setup Go
      uses: actions/setup-go@v6

    - name: Configure Git
      if: env.IS_PRERELEASE != 'true'
      run: |
        git config user.name "$GITHUB_ACTOR"
        git config user.email "$GITHUB_ACTOR@users.noreply.github.com"

    - name: Update helm chart
      if: env.IS_PRERELEASE != 'true'
      run: |
        git checkout -b helm-chart-update/${{ env.TAG_NAME }} origin/main
        sed -Ei \
            -e 's/^(version\:) .*/\1 '${{ env.TAG_NAME }}'/g' \
            -e 's/^(appVersion\:) .*/\1 "'${{ env.TAG_NAME }}'"/g' \
            contrib/charts/dragonfly/Chart.yaml

        go test ./contrib/charts/dragonfly/... -update

        git commit \
          -m 'chore(helm-chart): update to ${{ env.TAG_NAME }}' \
          contrib/charts/dragonfly/Chart.yaml \
          contrib/charts/dragonfly/ci || true

    - name: Push Helm chart as OCI to Github
      if: env.IS_PRERELEASE != 'true'
      run: |
        echo "${{ secrets.GITHUB_TOKEN }}" | \
          helm registry login -u ${{ github.actor }} --password-stdin ghcr.io

        helm package contrib/charts/dragonfly

        helm push dragonfly-${{ env.TAG_NAME }}.tgz oci://ghcr.io/${{ github.repository }}/helm

    - name: Discord notification
      env:
        DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }}
      uses: Ilshidur/action-discord@d2594079a10f1d6739ee50a2471f0ca57418b554
      with:
        args: 'DragonflyDB version [${{ env.TAG_NAME }}](https://github.com/dragonflydb/dragonfly/releases/tag/${{ env.TAG_NAME }}) has been released 🎉'

    - name: Re-build Docs
      if: env.IS_PRERELEASE != 'true'
      run: |
        curl -s -X POST '${{ secrets.VERCEL_DOCS_WEBHOOK }}'

    - name: Create Helm Chart PR
      if: env.IS_PRERELEASE != 'true'
      env:
        GH_TOKEN: ${{ secrets.DRAGONFLY_TOKEN }}
      run: |
        git push origin helm-chart-update/${{ env.TAG_NAME }}
        gh pr create \
          --base main \
          --head helm-chart-update/${{ env.TAG_NAME }} \
          --title 'chore(helm-chart): update to ${{ env.TAG_NAME }}' \
          --body 'Automated Helm chart version bump to ${{ env.TAG_NAME }}.' \
          --reviewer vyavdoshenko


================================================
FILE: .github/workflows/epoll-regression-tests.yml
================================================
name: Epoll Regression Tests

on:
  schedule:
    - cron: "0 0/3 * * *"
  workflow_dispatch:

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    strategy:
      matrix:
        # Test of these containers
        container: ["ubuntu-dev:24"]
        proactor: [Epoll]
        build-type: [Debug]
        runner: [ubuntu-latest, [self-hosted, linux, ARM64]]

    runs-on: ${{ matrix.runner }}

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash
        - /:/hostroot
        - /mnt:/mnt
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          targets: 'dragonfly'

      - name: Authenticate to AWS
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run regression tests action
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
          build-folder-name: build
          filter: ${{ matrix.build-type == 'Release' && 'not empty' || 'not opt_only' }}
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}
          # Chain ternary oprator of the form (which can be nested)
          # (expression == condition && <true expression> || <false expression>)
          epoll: ${{ matrix.proactor == 'Epoll' && 'epoll' || 'iouring' }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: logs
          path: /tmp/failed/*

  lint-test-chart:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/lint-test-chart


================================================
FILE: .github/workflows/fuzz-long.yml
================================================
name: AFL++ Long Fuzzing Campaign

on:
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      resp_duration:
        description: 'RESP fuzzing duration in minutes'
        required: false
        default: '60'
        type: string
      memcache_duration:
        description: 'Memcache fuzzing duration in minutes'
        required: false
        default: '30'
        type: string

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

jobs:
  fuzz-long:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: CI-LARGE-86
    timeout-minutes: 120

    strategy:
      fail-fast: false
      matrix:
        include:
          - target: resp
            duration: '60'
          - target: memcache
            duration: '30'

    container:
      image: ghcr.io/romange/ubuntu-dev:24-afl
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          submodules: true

      - name: Run AFL++ long fuzzing campaign (${{ matrix.target }})
        uses: ./.github/actions/fuzzing
        with:
          mode: long
          target: ${{ matrix.target }}
          duration-minutes: ${{ matrix.target == 'resp' && (github.event.inputs.resp_duration || matrix.duration) || (github.event.inputs.memcache_duration || matrix.duration) }}
          run-number: ${{ github.run_number }}

      - name: Send notification on failure
        if: failure() && github.ref == 'refs/heads/main'
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="AFL++ ${{ matrix.target }} fuzzing found crashes.\\n Commit: ${{github.sha}}\\n Job Link: ${job_link}\\n"

          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/fuzz-pr.yml
================================================
# Run AFL++ fuzzing on PRs that touch C++ code.
#
# For each PR, an LLM analyzes the diff and generates:
#   1. Targeted seed files — initial inputs crafted to exercise the changed code paths.
#      (A "seed" is a RESP-encoded sequence of Redis commands that the fuzzer starts from
#       and mutates; see fuzz/seeds/resp/*.resp for the existing seed corpus.)
#   2. Focus command list — commands the mutator should prefer (~70% of the time),
#      so mutations concentrate on the affected code instead of spreading randomly.
#
# The fuzzer then runs for 15 minutes in "smoke" mode (stop on first crash).
# When ANTHROPIC_API_KEY is unavailable (e.g. fork PRs), seed generation is skipped
# and the fuzzer uses the existing seed corpus as-is.
#
# Additionally, if the PR touches memcache-related code (memcache_parser, mc_family,
# fuzz/memcache_mutator.py, or fuzz/seeds/memcache/), a focused memcache fuzzing step
# runs automatically after RESP fuzzing passes, reusing the already-built binary.
name: AFL++ PR Fuzzing

on:
  pull_request:
    branches: [main]
    paths:
      - 'src/**/*.cc'
      - 'src/**/*.h'
      - 'helio/**/*.cc'
      - 'helio/**/*.h'
      - 'fuzz/**'
      - '.github/workflows/fuzz-pr.yml'
      - '.github/actions/fuzzing/**'
  workflow_dispatch:
    inputs:
      duration:
        description: 'Fuzzing duration in minutes'
        required: false
        default: '15'
        type: string
      memcache-duration:
        description: 'Memcache fuzzing duration in minutes'
        required: false
        default: '10'
        type: string

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  fuzz-pr:
    runs-on: CI-LARGE-86
    timeout-minutes: 60

    container:
      image: ghcr.io/romange/ubuntu-dev:24-afl
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      credentials:
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          submodules: true
          fetch-depth: 0

      - name: Generate PR diff
        id: diff
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            git config --global --add safe.directory "$GITHUB_WORKSPACE"
            BASE=${{ github.event.pull_request.base.sha }}
            HEAD_SHA=${{ github.event.pull_request.head.sha }}
            MERGE_BASE=$(git merge-base "$BASE" "$HEAD_SHA")
            git diff "$MERGE_BASE".."$HEAD_SHA" > /tmp/pr_diff.txt
          else
            echo "" > /tmp/pr_diff.txt
          fi

          DIFF_LINES=$(wc -l < /tmp/pr_diff.txt)
          echo "diff_lines=${DIFF_LINES}" >> "$GITHUB_OUTPUT"

          echo "::group::PR diff summary"
          echo "C++ diff lines: ${DIFF_LINES}"
          if [ "$DIFF_LINES" -gt 0 ]; then
            echo "Changed files:"
            grep '^diff --git' /tmp/pr_diff.txt | sed 's|diff --git a/.* b/|  |' || true
          else
            echo "No C++ file changes in this PR — seed generation will be skipped"
          fi
          echo "::endgroup::"

      - name: Generate targeted seeds
        id: seeds
        run: |
          pip install 'anthropic>=0.39,<1' 2>/dev/null || pip install --break-system-packages 'anthropic>=0.39,<1' 2>/dev/null || true

          SEEDS_DIR="${GITHUB_WORKSPACE}/fuzz/seeds/pr_targeted"
          mkdir -p "$SEEDS_DIR"

          python3 fuzz/generate_targeted_seeds.py \
            --output-dir "$SEEDS_DIR" \
            < /tmp/pr_diff.txt

          FOCUS=""
          if [ -f "$SEEDS_DIR/focus_commands.json" ]; then
            FOCUS=$(cat "$SEEDS_DIR/focus_commands.json")
          fi
          echo "focus_commands=${FOCUS}" >> "$GITHUB_OUTPUT"
          echo "seeds_dir=${SEEDS_DIR}" >> "$GITHUB_OUTPUT"

          SEED_COUNT=$(ls "$SEEDS_DIR"/*.resp 2>/dev/null | wc -l || echo 0)

          echo "::group::Seed generation results"
          echo "Seeds generated: ${SEED_COUNT}"
          echo "Focus commands: ${FOCUS:-none}"
          if [ "$SEED_COUNT" -gt 0 ]; then
            ls -la "$SEEDS_DIR"/*.resp
          fi
          echo "::endgroup::"

          # Job summary
          {
            echo "### Fuzzing Seed Generation"
            echo ""
            if [ "$SEED_COUNT" -gt 0 ]; then
              echo "- **Seeds generated:** ${SEED_COUNT}"
              echo "- **Focus commands:** \`${FOCUS}\`"
            elif [ "$(wc -l < /tmp/pr_diff.txt)" -eq 0 ]; then
              echo "- No C++ changes in PR — using default seed corpus"
            elif [ -z "$ANTHROPIC_API_KEY" ]; then
              echo "- No API key — using default seed corpus"
            else
              echo "- LLM did not produce usable seeds — using default seed corpus"
            fi
          } >> "$GITHUB_STEP_SUMMARY"
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

      - name: Run AFL++ PR fuzzing
        uses: ./.github/actions/fuzzing
        with:
          mode: smoke
          duration-minutes: ${{ github.event.inputs.duration || '15' }}
          run-number: ${{ github.run_number }}
          extra-seeds-dir: ${{ steps.seeds.outputs.seeds_dir }}
          focus-commands: ${{ steps.seeds.outputs.focus_commands }}

      # Reuses the binary built by the RESP step above (build: false).
      # Only runs when RESP fuzzing passed (default success() condition) and memcache
      # code was actually touched in this PR.
      - name: Check if memcache-related files changed
        id: memcache-check
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            CHANGED=$(grep -E '^diff --git a/(src/(facade/memcache|server/mc_family)|fuzz/(memcache_mutator|seeds/memcache))' /tmp/pr_diff.txt || true)
            if [ -n "$CHANGED" ]; then
              echo "run=true" >> "$GITHUB_OUTPUT"
              echo "Memcache-related files changed — will run memcache fuzzing:"
              echo "$CHANGED" | sed 's|diff --git a/.* b/|  |'
            else
              echo "run=false" >> "$GITHUB_OUTPUT"
              echo "No memcache-related files changed — skipping memcache fuzzing"
            fi
          else
            echo "run=true" >> "$GITHUB_OUTPUT"
            echo "Manual trigger — running memcache fuzzing"
          fi

      - name: Run AFL++ memcache fuzzing
        if: success() && steps.memcache-check.outputs.run == 'true'
        uses: ./.github/actions/fuzzing
        with:
          mode: smoke
          target: memcache
          build: 'false'
          duration-minutes: ${{ github.event.inputs['memcache-duration'] || '10' }}
          run-number: ${{ github.run_number }}


================================================
FILE: .github/workflows/generate-osrepo-site.yml
================================================
name: generate-site
on:
  workflow_dispatch:
  release:
    types: [published]

jobs:
  gen-site:
    runs-on: ubuntu-latest
    env:
      SiteRoot: _site

    name: Generate index and site assets
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v6

      - name: Install packaging tools
        # RPM tools are available on ubuntu
        run: sudo apt install -y rpm gpg createrepo-c dpkg-dev reprepro

      - name: Setup requirements
        working-directory: tools/packaging/osrepos
        run: pip install -r requirements.txt

      - name: Download packages
        working-directory: tools/packaging/osrepos
        run: python scripts/fetch-releases.py $SiteRoot

      - name: Import GPG key
        id: gpg-import
        uses: crazy-max/ghaction-import-gpg@v6
        with:
          gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }}

      - name: Sign RPMs
        shell: sh
        working-directory: tools/packaging/osrepos
        run: sh scripts/sign-rpms.sh ${{ steps.gpg-import.outputs.fingerprint }}

      - name: Create YUM repository
        # Creates metadata for YUM/DNF repository, the files were copied in the download step
        shell: sh
        working-directory: tools/packaging/osrepos
        run: createrepo_c -v $SiteRoot/rpm

      - name: Sign YUM repository
        shell: sh
        working-directory: tools/packaging/osrepos
        run: gpg --armor --detach-sign $SiteRoot/rpm/repodata/repomd.xml

      - name: Create APT repository
        # The configuration for apt repo is in tools/packaging/osrepos/reprepro-config,
        # which ensures the same GPG key used elsewhere in this action is used to sign
        # the repository
        shell: sh
        working-directory: tools/packaging/osrepos
        run: sh -x scripts/generate-apt-repo.sh

      - name: Prepare assets
        working-directory: tools/packaging/osrepos
        run: |
          cp -aRv dragonfly.repo pgp-key.public dragonfly.sources $SiteRoot/
          rm -rf $SiteRoot/deb/conf

      - name: Generate Directory Listings
        working-directory: tools/packaging/osrepos
        run: python scripts/generate-index.py $SiteRoot

      - name: Authenticate
        uses: 'google-github-actions/auth@v3'
        with:
          project_id: 'dragonflydb'
          credentials_json: ${{ secrets.GCP_BUCKET_CREDENTIALS }}

      - name: GCloud setup
        uses: 'google-github-actions/setup-gcloud@v3'

      - name: Deploy site
        working-directory: tools/packaging/osrepos
        run: |
          gcloud storage rm ${{ secrets.GCP_PACKAGES_BUCKET }}/**
          gcloud storage rsync $SiteRoot ${{ secrets.GCP_PACKAGES_BUCKET }} --recursive --delete-unmatched-destination-objects

      - name: Notify on failure
        if: failure()
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Package repo generation failed.\nCommit: ${{ github.sha }}\nJob: ${job_link}"
          curl -sSf -X POST -H 'Content-Type: application/json' '${{ secrets.GSPACES_BOT_DF_BUILD }}' -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/heavy-tests.yml
================================================
name: Heavy Tests

on:
  schedule:
    - cron: "0 0/6 * * *"
  workflow_dispatch:

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    strategy:
      matrix:
        # Test of these containers
        container: ["ubuntu-dev:24"]
        proactor: [Uring]
        build-type: [Release]
        runner: [CI-LARGE-86, CI-LARGE-ARM]

    runs-on: ${{ matrix.runner }}

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash
        - /:/hostroot
        - /mnt:/mnt
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env
          lsblk -l

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          targets: 'dragonfly'

      - name: Authenticate to AWS
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run heavy tests
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
          build-folder-name: build
          filter: large
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: logs-${{ matrix.runner }}
          path: /tmp/failed/*


================================================
FILE: .github/workflows/ioloop-v2-regtests.yml
================================================
name: RegTests IoLoopV2

# Manually triggered only
on:
  workflow_dispatch:

jobs:
  build:
    strategy:
      matrix:
        # Test of these containers
        container: ["ubuntu-dev:20-gcc14"]
        proactor: [Uring]
        build-type: [Debug, Release]
        runner: [ubuntu-latest, [self-hosted, linux, ARM64]]

    runs-on: ${{ matrix.runner }}

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash
        - /:/hostroot
        - /mnt:/mnt

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          targets: 'dragonfly'

      - name: Authenticate to AWS
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run regression tests action
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
          build-folder-name: build
          filter: ${{ matrix.build-type == 'Release' && 'not debug_only and not tls' || 'not opt_only and not tls' }}
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: logs
          path: /tmp/failed/*

  lint-test-chart:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/lint-test-chart


================================================
FILE: .github/workflows/mastodon-ruby-tests.yml
================================================
name: Mastodon ruby tests
on:
  schedule:
    - cron: '0 6 * * *' # run at 6 AM UTC
  workflow_dispatch:


jobs:
  build-and-test:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: ubuntu-latest
    name: Build and run tests

    services:
      postgres:
        image: postgres:14-alpine
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
        options: >-
          --health-cmd pg_isready
          --health-interval 10ms
          --health-timeout 3s
          --health-retries 50
        ports:
          - 5432:5432

      redis:
        image: docker.dragonflydb.io/dragonflydb/dragonfly:latest
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10ms
          --health-timeout 3s
          --health-retries 50
        ports:
          - 6379:6379

    env:
      DB_HOST: localhost
      DB_USER: postgres
      DB_PASS: postgres
      RAILS_ENV: test
      ALLOW_NOPAM: true
      PAM_ENABLED: true
      PAM_DEFAULT_SERVICE: pam_test
      PAM_CONTROLLED_SERVICE: pam_test_controlled
      OIDC_ENABLED: true
      OIDC_SCOPE: read
      SAML_ENABLED: true
      CAS_ENABLED: true
      BUNDLE_WITH: 'pam_authentication test'
      GITHUB_RSPEC: false

    steps:
      - name: Checkout mastodon
        uses: actions/checkout@v6
        with:
          repository: mastodon/mastodon
      - name: Install pre-requisites
        run: |
          sudo apt update
          sudo apt install -y libicu-dev libidn11-dev libvips42 ffmpeg imagemagick libpam-dev
      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.4
          bundler-cache: true
      - name: Enable corepack
        shell: bash
        run: corepack enable
      - name: Install all production yarn packages
        shell: bash
        run: yarn workspaces focus --production
      - name: Set up Node.js
        uses: actions/setup-node@v6
        with:
          node-version-file: '.nvmrc'
      - name: Precompile assets
        run: |-
          bin/rails assets:precompile
      - name: Load database schema
        run: |
          bin/rails db:setup
          bin/flatware fan bin/rails db:test:prepare
      - name: Run tests
        env:
          SPEC_OPTS: '--exclude-pattern "**/self_destruct_scheduler_spec.rb"'
        run: |
          unset COVERAGE
          bin/flatware rspec -r ./spec/flatware_helper.rb
      - name: Notify on failures
        if: failure()
        shell: bash
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Mastodon ruby tests failed.\\n The commit is: ${{github.sha}}.\\n Job Link: ${job_link}\\n"
          curl -s \
            -X POST \
            -H 'Content-Type: application/json' \
            '${{ secrets.GSPACES_BOT_DF_BUILD }}' \
            -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/package-install.yml
================================================
name: package-install-tests

on:
  schedule:
    - cron: '0 6 * * *'
  workflow_dispatch:
  workflow_run:
    workflows: ["generate-site"]
    types: [completed]

jobs:
  test-rpm:
    runs-on: ubuntu-latest
    if: github.repository == 'dragonflydb/dragonfly' && (github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success')
    container:
      image: ghcr.io/romange/fedora:30
    steps:
      - name: Install on fedora
        run: |
          curl -Lo /etc/yum.repos.d/dragonfly.repo https://packages.dragonflydb.io/dragonfly.repo
          dnf clean all
          dnf makecache
          dnf -y install dragonfly
          dragonfly --version

  test-deb-ubuntu:
    runs-on: ubuntu-latest
    if: github.repository == 'dragonflydb/dragonfly' && (github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success')
    container:
      image: ghcr.io/romange/ubuntu:noble
    steps:
      - name: Install on ubuntu
        run: |
          apt update
          apt install -y curl
          curl -Lo /usr/share/keyrings/dragonfly-keyring.public https://packages.dragonflydb.io/pgp-key.public
          curl -Lo /etc/apt/sources.list.d/dragonfly.sources https://packages.dragonflydb.io/dragonfly.sources
          apt update
          apt install -y dragonfly
          dragonfly --version

  notify-on-failure:
    runs-on: ubuntu-latest
    needs: [test-rpm, test-deb-ubuntu]
    if: github.repository == 'dragonflydb/dragonfly' && always() && (needs.test-rpm.result == 'failure' || needs.test-deb-ubuntu.result == 'failure')
    steps:
      - name: Notify on failure
        run: |
          job_link="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
          message="Package install tests failed.\nCommit: ${{ github.sha }}\nJob: ${job_link}"
          curl -sSf -X POST -H 'Content-Type: application/json' '${{ secrets.GSPACES_BOT_DF_BUILD }}' -d '{"text": "'"${message}"'"}'


================================================
FILE: .github/workflows/regression-tests.yml
================================================
name: Regression Tests

on:
  schedule:
    - cron: "0 0/3 * * *"
  workflow_dispatch:

jobs:
  build:
    if: github.repository == 'dragonflydb/dragonfly'
    strategy:
      matrix:
        # Test of these containers
        container: ["ubuntu-dev:24"]
        proactor: [Uring]
        build-type: [Debug, Release]
        runner: [ubuntu-latest, [self-hosted, linux, ARM64]]

    runs-on: ${{ matrix.runner }}

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash
        - /:/hostroot
        - /mnt:/mnt
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env
          lsblk -l

      - name: Build Dragonfly
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          targets: 'dragonfly'

      - name: Authenticate to AWS
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run regression tests action
        uses: ./.github/actions/regression-tests
        with:
          dfly-executable: dragonfly
          gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
          build-folder-name: build
          filter: ${{ matrix.build-type == 'Release' && 'not debug_only' || 'not opt_only' }}
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: logs
          path: /tmp/failed/*

  lint-test-chart:
    if: github.repository == 'dragonflydb/dragonfly'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/lint-test-chart


================================================
FILE: .github/workflows/release.yml
================================================
name: Version Release

on:
  push:
    tags:
    - 'v*'

permissions:
  contents: write

env:
  RELEASE_DIR: build-release

jobs:
  create-release:
    runs-on: ubuntu-latest
    steps:
      - name: Create Release
        uses: ncipollo/release-action@v1
        with:
          allowUpdates: true
          omitBody: true
          prerelease: true
          draft: true
          token: ${{ secrets.GITHUB_TOKEN }}

  build-arm:
    runs-on: ubuntu-24.04-arm
    name: Build arm64 on ubuntu-24.04-arm
    needs: create-release
    container:
      image: ghcr.io/romange/ubuntu-dev:20-gcc14
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: Build artifacts
      run: |
            # Work around https://github.com/actions/checkout/issues/766
            git config --global --add safe.directory "$GITHUB_WORKSPACE"
            git describe --always --tags ${{ github.sha }}
            ./tools/release.sh
            ./tools/packaging/generate_debian_package.sh ${{ env.RELEASE_DIR }}/dragonfly-aarch64
            mv dragonfly_*.deb ${{ env.RELEASE_DIR }}/

    - name: Upload
      uses: actions/upload-artifact@v6
      with:
        name: dragonfly-aarch64
        path: |
          ${{ env.RELEASE_DIR }}/dragonfly-*tar.gz
          ${{ env.RELEASE_DIR }}/dragonfly_*.deb
          ${{ env.RELEASE_DIR }}/dfly_bench-*tar.gz

  build-native:
    runs-on: ubuntu-latest
    needs: create-release
    strategy:
      matrix:
        include:
          # Build with these flags
          - name: debian
            container: ubuntu-dev:20-gcc14
          - name: rpm
            container: fedora:30-gcc14
    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      # Some tests which launch their own containers need a mounted volume to write through files
      # into child containers
      volumes:
        - /mnt:/mnt

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: Configure
      run: |
          if [ -f /etc/redhat-release ]; then
            dnf install -y rpm-build libstdc++-static
          fi
    - name: Build artifacts
      timeout-minutes: 25
      run: |
          # Work around https://github.com/actions/checkout/issues/766
          git config --global --add safe.directory "$GITHUB_WORKSPACE"
          git describe --always --tags ${{ github.sha }}

          # set WITH_SIMSIMD=OFF for fedora:30
          if [ "${{ matrix.name }}" == 'rpm' ]; then
            export WITH_SIMSIMD="OFF"
          fi
          ./tools/release.sh

          # once the build is over, we want to generate a Debian package
          if [ -f /etc/debian_version ]; then
            ./tools/packaging/generate_debian_package.sh ${{ env.RELEASE_DIR }}/dragonfly-x86_64
          else
            echo "Creating package for ${{github.ref_name}}"
            ./tools/packaging/rpm/build_rpm.sh ${{ env.RELEASE_DIR }}/dragonfly-x86_64.tar.gz ${{github.ref_name}}
          fi

    - name: Save artifacts
      run: |
          # place all artifacts at the same location
          set -eu
          mkdir -p results-artifacts
          if [ -f /etc/debian_version ]; then
            mv ${{ env.RELEASE_DIR }}/dragonfly-*tar.gz results-artifacts
            mv dragonfly_*.deb results-artifacts
            mv ${{ env.RELEASE_DIR }}/dfly_bench-*tar.gz results-artifacts
          else
            ls -l *.rpm
            mv ./*.rpm ./results-artifacts/
          fi

    - name: Upload
      uses: actions/upload-artifact@v6
      with:
        name: dragonfly-amd64-${{ matrix.name }}
        path: results-artifacts/*

  test-regression:
    needs: [build-native, build-arm]
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
        include:
          - name: amd64
            runner: ubuntu-latest
            artifact: dragonfly-amd64-debian
            binary: dragonfly-x86_64
          - name: arm64
            runner: ubuntu-24.04-arm
            artifact: dragonfly-aarch64
            binary: dragonfly-aarch64
    container:
      image: ghcr.io/romange/ubuntu-dev:24
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /mnt:/mnt
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: Download artifacts
      uses: actions/download-artifact@v7
      with:
        name: ${{ matrix.artifact }}
        path: results-artifacts
    - name: Extract artifacts
      run: |
        set -eu
        mkdir -p ${{ env.RELEASE_DIR }}
        tar -xzf results-artifacts/dragonfly-*dbgsym.tar.gz -C ${{ env.RELEASE_DIR }}
    - name: Run regression tests
      uses: ./.github/actions/regression-tests
      with:
        dfly-executable: ${{ matrix.binary }}
        gspace-secret: ${{ secrets.GSPACES_BOT_DF_BUILD }}
        build-folder-name: ${{ env.RELEASE_DIR }}
        filter: 'not debug_only'

  publish_release:
    runs-on: ubuntu-latest
    needs: test-regression
    steps:
      - uses: actions/download-artifact@v7
        name: Download files
        with:
          path: artifacts
      - name: See all the artifacts
        run: |
          ls -lR artifacts/
      - uses: ncipollo/release-action@v1
        with:
          artifacts: "artifacts/dragonfly-*/*"
          allowUpdates: true
          draft: true
          prerelease: true
          omitNameDuringUpdate: true
          token: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/repeat-tests.yml
================================================
name: Repeat Tests

on:
  workflow_dispatch:
    inputs:
      branch:
        description: "The branch on which tests will be repeated"
        type: string
        required: false
      commit:
        description: "A specific commit SHA to test (takes precedence over branch)"
        type: string
        required: false
      count:
        description: "The number of times the tests will be repeated"
        type: number
        required: false
        default: 1
      expression:
        description: "A pytest expression which will filter the tests"
        required: true
        type: string
      timeout:
        description: "Overall timeout for all test runs"
        required: false
        type: string
        default: "60m"
      epoll:
        description: "Force epoll mode in test"
        required: false
        type: string
        default: "no"
      use_release:
        description: "Use latest release instead of building dragonfly"
        required: false
        type: string
        default: "no"
      vmodule_expression:
        description: "Emit verbose dragonfly logs for modules, eg x=2,y=3"
        required: false
        type: string
        default: ""
      build_type:
        description: "Build type: Debug or Release"
        required: false
        type: choice
        options:
          - Debug
          - Release
        default: "Debug"

jobs:
  build:
    strategy:
      matrix:
        container: ["ubuntu-dev:24"]
        proactor: [Uring]
        build-type: ["${{ inputs.build_type || 'Debug' }}"]
        runner: [ubuntu-latest]

    runs-on: ${{ matrix.runner }}

    permissions:
      id-token: write
      contents: read

    container:
      image: ghcr.io/romange/${{ matrix.container }}
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
      volumes:
        - /var/crash:/var/crash

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true
          ref: ${{ inputs.commit || inputs.branch }}

      - name: Print environment info
        run: |
          cat /proc/cpuinfo
          ulimit -a
          env

      - name: Fetch release
        shell: bash
        if: ${{ inputs.use_release == 'yes' }}
        run: |
          mkdir "${GITHUB_WORKSPACE}"/build
          cd "${GITHUB_WORKSPACE}"/build
          wget -q https://github.com/dragonflydb/dragonfly/releases/latest/download/dragonfly-x86_64.tar.gz
          tar xf dragonfly-x86_64.tar.gz
          mv dragonfly-x86_64 dragonfly
          ls -l

      - name: Build Dragonfly
        if: ${{ inputs.use_release != 'yes' }}
        uses: ./.github/actions/builder
        with:
          build-type: ${{matrix.build-type}}
          targets: 'dragonfly'

      - name: Sync valkey tests
        uses: ./.github/actions/sync-valkey-tests
      - name: Authenticate to AWS
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_CI_S3_ROLE_ARN }}
          aws-region: us-east-1

      - name: Run tests on repeat
        uses: ./.github/actions/repeat
        with:
          run-only-on-ubuntu-latest: true
          dfly-executable: dragonfly
          build-folder-name: build
          s3-bucket: ${{ secrets.S3_REGTEST_BUCKET }}
          expression: ${{ inputs.expression }}
          count: ${{ inputs.count }}
          timeout: ${{ inputs.timeout }}
          epoll: ${{ inputs.epoll }}
          vmodule_expression: ${{ inputs.vmodule_expression }}

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v6
        with:
          name: logs
          path: /tmp/failed/*

      - name: Copy binary on a self hosted runner
        if: failure()
        run: |
          # We must use sh syntax.
          if [ "$RUNNER_ENVIRONMENT" = "self-hosted" ]; then
            cd ${GITHUB_WORKSPACE}/build
            timestamp=$(date +%Y-%m-%d_%H:%M:%S)
            mv ./dragonfly /var/crash/dragonfy_${timestamp}
          fi


================================================
FILE: .github/workflows/test-fakeredis.yml
================================================
---
name: Test Dragonfly/Fakeredis

on:
  workflow_dispatch:
  pull_request:

permissions:
  contents: read
  checks: write

concurrency:
  group: dragonfly-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/romange/ubuntu-dev:22
      options: --security-opt seccomp=unconfined --sysctl "net.ipv6.conf.all.disable_ipv6=0"
    strategy:
      fail-fast: false
    name: "Run tests: "
    permissions:
      pull-requests: write
      checks: read

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Install dependencies
        env:
          PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
        shell: bash
        working-directory: tests/fakeredis
        run: |
          pip install poetry
          echo "$HOME/.poetry/bin" >> $GITHUB_PATH
          poetry install
      - name: Configure CMake
        run: |
          cmake -B ${GITHUB_WORKSPACE}/build \
            -DCMAKE_BUILD_TYPE=Debug -DWITH_AWS:BOOL=OFF -DWITH_GCP:BOOL=OFF -DWITH_GPERF:BOOL=OFF \
            -GNinja -L
          cd ${GITHUB_WORKSPACE}/build && pwd

      - name: Build
        run: |
          cd ${GITHUB_WORKSPACE}/build
          ninja dragonfly
          echo "-----------------------------"

          # The order of redirect is important
          ./dragonfly --proactor_threads=4  --noversion_check --port=6380  \
           --lua_resp2_legacy_float 1> /tmp/dragonfly.log 2>&1 &

      - name: Run tests
        working-directory: tests/fakeredis
        run: |
          # Some tests are pending on #5383
          poetry run pytest test/ \
          --ignore test/test_hypothesis/test_transaction.py \
          --ignore test/test_hypothesis/test_zset.py \
          --ignore test/test_hypotesis_joint/test_joint.py \
          --junit-xml=results-tests.xml  --html=report-tests.html -v
        continue-on-error: false  # Fail the job if tests fail

      - name: Show Dragonfly stats
        if: always()
        run: |
          redis-cli -p 6380 INFO ALL
      - name: Upload Tests Result xml
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: tests-result-logs
          path: |
            /tmp/dragonfly.*

      - name: Upload Tests Result html
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: report-tests.html
          path: tests/fakeredis/report-tests.html

      - name: Publish Test Report
        if: ${{ github.event_name == 'pull_request' }}
        uses: mikepenz/action-junit-report@v6
        with:
          report_paths: tests/fakeredis/results-tests.xml
          # Do not create a check run
          # annotate_only: true

  publish-html-results:
    name: Publish HTML Test Results to GitHub Pages
    needs: test
    if: ${{ github.ref == 'refs/heads/main' }}
    runs-on: ubuntu-latest
    permissions:
      pages: write      # to deploy to Pages
      id-token: write   # to verify the deployment originates from an appropriate source
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Bundle Tests Result to one artifact
        uses: actions/upload-artifact/merge@v6
        with:
          delete-merged: true
          name: test-results-html
          pattern: '*.html'

      - name: Download html pages
        uses: actions/download-artifact@v7
        with:
          name: test-results-html
          path: results/

      - uses: actions/setup-python@v6
        with:
          cache-dependency-path: tests/fakeredis/poetry.lock
          python-version: "3.10"

      - name: Merge html results
        run: |
          pip install pytest-html-merger && mkdir merged
          pytest_html_merger -i results/ -o merged/index.html

      - name: Publish to GitHub Pages
        uses: actions/upload-pages-artifact@v4
        with:
          path: merged/
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
        with:
          token: '${{ secrets.GITHUB_TOKEN }}'


================================================
FILE: .gitignore
================================================
build/*
build-*
clang/*
clang-*
.vscode/*.db
.vscode/settings.json
.vscode/launch.json
third_party
genfiles/*
*.sublime-*
*.orig
.tags
!third_party/include/*
*.pyc
/CMakeLists.txt.user
_deps
releases
.DS_Store
.idea/*
.hypothesis
.secrets
cmake-build-debug
.venv/
fuzz/artifacts/
fuzz/corpus/
tools/replay/traffic-replay

# Valkey-search integration tests (synced from external repo)
tests/dragonfly/valkey_search/integration/
_codeql_build_dir/


================================================
FILE: .gitmodules
================================================
[submodule "helio"]
	path = helio
	url = https://github.com/romange/helio.git


================================================
FILE: .gitorderfile
================================================
*.py
*.md
*.in
*.txt
*.sh
*.yml
*.h
*.cc
*.lua
*.go
*


================================================
FILE: .nvmrc
================================================
22.19


================================================
FILE: .pre-commit-config.yaml
================================================
default_stages: [pre-commit]
exclude: |
    (?x)(
      src/redis/.* |
      src/huff/.* |
      contrib/charts/dragonfly/ci/.* |
      patches/.*
    )
repos:
  - repo: local
    hooks:
      - id: conventional-commits
        name: Conventional Commit Minder
        entry: contrib/scripts/conventional-commits
        language: script
        stages: [commit-msg]
      - id: signed-commit
        name: Signed Commit Enforcer
        entry: contrib/scripts/signed-commit
        language: script
        stages: [commit-msg]

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer

  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v14.0.6
    hooks:
      - id: clang-format
        name: Clang formatting

  - repo: https://github.com/psf/black
    rev: 25.1.0
    hooks:
      - id: black


================================================
FILE: .pre-commit-hooks.yaml
================================================
- id: conventional-commits
  name: Conventional Commits Minder
  entry: contrib/scripts/conventional-commits
  language: script
  description: Conventional Commits Enforcement at the `git commit` client-side level
  always_run: true
  stages: [commit-msg]

- id: signed-commit
  name: Signed Commit Enforcer
  entry: contrib/scripts/signed-commit
  language: script
  description: Ensures all commits contain a Signed-off-by line
  always_run: true
  stages: [commit-msg]


================================================
FILE: .snyk
================================================
# Snyk (https://snyk.io) policy file
exclude:
 global:
   - tests/integration/**
   - contrib/charts/**


================================================
FILE: .vscode/c_cpp_properties.json
================================================
{
  "configurations": [
    {
      "name": "Linux",
      "includePath": [
        "${default}"
      ],
      "cStandard": "c17",
      "cppStandard": "c++17",
      "intelliSenseMode": "${default}",
      "compileCommands": "${workspaceFolder}/build-dbg/compile_commands.json",
      "configurationProvider": "ms-vscode.cmake-tools"
    }
  ],
  "version": 4
}


================================================
FILE: AGENTS.md
================================================
# Dragonfly Development Guide

> **Essential reference for working with the Dragonfly codebase**
> Architecture, build system, testing infrastructure, and development workflows.

---

## Table of Contents

1. [Critical Workflow Rules](#critical-workflow-rules)
2. [Quick Command Reference](#quick-command-reference)
3. [Project Overview](#project-overview)
4. [Repository Structure](#repository-structure)
5. [Build Instructions](#build-instructions)
6. [Testing](#testing)
7. [CI/CD Pipeline](#cicd-pipeline)
8. [Code Style & Pre-commit Hooks](#code-style--pre-commit-hooks)
9. [Third-Party Dependencies](#third-party-dependencies)
10. [Platform Support](#platform-support)
11. [CMake Build Options](#cmake-build-options)
12. [Key Files Reference](#key-files-reference)
13. [Common Pitfalls](#common-pitfalls)
14. [Debugging Tips](#debugging-tips)
15. [Validation Checklist](#validation-checklist)

---

## Critical Workflow Rules

**MANDATORY - Always Follow This Order:**

1. ✅ **Read Before Edit** - Always read files before modifying
2. ✅ **Use Correct Build Commands** - See [Quick Command Reference](#quick-command-reference) below
3. ✅ **Test After Changes** - Build and run a relevant unit test -
   `ninja <unit_test> && ./unit_test`
4. ✅ **Format Code** - `pre-commit run --files <files>`
5. ✅ **Follow Architecture** - See [Architecture Patterns](#architecture-patterns) below

### Pull Request Guidelines

**Conciseness is Key**: PR descriptions should be short, focused, and easy to scan.
- **Title**: Imperative, descriptive (e.g., "Fix fiber stack overflow in test_reply_guard_oom")
- **Summary**: 1-2 sentences explaining *what* changed and *why*
- **Changes**: Bullet points for key changes
- **Fixes**: Link issues (e.g., "Fixes #123")
- **Commit messages**: Keep every line (subject and body) <= 100 characters; wrap long descriptions

---

## Quick Command Reference

**CRITICAL: Read the full sections below for context. These are shortcuts only.**

### Building (see [Build Instructions](#build-instructions) for details)

```bash
# Debug build (for development)
./helio/blaze.sh
cd build-dbg && ninja dragonfly              # Build main binary
cd build-dbg && ninja generic_family_test    # Build specific test

# Release build (for production/benchmarking)
./helio/blaze.sh -release
cd build-opt && ninja dragonfly
```

### Testing (see [Testing](#testing) for details)

```bash
# C++ Unit Tests
cd build-dbg
ctest -V -L DFLY                                    # Run all tests
./generic_family_test                               # Run specific test binary
./generic_family_test --gtest_filter="Set.*"        # Run specific test case
```

### Code Formatting

```bash
# Setup (once)
pipx install pre-commit clang-format black
pre-commit install

# Format code
pre-commit run --files <files>              # Format specific files
pre-commit run --all-files                  # Format all files
```

### Common Operations

```bash
# Check git status
git status

# Check current branch
git branch

# View recent commits
git log --oneline -10
```

---

## Architecture Patterns

**Code Style**: [.clang-format](.clang-format) - snake_case vars, PascalCase functions, kPascalCase constants

**DO ✅**:
- Fiber-aware: `util::fb2::Mutex`, `util::fb2::Fiber` → [helio/util/fibers/](helio/util/fibers/)
- Per-shard ops (no global state) → [docs/df-share-nothing.md](docs/df-share-nothing.md)
- Command pattern → [src/server/set_family.cc](src/server/set_family.cc)
- Error handling: `OpStatus` → [src/server/common.h](src/server/common.h)
- Test patterns → [tests/dragonfly/conftest.py](tests/dragonfly/conftest.py)

**DON'T ❌**:
- `std::thread`, `std::mutex` (deadlocks!)
- Global mutable state
- Edit without reading
- Skip tests
- Use `./tools/docker/build.sh` for local development (use `ninja` instead)
- Use `make` for incremental builds (use `ninja` instead)

---

## Project Overview

**Dragonfly** is a high-performance, Redis and Memcached compatible in-memory data store written in C++20. It delivers significantly higher throughput than traditional single-threaded Redis implementations through innovative architectural choices.

### Key Characteristics

- **Language**: C++20 (Google C++ Style Guide 2020 version)
- **Architecture**: Shared-nothing multi-threaded design (via `helio` library)
- **Performance**: Uses io_uring (Linux 5.11+) for high-performance async I/O, with epoll fallback
- **Threading Model**: Fiber-based cooperative multitasking with lock-free data structures
- **Build System**: CMake + Ninja via `helio/blaze.sh` wrapper script
- **Target Platform**: Linux (kernel 5.11+ recommended), FreeBSD support available
- **Protocols**: Redis RESP2/RESP3, Memcached binary protocol
- **Compatibility**: Drop-in replacement for Redis API coverage

### Architectural Highlights

**For detailed architecture documentation, see [docs/df-share-nothing.md](docs/df-share-nothing.md)**

1. **Shared-Nothing Design**: Each thread operates independently with its own data structures, minimizing lock contention
2. **Helio Framework**: Custom I/O and threading library built on io_uring/epoll with fiber support
3. **DashTable**: Novel hash table implementation optimized for multi-core systems - see [docs/dashtable.md](docs/dashtable.md)
4. **Transaction Model**: Non-blocking optimistic transactions - see [docs/transaction.md](docs/transaction.md)
5. **Tiering Support**: Optional disk-backed storage for large datasets
6. **Search Module**: Full-text search capabilities (when enabled with WITH_SEARCH)

---

## Repository Structure

```
dragonfly/
├── src/                      # Main C++ source code
│   ├── server/               # Core server implementation
│   │   ├── dfly_main.cc      # Main entry point
│   │   ├── main_service.cc   # Service lifecycle & command routing
│   │   ├── db_slice.cc       # Per-thread database shard
│   │   ├── engine_shard_set.cc # Shard management
│   │   ├── cluster/          # Cluster mode implementation
│   │   ├── journal/          # Replication journal
│   │   ├── tiering/          # Tiered storage
│   │   ├── search/           # Search module
│   │   └── acl/              # Access control lists
│   ├── core/                 # Core data structures
│   │   ├── dash.h            # DashTable hash table
│   │   ├── dense_set.h       # Compact set implementation
│   │   ├── string_map.h      # Optimized string-keyed maps
│   │   ├── search/           # Search core algorithms
│   │   └── json/             # JSON support
│   ├── facade/               # Network & command handling
│   │   ├── dragonfly_connection.cc # Connection management
│   │   ├── redis_parser.cc   # RESP protocol parser
│   │   └── memcache_parser.cc # Memcached protocol
│   └── redis/                # Redis-specific implementations
│       └── lua/              # Lua scripting support
│
├── helio/                    # Git submodule: I/O and threading library
│   │                         # ** DO NOT EDIT unless contributing to helio **
│   ├── util/                 # Utilities: fibers, I/O, synchronization
│   ├── io/                   # io_uring & epoll abstraction
│   └── blaze.sh              # Build configuration wrapper
│
├── tests/                    # Test suite
│   ├── dragonfly/            # Python pytest integration/regression tests
│   │   ├── conftest.py       # Pytest fixtures & configuration
│   │   ├── requirements.txt  # Python test dependencies
│   │   └── *.py              # Test files
│   └── pytest.ini            # Pytest configuration & markers
│
├── docs/                     # Documentation
│   ├── build-from-source.md  # Build instructions
│   ├── dashtable.md          # DashTable internals
│   ├── transaction.md        # Transaction model
│   ├── df-share-nothing.md   # Shared-nothing architecture
│   └── differences.md        # Differences from Redis
│
├── contrib/                  # Utilities
│   ├── docker/               # Docker configurations
│   └── charts/dragonfly/     # Helm chart for Kubernetes
│
├── tools/                    # Benchmarking & utility tools
│   └── packaging/            # Packaging scripts
│
├── CMakeLists.txt            # Root CMake configuration
├── .clang-format             # C++ formatting rules (clang-format v14.0.6)
├── .pre-commit-config.yaml   # Pre-commit hooks configuration
├── pyproject.toml            # Python formatting (Black, 100 chars)
└── CONTRIBUTING.md           # Contribution guidelines
```

### Critical Paths to Remember

- **Main entry**: `src/server/dfly_main.cc`
- **Command dispatch**: `src/server/main_service.cc`
- **Data storage**: `src/server/db_slice.cc`
- **Networking**: `src/facade/dragonfly_connection.cc`
- **Helio library**: `helio/` (I/O and threading library)

---

## Build Instructions

**For complete build instructions, see [docs/build-from-source.md](docs/build-from-source.md)**

### Quick Start

**Debug build** (for development):
```bash
./helio/blaze.sh
cd build-dbg && ninja dragonfly
./dragonfly --alsologtostderr
```

**Release build** (for production/benchmarking):
```bash
./helio/blaze.sh -release
cd build-opt && ninja dragonfly
```

**Production release build** (static linking, optimized):
```bash
make release           # Configure + build
make package           # Create release packages with debug symbols
```

The [Makefile](Makefile) builds production releases with:
- Static linking: libstdc++, libgcc, Boost, OpenSSL
- Architecture optimizations (x86_64: `-march=core2 -msse4.1 -mtune=skylake`)
- Debug symbols (compressed)
- Output: `build-release/dragonfly-{arch}.tar.gz`

**Common build options**:
- See [docs/build-from-source.md](docs/build-from-source.md) for all options

---

## Testing

**For complete testing documentation, see [tests/README.md](tests/README.md)**

### Quick Reference

**C++ Unit Tests**:
```bash
cd build-dbg
ctest -V -L DFLY                                    # Run all tests
./generic_family_test                               # Run specific test binary
./generic_family_test --gtest_filter="Set.*"        # Run specific test case
```

---

## CI/CD Pipeline

**For complete CI configuration, see [.github/workflows/ci.yml](.github/workflows/ci.yml)**

The CI workflow runs on all PRs and includes:
- **Pre-commit checks**: clang-format, black formatters
- **Build matrix**: Multiple OS/compiler/sanitizer combinations (Ubuntu 20/24, Alpine, GCC/Clang, ASAN/UBSAN)
- **Test execution**: C++ unit tests, Python integration tests, cluster mode tests
- **Additional validations**: Helm charts, Docker image builds

---

## Code Style & Pre-commit Hooks

**For complete contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md)**

**Code style configuration files**:
- **C++**: [.clang-format](.clang-format) - Google C++ Style Guide (2020), clang-format v14.0.6, 100 char limit
- **Python**: [pyproject.toml](pyproject.toml) - Black formatter, 100 char limit, PEP 8 compliant
- **Pre-commit hooks**: [.pre-commit-config.yaml](.pre-commit-config.yaml) - Automated formatting checks

**Quick setup**:
```bash
pipx install pre-commit clang-format black
pre-commit install
pre-commit run --all-files                          # Run all formatters
```

---

## Third-Party Dependencies

**Key Libraries**: Abseil (strings/flags), Boost 1.71+ (context/intrusive), mimalloc (allocator), jsoncons (JSON), OpenSSL (TLS), libunwind (traces)

**Build artifacts**: `build-dbg/third_party/` - DO NOT edit

**For complete dependency info, see [docs/build-from-source.md](docs/build-from-source.md)**

---

## Platform Support

**Linux**: Primary platform. Kernel 5.11+ (io_uring), 5.1+ (basic), < 5.1 (epoll fallback)
- Check: `uname -r`
- Force epoll: `--proactor_type=epoll`
- Docker: `--security-opt seccomp=unconfined`

**FreeBSD**: Supported (kqueue backend)

**macOS**: Not supported for production (use Docker/Linux)

**For complete platform info, see [docs/build-from-source.md](docs/build-from-source.md)**

---

## CMake Build Options

**For complete list of build options, see [docs/build-from-source.md](docs/build-from-source.md)**

### Common Options

Pass options to `helio/blaze.sh` with `-D` prefix:

```bash
./helio/blaze.sh -DWITH_SEARCH=OFF -DWITH_AWS=ON
```

**Most useful options**:
- `WITH_ASAN=ON` / `WITH_USAN=ON` - Enable sanitizers for debugging
- `WITH_SEARCH=OFF` - Disable search module for faster builds
- `WITH_AWS=OFF` / `WITH_GCP=OFF` - Disable cloud libraries
- `WITH_TIERING=OFF` - Disable disk storage
- `USE_MOLD=ON` - Faster linking with LTO (production builds)

**Quick configurations**:
```bash
# Minimal build (fast compilation)
./helio/blaze.sh -DWITH_GPERF=OFF -DWITH_AWS=OFF -DWITH_GCP=OFF -DWITH_TIERING=OFF -DWITH_SEARCH=OFF

# Full-featured (all options ON by default)
./helio/blaze.sh

# Production optimized
./helio/blaze.sh -release -DUSE_MOLD=ON
```

---

## Key Files Reference

Quick reference to the most important files in the codebase.

| Purpose | File Path |
|---------|-----------|
| **Entry Points & Core** | |
| Main entry point | `src/server/dfly_main.cc` |
| Server lifecycle & command routing | `src/server/main_service.cc` |
| Per-thread database shard | `src/server/db_slice.cc` |
| Shard management | `src/server/engine_shard_set.cc` |
| **Data Structures** | |
| DashTable hash table | `src/core/dash.h` |
| Dense set implementation | `src/core/dense_set.h` |
| String map | `src/core/string_map.h` |
| **Networking** | |
| Connection handling | `src/facade/dragonfly_connection.cc` |
| Redis protocol parser | `src/facade/redis_parser.cc` |
| Memcached protocol parser | `src/facade/memcache_parser.cc` |
| **Build System** | |
| Root CMake config | `CMakeLists.txt` |
| Build script wrapper | `helio/blaze.sh` |
| Server CMake config | `src/server/CMakeLists.txt` |
| **CI/CD** | |
| Main CI workflow | `.github/workflows/ci.yml` |
| Pre-commit config | `.pre-commit-config.yaml` |
| **Code Style** | |
| C++ formatting | `.clang-format` |
| Python formatting | `pyproject.toml` |
| **Testing** | |
| Pytest configuration | `tests/pytest.ini` |
| Pytest fixtures | `tests/dragonfly/conftest.py` |
| Test requirements | `tests/dragonfly/requirements.txt` |
| **Documentation** | |
| Build instructions | `docs/build-from-source.md` |
| Architecture overview | `docs/df-share-nothing.md` |
| DashTable internals | `docs/dashtable.md` |
| Transaction model | `docs/transaction.md` |
| **Configuration** | |
| Contributing guide | `CONTRIBUTING.md` |
| CLA agreement | `CLA.txt` |

---

## Common Pitfalls

1. **Pre-commit not installed**: `pipx install pre-commit clang-format black && pre-commit install`
2. **Wrong binary**: Debug: `build-dbg/dragonfly`, Release: `build-opt/dragonfly`
3. **Wrong build command**: Use `cd build-dbg && ninja <target>`, NOT `./tools/docker/build.sh`
4. **Test timeouts**: `timeout 20m ctest -V -L DFLY`
5. **ASAN leaks**: Check CI, suppress in `helio/util/asan_suppressions.txt`
6. **Helio modifications**: DON'T edit `helio/` (it's a git submodule - changes go upstream)
7. **CodeQL checks**: DON'T run codeql_checker when testing changes - it's slow and unnecessary for development

---

## Debugging Tips

**Logging**: `--alsologtostderr --v=1 --vmodule=module=2`

**ASAN**: `ASAN_OPTIONS=detect_leaks=1:symbolize=1`, suppressions: `helio/util/asan_suppressions.txt`

**CI reproduction**: See [.github/workflows/ci.yml](.github/workflows/ci.yml)

**Troubleshooting**: Check fiber deadlocks (use `util::fb2` not `std::mutex`), timeout issues (`--test_timeout`), ASAN reports

---

## Validation Checklist

Before claiming a task is complete, verify:

### Code Quality

- [ ] Code compiles without errors: `cd build-dbg && ninja dragonfly`
- [ ] Code compiles without warnings (CI uses `-Werror`)
- [ ] Code follows Google C++ Style Guide (run `clang-format`)
- [ ] No new ASAN/UBSAN violations

### Testing

- [ ] All existing C++ unit tests pass: `ctest -V -L DFLY`
- [ ] New feature has corresponding test coverage
- [ ] Tests pass in both Debug and Release builds
- [ ] Tests pass with ASAN/UBSAN enabled (if applicable)
- [ ] **DO NOT run codeql_checker** - it's slow and unnecessary for development testing

### Pre-commit & Style

- [ ] Pre-commit hooks installed: `pre-commit install`
- [ ] Code formatted with clang-format (C++) and black (Python)

### Documentation

- [ ] Public APIs have comments explaining purpose
- [ ] Complex algorithms have explanatory comments
- [ ] README or docs updated if behavior changes
- [ ] No commented-out code left in final commit

### Performance

- [ ] No obvious performance regressions (run benchmarks if needed)
- [ ] No unnecessary allocations in hot paths
- [ ] Lock-free data structures used where appropriate


================================================
FILE: CLA.txt
================================================
Thanks for your interest in contributing to Dragonfly™. By contributing to this project
in any way form or media you grant DragonflyDB Ltd. and its affiliates a perpetual, worldwide, non-exclusive, free of charge, royalty-free, irrevocable license to use,
modify, make available, reproduce, make derivatives, publicly display and perform, sublicense, sell, and distribute your contributions and any derivatives thereof as part of Dragonfly™.
You represent that You are legally entitled to grant the above license. You acknowledge that DragonflyDB currently distributes Dragonfly™ under the Business Source License 1.1 (BSL-1.1) license, and agree that your contribution may be distributed under BSL-1.1 as part of Dragonfly™. You also represent that your contributions are your original work and that neither the content contributed, nor making the contribution to Dragonfly™ violates any third party’ rights. If you are making this contribution while being engaged by any other company or entity, please make sure you have the necessary permissions required to do so.


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
set(PROJECT_CONTACT romange@gmail.com)

include(CheckCXXCompilerFlag)

enable_testing()

set(CMAKE_EXPORT_COMPILE_COMMANDS 1)

# AFL++ fuzzing support - must be set BEFORE project() command
option(USE_AFL "Enable AFL++ fuzzing" OFF)
if(USE_AFL)
  # Automatically set AFL++ compilers if not already set
  if(NOT CMAKE_C_COMPILER MATCHES "afl-" AND NOT CMAKE_CXX_COMPILER MATCHES "afl-")
    find_program(AFL_CC afl-clang-fast)
    find_program(AFL_CXX afl-clang-fast++)

    if(AFL_CC AND AFL_CXX)
      message(STATUS "AFL++ fuzzing enabled - setting compilers")
      set(CMAKE_C_COMPILER ${AFL_CC})
      set(CMAKE_CXX_COMPILER ${AFL_CXX})
    else()
      message(FATAL_ERROR "USE_AFL=ON but AFL++ compilers not found!\n"
              "Please install AFL++: apt install afl++ or build from source\n"
              "https://github.com/AFLplusplus/AFLplusplus")
    endif()
  endif()
endif()

# Set targets in folders
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
project(DRAGONFLY C CXX)
set(CMAKE_CXX_STANDARD 20)

# Disabled because it has false positives with ref-counted intrusive pointers.
CHECK_CXX_COMPILER_FLAG("-Wuse-after-free" HAS_USE_AFTER_FREE_WARN)
if (HAS_USE_AFTER_FREE_WARN)
    set(CMAKE_CXX_FLAGS "-Wno-use-after-free ${CMAKE_CXX_FLAGS}")
endif()

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(CMAKE_CXX_FLAGS "-Wthread-safety -Werror=thread-safety ${CMAKE_CXX_FLAGS}")
endif()

# We can not use here CHECK_CXX_COMPILER_FLAG because systems that do not support sanitizers
# fail during linking time.
set(CMAKE_REQUIRED_FLAGS "-fsanitize=address")
check_cxx_source_compiles("int main() { return 0; }" SUPPORT_ASAN)

set(CMAKE_REQUIRED_FLAGS "-fsanitize=undefined")
check_cxx_source_compiles("int main() { return 0; }" SUPPORT_USAN)
set(CMAKE_REQUIRED_FLAGS "")

# We must define all the required variables from the root cmakefile, otherwise
# they just disappear.
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/helio/cmake" ${CMAKE_MODULE_PATH})
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(DF_USE_SSL "Provide support for SSL connections" ON)

find_package(OpenSSL)

# AFL++ configuration - must be before sanitizer checks
if(USE_AFL)
  message(STATUS "AFL++ fuzzing mode active")
  message(STATUS "  C compiler: ${CMAKE_C_COMPILER}")
  message(STATUS "  C++ compiler: ${CMAKE_CXX_COMPILER}")

  # Add USE_AFL as compile definition so #ifdef USE_AFL works in code
  add_compile_definitions(USE_AFL)

  # AFL++ requires specific compiler flags for coverage instrumentation
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g")

  # Force disable sanitizers when fuzzing (AFL++ incompatible with ASAN/UBSAN)
  message(STATUS "Disabling sanitizers (incompatible with AFL++ fuzzing)")
  set(WITH_ASAN OFF CACHE BOOL "Disable ASAN for fuzzing" FORCE)
  set(WITH_USAN OFF CACHE BOOL "Disable UBSAN for fuzzing" FORCE)

  # Disable AWS and GCP for fuzzing builds (not needed, reduces build time)
  message(STATUS "Disabling AWS and GCP integrations for fuzzing")
  set(WITH_AWS OFF CACHE BOOL "Disable AWS for fuzzing" FORCE)
  set(WITH_GCP OFF CACHE BOOL "Disable GCP for fuzzing" FORCE)
endif()

option(WITH_ASAN "Enable -fsanitize=address" OFF)
if (SUPPORT_ASAN AND WITH_ASAN)
  message(STATUS "address sanitizer enabled")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
endif()

option(WITH_USAN "Enable -fsanitize=undefined" OFF)
if (SUPPORT_USAN AND WITH_USAN)
  message(STATUS "ub sanitizer enabled")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=undefined")
endif()

include(third_party)
include(internal)

include_directories(src)
include_directories(helio)

add_subdirectory(helio)
add_subdirectory(src)


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
https://github.com/dragonflydb/dragonfly/discussions.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior,  harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Dragonfly DB

Thank you for your interest in Dragonfly DB.

Feel free to browse our [Discussions](https://github.com/dragonflydb/dragonfly/discussions) and [Issues](https://github.com/dragonflydb/dragonfly/issues)

## Build from source

See [building from source](./docs/build-from-source.md)

Please note that to build a development/debug version,
it's better to alter the configure and build steps above with:

```sh
./helio/blaze.sh   # without '-release' flag. Creates build-dbg subfolder
cd build-dbg && ninja dragonfly
```

## Before you make your changes

```sh
cd dragonfly   # project root

# Make sure you have 'pre-commit', 'clang-format' and black is installed
pipx install pre-commit clang-format
pipx install pre-commit black

# IMPORTANT! Enable our pre-commit message hooks
# This will ensure your commits match our formatting requirements
pre-commit install
```

This step must be done on each machine you wish to develop and contribute from to activate the `commit-msg` and `commit` hooks client-side.

Once you have done these things, we look forward to adding your contributions and improvements to the Dragonfly DB project.

## Unit testing

```
# Build a specific test
cd build-dbg && ninja [test_name]
# e.g cd build-dbg && ninja generic_family_test

# Run
./[test_name]
# e.g ./generic_family_test
```

## Rendering Helm golden files

A Golang golden test is included in the dragonfly helm chart. This test will render the chart and compare the output to a golden file. If the output has changed, the test will fail and the golden file will need to be updated. This can be done by running:

```bash
cd contrib/charts/dragonfly
go test -v ./... -update
```

This makes it easy to see the changes in the rendered output without having to manually run the `helm template` and diff the output.

## Signoff Commits

All community submissions must include a signoff.

```bash
git commit -s -m '...'
```

## Squash Commits

Please squash all commits for a change into a single commit (this can be done using "git rebase -i"). Do your best to have a well-formed commit message for the change.

## Use Conventional Commits

This repo uses [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/)

The Conventional Commits specification is a lightweight convention on top of commit messages.
It provides an easy set of rules for creating an explicit commit history;
which makes it easier to write automated tools on top of.
This convention dovetails with [SemVer](http://semver.org),
by describing the features, fixes, and breaking changes made in commit messages.

The commit message should be structured as follows:

---

```
<type>[optional scope]: <description>

[optional body]

[optional footer(s)]
```

---

This repo uses automated tools to standardize the formatting of code, text files, and commits.

- [Pre-commit hooks](#pre-commit-hooks) validate and automatically apply code
   formatting rules.

## `pre-commit` hooks

The Dragonfly DB team has agreed to systematically use several pre-commit hooks to
normalize the formatting of code. You need to install and enable pre-commit to have these used
when you do your commits.

## Codebase guidelines

This repo conforms to the Google's C++ Style Guide. Keep in mind we use an older version of the
style guide which can be found [here](https://github.com/google/styleguide/blob/505ba68c74eb97e6966f60907ce893001bedc706/cppguide.html).

Any exceptions to the rules specified in the style guide will be documented here.

## License terms for contributions

Please see our [CLA agreement](./CLA.txt)

## THANK YOU FOR YOUR CONTRIBUTIONS


================================================
FILE: CONTRIBUTORS.md
================================================
# Contributors (alphabetical by surname)

* **[Amir Alperin](https://github.com/iko1)**
* **[Philipp Born](https://github.com/tamcore)**
  * Helm Chart
* **[Meng Chen](https://github.com/matchyc)**
* **[Yuxuan Chen](https://github.com/YuxuanChen98)**
* **[Pawel Kaplinski](https://github.com/pawelKapl)**
* **[Redha Lhimeur](https://github.com/redhal)**
* **[Braydn Moore](https://github.com/braydnm)**
* **[Logan Raarup](https://github.com/logandk)**
* **[Ryan Russell](https://github.com/ryanrussell)**
  * Docs & Code Readability
* **[Ali-Akber Saifee](https://github.com/alisaifee)**
* **[Elle Y](https://github.com/inohime)**
* **[ATM SALEH](https://github.com/ATM-SALEH)**
* **[Shohei Shiraki](https://github.com/highpon)**
* **[Leonardo Mello](https://github.com/lsvmello)**
* **[Nico Coetzee](https://github.com/nicc777)**


================================================
FILE: LICENSE.md
================================================
# Dragonfly Business Source License 1.1

<u>License</u>: Business Source License 1.1 [BSL 1.1](https://spdx.org/licenses/BUSL-1.1.html)

<u>Licensor</u>: DragonflyDB, Ltd.

<u>Licensed Work</u>: Dragonfly including the software components, or any portion of them, and any modification.

<u>Change Date</u>: March 1, 2029

<u>Change License</u>: [Apache License, Version
2.0](https://www.apache.org/licenses/LICENSE-2.0), as published by the
Apache Foundation.

<u>Additional Use Grant</u>: You may make use of the Licensed Work (i) only as part of your own product or service, provided it is not an in-memory data store product or service; and (ii) provided that you do not use, provide, distribute, or make available the Licensed Work as a Service.
A “Service” is a commercial offering, product, hosted, or managed service, that allows third parties (other than your own employees and contractors acting on your behalf) to access and/or use the Licensed Work or a substantial set of the features or functionality of the Licensed Work to third parties as a software-as-a-service, platform-as-a-service, infrastructure-as-a-service or other similar services that compete with Licensor products or services.

Text of BSL 1.1

The Licensor hereby grants you the right to copy, modify, create
derivative works, redistribute, and make non-production use of the
Licensed Work. The Licensor may make an Additional Use Grant, above,
permitting limited production use.

Effective on the Change Date, or the fourth anniversary of the first
publicly available distribution of a specific version of the Licensed
Work under this License, whichever comes first, the Licensor hereby
grants you rights under the terms of the Change License, and the rights
granted in the paragraph above terminate.

If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or
authorized resellers, or you must refrain from using the Licensed Work.

All copies of the original and modified Licensed Work, and derivative
works of the Licensed Work, are subject to this License. This License
applies separately for each version of the Licensed Work and the Change
Date may vary for each version of the Licensed Work released by
Licensor.

You must conspicuously display this License on each original or modified
copy of the Licensed Work. If you receive the Licensed Work in original
or modified form from a third party, the terms and conditions set forth
in this License apply to your use of that work.

Any use of the Licensed Work in violation of this License will
automatically terminate your rights under this License for the current
and all other versions of the Licensed Work.

This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or
logo of Licensor as expressly required by this License).

TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED
ON AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND
CONDITIONS, EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION)
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
NON-INFRINGEMENT, AND TITLE.


================================================
FILE: Makefile
================================================
BUILD_ARCH := $(shell uname -m)
RELEASE_NAME := "dragonfly-${BUILD_ARCH}"
HELIO_RELEASE_FLAGS = -DHELIO_RELEASE_FLAGS="-g"
HELIO_USE_STATIC_LIBS = ON
HELIO_OPENSSL_USE_STATIC_LIBS = ON
HELIO_ENABLE_GIT_VERSION = ON
HELIO_WITH_UNWIND ?= OFF
RELEASE_DIR=build-release
WITH_SIMSIMD ?= ON

# Some distributions (old fedora) have incorrect dependencies for crypto
# so we add -lz for them.
LINKER_FLAGS=-lz

# equivalent to: if $(uname_m) == x86_64 || $(uname_m) == amd64
# Override HELIO_MARCH_OPT via environment: make HELIO_MARCH_OPT="-march=native"
ifneq (, $(filter $(BUILD_ARCH),x86_64 amd64))
HELIO_MARCH_OPT ?= -march=core2 -msse4.1 -mpopcnt -mtune=skylake
endif

# For release builds we link statically libstdc++ and libgcc. Currently,
# all the release builds are performed by gcc.
LINKER_FLAGS += -static-libstdc++ -static-libgcc

# Optional ASAN support: make ASAN=1 release
ifdef ASAN
SANITIZE_COMPILE_FLAGS = -fsanitize=address -Wno-maybe-uninitialized
SANITIZE_LINK_FLAGS = -fsanitize=address
endif

HELIO_FLAGS = -DHELIO_RELEASE_FLAGS="-g" \
			  -DCMAKE_CXX_FLAGS="$(SANITIZE_COMPILE_FLAGS)" \
			  -DCMAKE_EXE_LINKER_FLAGS="$(LINKER_FLAGS) $(SANITIZE_LINK_FLAGS)" \
              -DBoost_USE_STATIC_LIBS=$(HELIO_USE_STATIC_LIBS) \
              -DOPENSSL_USE_STATIC_LIBS=$(HELIO_OPENSSL_USE_STATIC_LIBS) \
              -DENABLE_GIT_VERSION=$(HELIO_ENABLE_GIT_VERSION) \
              -DWITH_SIMSIMD=$(WITH_SIMSIMD) \
              -DWITH_UNWIND=$(HELIO_WITH_UNWIND) -DMARCH_OPT="$(HELIO_MARCH_OPT)"

.PHONY: default

configure:
	cmake -L -B $(RELEASE_DIR) -DCMAKE_BUILD_TYPE=Release -GNinja $(HELIO_FLAGS)

build:
	cd $(RELEASE_DIR); \
	ninja dfly_bench dragonfly && ldd dragonfly

package:
	cd $(RELEASE_DIR); \
	tar cvfz $(RELEASE_NAME)-dbgsym.tar.gz dragonfly ../LICENSE.md; \
	objcopy \
		--remove-section=".debug_*" \
		--remove-section="!.debug_line" \
		--compress-debug-sections \
		dragonfly \
		$(RELEASE_NAME); \
	tar cvfz $(RELEASE_NAME).tar.gz $(RELEASE_NAME) ../LICENSE.md; \
	objcopy \
		--remove-section=".debug_*" \
		--remove-section="!.debug_line" \
		--compress-debug-sections \
		dfly_bench \
		dfly_bench-$(BUILD_ARCH); \
	tar cvfz dfly_bench-$(BUILD_ARCH).tar.gz dfly_bench-$(BUILD_ARCH)

release: configure build

default: release


================================================
FILE: README.ja-JP.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>

[![ci-tests](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml/badge.svg)](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml) [![Twitter URL](https://img.shields.io/twitter/follow/dragonflydbio?style=social)](https://twitter.com/dragonflydbio)

その他の言語:  [English](README.md) [简体中文](README.zh-CN.md) [한국어](README.ko-KR.md) [Português](README.pt-BR.md)

[Web サイト](https://www.dragonflydb.io/) • [ドキュメント](https://dragonflydb.io/docs) • [クイックスタート](https://www.dragonflydb.io/docs/getting-started) • [コミュニティ Discord](https://discord.gg/HsPjXGVH85) • [Dragonfly Forum](https://dragonfly.discourse.group/) • [Join the Dragonfly Community](https://www.dragonflydb.io/community)

[GitHub Discussions](https://github.com/dragonflydb/dragonfly/discussions) • [GitHub Issues](https://github.com/dragonflydb/dragonfly/issues) • [コントリビュート](https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md)

## 世界最速のインメモリデータストア

Dragonfly は最新のアプリケーションワークロードのために構築されたインメモリデータストアです。

Redis や Memcached の API と完全に互換性があるため、Dragonfly を採用するためにコードを変更する必要はありません。従来のインメモリデータストアと比較して、Dragonfly は 25 倍のスループット、より低いテールレイテンシでより高いキャッシュヒット率、そして容易な垂直スケーラビリティを提供します。

## コンテンツ

- [ベンチマーク](#ベンチマーク)
- [クイックスタート](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start)
- [コンフィグ](#コンフィグ)
- [ロードマップとステータス](#ロードマップとステータス)
- [デザイン決定](#デザイン決定)
- [バックグラウンド](#バックグラウンド)

## <a name="ベンチマーク"><a/>ベンチマーク

<img src="http://static.dragonflydb.io/repo-assets/aws-throughput.svg" width="80%" border="0"/>

ベンチマークでは、Dragonfly は Redis と比較して 25 倍のスループットを示し、c6gn.16xlarge で 3.8M QPS を超えました。

Dragonfly のピークスループットにおける 99 パーセンタイルのレイテンシ指標:

| op    | r6g   | c6gn  | c7g   |
|-------|-------|-------|-------|
| set   | 0.8ms | 1ms   | 1ms   |
| get   | 0.9ms | 0.9ms | 0.8ms |
| setex | 0.9ms | 1.1ms | 1.3ms |

*すべてのベンチマークは `memtier_benchmark` (下記参照) を使い、スレッド数はサーバーとインスタンスタイプごとに調整しました。`memtier` は別の c6gn.16xlarge マシンで実行した。SETEX ベンチマークの有効期限は 500 に設定し、テストが終了しても有効であることを確認しました。*

```bash
  memtier_benchmark --ratio ... -t <threads> -c 30 -n 200000 --distinct-client-seed -d 256 \
     --expiry-range=...
```

パイプラインモード `--pipeline=30` では、Dragonfly は SET 操作で **10M QPS**、GET 操作で **15M QPS** に達する。

### Dragonfly vs. Memcached

AWS 上の c6gn.16xlarge インスタンスで Dragonfly と Memcached を比較した。

同程度のレイテンシで、Dragonfly のスループットは Memcached のスループットを書き込みと読み込みの両方のワークロードで上回った。Dragonfly は、[Memcached の書き込みパス](docs/memcached_benchmark.md)での競合により、書き込みワークロードでより優れたレイテンシを示しました。

#### SET ベンチマーク

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|:---------:|:------------------:|:-----------:|:-------:|
| Dragonfly |  🟩 3844           |🟩 0.9ms     | 🟩 2.4ms |
| Memcached |   806              |   1.6ms     | 3.2ms    |

#### GET ベンチマーク

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|-----------|:------------------:|:-----------:|:-------:|
| Dragonfly | 🟩 3717            |   1ms       | 2.4ms   |
| Memcached |   2100             |  🟩 0.34ms  | 🟩 0.6ms |


Memcached は読み取りベンチマークでより低いレイテンシを示したが、スループットも低かった。

### メモリ効率

メモリ効率をテストするために、`debug populate 5000000 key 1024` コマンドを使用して Dragonfly と Redis に ~5GB のデータを入れ、`memtier` コマンドで更新トラフィックを送信し、`bgsave` コマンドでスナップショットを開始しました。

この図は、各サーバがメモリ効率の面でどのような挙動を示したかを示している。

<img src="http://static.dragonflydb.io/repo-assets/bgsave-memusage.svg" width="70%" border="0"/>

Dragonfly はアイドル状態では Redis よりも 30% メモリ効率が高く、スナップショットフェーズではメモリ使用量の目に見える増加は見られなかった。ピーク時には Redis のメモリ使用量は Dragonfly の 3 倍近くまで増加しました。

Dragonfly はスナップショットをより早く、数秒以内に終了させました。

Dragonfly のメモリ効率の詳細については、[Dashtable ドキュメント](/docs/dashtable.md)を参照してください。


## <a name="コンフィグ"><a/>コンフィグ

Dragonfly は一般的な Redis の引数をサポートしています。例えば `dragonfly --requirepass=foo --bind localhost`。

Dragonfly は現在、以下の Redis 固有の引数をサポートしています:
 * `port`： Redis 接続ポート (`default: 6379`).
 * `bind`： ローカルホストからの接続のみを許可する場合は `localhost` を、**その IP** アドレスへの接続 (つまり外部からの接続) を許可する場合はパブリック IP アドレスを指定する。
 * `requirepass`： AUTH 認証用のパスワード (`default: ""`)。
 * `maxmemory`： データベースが使用するメモリの上限 (人間が読めるバイト数) (`default: 0`)。 `maxmemory` に `0` を指定すると、プログラムが自動的に最大メモリ使用量を決定する。
 * `dir`： Dragonfly Docker はデフォルトで `/data` フォルダをスナップショットに使用し、CLI は `""` を使用する。`v` の Docker オプションでホストフォルダにマッピングできる。
 * `dbfilename`： データベースを保存・ロードするファイル名 (`default: dump`).

Dragonfly 特有の議論もある:
 * `memcached_port`: Memcached 互換 API を有効にするポート (`default: disabled`)。
 * `keys_output_limit`: `keys` コマンドで返されるキーの最大数（`default: 8192`）。`keys` は危険なコマンドであることに注意してください。あまりに多くのキーを取得するとメモリ使用量が増大するため、結果を切り捨てています。
 * `dbnum`: `select` でサポートされるデータベースの最大数。
 * `cache_mode`: 以下の[斬新なキャッシュデザイン](#斬新なキャッシュデザイン)のセクションを参照してください。
 * `hz`: キーの有効期限評価頻度 (`default: 100`)。この頻度が低いと、アイドル時の CPU 使用量が少なくなるが、その分古くなったキーをクリアする速度が遅くなる。
 * `primary_port_http_enabled`: もし `true` (`default: true`) なら、メイン TCP ポートで HTTP コンソールにアクセスできるようにする。
 * `admin_port`: 割り当てられたポートのコンソールへの管理者アクセスを有効にする(`default: disabled`)。HTTP と RESP プロトコルの両方をサポートする。
 * `admin_bind`: 管理コンソールの TCP 接続を指定されたアドレスにバインドする(`default: any`)。HTTP と RESP の両方のプロトコルをサポートする。
 * `admin_nopass`: 割り当てられたポートで、認証トークンなしでコンソールへのオープン管理アクセスを有効にする (`default: false`)。HTTP と RESP の両方のプロトコルをサポートする。
 * `cluster_mode`: サポートするクラスターモード (`default: ""`)。現在は `emulated` のみをサポートしている。
 * `cluster_announce_ip`: クラスタコマンドがクライアントにアナウンスする IP。

### 一般的なオプションを使用した開始スクリプトの例:

```bash
./dragonfly-x86_64 --logtostderr --requirepass=youshallnotpass --cache_mode=true -dbnum 1 --bind localhost --port 6379  --maxmemory=12gb --keys_output_limit=12288 --dbfilename dump.rdb
```

また、`dragonfly --flagfile <filename>` を実行することで、設定ファイルから引数を指定することもできる。ファイルには 1 行に 1 つのフラグを記述し、キーと値のフラグには空白の代わりに等号を記述します。

ログの管理や TLS のサポートなど、その他のオプションについては `dragonfly --help` を実行してください。

## <a name="ロードマップとステータス"><a/>ロードマップとステータス

Dragonfly は現在、~185 個の Redis コマンドと、`cas` 以外のすべての Memcached コマンドをサポートしている。ほぼ Redis 5 API と同等ですが、Dragonfly の次のマイルストーンは基本的な機能を安定させ、レプリケーション API を実装することです。まだ実装されていないコマンドで必要なものがあれば、issue を開いてください。

Dragonfly ネイティブのレプリケーションについては、桁違いに高速な分散ログフォーマットを設計中です。

レプリケーション機能に続いて、Redis バージョン 3-6 の API に不足しているコマンドを追加していく予定です。

現在 Dragonfly がサポートしているコマンドについては、[コマンドリファレンス](https://dragonflydb.io/docs/category/command-reference)をご覧ください。

## <a name="デザイン決定"><a/> デザイン決定

### 斬新なキャッシュデザイン

Dragonfly には、シンプルでメモリ効率の良い、単一の統一された適応型キャッシュアルゴリズムがあります。

`cache_mode=true` フラグを渡すことでキャッシュモードを有効にすることができます。このモードをオンにすると、Dragonfly は将来つまずく可能性が最も低いアイテムを退避させますが、`maxmemory` の限界に近づいたときのみ退避させます。

### 比較的正確な有効期限

有効期限は 8 年以内。

ミリ秒精度の有効期限（PEXPIRE、PSETEX など）は、**2^28ms** を超える期限については、最も近い秒に丸められます。この誤差は 0.001% 以下であり、大きな範囲であれば許容範囲となります。

Dragonfly の期限と Redis の実装の詳細な違いについては、[こちら](docs/differences.md)を参照してください。

### ネイティブ HTTP コンソールと Prometheus 互換メトリクス

デフォルトでは、Dragonfly はメイン TCP ポート(6379)経由での HTTP アクセスを許可しています。その通り、Redis プロトコル経由でも HTTP プロトコル経由でも Dragonfly に接続することができます。ブラウザで試してみてください。HTTP アクセスには現在あまり情報がありませんが、将来的にはデバッグや管理に役立つ情報が含まれるようになる予定です。

Prometheus 互換のメトリクスを見るには、URL `:6379/metrics` にアクセスしてください。

Prometheus からエクスポートされたメトリクスは Grafana ダッシュボードと互換性があります[こちらを参照](tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json)。


重要です！HTTP コンソールは安全なネットワーク内でアクセスすることを想定しています。Dragonfly の TCP ポートを外部に公開する場合は、`--http_admin_console=false` または `--nohttp_admin_console` でコンソールを無効にすることをお勧めします。


## <a name="バックグラウンド"><a/>バックグラウンド

Dragonfly は、インメモリデータストアを 2022 年に設計したらどのようになるかという実験から始まりました。メモリストアのユーザーとして、またクラウド企業で働いたエンジニアとしての経験から学んだ教訓をもとに、Dragonfly では 2 つの重要な特性を維持する必要があると考えました: それは、すべてのオペレーションにおける原子性の保証と、非常に高いスループットにおけるミリ秒以下の低レイテンシーです。

私たちの最初の課題は、パブリッククラウドで現在利用可能なサーバーを使用して、CPU、メモリー、I/O リソースをフルに活用する方法でした。これを解決するために、私たちは[シェアードナッシングアーキテクチャ](https://en.wikipedia.org/wiki/Shared-nothing_architecture)を使用しています。このアーキテクチャでは、各スレッドが辞書データのスライスを独自に管理できるように、スレッド間でメモリストアの鍵空間を分割することができます。これらのスライスを "shards" と呼ぶ。シェアードナッシングアーキテクチャのスレッドと I/O 管理のためのライブラリは、[こちら](https://github.com/romange/helio)でオープンソースで提供されています。

複数キー操作に対する原子性保証を提供するために、我々は最近の学術研究の進歩を利用している。Dragonfly のトランザクションフレームワークの開発には、論文 ["VLL: a lock manager redesign for main memory database systems"](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf) を選びました。シェアードナッシングアーキテクチャと VLL の選択により、ミューテックスやスピンロックを使用せずにアトミックなマルチキー操作を構成することができました。これは我々の PoC にとって大きなマイルストーンであり、その性能は他の商用やオープンソースのソリューションよりも際立っていました。

私たちの第二の課題は、新しいストアのために、より効率的なデータ構造を設計することだった。この目標を達成するために、我々は論文 ["Dash: Scalable Hashing on Persistent Memory"](https://arxiv.org/pdf/2003.07302.pdf) に基づいたハッシュテーブル構造を核とした。この論文自体は、永続メモリ領域を中心にしており、メインメモリストアとは直接関係ありませんが、それでも私たちの問題に最も当てはまります。この論文で提案されているハッシュテーブル設計により、Redis の辞書に存在する 2 つの特別な特性を維持することができました: それは、データストアの成長中にハッシュをインクリメンタルする機能と、ステートレススキャン操作を使って変更中の辞書をトラバースする機能です。これら2つの特性に加え、Dash は CPU とメモリの使用効率が高い。Dash の設計を活用することで、私たちは以下のような機能をさらに革新することができました:
 * TTL レコードの効率的なレコード期限切れ。
 * LRU や LFU のような他のキャッシュ戦略よりも高いヒット率を、**ゼロメモリオーバーヘッド** で達成する新しいキャッシュエビクションアルゴリズム。
 * 新しい **フォークレス** スナップショットアルゴリズム。

Dragonfly の基盤を構築し、[そのパフォーマンスに満足したら](#ベンチマーク)、Redis と Memcached の機能を実装していきました。現在までに 185 個の Redis コマンド（Redis 5.0 API とほぼ同等）と 13 個の Memcached コマンドを実装しました。

そして最後に、<br>
<em>私たちの使命は、最新のハードウェアの進歩を活用した、クラウドワークロード向けの、優れた設計、超高速、コスト効率の良いインメモリデータストアを構築することです。現在のソリューションの API と提案を維持しながら、その問題点を解決するつもりです。</em>


================================================
FILE: README.ko-KR.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>

[![ci-tests](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml/badge.svg)](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml) [![Twitter URL](https://img.shields.io/twitter/follow/dragonflydbio?style=social)](https://twitter.com/dragonflydbio)

다른 언어 번역본:  [English](README.zh-CN.md) [简体中文](README.zh-CN.md) [日本語](README.ja-JP.md) [Português](README.pt-BR.md)

[Website](https://www.dragonflydb.io/) • [Docs](https://dragonflydb.io/docs) • [Quick Start](https://www.dragonflydb.io/docs/getting-started) • [Community Discord](https://discord.gg/HsPjXGVH85) • [Dragonfly Forum](https://dragonfly.discourse.group/) • [Join the Dragonfly Community](https://www.dragonflydb.io/community)

[GitHub Discussions](https://github.com/dragonflydb/dragonfly/discussions) • [GitHub Issues](https://github.com/dragonflydb/dragonfly/issues) • [Contributing](https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md) • [Dragonfly Cloud](https://www.dragonflydb.io/cloud)

## 세상에서 가장 빠른 인-메모리 스토어

Dragonfly는 현대 애플리케이션 작업을 위한 인-메모리 데이터스토어입니다.

Dragonfly는 Redis와 Memcached API와 완벽하게 호환되며, 이를 적용하기 위한 코드 변경을 필요로 하지 않습니다. Dragonfly는 기존 레거시 인-메모리 데이터스토어와 비교하여 25배 이상의 높은 처리량과 캐시 히트율, 낮은 꼬리 지연시간을 갖고있으며 간편한 수직 확장성을 지니고 있습니다.

## 콘텐츠

- [벤치마크](#benchmarks)
- [빠른 시작](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start)
- [설정](#configuration)
- [로드맵과 상태](#roadmap-status)
- [설계 의사결정](#design-decisions)
- [개발 배경](#background)

## <a name="benchmarks"><a/>벤치마크

<img src="http://static.dragonflydb.io/repo-assets/aws-throughput.svg" width="80%" border="0"/>

벤치마크에 따르면, Dragonfly는 레디스와 비교하여 처리량이 25배이상 증가하였고, c6gn.16xlarge 인스턴스에서 3.8M QPS를 돌파하였음을 보여줍니다.

Dragonfly의 피크 처리량에서의 99퍼센트 지연 시간 지표:

| op    | r6g   | c6gn  | c7g   |
|-------|-------|-------|-------|
| set   | 0.8ms | 1ms   | 1ms   |
| get   | 0.9ms | 0.9ms | 0.8ms |
| setex | 0.9ms | 1.1ms | 1.3ms |

*모든 벤치마크는 서버 및 인스턴스 유형별로 조정된 스레드 수를 사용하여 `memtier_benchmark`(아래를 참고) 수행되었습니다. `memtier`는 별도의 c6gn.16xlarge 머신에서 실행되었습니다. 저희는 테스트 종료 이후에도 유효하게 유지되도록 보장하기 위해 SETEX 벤치마크의 만료 시간을 500으로 설정하였습니다.*

```bash
  memtier_benchmark --ratio ... -t <threads> -c 30 -n 200000 --distinct-client-seed -d 256 \
     --expiry-range=...
```

파이프라인 모드에서 `--pipeline=30`은 Dragonfly가 SET 연산으로 **10M QPS**, GET 연산으로 **15M QPS**에 도달할 수 있음을 나타냅니다.

### Dragonfly vs. Memcached

저희는 AWS의 c6gn.16xlarge 인스턴스에서 Dragonfly와 Memcached를 비교하는 작업을 수행했습니다.

비슷한 지연시간을 가진 상황에서, Dragonfly의 처리량은 쓰기 및 읽기 작업 모두에서 Memcached보다 성능이 뛰어났습니다. 쓰기 작업에서는 [Memcached의 쓰기 경로](docs/memcached_benchmark.md)에서의 경합으로 인하여 Dragonfly가 보다 적은 지연시간을 보였다는 점이 입증되었습니다.

#### SET 벤치마크

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|:---------:|:------------------:|:-----------:|:-------:|
| Dragonfly |  🟩 3844           |🟩 0.9ms     | 🟩 2.4ms |
| Memcached |   806              |   1.6ms     | 3.2ms    |

#### GET 벤치마크

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|-----------|:------------------:|:-----------:|:-------:|
| Dragonfly | 🟩 3717            |   1ms       | 2.4ms   |
| Memcached |   2100             |  🟩 0.34ms  | 🟩 0.6ms |

Memcached는 읽기 벤치마크의 지연 시간은 적었지만, 처리량도 낮았습니다.

### 메모리 효율

메모리 효율을 테스트하기 위해서, 저희는 `debug populate 5000000 key 1024` 명령어를 활용하여 Dragonfly와 Redis에 ~5GB 정도의 데이터를 채운 후, `memtier` 를 통하여 업데이트 트래픽을 전송한 후, `bgsave` 명령을 통하여 스냅샷을 시작했습니다.

이 그림은 메모리 효율 측면에서 각 서버가 어떻게 동작했는지 보여줍니다.

<img src="http://static.dragonflydb.io/repo-assets/bgsave-memusage.svg" width="70%" border="0"/>

Dragonfly는 유휴 상태에서 Redis보다 메모리 효율이 30% 더 좋았으며, 스냅샷 단계에서 메모리 사용량이 눈에 띄게 증가하지 않았습니다. Redis는 고점에서 Dragonfly에 비해 메모리 사용량이 약 3배 증가하였습니다.

Dragonfly는 스냅샷 단계를 몇 초안에 더 빨리 마쳤습니다.

Dragonfly의 메모리 효율에 대한 정보가 더 필요하시다면, 저희의 [Dashtable 문서](/docs/dashtable.md)를 참고하시기 바랍니다.


## <a name="configuration"><a/>설정

Dragonfly는 적용 가능한 Redis 인수를 지원합니다. 예를 들면, `dragonfly --requirepass=foo --bind localhost`와 같은 명령어를 사용할 수 있습니다.

Dragonfly는 현재 아래와 같은 Redis 인수들을 지원합니다 :
  * `port`: Redis 연결 포트 (`기본값: 6379`).
  * `bind`: `localhost`를 사용하여 로컬호스트 연결만 허용하거나 공용 IP 주소를 사용하여 해당 IP 주소에 연결을 허용합니다.(즉, 외부에서도 가능)
  * `requirepass`: AUTH 인증을 위한 패스워드 (`기본값: ""`).
  * `maxmemory`: 데이터베이스에서 사용하는 최대 메모리 제한(사람이 읽을 수 있는 바이트 단위) (`기본값: 0`). `maxmemory` 의 값이 `0` 이면 프로그램이 최대 메모리 사용량을 자동으로 결정합니다.
  * `dir`: Dragonfly Docker는 스냅샷을 위해 기본적으로 `/data` 폴더를 사용하고, CLI은 `""`을 사용합니다. Docker 옵션인 `-v` 을 통해서 호스트 폴더에 매핑할 수 있습니다.
  * `dbfilename`: 저장하고 불러올 데이터베이스 파일 이름 (`기본값: dump`).

아래는 Dragonfly 전용 인수 입니다 :
  * `memcached_port`: Memcached 호환 API를 위한 포트 (`기본값: disabled`).
  * `keys_output_limit`: `keys` 명령을 통해 반환 되는 최대 키의 수 (`기본값: 8192`). `keys` 명령은 위험하기 때문에, 너무 많은 키를 가져올 때 메모리 사용량이 급증하지 않도록 결과를 해당 인수만큼 잘라냅니다.
  * `dbnum`: `select` 명령에 대해 지원되는 최대 데이터베이스 수.
  * `cache_mode`: 아래의 섹션 [새로운 캐시 설계](#novel-cache-design)을 참고해주시기 바랍니다.
  * `hz`: 키가 만료되었는지를 판단하는 빈도(`기본값: 100`). 낮은 빈도는 키 방출이 느려지는 대신, 유휴 상태일 때 CPU 사용량을 줄입니다.
  * `primary_port_http_enabled`: `true` 인 경우 HTTP 콘솔로 메인 TCP 포트 접근을 허용합니다. (`기본값: true`).
  * `admin_port`: 할당된 포트에서 관리자 콘솔 접근을 활성화합니다. (`기본값: disabled`). HTTP와 RESP 프로토콜 모두를 지원합니다.
  * `admin_bind`: 주어진 주소에 관리자 콘솔 TCP 연결을 바인딩합니다. (`기본값: any`). HTTP와 RESP 프로토콜 모두를 지원합니다.
  * `admin_nopass`: 할당된 포트에 대해서 인증 토큰 없이 관리자 콘솔 접근을 활성화합니다. (`default: false`). HTTP와 RESP 프로토콜 모두를 지원합니다.
  * `cluster_mode`: 클러스터 모드가 지원됩니다. (`기본값: ""`). 현재는`emulated` 만 지원합니다.
  * `cluster_announce_ip`: 클러스터 명령을 클라이언트에게 알리는 IP 주소.


### 주요 옵션을 활용한 실행 스크립트 예시:

```bash
./dragonfly-x86_64 --logtostderr --requirepass=youshallnotpass --cache_mode=true -dbnum 1 --bind localhost --port 6379  --maxmemory=12gb --keys_output_limit=12288 --dbfilename dump.rdb
```

인수들은 `dragonfly --flagfile <filename>`을 실행하여 설정 파일을 통해서도 전달할 수 있습니다. 전달될 파일은 각 줄에 키-값 형태의 플래그 나열 하기위해 등호를 사용합니다.

로그 관리나 TLS 지원과 같은 추가 옵션을 확인하고 싶다면, `dragonfly --help` 를 실행해보시길 바랍니다.

## <a name="roadmap-status"><a/>로드맵과 상태

Dragonfly는 현재 ~185개의 Redis 명령어들과 `cas` 뿐만 아니라 모든 Memcached 명령어를 지원합니다. 이는 거의 Redis 5 API와 동등하며, Dragonfly의 다음 마일스톤은 기본 기능 을 안정화하고 복제 API를 구현하는 것입니다. 아직 구현되지 않은 필요한 명령가 있다면, 이슈를 오픈해주세요.

Draginfly 고유 복제기능을 위해, 저희는 몇 배 높은 속도를 지원할 수 있는 분산 로그 형식을 설계하고 있습니다.

복제 기능을 추가한 뒤에 저희는 Redis 3-6 API에 해당되는 누락 명령어들을 계속 추가할 예정입니다.

Dragonfly에 의해 현재 지원되는 명령어를 확인하기 위해서 [명령어 레퍼런스](https://dragonflydb.io/docs/category/command-reference)를 참고해주시기 바랍니다.

## <a name="design-decisions"><a/>설계 의사결정

### 새로운 캐시 설계

Dragonfly는 단순하고 메모리 효율적인 단일, 통합, 적응형 캐싱 알고리즘을 제공합니다.

`--cache_mode=true` 플래그를 전달하여 캐싱 모드를 활성화할 수 있습니다. 이 모드가 활성화되면, Dragonfly는 `maxmemory` 한도에 가까워질 때만, 미래에 재사용 될 가능성이 가장 낮은 항목을 방출합니다.

### 상대적인 정확성을 가진 만료 기한

만료 범위는 약 ~8년으로 제한됩니다.

밀리초 단위의 정밀한 만료 기한(PEXPIRE, PSETEX, 등)은 **2^28ms보다 큰 기한에 대해** 가장 가까운 초로 반올림됩니다. 이는 0.001% 미만의 오차를 가지며, 큰 범위에 대해 적용될 때는 수용 가능한 수준입니다. 만약 이런 방식이 사용사례에 적합하지 않다면, 문의를 주시거나 해당 사용사례를 설명하는 이슈를 오픈해주세요.

Dragonfly와 Redis의 만료 기한에 대한 구현의 차이는 [여기서 확인하실 수 있습니다](docs/differences.md).

### 네이티브 HTTP 콘솔과 Prometheus 호환 매트릭

기본적으로, Dragonfly는 메인 TCP 포트(6379)에 HTTP 접근을 허용합니다. 즉, Redis 프로토콜과 HTTP 프로토콜 모두를 통해 Dragonfly에 연결할 수 있습니다. - 서버는 연결 초기화 과정에서 프로토콜을 자동으로 인식합니다. 웹 브라우저를 통하여 시도해보시기 바랍니다. 현재 HTTP 접근은 많은 정보를 제공하지 않지만, 유용한 디버깅 및 관리 정보를 향후 추가할 예정입니다.

`:6379/metrics` 에 접근하게 되면, Prometheus 호환 매트릭을 확인할 수 있습니다.

Prometheus에서 내보내는 매트릭들은 Grafana 대시보드와 호환됩니다. 자세한 내용은 [여기](tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json)를 참조해주세요.

중요! HTTP 노솔은 안전한 네트워크 내에서 접근하도록 설계되었습니다. Dragonfly의 TCP 포트를 외부로 노출한다면, `--http_admin_console=false` 혹은 `--nohttp_admin_console`과 같은 인수를 활용하여 콘솔을 비활성화하는 것을 조언해드립니다.


## <a name="background"><a/>개발배경

Dragonfly는 2022년에 인-메모리 데이터스토어를 설계한다면 어땠을까에 대한 실험으로 시작되었습니다. 클라우드 회사에서 근무한 엔지니어 및 메모리 스토어 사용자의 경험을 바탕으로, 저희는 Dragonfly에 핵심적인 두 가지 핵심 특성을 보존해야함을 알았습니다: 모든 작업에 대한 원자성 보장과 매우 높은 처리량에 대한 밀리초 이하의 낮은 지연 시간을 보장하는 것이었습니다.

첫 번째 문제는 오늘날 퍼블릭 클라우드 환경에서 사용 가능한 서버를 사용하여 CPU, 메모리 및 I/O 자원을 어떻게 최대한 활용할 수 있을지였습니다. 이 문제를 해결하기 위해 저희는 [비공유 아키텍처(Shared Nothing Architecture)](https://en.wikipedia.org/wiki/Shared-nothing_architecture)를 사용했습니다. 이는 저희가 메모리 스토어의 각 스레드 사이의 키 공간을 분할할 수 있게하였습니다. 이를 통해 각 스레드들은 그들의 딕셔너리 데이터들의 조각을 관리할 수 있게 되었습니다. 저희는 이 조각들을 "샤드(shards)"라 불렀습니다. 비공유 아키텍처에 대한 스레드 및 I/O 관리를 위한 라이브러리는 [여기](https://github.com/romange/helio)에서 오픈소스로 제공됩니다.

멀티-키 작업에 대한 원자성 보장을 위해, Dragonfly의 트랜잭션 프레임워크를 개발하기 위해 저희는 최근 학계의 연구 발전을 활용했고 ["VLL: a lock manager redesign for main memory database systems”](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf) 논문을 채택했습니다. 비공유 아키텍처와 VLL의 선택은 우리가 뮤텍스나 스핀락을 사용하지 않고도 원자적 멀티-키 작업을 구성할 수 있게 했습니다.
이것은 저희의 PoC에 있어서 주요한 마일스톤이었고, 그 성능은 다른 상용 및 오픈소스 솔루션보다 성능이 뛰어났습니다.

두 번째 문제는 새로운 저장소를 위하여 더 효율적인 데이터 구조를 설계하는 것이었습니다. 이 목표를 달성하기 위해서 저희는 핵심 해시테이블 구조를 ["Dash: Scalable Hashing on Persistent Memory"](https://arxiv.org/pdf/2003.07302.pdf) 논문을 기반으로 작업했습니다. 이 논문은 영속적인 메모리 도메인을 중심으로 다루며, 이는 메인-메모리 저장소와 직접적인 연관관계는 없었습니다. 하지만 여전히 저희 문제를 해결하기 위해서 가장 적합했습니다. 해당 논문의 제안된 해시테이블 설계는 저희가 레디스 딕셔너리에 표현된 두 가지 특별한 특성을 유지 가능하게 해줬습니다: 데이터스토어 확장 중 증분 해싱 기능과 상태 없는 스캔 작업을 사용하여 변화하는 딕셔너리를 순회하는 능력이었습니다. 이 두 가지 속성 외에도 Dash는 CPU와 메모리 사용에서 더 효율적입니다. 저희는 다음과 같은 기능들로 더욱 혁신할 수 있었습니다:
 * TTL 레코드에 대한 효율적인 만료 처리
 * LRU와 LFU 같은 다른 캐시 전략보다 더 높은 히트율을 달성하는 새로운 캐시 방출 알고리즘과 **제로 메모리 오버헤드**.
 * 새로운 **fork-less** 스냅샷 알고리즘.

저희는 Dragonfly의 기반을 구축하고 성능에 만족하게 되었을 때, Redis와 Memcached의 기능을 구현하기 시작했습니다. 저희는 약 185개의 Redis 명령(대략적으로 Redis 5.0 API와 동등)과 13개의 Memecached 명령을 구현했습니다.

마지막으로, <br>
<em>저희의 임무는 최신 하드웨어 발전을 활용하는 클라우드 작업을 위한 멋진 설계와 초고속 처리량 그리고 비용효율적인 인-메모리 데이터스토어를 만드는 것입니다. 저희는 현재 솔루션의 제품 API들이나 제안을 유지하면서 당면 과제를 해결하고자 합니다.</em>


================================================
FILE: README.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>

[![ci-tests](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml/badge.svg)](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml) [![Twitter URL](https://img.shields.io/twitter/follow/dragonflydbio?style=social)](https://twitter.com/dragonflydbio)

> Before moving on, please consider giving us a GitHub star ⭐️. Thank you!

Other languages:  [简体中文](README.zh-CN.md) [日本語](README.ja-JP.md) [한국어](README.ko-KR.md) [Português](README.pt-BR.md)

[Website](https://www.dragonflydb.io/) • [Docs](https://dragonflydb.io/docs) • [Quick Start](https://www.dragonflydb.io/docs/getting-started) • [Community Discord](https://discord.gg/HsPjXGVH85) • [Dragonfly Forum](https://dragonfly.discourse.group/) • [Join the Dragonfly Community](https://www.dragonflydb.io/community)

[GitHub Discussions](https://github.com/dragonflydb/dragonfly/discussions) • [GitHub Issues](https://github.com/dragonflydb/dragonfly/issues) • [Contributing](https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md) • [AI Agents Guide](AGENTS.md) • [Dragonfly Cloud](https://www.dragonflydb.io/cloud)

## The world's most efficient in-memory data store

Dragonfly is an in-memory data store built for modern application workloads.

Fully compatible with Redis and Memcached APIs, Dragonfly requires no code changes to adopt. Compared to legacy in-memory datastores, Dragonfly delivers 25X more throughput, higher cache hit rates with lower tail latency, and can run on up to 80% less resources for the same sized workload.

## Contents

- [Benchmarks](#benchmarks)
- [Quick start](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start)
- [Configuration](#configuration)
- [Roadmap and status](#roadmap-status)
- [Design decisions](#design-decisions)
- [Background](#background)
- [Build from source](./docs/build-from-source.md)

## <a name="benchmarks"><a/>Benchmarks

We first compare Dragonfly with Redis on `m5.large` instance which is commonly used to run Redis
due to its single-threaded architecture. The benchmark program runs from another
load-test instance (c5n) in the same AZ using `memtier_benchmark  -c 20 --test-time 100 -t 4 -d 256 --distinct-client-seed`

Dragonfly shows a comparable performance:

1. SETs (`--ratio 1:0`):

|  Redis                                   |      DF                                |
| -----------------------------------------|----------------------------------------|
| QPS: 159K, P99.9: 1.16ms, P99: 0.82ms    | QPS:173K, P99.9: 1.26ms, P99: 0.9ms    |
|                                          |                                        |

2. GETs (`--ratio 0:1`):

|  Redis                                  |      DF                                |
| ----------------------------------------|----------------------------------------|
| QPS: 194K, P99.9: 0.8ms, P99: 0.65ms    | QPS: 191K, P99.9: 0.95ms, P99: 0.8ms   |

The benchmark above shows that the algorithmic layer inside DF that allows it to scale vertically
does not take a large toll when running single-threaded.

However, if we take a bit stronger instance (m5.xlarge), the gap between DF and Redis starts growing.
(`memtier_benchmark  -c 20 --test-time 100 -t 6 -d 256 --distinct-client-seed`):
1. SETs (`--ratio 1:0`):

|  Redis                                  |      DF                                |
| ----------------------------------------|----------------------------------------|
| QPS: 190K, P99.9: 2.45ms, P99: 0.97ms   |  QPS: 279K , P99.9: 1.95ms, P99: 1.48ms|

2. GETs (`--ratio 0:1`):

|  Redis                                  |      DF                                |
| ----------------------------------------|----------------------------------------|
| QPS: 220K, P99.9: 0.98ms , P99: 0.8ms   |  QPS: 305K, P99.9: 1.03ms, P99: 0.87ms |


Dragonfly throughput capacity continues to grow with instance size,
while single-threaded Redis is bottlenecked on CPU and reaches local maxima in terms of performance.

<img src="http://static.dragonflydb.io/repo-assets/aws-throughput.svg" width="80%" border="0"/>

If we compare Dragonfly and Redis on the most network-capable instance c6gn.16xlarge,
Dragonfly showed a 25X increase in throughput compared to Redis single process, crossing 3.8M QPS.

Dragonfly's 99th percentile latency metrics at its peak throughput:

| op    | r6g   | c6gn  | c7g   |
|-------|-------|-------|-------|
| set   | 0.8ms | 1ms   | 1ms   |
| get   | 0.9ms | 0.9ms | 0.8ms |
| setex | 0.9ms | 1.1ms | 1.3ms |

*All benchmarks were performed using `memtier_benchmark` (see below) with number of threads tuned per server and instance type. `memtier` was run on a separate c6gn.16xlarge machine. We set the expiry time to 500 for the SETEX benchmark to ensure it would survive the end of the test.*

```bash
  memtier_benchmark --ratio ... -t <threads> -c 30 -n 200000 --distinct-client-seed -d 256 \
     --expiry-range=...
```

In pipeline mode `--pipeline=30`, Dragonfly reaches **10M QPS** for SET and **15M QPS** for GET operations.

### Dragonfly vs. Memcached

We compared Dragonfly with Memcached on a c6gn.16xlarge instance on AWS.

With a comparable latency, Dragonfly throughput outperformed Memcached throughput in both write and read workloads. Dragonfly demonstrated better latency in write workloads due to contention on the [write path in Memcached](docs/memcached_benchmark.md).

#### SET benchmark

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|:---------:|:------------------:|:-----------:|:-------:|
| Dragonfly |  🟩 3844           |🟩 0.9ms     | 🟩 2.4ms |
| Memcached |   806              |   1.6ms     | 3.2ms    |

#### GET benchmark

| Server    | QPS(thousands qps) | latency 99% | 99.9%   |
|-----------|:------------------:|:-----------:|:-------:|
| Dragonfly | 🟩 3717            |   1ms       | 2.4ms   |
| Memcached |   2100             |  🟩 0.34ms  | 🟩 0.6ms |


Memcached exhibited lower latency for the read benchmark, but also lower throughput.

### Memory efficiency

To test memory efficiency, we filled Dragonfly and Redis with ~5GB of data using the `debug populate 5000000 key 1024` command, sent update traffic with `memtier`, and kicked off the snapshotting with the `bgsave` command.

This figure demonstrates how each server behaved in terms of memory efficiency.

<img src="http://static.dragonflydb.io/repo-assets/bgsave-memusage.svg" width="70%" border="0"/>

Dragonfly was 30% more memory efficient than Redis in the idle state and did not show any visible increase in memory use during the snapshot phase. At peak, Redis memory use increased to almost 3X that of Dragonfly.

Dragonfly finished the snapshot faster, within a few seconds.

For more info about memory efficiency in Dragonfly, see our [Dashtable doc](/docs/dashtable.md).


## <a name="configuration"><a/>Configuration

Dragonfly supports common Redis arguments where applicable. For example, you can run: `dragonfly --requirepass=foo --bind localhost`.

Dragonfly currently supports the following Redis-specific arguments:
 * `port`: Redis connection port (`default: 6379`).
 * `bind`: Use `localhost` to only allow localhost connections or a public IP address to allow connections **to that IP** address (i.e. from outside too). Use `0.0.0.0` to allow all IPv4.
 * `requirepass`: The password for AUTH authentication (`default: ""`).
 * `maxmemory`: Limit on maximum memory (in human-readable bytes) used by the database (`default: 0`). A `maxmemory` value of `0` means the program will automatically determine its maximum memory usage.
 * `dir`: Dragonfly Docker uses the `/data` folder for snapshotting by default, the CLI uses `""`. You can use the `-v` Docker option to map it to your host folder.
 * `dbfilename`: The filename to save and load the database (`default: dump`).

There are also some Dragonfly-specific arguments:
 * `memcached_port`: The port to enable Memcached-compatible API on (`default: disabled`).
 * `keys_output_limit`: Maximum number of returned keys in `keys` command (`default: 8192`). Note that `keys` is a dangerous command. We truncate its result to avoid a blowup in memory use when fetching too many keys.
 * `dbnum`: Maximum number of supported databases for `select`.
 * `cache_mode`: See the [novel cache design](#novel-cache-design) section below.
 * `hz`: Key expiry evaluation frequency (`default: 100`). Lower frequency uses less CPU when idle at the expense of a slower eviction rate.
 * `snapshot_cron`: Cron schedule expression for automatic backup snapshots using standard cron syntax with the granularity of minutes (`default: ""`).
   Here are some cron schedule expression examples below, and feel free to read more about this argument in our [documentation](https://www.dragonflydb.io/docs/managing-dragonfly/backups#the-snapshot_cron-flag).

   | Cron Schedule Expression | Description                                |
   |--------------------------|--------------------------------------------|
   | `* * * * *`              | At every minute                            |
   | `*/5 * * * *`            | At every 5th minute                        |
   | `5 */2 * * *`            | At minute 5 past every 2nd hour            |
   | `0 0 * * *`              | At 00:00 (midnight) every day              |
   | `0 6 * * 1-5`            | At 06:00 (dawn) from Monday through Friday |

 * `primary_port_http_enabled`: Allows accessing HTTP console on main TCP port if `true` (`default: true`).
 * `admin_port`: To enable admin access to the console on the assigned port (`default: disabled`). Supports both HTTP and RESP protocols.
 * `admin_bind`: To bind the admin console TCP connection to a given address (`default: any`). Supports both HTTP and RESP protocols.
 * `admin_nopass`: To enable open admin access to console on the assigned port, without auth token needed (`default: false`). Supports both HTTP and RESP protocols.
 * `cluster_mode`: Cluster mode supported (`default: ""`). Currently supports only `emulated`.
 * `cluster_announce_ip`: The IP that cluster commands announce to the client.
 * `announce_port`: The port that cluster commands announce to the client, and to replication master.

### Example start script with popular options:

```bash
./dragonfly-x86_64 --logtostderr --requirepass=youshallnotpass --cache_mode=true -dbnum 1 --bind localhost --port 6379 --maxmemory=12gb --keys_output_limit=12288 --dbfilename dump.rdb
```

Arguments can be also provided via:
 * `--flagfile <filename>`: The file should list one flag per line, with equal signs instead of spaces for key-value flags. No quotes are needed for flag values.
 * Setting environment variables. Set `DFLY_x`, where `x` is the exact name of the flag, case sensitive.

For more options like logs management or TLS support, run `dragonfly --help`.

## <a name="roadmap-status"><a/>Roadmap and status

Dragonfly currently supports ~185 Redis commands and all Memcached commands besides `cas`. Almost on par with the Redis 5 API, Dragonfly's next milestone will be to stabilize basic functionality and implement the replication API. If there is a command you need that is not implemented yet, please open an issue.

For Dragonfly-native replication, we are designing a distributed log format that will support order-of-magnitude higher speeds.

Following the replication feature, we will continue adding missing commands for Redis versions 3-6 APIs.

Please see our [Command Reference](https://dragonflydb.io/docs/category/command-reference) for the current commands supported by Dragonfly.

## <a name="design-decisions"><a/> Design decisions

### Novel cache design

Dragonfly has a single, unified, adaptive caching algorithm that is simple and memory efficient.

You can enable caching mode by passing the `--cache_mode=true` flag. Once this mode is on, Dragonfly will evict items least likely to be stumbled upon in the future but only when it is near the `maxmemory` limit.

### Expiration deadlines with relative accuracy

Expiration ranges are limited to ~8 years.

Expiration deadlines with millisecond precision (PEXPIRE, PSETEX, etc.) are rounded to the closest second **for deadlines greater than 2^28ms**, which has less than 0.001% error and should be acceptable for large ranges. If this is not suitable for your use case, get in touch or open an issue explaining your case.

For more detailed differences between Dragonfly expiration deadlines and Redis implementations, [see here](docs/differences.md).

### Native HTTP console and Prometheus-compatible metrics

By default, Dragonfly allows HTTP access via its main TCP port (6379). That's right, you can connect to Dragonfly via Redis protocol and via HTTP protocol — the server recognizes the protocol automatically during the connection initiation. Go ahead and try it with your browser. HTTP access currently does not have much info but will include useful debugging and management info in the future.

Go to the URL `:6379/metrics` to view Prometheus-compatible metrics.

The Prometheus exported metrics are compatible with the Grafana dashboard, [see here](tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json).


Important! The HTTP console is meant to be accessed within a safe network. If you expose Dragonfly's TCP port externally, we advise you to disable the console with `--http_admin_console=false` or `--nohttp_admin_console`.


## <a name="background"><a/>Background

Dragonfly started as an experiment to see how an in-memory datastore could look if it was designed in 2022. Based on lessons learned from our experience as users of memory stores and engineers who worked for cloud companies, we knew that we need to preserve two key properties for Dragonfly: Atomicity guarantees for all operations and low, sub-millisecond latency over very high throughput.

Our first challenge was how to fully utilize CPU, memory, and I/O resources using servers that are available today in public clouds. To solve this, we use [shared-nothing architecture](https://en.wikipedia.org/wiki/Shared-nothing_architecture), which allows us to partition the keyspace of the memory store between threads so that each thread can manage its own slice of dictionary data. We call these slices "shards". The library that powers thread and I/O management for shared-nothing architecture is open-sourced [here](https://github.com/romange/helio).

To provide atomicity guarantees for multi-key operations, we use the advancements from recent academic research. We chose the paper ["VLL: a lock manager redesign for main memory database systems”](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf) to develop the transactional framework for Dragonfly. The choice of shared-nothing architecture and VLL allowed us to compose atomic multi-key operations without using mutexes or spinlocks. This was a major milestone for our PoC and its performance stood out from other commercial and open-source solutions.

Our second challenge was to engineer more efficient data structures for the new store. To achieve this goal, we based our core hashtable structure on the paper ["Dash: Scalable Hashing on Persistent Memory"](https://arxiv.org/pdf/2003.07302.pdf). The paper itself is centered around the persistent memory domain and is not directly related to main-memory stores, but it's still most applicable to our problem. The hashtable design suggested in the paper allowed us to maintain two special properties that are present in the Redis dictionary: The incremental hashing ability during datastore growth the ability to traverse the dictionary under changes using a stateless scan operation. In addition to these two properties, Dash is more efficient in CPU and memory use. By leveraging Dash's design, we were able to innovate further with the following features:
 * Efficient record expiry for TTL records.
 * A novel cache eviction algorithm that achieves higher hit rates than other caching strategies like LRU and LFU with **zero memory overhead**.
 * A novel **fork-less** snapshotting algorithm.

Once we had built the foundation for Dragonfly and [we were happy with its performance](#benchmarks), we went on to implement the Redis and Memcached functionality. We have to date implemented ~185 Redis commands (roughly equivalent to Redis 5.0 API) and 13 Memcached commands.

And finally, <br>
<em>Our mission is to build a well-designed, ultra-fast, cost-efficient in-memory datastore for cloud workloads that takes advantage of the latest hardware advancements. We intend to address the pain points of current solutions while preserving their product APIs and propositions.</em>


================================================
FILE: README.pt-BR.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>

[![ci-tests](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml/badge.svg)](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml) [![Twitter URL](https://img.shields.io/twitter/follow/dragonflydbio?style=social)](https://twitter.com/dragonflydbio)

> Antes de continuar, considere deixar uma estrela no nosso repositório ⭐️. Obrigado!

Outros idiomas: [简体中文](README.zh-CN.md) [日本語](README.ja-JP.md) [한국어](README.ko-KR.md) [English](README.md)

[Site oficial](https://www.dragonflydb.io/) • [Documentação](https://dragonflydb.io/docs) • [Guia Rápido](https://www.dragonflydb.io/docs/getting-started) • [Discord da Comunidade](https://discord.gg/HsPjXGVH85) • [Fórum Dragonfly](https://dragonfly.discourse.group/) • [Participe da Comunidade](https://www.dragonflydb.io/community)

[Discussões no GitHub](https://github.com/dragonflydb/dragonfly/discussions) • [Issues no GitHub](https://github.com/dragonflydb/dragonfly/issues) • [Contribuindo](https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md) • [Dragonfly Cloud](https://www.dragonflydb.io/cloud)

## O armazenamento de dados em memória mais eficiente do mundo

Dragonfly é um armazenamento de dados em memória projetado para cargas de trabalho modernas.

Totalmente compatível com as APIs do Redis e Memcached, o Dragonfly não requer alterações de código para adoção. Em comparação com armazenamentos legados, o Dragonfly oferece 25x mais throughput, maiores taxas de acerto em cache com menor latência de cauda e pode operar com até 80% menos recursos para a mesma carga.

## Conteúdo

- [Benchmarks](#benchmarks)
- [Guia rápido](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start)
- [Configuração](#configuration)
- [Roteiro e status](#roadmap-status)
- [Decisões de design](#design-decisions)
- [Contexto](#background)
- [Compilação a partir do código-fonte](./docs/build-from-source.md)

## <a name="benchmarks"><a/>Benchmarks

Primeiro comparamos o Dragonfly com o Redis em uma instância `m5.large`, frequentemente usada para rodar Redis devido à sua arquitetura single-threaded. O benchmark roda de outra instância de carga (c5n) na mesma AZ usando `memtier_benchmark  -c 20 --test-time 100 -t 4 -d 256 --distinct-client-seed`.

O Dragonfly mostra desempenho comparável:

1. SETs (`--ratio 1:0`):

| Redis                                 | DF                                   |
| ------------------------------------- | ------------------------------------ |
| QPS: 159K, P99.9: 1.16ms, P99: 0.82ms | QPS: 173K, P99.9: 1.26ms, P99: 0.9ms |

2. GETs (`--ratio 0:1`):

| Redis                                | DF                                   |
| ------------------------------------ | ------------------------------------ |
| QPS: 194K, P99.9: 0.8ms, P99: 0.65ms | QPS: 191K, P99.9: 0.95ms, P99: 0.8ms |

O benchmark mostra que a camada algorítmica do DF, que permite escalabilidade vertical, não gera sobrecarga significativa em execução single-thread.

Com uma instância mais forte (m5.xlarge), a diferença entre DF e Redis cresce.
(`memtier_benchmark  -c 20 --test-time 100 -t 6 -d 256 --distinct-client-seed`):

1. SETs (`--ratio 1:0`):

| Redis                                 | DF                                    |
| ------------------------------------- | ------------------------------------- |
| QPS: 190K, P99.9: 2.45ms, P99: 0.97ms | QPS: 279K, P99.9: 1.95ms, P99: 1.48ms |

2. GETs (`--ratio 0:1`):

| Redis                                | DF                                    |
| ------------------------------------ | ------------------------------------- |
| QPS: 220K, P99.9: 0.98ms, P99: 0.8ms | QPS: 305K, P99.9: 1.03ms, P99: 0.87ms |

A capacidade de throughput do Dragonfly cresce com o tamanho da instância, enquanto o Redis single-thread atinge o limite de CPU.

<img src="http://static.dragonflydb.io/repo-assets/aws-throughput.svg" width="80%" border="0"/>

Na instância c6gn.16xlarge (maior capacidade de rede), o Dragonfly atinge 25x mais throughput que o Redis, superando 3.8M QPS.

Latência de 99% no pico de throughput do Dragonfly:

| op    | r6g   | c6gn  | c7g   |
| ----- | ----- | ----- | ----- |
| set   | 0.8ms | 1ms   | 1ms   |
| get   | 0.9ms | 0.9ms | 0.8ms |
| setex | 0.9ms | 1.1ms | 1.3ms |

_Todos os benchmarks foram realizados com `memtier_benchmark`, ajustando o número de threads conforme a instância. O `memtier` rodava em uma c6gn.16xlarge separada. No benchmark SETEX, foi definido tempo de expiração de 500 para garantir sobrevivência até o final do teste._

```bash
memtier_benchmark --ratio ... -t <threads> -c 30 -n 200000 --distinct-client-seed -d 256 \
   --expiry-range=...
```

Em modo pipeline `--pipeline=30`, o Dragonfly alcança **10M QPS** em SET e **15M QPS** em GET.

### Dragonfly vs. Memcached

Comparamos Dragonfly e Memcached em uma c6gn.16xlarge na AWS.

Com latência comparável, o throughput do Dragonfly superou o do Memcached tanto em leitura quanto escrita. Em escrita, a latência do Dragonfly foi melhor devido à contenção no [caminho de escrita do Memcached](docs/memcached_benchmark.md).

#### Benchmark de SET

| Servidor  | QPS (milhares) | latência 99% |  99.9%   |
| :-------: | :------------: | :----------: | :------: |
| Dragonfly |    🟩 3844     |   🟩 0.9ms   | 🟩 2.4ms |
| Memcached |      806       |    1.6ms     |  3.2ms   |

#### Benchmark de GET

| Servidor  | QPS (milhares) | latência 99% |  99.9%   |
| --------- | :------------: | :----------: | :------: |
| Dragonfly |    🟩 3717     |     1ms      |  2.4ms   |
| Memcached |      2100      |  🟩 0.34ms   | 🟩 0.6ms |

Memcached teve menor latência em leitura, mas também menor throughput.

### Eficiência de memória

Para testar a eficiência de memória, preenchemos o Dragonfly e o Redis com \~5GB de dados usando o comando `debug populate 5000000 key 1024`, enviamos tráfego de atualização com `memtier` e iniciamos o snapshot com o comando `bgsave`.

A figura abaixo demonstra como cada servidor se comportou em termos de eficiência de memória.

<img src="http://static.dragonflydb.io/repo-assets/bgsave-memusage.svg" width="70%" border="0"/>

O Dragonfly foi 30% mais eficiente em memória que o Redis em estado ocioso e não apresentou aumento visível no uso de memória durante a fase de snapshot. No pico, o uso de memória do Redis aumentou para quase 3 vezes o do Dragonfly.

O Dragonfly concluiu o snapshot mais rápido, em poucos segundos.

Para mais informações sobre eficiência de memória no Dragonfly, veja nosso [documento sobre Dashtable](/docs/dashtable.md).

## <a name="configuration"><a/>Configuração

O Dragonfly suporta argumentos comuns do Redis quando aplicável. Por exemplo, você pode executar: `dragonfly --requirepass=foo --bind localhost`.

Atualmente, o Dragonfly suporta os seguintes argumentos específicos do Redis:

- `port`: Porta de conexão Redis (`padrão: 6379`).
- `bind`: Use `localhost` para permitir conexões apenas locais ou um IP público para permitir conexões **para esse IP** (ou seja, externas também). Use `0.0.0.0` para permitir todas as conexões IPv4.
- `requirepass`: Senha para autenticação AUTH (`padrão: ""`).
- `maxmemory`: Limite de memória máxima (em bytes legíveis) usada pelo banco (`padrão: 0`). Um valor `0` significa que o programa determinará automaticamente o uso máximo de memória.
- `dir`: O Docker do Dragonfly usa a pasta `/data` para snapshots por padrão, o CLI usa `""`. Você pode usar a opção `-v` do Docker para mapear para uma pasta do host.
- `dbfilename`: Nome do arquivo para salvar/carregar o banco de dados (`padrão: dump`).

Também há argumentos específicos do Dragonfly:

- `memcached_port`: Porta para habilitar API compatível com Memcached (`padrão: desabilitado`).

- `keys_output_limit`: Número máximo de chaves retornadas no comando `keys` (`padrão: 8192`). Note que `keys` é um comando perigoso. Limitamos o resultado para evitar explosão de uso de memória ao buscar muitas chaves.

- `dbnum`: Número máximo de bancos de dados suportados para `select`.

- `cache_mode`: Veja a seção sobre [design de cache inovador](#novel-cache-design).

- `hz`: Frequência de avaliação de expiração de chave (`padrão: 100`). Frequências menores usam menos CPU em idle, mas têm menor taxa de remoção.

- `snapshot_cron`: Expressão cron para snapshots automáticos usando sintaxe cron padrão, com granularidade de minutos (`padrão: ""`).

  Exemplos:

  | Expressão Cron | Descrição                           |
  | -------------- | ----------------------------------- |
  | `* * * * *`    | A cada minuto                       |
  | `*/5 * * * *`  | A cada 5 minutos                    |
  | `5 */2 * * *`  | No minuto 5 de cada 2 horas         |
  | `0 0 * * *`    | Às 00:00 (meia-noite) todos os dias |
  | `0 6 * * 1-5`  | Às 06:00 (manhã) de segunda a sexta |

- `primary_port_http_enabled`: Permite acesso ao console HTTP na porta TCP principal se `true` (`padrão: true`).

- `admin_port`: Habilita acesso admin ao console na porta atribuída (`padrão: desabilitado`). Suporta protocolos HTTP e RESP.

- `admin_bind`: Define o IP de binding do console admin (`padrão: qualquer`). Suporta HTTP e RESP.

- `admin_nopass`: Habilita acesso admin sem autenticação (`padrão: false`). Suporta HTTP e RESP.

- `cluster_mode`: Modo cluster suportado (`padrão: ""`). Atualmente só `emulated`.

- `cluster_announce_ip`: IP que os comandos de cluster anunciam ao cliente.

- `announce_port`: Porta que os comandos de cluster anunciam ao cliente e ao master de replicação.

### Exemplo de script de inicialização com opções populares:

```bash
./dragonfly-x86_64 --logtostderr --requirepass=youshallnotpass --cache_mode=true -dbnum 1 --bind localhost --port 6379 --maxmemory=12gb --keys_output_limit=12288 --dbfilename dump.rdb
```

Argumentos também podem ser passados via:

- `--flagfile <arquivo>`: O arquivo deve conter um flag por linha, com `=` em vez de espaços para flags com valor. Não usar aspas.
- Variáveis de ambiente. Use `DFLY_x`, onde `x` é o nome exato do flag (case sensitive).

Para mais opções como logs ou suporte a TLS, execute `dragonfly --help`.

## <a name="roadmap-status"><a/>Roadmap e status

Atualmente o Dragonfly suporta \~185 comandos Redis e todos os comandos Memcached exceto `cas`. Já quase no nível da API do Redis 5, o próximo marco é estabilizar as funcionalidades básicas e implementar a API de replicação. Caso precise de um comando ainda não implementado, abra uma issue.

Para replicação nativa do Dragonfly, estamos projetando um formato de log distribuído que suportará velocidades ordens de magnitude maiores.

Após a replicação, continuaremos adicionando comandos faltantes das versões 3 a 6 do Redis.

Consulte nossa [Referência de Comandos](https://dragonflydb.io/docs/category/command-reference) para a lista atual.

## <a name="design-decisions"><a/>Decisões de design

### Design de cache inovador

O Dragonfly tem um algoritmo de cache adaptativo, unificado e simples, eficiente em memória.

Você pode habilitar o modo cache com o flag `--cache_mode=true`. Esse modo remove itens menos prováveis de serem acessados no futuro, mas **somente** próximo ao limite de `maxmemory`.

### Expiração com precisão relativa

Intervalos de expiração são limitados a \~8 anos.

Deadlines com precisão de milissegundos (PEXPIRE, PSETEX etc.) são arredondadas para o segundo mais próximo **quando superiores a 2^28ms**, com erro menor que 0.001%. Se isso for inadequado, entre em contato ou abra uma issue explicando o caso.

Para mais diferenças entre os deadlines do Dragonfly e do Redis, [clique aqui](docs/differences.md).

### Console HTTP nativo e métricas compatíveis com Prometheus

Por padrão, o Dragonfly permite acesso HTTP via porta TCP principal (6379). Ou seja, você pode conectar via protocolo Redis ou HTTP — o servidor reconhece automaticamente o protocolo ao conectar. Acesse com o navegador. Hoje o console HTTP tem pouca informação, mas no futuro incluirá debug e info de gerenciamento.

Acesse `:6379/metrics` para ver métricas Prometheus-compatíveis.

As métricas são compatíveis com o dashboard do Grafana, [veja aqui](tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json).

Importante: o console HTTP deve ser acessado em rede segura. Se expor a porta TCP do Dragonfly externamente, desabilite o console com `--http_admin_console=false` ou `--nohttp_admin_console`.

## <a name="background"><a/>Contexto

O Dragonfly começou como um experimento para repensar um datastore in-memory em 2022. Baseado em lições como usuários e engenheiros de cloud, sabíamos que dois princípios deveriam ser preservados: garantias de atomicidade e latência sub-millisecond sob alto throughput.

Desafio 1: Utilizar ao máximo CPU, memória e I/O em servidores modernos. A solução foi adotar [arquitetura shared-nothing](https://en.wikipedia.org/wiki/Shared-nothing_architecture), particionando o keyspace entre threads. Chamamos os slices de “shards”. A biblioteca que gerencia threads e I/O foi open-sourceada [aqui](https://github.com/romange/helio).

Para garantir atomicidade em operações multi-key, usamos avanços recentes da pesquisa acadêmica. Escolhemos o paper ["VLL: a lock manager redesign for main memory database systems"](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf) como base para o framework transacional. A combinação VLL + shared-nothing permitiu compor operações atômicas multi-key **sem mutex ou spinlock**. O resultado foi um PoC com performance superior a outras soluções.

Desafio 2: Estruturas de dados mais eficientes. Baseamos o hashtable no paper ["Dash: Scalable Hashing on Persistent Memory"](https://arxiv.org/pdf/2003.07302.pdf). Mesmo voltado à memória persistente, foi aplicável. O design permitiu manter:

- Hash incremental durante crescimento.
- Scan stateless mesmo com mudanças.

Além disso, o Dash é mais eficiente em uso de CPU/memória. Com esse design, inovamos ainda com:

- Expiração eficiente para registros TTL.
- Algoritmo de cache com mais hits que LRU/LFU com **zero overhead**.
- Algoritmo de snapshot **sem fork**.

Com essa base pronta e [performance satisfatória](#benchmarks), implementamos as APIs Redis e Memcached (\~185 comandos Redis, equivalente ao Redis 5.0, e 13 do Memcached).

Por fim, <br> <em>Nossa missão é construir um datastore in-memory rápido, eficiente e bem projetado para cargas em nuvem, aproveitando o hardware moderno. Queremos resolver as dores das soluções atuais mantendo APIs e propostas de valor.</em>


================================================
FILE: README.zh-CN.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>


[![ci-tests](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml/badge.svg)](https://github.com/dragonflydb/dragonfly/actions/workflows/ci.yml) [![Twitter URL](https://img.shields.io/twitter/follow/dragonflydbio?style=social)](https://twitter.com/dragonflydbio)

> 在您继续之前，请考虑给我们一个 GitHub 星标 ⭐️。谢谢！

其他语言:  [English](README.md) [日本語](README.ja-JP.md) [한국어](README.ko-KR.md) [Português](README.pt-BR.md)

[主页](https://dragonflydb.io/) • [快速入门](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start) • [社区 Discord](https://discord.gg/HsPjXGVH85) • [Dragonfly 论坛](https://dragonfly.discourse.group/) • [加入 Dragonfly 社区](https://www.dragonflydb.io/community)

[GitHub Discussions](https://github.com/dragonflydb/dragonfly/discussions) • [GitHub Issues](https://github.com/dragonflydb/dragonfly/issues) • [贡献指南](https://github.com/dragonflydb/dragonfly/blob/main/CONTRIBUTING.md)

## 全世界最快的内存数据库

Dragonfly是一种针对现代应用程序负荷需求而构建的内存数据库，完全兼容Redis和Memcached的 API，迁移时无需修改任何代码。相比于这些传统的内存数据库，Dragonfly提供了其25倍的吞吐量，高缓存命中率和低尾延迟，并且对于相同大小的工作负载运行资源最多可减少80%。

## 目录

- [基准测试](#基准测试)
- [快速入门](https://github.com/dragonflydb/dragonfly/tree/main/docs/quick-start)
- [配置方法](#配置方法)
- [开发路线和开发现状](#开发路线和开发现状)
- [设计决策](#设计决策)
- [开发背景](#开发背景)

## <a name="基准测试"><a/> 基准测试

<img src="http://static.dragonflydb.io/repo-assets/aws-throughput.svg" width="80%" border="0"/>

Dragonfly在c6gn.16xlarge上达到了每秒380万个查询（QPS），相比于Redis，吞吐量提高了25倍。

在Dragonfly的峰值吞吐量下，P99延迟如下：

| op    | r6g   | c6gn  | c7g   |
| ----- | ----- | ----- | ----- |
| set   | 0.8ms | 1ms   | 1ms   |
| get   | 0.9ms | 0.9ms | 0.8ms |
| setex | 0.9ms | 1.1ms | 1.3ms |

*所有基准测试均使用`memtier_benchmark`（见下文），根据服务器类型和实例类型调整线程数。`memtier`运行在独立的c6gn.16xlarge机器上。对于setex基准测试，我们使用了500的到期范围，以便其能够存活直到测试结束。*

```bash
  memtier_benchmark --ratio ... -t <threads> -c 30 -n 200000 --distinct-client-seed -d 256 \
     --expiry-range=...
```

当以管道模式运行，并设置参数`--pipeline=30`时，Dragonfly可以实现**10M qps**的SET操作和 **15M qps**的GET操作。

### Memcached / Dragonfly

我们在 AWS 的 `c6gn.16xlarge` 实例上比较了 memcached 和 Dragonfly。如下图所示，与 memcached 相比，Dragonfly 的吞吐量在读写两方面上都占据了优势，并且在延迟方面也还不错。对于写入工作，Dragonfly 的延迟更低，这是由于在 memcached 的写入路径上存在竞争（请参见[此处](docs/memcached_benchmark.md)）。

#### SET benchmark

|  Server   | QPS(thousands qps) | latency 99% |  99.9%  |
| :-------: | :----------------: | :---------: | :-----: |
| Dragonfly |       🟩 3844       |   🟩 0.9ms   | 🟩 2.4ms |
| Memcached |        806         |    1.6ms    |  3.2ms  |

#### GET benchmark

| Server    | QPS(thousands qps) | latency 99% |  99.9%  |
| --------- | :----------------: | :---------: | :-----: |
| Dragonfly |       🟩 3717       |     1ms     |  2.4ms  |
| Memcached |        2100        |  🟩 0.34ms   | 🟩 0.6ms |


对于读取基准测试，Memcached 表现出了更低的延迟，但在吞吐量方面比不上Dragonfly。

### 内存效率

在接下来的测试中，我们使用 `debug populate 5000000 key 1024` 命令向 Dragonfly 和 Redis 分别写入了约 5GB 的数据。然后我们使用 `memtier` 发送更新流量并使用 `bgsave` 命令启动快照。下图清楚地展示了这两个服务器在内存效率方面的表现。

<img src="http://static.dragonflydb.io/repo-assets/bgsave-memusage.svg" width="70%" border="0"/>

在空闲状态下，Dragonfly 比 Redis 节省约 30% 的内存。
在快照阶段，Dragonfly 也没有显示出任何明显的内存增加。
但同时，Redis 在峰值时的内存几乎达到了 Dragonfly 的 3 倍。
Dragonfly 完成快照也很快，仅在启动后几秒钟内就完成了。
有关 Dragonfly 内存效率的更多信息，请参见 [dashtable 文档](/docs/dashtable.md)。


## <a name="开发路线和开发现状"><a/>配置方法

Dragonfly 支持 Redis 的常见参数。
例如，您可以运行：`dragonfly --requirepass=foo --bind localhost`。

目前，Dragonfly 支持以下 Redis 特定参数：

* `port`：Redis 连接端口，默认为 `6379`。
* `bind`：使用本地主机名仅允许本地连接，使用公共 IP 地址允许外部连接到**该 IP 地址**。
* `requirepass`：AUTH 认证密码，默认为空 `""`。
* `maxmemory`：限制数据库使用的最大内存（以字节为单位）。`0` 表示程序将自动确定其最大内存使用量。默认为 `0`。
* `dir`：默认情况下，dragonfly docker 使用 `/data` 文件夹进行快照。CLI 使用的是 `""`。你可以使用 `-v` docker 选项将其映射到主机文件夹。
* `dbfilename`：保存/加载数据库的文件名。默认为 `dump`；

此外，还有 Dragonfly 特定的参数选项：

* `memcached_port`：在此端口上启用 memcached 兼容的 API。默认禁用。

* `keys_output_limit`：在`keys` 命令中返回的最大键数。默认为 `8192`。

  `keys` 命令是危险命令。我们会截断结果以避免在获取太多键时内存溢出。

* `dbnum`：`select` 支持的最大数据库数。

* `cache_mode`：请参见下面的 [缓存](#全新的缓存设计) 部分。

* `hz`：键到期评估频率。默认为 `100`。空闲时，使用较低的频率可以占用较少的 CPU资源，但这会导致清理过期键的速度下降。

* `snapshot_cron`：定时自动备份快照的 cron 表达式，使用标准的、精确到分钟的 cron 语法。默认为空 `""`。

  下面是一些 cron 表达式的示例，更多关于此参数的细节请参见[文档](https://www.dragonflydb.io/docs/managing-dragonfly/backups#the-snapshot_cron-flag)。

  | Cron 表达式      | 描述                               |
  |---------------|----------------------------------|
  | `* * * * *`   | 每分钟                              |
  | `*/5 * * * *` | 每隔 5 分钟 (00:00, 00:05, 00:10...) |
  | `5 */2 * * *` | 每隔 2 小时的第 5 分钟                   |
  | `0 0 * * *`   | 每天的 00:00 午夜                     |
  | `0 6 * * 1-5` | 从星期一到星期五的每天 06:00 黎明             |
* `primary_port_http_enabled`：如果为 true，则允许在主 TCP 端口上访问 HTTP 控制台。默认为 `true`。

* `admin_port`：如果设置，将在指定的端口上启用对控制台的管理访问。支持 HTTP 和 RESP 协议。默认禁用。

* `admin_bind`：如果设置，将管理控制台 TCP 连接绑定到给定地址。支持 HTTP 和 RESP 协议。默认为 `any`。

* `admin_nopass`: 如果设置，允许在不提供任何认证令牌的情况下，通过指定的端口访问管理控制台。同时支持 HTTP 和 RESP 协议。 默认为 `false`。

* `cluster_mode`：支持集群模式。目前仅支持 `emulated`。默认为空 `""`。

* `cluster_announce_ip`：集群模式下向客户端公开的 IP。

### 启动脚本示例，包含常用选项：

```bash
./dragonfly-x86_64 --logtostderr --requirepass=youshallnotpass --cache_mode=true -dbnum 1 --bind localhost --port 6379 --maxmemory=12gb --keys_output_limit=12288 --dbfilename dump.rdb
```
还可以通过运行 `dragonfly --flagfile <filename>` 从配置文件中获取参数，配置文件的每行应该列出一个参数，并用等号代替键值参数的空格。

要获取更多选项，如日志管理或TLS支持，请运行 `dragonfly --help`。

## <a name="开发路线和开发现状"><a/>开发路线和开发现状

目前，Dragonfly支持约185个Redis命令以及除 `cas` 之外的所有 Memcached 命令。
我们几乎达到了Redis 5 API的水平。我们的下一个里程碑更新将会稳定基本功能并实现复刻API。
如果您发现您需要的命令尚未实现，请提出一个Issue。

对于dragonfly-native复制技术，我们正在设计一种分布式日志格式，该格式将支持更高的速度。

在实现复制功能之后，我们将继续实现API 3-6中其他缺失的Redis命令。

请参见[命令参考](https://dragonflydb.io/docs/category/command-reference)以了解Dragonfly当前支持的命令。

## <a name="设计决策"><a/> 设计决策

### 全新的缓存设计

Dragonfly采用单一的自适应缓存算法，该算法非常简单且具备高内存效率。
你可以通过使用 `--cache_mode=true` 参数来启用缓存模式。一旦启用了此模式，Dragonfly将会删除最低概率可能被使用的内容，但这只会在接近最大内存限制时发生。

### 相对准确的过期期限

过期范围限制最高为约8年。此外，**对于大于2^28ms的到期期限**，毫秒精度级别（PEXPIRE/PSETEX等）会被简化到秒级。
这种舍入的误差小于0.001％，我希望这在长时间范围情况下是可以接受的。
如果这不符合你的使用需求，请与我联系或提出一个Issue，并解释您的情况。

关于与Redis实现之间的更多差异，请参见[此处](docs/differences.md)。

### 原生HTTP控制台和兼容Prometheus的标准

默认情况下，Dragonfly允许通过其主TCP端口（6379）进行HTTP访问。没错，您可以通过Redis协议或HTTP协议连接到Dragonfly - 服务器会在连接初始化期间自动识别协议。 不妨在你自己的浏览器中尝试一下。现在HTTP访问没有太多信息可供参考，但在将来，我们计划添加有用的调试和管理信息。如果您转到`: 6379/metrics` URL，您将看到一些兼容Prometheus的标准。

Prometheus导出的标准与Grafana仪表盘兼容，[请参见此处](tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json)。

重要！HTTP控制台仅应在安全网络内访问。如果您将Dragonfly的TCP端口暴露在外部，则建议使用`--http_admin_console=false`或`--nohttp_admin_console`禁用控制台。


## <a name="开发背景"><a/>开发背景

Dragonfly始于一项实验，旨在探索如果在2022年重新设计内存数据库，它会是什么样子。基于我们作为内存存储的用户以及作为云服务公司的工程师的经验教训，我们得知需要保留Dragonfly的两个关键属性：a) 为其所有操作提供原子性保证，b) 保证在非常高的吞吐量下实现低于毫秒的延迟。

我们面临的首要挑战是如何充分利用当今云服务器的CPU、内存和I/O资源。为了解决这个问题，我们使用了 [无共享式架构（shared-nothing architecture）](https://en.wikipedia.org/wiki/Shared-nothing_architecture)，它允许我们在不同的线程之间分割内存存储的空间，使得每个线程可以管理自己的字典数据切片。我们称这些切片为“分片（shards）”。为无共享式架构提供线程和I/O管理功能的库在[这里](https://github.com/romange/helio)开源。

为了提供对多键并发操作的原子性保证，我们使用了最近学术研究的进展。我们选择了论文 ["VLL: a lock manager redesign for main memory database systems”](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf) 来开发Dragonfly的事务框架。无共享式架构和VLL的选择使我们能够在不使用互斥锁或自旋锁的情况下组合原子的多键操作。这是我们 PoC 的一个重要里程碑，它的性能在商业和开源解决方案中脱颖而出。

我们面临的第二个挑战是为新存储设计更高效的数据结构。为了实现这个目标，我们基于论文["Dash: Scalable Hashing on Persistent Memory"](https://arxiv.org/pdf/2003.07302.pdf)构建了核心哈希表结构。这篇论文本身是以持久性内存为中心的，与主存没有直接相关性。

然而，它非常适用于我们的问题。它提出了一种哈希表设计，允许我们维护Redis字典中存在的两个特殊属性：a) 数据存储增长时的渐进式哈希能力；b）使用无状态扫描操作时，遍历变化的字典的能力。除了这两个属性之外，Dash在CPU和内存方面都更加高效。通过利用Dash的设计，我们能够进一步创新，实现以下功能：

- 针对TTL的高效记录过期功能。
- 一种新颖的缓存驱逐算法，具有比其他缓存策略（如LRU和LFU）更高的命中率，同时**零内存开销**。
- 一种新颖的无fork快照算法。

在我们为Dragonfly打下基础并满意其[性能](#基准测试)后，我们开始实现Redis和Memcached功能。
目前，我们已经实现了约185个Redis命令（大致相当于Redis 5.0 API）和13个Memcached命令。

最后，<br>
<em>我们的使命是构建一个设计良好、超高速、成本效益高的云工作负载内存数据存储系统，利用最新的硬件技术。我们旨在解决当前解决方案的痛点，同时保留其产品API和优势。 </em>


================================================
FILE: TODO.md
================================================
1. To move lua_project to dragonfly from helio (DONE)
2. To limit lua stack to something reasonable like 4096.
3. To inject our own allocator to lua to track its memory.


## Object lifecycle and thread-safety.

Currently our transactional and locking model is based on an assumption that any READ or WRITE
access to objects must be performed in a shard where they belong.

However, this assumption can be relaxed to get significant gains for read-only queries.

### Explanation
Our transactional framework prevents from READ-locked objects to be mutated. It does not prevent from their PrimaryTable to grow or change, of course. These objects can move to different entries inside the table. However, our CompactObject maintains the following property - its reference CompactObject.AsRef() is valid no matter where the master object moves and it's valid and safe for reading even from other threads. The exception regarding thread safety is SmallString which uses translation table for its pointers.

If we change the SmallString translation table to be global and thread-safe (it should not have lots of write contention anyway) we may access primetable keys and values from another thread and write them directly to sockets.

Use-case: large strings that need to be copied. Sets that need to be serialized for SMEMBERS/HGETALL commands etc. Additional complexity - we will need to lock those variables even for single hop transactions and unlock them afterwards. The unlocking hop does not need to increase user-visible latency since it can be done after we send reply to the socket.

================================================
FILE: contrib/charts/dragonfly/.helmignore
================================================
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
ci/
*.go
go.mod
go.sum


================================================
FILE: contrib/charts/dragonfly/Chart.yaml
================================================
apiVersion: v2
name: dragonfly
description: Dragonfly is a modern in-memory datastore, fully compatible with Redis and Memcached APIs.

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: v1.37.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "v1.37.0"

home: https://dragonflydb.io/

keywords:
  - database
  - keyvalue
  - cache

sources:
  - https://github.com/dragonflydb/dragonfly

kubeVersion: ">=1.23.0-0"


================================================
FILE: contrib/charts/dragonfly/README.md
================================================
# dragonfly

![Version: v0.12.0](https://img.shields.io/badge/Version-v0.12.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.12.0](https://img.shields.io/badge/AppVersion-v0.12.0-informational?style=flat-square)

Dragonfly is a modern in-memory datastore, fully compatible with Redis and Memcached APIs.

**Homepage:** <https://dragonflydb.io/>

## Source Code

* <https://github.com/dragonflydb/dragonfly>

## Requirements

Kubernetes: `>=1.23.0-0`


## Installing from a pre-packaged OCI

Pick a version from https://github.com/dragonflydb/dragonfly/pkgs/container/dragonfly%2Fhelm%2Fdragonfly

Example:

```shell
VERSION=v1.12.1
helm upgrade --install dragonfly oci://ghcr.io/dragonflydb/dragonfly/helm/dragonfly --version $VERSION
```

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| affinity | object | `{}` | Affinity for pod assignment |
| command | list | `[]` | Allow overriding the container's command |
| commonLabels | object | `{}` | Common labels to add to all K8s resources |
| extraArgs | list | `[]` | Extra arguments to pass to the dragonfly binary |
| extraContainers | list | `[]` | Additional sidecar containers |
| extraObjects | list | `[]` | extra K8s manifests to deploy |
| extraVolumeMounts | list | `[]` | Extra volume mounts corresponding to the volumes mounted above |
| extraVolumes | list | `[]` | Extra volumes to mount into the pods |
| fullnameOverride | string | `""` | String to fully override dragonfly.fullname |
| image.pullPolicy | string | `"IfNotPresent"` | Dragonfly image pull policy |
| image.repository | string | `"docker.dragonflydb.io/dragonflydb/dragonfly"` | Container Image Registry to pull the image from |
| image.tag | string | `""` | Overrides the image tag whose default is the chart appVersion. |
| imagePullSecrets | list | `[]` | Container Registry Secret names in an array |
| initContainers | list | `[]` | A list of initContainers to run before each pod starts |
| nameOverride | string | `""` | String to partially override dragonfly.fullname |
| nodeSelector | object | `{}` | Node labels for pod assignment |
| podAnnotations | object | `{}` | Annotations for pods |
| podSecurityContext | object | `{}` | Set securityContext for pod itself |
| probes.livenessProbe.exec.command[0] | string | `"/bin/sh"` |  |
| probes.livenessProbe.exec.command[1] | string | `"/usr/local/bin/healthcheck.sh"` |  |
| probes.livenessProbe.failureThreshold | int | `3` |  |
| probes.livenessProbe.initialDelaySeconds | int | `10` |  |
| probes.livenessProbe.periodSeconds | int | `10` |  |
| probes.livenessProbe.successThreshold | int | `1` |  |
| probes.livenessProbe.timeoutSeconds | int | `5` |  |
| probes.readinessProbe.exec.command[0] | string | `"/bin/sh"` |  |
| probes.readinessProbe.exec.command[1] | string | `"/usr/local/bin/healthcheck.sh"` |  |
| probes.readinessProbe.failureThreshold | int | `3` |  |
| probes.readinessProbe.initialDelaySeconds | int | `10` |  |
| probes.readinessProbe.periodSeconds | int | `10` |  |
| probes.readinessProbe.successThreshold | int | `1` |  |
| probes.readinessProbe.timeoutSeconds | int | `5` |  |
| prometheusRule.enabled | bool | `false` | Deploy a PrometheusRule |
| prometheusRule.spec | list | `[]` | PrometheusRule.Spec https://awesome-prometheus-alerts.grep.to/rules |
| replicaCount | int | `1` | Number of replicas to deploy |
| resources.limits | object | `{}` | The resource limits for the containers |
| resources.requests | object | `{}` | The requested resources for the containers |
| env | list | `[]` | Extra environment variables |
| envFrom | list | `[]` | Extra environment variables from K8s objects |
| securityContext | object | `{}` | Set securityContext for containers |
| service.annotations | object | `{}` | Extra annotations for the service |
| service.labels | object | `{}` | Extra labels for the service |
| service.metrics.portName | string | `"metrics"` | name for the metrics port |
| service.metrics.serviceType | string | `"ClusterIP"` | serviceType for the metrics service |
| service.port | int | `6379` | Dragonfly service port |
| service.type | string | `"ClusterIP"` | Service type to provision. Can be NodePort, ClusterIP or LoadBalancer |
| serviceAccount.annotations | object | `{}` | Annotations to add to the service account |
| serviceAccount.create | bool | `true` | Specifies whether a service account should be created |
| serviceAccount.name | string | `""` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template |
| serviceMonitor.annotations | object | `{}` | additional annotations to apply to the metrics |
| serviceMonitor.enabled | bool | `false` | If true, a ServiceMonitor CRD is created for a prometheus operator |
| serviceMonitor.interval | string | `"10s"` | scrape interval |
| serviceMonitor.labels | object | `{}` | additional labels to apply to the metrics |
| serviceMonitor.namespace | string | `""` | namespace in which to deploy the ServiceMonitor CR. defaults to the application namespace |
| serviceMonitor.scrapeTimeout | string | `"10s"` | scrape timeout |
| storage.enabled | bool | `false` | If /data should persist. This will provision a StatefulSet instead. |
| storage.requests | string | `"128Mi"` | Volume size to request for the PVC |
| storage.storageClassName | string | `""` | Global StorageClass for Persistent Volume(s) |
| tls.cert | string | `""` | TLS certificate |
| tls.createCerts | bool | `false` | use cert-manager to automatically create the certificate |
| tls.duration | string | `"87600h0m0s"` | duration or ttl of the validity of the created certificate |
| tls.enabled | bool | `false` | enable TLS |
| tls.existing_secret | string | `""` | use TLS certificates from existing secret |
| tls.issuer.kind | string | `"ClusterIssuer"` | cert-manager issuer kind. Usually Issuer or ClusterIssuer |
| tls.issuer.name | string | `"selfsigned"` | name of the referenced issuer |
| tls.key | string | `""` | TLS private key |
| tolerations | list | `[]` | Tolerations for pod assignment |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)


================================================
FILE: contrib/charts/dragonfly/ci/affinity-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app.kubernetes.io/name
                  operator: In
                  values:
                  - dragonfly
              topologyKey: kubernetes.io/hostname
            weight: 100
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/affinity-values.yaml
================================================
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
    - podAffinityTerm:
        labelSelector:
          matchExpressions:
          - key: app.kubernetes.io/name
            operator: In
            values:
            - dragonfly
        topologyKey: kubernetes.io/hostname
      weight: 100


================================================
FILE: contrib/charts/dragonfly/ci/command_extraargs-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          command:
            - /usr/local/bin/dragonfly
            - --logtostderr
          args:
            - "--alsologtostderr"
            - --cache_mode=true
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/command_extraargs-values.yaml
================================================
command:
  - /usr/local/bin/dragonfly
  - --logtostderr

extraArgs:
  - --cache_mode=true


================================================
FILE: contrib/charts/dragonfly/ci/commonlabels-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
    project: cache-infrastructure
    team: platform
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
    project: cache-infrastructure
    team: platform
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
    project: cache-infrastructure
    team: platform
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
        project: cache-infrastructure
        team: platform
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/commonlabels-values.yaml
================================================
commonLabels:
  team: platform
  project: cache-infrastructure


================================================
FILE: contrib/charts/dragonfly/ci/extracontainer-string-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - args:
          - -c
          - date; sleep 3600;
          command:
          - /bin/sh
          image: busybox:latest
          name: sidecar-string
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/extracontainer-string-values.yaml
================================================
extraContainers:
  - name: sidecar-string
    image: busybox:latest
    command: ["/bin/sh"]
    args: ["-c", "date; sleep 3600;"]


================================================
FILE: contrib/charts/dragonfly/ci/extracontainer-tpl-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: sidecar-tpl
          image: docker.dragonflydb.io/dragonflydb/dragonfly:latest
          command: ["/bin/sh"]
          args: ["-c", "date; sleep 3600;"]
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/extracontainer-tpl-values.yaml
================================================
extraContainers: |
  - name: sidecar-tpl
    image: {{ .Values.image.repository }}:latest
    command: ["/bin/sh"]
    args: ["-c", "date; sleep 3600;"]


================================================
FILE: contrib/charts/dragonfly/ci/extraenv-and-passwordSecret-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: my-secret
stringData:
  password: password
  username: username
type: Opaque
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
data:
  configKey1: configValue1
  configKey2: configValue2
kind: ConfigMap
metadata:
  name: my-configmap
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          
          env:
            - name: DFLY_requirepass
              valueFrom:
                secretKeyRef:
                  name: dfly-password
                  key: password
            - name: ENV_VAR43
              value: value1
            - name: ENV_VAR323
              value: value2
          envFrom:
            - configMapRef:
                name: my-configmap
            - secretRef:
                name: my-secret


================================================
FILE: contrib/charts/dragonfly/ci/extraenv-and-passwordSecret-values.yaml
================================================
extraObjects:
- apiVersion: v1
  kind: Secret
  metadata:
    name: dfly-password
  stringData:
    password: foobar
- apiVersion: v1
  kind: ConfigMap
  metadata:
    name: my-configmap
  data:
    configKey1: configValue1
    configKey2: configValue2
- apiVersion: v1
  kind: Secret
  metadata:
    name: my-secret
  type: Opaque
  stringData:
    username: username
    password: password

env:
  - name: ENV_VAR43
    value: value1
  - name: ENV_VAR323
    value: value2

envFrom:
  - configMapRef:
      name: my-configmap
  - secretRef:
      name: my-secret

passwordFromSecret:
  enable: true
  existingSecret:
    name: dfly-password
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/extraenv-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: my-secret
stringData:
  password: password
  username: username
type: Opaque
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
data:
  configKey1: configValue1
  configKey2: configValue2
kind: ConfigMap
metadata:
  name: my-configmap
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          
          env:
            - name: ENV_VAR43
              value: value1
            - name: ENV_VAR323
              value: value2
          envFrom:
            - configMapRef:
                name: my-configmap
            - secretRef:
                name: my-secret


================================================
FILE: contrib/charts/dragonfly/ci/extraenv-values.yaml
================================================
extraObjects:
- apiVersion: v1
  kind: ConfigMap
  metadata:
    name: my-configmap
  data:
    configKey1: configValue1
    configKey2: configValue2
- apiVersion: v1
  kind: Secret
  metadata:
    name: my-secret
  type: Opaque
  stringData:
    username: username
    password: password

env:
  - name: ENV_VAR43
    value: value1
  - name: ENV_VAR323
    value: value2

envFrom:
  - configMapRef:
      name: my-configmap
  - secretRef:
      name: my-secret


================================================
FILE: contrib/charts/dragonfly/ci/extravolumes-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          volumeMounts:
            - mountPath: /tmp
              name: tmp
      volumes:
        - emptyDir:
            sizeLimit: 500Mi
          name: tmp


================================================
FILE: contrib/charts/dragonfly/ci/extravolumes-values.yaml
================================================
extraVolumes:
  - name: tmp
    emptyDir:
      sizeLimit: 500Mi

extraVolumeMounts:
  - mountPath: /tmp
    name: tmp


================================================
FILE: contrib/charts/dragonfly/ci/initcontainer-string-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      initContainers:
        - args:
          - -c
          - date; sleep 1;
          command:
          - /bin/sh
          image: busybox:1.28
          name: initcontainer-string
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/initcontainer-string-values.yaml
================================================
initContainers:
  - name: initcontainer-string
    image: busybox:1.28
    command: ["/bin/sh"]
    args: ["-c", "date; sleep 1;"]


================================================
FILE: contrib/charts/dragonfly/ci/initcontainer-tpl-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      initContainers:
        - name: initcontainer-tpl
          image: docker.dragonflydb.io/dragonflydb/dragonfly:latest
          command: ["/bin/sh"]
          args: ["-c", "date; sleep 1;"]
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/initcontainer-tpl-values.yaml
================================================
initContainers: |
  - name: initcontainer-tpl
    image: {{ .Values.image.repository }}:latest
    command: ["/bin/sh"]
    args: ["-c", "date; sleep 1;"]


================================================
FILE: contrib/charts/dragonfly/ci/password-old-env-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.13.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          
          env:
            - name: DFLY_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: dfly-password
                  key: password


================================================
FILE: contrib/charts/dragonfly/ci/password-old-env-values.yaml
================================================
image:
  tag: "v1.13.0"

extraObjects:
  - apiVersion: v1
    kind: Secret
    metadata:
      name: dfly-password
    stringData:
      password: foobar

passwordFromSecret:
  enable: true
  existingSecret:
    name: dfly-password
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/passwordsecret-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          
          env:
            - name: DFLY_requirepass
              valueFrom:
                secretKeyRef:
                  name: dfly-password
                  key: password


================================================
FILE: contrib/charts/dragonfly/ci/passwordsecret-values.tpl.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dragonfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          
          env:
            - name: DFLY_requirepass
              valueFrom:
                secretKeyRef:
                  name: dragonfly-password
                  key: password


================================================
FILE: contrib/charts/dragonfly/ci/passwordsecret-values.tpl.yaml
================================================
extraObjects:
- apiVersion: v1
  kind: Secret
  metadata:
    name: dragonfly-password
  stringData:
    password: foobar

passwordFromSecret:
  enable: true
  existingSecret:
    name: '{{ include "dragonfly.name" $ }}-password'
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/passwordsecret-values.yaml
================================================
extraObjects:
- apiVersion: v1
  kind: Secret
  metadata:
    name: dfly-password
  stringData:
    password: foobar

passwordFromSecret:
  enable: true
  existingSecret:
    name: dfly-password
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/persistence-and-existing-secret.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  serviceName: test
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          volumeMounts:
            - mountPath: /data
              name: "test-data"
          env:
            - name: DFLY_requirepass
              valueFrom:
                secretKeyRef:
                  name: dfly-password
                  key: password
  volumeClaimTemplates:
    - metadata:
        name: "test-data"
      spec:
        accessModes: [ "ReadWriteOnce" ]
        storageClassName: standard
        resources:
          requests:
            storage: 128Mi


================================================
FILE: contrib/charts/dragonfly/ci/persistence-and-existing-secret.yaml
================================================
storage:
  enabled: true
  storageClassName: "standard"
  requests: 128Mi

extraObjects:
- apiVersion: v1
  kind: Secret
  metadata:
    name: dfly-password
  stringData:
    password: foobar

passwordFromSecret:
  enable: true
  existingSecret:
    name: dfly-password
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/persistent-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  serviceName: test
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
          volumeMounts:
            - mountPath: /data
              name: "test-data"
  volumeClaimTemplates:
    - metadata:
        name: "test-data"
      spec:
        accessModes: [ "ReadWriteOnce" ]
        storageClassName: standard
        resources:
          requests:
            storage: 128Mi


================================================
FILE: contrib/charts/dragonfly/ci/persistent-values.yaml
================================================
storage:
  enabled: true
  storageClassName: "standard"
  requests: 128Mi


================================================
FILE: contrib/charts/dragonfly/ci/priorityclassname-values.golden.yaml
================================================
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: scheduling.k8s.io/v1
description: This priority class should be used only for tests.
globalDefault: false
kind: PriorityClass
metadata:
  name: high-priority
value: 1000000
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      priorityClassName: high-priority
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/priorityclassname-values.yaml
================================================
priorityClassName: "high-priority"

extraObjects:
  - apiVersion: scheduling.k8s.io/v1
    kind: PriorityClass
    metadata:
      name: high-priority
    value: 1000000
    globalDefault: false
    description: "This priority class should be used only for tests."


================================================
FILE: contrib/charts/dragonfly/ci/prometheusrules-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/metrics-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly-metrics
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
    type: metrics
spec:
  type: ClusterIP
  ports:
    - name: metrics
      port: 6379
      targetPort: 6379
      protocol: TCP
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
---
# Source: dragonfly/templates/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: test-dragonfly-metrics
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  endpoints:
    - interval: 10s
      scrapeTimeout: 10s
      honorLabels: true
      port: metrics
      path: /metrics
      scheme: http
  jobLabel: "test"
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
      type: metrics
  namespaceSelector:
    matchNames:
      - default


================================================
FILE: contrib/charts/dragonfly/ci/prometheusrules-values.yaml
================================================
serviceMonitor:
  enabled: true
prometheusRule:
  enabled: true
  namespace: default
  spec:
    - alert: RedisDown
      expr: absent(dragonfly_master > 0)
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis instance is down
        description: >
          "Redis instance is down"
        runbook_url: "https://octopus.com/docs/runbooks/runbook-examples"


================================================
FILE: contrib/charts/dragonfly/ci/resources-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits:
              cpu: 100m
              memory: 400Mi
            requests:
              cpu: 100m
              memory: 300Mi


================================================
FILE: contrib/charts/dragonfly/ci/resources-values.yaml
================================================
resources:
  requests:
    cpu: 100m
    memory: 300Mi
  limits:
    cpu: 100m
    memory: 400Mi


================================================
FILE: contrib/charts/dragonfly/ci/securitycontext-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/securitycontext-values.yaml
================================================
podSecurityContext: {}

securityContext:
  allowPrivilegeEscalation: false
  readOnlyRootFilesystem: true


================================================
FILE: contrib/charts/dragonfly/ci/service-loadbalancer-ip.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: LoadBalancer
  loadBalancerIP: 127.0.0.1
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/service-loadbalancer-ip.yaml
================================================
service:
  type: LoadBalancer
  loadBalancerIP: "127.0.0.1"

================================================
FILE: contrib/charts/dragonfly/ci/service-monitor-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/metrics-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly-metrics
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
    type: metrics
spec:
  type: ClusterIP
  ports:
    - name: metrics
      port: 6379
      targetPort: 6379
      protocol: TCP
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}
---
# Source: dragonfly/templates/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: test-dragonfly-metrics
  namespace: default
  labels:
    release: prometheus-stack
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  endpoints:
    - interval: 10s
      scrapeTimeout: 10s
      honorLabels: true
      port: metrics
      path: /metrics
      scheme: http
  jobLabel: "test"
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
      type: metrics
  namespaceSelector:
    matchNames:
      - default


================================================
FILE: contrib/charts/dragonfly/ci/service-monitor-values.yaml
================================================
serviceMonitor:
  enabled: true
  namespace: ""
  labels:
    release: prometheus-stack
  annotations: {}
  interval: 10s
  scrapeTimeout: 10s


================================================
FILE: contrib/charts/dragonfly/ci/taints-tolerations-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      tolerations:
        - effect: NoSchedule
          key: key/high-memory
          operator: Equal
          value: "true"
        - effect: PreferNoSchedule
          key: key/high-memory
          operator: Equal
          value: "true"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: key/node-kind
                operator: In
                values:
                - high-memory
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/taints-tolerations-values.yaml
================================================
tolerations:
  - key: key/high-memory
    operator: "Equal"
    value: "true"
    effect: "NoSchedule"
  - key: key/high-memory
    operator: "Equal"
    value: "true"
    effect: "PreferNoSchedule"
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: key/node-kind
              operator: In
              values:
                - high-memory


================================================
FILE: contrib/charts/dragonfly/ci/tls-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/extra-manifests.yaml
apiVersion: v1
kind: Secret
metadata:
  name: dfly-password
stringData:
  password: foobar
---
# Source: dragonfly/templates/tls-secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: test-dragonfly-tls
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
type: kubernetes.io/tls
data:
  tls.crt: "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUI4ekNDQVpxZ0F3SUJBZ0lFYmIyWjJqQUtCZ2dxaGtqT1BRUURBekJaTVFzd0NRWURWUVFHRXdKR1R6RWcKTUI0R0ExVUVBd3dYWkhKaFoyOXVabXg1TG1SeVlXZHZibVpzZVM1emRtTXhEREFLQmdOVkJBZ01BMlp2YnpFTQpNQW9HQTFVRUJ3d0RabTl2TVF3d0NnWURWUVFLREFObWIyOHdIaGNOTWpJeE1qSTVNVEl3TXpJM1doY05Nekl4Ck1qSTJNVEl3TXpJM1dqQlpNUXN3Q1FZRFZRUUdFd0pHVHpFZ01CNEdBMVVFQXd3WFpISmhaMjl1Wm14NUxtUnkKWVdkdmJtWnNlUzV6ZG1NeEREQUtCZ05WQkFnTUEyWnZiekVNTUFvR0ExVUVCd3dEWm05dk1Rd3dDZ1lEVlFRSwpEQU5tYjI4d1dUQVRCZ2NxaGtqT1BRSUJCZ2dxaGtqT1BRTUJCd05DQUFRV05mVHVOamhQRWk3aDFjaUNTMEl0CmZLZ2lCaHhMR2xGM010amxGVGpDcnpreW5TU0FCb010TmxqY0RFMGhtL2l6YlJVb2dBY0RGY3ZrbnZDaHp4YXEKbzFBd1RqQWRCZ05WSFE0RUZnUVVTTjZGYnNKWjJFVWZYM2JlQ2g1Y0VvNmNrdFF3SHdZRFZSMGpCQmd3Rm9BVQpTTjZGYnNKWjJFVWZYM2JlQ2g1Y0VvNmNrdFF3REFZRFZSMFRCQVV3QXdFQi96QUtCZ2dxaGtqT1BRUURBd05ICkFEQkVBaUI2dEc1eHp5ajRpVC9lMHdwQ01SSE92bFFLUWV4QnloeU5QQWhybzlaQ1JnSWdhRGNkOXZNOHJDYmIKSlBSeXptMGlOOU9XTS9BMjRubW0zaXRuM0k0cmNEMD0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo="
  tls.key: "LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSU5oNmVNRHJCbEFpVDY4VDhvdnpHbjZKWmJKZXZVZWZZa0lJWU5Xd3c1NXlvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFRmpYMDdqWTRUeEl1NGRYSWdrdENMWHlvSWdZY1N4cFJkekxZNVJVNHdxODVNcDBrZ0FhRApMVFpZM0F4TkladjRzMjBWS0lBSEF4WEw1Sjd3b2M4V3FnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo="
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
        checksum/tls-secret: b97190b6585f160d4f709b965d275564bb51cd19202c6e014e1d42a972446a5c
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
            - "--tls"
            - "--tls_cert_file=/etc/dragonfly/tls/tls.crt"
            - "--tls_key_file=/etc/dragonfly/tls/tls.key"
          resources:
            limits: {}
            requests: {}
          volumeMounts:
            - mountPath: /etc/dragonfly/tls
              name: tls
          env:
            - name: DFLY_requirepass
              valueFrom:
                secretKeyRef:
                  name: dfly-password
                  key: password
      volumes:
        - name: tls
          secret:
            secretName: test-dragonfly-tls


================================================
FILE: contrib/charts/dragonfly/ci/tls-values.yaml
================================================
tls:
  enabled: true
  existing_secret: ""
  cert: |
    -----BEGIN CERTIFICATE-----
    MIIB8zCCAZqgAwIBAgIEbb2Z2jAKBggqhkjOPQQDAzBZMQswCQYDVQQGEwJGTzEg
    MB4GA1UEAwwXZHJhZ29uZmx5LmRyYWdvbmZseS5zdmMxDDAKBgNVBAgMA2ZvbzEM
    MAoGA1UEBwwDZm9vMQwwCgYDVQQKDANmb28wHhcNMjIxMjI5MTIwMzI3WhcNMzIx
    MjI2MTIwMzI3WjBZMQswCQYDVQQGEwJGTzEgMB4GA1UEAwwXZHJhZ29uZmx5LmRy
    YWdvbmZseS5zdmMxDDAKBgNVBAgMA2ZvbzEMMAoGA1UEBwwDZm9vMQwwCgYDVQQK
    DANmb28wWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAAQWNfTuNjhPEi7h1ciCS0It
    fKgiBhxLGlF3MtjlFTjCrzkynSSABoMtNljcDE0hm/izbRUogAcDFcvknvChzxaq
    o1AwTjAdBgNVHQ4EFgQUSN6FbsJZ2EUfX3beCh5cEo6cktQwHwYDVR0jBBgwFoAU
    SN6FbsJZ2EUfX3beCh5cEo6cktQwDAYDVR0TBAUwAwEB/zAKBggqhkjOPQQDAwNH
    ADBEAiB6tG5xzyj4iT/e0wpCMRHOvlQKQexByhyNPAhro9ZCRgIgaDcd9vM8rCbb
    JPRyzm0iN9OWM/A24nmm3itn3I4rcD0=
    -----END CERTIFICATE-----

  key: |
    -----BEGIN EC PRIVATE KEY-----
    MHcCAQEEINh6eMDrBlAiT68T8ovzGn6JZbJevUefYkIIYNWww55yoAoGCCqGSM49
    AwEHoUQDQgAEFjX07jY4TxIu4dXIgktCLXyoIgYcSxpRdzLY5RU4wq85Mp0kgAaD
    LTZY3AxNIZv4s20VKIAHAxXL5J7woc8Wqg==
    -----END EC PRIVATE KEY-----

extraObjects:
- apiVersion: v1
  kind: Secret
  metadata:
    name: dfly-password
  stringData:
    password: foobar

passwordFromSecret:
  enable: true
  existingSecret:
    name: dfly-password
    key: password


================================================
FILE: contrib/charts/dragonfly/ci/tolerations-values.golden.yaml
================================================
---
# Source: dragonfly/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: dragonfly/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
---
# Source: dragonfly/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-dragonfly
  namespace: default
  labels:
    app.kubernetes.io/name: dragonfly
    app.kubernetes.io/instance: test
    app.kubernetes.io/version: "v1.37.0"
    app.kubernetes.io/managed-by: Helm
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: dragonfly
      app.kubernetes.io/instance: test
  template:
    metadata:
      annotations:
      labels:
        app.kubernetes.io/name: dragonfly
        app.kubernetes.io/instance: test
    spec:
      tolerations:
        - effect: NoSchedule
          operator: Exists
      serviceAccountName: test-dragonfly
      containers:
        - name: dragonfly
          image: "docker.dragonflydb.io/dragonflydb/dragonfly:v1.37.0"
          imagePullPolicy: IfNotPresent
          ports:
            - name: dragonfly
              containerPort: 6379
              protocol: TCP
          livenessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
              - /bin/sh
              - /usr/local/bin/healthcheck.sh
            failureThreshold: 3
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 5
          args:
            - "--alsologtostderr"
          resources:
            limits: {}
            requests: {}


================================================
FILE: contrib/charts/dragonfly/ci/tolerations-values.yaml
================================================
tolerations:
  - effect: NoSchedule
    operator: Exists


================================================
FILE: contrib/charts/dragonfly/go.mod
================================================
module dragonfly

go 1.24.0

toolchain go1.24.7

require github.com/gruntwork-io/terratest v0.51.0

require (
	filippo.io/edwards25519 v1.1.0 // indirect
	github.com/BurntSushi/toml v1.5.0 // indirect
	github.com/aws/aws-sdk-go-v2 v1.39.1 // indirect
	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect
	github.com/aws/aws-sdk-go-v2/config v1.31.10 // indirect
	github.com/aws/aws-sdk-go-v2/credentials v1.18.14 // indirect
	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.8 // indirect
	github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.19.8 // indirect
	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.8 // indirect
	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.8 // indirect
	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect
	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.8 // indirect
	github.com/aws/aws-sdk-go-v2/service/acm v1.37.5 // indirect
	github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.2 // indirect
	github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.58.1 // indirect
	github.com/aws/aws-sdk-go-v2/service/dynamodb v1.50.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.0 // indirect
	github.com/aws/aws-sdk-go-v2/service/ecr v1.50.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/ecs v1.64.1 // indirect
	github.com/aws/aws-sdk-go-v2/service/iam v1.47.6 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.8 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.8 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.8 // indirect
	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.8 // indirect
	github.com/aws/aws-sdk-go-v2/service/kms v1.45.5 // indirect
	github.com/aws/aws-sdk-go-v2/service/lambda v1.77.5 // indirect
	github.com/aws/aws-sdk-go-v2/service/rds v1.107.1 // indirect
	github.com/aws/aws-sdk-go-v2/service/route53 v1.58.3 // indirect
	github.com/aws/aws-sdk-go-v2/service/s3 v1.88.2 // indirect
	github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.39.5 // indirect
	github.com/aws/aws-sdk-go-v2/service/sns v1.38.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/sqs v1.42.7 // indirect
	github.com/aws/aws-sdk-go-v2/service/ssm v1.65.0 // indirect
	github.com/aws/aws-sdk-go-v2/service/sso v1.29.4 // indirect
	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.0 // indirect
	github.com/aws/aws-sdk-go-v2/service/sts v1.38.5 // indirect
	github.com/aws/smithy-go v1.23.0 // indirect
	github.com/boombuler/barcode v1.1.0 // indirect
	github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
	github.com/emicklei/go-restful/v3 v3.13.0 // indirect
	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
	github.com/go-errors/errors v1.5.1 // indirect
	github.com/go-logr/logr v1.4.3 // indirect
	github.com/go-openapi/jsonpointer v0.22.0 // indirect
	github.com/go-openapi/jsonreference v0.21.1 // indirect
	github.com/go-openapi/swag v0.25.0 // indirect
	github.com/go-openapi/swag/cmdutils v0.25.0 // indirect
	github.com/go-openapi/swag/conv v0.25.0 // indirect
	github.com/go-openapi/swag/fileutils v0.25.0 // indirect
	github.com/go-openapi/swag/jsonname v0.25.0 // indirect
	github.com/go-openapi/swag/jsonutils v0.25.0 // indirect
	github.com/go-openapi/swag/loading v0.25.0 // indirect
	github.com/go-openapi/swag/mangling v0.25.0 // indirect
	github.com/go-openapi/swag/netutils v0.25.0 // indirect
	github.com/go-openapi/swag/stringutils v0.25.0 // indirect
	github.com/go-openapi/swag/typeutils v0.25.0 // indirect
	github.com/go-openapi/swag/yamlutils v0.25.0 // indirect
	github.com/go-sql-driver/mysql v1.9.3 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/gonvenience/bunt v1.4.2 // indirect
	github.com/gonvenience/idem v0.0.2 // indirect
	github.com/gonvenience/neat v1.3.16 // indirect
	github.com/gonvenience/term v1.0.4 // indirect
	github.com/gonvenience/text v1.0.9 // indirect
	github.com/gonvenience/ytbx v1.4.7 // indirect
	github.com/google/gnostic-models v0.7.0 // indirect
	github.com/google/uuid v1.6.0 // indirect
	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
	github.com/gruntwork-io/go-commons v0.17.2 // indirect
	github.com/hashicorp/errwrap v1.1.0 // indirect
	github.com/hashicorp/go-multierror v1.1.1 // indirect
	github.com/homeport/dyff v1.10.2 // indirect
	github.com/jackc/pgpassfile v1.0.0 // indirect
	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
	github.com/jackc/pgx/v5 v5.7.6 // indirect
	github.com/jackc/puddle/v2 v2.2.2 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
	github.com/mattn/go-ciede2000 v0.0.0-20170301095244-782e8c62fec3 // indirect
	github.com/mattn/go-isatty v0.0.20 // indirect
	github.com/mattn/go-zglob v0.0.6 // indirect
	github.com/mitchellh/go-homedir v1.1.0 // indirect
	github.com/mitchellh/go-ps v1.0.0 // indirect
	github.com/mitchellh/hashstructure v1.1.0 // indirect
	github.com/moby/spdystream v0.5.0 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
	github.com/pquerna/otp v1.5.0 // indirect
	github.com/russross/blackfriday/v2 v2.1.0 // indirect
	github.com/sergi/go-diff v1.4.0 // indirect
	github.com/spf13/pflag v1.0.10 // indirect
	github.com/stretchr/testify v1.11.1 // indirect
	github.com/texttheater/golang-levenshtein v1.0.1 // indirect
	github.com/urfave/cli/v2 v2.27.7 // indirect
	github.com/virtuald/go-ordered-json v0.0.0-20170621173500-b18e6e673d74 // indirect
	github.com/x448/float16 v0.8.4 // indirect
	github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect
	go.yaml.in/yaml/v2 v2.4.3 // indirect
	go.yaml.in/yaml/v3 v3.0.4 // indirect
	golang.org/x/crypto v0.42.0 // indirect
	golang.org/x/exp v0.0.0-20250911091902-df9299821621 // indirect
	golang.org/x/net v0.44.0 // indirect
	golang.org/x/oauth2 v0.31.0 // indirect
	golang.org/x/sync v0.17.0 // indirect
	golang.org/x/sys v0.36.0 // indirect
	golang.org/x/term v0.35.0 // indirect
	golang.org/x/text v0.29.0 // indirect
	golang.org/x/time v0.13.0 // indirect
	google.golang.org/protobuf v1.36.9 // indirect
	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/api v0.34.1 // indirect
	k8s.io/apimachinery v0.34.1 // indirect
	k8s.io/client-go v0.34.1 // indirect
	k8s.io/klog/v2 v2.130.1 // indirect
	k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
	k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect
	sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
	sigs.k8s.io/randfill v1.0.0 // indirect
	sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
	sigs.k8s.io/yaml v1.6.0 // indirect
)


================================================
FILE: contrib/charts/dragonfly/go.sum
================================================
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/aws/aws-sdk-go-v2 v1.39.1 h1:fWZhGAwVRK/fAN2tmt7ilH4PPAE11rDj7HytrmbZ2FE=
github.com/aws/aws-sdk-go-v2 v1.39.1/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 h1:i8p8P4diljCr60PpJp6qZXNlgX4m2yQFpYk+9ZT+J4E=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1/go.mod h1:ddqbooRZYNoJ2dsTwOty16rM+/Aqmk/GOXrK8cg7V00=
github.com/aws/aws-sdk-go-v2/config v1.31.10 h1:7LllDZAegXU3yk41mwM6KcPu0wmjKGQB1bg99bNdQm4=
github.com/aws/aws-sdk-go-v2/config v1.31.10/go.mod h1:Ge6gzXPjqu4v0oHvgAwvGzYcK921GU0hQM25WF/Kl+8=
github.com/aws/aws-sdk-go-v2/credentials v1.18.14 h1:TxkI7QI+sFkTItN/6cJuMZEIVMFXeu2dI1ZffkXngKI=
github.com/aws/aws-sdk-go-v2/credentials v1.18.14/go.mod h1:12x4Uw/vijC11XkctTjy92TNCQ+UnNJkT7fzX0Yd93E=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.8 h1:gLD09eaJUdiszm7vd1btiQUYE0Hj+0I2b8AS+75z9AY=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.8/go.mod h1:4RW3oMPt1POR74qVOC4SbubxAwdP4pCT0nSw3jycOU4=
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.19.8 h1:QcAh/TNGM3MWe95ilMWwnieXWXsyM33Mb/RuTGlWLm4=
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.19.8/go.mod h1:72m/ZCCgYpXJzsgI8uJFYMnXEjtZ4kkaolL9NRXLSnU=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.8 h1:6bgAZgRyT4RoFWhxS+aoGMFyE0cD1bSzFnEEi4bFPGI=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.8/go.mod h1:KcGkXFVU8U28qS4KvLEcPxytPZPBcRawaH2Pf/0jptE=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.8 h1:HhJYoES3zOz34yWEpGENqJvRVPqpmJyR3+AFg9ybhdY=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.8/go.mod h1:JnA+hPWeYAVbDssp83tv+ysAG8lTfLVXvSsyKg/7xNA=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.8 h1:1/bT9kDdLQzfZ1e6J6hpW+SfNDd6xrV8F3M2CuGyUz8=
github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.8/go.mod h1:RbdwTONAIi59ej/+1H+QzZORt5bcyAtbrS7FQb2pvz0=
github.com/aws/aws-sdk-go-v2/service/acm v1.37.5 h1:vTmyvkmMJEKZgyhSuaEv8gZCJJlgNpSpYy/4CExjHoA=
github.com/aws/aws-sdk-go-v2/service/acm v1.37.5/go.mod h1:TmyW/AiLmFEXwFsm5hh2T86BpgFbcB1icshuzFu8LgY=
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.2 h1:YOWVoIjUoiwAVIRVU3PG2yNldh9dQT5OegnO99RO4ls=
github.com/aws/aws-sdk-go-v2/service/autoscaling v1.59.2/go.mod h1:t08UbddtoRQcKiIW2ZTfxX5x6vRaTj6KrKcf1R0I4tw=
github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.58.1 h1:JMYpgsJ31l0wjJCerJtIBo39HznZJ/ENJJzOSTcJh68=
github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.58.1/go.mod h1:zqtpx8Y/EydPCFy5MA9AJJBfJ+mCQz8BNHj2CvDvaYA=
github.com/aws/aws-sdk-go-v2/service/dynamodb v1.50.4 h1:3EE5TTeBHPTKQNNeIHdXcJ6ENDsN7c2rCQUtbdolwV8=
github.com/aws/aws-sdk-go-v2/service/dynamodb v1.50.4/go.mod h1:8rWv4Lq/jrlspgd/wpdFeKrxLByJlfpFEk9g0Tw5iOw=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.0 h1:fTLR6dLDTGChAjecRPlVrKeznT0rVdzR4yn9Z68MTGk=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.0/go.mod h1:V0jbRy1/IPapnkqgXSwVOFB+u5pnCwd9S+R3pKWULC4=
github.com/aws/aws-sdk-go-v2/service/ecr v1.50.4 h1:kPe1ZLqERYZxxDi6ysoX4oYavSJ6lkGaadsN1ogg3I8=
github.com/aws/aws-sdk-go-v2/service/ecr v1.50.4/go.mod h1:cAJR/1pLXISKFSSJsrsTZPw05PLL5xOIpbbzxM7GLiI=
github.com/aws/aws-sdk-go-v2/service/ecs v1.64.1 h1:kAzHjjqQnu3ET5/cX1N5tKPqtExYk97wpD6MpRadq/A=
github.com/aws/aws-sdk-go-v2/service/ecs v1.64.1/go.mod h1:HIaZTpBD7+mgQEIv2wMzXYJw2T23sMFVNp2Mkw/ODFk=
github.com/aws/aws-sdk-go-v2/service/iam v1.47.6 h1:EWehQXACWr+6hzfZPwZChlfoVhiUCfLHE0Xh3kAfzWQ=
github.com/aws/aws-sdk-go-v2/service/iam v1.47.6/go.mod h1:qRXgEBWPIltrWHQwU+HkyBvwh1QgeigFcaCGCIVrWk0=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.8 h1:tIN8MFT1z5STK5kTdOT1TCfMN/bn5fSEnlKsTL8qBOU=
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.8.8/go.mod h1:VKS56txtNWjKI8FqD/hliL0BcshyF4ZaLBa1rm2Y+5s=
github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.8 h1:0lJ7+zL81zesTu1nd1ocKpEoYi6BqDppjoAJLn18Vr0=
github.com/aws/aws-sdk-go-v2/service/internal/endpoint-discovery v1.11.8/go.mod h1:5t+iImUczd3RYSVnc20t/ohBrmrkpdcy89pm62BSDQo=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.8 h1:M6JI2aGFEzYxsF6CXIuRBnkge9Wf9a2xU39rNeXgu10=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.8/go.mod h1:Fw+MyTwlwjFsSTE31mH211Np+CUslml8mzc0AFEG09s=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.8 h1:AgYCo1Rb8XChJXA871BXHDNxNWOTAr6V5YdsRIBbgv0=
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.8/go.mod h1:Au9dvIGm1Hbqnt29d3VakOCQuN9l0WrkDDTRq8biWS4=
github.com/aws/aws-sdk-go-v2/service/kms v1.45.5 h1:5AsmehPcxIp+Y8GVRa91UKpu3AO1gxhdckippth6bnA=
github.com/aws/aws-sdk-go-v2/service/kms v1.45.5/go.mod h1:ooAdc5n3rjgEznIXncCYY6V9+YQDcJAYyZDJ4TwLSDM=
github.com/aws/aws-sdk-go-v2/service/lambda v1.77.5 h1:rKc5Ad3PJlXGo5pigWii+m/hSPgxbNJtOicEP5nbV2E=
github.com/aws/aws-sdk-go-v2/service/lambda v1.77.5/go.mod h1:fPYDox6U6puh6xhMyWpUWd19QIIqMlcQ6iCdC1jk2cE=
github.com/aws/aws-sdk-go-v2/service/rds v1.107.1 h1:j7GQZWF0CbHCObPEZUK6QuP3yUQwjBJmlaojHPRZ6f8=
github.com/aws/aws-sdk-go-v2/service/rds v1.107.1/go.mod h1:OW/mwGWAs6l1HnZpJupatcUFt1V0y6OiUMUp+Wd0DEc=
github.com/aws/aws-sdk-go-v2/service/route53 v1.58.3 h1:jQzRC+0eI/l5mFXVoPTyyolrqyZtKIYaKHSuKJoIJKs=
github.com/aws/aws-sdk-go-v2/service/route53 v1.58.3/go.mod h1:1GNaojT/gG4Ru9tT39ton6kRZ3FvptJ/QRKBoqUOVX4=
github.com/aws/aws-sdk-go-v2/service/s3 v1.88.2 h1:T7b3qniouutV5Wwa9B1q7gW+Y8s1B3g9RE9qa7zLBIM=
github.com/aws/aws-sdk-go-v2/service/s3 v1.88.2/go.mod h1:tW9TsLb6t1eaTdBE6LITyJW1m/+DjQPU78Q/jT2FJu8=
github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.39.5 h1:ssRo1z8FdFaoZc1AWz1R6/amdsxy56akVPql15/AYSs=
github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.39.5/go.mod h1:ut4ISJEOb5t2M1DNfx1787tF3UJGlwF3Q97uEulV/lU=
github.com/aws/aws-sdk-go-v2/service/sns v1.38.4 h1:MkaMcZGwW9vt0cW+N2i5JSF/zkxKyDqpGCP1VWip3YM=
github.com/aws/aws-sdk-go-v2/service/sns v1.38.4/go.mod h1:S0rwG+VHP1/jKoT6xJDe8f8Apz9HO42dUI8DmnOzYYU=
github.com/aws/aws-sdk-go-v2/service/sqs v1.42.7 h1:KZldI+77SMG8vHDE55HYSjPcKSeOy2WIRo+HtIz2IY8=
github.com/aws/aws-sdk-go-v2/service/sqs v1.42.7/go.mod h1:wbgNsM9psd+xQtLSDUAICjFCT/HXNZIgx3qyjqQNt88=
github.com/aws/aws-sdk-go-v2/service/ssm v1.65.0 h1:6bPuMpky+qG4L7VQ1RyYVkBrEix1JRC/JPweTRfRDko=
github.com/aws/aws-sdk-go-v2/service/ssm v1.65.0/go.mod h1:mbnkxOJSgkV4YHA5dWSlLolvC1EuxNcaGfn0Gf4e9UU=
github.com/aws/aws-sdk-go-v2/service/sso v1.29.4 h1:FTdEN9dtWPB0EOURNtDPmwGp6GGvMqRJCAihkSl/1No=
github.com/aws/aws-sdk-go-v2/service/sso v1.29.4/go.mod h1:mYubxV9Ff42fZH4kexj43gFPhgc/LyC7KqvUKt1watc=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.0 h1:I7ghctfGXrscr7r1Ga/mDqSJKm7Fkpl5Mwq79Z+rZqU=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.0/go.mod h1:Zo9id81XP6jbayIFWNuDpA6lMBWhsVy+3ou2jLa4JnA=
github.com/aws/aws-sdk-go-v2/service/sts v1.38.5 h1:+LVB0xBqEgjQoqr9bGZbRzvg212B0f17JdflleJRNR4=
github.com/aws/aws-sdk-go-v2/service/sts v1.38.5/go.mod h1:xoaxeqnnUaZjPjaICgIy5B+MHCSb/ZSOn4MvkFNOUA0=
github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE=
github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI=
github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/boombuler/barcode v1.1.0 h1:ChaYjBR63fr4LFyGn8E8nt7dBSt3MiU3zMOZqFvVkHo=
github.com/boombuler/barcode v1.1.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes=
github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk=
github.com/go-errors/errors v1.5.1/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og=
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-openapi/jsonpointer v0.22.0 h1:TmMhghgNef9YXxTu1tOopo+0BGEytxA+okbry0HjZsM=
github.com/go-openapi/jsonpointer v0.22.0/go.mod h1:xt3jV88UtExdIkkL7NloURjRQjbeUgcxFblMjq2iaiU=
github.com/go-openapi/jsonreference v0.21.1 h1:bSKrcl8819zKiOgxkbVNRUBIr6Wwj9KYrDbMjRs0cDA=
github.com/go-openapi/jsonreference v0.21.1/go.mod h1:PWs8rO4xxTUqKGu+lEvvCxD5k2X7QYkKAepJyCmSTT8=
github.com/go-openapi/swag v0.25.0 h1:xyZhlgInBg6wOtyTD5b+pzwVqHSOliAvgvKW+POFUts=
github.com/go-openapi/swag v0.25.0/go.mod h1:yhsa7GJvO1JBFZccLq9uh/MawsC0PQd8sNz88VBXQlU=
github.com/go-openapi/swag/cmdutils v0.25.0 h1:iYZ24DEGPEk6L1jO09vw39KfpxbG7KhS+WeQexS8U5A=
github.com/go-openapi/swag/cmdutils v0.25.0/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0=
github.com/go-openapi/swag/conv v0.25.0 h1:5K+e44HkOgCVE0IJTbivurzHahT62DPr2DEJqR/+4pA=
github.com/go-openapi/swag/conv v0.25.0/go.mod h1:oa1ZZnb1jubNdZlD1iAhGXt6Ic4hHtuO23MwTgAXR88=
github.com/go-openapi/swag/fileutils v0.25.0 h1:t7aQRuRfsP29dY4vfrNvDZv7RurwRHuyjUedtYVDmYY=
github.com/go-openapi/swag/fileutils v0.25.0/go.mod h1:+NXtt5xNZZqmpIpjqcujqojGFek9/w55b3ecmOdtg8M=
github.com/go-openapi/swag/jsonname v0.25.0 h1:+fuNs9gdkb2w10hgsgOBx9jtx0pvtUaDRYxD91BEpEQ=
github.com/go-openapi/swag/jsonname v0.25.0/go.mod h1:71Tekow6UOLBD3wS7XhdT98g5J5GR13NOTQ9/6Q11Zo=
github.com/go-openapi/swag/jsonutils v0.25.0 h1:ELKpJT29T4N/AvmDqMeDFLx2QRZQOYFthzctbIX30+A=
github.com/go-openapi/swag/jsonutils v0.25.0/go.mod h1:KYL8GyGoi6tek9ajpvn0le4BWmKoUVVv8yPxklViIMo=
github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.0 h1:ca9vKxLnJegL2bzqXRWNabKdqVGxBzrnO8/UZnr5W0Y=
github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.0/go.mod h1:kjmweouyPwRUEYMSrbAidoLMGeJ5p6zdHi9BgZiqmsg=
github.com/go-openapi/swag/loading v0.25.0 h1:e9mjE5fJeaK0LTepHMtG0Ief+9ETXLFhWCx7ZfiI6LI=
github.com/go-openapi/swag/loading v0.25.0/go.mod h1:2ZCWXwVY1XYuoue8Bdjbn5GJK4/ufXbCfcvoSPFQJqM=
github.com/go-openapi/swag/mangling v0.25.0 h1:VdTfDWX5lS3yURxYHF5SK7kYelSK69Lv2xEAeudTzM8=
github.com/go-openapi/swag/mangling v0.25.0/go.mod h1:CdiMQ6pnfAgyQGSOIYnZkXvqhnnwOn997uXZMAd/7mQ=
github.com/go-openapi/swag/netutils v0.25.0 h1:/e1LPmXfF9fcOYbbaP3+SQgon1fRwe5EZ0FjpR4vAjs=
github.com/go-openapi/swag/netutils v0.25.0/go.mod h1:CAkkvqnUJX8NV96tNhEQvKz8SQo2KF0f7LleiJwIeRE=
github.com/go-openapi/swag/stringutils v0.25.0 h1:iYfCF45GUeI/1Yrh8rQtTFCp5K1ToqWhUdzJZwvXvv8=
github.com/go-openapi/swag/stringutils v0.25.0/go.mod h1:JLdSAq5169HaiDUbTvArA2yQxmgn4D6h4A+4HqVvAYg=
github.com/go-openapi/swag/typeutils v0.25.0 h1:iUTsxu3F3h9v6CBzVFGXKPSBQt6d8XXgYy1YAlu+HJ8=
github.com/go-openapi/swag/typeutils v0.25.0/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8=
github.com/go-openapi/swag/yamlutils v0.25.0 h1:apgy77seWLEM9HKDcieIgW8bG9aSZgH6nQ9THlHYgHA=
github.com/go-openapi/swag/yamlutils v0.25.0/go.mod h1:0JvBRtc0mR02IqHURUeGgS9cG+Dfms4FCGXCnsgnt7c=
github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo=
github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/gonvenience/bunt v1.4.2 h1:nTgkFZsw38SIJKABhLj8aXj2rqion9Zo1so/EBkbFBY=
github.com/gonvenience/bunt v1.4.2/go.mod h1:WjyEO2rSYR+OLZg67Ucl+gjdXPs8GpFl63SCA02XDyI=
github.com/gonvenience/idem v0.0.2 h1:jWHknjPfSbiWgYKre9wB2FhMgVLd1RWXCXzVq+7VIWg=
github.com/gonvenience/idem v0.0.2/go.mod h1:0Xv1MpnNL40+dsyOxaJFa7L8ekeTRr63WaWXpiWLFFM=
github.com/gonvenience/neat v1.3.16 h1:Vb0iCkSHGWaA+ry69RY3HpQ6Ooo6o/g2wjI80db8DjI=
github.com/gonvenience/neat v1.3.16/go.mod h1:sLxdQNNluxbpROxTTHs3XBSJX8fwFX5toEULUy74ODA=
github.com/gonvenience/term v1.0.4 h1:qkCGfmUtpzs9W4jWgNijaGF6dg3oSIh+kZCzT5cPNZY=
github.com/gonvenience/term v1.0.4/go.mod h1:OzNdQC5NVBou9AifaHd1QG6EP8iDdpaT7GFm1bVgslg=
github.com/gonvenience/text v1.0.9 h1:U29BxT3NZnNPcfiEnAwt6yHXe38fQs2Q+WTqs1X+atI=
github.com/gonvenience/text v1.0.9/go.mod h1:JQF1ifXNRaa66jnPLqoITA+y8WATlG0eJzFC9ElJS3s=
github.com/gonvenience/ytbx v1.4.7 h1:3wJ7EOfdv3Lg+h0mzKo7f8d1zMY1EJtVzzYrA3UhjHQ=
github.com/gonvenience/ytbx v1.4.7/go.mod h1:ZmAU727eOTYeC4aUJuqyb9vogNAN7NiSKfw6Aoxbqys=
github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo=
github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
github.com/gruntwork-io/go-commons v0.17.2 h1:14dsCJ7M5Vv2X3BIPKeG9Kdy6vTMGhM8L4WZazxfTuY=
github.com/gruntwork-io/go-commons v0.17.2/go.mod h1:zs7Q2AbUKuTarBPy19CIxJVUX/rBamfW8IwuWKniWkE=
github.com/gruntwork-io/terratest v0.51.0 h1:RCXlCwWlHqhUoxgF6n3hvywvbvrsTXqoqt34BrnLekw=
github.com/gruntwork-io/terratest v0.51.0/go.mod h1:evZHXb8VWDgv5O5zEEwfkwMhkx9I53QR/RB11cISrpg=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/homeport/dyff v1.10.2 h1:XyB+D0KVwjbUFTZYIkvPtsImwkfh+ObH2CEdEHTqdr4=
github.com/homeport/dyff v1.10.2/go.mod h1:0kIjL/JOGaXigzrLY6kcl5esSStbAa99r6GzEvr7lrs=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.6 h1:rWQc5FwZSPX58r1OQmkuaNicxdmExaEz5A2DO2hUuTk=
github.com/jackc/pgx/v5 v5.7.6/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-ciede2000 v0.0.0-20170301095244-782e8c62fec3 h1:BXxTozrOU8zgC5dkpn3J6NTRdoP+hjok/e+ACr4Hibk=
github.com/mattn/go-ciede2000 v0.0.0-20170301095244-782e8c62fec3/go.mod h1:x1uk6vxTiVuNt6S5R2UYgdhpj3oKojXvOXauHZ7dEnI=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-zglob v0.0.6 h1:mP8RnmCgho4oaUYDIDn6GNxYk+qJGUs8fJLn+twYj2A=
github.com/mattn/go-zglob v0.0.6/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY=
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
github.com/mitchellh/hashstructure v1.1.0 h1:P6P1hdjqAAknpY/M1CGipelZgp+4y9ja9kmUZPXP+H0=
github.com/mitchellh/hashstructure v1.1.0/go.mod h1:xUDAozZz0Wmdiufv0uyhnHkUTN6/6d8ulp4AwfLKrmA=
github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8=
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus=
github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8=
github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY=
github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pquerna/otp v1.5.0 h1:NMMR+WrmaqXU4EzdGJEE1aUUI0AMRzsp96fFFWNPwxs=
github.com/pquerna/otp v1.5.0/go.mod h1:dkJfzwRKNiegxyNb54X/3fLwhCynbMspSyWKnvi1AEg=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/texttheater/golang-levenshtein v1.0.1 h1:+cRNoVrfiwufQPhoMzB6N0Yf/Mqajr6t1lOv8GyGE2U=
github.com/texttheater/golang-levenshtein v1.0.1/go.mod h1:PYAKrbF5sAiq9wd+H82hs7gNaen0CplQ9uvm6+enD/8=
github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU=
github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4=
github.com/virtuald/go-ordered-json v0.0.0-20170621173500-b18e6e673d74 h1:JwtAtbp7r/7QSyGz8mKUbYJBg2+6Cd7OjM8o/GNOcVo=
github.com/virtuald/go-ordered-json v0.0.0-20170621173500-b18e6e673d74/go.mod h1:RmMWU37GKR2s6pgrIEB4ixgpVCt/cf7dnJv3fuH1J1c=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg=
github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI=
golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8=
golang.org/x/exp v0.0.0-20250911091902-df9299821621 h1:2id6c1/gto0kaHYyrixvknJ8tUK/Qs5IsmBtrc+FtgU=
golang.org/x/exp v0.0.0-20250911091902-df9299821621/go.mod h1:TwQYMMnGpvZyc+JpB/UAuTNIsVJifOlSkrZkhcvpVUk=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I=
golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
golang.org/x/oauth2 v0.31.0 h1:8Fq0yVZLh4j4YA47vHKFTa9Ew5XIrCP8LC6UeNZnLxo=
golang.org/x/oauth2 v0.31.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.35.0 h1:bZBVKBudEyhRcajGcNc3jIfWPqV4y/Kt2XcoigOWtDQ=
golang.org/x/term v0.35.0/go.mod h1:TPGtkTLesOwf2DE8CgVYiZinHAOuy5AYUYT1lENIZnA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI=
golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE=
golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw=
google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM=
k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk=
k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4=
k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY=
k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0=
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco=
sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=


================================================
FILE: contrib/charts/dragonfly/golden_test.go
================================================
package golden

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"testing"

	"github.com/gruntwork-io/terratest/modules/helm"
)

var update = flag.Bool("update", false, "update golden test output files")

func TestHelmRender(t *testing.T) {
	files, err := os.ReadDir("./ci")
	if err != nil {
		t.Fatal(err)
	}

	for _, f := range files {
		if !f.IsDir() && strings.HasSuffix(f.Name(), ".yaml") && !strings.HasSuffix(f.Name(), ".golden.yaml") {
			// Render this values.yaml file
			output := helm.RenderTemplate(t,
				&helm.Options{
					ValuesFiles: []string{"ci/" + f.Name()},
				},
				"../dragonfly",
				"test",
				nil,
			)

			goldenFile := "ci/" + strings.TrimSuffix(f.Name(), filepath.Ext(".yaml")) + ".golden.yaml"
			regex := regexp.MustCompile(`\s+helm.sh/chart:\s+.*`)
			bytes := regex.ReplaceAll([]byte(output), []byte(""))

			output = fmt.Sprintf("%s\n", string(bytes))

			if *update {
				err := os.WriteFile(goldenFile, []byte(output), 0644)
				if err != nil {
					t.Fatal(err)
				}
			}

			expected, err := os.ReadFile(goldenFile)
			if err != nil {
				t.Fatal(err)
			}

			if string(expected) != output {
				t.Fatalf("Expected %s, but got %s\n. Update golden files by running `go test -v ./... -update`", string(expected), output)
			}
		}
	}
}


================================================
FILE: contrib/charts/dragonfly/templates/NOTES.txt
================================================
1. Get the application URL by running these commands:

{{- if contains "NodePort" .Values.service.type }}
  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dragonfly.fullname" . }})
  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
  echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
           You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "dragonfly.fullname" . }}'
  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "dragonfly.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
  echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "dragonfly.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "You can use redis-cli to connect against localhost:6379"
  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 6379:$CONTAINER_PORT
{{- end }}

================================================
FILE: contrib/charts/dragonfly/templates/_helpers.tpl
================================================
{{/*
Expand the name of the chart.
*/}}
{{- define "dragonfly.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "dragonfly.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "dragonfly.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "dragonfly.labels" -}}
helm.sh/chart: {{ include "dragonfly.chart" . }}
{{ include "dragonfly.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- include "dragonfly.commonLabels" . }}
{{- end }}

{{/*
User-defined common labels
*/}}
{{- define "dragonfly.commonLabels" -}}
{{- if .Values.commonLabels }}
{{- range $key, $value := .Values.commonLabels }}
{{ $key }}: {{ $value }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "dragonfly.selectorLabels" -}}
app.kubernetes.io/name: {{ include "dragonfly.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "dragonfly.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "dragonfly.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/_pod.tpl
================================================
{{- define "dragonfly.volumemounts" -}}
{{- if or (.Values.storage.enabled) (.Values.extraVolumeMounts) (.Values.tls.enabled) }}
volumeMounts:
  {{- if .Values.storage.enabled }}
  - mountPath: /data
    name: "{{ .Release.Name }}-data"
  {{- end }}
  {{- if and .Values.tls .Values.tls.enabled }}
  - mountPath: /etc/dragonfly/tls
    name: tls
  {{- end }}
  {{- with .Values.extraVolumeMounts }}
    {{- toYaml . | trim | nindent 2 }}
  {{- end }}
{{- end }}
{{- end }}

{{- define "dragonfly.pod" -}}
{{- if ne .Values.priorityClassName "" }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
  {{- toYaml . | trim | nindent 2 -}}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
  {{- toYaml . | trim | nindent 2 -}}
{{- end }}
{{- with .Values.affinity }}
affinity:
  {{- toYaml . | trim | nindent 2 -}}
{{- end }}
serviceAccountName: {{ include "dragonfly.serviceAccountName" . }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
  {{- toYaml . | trim | nindent 2 }}
{{- end }}
{{- with .Values.podSecurityContext }}
securityContext:
  {{- toYaml . | trim | nindent 2 }}
{{- end }}
{{- if and (eq (typeOf .Values.hostNetwork) "bool") .Values.hostNetwork }}
hostNetwork: true
{{- end }}
{{- with .Values.topologySpreadConstraints }}
topologySpreadConstraints:
  {{- toYaml . | trim | nindent 2 }}
{{- end }}
{{- with .Values.initContainers }}
initContainers:
  {{- if eq (typeOf .) "string" }}
  {{- tpl . $ | trim | nindent 2 }}
  {{- else }}
  {{- toYaml . | trim | nindent 2 }}
  {{- end }}
{{- end }}
containers:
  {{- with .Values.extraContainers }}
  {{- if eq (typeOf .) "string" -}}
  {{- tpl . $ | trim | nindent 2 }}
  {{- else }}
  {{- toYaml . | trim | nindent 2 }}
  {{- end }}
  {{- end }}
  - name: {{ .Chart.Name }}
    {{- with .Values.securityContext }}
    securityContext:
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}
    image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
    imagePullPolicy: {{ .Values.image.pullPolicy }}
    ports:
      - name: dragonfly
        containerPort: 6379
        protocol: TCP
    {{- with .Values.probes }}
    {{- toYaml . | trim | nindent 4 }}
    {{- end }}
    {{- with .Values.command }}
    command:
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}
    args:
      - "--alsologtostderr"
    {{- with .Values.extraArgs }}
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}
    {{- if .Values.tls.enabled }}
      - "--tls"
      - "--tls_cert_file=/etc/dragonfly/tls/tls.crt"
      - "--tls_key_file=/etc/dragonfly/tls/tls.key"
    {{- end }}
    {{- with .Values.resources }}
    resources:
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}
    {{- include "dragonfly.volumemounts" . | trim | nindent 4 }}
    {{- if or .Values.passwordFromSecret.enable .Values.env }}
    env:
    {{- if .Values.passwordFromSecret.enable }}
    {{- $appVersion := .Chart.AppVersion | trimPrefix "v" }}
    {{- $imageTag := .Values.image.tag | trimPrefix "v" }}
    {{- $effectiveVersion := $appVersion }}
    {{- if and $imageTag (ne $imageTag "") }}
      {{- $effectiveVersion = $imageTag }}
    {{- end }}
    {{- if semverCompare ">=1.14.0" $effectiveVersion }}
      - name: DFLY_requirepass
    {{- else }}
      - name: DFLY_PASSWORD
    {{- end }}
        valueFrom:
          secretKeyRef:
            name: {{ tpl .Values.passwordFromSecret.existingSecret.name $ }}
            key: {{ .Values.passwordFromSecret.existingSecret.key }}
    {{- end }}
    {{- with .Values.env }}
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}
    {{- end }}
    {{- with .Values.envFrom }}
    envFrom:
      {{- toYaml . | trim | nindent 6 }}
    {{- end }}

{{- if or (.Values.tls.enabled) (.Values.extraVolumes) }}
volumes:
{{- if and .Values.tls .Values.tls.enabled }}
  {{- if .Values.tls.existing_secret }}
  - name: tls
    secret:
      secretName: {{ .Values.tls.existing_secret }}
  {{- else if .Values.tls.createCerts }}
  - name: tls
    secret:
      secretName: '{{ include "dragonfly.fullname" . }}-server-tls'
  {{- else }}
  - name: tls
    secret:
      secretName: {{ include "dragonfly.fullname" . }}-tls
  {{- end }}
{{- end }}
{{- with .Values.extraVolumes }}
  {{- toYaml . | trim | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/certificate.yaml
================================================
{{- if and .Values.tls.enabled .Values.tls.createCerts }}
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ include "dragonfly.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  commonName: '{{ include "dragonfly.fullname" . }}'
  dnsNames:
  - '*.{{ include "dragonfly.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local'
  - '{{ include "dragonfly.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local'
  - '{{ include "dragonfly.fullname" . }}.{{ .Release.Namespace }}.svc'
  - '{{ include "dragonfly.fullname" . }}.{{ .Release.Namespace }}'
  - '{{ include "dragonfly.fullname" . }}'
  - localhost
  duration: {{ required "tls.duration is required, if createCerts is enabled" .Values.tls.duration }}
  ipAddresses:
  - 127.0.0.1
  issuerRef:
    kind: {{ required "tls.issuer.kind is required, if createCerts is enabled" .Values.tls.issuer.kind }}
    name: {{ required "tls.issuer.name is required, if createCerts is enabled" .Values.tls.issuer.name }}
    group: {{ .Values.tls.issuer.group }}
  secretName: '{{ include "dragonfly.fullname" . }}-server-tls'
  usages:
  - client auth
  - server auth
  - signing
  - key encipherment
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/deployment.yaml
================================================
{{- if not .Values.storage.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "dragonfly.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      {{- include "dragonfly.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        {{- if and (.Values.tls.enabled) (not .Values.tls.existing_secret) }}
        checksum/tls-secret: {{ include (print $.Template.BasePath "/tls-secret.yaml") . | sha256sum }}
        {{- end }}
        {{- with .Values.podAnnotations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      labels:
        {{- include "dragonfly.selectorLabels" . | nindent 8 }}
        {{- if .Values.commonLabels }}
        {{- include "dragonfly.commonLabels" . | trim | nindent 8 }}
        {{- end }}
    spec:
      {{- include "dragonfly.pod" . | trim | nindent 6 }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/extra-manifests.yaml
================================================
{{ range .Values.extraObjects }}
---
{{ tpl (toYaml .) $ }}
{{ end }}


================================================
FILE: contrib/charts/dragonfly/templates/metrics-service.yaml
================================================
{{- if .Values.serviceMonitor.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "dragonfly.fullname" . }}-metrics
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
    type: metrics
spec:
  type: {{ .Values.service.metrics.serviceType }}
  ports:
    - name: {{ .Values.service.metrics.portName }}
      port: {{ .Values.service.port }}
      targetPort: {{ .Values.service.port }}
      protocol: TCP
  selector:
    {{- include "dragonfly.selectorLabels" . | nindent 4 }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/prometheusrule.yaml
================================================
{{- if and ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) .Values.serviceMonitor.enabled .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "dragonfly.fullname" . }}-metrics
  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  groups:
  - name: {{ template "dragonfly.name" . }}
    rules:
      {{- toYaml .Values.prometheusRule.spec | nindent 6 }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/service.yaml
================================================
apiVersion: v1
kind: Service
metadata:
  name: {{ include "dragonfly.fullname" . }}
  namespace: {{ .Release.Namespace }}
  {{- with .Values.service.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{- with .Values.service.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  {{- if and (eq .Values.service.type "LoadBalancer") (ne .Values.service.loadBalancerIP "") }}
  loadBalancerIP: {{ .Values.service.loadBalancerIP }}
  {{- end }}
  {{- if and (eq .Values.service.type "ClusterIP") (ne .Values.service.clusterIP "") }}
  clusterIP: {{ .Values.service.clusterIP }}
  {{- end }}
  ports:
    - port: {{ .Values.service.port }}
      targetPort: dragonfly
      protocol: TCP
      name: dragonfly
  selector:
    {{- include "dragonfly.selectorLabels" . | nindent 4 }}


================================================
FILE: contrib/charts/dragonfly/templates/serviceaccount.yaml
================================================
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "dragonfly.serviceAccountName" . }}
  namespace: {{ .Release.Namespace }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/servicemonitor.yaml
================================================
{{- if .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ template "dragonfly.fullname" . }}-metrics
  {{- if .Values.serviceMonitor.namespace }}
  namespace: {{ .Values.serviceMonitor.namespace }}
  {{- else }}
  namespace: {{ .Release.Namespace }}
  {{- end }}
  {{- with .Values.serviceMonitor.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{- with .Values.serviceMonitor.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  endpoints:
    - interval: {{ .Values.serviceMonitor.interval }}
      {{- with .Values.serviceMonitor.scrapeTimeout }}
      scrapeTimeout: {{ . }}
      {{- end }}
      honorLabels: true
      port: {{ default "metrics" .Values.service.metrics.portName }}
      path: /metrics
      {{- if .Values.tls.enabled }}
      scheme: https
      tlsConfig:
        insecureSkipVerify: true
      {{- else }}
      scheme: http
      {{- end }}
  jobLabel: "{{ .Release.Name }}"
  selector:
    matchLabels:
      {{- include "dragonfly.selectorLabels" . | nindent 6 }}
      type: metrics
  namespaceSelector:
    matchNames:
      - {{ .Release.Namespace }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/statefulset.yaml
================================================
{{- if .Values.storage.enabled }}
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ include "dragonfly.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
spec:
  serviceName: {{ .Release.Name }}
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      {{- include "dragonfly.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        {{- if and (.Values.tls.enabled) (not .Values.tls.existing_secret) }}
        checksum/tls-secret: {{ include (print $.Template.BasePath "/tls-secret.yaml") . | sha256sum }}
        {{- end }}
        {{- with .Values.podAnnotations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      labels:
        {{- include "dragonfly.selectorLabels" . | nindent 8 }}
        {{- if .Values.commonLabels }}
        {{- include "dragonfly.commonLabels" . | trim | nindent 8 }}
        {{- end }}
    spec:
      {{- include "dragonfly.pod" . | trim | nindent 6 }}
  volumeClaimTemplates:
    - metadata:
        name: "{{ .Release.Name }}-data"
      spec:
        accessModes: [ "ReadWriteOnce" ]
        storageClassName: {{ .Values.storage.storageClassName }}
        resources:
          requests:
            storage: {{ .Values.storage.requests }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/templates/tls-secret.yaml
================================================
{{- if and (.Values.tls.enabled) (.Values.tls.cert) (.Values.tls.key) (not .Values.tls.existing_secret) }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "dragonfly.fullname" . }}-tls
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dragonfly.labels" . | nindent 4 }}
type: kubernetes.io/tls
data:
  tls.crt: {{ default "" .Values.tls.cert | b64enc | quote }}
  tls.key: {{ default "" .Values.tls.key | b64enc | quote }}
{{- end }}


================================================
FILE: contrib/charts/dragonfly/values.yaml
================================================
# Default values for dragonfly.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# -- Number of replicas to deploy
replicaCount: 1

image:
  # -- Container Image Registry to pull the image from
  repository: docker.dragonflydb.io/dragonflydb/dragonfly
  # -- Dragonfly image pull policy
  pullPolicy: IfNotPresent
  # -- Overrides the image tag whose default is the chart appVersion.
  tag: ""

# -- Container Registry Secret names in an array
imagePullSecrets: []

# -- String to partially override dragonfly.fullname
nameOverride: ""

# -- String to fully override dragonfly.fullname
fullnameOverride: ""

# -- Common labels to add to all resources
commonLabels: {}

serviceAccount:
  # -- Specifies whether a service account should be created
  create: true
  # -- Annotations to add to the service account
  annotations: {}
  # -- The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

# -- Annotations for pods
podAnnotations: {}

# -- Set securityContext for pod itself
podSecurityContext: {}
  # fsGroup: 2000

# -- Set securityContext for containers
securityContext: {}
  # capabilities:
  #   drop:
  #   - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000

# -- Set hostNetwork for pod
hostNetwork: false

service:
  # -- Service type to provision. Can be NodePort, ClusterIP or LoadBalancer
  type: ClusterIP
  # -- Load balancer static ip to use when service type is set to LoadBalancer
  loadBalancerIP: ""
  # -- Cluster IP address to assign to the service. Leave empty to auto-allocate
  clusterIP: ""
  # -- Dragonfly service port
  port: 6379
  # -- Extra annotations for the service
  annotations: {}
  # -- Extra labels for the service
  labels: {}
  metrics:
    # -- name for the metrics port
    portName: metrics
    # -- serviceType for the metrics service
    serviceType: ClusterIP

serviceMonitor:
  # -- If true, a ServiceMonitor CRD is created for a prometheus operator
  enabled: false
  # -- namespace in which to deploy the ServiceMonitor CR. defaults to the application namespace
  namespace: ""
  # -- additional labels to apply to the metrics
  labels: {}
  # -- additional annotations to apply to the metrics
  annotations: {}
  # -- scrape interval
  interval: 10s
  # -- scrape timeout
  scrapeTimeout: 10s

prometheusRule:
  # -- Deploy a PrometheusRule
  enabled: false
  # -- PrometheusRule.Spec
  # https://awesome-prometheus-alerts.grep.to/rules
  spec: []

storage:
  # -- If /data should persist. This will provision a StatefulSet instead.
  enabled: false
  # -- Global StorageClass for Persistent Volume(s)
  storageClassName: ""
  # -- Volume size to request for the PVC
  requests: 128Mi

tls:
  # -- enable TLS
  enabled: false
  # -- use cert-manager to automatically create the certificate
  createCerts: false
  # -- duration or ttl of the validity of the created certificate
  duration: 87600h0m0s
  issuer:
    # -- cert-manager issuer kind. Usually Issuer or ClusterIssuer
    kind: ClusterIssuer
    # -- name of the referenced issuer
    name: selfsigned
    # -- group of the referenced issuer
    # if you are using an external issuer, change this to that issuer group.
    group: cert-manager.io
  # -- use TLS certificates from existing secret
  existing_secret: ""
  # -- TLS certificate
  cert: ""
  # cert: |
  #   -----BEGIN CERTIFICATE-----
  #   MIIDazCCAlOgAwIBAgIUfV3ygaaVW3+yzK5Dq6Aw6TsZ494wDQYJKoZIhvcNAQEL
  #   ...
  #   BQAwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
  #   zJAL4hNw4Tr6E52fqdmX
  #   -----END CERTIFICATE-----
  # -- TLS private key
  key: ""
  # key: |
  #   -----BEGIN RSA PRIVATE KEY-----
  #   MIIEpAIBAAKCAQEAxeD5iQGQpCUlksFvjzzAxPTw6DMJd3MpifV+HoBY4LiTyDer
  #   ...
  #   HLunol88AeTOcKfD6hBYGvcRfu5NV29jJxZCOBfbFQXjnNlnrhRCag==
  #   -----END RSA PRIVATE KEY-----

# If enabled will set DFLY_PASSWORD environment variable with the specified existing secret value
# Note that if enabled and the secret does not exist pods will not start
passwordFromSecret:
  enable: false
  existingSecret:
    name: ""
    key: ""


probes:
  livenessProbe:
    exec:
      command:
        - /bin/sh
        - /usr/local/bin/healthcheck.sh
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
    successThreshold: 1
  readinessProbe:
    exec:
      command:
        - /bin/sh
        - /usr/local/bin/healthcheck.sh
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
    successThreshold: 1

# -- Allow overriding the container's command
command: []

# -- Extra arguments to pass to the dragonfly binary
extraArgs: []

# -- Extra volumes to mount into the pods
extraVolumes: []

# -- Extra volume mounts corresponding to the volumes mounted above
extraVolumeMounts: []

# -- A list of initContainers to run before each pod starts
initContainers: []

# -- Additional sidecar containers
extraContainers: []

# -- extra K8s manifests to deploy
extraObjects: []
  # - apiVersion: cert-manager.io/v1
  #   kind: ClusterIssuer
  #   metadata:
  #     name: selfsigned
  #   spec:
  #     selfSigned: {}

resources:
  # -- The requested resources for the containers
  requests: {}
  #   cpu: 100m
  #   memory: 128Mi
  # -- The resource limits for the containers
  limits: {}
  #   cpu: 100m
  #   memory: 128Mi

# -- extra environment variables
env: []

# -- extra environment variables from K8s objects
envFrom: []

# -- Priority class name for pod assignment
priorityClassName: ""

# -- Node labels for pod assignment
nodeSelector: {}

# -- Tolerations for pod assignment
tolerations: []

# -- Affinity for pod assignment
affinity: {}

# -- Topology Spread Constraints for pod assignment
topologySpreadConstraints: []


================================================
FILE: contrib/docker/README.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img src="https://raw.githubusercontent.com/dragonflydb/dragonfly/main/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>


# Dragonfly DB with Docker Compose

This guide will have you up running DragonflyDB with `docker-compose` in just a few minutes.

| This guide assumes you have `docker` and `docker-compose` installed on your machine. If not, [Install Docker](https://docs.docker.com/get-docker/) and [Install Docker Compose](https://docs.docker.com/compose/install/) before continuing.

## Step 1

```bash
# Download Official Dragonfly DB Docker Compose File
wget https://raw.githubusercontent.com/dragonflydb/dragonfly/main/contrib/docker/docker-compose.yml

# Launch the Dragonfly DB Instance
docker-compose up -d

# Confirm image is up
docker ps | grep dragonfly
# ac94b5ba30a0   docker.dragonflydb.io/dragonflydb/dragonfly   "entrypoint.sh drago…"   45 seconds ago   Up 31 seconds         0.0.0.0:6379->6379/tcp, :::6379->6379/tcp   docker_dragonfly_1

# Log follow the dragonfly container
docker logs -f docker_dragonfly_1
```

Dragonfly DB will answer to both `http` and `redis` requests out of the box!

You can use `redis-cli` to connect to `localhost:6379` or open a browser and visit `http://localhost:6379`

## Step 2

Connect with a redis client.

From a new terminal:

```bash
redis-cli
127.0.0.1:6379> set hello world
OK
127.0.0.1:6379> keys *
1) "hello"
127.0.0.1:6379> get hello
"world"
127.0.0.1:6379> 
```

## Step 3

Continue being great and build your app with the power of DragonflyDB!  

## Tuning Dragonfly DB
If you are attempting to tune Dragonfly DB for performance, consider `NAT` performance costs associated with containerization.  
> ## Performance Tuning
> ---
> In `docker-compose`, there is a meaningful difference between an `overlay` network(which relies on docker `NAT` traversal on every request) and using the `host` network(see [`docker-compose.yml`](https://github.com/dragonflydb/dragonfly/blob/main/contrib/docker/docker-compose.yml)).  
> &nbsp;  
> Fore more information, see the [official docker-compose network_mode Docs](https://docs.docker.com/compose/compose-file/compose-file-v3/#network_mode)  
> &nbsp;  

### More Build Options
- [Docker Quick Start](/docs/quick-start/)
- [Kubernetes Deployment with Helm Chart](/contrib/charts/dragonfly/)
- [Build From Source](/docs/build-from-source.md)

================================================
FILE: contrib/docker/docker-compose.yml
================================================
services:
  dragonfly:
    image: 'docker.dragonflydb.io/dragonflydb/dragonfly'
    ulimits:
      memlock: -1
    ports:
      - "6379:6379"
    # For better performance, consider `host` mode instead `port` to avoid docker NAT.
    # `host` mode is NOT currently supported in Swarm Mode.
    # https://docs.docker.com/compose/compose-file/compose-file-v3/#network_mode
    # network_mode: "host"
    volumes:
      - dragonflydata:/data
volumes:
  dragonflydata:


================================================
FILE: contrib/scripts/conventional-commits
================================================
#!/usr/bin/env bash

# list of Conventional Commits types
cc_types=("feat" "fix")
default_types=("build" "chore" "ci" "docs" "${cc_types[@]}" "perf" "refactor" "revert" "style" "test")
types=( "${cc_types[@]}" )

if [ $# -eq 1 ]; then
    types=( "${default_types[@]}" )
else
    while [ $# -gt 1 ]; do
        types+=( "$1" )
        shift
    done
fi

msg_file="$1"

r_types="($(IFS='|'; echo "${types[*]}"))"
r_scope="(\([[:alnum:] \/-]+\))?"
r_delim='!?:'
r_subject=" [[:print:]].+"
pattern="^$r_types$r_scope$r_delim$r_subject$"

if grep -Eq "$pattern" "$msg_file"; then
    exit 0
fi

echo "[Commit message] $( cat "$msg_file" )"
echo "
Thank you for your interest in Dragonfly DB. 

To keep things clean, we ask all commits to meet the following criteria:
  - Be Signed (git commit -s -m ...)
  - Valid Conventional Commit https://www.conventionalcommits.org/
  
  Special Commit Words are correlated to versioning. Specifically \"fix\" and \"feat\"
  - fix: a commit of the type fix patches a bug in your codebase (this correlates with PATCH in Semantic Versioning).
  - feat: a commit of the type feat introduces a new feature to the codebase (this correlates with MINOR in Semantic Versioning).
  - Breaking changes have a ! before the \":\"

  Finally, If there is an Issue for this Commit, Please add it to the end of the commit message.
  - Reference Issue Number at End of Commit Message (Optional)

Thank you for helping us label a \`fix\` and \`feat\` properly so that our commits, issues and semantic versioning are all aligned!

A Signed Conventional Commit with Issue Number look like: 

    git commit -s -m \"type(scope): description #112\"

Valid types:

    $(IFS=' '; echo "${types[*]}")

Example Document Change:

    docs(readme): Fix Example Links #121

Example Breaking New Feature
    feat(ingest)!: Add new ingest # 122

This is an example of a fix with an Issue #

    fix(ingest): Refactor for loop to list comprehension #123

Thank you for your contribution!

Sincerely,
The Dragonfly DB Contributors
"
exit 1


================================================
FILE: contrib/scripts/signed-commit
================================================
#!/usr/bin/env bash

if [[ -z "$1" ]] || [[ ! -f "$1" ]]; then
  echo "ERROR: Commit message file not provided or does not exist."
  exit 1
fi

# Check if signed-off-by line is present (automatically added using -s flag)
if ! grep -q 'Signed-off-by:' "$1"; then
  echo "ERROR: Commit message must contain a Signed-off-by line."
  echo ""
  echo "To sign your commits, use the -s flag:"
  echo "  git commit -s -m \"your commit message\""
  exit 1
fi

exit 0


================================================
FILE: docs/README.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img  src="https://raw.githubusercontent.com/dragonflydb/dragonfly/main/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>


# Quick Start

The easiest way to get started with Dragonfly is with Docker.

## Deployment Method

First, choose a deployment method.

If you are new to Dragonfly, we recommend the [DragonflyDB Docker Quick Start Guide](/docs/quick-start/)

Other options:

### - [Docker Compose](/contrib/docker/)

### - [Helm Chart for Kubernetes](/contrib/charts/dragonfly/)


# Learn About DragonflyDB
## [FAQ](/docs/faq.md)

## [Differences Between DragonflyDB and Redis](/docs/differences.md)

## [API Commands Reference](https://dragonflydb.io/docs/category/command-reference)


================================================
FILE: docs/async-tiering.md
================================================
# Async Tiering Design Document

## Background

Our current tiered storage component performs disk operations inline as part of executing shard-local operations. This approach introduces latency when processing commands, impacting both the system's throughput and overall command latency. The following document discusses a potential redesign that addresses this issue and enables the execution of operations without I/O blocking.

```mermaid
graph LR
    %% Left Side: No Tiering
    subgraph S1 [Shard queue no tiering]
        direction TB
        A1[get] --- B1[set]
        B1 --- C1[get]
        C1 --- D1["&nbsp;"]
    end

    %% Spacing and Arrows
    S1 --- Space1[ ]
    Space1 -.-> Space2[ ]
    Space2 --- S2

    %% Right Side: With Tiering
    subgraph S2 [Shard queue with tiering]
        direction TB
        A2["get<br/>I/O read"] --- B2["set<br/>I/O write"]
        B2 --- C2["get<br/>I/O read"]
        C2 --- D2["&nbsp;"]
    end

    %% Styling
    style S1 fill:#fff,stroke:#ffcc00,stroke-width:2px
    style S2 fill:#fff,stroke:#ffcc00,stroke-width:2px
    style A1 fill:#fff,stroke:#ffcc00
    style B1 fill:#fff,stroke:#ffcc00
    style C1 fill:#fff,stroke:#ffcc00
    style D1 fill:none,stroke:none
    style A2 fill:#fff,stroke:#ffcc00
    style B2 fill:#fff,stroke:#ffcc00
    style C2 fill:#fff,stroke:#ffcc00
    style D2 fill:none,stroke:none

    %% Hide the spacer nodes
    style Space1 fill:none,stroke:none
    style Space2 fill:none,stroke:none
```

## High level design

The core goal is to perform tiered I/O operations concurrently while maintaining transparency for the transaction framework designed for instant RAM operations.

Transactions issue asynchronous requests to the tiered storage, returning futures that the coordinating fiber awaits. Operations on the same key execute strictly in order, relying on the transactional framework for correctness, while operations on different keys can be interleaved for efficiency.

### The following diagram depicts a simplified flow for a GET operation:

```mermaid
sequenceDiagram
    participant Coordinator
    participant Shard
    participant Disk

    Coordinator->>Shard: Get
    Shard->>Disk: IO_Read
    Shard-->>Coordinator: ResultFuture
    Disk-->>Shard: ReadCallback
    Shard-->>Coordinator: ResultFulfilled
```

The coordinator fiber schedules a command on a shard thread. The command performs initial work, issues an asynchronous read, and returns a `ResultFuture` to the coordinator. The coordinator waits for fulfillment before replying. This parallelism hides most I/O latency (assuming non-saturated SSDs).

For complex operations like `APPEND`, that require reading the value and modifying it, a post-read handler runs on the shard thread. Since in-place disk modification isn't supported, `APPEND` becomes an IO-READ followed by a handler that modifies the value in memory. The result is returned to the coordinator and the modified value is uploaded to memory and is deleted on disk.

It is important to note that only a single read is issued for all pending asynchronous commands for a given key. Once the read finished, all callbacks are executed consecutively and atomically. This guarantees correctness of operation order and outside observers. This execution loops is aided by specialized Decoder classes that keep an intermediary value in-between modifications or avoid creating it at all for read-only sequences.

Unlike the previous design where `DbSlice::Find(...)` handled tiering transparently, command implementations handling offloaded values must now use callbacks or futures (e.g., via `TieredStorage::Read` or `Modify`).

### Tiered Storage Component

The `TieredStorage` component manages the lifecycle of offloaded items. Externalized blobs are immutable on disk; operations involve stashing new blobs, reading existing ones, or marking them for deletion.

#### Upstream API (TieredStorage)

The primary interface used by commands includes:

1.  `Read(DbIndex, Key, Value) -> Future<string>`: Asynchronously fetch an offloaded value.
2.  `Modify(DbIndex, Key, Value, ModFunc) -> Future<Result>`: Fetch, modify in memory (via callback), and update.
3.  `TryStash(DbIndex, Key, Value) -> Future<bool>`: Schedule a value for offloading.
4.  `Delete(DbIndex, Value)`: Remove offloaded value.
5.  `CancelStash(DbIndex, Key, Value)`: Start cancelling a pending stash operation.

#### Downstream API (DiskStorage)

`DiskStorage` handles file management and async I/O:

1.  `Read(DiskSegment, ReadCb)`: Read a segment from the backing file.
2.  `PrepareStash(Length) -> Result<pair<Offset, UringBuf>>`: Allocate a segment and prepare a buffer.
3.  `Stash(DiskSegment, UringBuf, StashCb)`: Write the buffer to the allocated segment.
4.  `MarkAsFree(DiskSegment)`: Mark a segment for reuse.

`DiskStorage` manages the underlying file growth and page allocation via an `ExternalAllocator`.


```mermaid
graph TB
    subgraph Commands["called by commands or db_slice"]
        READ[READ]
        REMOVE[REMOVE]
        STASH[STASH]
    end

    subgraph TieredStorage["TieredStorage"]
        %% Invisible node to act as a landing point for the box
        TS_TOP[ ]:::invisible

        PR[pending reads<br/>+ remove?<br/>offset -> futures]
        PS[pending stashes<br/>key -> version]

        TS_BOTTOM[ ]:::invisible
    end

    subgraph DiskStorage["DiskStorage"]
        DS_TOP[ ]:::invisible
        EA[external<br/>allocator]
        IM[io manager]
    end

    %% Interactions between Commands and TieredStorage
    READ -.-> |"Future&lt;string&gt;"| TS_TOP
    TS_TOP -.-> READ
    REMOVE -.-> TS_TOP
    STASH -.-> TS_TOP

    %% Interactions between TieredStorage and DiskStorage
    TS_BOTTOM -.-> |"callback based i/o operations"| DS_TOP
    DS_TOP -.-> TS_BOTTOM

    %% Notes
    Note1[pending reads for a specific<br/>offset are tracked to avoid<br/>duplicate reads and removal<br/>of segments still in use]
    Note2[pending stashes use incremental<br/>versions to discard results of<br/>outdated operations]

    Note1 -.-> TieredStorage
    Note2 -.-> TieredStorage

    %% Styling to make landing nodes invisible
    classDef invisible fill:none,stroke:none,color:none,width:0px,height:0px;
```

Consider, for example, two high level `Read` operations for two different keys K1 and K2 residing on the same page.
For K1, we issue a page read from `DiskStorage` tracked by its offset. For K2, if we check and find an active operation fetching that offset, we link the K2 callback to the K1 completion, avoiding duplicate I/O.

Consider issuing a `Read` request for a key (e.g., during `GET`). This triggers a disk read for the corresponding page. If `Delete` is called for the same key (e.g., via `DEL` or `SET` overwriting the key) while the read is in progress, we must be careful. Immediately calling `DiskStorage::MarkAsFree` could allow a subsequent `Stash` to overwrite the page while it's being read. To prevent this race condition, `MarkAsFree` calls are queued until concurrent reads on the affected segment complete.

These problems do not exist for `Stash` operations because they write to newly allocated pages that no other actor references yet.

## API->Ops translation table

Those that require I/O are colored in **bold**.

| API Sequence | I/O Ops Sequence | Explanation |
|---|---|---|
| `SET` (overwrite) | `Delete` | We remove the reference to the blob stashed on disk. No overwrite of existing entry. |
| `GET` | **`Read`**, `Delete` (optional) | Reads the value. Depending on policy, we might then remove the blob from storage and keep it in RAM ("warm up"). |
| `DEL`, `GET` | `Delete` | `DEL` removes the entry. Subsequent `GET` won't find it in TieredStorage. |
| `APPEND` | **`Read`**, `Delete` | Modify not done in place. Read to memory, append, then remove old disk entry. |
| `GET`, `SET` | **`Read`**, `Delete` | `GET` triggers `Read`. `SET` triggers `Delete`. If `Read` is in-flight, `DiskStorage::MarkAsFree` is delayed until `Read` completes to avoid reusing the page prematurely. |
| `SET`, `DEL` | **`TryStash`**, `Delete` | `SET` may be followed by `TryStash` in case we decide to offload an in-memory entry. In case `DEL` is processed when stash is still in flight, `CancelStash()` will be called. Otherwise, `MarkAsFree` will be called to mark the page as available. |


================================================
FILE: docs/cluster-node-health.md
================================================
# Cluster Node Health

**Node health is passive metadata provided by the cluster manager (control plane) via the
`DFLYCLUSTER CONFIG` command.** Dragonfly nodes do not actively determine their own health status;
instead, the cluster orchestrator monitors node states and communicates health information to each
node through the cluster configuration.

Dragonfly supports node health status reporting for cluster configurations, providing
Valkey-compatible behavior for cluster management commands. This feature allows the cluster
manager to track the health state of each node and communicate it to clients through various
cluster commands.

## Overview

The node health feature was introduced in [PR #4758](https://github.com/dragonflydb/dragonfly/pull/4758)
and [PR #4767](https://github.com/dragonflydb/dragonfly/pull/4767) to address
[issue #4741](https://github.com/dragonflydb/dragonfly/issues/4741).

The health status is part of the cluster configuration and can be set for both master and replica
nodes. Different cluster commands use this information to filter or display nodes based on their
health state.

## Health States

Dragonfly supports four health states for cluster nodes:

| State     | Description                                                                               | Visible in Commands |
|-----------|-------------------------------------------------------------------------------------------|---------------------|
| `online`  | Node is fully operational and ready to serve requests                                    | All commands        |
| `loading` | Node is still loading data (e.g., during initial sync or restart)                       | `CLUSTER SHARDS`, `CLUSTER NODES` |
| `fail`    | Node has failed or is unreachable                                                        | `CLUSTER SHARDS`, `CLUSTER NODES` |
| `hidden`  | Replica exists but should not be exposed to clients (internal use by cluster manager)   | Masters: all commands; Replicas: none |

### Default State

When no health status is specified in the configuration, nodes default to the `online` state.

## Configuration

Node health is specified in the cluster configuration JSON that is passed via the
`DFLYCLUSTER CONFIG` command. The health status is set using the `health` field for each node.

### Configuration Format

```json
[
  {
    "slot_ranges": [
      { "start": 0, "end": 16383 }
    ],
    "master": {
      "id": "node-master-1",
      "ip": "10.0.0.1",
      "port": 7000,
      "health": "online"
    },
    "replicas": [
      {
        "id": "node-replica-1",
        "ip": "10.0.0.2",
        "port": 7001,
        "health": "online"
      },
      {
        "id": "node-replica-2",
        "ip": "10.0.0.3",
        "port": 7002,
        "health": "loading"
      },
      {
        "id": "node-replica-3",
        "ip": "10.0.0.4",
        "port": 7003,
        "health": "fail"
      },
      {
        "id": "node-replica-4",
        "ip": "10.0.0.5",
        "port": 7004,
        "health": "hidden"
      }
    ]
  }
]
```

### Setting Configuration

Use the `DFLYCLUSTER CONFIG` command to set the cluster configuration with health information:

```bash
DFLYCLUSTER CONFIG <json_config>
```

The health field is optional and case-insensitive. Valid values are: `online`, `loading`, `fail`,
and `hidden`.

## Command Behavior

Different cluster commands handle node health status in different ways:

### CLUSTER SHARDS

The `CLUSTER SHARDS` command returns detailed information about cluster shards, including the
health status of all nodes except those marked as `hidden`.

**Example:**

```bash
127.0.0.1:6379> CLUSTER SHARDS
1) 1) "slots"
   2) 1) (integer) 0
      2) (integer) 16383
   3) "nodes"
   4) 1) 1) "id"
         2) "node-master-1"
         3) "endpoint"
         4) "10.0.0.1"
         5) "ip"
         6) "10.0.0.1"
         7) "port"
         8) (integer) 7000
         9) "role"
        10) "master"
        11) "replication-offset"
        12) (integer) 0
        13) "health"
        14) "online"
      2) 1) "id"
         2) "node-replica-1"
         3) "endpoint"
         4) "10.0.0.2"
         5) "ip"
         6) "10.0.0.2"
         7) "port"
         8) (integer) 7001
         9) "role"
        10) "replica"
        11) "replication-offset"
        12) (integer) 0
        13) "health"
        14) "online"
      3) 1) "id"
         2) "node-replica-2"
         3) "endpoint"
         4) "10.0.0.3"
         5) "ip"
         6) "10.0.0.3"
         7) "port"
         8) (integer) 7002
         9) "role"
        10) "replica"
        11) "replication-offset"
        12) (integer) 0
        13) "health"
        14) "loading"
      4) 1) "id"
         2) "node-replica-3"
         3) "endpoint"
         4) "10.0.0.4"
         5) "ip"
         6) "10.0.0.4"
         7) "port"
         8) (integer) 7003
         9) "role"
        10) "replica"
        11) "replication-offset"
        12) (integer) 0
        13) "health"
        14) "fail"
```

**Note:** Nodes with `hidden` health status are filtered out and do not appear in the output.

### CLUSTER SLOTS

The `CLUSTER SLOTS` command returns slot distribution information. This command filters out
replicas that are not ready to serve requests.

**Filtering behavior:**
- Includes replicas with `online` health status
- Excludes replicas with `loading`, `fail`, or `hidden` health status

**Example:**

```bash
127.0.0.1:6379> CLUSTER SLOTS
1) 1) (integer) 0
   2) (integer) 16383
   3) 1) "10.0.0.1"
      2) (integer) 7000
      3) "node-master-1"
   4) 1) "10.0.0.2"
      2) (integer) 7001
      3) "node-replica-1"
```

In this example, only the master and the `online` replica (`node-replica-1`) are shown. Replicas
with `loading`, `fail`, or `hidden` status are not included.

### CLUSTER NODES

The `CLUSTER NODES` command returns a list of all cluster nodes in a space-separated format. This
command shows nodes with most health states but excludes `hidden` nodes.

**Connection state mapping:**
- `online` and `loading` nodes: shown as `connected`
- `fail` nodes: shown as `disconnected`
- `hidden` nodes: not shown in output

**Example:**

```bash
127.0.0.1:6379> CLUSTER NODES
node-master-1 10.0.0.1:7000@7000 master - 0 0 0 connected 0-16383
node-replica-1 10.0.0.2:7001@7001 slave node-master-1 0 0 0 connected
node-replica-2 10.0.0.3:7002@7002 slave node-master-1 0 0 0 connected
node-replica-3 10.0.0.4:7003@7003 slave node-master-1 0 0 0 disconnected
```

**Note:**
- `node-replica-1` (online): appears as `connected`
- `node-replica-2` (loading): appears as `connected`
- `node-replica-3` (fail): appears as `disconnected`
- `node-replica-4` (hidden): not shown in output

## Use Cases

### 1. Gradual Node Addition

When adding a new replica to a cluster, you can set its health status to `loading` while it's
syncing data. This allows the cluster manager to track the node but prevents clients from
redirecting read requests to it via `CLUSTER SLOTS`.

### 2. Failed Node Handling

When a node fails or becomes unreachable, the cluster manager can mark it as `fail`. This
provides visibility in `CLUSTER SHARDS` and `CLUSTER NODES` while excluding it from
`CLUSTER SLOTS` responses.

### 3. Internal Replicas

The `hidden` health status is useful for replica nodes that are managed internally by the cluster
orchestrator but should not be visible to external clients. Hidden replicas are filtered out from
all cluster commands (`CLUSTER SHARDS`, `CLUSTER SLOTS`, and `CLUSTER NODES`). Note that masters
marked as `hidden` are still visible in all commands; the filtering only applies to replicas.

### 4. Valkey Compatibility

This feature provides Valkey-compatible behavior for cluster client APIs:
- `CLUSTER SHARDS` returns the health status of replica nodes
- `CLUSTER SLOTS` does not return replicas that have not finished loading

## Implementation Details

For developers interested in the implementation:

1. **Data Structure**: The `NodeHealth` enum is defined in `src/server/cluster/cluster_defs.h`
   with four values: `FAIL`, `LOADING`, `ONLINE`, and `HIDDEN`.

2. **Configuration Parsing**: Health status is parsed from JSON in
   `src/server/cluster/cluster_config.cc` in the `ParseClusterNode` function.

3. **Command Handlers**: The cluster commands in `src/server/cluster/cluster_family.cc` implement
   filtering logic based on health status:
   - `ClusterShards`: Filters out replicas with `HIDDEN` health before calling `ClusterShardsImpl`
     (masters are still included even if marked `HIDDEN`)
   - `ClusterSlotsImpl`: Filters out `HIDDEN`, `FAIL`, and `LOADING` replicas (masters are always
     included)
   - `ClusterNodesImpl`: Filters out replicas with `HIDDEN` health when listing replicas (masters
     with `HIDDEN` health are still included) and maps health to connection state

4. **Default Value**: When not specified in configuration, nodes default to `ONLINE` state as
   defined in `ClusterExtendedNodeInfo`.

## See Also

- [Dragonfly Cluster Mode Documentation](https://www.dragonflydb.io/docs/cluster)
- [CLUSTER SHARDS Command](https://redis.io/commands/cluster-shards/)
- [CLUSTER SLOTS Command](https://redis.io/commands/cluster-slots/)
- [CLUSTER NODES Command](https://redis.io/commands/cluster-nodes/)


================================================
FILE: docs/coordinator.excalidraw
================================================
{
  "type": "excalidraw",
  "version": 2,
  "source": "https://excalidraw.com",
  "elements": [
    {
      "type": "rectangle",
      "version": 498,
      "versionNonce": 987480120,
      "isDeleted": false,
      "id": "jPwIU_a9_nxvuDFAcbzxM",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "dotted",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 712.375,
      "y": 510.2500000000001,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "width": 307,
      "height": 30,
      "seed": 1029717964,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "type": "text",
          "id": "U2-I9a2X4amHnB7NZFWGv"
        },
        {
          "id": "MJoeQ6ylkFi5Z7UCzD-r-",
          "type": "arrow"
        },
        {
          "id": "KpIRIBeGsT3yzCPp6jbEN",
          "type": "arrow"
        },
        {
          "id": "Qnatw_Uix7cMFwAuW1DkJ",
          "type": "arrow"
        },
        {
          "id": "TLS6mZEI7BXyUdiiYHdrg",
          "type": "arrow"
        },
        {
          "id": "h_hyKP8N7nmD1NiZNa3ez",
          "type": "arrow"
        },
        {
          "id": "CrT6zZ8CKm_MSDw-CmcPG",
          "type": "arrow"
        }
      ],
      "updated": 1660733356396,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 389,
      "versionNonce": 1321365816,
      "isDeleted": false,
      "id": "U2-I9a2X4amHnB7NZFWGv",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 717.375,
      "y": 515.2500000000001,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 297,
      "height": 20,
      "seed": 1592449524,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1660733269433,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "coordinator",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "jPwIU_a9_nxvuDFAcbzxM",
      "originalText": "coordinator"
    },
    {
      "type": "rectangle",
      "version": 469,
      "versionNonce": 684925752,
      "isDeleted": false,
      "id": "BY5OdEEKT0Y_DTy9Zgr9C",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 714.375,
      "y": 217.41666666666669,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 77,
      "height": 192,
      "seed": 1621471436,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "MJoeQ6ylkFi5Z7UCzD-r-",
          "type": "arrow"
        },
        {
          "id": "KpIRIBeGsT3yzCPp6jbEN",
          "type": "arrow"
        }
      ],
      "updated": 1660733316757,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 113,
      "versionNonce": 2140069448,
      "isDeleted": false,
      "id": "45U617mr0L9ob4mc7Xozt",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 724.875,
      "y": 171.0865384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 56,
      "height": 40,
      "seed": 1285924468,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1660733195706,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard 1\n",
      "baseline": 34,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard 1\n"
    },
    {
      "type": "text",
      "version": 123,
      "versionNonce": 738921016,
      "isDeleted": false,
      "id": "vY-LnNlhD3qWMEtRPoU0t",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 840.4375,
      "y": 171.0865384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 64,
      "height": 20,
      "seed": 817296972,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1660733195706,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard 2",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard 2"
    },
    {
      "type": "rectangle",
      "version": 499,
      "versionNonce": 1256651064,
      "isDeleted": false,
      "id": "xvkm28eoejETjF3M78jpN",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 943.125,
      "y": 221.875,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 77,
      "height": 187,
      "seed": 1482008524,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "h_hyKP8N7nmD1NiZNa3ez",
          "type": "arrow"
        },
        {
          "id": "CrT6zZ8CKm_MSDw-CmcPG",
          "type": "arrow"
        }
      ],
      "updated": 1660733356396,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 193,
      "versionNonce": 731710264,
      "isDeleted": false,
      "id": "H72xWL9unzb1mQiLvx7L4",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 950.125,
      "y": 176.7115384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 63,
      "height": 20,
      "seed": 1704611020,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1660733195706,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard 3",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard 3"
    },
    {
      "type": "rectangle",
      "version": 547,
      "versionNonce": 1963108408,
      "isDeleted": false,
      "id": "jj-MVcNrzcH0DbFFo9noF",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 833.9375,
      "y": 221.16666666666669,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 77,
      "height": 193,
      "seed": 1374694167,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "Qnatw_Uix7cMFwAuW1DkJ",
          "type": "arrow"
        },
        {
          "id": "TLS6mZEI7BXyUdiiYHdrg",
          "type": "arrow"
        }
      ],
      "updated": 1660733333008,
      "link": null,
      "locked": false
    },
    {
      "id": "MJoeQ6ylkFi5Z7UCzD-r-",
      "type": "arrow",
      "x": 717.875,
      "y": 501.1682692307693,
      "width": 24,
      "height": 87,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 6593352,
      "version": 99,
      "versionNonce": 1021163848,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733308793,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -24,
          -44
        ],
        [
          -3,
          -87
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": -0.8341352911917994,
        "gap": 9.08173076923083
      },
      "endBinding": {
        "elementId": "BY5OdEEKT0Y_DTy9Zgr9C",
        "focus": -0.13122256675640864,
        "gap": 4.751602564102598
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "KpIRIBeGsT3yzCPp6jbEN",
      "type": "arrow",
      "x": 752.875,
      "y": 419.1682692307693,
      "width": 16,
      "height": 90,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1407934264,
      "version": 74,
      "versionNonce": 1205666632,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733316764,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          7,
          42
        ],
        [
          -9,
          90
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "BY5OdEEKT0Y_DTy9Zgr9C",
        "focus": 0.3233993962204972,
        "gap": 9.751602564102598
      },
      "endBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": -0.8035367629216211,
        "gap": 1.0817307692308304
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "Qnatw_Uix7cMFwAuW1DkJ",
      "type": "arrow",
      "x": 837.875,
      "y": 506.1682692307693,
      "width": 7,
      "height": 83,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1927132472,
      "version": 74,
      "versionNonce": 1840565576,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733325799,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          7,
          -83
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": -0.191317746711659,
        "gap": 4.0817307692308304
      },
      "endBinding": {
        "elementId": "jj-MVcNrzcH0DbFFo9noF",
        "focus": 0.4002005378587657,
        "gap": 9.001602564102598
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "TLS6mZEI7BXyUdiiYHdrg",
      "type": "arrow",
      "x": 872.875,
      "y": 423.1682692307693,
      "width": 13,
      "height": 82,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 247434040,
      "version": 76,
      "versionNonce": 1827860040,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733333013,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          9,
          41
        ],
        [
          -4,
          82
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jj-MVcNrzcH0DbFFo9noF",
        "focus": 0.38070164408537926,
        "gap": 9.001602564102598
      },
      "endBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": -0.02127803036140877,
        "gap": 5.0817307692308304
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "h_hyKP8N7nmD1NiZNa3ez",
      "type": "arrow",
      "x": 995.875,
      "y": 418.1682692307693,
      "width": 13,
      "height": 90,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 2138692424,
      "version": 57,
      "versionNonce": 178091592,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733348048,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          12,
          47
        ],
        [
          -1,
          90
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "xvkm28eoejETjF3M78jpN",
        "focus": 0.19231425235177602,
        "gap": 9.293269230769283
      },
      "endBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": 0.7835976013538369,
        "gap": 2.0817307692308304
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "CrT6zZ8CKm_MSDw-CmcPG",
      "type": "arrow",
      "x": 957.875,
      "y": 502.1682692307693,
      "width": 18,
      "height": 91,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1991558200,
      "version": 58,
      "versionNonce": 1980388936,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1660733356402,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -11,
          -39
        ],
        [
          7,
          -91
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jPwIU_a9_nxvuDFAcbzxM",
        "focus": 0.6245467021802061,
        "gap": 8.08173076923083
      },
      "endBinding": {
        "elementId": "xvkm28eoejETjF3M78jpN",
        "focus": -0.23155463939046053,
        "gap": 2.2932692307692832
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    }
  ],
  "appState": {
    "gridSize": null,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
}


================================================
FILE: docs/dashtable.md
================================================

# Dashtable in Dragonfly

Dashtable is a very important data structure in Dragonfly. This document explains
how it fits inside the engine.

Each selectable database holds a primary dashtable that contains all its entries. Another instance of Dashtable holds an optional expiry information, for keys that have TTL expiry on them. Dashtable is equivalent to Redis dictionary but have some wonderful properties that make Dragonfly memory efficient in various situations.

![Database Overview](./db.svg)

## Redis dictionary

*“All problems in computer science can be solved by another level of indirection”*

This section is a brief refresher of how redis dictionary (RD) is implemented.
We shamelessly "borrowed" a diagram from [this blogpost](https://codeburst.io/a-closer-look-at-redis-dictionary-implementation-internals-3fd815aae535), so if you want a deep-dive, you can read the original article.

Each `RD` is in fact two hash-tables (see `ht` field in the diagram below). The second instance is used for incremental resizes of the dictionary.
Each hash-table `dictht` is implemented as a [classic hashtable with separate chaining](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining). `dictEntry` is the link-list entry that wraps each key/value pair inside the table. Each dictEntry has three pointers and takes up 24 bytes of space. The bucket array of `dictht` is resized at powers of two, so usually its utilization is in [50, 100] range.

![RD structure](https://miro.medium.com/max/1400/1*gNc8VzCknWRxXTBP9cVEHQ.png)

<br>

Let's estimate the overhead of `dictht` table inside RD.

*Case 1*: it has `N` items at 100% load factor, in other words, buckets count equals to number of items. Each bucket holds a pointer to dictEntry, i.e. it's 8 bytes. In total we need: $8N + 24N = 32N$ bytes per record. <br>
*Case 2*: `N` items at 75% load factor, in other words, the number of buckets is 1.33 higher than number of items. In total we need: $N\*1.33\*8 + 24N \approx 34N$ bytes per record. <br>
*Case 3*: `N` items at 50% load factor, say right after table growth. Number of buckets is twice the number of items, hence we need $N\*2\*8 + 24N = 40N$ bytes per record.

In best possible case we need at least 16 bytes to store key/value pair into the table, therefore
the overhead of `dictht` is on average about 16-24 bytes per item.

Now lets take incremental growth into account. When `ht[0]` is full (i.e. RD needs to migrate data to a bigger table), it will instantiate a second temporary instance `ht[1]` that will hold additional 2*N buckets. Both instances will live in parallel until all data is migrated to `ht[1]` and then `ht[0]` bucket array will be deleted. All this complexity is hidden from a user by well engineered API of RD. Lets combine case 3 and case 1 to analyze memory spike at this point: `ht[0]` holds `N` items and it is fully utilized. `ht[1]` is allocated with `2N` buckets.
Overall, the memory needed during the spike is $32N + 16N=48N$ bytes.

To summarize, RD requires between **16-32 bytes overhead**.

## Dash table

[Dashtable](https://arxiv.org/abs/2003.07302) is an evolution of an algorithm from 1979 called [extendible hashing](https://en.wikipedia.org/wiki/Extendible_hashing).

Similarly to a classic hashtable, dashtable (DT) also holds an array of pointers at front. However, unlike with classic tables, it points to `segments` and not to linked lists of items. Each `segment` is, in fact, a mini-hashtable of constant size. The front array of pointers to segments is called `directory`. Similarly to a classic table, when an item is inserted into a DT, it first determines the destination segment based on item's hashvalue. The segment is implemented as a hashtable with open-addressed hashing scheme and as I said - constant in size. Once segment is determined, the item inserted into one of its buckets. If an item was successfully inserted, we finished, otherwise, the segment is "full" and needs splitting. The DT splits the contents of a full segment in two segments, and the additional segment is added to the directory. Then it tries to reinsert the item again. To summarize, the classic chaining hash-table is built upon a dynamic array of linked-lists while dashtable is more like a dynamic array of flat hash-tables of constant size.

![Dashtable Diagram](./dashtable.svg)

In the diagram above you can see how dashtable looks like. Each segment is comprised of `K` buckets. For example, in our implementation a dashtable has 60 buckets per segment (it's a compile-time parameter that can be configured).

### Segment zoom-in

Below you can see the diagram of a segment. It comprised of regular buckets and stash buckets. Each bucket has `k` slots and each slot can host a key-value record.

![Segment](./dashsegment.svg)

In our implementation, each segment has 56 regular buckets, 4 stash buckets and each bucket contains 14 slots. Overall, each dashtable segment has capacity to host 840 records. When an item is inserted into a segment, DT first determines its home bucket based on item's hash value. The home bucket is one of 56 regular buckets that reside in the table. Each bucket has 14 available slots and the item can reside in any free slot. If the home bucket is full,
then DT tries to insert to the regular bucket on the right. And if that bucket is also full,
it tries to insert into one of 4 stash buckets. These are kept deliberately aside to gather
spillovers from the regular buckets. The segment is "full" when the insertion fails, i.e. the home bucket and the neighbour bucket and all 4 stash buckets are full. Please note that segment is not necessary at full capacity, it can be that other buckets are not yet full, but unfortunately, that item can go only into these 6 buckets,
so the segment contents must be split. In case of split event, DT creates a new segment,
adds it to the directory and the items from the old segment partly moved to the new one,
 and partly rebalanced within the old one. Only two segments are touched during the split event.

Now we can explain why seemingly similar data-structure has an advantage over a classic hashtable
in terms of memory and cpu.

 1. Memory: we need `~N/840` entries or `8N/840` bytes in dashtable directory to host N items on average.
 Basically, the overhead of directory almost disappears in DT. Say for 1M items we will
 need ~1200 segments or 9600 bytes for the main array. That's in contrast to RD where
 we will need a solid `8N` bucket array overhead - no matter what.
 For 1M items, it will obviously be 8MB. In addition, dash segments use open addressing collision
 scheme with probing, that means that they do not need anything like `dictEntry`.
 Dashtable uses lots of tricks to make its own metadata small. In our implementation,
 the average `tax` per entry is short of 20 bits compared to 64 bits in RD (dictEntry.next).
 In addition, DT incremental resize does not allocate a bigger table - instead
 it adds a single segment per split event. Assuming that key/pair entry is two 8
 byte pointers like in RD, then DT requires $16N + (8N/840) + 2.5N + O(1) \approx 19N$
 bytes at 100% utilization. This number is very close to the optimum of 16 bytes.
 In unlikely case when all segments just doubled in size, i.e.
 DT is at 50% of utilization we may need $38N$ bytes per item.
 In practice, each segment grows independently from others,
 so the table has smooth memory usage of 22-32 bytes per item or **6-16 bytes overhead**.

 1. Speed: RD requires an allocation for dictEntry per insertion and deallocation per deletion. In addition, RD uses chaining, which is cache unfriendly on modern hardware. There is a consensus in engineering and research communities that classic chaining schemes are slower than open addressing alternatives.
 Having said that, DT also needs to go through a single level of indirection when
 fetching a segment pointer. However, DT's directory size is relatively small:
 in the example above, all 9K could resize in L1 cache. Once the segment is determined,
 the rest of the insertion, however, is very fast an mostly operates on 1-3 memory cache lines.
 Finally, during resizes, RD requires to allocate a bucket array of size `2N`.
 That could be time consuming - imagine an allocation of 100M buckets for example.
 DT on the other hand requires an allocation of constant size per new segment. DT is faster
 and what's more important - it's incremental ability is better. It eliminates latency spikes
 and reduces tail latency of the operations above.

Please note that with all efficiency of Dashtable, it can not decrease drastically the
overall memory usage. Its primary goal is to reduce waste around dictionary management.

Having said that, by reducing metadata waste we could insert dragonfly-specific attributes
into a table's metadata in order to implement other intelligent algorithms like forkless save. This is where some of the Dragonfly's disrupting qualities [can be seen](#forkless-save).

## Benchmarks

There are many other improvements in dragonfly that save memory besides DT. I will not be
able to cover them all here. The results below show the final result as of May 2022.

### Populate single-threaded

To compare RD vs DT I often use an internal debugging command "debug populate" that quickly fills both datastores with data. It just saves time and gives more consistent results compared to memtier_benchmark.
It also shows the raw speed at which each dictionary gets filled without intermediary factors like networking, parsing etc.
I deliberately fill datasets with a small data to show how overhead of metadata differs between two data structures.

I run "debug populate 20000000" (20M) on both engines on my home machine "AMD Ryzen 5 3400G with 8 cores".

|             | Dragonfly | Redis 6 |
|-------------|-----------|---------|
| Time        |   10.8s   |  16.0s  |
| Memory used |    1GB    |  1.73G  |

When looking at Redis6 "info memory" stats, you can see that `used_memory_overhead` field equals
to `1.0GB`. That means that out of 1.73GB bytes allocated, a whooping 1.0GB is used for
the metadata. For small data use-cases the cost of metadata in Redis is larger than the data itself.

### Populate multi-threaded

Now I run Dragonfly on all 8 cores. Redis has the same results, of course.

|             | Dragonfly | Redis 6 |
|-------------|-----------|---------|
| Time        |   2.43s   |  16.0s  |
| Memory used |    896MB  |  1.73G  |

Due to shared-nothing architecture, Dragonfly maintains a dashtable per thread with its own slice of data. Each thread fills 1/8th of 20M range it owns - and it much faster, almost 8 times faster. You can see that the total usage is even smaller, because now we maintain
smaller tables in each
thread (it's not always the case though - we could get slightly worse memory usage than with
single-threaded case, depends where we stand compared to hash table utilization).

### Forkless Save

This example shows how much memory Dragonfly uses during BGSAVE under load compared to Redis. Btw, BGSAVE and SAVE in Dragonfly is the same procedure because it's implemented using fully asynchronous algorithm that maintains point-in-time snapshot guarantees.

This test consists of 3 steps:

1. Execute `debug populate 5000000 key 1024` command on both servers to quickly fill them up
   with ~5GB of data.
2. Run `memtier_benchmark --ratio 1:0 -n 600000 --threads=2 -c 20 --distinct-client-seed  --key-prefix="key:"  --hide-histogram  --key-maximum=5000000 -d 1024` command in order to send constant update traffic. This traffic should not affect substantially the memory usage of both servers.
3. Finally, run `bgsave` on both servers while measuring their memory.

It's very hard, technically to measure exact memory usage of Redis during BGSAVE because it creates a child process that shares its parent memory in-part. We chose `cgroupsv2` as a tool to measure the memory. We put each server into a separate cgroup and we sampled `memory.current` attribute for each cgroup. Since a forked Redis process inherits the cgroup of the parent, we get an accurate estimation of their total memory usage. Although we did not need this for Dragonfly we applied the same approach for consistency.

![BGSAVE](./bgsave_memusage.svg)

As you can see on the graph, Redis uses 50% more memory even before BGSAVE starts. Around second 14, BGSAVE kicks off on both servers. Visually you can not see this event on Dragonfly graph, but it's seen very well on Redis graph. It took just few seconds for Dragonfly to finish its snapshot (again, not possible to see) and around second 20 Dragonfly is already behind BGSAVE. You can see a distinguishable cliff at second 39
where Redis finishes its snapshot, reaching almost x3 times more memory usage at peak.

### Expiry of items during writes

Efficient Expiry is very important for many scenarios. See, for example,
[Pelikan paper'21](https://pelikan.io/2021/segcache.html). Twitter team says
that their memory footprint could be reduced by as much as by 60% by employing better expiry methodology. The authors of the post above show pros and cons of expiration methods in the table below:

<img src="https://pelikan.io/assets/img/segcache/expiration.svg" width="400">

They argue that proactive expiration is very important for timely deletion of expired items.
Dragonfly, employs its own intelligent garbage collection procedure. By leveraging DashTable
compartmentalized structure it can actually employ a very efficient passive expiry algorithm with low CPU overhead. Our passive procedure is complimented with proactive gradual scanning of the table in background.

The procedure is a follows:
A dashtable grows when its segment becomes full during the insertion and needs to be split.
This is a convenient point to perform garbage collection, but only for that segment.
We scan its buckets for the expired items. If we delete some of them, we may avoid growing the table altogether! The cost of scanning the segment before potential split is no more the
split itself so can be estimated as `O(1)`.

We use `memtier_benchmark` for the experiment to demonstrate Dragonfly vs Redis expiry efficiency.
We run locally the following command:

```bash
memtier_benchmark --ratio 1:0 -n 600000 --threads=2 -c 20 --distinct-client-seed \
   --key-prefix="key:"  --hide-histogram --expiry-range=30-30 --key-maximum=100000000 -d 256
```

We load larger values (256 bytes) to reduce the impact of metadata savings
of Dragonfly.

|                      | Dragonfly | Redis 6 |
|----------------------|-----------|---------|
| Memory peak usage    | 1.45GB    |  1.95GB |
| Avg SET qps          | 131K      | 100K    |

Please note that Redis could sustain 30% less qps. That means that the optimal working sets for Dragonfly and Redis are different - the former needed to host at least `20s*131k` items
at any point of time and the latter only needed to keep `20s*100K` items.
So for `30%` bigger working set Dragonfly needed `25%` less memory at peak.

<em>*Please ignore the performance advantage of Dragonfly over Redis in this test - it has no meaning.
I run it locally on my machine and it does not represent a real throughput benchmark. </em>

<br>

*All diagrams in this doc are created in [drawio app](https://app.diagrams.net/).*


================================================
FILE: docs/dense_set.excalidraw
================================================
{
  "type": "excalidraw",
  "version": 2,
  "source": "https://excalidraw.com",
  "elements": [
    {
      "id": "LdnS4utc0Co8ZQl0k_99q",
      "type": "rectangle",
      "x": 278.57142857142867,
      "y": 767.857142857143,
      "width": 157,
      "height": 42,
      "angle": 0,
      "strokeColor": "#364fc7",
      "backgroundColor": "#4c6ef5",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 34309611,
      "version": 379,
      "versionNonce": 490192843,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "BV0b6Du7Nu_TpcyHxOq9M"
        }
      ],
      "updated": 1662257477282,
      "link": null,
      "locked": false
    },
    {
      "id": "6iemTDX54UBvWAow6YZUm",
      "type": "ellipse",
      "x": 785.5714285714287,
      "y": 670.857142857143,
      "width": 151,
      "height": 65,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 2110505739,
      "version": 615,
      "versionNonce": 1697849797,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "CsENpV2URO6_T9J1e_EWv"
        },
        {
          "id": "h4EkHYMe6b4cxIpFk2aJ1",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "id": "CsENpV2URO6_T9J1e_EWv",
      "type": "text",
      "x": 790.5714285714287,
      "y": 689.357142857143,
      "width": 141,
      "height": 28,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 1128884549,
      "version": 556,
      "versionNonce": 341608715,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "\"abcd...\"",
      "fontSize": 20,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 19,
      "containerId": "6iemTDX54UBvWAow6YZUm",
      "originalText": "\"abcd...\""
    },
    {
      "id": "wIo5IjqjKx5agDWM2U6y9",
      "type": "arrow",
      "x": 436.80362915161936,
      "y": 789.7627222797395,
      "width": 81.53559883961861,
      "height": 0.030478424363479917,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1941199909,
      "version": 1319,
      "versionNonce": 319409605,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477403,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          81.53559883961861,
          0.030478424363479917
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "LdnS4utc0Co8ZQl0k_99q",
        "focus": 0.041645385141281355,
        "gap": 1.2322005801906926
      },
      "endBinding": {
        "elementId": "9mWjCy5sUe-mID6u6k7Ll",
        "focus": -0.08136851610313917,
        "gap": 1.2322005801906926
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "type": "ellipse",
      "version": 1317,
      "versionNonce": 365900933,
      "isDeleted": false,
      "id": "tbWakWx-QT3DCK-_FZhx-",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1687.5714285714287,
      "y": 681.857142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 151,
      "height": 65,
      "seed": 429183979,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "RqWMMUkOMQWtnqqIo_0RK",
          "type": "text"
        },
        {
          "id": "ZD_EGEh1PSlEhdhmPUGm3",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "RqWMMUkOMQWtnqqIo_0RK"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 1258,
      "versionNonce": 1498415691,
      "isDeleted": false,
      "id": "RqWMMUkOMQWtnqqIo_0RK",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1692.5714285714287,
      "y": 700.357142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 141,
      "height": 28,
      "seed": 365098053,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "\"abcd...\"",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "tbWakWx-QT3DCK-_FZhx-",
      "originalText": "\"abcd...\""
    },
    {
      "type": "arrow",
      "version": 3623,
      "versionNonce": 1786105125,
      "isDeleted": false,
      "id": "ZD_EGEh1PSlEhdhmPUGm3",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1593.5714285714287,
      "y": 767.857142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 98.02649903619977,
      "height": 41.55714530998347,
      "seed": 1874017893,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1662257477405,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "RyzbgdtiyAgDl_Gg-xKD6",
        "focus": 0.44304364520670675,
        "gap": 2
      },
      "endBinding": {
        "elementId": "tbWakWx-QT3DCK-_FZhx-",
        "focus": 0.6558676754700489,
        "gap": 1
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          57.5,
          -9.5
        ],
        [
          98.02649903619977,
          -41.55714530998347
        ]
      ]
    },
    {
      "type": "ellipse",
      "version": 1594,
      "versionNonce": 722835269,
      "isDeleted": false,
      "id": "hls1kkVvTEbIVUoHV9YjB",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1688.0714285714287,
      "y": 848.357142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 151,
      "height": 65,
      "seed": 464754437,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "7OALlCUSo8C4wRunATj7i",
          "type": "text"
        },
        {
          "id": "ZD_EGEh1PSlEhdhmPUGm3",
          "type": "arrow"
        },
        {
          "id": "7OALlCUSo8C4wRunATj7i",
          "type": "text"
        },
        {
          "type": "text",
          "id": "7OALlCUSo8C4wRunATj7i"
        },
        {
          "id": "PtndVbqi061kx-2QVmX9B",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 1533,
      "versionNonce": 1814440843,
      "isDeleted": false,
      "id": "7OALlCUSo8C4wRunATj7i",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1693.0714285714287,
      "y": 866.857142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 141,
      "height": 28,
      "seed": 1547241419,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "\"abcd...\"",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "hls1kkVvTEbIVUoHV9YjB",
      "originalText": "\"abcd...\""
    },
    {
      "id": "PtndVbqi061kx-2QVmX9B",
      "type": "arrow",
      "x": 1595.5714285714287,
      "y": 818.857142857143,
      "width": 128.1422939788249,
      "height": 32.825682301479674,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 310414827,
      "version": 1513,
      "versionNonce": 652927109,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477406,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          49.5,
          -8
        ],
        [
          128.1422939788249,
          24.825682301479674
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "LUhivcEGaeW_fHoMkT5PY",
        "focus": 0.2744377811094453,
        "gap": 5
      },
      "endBinding": {
        "elementId": "hls1kkVvTEbIVUoHV9YjB",
        "focus": 0.453665660258198,
        "gap": 9.270374749825422
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "type": "rectangle",
      "version": 539,
      "versionNonce": 447891973,
      "isDeleted": false,
      "id": "BmVwp90EOf01pxoCqayka",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 779.0714285714287,
      "y": 273.8571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 157,
      "height": 42,
      "seed": 817120651,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "7-os26TSlkxMhDb-ALHK8",
          "type": "text"
        },
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "7-os26TSlkxMhDb-ALHK8"
        },
        {
          "id": "2-4BatkaFqKxOF9ikfE9M",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 496,
      "versionNonce": 1475544267,
      "isDeleted": false,
      "id": "7-os26TSlkxMhDb-ALHK8",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 784.0714285714287,
      "y": 280.8571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 398781605,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DensePtr",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "BmVwp90EOf01pxoCqayka",
      "originalText": "DensePtr"
    },
    {
      "id": "BV0b6Du7Nu_TpcyHxOq9M",
      "type": "text",
      "x": 283.57142857142867,
      "y": 774.857142857143,
      "width": 147,
      "height": 28,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 84057963,
      "version": 301,
      "versionNonce": 1123890789,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "DenseLinkKey",
      "fontSize": 20,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 19,
      "containerId": "LdnS4utc0Co8ZQl0k_99q",
      "originalText": "DenseLinkKey"
    },
    {
      "type": "ellipse",
      "version": 1392,
      "versionNonce": 208548715,
      "isDeleted": false,
      "id": "oUfTPCoNOMOVUScypl9ov",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1038.0714285714287,
      "y": 262.3571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 151,
      "height": 65,
      "seed": 1274116613,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "_ZWQLsSXL62nm9Vxybs_T",
          "type": "text"
        },
        {
          "id": "ZD_EGEh1PSlEhdhmPUGm3",
          "type": "arrow"
        },
        {
          "id": "_ZWQLsSXL62nm9Vxybs_T",
          "type": "text"
        },
        {
          "id": "_ZWQLsSXL62nm9Vxybs_T",
          "type": "text"
        },
        {
          "id": "PtndVbqi061kx-2QVmX9B",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "_ZWQLsSXL62nm9Vxybs_T"
        },
        {
          "id": "2-4BatkaFqKxOF9ikfE9M",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 1329,
      "versionNonce": 1599163589,
      "isDeleted": false,
      "id": "_ZWQLsSXL62nm9Vxybs_T",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1043.0714285714287,
      "y": 280.8571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 141,
      "height": 28,
      "seed": 1098831051,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "\"abcd...\"",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "oUfTPCoNOMOVUScypl9ov",
      "originalText": "\"abcd...\""
    },
    {
      "id": "2-4BatkaFqKxOF9ikfE9M",
      "type": "arrow",
      "x": 937.5714285714287,
      "y": 296.8571428571429,
      "width": 97,
      "height": 2,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 2010023531,
      "version": 543,
      "versionNonce": 1848018917,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477407,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          41,
          1
        ],
        [
          97,
          -1
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "BmVwp90EOf01pxoCqayka",
        "focus": 0.0021287919105907396,
        "gap": 1.5
      },
      "endBinding": {
        "elementId": "oUfTPCoNOMOVUScypl9ov",
        "focus": 0.05585205610314286,
        "gap": 3.5285491921035828
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "type": "rectangle",
      "version": 751,
      "versionNonce": 235050155,
      "isDeleted": false,
      "id": "Suj1TA3n75lniv8ZthhOy",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 780.5714285714287,
      "y": 481.857142857143,
      "strokeColor": "#2b8a3e",
      "backgroundColor": "#12b886",
      "width": 157,
      "height": 42,
      "seed": 1337311947,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "5l8sQoeycml7y43c3H6j4",
          "type": "text"
        },
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "id": "5l8sQoeycml7y43c3H6j4",
          "type": "text"
        },
        {
          "id": "XNzXS4nhlngVv4LqrpGWH",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "5l8sQoeycml7y43c3H6j4"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 707,
      "versionNonce": 949919621,
      "isDeleted": false,
      "id": "5l8sQoeycml7y43c3H6j4",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 785.5714285714287,
      "y": 488.8571428571431,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 26534757,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DensePtr",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "Suj1TA3n75lniv8ZthhOy",
      "originalText": "DensePtr"
    },
    {
      "type": "ellipse",
      "version": 1601,
      "versionNonce": 822765285,
      "isDeleted": false,
      "id": "e0Z3-_Eg_DtzWKAJ00uZx",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1039.5714285714287,
      "y": 470.357142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 151,
      "height": 65,
      "seed": 959000939,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "_Bita6RwDhub4HiG4-vAe",
          "type": "text"
        },
        {
          "id": "ZD_EGEh1PSlEhdhmPUGm3",
          "type": "arrow"
        },
        {
          "id": "_Bita6RwDhub4HiG4-vAe",
          "type": "text"
        },
        {
          "id": "_Bita6RwDhub4HiG4-vAe",
          "type": "text"
        },
        {
          "id": "PtndVbqi061kx-2QVmX9B",
          "type": "arrow"
        },
        {
          "id": "_Bita6RwDhub4HiG4-vAe",
          "type": "text"
        },
        {
          "id": "XNzXS4nhlngVv4LqrpGWH",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "_Bita6RwDhub4HiG4-vAe"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 1537,
      "versionNonce": 140486123,
      "isDeleted": false,
      "id": "_Bita6RwDhub4HiG4-vAe",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1044.5714285714287,
      "y": 488.8571428571431,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 141,
      "height": 28,
      "seed": 776810181,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "\"abcd...\"",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "e0Z3-_Eg_DtzWKAJ00uZx",
      "originalText": "\"abcd...\""
    },
    {
      "type": "arrow",
      "version": 1219,
      "versionNonce": 1379287877,
      "isDeleted": false,
      "id": "XNzXS4nhlngVv4LqrpGWH",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 939.0714285714287,
      "y": 504.8571428571431,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 97,
      "height": 2,
      "seed": 192269323,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1662257477408,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "Suj1TA3n75lniv8ZthhOy",
        "focus": 0.002128791910595701,
        "gap": 1.5
      },
      "endBinding": {
        "elementId": "e0Z3-_Eg_DtzWKAJ00uZx",
        "focus": 0.055852056103139376,
        "gap": 3.5285491921035685
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          41,
          1
        ],
        [
          97,
          -1
        ]
      ]
    },
    {
      "id": "RGj3Y6CtyijvehUeHVywF",
      "type": "text",
      "x": 749.5714285714287,
      "y": 560.857142857143,
      "width": 474,
      "height": 46,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 130163083,
      "version": 363,
      "versionNonce": 1546629541,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "Chain With Multiple Entries",
      "fontSize": 36,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "top",
      "baseline": 32,
      "containerId": null,
      "originalText": "Chain With Multiple Entries"
    },
    {
      "type": "rectangle",
      "version": 696,
      "versionNonce": 85550891,
      "isDeleted": false,
      "id": "1S3pVzBUuYFr-RbDX1FXv",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 536.0714285714287,
      "y": 747.857142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 157,
      "height": 42,
      "seed": 715872619,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "type": "text",
          "id": "P0w2r72h8lTF4KC0S8iK0"
        },
        {
          "id": "h4EkHYMe6b4cxIpFk2aJ1",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "rectangle",
      "version": 695,
      "versionNonce": 1599687115,
      "isDeleted": false,
      "id": "CdJtzp6w0n0rveWC1BWuQ",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 535.0714285714287,
      "y": 804.857142857143,
      "strokeColor": "#364fc7",
      "backgroundColor": "#4c6ef5",
      "width": 157,
      "height": 42,
      "seed": 582081413,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "type": "text",
          "id": "LFc4k25ArlLXtoygEGUU6"
        },
        {
          "id": "iKBu85WHY4IL_69QEvdyT",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "id": "9mWjCy5sUe-mID6u6k7Ll",
      "type": "rectangle",
      "x": 519.5714285714287,
      "y": 703.857142857143,
      "width": 188,
      "height": 159,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "seed": 1146093355,
      "version": 582,
      "versionNonce": 952359019,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "id": "5d6SPvHw2keIDl-5kNmEb",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "2HOa22It8IfsktBdjpTwo"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "id": "LFc4k25ArlLXtoygEGUU6",
      "type": "text",
      "x": 540.0714285714287,
      "y": 811.857142857143,
      "width": 147,
      "height": 28,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "seed": 1252566219,
      "version": 555,
      "versionNonce": 790860555,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "DenseLinkKey",
      "fontSize": 20,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 19,
      "containerId": "CdJtzp6w0n0rveWC1BWuQ",
      "originalText": "DenseLinkKey"
    },
    {
      "id": "P0w2r72h8lTF4KC0S8iK0",
      "type": "text",
      "x": 541.0714285714287,
      "y": 754.857142857143,
      "width": 147,
      "height": 28,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "seed": 1221832197,
      "version": 556,
      "versionNonce": 1960523557,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "DensePtr",
      "fontSize": 20,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 19,
      "containerId": "1S3pVzBUuYFr-RbDX1FXv",
      "originalText": "DensePtr"
    },
    {
      "id": "2HOa22It8IfsktBdjpTwo",
      "type": "text",
      "x": 524.5714285714287,
      "y": 708.857142857143,
      "width": 178,
      "height": 28,
      "angle": 0,
      "strokeColor": "#a61e4d",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [
        "kaOwSxozJCF4g6QcHxA1q"
      ],
      "strokeSharpness": "sharp",
      "seed": 971803051,
      "version": 325,
      "versionNonce": 1123219883,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "text": "DenseLinkKey",
      "fontSize": 20,
      "fontFamily": 1,
      "textAlign": "center",
      "verticalAlign": "top",
      "baseline": 19,
      "containerId": "9mWjCy5sUe-mID6u6k7Ll",
      "originalText": "DenseLinkKey"
    },
    {
      "type": "rectangle",
      "version": 877,
      "versionNonce": 1427311237,
      "isDeleted": false,
      "id": "RyzbgdtiyAgDl_Gg-xKD6",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1434.5714285714287,
      "y": 745.107142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 157,
      "height": 42,
      "seed": 1200981765,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "K8fVzXRPoMTnOm4BIQdpC",
          "type": "text"
        },
        {
          "type": "text",
          "id": "K8fVzXRPoMTnOm4BIQdpC"
        },
        {
          "id": "ZD_EGEh1PSlEhdhmPUGm3",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "rectangle",
      "version": 876,
      "versionNonce": 1849346795,
      "isDeleted": false,
      "id": "LUhivcEGaeW_fHoMkT5PY",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1433.5714285714287,
      "y": 802.107142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 157,
      "height": 42,
      "seed": 1269700555,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "d68fKPrkXvutq5CIgpKu0",
          "type": "text"
        },
        {
          "type": "text",
          "id": "d68fKPrkXvutq5CIgpKu0"
        },
        {
          "id": "PtndVbqi061kx-2QVmX9B",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "rectangle",
      "version": 767,
      "versionNonce": 114818213,
      "isDeleted": false,
      "id": "nqXx_jG0SMox2AHT2L5F2",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1418.0714285714287,
      "y": 701.107142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 188,
      "height": 159,
      "seed": 532176485,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "id": "5d6SPvHw2keIDl-5kNmEb",
          "type": "arrow"
        },
        {
          "id": "P46vozsH8hY3lX5pMtPk8",
          "type": "text"
        },
        {
          "type": "text",
          "id": "P46vozsH8hY3lX5pMtPk8"
        },
        {
          "id": "CVx1AqnNI76hVX9-ObrtA",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 745,
      "versionNonce": 1071377733,
      "isDeleted": false,
      "id": "d68fKPrkXvutq5CIgpKu0",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1438.5714285714287,
      "y": 809.107142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 491710059,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DensePtr",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "LUhivcEGaeW_fHoMkT5PY",
      "originalText": "DensePtr"
    },
    {
      "type": "text",
      "version": 736,
      "versionNonce": 22842443,
      "isDeleted": false,
      "id": "K8fVzXRPoMTnOm4BIQdpC",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1439.5714285714287,
      "y": 752.107142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 1346980293,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DensePtr",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "RyzbgdtiyAgDl_Gg-xKD6",
      "originalText": "DensePtr"
    },
    {
      "type": "text",
      "version": 505,
      "versionNonce": 1882147883,
      "isDeleted": false,
      "id": "P46vozsH8hY3lX5pMtPk8",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1423.0714285714287,
      "y": 706.107142857143,
      "strokeColor": "#a61e4d",
      "backgroundColor": "#12b886",
      "width": 178,
      "height": 28,
      "seed": 723312907,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DenseLinkKey",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": "nqXx_jG0SMox2AHT2L5F2",
      "originalText": "DenseLinkKey"
    },
    {
      "id": "h4EkHYMe6b4cxIpFk2aJ1",
      "type": "arrow",
      "x": 698.5714285714287,
      "y": 769.857142857143,
      "width": 108,
      "height": 43,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 313455205,
      "version": 995,
      "versionNonce": 1199018661,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1662257477410,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          67,
          -2.5
        ],
        [
          108,
          -43
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "1S3pVzBUuYFr-RbDX1FXv",
        "focus": 0.17277405270544205,
        "gap": 5.5
      },
      "endBinding": {
        "elementId": "6iemTDX54UBvWAow6YZUm",
        "focus": 0.37288545736724105,
        "gap": 1
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "type": "text",
      "version": 735,
      "versionNonce": 1684199269,
      "isDeleted": false,
      "id": "C55jJitM19fp12H5lRwCI",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 798.5714285714287,
      "y": 170.8571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "width": 374,
      "height": 46,
      "seed": 22775717,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 36,
      "fontFamily": 1,
      "text": "Chain With One Entry",
      "baseline": 32,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Chain With One Entry"
    },
    {
      "type": "text",
      "version": 957,
      "versionNonce": 694451563,
      "isDeleted": false,
      "id": "hGFgRua4wpyTtp4D694Ud",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 726.5714285714287,
      "y": 392.8571428571429,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "width": 518,
      "height": 46,
      "seed": 840351749,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 36,
      "fontFamily": 1,
      "text": "Chain With a Displaced Entry",
      "baseline": 32,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Chain With a Displaced Entry"
    },
    {
      "type": "ellipse",
      "version": 766,
      "versionNonce": 1799135941,
      "isDeleted": false,
      "id": "LGXZp6X5oRRKg9gzSJIcd",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1229.8214285714287,
      "y": 669.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 151,
      "height": 65,
      "seed": 1902833605,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "uxrkX0MrXRysMixOCLf86",
          "type": "text"
        },
        {
          "id": "sGH1mRBDDfdaZOORzbU1h",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "uxrkX0MrXRysMixOCLf86"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 706,
      "versionNonce": 1852282891,
      "isDeleted": false,
      "id": "uxrkX0MrXRysMixOCLf86",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1234.8214285714287,
      "y": 688.482142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 141,
      "height": 28,
      "seed": 1657838347,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "\"abcd...\"",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "LGXZp6X5oRRKg9gzSJIcd",
      "originalText": "\"abcd...\""
    },
    {
      "type": "rectangle",
      "version": 848,
      "versionNonce": 1701732011,
      "isDeleted": false,
      "id": "6SF7SEj50JLrJxpeJopdp",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 980.3214285714287,
      "y": 746.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 157,
      "height": 42,
      "seed": 368070501,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "rmScJhxvevICKMmx6PYQF",
          "type": "text"
        },
        {
          "id": "sGH1mRBDDfdaZOORzbU1h",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "rmScJhxvevICKMmx6PYQF"
        },
        {
          "id": "iKBu85WHY4IL_69QEvdyT",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "rectangle",
      "version": 844,
      "versionNonce": 1763922251,
      "isDeleted": false,
      "id": "Nz45mnUTSGpaOgsVNIEr-",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 979.3214285714287,
      "y": 803.982142857143,
      "strokeColor": "#364fc7",
      "backgroundColor": "#4c6ef5",
      "width": 157,
      "height": 42,
      "seed": 998933867,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "dkP-6jzTOX9bdt8GCvOJw",
          "type": "text"
        },
        {
          "id": "CVx1AqnNI76hVX9-ObrtA",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "dkP-6jzTOX9bdt8GCvOJw"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "rectangle",
      "version": 734,
      "versionNonce": 450949099,
      "isDeleted": false,
      "id": "QvUMauaFoUm7amxqdJy2z",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 963.8214285714287,
      "y": 702.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 188,
      "height": 159,
      "seed": 2086345413,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "wIo5IjqjKx5agDWM2U6y9",
          "type": "arrow"
        },
        {
          "id": "5d6SPvHw2keIDl-5kNmEb",
          "type": "arrow"
        },
        {
          "id": "sEfRctJpRk7foZK9c0IAH",
          "type": "text"
        },
        {
          "type": "text",
          "id": "sEfRctJpRk7foZK9c0IAH"
        },
        {
          "id": "iKBu85WHY4IL_69QEvdyT",
          "type": "arrow"
        }
      ],
      "updated": 1662257477283,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 705,
      "versionNonce": 212974219,
      "isDeleted": false,
      "id": "dkP-6jzTOX9bdt8GCvOJw",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 984.3214285714287,
      "y": 810.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 1586274315,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DenseLinkKey",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "Nz45mnUTSGpaOgsVNIEr-",
      "originalText": "DenseLinkKey"
    },
    {
      "type": "text",
      "version": 706,
      "versionNonce": 474302373,
      "isDeleted": false,
      "id": "rmScJhxvevICKMmx6PYQF",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 985.3214285714287,
      "y": 753.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 147,
      "height": 28,
      "seed": 370506277,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DensePtr",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "6SF7SEj50JLrJxpeJopdp",
      "originalText": "DensePtr"
    },
    {
      "type": "text",
      "version": 475,
      "versionNonce": 726675755,
      "isDeleted": false,
      "id": "sEfRctJpRk7foZK9c0IAH",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 968.8214285714287,
      "y": 707.982142857143,
      "strokeColor": "#a61e4d",
      "backgroundColor": "#12b886",
      "width": 178,
      "height": 28,
      "seed": 464676523,
      "groupIds": [
        "7648kMiz63bJLV7GO8sve"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1662257477283,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "DenseLinkKey",
      "baseline": 19,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": "QvUMauaFoUm7amxqdJy2z",
      "originalText": "DenseLinkKey"
    },
    {
      "type": "arrow",
      "version": 1465,
      "versionNonce": 1519908357,
      "isDeleted": false,
      "id": "sGH1mRBDDfdaZOORzbU1h",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1142.8214285714287,
      "y": 768.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 108,
      "height": 43,
      "seed": 1144014277,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1662257477411,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "6SF7SEj50JLrJxpeJopdp",
        "focus": 0.17277405270544205,
        "gap": 5.5
      },
      "endBinding": {
        "elementId": "LGXZp6X5oRRKg9gzSJIcd",
        "focus": 0.37288545736724105,
        "gap": 1
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          67,
          -2.5
        ],
        [
          108,
          -43
        ]
      ]
    },
    {
      "type": "arrow",
      "version": 1414,
      "versionNonce": 647294309,
      "isDeleted": false,
      "id": "CVx1AqnNI76hVX9-ObrtA",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1141.8214285714287,
      "y": 826.982142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 272,
      "height": 42,
      "seed": 171156747,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1662257477411,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "Nz45mnUTSGpaOgsVNIEr-",
        "focus": 0.5583475858439679,
        "gap": 5.5
      },
      "endBinding": {
        "elementId": "nqXx_jG0SMox2AHT2L5F2",
        "focus": 0.06888696200536025,
        "gap": 4.25
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          95,
          -23
        ],
        [
          272,
          -42
        ]
      ]
    },
    {
      "type": "arrow",
      "version": 1469,
      "versionNonce": 180091077,
      "isDeleted": false,
      "id": "iKBu85WHY4IL_69QEvdyT",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 693.5714285714287,
      "y": 826.857142857143,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 269,
      "height": 43,
      "seed": 1191246795,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1662257477412,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "CdJtzp6w0n0rveWC1BWuQ",
        "focus": 0.5091435337455598,
        "gap": 1.5
      },
      "endBinding": {
        "elementId": "QvUMauaFoUm7amxqdJy2z",
        "focus": 0.10601094635015593,
        "gap": 1.25
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          95,
          -23
        ],
        [
          269,
          -43
        ]
      ]
    }
  ],
  "appState": {
    "gridSize": null,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
}

================================================
FILE: docs/dense_set.md
================================================
# DenseSet in Dragonfly

`DenseSet` uses [classic hashtable with separate chaining](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining) similar to the Redis dictionary for lookup of items within the set.

The main optimization present in `DenseSet` is the ability for a pointer to **point to either an object or a link key**, removing the need to allocate a set entry for every entry. This is accomplished by using [pointer tagging](https://en.wikipedia.org/wiki/Tagged_pointer) exploiting the fact that the top 12 bits of any userspace address are not used and can be set to indicate if the current pointer points to nothing, a link key, or an object.

The following is what each bit in a pointer is used for

| Bit Index (from LSB) | Meaning |
| -------------------- |-------- |
|       0 - 52         | Memory address of data in the userspace |
|          53          | Indicates if this `DensePtr` points to data stored in the `DenseSet` or the next link in a chain |
|          54          | Displacement bit. Indicates if the current entry is in the correct list defined by the data's hash |
|          55          | Direction displaced, this only has meaning if the Displacement bit is set. 0 indicates the entry is to the left of its correct list, 1 indicates it is to the right of the correct list. |
|       56 - 63        | Unused |

Further, to reduce collisions items may be inserted into neighbors of the home chain (the chain determined by the hash) that are empty to reduce the number of unused spaces. These entries are then marked as displaced using pointer tagging.

An example of possible bucket configurations can be seen below.

![Dense Set Visualization](./dense_set.svg) *Created using [excalidraw](https://excalidraw.com)*

### Insertion
To insert an entry a `DenseSet` will take the following steps:

1. Check if the entry already exists in the set, if so return false
2. If the entry does not exist look for an empty chain at the hash index ± 1, prioritizing the home chain. If an empty entry is found the item will be inserted and return true
3. If step 2 fails and the growth prerequisites are met, increase the number of buckets in the table and repeat step 2
4. If step 3 fails, attempt to insert the entry in the home chain.
    - If the home chain is not occupied by a displaced entry insert the new entry in the front of the list
    - If the home chain is occupied by a displaced entry move the displaced entry to its home chain. This may cause a domino effect if the home chain of the displaced entry is occupied by a second displaced entry, resulting in up to `O(N)` "fixes"

### Searching
To find an entry in a `DenseSet`:

1. Check the first entry in the home and neighbour cells for matching entries
2. If step 1 fails iterate the home chain of the searched entry and check for equality

### Pending Improvements
Some further improvements to `DenseSet` include allowing entries to be inserted in their home chain without having to perform the current `O(N)` steps to fix displaced entries. By inserting an entry in their home chain after the displaced entry instead of fixing up displaced entries, searching incurs minimal added overhead and there is no domino effect in inserting a new entry. To move a displaced entry to its home chain eventually multiple heuristics may be implemented including:

- When an entry is erased if the chain becomes empty and there is a displaced entry in the neighbor chains move it to the now empty home chain
- If a displaced entry is found as a result of a search and is the root of a chain with multiple entries, the displaced node should be moved to its home bucket


## Benchmarks

At 100% utilization the Redis dictionary implementation uses approximately 32 bytes per record ([read the breakdown for more information](./dashtable.md#redis-dictionary))

In comparison using the neighbour cell optimization, `DenseSet` has ~21% of spaces unused at full utilization resulting in $N\*8 + 0.2\*16N \approx 11.2N$ or ~12 bytes per record, yielding ~20 byte savings. The number of bytes per record saved grows as utilization decreases.

Command `memtier_benchmark -p 6379 --command "sadd __key__ __data__"   -n 10000000 --threads=1 -c 1 --command-key-pattern=R   --data-size=10     --key-prefix="key:"  --hide-histogram --random-data --key-maximum=1 --randomize --pipeline 20`
produces two sets entries with lots of small records in them.

This is how memory usage looks like with DenseSet:

| Server                | Memory (RSS) |
|:---------------------:|:------:      |
| Dragonfly/DenseSet    |  323MB 🟩    |
| Redis                 |  586MB       |
| Dragonfly/RedisDict   |  663MB       |


================================================
FILE: docs/df-share-nothing.md
================================================
# Dragonfly Architecture

Dragonfly is a modern replacement for memory stores like Redis and Memcached. It scales vertically on a single instance to support millions of requests per second. It is more memory efficient, has been designed with reliability in mind, and includes a better caching design.

## Threading model

Dragonfly uses a single process with a multiple-thread architecture. Each Dragonfly thread is indirectly assigned several responsibilities via fibers.

One such responsibility is handling incoming connections. Once a socket listener accepts a client connection, the connection spends its entire lifetime bound to a single thread inside a fiber. Dragonfly is written to be 100% non-blocking; it uses fibers to provide asynchronicity in each thread. One of the essential properties of asynchronicity is that a thread cannot be blocked as long as it has pending CPU tasks. Dragonfly preserves this property by wrapping each unit of execution context in a fiber; we wrap units of execution that can potentially be blocked on I/O. For example, a connection loop runs within a fiber; a function that writes a snapshot runs inside a fiber, and so on.

As a side comment - asynchronicity and parallelism are different terms. Nodejs, for example, provides asynchronous execution but is single-threaded. Similarly, each Dragonfly thread is asynchronous on its own; therefore, Dragonfly is responsive to incoming events even when it handles long-running commands like saving to disk or running Lua scripts.


### Thread actors in DF

The DF in-memory database is sharded into `N` parts, where `N` is less or equal to the number of threads in the system. Each database shard is owned and accessed by a single thread.
The same thread can handle TCP connections and simultaneously host a database shard.
See the diagram below.


<br>
<img src="http://static.dragonflydb.io/repo-assets/thread-per-core.svg" border="0"/>

Here, our DF process spawns 4 threads, where threads 1 through 3 handle I/O (i.e., manage client connections) and threads 2 through 4 manage DB shards. Thread 2, for example, divides its CPU time between handling incoming requests and processing DB operations on the shard it owns.

So when we say that thread 1 is an I/O thread, we mean that Dragonfly can pin fibers that manage client connections to thread 1. In general, any thread can have many responsibilities that require CPU time; database management and connection handling are only two of those responsibilities.


## Fibers

I suggest reading my [intro post](https://www.romange.com/2018/12/15/introduction-to-fibers-in-c-/) about `Boost.Fibers` to learn more about fibers.

By the way, I want to compliment `Boost.Fibers` library–it has been exceptionally well designed:
it's unintrusive, lightweight, and efficient. Moreover, its default scheduler can be overridden. In the case of `helio`, the I/O library that powers Dragonfly, we overrode the `Boost.Fibers` scheduler to support shared-nothing architecture and integrate it with the I/O polling loop.

Importantly, fibers require bottom-up support in the application layer to preserve their asynchronicity. For example, in the snippet below, a blocking write into `fd` won't magically allow a fiber to preempt and switch to another fiber. No, the whole thread will be blocked.


```cpp
...
write(fd, buf, 1000000);

...
pthread_mutex_lock(...);

```

Similarly, with a `pthread_mutex_lock` call, the whole thread might be blocked, wasting precious CPU time.. Therefore, the Dragonfly code uses *fiber-friendly* primitives for I/O, communication, and coordination. These primitives are supplied by the `helio` and `Boost.Fibers` libraries.

## Life of a command request

This section explains how Dragonfly handles a command in the context of shared-nothing architecture. In most architectures used today, multi-threaded servers use mutex locks to protect their data structures, but Dragonfly does not. Why is this?

Inter-thread interactions in Dragonfly occur only via passing messages from thread to thread. For example, consider the following sequence diagram of handling a SET request:


```uml
@startuml

actor       User       as A1
boundary    connection  as B1
entity      "Shard K"   as E1
A1 ->  B1 : SET KEY VAL
B1 -> E1 : SET KEY VAL / k = HASH(KEY) % N
E1 -> B1 : OK
B1 -> A1 : Response

@enduml
```

<img src="https://www.plantuml.com/plantuml/svg/NOn12m8X48Nl_eh7Gb272Az1WGl2Wb6G5NGqLsW9PaBjqBzlL-lId6Q-zxvnFdD4dNCAlzKbA2bk_ABUnJS0U2OAFWzC9Msb29I7N3AWiNSNUvYckbeA9R7SOknX3QjFCFgAYzg9jd3zXx720njqodRp4IqmmrxegLe_7CnNLDDr3Ed9bC87"/>

Here, a connection fiber resides in a thread different from one that handles the `KEY` entity. We use hashing to decide which shard owns which key.

Another way to think of this flow is that a connection fiber serves as a coordinator for issuing transactional commands to other threads. In this simple example, the external "SET" command requires a single message passed from the coordinator to the destination shard thread. When we think of the Dragonfly model in the context of a single command request, I prefer to use the following diagram instead of the [one above](#thread-actors-in-df).

<br>
<img src="http://static.dragonflydb.io/repo-assets/coordinator.svg" border="0"/>

Here, a coordinator (or connection fiber) might even reside on one of the threads that coincidently owns one of the shards. However, it is easier to think of it as a separate entity that never directly accesses any shard data.

The coordinator serves as a virtualization layer that hides all the complexity of talking to multiple shards. It employs start-of-the-art algorithms to provide atomicity (and strict serializability) semantics for multi-key commands like "mset, mget, and blpop." It also offers strict serializability for Lua scripts and multi-command transactions.

Hiding such complexity is valuable to the end customer, but it comes with some CPU and latency costs. We believe the trade-off is worthwhile given the value that Dragonfly provides.

If you want to deep dive into Dragonfly architecture without the complexities of transactional code, it's worth checking [Midi Redis](https://github.com/romange/midi-redis/),
which implements a toy backend supporting `PING`, `SET`, and `GET` [commands](https://github.com/romange/midi-redis/blob/main/server/main_service.cc#L239).

In fact, Dragonfly grew from that project; they share a common commit history.

By the way, to learn how to build even simpler TCP backends than `midi-redis`, `helio` library provides sample backends like these: [echo_server](https://github.com/romange/helio/blob/master/examples/echo_server.cc) and [ping_iouring_server.cc](https://github.com/romange/helio/blob/master/examples/pingserver/ping_iouring_server.cc). These backends reach millions of QPS on multi-core servers much like Dragonfly and midi-redis do.


================================================
FILE: docs/differences.md
================================================
# Differences with Redis

## String lengths, indices.

String sizes are limited to 256MB.
Indices (say in GETRANGE and SETRANGE commands) should be signed 32 bit integers in range
[-2147483647, 2147483648].

### String handling.

SORT does not take any locale into account.

## Expiry ranges.
Expirations are limited to 8 years. For commands with millisecond precision like PEXPIRE or PSETEX,
expirations greater than 2^28ms are quietly rounded to the nearest second losing precision of less than 0.001%.

## Lua
We use lua 5.4.4 that has been released in 2022.
That means we also support [lua integers](https://github.com/redis/redis/issues/5261).


================================================
FILE: docs/faq.md
================================================
# Dragonfly Frequently Asked Questions

- [Dragonfly Frequently Asked Questions](#dragonfly-frequently-asked-questions)
  - [What is the license model of Dragonfly? Is it an open source?](#what-is-the-license-model-of-dragonfly-is-it-an-open-source)
  - [Can I use dragonfly in production?](#can-i-use-dragonfly-in-production)
  - [We benchmarked Dragonfly and we have not reached 4M qps throughput as you advertised.](#we-benchmarked-dragonfly-and-we-have-not-reached-4m-qps-throughput-as-you-advertised)
  - [Dragonfly provides vertical scale, but we can achieve similar throughput with X nodes in a Redis cluster.](#dragonfly-provides-vertical-scale-but-we-can-achieve-similar-throughput-with-x-nodes-in-a-redis-cluster)
  - [If only Dragonfly had this command I would use it for sure](#if-only-dragonfly-had-this-command-i-would-use-it-for-sure)


## What is the license model of Dragonfly? Is it an open source?
Dragonfly is released under [BSL 1.1](../LICENSE.md) (Business Source License).
BSL 1.1 is considered to be "source available" license and it's not strictly open-source license.
We believe that a [BSL 1.1](https://spdx.org/licenses/BUSL-1.1.html) license is more permissive
than licenses like AGPL, and it will allow us to
provide a competitive commercial service using our technology. In general terms,
it means that Dragonfly's code is free to use and free to change as long as you do not sell services directly related to
Dragonfly or in-memory datastores.
We followed the trend of other technological companies like Elastic, Redis, MongoDB, Cockroach labs,
Redpanda Data to protect our rights to provide service and support for the software we are building.

## Can I use dragonfly in production?
License wise you are free to use dragonfly in your production as long as you do not provide Dragonfly as a managed service.
From a code maturity point of view, Dragonfly's code is covered with unit testing and the regression tests.
However as with any new software there are use cases that are hard to test and predict.
We advise you to run your own particular use case on dragonfly for a few days before considering production usage.

## We benchmarked Dragonfly and we have not reached 4M qps throughput as you advertised.
We conducted our experiments using a load-test generator called `memtier_benchmark`,
and we run benchmarks on AWS network-enhanced instance `c6gn.16xlarge` on recent Linux kernel versions.
Dragonfly might reach smaller throughput on other instances, but we would
still expect to reach around 1M+ qps on instances with 16-32 vCPUs.

## Dragonfly provides vertical scale, but we can achieve similar throughput with X nodes in a Redis cluster.
Dragonfly optimizes the use of underlying hardware, allowing it to run efficiently on instances as small as 8GB,
 and scale vertically to large 2TB machines with 128 cores. This versatility significantly
 reduces the complexity of running cluster workloads on a single node, saving hardware resources and costs.
 More importantly, it diminishes the total cost
 of ownership associated with managing multi-node clusters. In contrast, Redis in cluster
 mode imposes limitations on multi-key and transactional operations, whereas Dragonfly maintains
 the same semantics as a single-node Redis system.
 Furthermore, scaling out horizontally with small instances can lead to instability
 in production environments.
 We believe that large-scale deployments of in-memory stores require both vertical and horizontal scaling,
 which is not efficiently achievable with an in-memory store like Redis.

## If only Dragonfly had this command I would use it for sure
Dragonfly implements ~190 Redis commands which we think represent a good coverage of the market.
However this is not based empirical data. Having said that, if you have commands that are not covered,
please feel free to open an issue for that or vote for an existing issue.
We will do our best to prioritise those commands according to their popularity.


================================================
FILE: docs/memcached_benchmark.md
================================================
Contention in memcached under the high write throughput.

<img src="http://static.dragonflydb.io/repo-assets/memcached_perf_top.png" width="100%" border="0"/>

Overall CPU usage of memcached when performing SETS benchmark:

<img src="http://static.dragonflydb.io/repo-assets/memcached_cpu_usage.png" width="100%" border="0"/>


================================================
FILE: docs/memory_bgsave.tsv
================================================
Time	Dragonfly	Redis
4	4738531328	6819917824
5	4738637824	6819917824
6	4738658304	6819913728
7	4738777088	6820589568
8	4738781184	6820638720
9	4738768896	6820769792
10	4738494464	6820777984
11	4738756608	6820683776
12	4740325376	6820687872
13	4740243456	6820691968
14	4740194304	6820687872
15	4740194304	7429746688
16	4740734976	7942115328
17	4740370432	8400957440
18	4740366336	8863305728
19	4740390912	9302515712
20	4740399104	9697935360
21	4740423680	10074103808
22	4748312576	10362601472
23	4750438400	10649939968
24	4750315520	10926985216
25	4750426112	11195555840
26	4750180352	11444666368
27	4750417920	11665764352
28	4750131200	11872944128
29	4750233600	12060946432
30	4750475264	12232212480
31		12379299840
32		12521598976
33		12647915520
34		12756508672
35		12848570368
36		12944240640
37		13025046528
38		13105799168
39		13181427712
40		8000053248
41		7048486912
42		7048507392

================================================
FILE: docs/namespaces.md
================================================
# Namespaces in Dragonfly

Dragonfly added an _experimental_ feature, allowing complete separation of data by different users.
We call this feature _namespaces_, and it allows using a single Dragonfly server with multiple
tenants, each using their own data, without being able to mix them together.

Note that this feature can alternatively be achieved by having each user `SELECT` a different
(numeric) database, or by asking that each user uses a unique prefix for their keys. This approach
has several disadvantages, like users forgetting to `SELECT` / use their prefix, accessing data
logically belonging to other users.

The advantage of using Namespaces is that data is completely isolated, and users cannot accidentally
use data they do not own. A user must authenticate in order to access the namespace it was assigned.
And as a bonus, each namespace can have multiple databases, switched via `SELECT` like any regular
data store.

However, before using this feature, please note that it is experimental. This means that:

* Some features are not supported for non-default namespaces, such as replication and save to RDB
* Some tools are missing, like breakdown of memory / load per namespace
* We do not yet consider this production ready, and it might still have some uncovered bugs

So kindly use it at your own risk.

## Usage

This section describes how, as a Dragonfly user / administrator, you could use namespaces.

A namespace is identified by a unique string id, defined by the user / admin. Each Dragonfly user
is associated with a single namespace. If not set explicitly, then the default namespace is used,
which is the empty string id.

Multiple users can use the same namespace if they are all assigned the same namespace id. This can
allow, for example, creating a read-only user as well as a mutating user over the same data.

To associate user `user1` with the namespace `namespace1`, use the `ACL` command with the
`NAMESPACE:namespace1` flag:

```
ACL SETUSER user1 NAMESPACE:namespace1 ON >user_pass +@all ~*
```

This sets / creates user `user`, using password `user_pass`, using namespace `namespace1`.

For more examples check out `tests/dragonfly/acl_family_test.py` - specifically the
`test_namespaces` function.

## Technical Details

This section describes how we _implemented_ namespaces in Dragonfly. It is meant to be used by those
who wish to contribute pull requests to Dragonfly.

Prior to adding namespaces to Dragonfly, each _shard_ had a single `DbSlice` that it owned. They
were thread-local, global-scope instances.

To support namespaces, we created a `Namespace` class (see `src/server/namespaces.h`) which contains
a `vector<DbSlice>`, with a `DbSlice` per shard. When first used, a `Namespace` calls the engine
shard set to initialize the array of `DbSlice`s.

To access all `Namespace`s, we also added a registry with the original name `Namespaces`. It is a
global, thread safe class that allows accessing all registered namespaces, and registering new ones
on the fly. Note that, while it is thread safe, it shouldn't be a bottle neck because it is supposed
to only be used during the authentication of a connection (or when adding new namespaces).

When a new connection is authenticated with Dragonfly, we look up (and create, if needed) the
namespace it is associated with. We then save a `Namespace* ns` inside the `dfly::ConnectionContext`
class to associate the user with the namespaces. Because we removed the global `DbSlice` objects,
this is now the only way to access namespaces, which protects users from accessing unowned data.

Currently, we do not have any support for removing namespaces, so they hang in memory until the
server exits.


================================================
FILE: docs/quick-start/README.md
================================================
<p align="center">
  <a href="https://dragonflydb.io">
    <img src="https://raw.githubusercontent.com/dragonflydb/dragonfly/main/.github/images/logo-full.svg"
      width="284" border="0" alt="Dragonfly">
  </a>
</p>


# Quick Start

Starting with `docker run` is the simplest way to get up and running with DragonflyDB.

If you do not have docker on your machine, [Install Docker](https://docs.docker.com/get-docker/) before continuing.

## Step 1

### On linux

```bash
docker run --network=host --ulimit memlock=-1 docker.dragonflydb.io/dragonflydb/dragonfly
```

### On macOS

_`network=host` doesn't work well on macOS, see [this issue](https://github.com/docker/for-mac/issues/1031)_

```bash
docker run -p 6379:6379 --ulimit memlock=-1 docker.dragonflydb.io/dragonflydb/dragonfly
```

Dragonfly DB will answer to both `http` and `redis` requests out of the box!

You can use `redis-cli` to connect to `localhost:6379` or open a browser and visit `http://localhost:6379`

**NOTE**: On some configurations, running with the `docker run --privileged ...` flag can fix some
initialization errors.

## Step 2

Connect with a redis client

```bash
redis-cli
127.0.0.1:6379> set hello world
OK
127.0.0.1:6379> keys *
1) "hello"
127.0.0.1:6379> get hello
"world"
127.0.0.1:6379>
```

## Step 3

Continue being great and build your app with the power of DragonflyDB!

## Known issues


## More Build Options
- [Docker Compose Deployment](/contrib/docker/)
- [Kubernetes Deployment with Helm Chart](/contrib/charts/dragonfly/)
- [Build From Source](/docs/build-from-source.md)


================================================
FILE: docs/rdbsave.excalidraw
================================================
{
  "type": "excalidraw",
  "version": 2,
  "source": "https://excalidraw.com",
  "elements": [
    {
      "type": "rectangle",
      "version": 586,
      "versionNonce": 345912761,
      "isDeleted": false,
      "id": "BY5OdEEKT0Y_DTy9Zgr9C",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 654.7020016982203,
      "y": 187.24519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 165,
      "height": 199,
      "seed": 1621471436,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "sIrssFTnnb9f1o26g1j88",
          "type": "text"
        },
        {
          "type": "text",
          "id": "sIrssFTnnb9f1o26g1j88"
        },
        {
          "id": "1cq4mAkO92nzlk-wjAy0a",
          "type": "arrow"
        }
      ],
      "updated": 1661620421120,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 514,
      "versionNonce": 869523031,
      "isDeleted": false,
      "id": "sIrssFTnnb9f1o26g1j88",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 659.7020016982203,
      "y": 261.74519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 155,
      "height": 50,
      "seed": 711168500,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620421121,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "Thread-local\nSnapshot 1",
      "baseline": 43,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "BY5OdEEKT0Y_DTy9Zgr9C",
      "originalText": "Thread-local\nSnapshot 1"
    },
    {
      "type": "rectangle",
      "version": 622,
      "versionNonce": 1016232663,
      "isDeleted": false,
      "id": "OiDY20ES-4wBxFVAzHkHt",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 866.0673076923077,
      "y": 187.24519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 165,
      "height": 199,
      "seed": 1937655639,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "nTSFevnRPYnvrSc57ZrgV",
          "type": "text"
        },
        {
          "id": "nTSFevnRPYnvrSc57ZrgV",
          "type": "text"
        },
        {
          "type": "text",
          "id": "nTSFevnRPYnvrSc57ZrgV"
        },
        {
          "id": "NGMUGV32wJmpMyvB3YQTx",
          "type": "arrow"
        }
      ],
      "updated": 1661620421121,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 539,
      "versionNonce": 941214039,
      "isDeleted": false,
      "id": "nTSFevnRPYnvrSc57ZrgV",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 871.0673076923077,
      "y": 256.74519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 155,
      "height": 60,
      "seed": 1072545177,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620424002,
      "link": null,
      "locked": false,
      "fontSize": 23.932285237126536,
      "fontFamily": 1,
      "text": "Thread-local\nSnapshot 2",
      "baseline": 51,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "OiDY20ES-4wBxFVAzHkHt",
      "originalText": "Thread-local\nSnapshot 2"
    },
    {
      "type": "rectangle",
      "version": 608,
      "versionNonce": 1548421111,
      "isDeleted": false,
      "id": "0DuGwtSiWQDXGbVDx_Yq4",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1085.2980769230767,
      "y": 187.24519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 165,
      "height": 199,
      "seed": 1695403735,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "dcrIif4WgKLztfzWXXskR",
          "type": "text"
        },
        {
          "id": "dcrIif4WgKLztfzWXXskR",
          "type": "text"
        },
        {
          "type": "text",
          "id": "dcrIif4WgKLztfzWXXskR"
        },
        {
          "id": "hgq3HgiDoEU1A13Sax2A5",
          "type": "arrow"
        }
      ],
      "updated": 1661620421121,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 530,
      "versionNonce": 667080441,
      "isDeleted": false,
      "id": "dcrIif4WgKLztfzWXXskR",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1090.2980769230767,
      "y": 256.74519230769243,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 155,
      "height": 60,
      "seed": 379350553,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "fontSize": 23.932285237126536,
      "fontFamily": 1,
      "text": "Thread-local\nSnapshot 3",
      "baseline": 51,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "0DuGwtSiWQDXGbVDx_Yq4",
      "originalText": "Thread-local\nSnapshot 3"
    },
    {
      "id": "577abnzpQuxk_hrNgIMkV",
      "type": "diamond",
      "x": 689.3365384615385,
      "y": 437.86057692307713,
      "width": 92,
      "height": 157,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 520181305,
      "version": 125,
      "versionNonce": 1270149399,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "1cq4mAkO92nzlk-wjAy0a",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "YWzMoutOj3POKIhzoAb6q"
        },
        {
          "id": "HjlV2QEoKO1Najg9D1xnm",
          "type": "arrow"
        }
      ],
      "updated": 1661620421122,
      "link": null,
      "locked": false
    },
    {
      "id": "1cq4mAkO92nzlk-wjAy0a",
      "type": "arrow",
      "x": 728.5673076923077,
      "y": 395.9759615384616,
      "width": 32.307692307692264,
      "height": 36.04730445962048,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 2032795417,
      "version": 139,
      "versionNonce": 1145353783,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          32.307692307692264,
          11.538461538461547
        ],
        [
          9.869210911479854,
          36.04730445962048
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "BY5OdEEKT0Y_DTy9Zgr9C",
        "focus": 0.8708968370314767,
        "gap": 9.73076923076917
      },
      "endBinding": {
        "elementId": "577abnzpQuxk_hrNgIMkV",
        "focus": -1.6111525113388454,
        "gap": 5.625821498015291
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "YWzMoutOj3POKIhzoAb6q",
      "type": "text",
      "x": 694.3365384615385,
      "y": 498.36057692307713,
      "width": 82,
      "height": 36,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 381921847,
      "version": 39,
      "versionNonce": 405941433,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "text": "Rdb\nSerializer",
      "fontSize": 16,
      "fontFamily": 2,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 32,
      "containerId": "577abnzpQuxk_hrNgIMkV",
      "originalText": "Rdb\nSerializer"
    },
    {
      "id": "Ig1qNk-AOw_VTS_xlELs5",
      "type": "rectangle",
      "x": 717.798076923077,
      "y": 641.3605769230771,
      "width": 477,
      "height": 67,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#fa5252",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 1664303159,
      "version": 124,
      "versionNonce": 111029657,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "jE5wNvo8TFk1wC4v8bQ6s"
        },
        {
          "id": "HjlV2QEoKO1Najg9D1xnm",
          "type": "arrow"
        },
        {
          "id": "hLcR_BUncIusv-IFL2ucM",
          "type": "arrow"
        },
        {
          "id": "WHRznFJAFjpXbmv35tCsY",
          "type": "arrow"
        },
        {
          "id": "yVBhfXkyFmu2rg16oRlxu",
          "type": "arrow"
        }
      ],
      "updated": 1661620421122,
      "link": null,
      "locked": false
    },
    {
      "type": "diamond",
      "version": 140,
      "versionNonce": 1301746297,
      "isDeleted": false,
      "id": "MclWY93u6fXaKcMyYF-Jy",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 896.4134615384614,
      "y": 437.8605769230771,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "width": 92,
      "height": 157,
      "seed": 755813689,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "NGMUGV32wJmpMyvB3YQTx",
          "type": "arrow"
        },
        {
          "id": "_xhHeDkg3dVxrIbXlln8Z",
          "type": "text"
        },
        {
          "type": "text",
          "id": "_xhHeDkg3dVxrIbXlln8Z"
        },
        {
          "id": "hLcR_BUncIusv-IFL2ucM",
          "type": "arrow"
        }
      ],
      "updated": 1661620421122,
      "link": null,
      "locked": false
    },
    {
      "type": "arrow",
      "version": 167,
      "versionNonce": 1223962007,
      "isDeleted": false,
      "id": "NGMUGV32wJmpMyvB3YQTx",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 944.8750000000002,
      "y": 387.86057692307696,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 15.10726263633046,
      "height": 47.58370911007313,
      "seed": 282885847,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "OiDY20ES-4wBxFVAzHkHt",
        "focus": 0.48198474540576314,
        "gap": 1.615384615384528
      },
      "endBinding": {
        "elementId": "MclWY93u6fXaKcMyYF-Jy",
        "focus": -0.9774990043807243,
        "gap": 2.921009509018951
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          14.615384615384528,
          21.538461538461547
        ],
        [
          -0.4918780209459328,
          47.58370911007313
        ]
      ]
    },
    {
      "type": "text",
      "version": 51,
      "versionNonce": 299916121,
      "isDeleted": false,
      "id": "_xhHeDkg3dVxrIbXlln8Z",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 901.4134615384614,
      "y": 498.3605769230771,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 82,
      "height": 36,
      "seed": 1481686553,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 2,
      "text": "Rdb\nSerializer",
      "baseline": 32,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "MclWY93u6fXaKcMyYF-Jy",
      "originalText": "Rdb\nSerializer"
    },
    {
      "type": "diamond",
      "version": 225,
      "versionNonce": 1063805623,
      "isDeleted": false,
      "id": "jGf5xxZ5eve-AtPae7Yly",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1119.4903846153848,
      "y": 437.8605769230772,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "width": 92,
      "height": 157,
      "seed": 538175673,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "hgq3HgiDoEU1A13Sax2A5",
          "type": "arrow"
        },
        {
          "id": "WQcx4-r2uMVAquWROfq1l",
          "type": "text"
        },
        {
          "type": "text",
          "id": "WQcx4-r2uMVAquWROfq1l"
        },
        {
          "id": "WHRznFJAFjpXbmv35tCsY",
          "type": "arrow"
        }
      ],
      "updated": 1661620421122,
      "link": null,
      "locked": false
    },
    {
      "type": "arrow",
      "version": 390,
      "versionNonce": 332236857,
      "isDeleted": false,
      "id": "hgq3HgiDoEU1A13Sax2A5",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1141.6872098880729,
      "y": 392.47596153846166,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 70.88009780423477,
      "height": 61.500951281640766,
      "seed": 168221527,
      "groupIds": [],
      "strokeSharpness": "round",
      "boundElements": [],
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "startBinding": {
        "elementId": "0DuGwtSiWQDXGbVDx_Yq4",
        "focus": 0.9791425008071145,
        "gap": 6.230769230769226
      },
      "endBinding": {
        "elementId": "jGf5xxZ5eve-AtPae7Yly",
        "focus": -0.5445868784908863,
        "gap": 4.55886494843503
      },
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": "arrow",
      "points": [
        [
          0,
          0
        ],
        [
          70.88009780423477,
          10.76923076923083
        ],
        [
          38.5310635413573,
          61.500951281640766
        ]
      ]
    },
    {
      "type": "text",
      "version": 138,
      "versionNonce": 2144924631,
      "isDeleted": false,
      "id": "WQcx4-r2uMVAquWROfq1l",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1124.4903846153848,
      "y": 498.3605769230772,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 82,
      "height": 36,
      "seed": 585656729,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 2,
      "text": "Rdb\nSerializer",
      "baseline": 32,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "jGf5xxZ5eve-AtPae7Yly",
      "originalText": "Rdb\nSerializer"
    },
    {
      "id": "jE5wNvo8TFk1wC4v8bQ6s",
      "type": "text",
      "x": 722.798076923077,
      "y": 656.8605769230771,
      "width": 467,
      "height": 36,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 320154873,
      "version": 98,
      "versionNonce": 1177598807,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620459622,
      "link": null,
      "locked": false,
      "text": "Blob Channel (SliceSnapshot::RecordChannel)\nBucket-level granularity",
      "fontSize": 16,
      "fontFamily": 2,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 32,
      "containerId": "Ig1qNk-AOw_VTS_xlELs5",
      "originalText": "Blob Channel (SliceSnapshot::RecordChannel)\nBucket-level granularity"
    },
    {
      "id": "HjlV2QEoKO1Najg9D1xnm",
      "type": "arrow",
      "x": 741.2581209970564,
      "y": 588.5811776062717,
      "width": 31.351415988958138,
      "height": 44.98870164238667,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1489149785,
      "version": 105,
      "versionNonce": 1873907193,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          31.351415988958138,
          44.98870164238667
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "577abnzpQuxk_hrNgIMkV",
        "gap": 1.9342976914014673,
        "focus": 0.8117909371106269
      },
      "endBinding": {
        "elementId": "Ig1qNk-AOw_VTS_xlELs5",
        "gap": 7.790697674418787,
        "focus": -0.593178549414425
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "hLcR_BUncIusv-IFL2ucM",
      "type": "arrow",
      "x": 919.3365384615385,
      "y": 574.4375,
      "width": 31.736196893864076,
      "height": 60.69051878354196,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 328800759,
      "version": 85,
      "versionNonce": 304047833,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -14.615384615384642,
          25.384615384615472
        ],
        [
          17.120812278479434,
          60.69051878354196
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "MclWY93u6fXaKcMyYF-Jy",
        "focus": -0.22524576872402804,
        "gap": 9.584854518692971
      },
      "endBinding": {
        "elementId": "Ig1qNk-AOw_VTS_xlELs5",
        "gap": 6.232558139535168,
        "focus": 0.05517004727771827
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "WHRznFJAFjpXbmv35tCsY",
      "type": "arrow",
      "x": 1123.951923076923,
      "y": 553.6682692307693,
      "width": 32.30769230769238,
      "height": 81.53846153846143,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#12b886",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 971531865,
      "version": 66,
      "versionNonce": 789696311,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -32.30769230769238,
          38.46153846153834
        ],
        [
          -23.84615384615404,
          81.53846153846143
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jGf5xxZ5eve-AtPae7Yly",
        "focus": 0.2217391304347844,
        "gap": 15.012636648887266
      },
      "endBinding": {
        "elementId": "Ig1qNk-AOw_VTS_xlELs5",
        "focus": 0.6185597345566728,
        "gap": 6.153846153846416
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "yVBhfXkyFmu2rg16oRlxu",
      "type": "arrow",
      "x": 864.7211538461538,
      "y": 717.5144230769231,
      "width": 67.97279116285586,
      "height": 64.8374913674163,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 651147575,
      "version": 635,
      "versionNonce": 116567415,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421122,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -42.30769230769215,
          16.923076923076792
        ],
        [
          -67.97279116285586,
          64.8374913674163
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "Ig1qNk-AOw_VTS_xlELs5",
        "focus": -0.04672674106343535,
        "gap": 9.153846153845961
      },
      "endBinding": {
        "elementId": "HK8F6p6Adyxvgasi9uzJo",
        "focus": -0.17323237259147364,
        "gap": 5.1625086325837515
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "HK8F6p6Adyxvgasi9uzJo",
      "type": "rectangle",
      "x": 707.7980769230769,
      "y": 784.4375,
      "width": 155.84615384615387,
      "height": 98.27507912481072,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#4c6ef5",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 2031988567,
      "version": 164,
      "versionNonce": 418531705,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "yVBhfXkyFmu2rg16oRlxu",
          "type": "arrow"
        },
        {
          "type": "text",
          "id": "fB6sqnJqDlolUIDrydMk5"
        },
        {
          "id": "YVK4Nv0Onos-JNSI9I5YI",
          "type": "arrow"
        }
      ],
      "updated": 1661620421122,
      "link": null,
      "locked": false
    },
    {
      "id": "fB6sqnJqDlolUIDrydMk5",
      "type": "text",
      "x": 712.7980769230769,
      "y": 825.5750395624053,
      "width": 145.84615384615387,
      "height": 16,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#4c6ef5",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 1340401175,
      "version": 194,
      "versionNonce": 1565255319,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421123,
      "link": null,
      "locked": false,
      "text": "SaveBody",
      "fontSize": 14.404558404558403,
      "fontFamily": 2,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 13,
      "containerId": "HK8F6p6Adyxvgasi9uzJo",
      "originalText": "SaveBody"
    },
    {
      "type": "rectangle",
      "version": 216,
      "versionNonce": 1292304185,
      "isDeleted": false,
      "id": "w6yJKrh_ucB0qKWLRrPA1",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 921.4134615384612,
      "y": 785.2230373606715,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "width": 156,
      "height": 98.27507912481072,
      "seed": 1894727609,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "yVBhfXkyFmu2rg16oRlxu",
          "type": "arrow"
        },
        {
          "id": "JClqLh6OUtndfrUc-BbHt",
          "type": "text"
        },
        {
          "type": "text",
          "id": "JClqLh6OUtndfrUc-BbHt"
        },
        {
          "id": "XiGmqFegyOE2IKWoIo40s",
          "type": "arrow"
        }
      ],
      "updated": 1661620421123,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 259,
      "versionNonce": 710307031,
      "isDeleted": false,
      "id": "JClqLh6OUtndfrUc-BbHt",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 926.4134615384612,
      "y": 826.3605769230768,
      "strokeColor": "#000000",
      "backgroundColor": "#4c6ef5",
      "width": 146,
      "height": 16,
      "seed": 1215329367,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1661620421123,
      "link": null,
      "locked": false,
      "fontSize": 14.404558404558403,
      "fontFamily": 2,
      "text": "AlignedBuffer",
      "baseline": 13,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "w6yJKrh_ucB0qKWLRrPA1",
      "originalText": "AlignedBuffer"
    },
    {
      "id": "YVK4Nv0Onos-JNSI9I5YI",
      "type": "arrow",
      "x": 867.7980769230768,
      "y": 836.7451923076923,
      "width": 55.38461538461536,
      "height": 0,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#15aabf",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 2028321497,
      "version": 86,
      "versionNonce": 506769433,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421123,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          55.38461538461536,
          0
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "HK8F6p6Adyxvgasi9uzJo",
        "focus": 0.0018973206471872748,
        "gap": 4.153846153846075
      },
      "endBinding": null,
      "startArrowhead": null,
      "endArrowhead": "arrow"
    },
    {
      "id": "cqCQRIsxqHSsV_j5V6fMA",
      "type": "ellipse",
      "x": 1165.490384615384,
      "y": 781.3605769230769,
      "width": 128,
      "height": 106,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#e64980",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 1621295255,
      "version": 67,
      "versionNonce": 281065975,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "6N8Vr1qw1YKDs9h0ze2LI"
        },
        {
          "id": "XiGmqFegyOE2IKWoIo40s",
          "type": "arrow"
        }
      ],
      "updated": 1661620421123,
      "link": null,
      "locked": false
    },
    {
      "id": "6N8Vr1qw1YKDs9h0ze2LI",
      "type": "text",
      "x": 1170.490384615384,
      "y": 816.3605769230769,
      "width": 118,
      "height": 36,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#e64980",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "seed": 1910738841,
      "version": 45,
      "versionNonce": 1681474809,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421123,
      "link": null,
      "locked": false,
      "text": "Direct I/O\nFile",
      "fontSize": 16,
      "fontFamily": 2,
      "textAlign": "center",
      "verticalAlign": "middle",
      "baseline": 32,
      "containerId": "cqCQRIsxqHSsV_j5V6fMA",
      "originalText": "Direct I/O\nFile"
    },
    {
      "id": "XiGmqFegyOE2IKWoIo40s",
      "type": "arrow",
      "x": 1082.4134615384614,
      "y": 834.4375,
      "width": 69.23076923076928,
      "height": 0.7692307692308304,
      "angle": 0,
      "strokeColor": "#000000",
      "backgroundColor": "#e64980",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "strokeSharpness": "round",
      "seed": 1724070359,
      "version": 21,
      "versionNonce": 178545431,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1661620421123,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          69.23076923076928,
          -0.7692307692308304
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "w6yJKrh_ucB0qKWLRrPA1",
        "focus": 0.01998122507071207,
        "gap": 5.000000000000227
      },
      "endBinding": {
        "elementId": "cqCQRIsxqHSsV_j5V6fMA",
        "focus": 0.029379713320443476,
        "gap": 13.85030430804018
      },
      "startArrowhead": null,
      "endArrowhead": "arrow"
    }
  ],
  "appState": {
    "gridSize": null,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
}


================================================
FILE: docs/rdbsave.md
================================================
# RDB Snapshot design

The following document describes Dragonfly's point in time, forkless snapshotting procedure,
including all its configurations.

## Redis-compatible RDB snapshot

This snapshot is serialized into a single file or into a network socket.
This configuration is used to create redis-compatible backup snapshots.

The algorithm utilizes the shared-nothing architecture of Dragonfly and makes sure that each shard-thread serializes only its own data. Below is the high description of the flow.

<img src="http://static.dragonflydb.io/repo-assets/rdbsave.svg" width="80%" border="0"/>


1. The `RdbSave` class instantiates a single blocking channel (in red).
   Its purpose is to gather all the blobs from all the shards.
2. In addition it creates thread-local snapshot instances in each DF shard.
TODO: to rename them in the codebase to another name (SnapshotShard?) since `snapshot` word creates ambiguity here.
3. Each SnapshotShard instantiates its own RdbSerializer that is used to serialize each K/V entry into a binary representation according to the Redis format spec. SnapshotShards combine multiple blobs from the same Dash bucket into a single blob. They always send blob data at bucket granularity, i.e. they never send blob into the channel that only partially covers the bucket. This is needed in order to guarantee snapshot isolation.
4. The RdbSerializer uses `io::Sink` to emit binary data. The SnapshotShard instance passes into it a `StringFile` which is just a memory-only based sink that wraps `std::string` object. Once `StringFile` instance becomes large, it's flushed into the channel (as long as it follows the rules above).
4. RdbSave also creates a fiber (SaveBody) that pull all the blobs from the channel. Blobs migh come in unspecified order though it's guaranteed that each blob is self sufficient but itself.
5. DF uses direct I/O, to improve i/o throughput, which, in turn requires properly aligned memory buffers to work. Unfortunately, blobs that come from the rdb channel come in different sizes and they are not aligned by OS page granularity. Therefore, DF passes all the data from rdb channel through AlignedBuffer transformation. The purpose of this class is to copy the incoming data into a properly aligned buffer. Once it accumulates enough data, it flushes it into the output file.

To summarize, this configuration employs a single sink to create one file or one stream of data that represents the whole database.

## Dragonfly Snapshot (TBD)

Required for replication. Creates several multiple files, one file per SnapshotShard. Does not require a central sink. Each SnapshotShard still uses RdbSerializer together with StringFile to guarantee bucket level granularity. We still need AlignedBuffer if we want to use direct I/O.
For a DF process with N shard, it will create N files. Will probably require additional metadata file to provide file-level consistency, but for now we can assume that only N files are created,
since our use-case will be network based replication.

How it's gonna be used? Replica (slave) will hand-shake with the master and find out how many shard it has.
Then it will open `N` sockets and each one of them will pull shard data. First, they will pull snapshot data,
and replay it by distributing entries among `K` replica shards. After all the snapshot data is replayed,
they will continue with replaying the change log (stable state replication), which is out of context
of this document.

## Relaxed point-in-time (TBD)
When DF saves its snapshot file on disk, it maintains snapshot isolation by applying a virtual cut
through all the process shards. Snapshotting may take time, during which, DF may process many write requests.
These mutations won't be part of the snapshot, because the cut captures data up to the point
**it has started**. This is perfect for backups. I call this variation - conservative snapshotting.

However, when we perform snapshotting for replication, we would like to produce a snapshot
that includes all the data upto point in time when the snapshotting **finishes**. I called
this *relaxed snapshotting*. The reason for relaxed snapshotting is to avoid keeping the changelog
of all mutations during the snapshot creation.

As a side comment - we could, in theory, support the same (relaxed)
semantics for file snapshots, but it's not necessary since it might increase the snapshot sizes.

The snapshotting phase (full-sync) can take up lots of time which add lots of memory pressure on the system.
Keeping the change-log aside during the full-sync phase will only add more pressure.
We achieve relaxed snapshotting by pushing the changes into the replication sockets without saving them aside.
Of course, we would still need a point-in-time consistency,
in order to know when the snapshotting finished and the stable state replication started.

## Conservative and relaxed snapshotting variations

Both algorithms maintain a scanning process (fiber) that iteratively goes over the main dictionary
and serializes its data. Before starting the process, the SnapshotShard captures
the change epoch of its shard (this epoch is increased with each write request).

```cpp
SnapshotShard.epoch = shard.epoch++;
```

For sake of simplicity, we can assume that each entry in the shard maintains its own version counter.
By capturing the epoch number we establish a cut: all entries with `version <= SnapshotShard.epoch`
have not been serialized yet and were not modified by the concurrent writes.

The DashTable iteration algorithm guarantees convergence and coverage ("at most once"),
but it does not guarantee that each entry is visited *exactly once*.
Therefore, we use entry versions for two things: 1) to avoid serialization of the same entry multiple times,
and 2) to correctly serialize entries that need to change due to concurrent writes.

Serialization Fiber:

```cpp
 for (entry : table) {
    if (entry.version <= cut.epoch) {
      entry.version = cut.epoch + 1;
      SendToSerializationSink(entry);
    }
 }
```

To allow concurrent writes during the snapshotting phase, we setup a hook that is triggered on each
entry mutation in the table:

OnWriteHook:
```cpp
....
if (entry.version <= cut.version) {
  SendToSerializationSink(entry);
}
...
entry = new_entry;
entry.version = shard.epoch++;  // guaranteed to become > cut.version
```

Please note that this hook maintains point-in-time semantics for the conservative variation by pushing
the previous value of the entry into the sink before changing it.

However, for the relaxed point-in-time, we do not have to store the old value.
Therefore, we can do the following:

OnWriteHook:

```cpp
if (entry.version <= cut.version) {
  SendToSerializationSink(new_entry);  // do not have to send the old value
} else {
  // Keep sending the changes.
  SendToSerializationSink(IncrementalDiff(entry, new_entry));
}

entry = new_entry;
entry.version = shard.epoch++;
```

The change data is sent along with the rest of the contents, and it requires to extend
the existing rdb format to support differential operations like (hset, append, etc).
The Serialization Fiber loop is the same for this variation.


================================================
FILE: docs/shard-serialization.md
================================================
# Shard Serialization

This document describes how Dragonfly serializes a single shard's data via `SliceSnapshot`. It
covers both point-in-time (PIT) and non-PIT serialization modes, their correctness guarantees,
and the mechanisms used to coordinate concurrent mutations with the serialization process.

## Overview

Shard serialization is used for two purposes:

1. **Backups (RDB save)** — Must produce a consistent point-in-time snapshot. Always uses PIT mode.
2. **Replication (full sync)** — Serializes baseline data and then streams journal changes. Can
   use either PIT or non-PIT mode, controlled by the `--point_in_time_snapshot` flag (default: true).

Both modes share the same traversal infrastructure (`IterateBucketsFb` → `BucketSaveCb` →
`SerializeBucket` → `SerializeEntry`) and the same flushing/backpressure machinery
(`HandleFlushData` → `consumer_->ConsumeData`). They differ in **how they handle concurrent
mutations** during the traversal.

| | PIT mode | Non-PIT mode |
|---|----------|-------------|
| Flag | `use_snapshot_version_ == true` | `use_snapshot_version_ == false` |
| Used for | Backups and replication | Replication only |
| Consistency | Exact point-in-time snapshot | Eventual consistency (baseline + journal) |
| `OnDbChange` | Serializes bucket before mutation | Barrier only (no serialization) |
| `OnMoved` | Not registered | Handles DashTable item reshuffling |
| Bucket versioning | Yes — skip already-serialized buckets | No — serialize every bucket visited |
| Throughput | Lower (mutation path does serialization work) | Higher (mutation path only acquires mutex) |

## Core Types

| Type | Location | Role |
|------|----------|------|
| `SliceSnapshot` | `src/server/snapshot.h` | Orchestrates shard serialization |
| `RdbSerializer` | `src/server/rdb_save.h` | Serializes entries into RDB-format buffers |
| `SnapshotDataConsumerInterface` | `src/server/snapshot.h` | Downstream sink interface |
| `RdbSaver::Impl` | `src/server/rdb_save.cc` | Consumer impl: writes to socket or channel |
| `ThreadLocalMutex` | `src/server/synchronization.h` | Fiber-aware mutex for atomicity barrier |
| `ChangeReq` | `src/server/table.h` | Describes a table mutation (update or insert) |

## Data Flow Overview

```mermaid
flowchart TD
  subgraph ShardThread[Shard thread / fibers]
    MUT[DB mutation] -->|change callback| ODC[OnDbChange]
    ODC -->|lock big_value_mu_| SB1["SerializeBucket<br/>(PIT only)"]
    SB1 --> SE1[SerializeEntry]
    SE1 --> SAVE1[RdbSerializer::SaveEntry]

    TRAV[Snapshot fiber: IterateBucketsFb] --> BSCB[BucketSaveCb]
    BSCB -->|lock big_value_mu_ + GetLatch| SB2[SerializeBucket]
    SB2 --> SE2[SerializeEntry]
    SE2 --> SAVE2[RdbSerializer::SaveEntry]

    MOV[DashTable move] -->|non-PIT only| OMV[OnMoved]
    OMV -->|lock big_value_mu_| SB3["SerializeBucket<br/>(if moved across cursor)"]

    EXP["Expiry / Eviction<br/>(heartbeat, inline, lazy)"] -->|"RecordDelete<br/>(no OnDbChange)"| JRN_DIRECT["journal::RecordEntry<br/>(DEL)"]
    JRN_DIRECT --> CJC

    JRN[Journal change] --> CJC[ConsumeJournalChange]
    CJC -->|lock big_value_mu_| WJE[serializer_->WriteJournalEntry]
  end

  SAVE1 -->|consume_fun_ if buffer > threshold| HFD[HandleFlushData]
  SAVE2 -->|consume_fun_ if buffer > threshold| HFD

  TRAV -->|between buckets| PS[PushSerialized]
  PS --> FS[FlushSerialized]
  FS --> HFD

  HFD --> SEQ[seq_cond_.wait - ordering gate]
  SEQ --> CD[consumer_->ConsumeData]
  CD --> SINK[(Replica socket / sink)]
```

## PIT Mode (Point-in-Time Snapshot)

PIT mode captures an exact snapshot of the shard at the logical moment `snapshot_version_` was
assigned. It is the default for both backups and replication.

### Bucket Versioning

Dragonfly's `DashTable` ([dashtable.md](dashtable.md)) maintains a version counter per physical
bucket. The snapshot must serialize all buckets with version `< snapshot_version_`.

- `SerializeBucket` sets the bucket version to `snapshot_version_`, ensuring each bucket is
  serialized exactly once.
- Mutations bump bucket versions, so buckets mutated after the snapshot started will have
  version `>= snapshot_version_` and are skipped by the traversal.
- Buckets not yet traversed but about to be mutated require **serialize-before-mutate**,
  enforced by `OnDbChange()`.

### Ordering Invariant

> For any key, the replica must receive the baseline value **strictly before** any journal entry
> that mutates that key.

We will use two terms for journal changes:
- **Self-contained**: the journal entry fully determines the resulting logical state and can be
  replayed without the prior value (for example `SET`, `DEL`).
- **Baseline-dependent**: the journal entry describes a mutation of an existing value and requires
  the baseline state to be reconstructed first (for example `HSET`, `LPUSH`).

For **transaction-driven mutations** this is guaranteed because:
1. `OnDbChange` runs before the mutation commits and serializes the bucket if needed.
2. `OnDbChange` unconditionally acquires `big_value_mu_` first, so the mutation and its
  subsequent journal emission cannot overtake an in-progress bucket serialization.

**Important caveat:** not all journal entries follow the
`OnDbChange` → mutation → `RecordJournal` → `ConsumeJournalChange` sequence. Several code
paths emit journal entries via `journal::RecordEntry` directly, bypassing `PreUpdateBlocking`
and `OnDbChange` entirely. See [Journal Entries Without `OnDbChange`](#journal-entries-without-ondbchange)
below.

### Journal Entries Without `OnDbChange`

Not all journal entries follow the transaction-driven
`PreUpdateBlocking` → `OnDbChange` → mutation → `RecordJournal` → `ConsumeJournalChange`
sequence. Several code paths call `journal::RecordEntry` directly (→
`JournalSlice::AddLogRecord` → `ConsumeJournalChange`), bypassing `OnDbChange` entirely:

| Source | Journal command | Trigger |
|--------|----------------|---------|
| `ExpireIfNeeded` (`db_slice.cc`) | `DEL` | Lazy expiry during key lookup, active expiry sweep (`DeleteExpiredStep`), heartbeat-driven eviction (`FreeMemWithEvictionStepAtomic`) |
| `PrimeEvictionPolicy::Evict` (`db_slice.cc`) | `DEL` | Inline eviction when a DashTable bucket overflows during insert |
| `generic_family.cc` (SCAN-based deletion) | `DEL` | `RecordDelete` after `DbSlice::Del` in the RM command |
| `dflycmd.cc`, `replica.cc`, `cluster_family.cc` | `PING` / `DFLYCLUSTER` | Control signals: takeover sync, PING propagation, cluster config |

All data-mutating entries above are self-contained `DEL` commands. The non-mutating entries
(`PING`, `DFLYCLUSTER`) carry no key-level semantics.

**Why this matters for `ConsumeJournalChange` and `big_value_mu_`:** these journal entries
still flow through `ConsumeJournalChange`, which acquires `big_value_mu_`. Today the mutex
serves two purposes on these paths:

1. **Serializer buffer exclusivity** — preventing a journal write from interleaving with an
   in-progress `SerializeBucket` call that shares the same `serializer_` instance.
2. **Baseline-before-journal ordering** — a `DEL K` must not reach the output stream (or a
   separate journal stream) while K's baseline is still being serialized. Even with separate
   serializer buffers and tagged-chunk interleaving, the consumer could process `DEL K` before
   receiving the full baseline, violating the ordering invariant. The mutex prevents this today
   by blocking the journal write until `SerializeBucket` completes.

The lock is *not* needed for transaction-style ordering against `OnDbChange` (these paths
bypass it entirely), but it is needed for both concerns above. Removing it requires (a) separate
serializer buffers (Phase 2, item 7) **and** (b) a mechanism to defer the `DEL` until the
bucket's baseline is fully emitted (Phase 1, item 6 — deferred deletion queue).

**Could these paths call `OnDbChange` before deleting?** Not safely:

- **`ExpireIfNeeded`:** `SerializeBucket` (called from `OnDbChange`) can preempt, but
  `ExpireIfNeeded` must not — `ExpireAllIfNeeded` calls `serialization_latch_.Wait()` and
  lazy expiry in `FindInternal` relies on cooperative scheduling.
- **`PrimeEvictionPolicy::Evict`:** `Evict` runs inside DashTable's insert path while the
  table is mid-structural-mutation. `OnDbChange` calls `SerializeBucket` (iterates the
  bucket) and `CVCUponInsert` (probes the table) — both unsafe here. Re-entrancy risk.
- **`FreeMemWithEvictionStepAtomic`:** runs from heartbeat with `serialization_latch_` held;
  `OnDbChange` per evicted key would add overhead and preemption points inside the loop.

The ordering issue is twofold: byte-stream integrity
([§1](#1-shard-wide-stall-under-big_value_mu_)) and baseline-before-journal correctness — a
`DEL` must not be emitted (even to a separate stream) while the same key's baseline is still
being serialized. Roadmap item 6 proposes a **deferred deletion queue** to address this
without blocking or re-entrancy.

### Mutation Path: `OnDbChange` (PIT)

```
OnDbChange(db_index, req)
  lock(big_value_mu_)
  if req is update (existing bucket):
    bit = *req.update()
    if !bit.is_done() && bit.GetVersion() < snapshot_version_:
      -> SerializeBucket(db_index, *bit)
  else (insert, new key):
    key = get<string_view>(req.change)
    -> table->CVCUponInsert(snapshot_version_, key, callback)
         callback(bucket_iterator):
           -> SerializeBucket(db_index, it)
  unlock(big_value_mu_)
```

For updates, `ChangeReq::update()` returns a `PrimeTable::bucket_iterator`. If the bucket has not
been serialized yet (version `< snapshot_version_`), it is serialized now.

For inserts, `CVCUponInsert` (`src/core/dash.h`) simulates the insert to identify which buckets'
versions would change, and serializes each one with version `< snapshot_version_` via the callback.

### Traversal Path: `BucketSaveCb` (PIT)

```
BucketSaveCb(db_index, bucket_iterator)
  lock(big_value_mu_)
  if bucket version >= snapshot_version_:
    skip (already serialized by OnDbChange or a previous visit)
  FlushChangeToEarlierCallbacks(...)
  lock(*db_slice_->GetLatch())
  -> SerializeBucket(db_index, bucket_iterator)
       set bucket version = snapshot_version_
       for each occupied slot:
         -> SerializeEntry -> SaveEntry -> PushToConsumerIfNeeded
```

The version check is the key optimization: buckets already serialized by `OnDbChange` are skipped.

## Non-PIT Mode (Eventual Consistency)

Non-PIT mode is available **only for replication** (`stream_journal == true`) and is enabled by
setting `--point_in_time_snapshot=false`. It improves server throughput during full sync by
eliminating serialization work from the mutation path.

### Design Rationale

A replica does not need an exact point-in-time snapshot. It needs to reach eventual consistency:
after the full sync baseline is delivered and the journal stream catches up, the replica's state
must match the master's current state. This weaker guarantee allows the snapshot to be "fuzzy" —
it may include some mutations that happened after the snapshot started and miss others, as long as
the journal stream fills in the gaps.

### How It Differs from PIT

**`OnDbChange` does no serialization.** In non-PIT mode, the `if (use_snapshot_version_)` block
is skipped entirely. `OnDbChange` only acquires `big_value_mu_` and returns immediately. This
serves as a **barrier** — it prevents mutations from modifying a bucket while it is being
serialized by the traversal fiber — but it does not serialize anything itself.

**No bucket version tracking.** `SerializeBucket` does not set the bucket version. `BucketSaveCb`
does not check or skip based on version. Every bucket visited by the traversal is serialized
unconditionally.

**`OnMoved` handles DashTable reshuffling.** When items are inserted into DashTable, existing items
may be moved between buckets (due to hash table splitting/merging). In PIT mode this is handled by
`OnDbChange` + bucket versioning. In non-PIT mode, since `OnDbChange` does no serialization, a
separate `OnMoved` callback is needed to catch items that "jump" across the traversal cursor:

```
OnMoved(db_index, items)
  lock(big_value_mu_)
  for each (source_cursor, dest_cursor) in items:
    if IsPositionSerialized(dest_cursor) && !IsPositionSerialized(source_cursor):
      -> SerializeBucket(db_index, CursorToBucketIt(dest))
```

An item needs re-serialization when it moves **from** a not-yet-visited bucket **to** an
already-visited bucket. Without this, the item would be missed entirely: the traversal already
passed the destination, and the source bucket still has the item removed.

**`CVCUponInsert` is not used.** In PIT mode, `OnDbChange` calls `CVCUponInsert` for inserts
to proactively serialize *all* buckets the insert would touch (home, neighbor, stash — or the
entire segment on a split) **before** the insert commits. This is necessary because PIT must
capture the pre-mutation state of every affected bucket. Non-PIT has no such requirement.
Instead, the insert proceeds, and `OnMoved` reactively handles any items that were displaced
across the traversal cursor. For truly new keys (not displaced existing items), non-PIT relies on
the cursor visiting the key's bucket later, or on the journal stream capturing the insert.

### `IsPositionSerialized` — Cursor-Based Position Tracking

```cpp
bool IsPositionSerialized(DbIndex id, PrimeTable::Cursor cursor) {
  uint8_t depth = db_slice_->GetTables(id).first->depth();
  return id < snapshot_db_index_ ||
         (id == snapshot_db_index_ &&
          (cursor.bucket_id() < snapshot_cursor_.bucket_id() ||
           (cursor.bucket_id() == snapshot_cursor_.bucket_id() &&
            cursor.segment_id(depth) < snapshot_cursor_.segment_id(depth))));
}
```

Compares a cursor position against the current traversal position (`snapshot_cursor_`,
`snapshot_db_index_`). A position is "serialized" if it is behind the cursor — i.e., the
traversal has already visited it.

### Traversal Path: `BucketSaveCb` (Non-PIT)

```
BucketSaveCb(db_index, bucket_iterator)
  lock(big_value_mu_)
  // no version check — serialize every bucket unconditionally
  lock(*db_slice_->GetLatch())
  -> SerializeBucket(db_index, bucket_iterator)
       // no version update
       for each occupied slot:
         -> SerializeEntry -> SaveEntry -> PushToConsumerIfNeeded
```

### Correctness in Non-PIT Mode

Non-PIT mode guarantees:
- Every key that existed when the traversal started and was not deleted before being visited will
  be serialized at least once (by the traversal or by `OnMoved`).
- Keys inserted after the traversal started will appear in the journal stream.
- Keys may be serialized in a state newer than the snapshot start (since mutations are not blocked
  by `OnDbChange` serialization, only by the mutex barrier).
- The journal stream, combined with the baseline, produces an eventually consistent replica.

What it does **not** guarantee:
- Point-in-time consistency. The serialized baseline is a "fuzzy" view spanning the traversal
  duration.

## Shared Infrastructure

The following sections apply to both PIT and non-PIT modes.

### Traversal: `IterateBucketsFb`

```
IterateBucketsFb(send_full_sync_cut)
  for each database:
    for each logical bucket via PrimeTable::TraverseBuckets():
      -> BucketSaveCb(db_index, bucket_iterator)
      PushSerialized(false)  // explicit flush between buckets
      yield if CPU time > ~15us
    PushSerialized(true)     // force-flush after each database
  if send_full_sync_cut:
    serializer_->SendFullSyncCut()
    PushSerialized(true)
```

### Serialization: `SerializeBucket` and `SerializeEntry`

`SerializeBucket` iterates all occupied slots in a physical bucket and calls `SerializeEntry` for
each. `SerializeEntry` looks up expiry and memcache flags, then calls
`serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_index)`.

### Journal Path: `ConsumeJournalChange`

```
ConsumeJournalChange(item)
  lock(big_value_mu_)
  serializer_->WriteJournalEntry(item.journal_item.data)
  unlock(big_value_mu_)
```

Active in both modes when `stream_journal == true`. Acquires `big_value_mu_` to ensure journal
entries are not interleaved with bucket serialization. Does **not** flush data — only appends to
the serializer buffer. Flushing happens later via `ThrottleIfNeeded` → `PushSerialized(false)`,
called from `JournalSlice` after the journal callback returns.

### Flushing and Backpressure

#### `HandleFlushData(std::string data)` — Common Blocking Sink

All serialized data ultimately flows through `HandleFlushData`:

1. Assigns monotonically increasing record ID (`rec_id_++`).
2. Optionally yields (background mode).
3. **Blocks** on `seq_cond_.wait` until `id == last_pushed_id_ + 1` (sequential ordering).
4. **Blocks** on `consumer_->ConsumeData(data, cntx_)` (downstream write).
5. Updates `last_pushed_id_`, notifies waiters via `seq_cond_.notify_all()`.
6. Optionally sleeps to throttle CPU (non-background mode, up to 2ms proportional to CPU spent).

#### `FlushSerialized(RdbSerializer* serializer)`

Calls `serializer->Flush(kFlushEndEntry)` to extract and optionally compress the buffer, then
passes the result to `HandleFlushData`. Uses the main `serializer_` if no argument is given.

#### `PushSerialized(bool force)`

Skips if `!force` and `serializer_->SerializedLen() < kMinBlobSize` (8KB). Otherwise calls
`FlushSerialized()` to drain the main serializer buffer.

#### `RdbSerializer::PushToConsumerIfNeeded(FlushState flush_state)`

```cpp
void RdbSerializer::PushToConsumerIfNeeded(SerializerBase::FlushState flush_state) {
  if (consume_fun_ && SerializedLen() > flush_threshold_) {
    string blob = Flush(flush_state);
    consume_fun_(std::move(blob));  // synchronous!
  }
}
```

Only fires when `consume_fun_` is set **and** the buffer exceeds `flush_threshold_`. When it
fires, it **synchronously** invokes the callback, which for `SliceSnapshot` is `HandleFlushData`.

## All Code Paths That Acquire `big_value_mu_`

Currently there are **five** call sites in `snapshot.cc` that lock `big_value_mu_`. The diagrams
below show the complete call chain from lock acquisition to potential blocking points.

### Path 1: `BucketSaveCb` (traversal fiber, both modes)

```mermaid
flowchart LR
  A[IterateBucketsFb] --> B["BucketSaveCb<br/><b>lock big_value_mu_</b><br/>lock GetLatch()"]
  B --> C[SerializeBucket]
  C --> D[SerializeEntry]
  D --> E[SaveEntry]
  E -->|"if buffer > threshold"| F["consume_fun_()<br/>= HandleFlushData"]
  F --> G["seq_cond_.wait<br/>consumer_->ConsumeData<br/><b>BLOCKS</b>"]

  classDef lock fill:#FFF3E0,stroke:#EF6C00;
  classDef block fill:#FFEBEE,stroke:#C62828;
  class B lock;
  class G block;
```

### Path 2: `OnDbChange` (mutation fiber, PIT only)

```mermaid
flowchart LR
  A[DB mutation] --> B["OnDbChange<br/><b>lock big_value_mu_</b>"]
  B -->|PIT| C[SerializeBucket]
  C --> D[SerializeEntry]
  D --> E[SaveEntry]
  E -->|"if buffer > threshold"| F["consume_fun_()<br/>= HandleFlushData"]
  F --> G["seq_cond_.wait<br/>consumer_->ConsumeData<br/><b>BLOCKS</b>"]
  B -->|non-PIT| H["return<br/>(barrier only)"]

  classDef lock fill:#FFF3E0,stroke:#EF6C00;
  classDef block fill:#FFEBEE,stroke:#C62828;
  classDef safe fill:#E8F5E9,stroke:#2E7D32;
  class B lock;
  class G block;
  class H safe;
```

### Path 3: `OnMoved` (non-PIT only)

```mermaid
flowchart LR
  A[DashTable move] --> B["OnMoved<br/><b>lock big_value_mu_</b>"]
  B -->|"moved across cursor"| C[SerializeBucket]
  C --> D[SerializeEntry]
  D --> E[SaveEntry]
  E -->|"if buffer > threshold"| F["consume_fun_()<br/>= HandleFlushData"]
  F --> G["seq_cond_.wait<br/>consumer_->ConsumeData<br/><b>BLOCKS</b>"]
  B -->|"same side of cursor"| H[skip]

  classDef lock fill:#FFF3E0,stroke:#EF6C00;
  classDef block fill:#FFEBEE,stroke:#C62828;
  class B lock;
  class G block;
```

### Path 4: `ConsumeJournalChange` (journal callback, both modes)

```mermaid
flowchart LR
  A[Journal change] --> B["ConsumeJournalChange<br/><b>lock big_value_mu_</b>"]
  B --> C["serializer_->WriteJournalEntry<br/>(buffer append only)"]
  C --> D[returns]

  classDef lock fill:#FFF3E0,stroke:#EF6C00;
  class B lock;
```

This path does **not** reach `HandleFlushData`. It only appends to the serializer buffer.

## All Code Paths That Reach `HandleFlushData`

```mermaid
flowchart TD
  subgraph HAZARD["Under big_value_mu_ (HAZARD)"]
    A1["OnDbChange — PIT only<br/>lock big_value_mu_"] --> SB1["SerializeBucket → SerializeEntry → SaveEntry"]
    A2["BucketSaveCb — both modes<br/>lock big_value_mu_ + GetLatch()"] --> SB2["SerializeBucket → SerializeEntry → SaveEntry"]
    A3["OnMoved — non-PIT only<br/>lock big_value_mu_"] --> SB3["SerializeBucket → SerializeEntry → SaveEntry"]
    SB1 --> CF["PushToConsumerIfNeeded<br/>consume_fun_()"]
    SB2 --> CF
    SB3 --> CF
    CF --> HFD1[HandleFlushData]
  end

  subgraph SAFE["Outside big_value_mu_ (SAFE)"]
    B1["IterateBucketsFb loop<br/>(between buckets)"] --> PS1["PushSerialized(false)"]
    B2["IterateBucketsFb<br/>(end of database)"] --> PS2["PushSerialized(true)"]
    B3["IterateBucketsFb<br/>(full sync cut)"] --> PS3["PushSerialized(true)"]
    B4[FinalizeJournalStream] --> PS4["PushSerialized(true)"]
    B5["ThrottleIfNeeded<br/>(from JournalSlice)"] --> PS5["PushSerialized(false)"]
    PS1 --> FS[FlushSerialized]
    PS2 --> FS
    PS3 --> FS
    PS4 --> FS
    PS5 --> FS
    FS --> HFD2[HandleFlushData]
  end

  HFD1 --> BLOCK["seq_cond_.wait<br/>consumer_->ConsumeData<br/>(BLOCKING)"]
  HFD2 --> BLOCK

  classDef hazard fill:#FFEBEE,stroke:#C62828,stroke-width:2px,color:#B71C1C;
  classDef safe fill:#E8F5E9,stroke:#2E7D32,color:#1B5E20;
  classDef block fill:#FFF3E0,stroke:#EF6C00;
  class A1,A2,A3,CF,HFD1 hazard;
  class B1,B2,B3,B4,B5,PS1,PS2,PS3,PS4,PS5,FS,HFD2 safe;
  class BLOCK block;
```

## Delayed Serialization of tiered entities

Tiered string values are not read synchronously under `big_value_mu_`. Instead,
`SerializeExternal` pushes a `TieredDelayedEntry` into `delayed_entries_`; the actual read and
serialization happen later in `PushSerialized()`, outside the bucket-serialization critical
section. The current implementation is fragile — delayed entries live in a global side queue
rather than being associated with their originating bucket, and this can corrupt the output
stream — a delayed tiered value may be emitted after a journal entry for the same key,
violating baseline-before-journal ordering (see PR #6824).

Note: `RestoreStreamer` (used for slot migration) has its own delayed-entry mechanism via
`CmdSerializer`, which uses a keyed `flat_hash_map` rather than a plain deque. The analysis
below focuses on `SliceSnapshot`; the `RestoreStreamer` path has analogous concerns but a
different data structure.

This creates two distinct notions of "bucket finished":

1. **Traversal finished** — `SerializeBucket` has iterated every entry and returned.
2. **Baseline fully emitted** — all delayed tiered entries from that bucket have also been
   read, serialized, and flushed.

For in-memory values these coincide; for tiered values they do not.

The ordering invariant (`baseline(K)` before `journal(K)`) still applies. Because the baseline
for a tiered key `K` may only materialize when `PushSerialized()` drains `delayed_entries_`,
a bucket's completion point extends from "finished iterating" to "all delayed values serialized
and flushed".

## Locking and Synchronization

### `big_value_mu_` (ThreadLocalMutex)

A `ThreadLocalMutex` (`src/server/synchronization.cc`) serving as the primary synchronization
barrier.

**Important:** `ThreadLocalMutex::lock()` and `unlock()` are **no-ops** when
`serialization_max_chunk_size == 0`. This means `big_value_mu_` only provides actual
synchronization when big-value streaming is enabled. When it is disabled, all `lock_guard`
calls on this mutex are effectively free, and the system relies on cooperative scheduling
(no preemption during serialization) for correctness.

Its role differs by mode:

**PIT mode:** Prevents mutations from modifying a bucket while it is being serialized, and
prevents journal entries from being written during bucket serialization. This enforces both
serialize-before-mutate and the ordering invariant.

**Non-PIT mode:** Prevents mutations from modifying a bucket while `BucketSaveCb` is serializing
it (data consistency within a single bucket). Also serves as a barrier for `ConsumeJournalChange`
and `OnMoved`.

| Path | Mode | Lock held | Additional locks |
|------|------|-----------|-----------------|
| `BucketSaveCb` | Both | `big_value_mu_` | `GetLatch()` |
| `OnDbChange` | Both | `big_value_mu_` | none |
| `OnMoved` | Non-PIT | `big_value_mu_` | none |
| `ConsumeJournalChange` | Both | `big_value_mu_` | none |

### `GetLatch()` (LocalLatch)

Acquired by `BucketSaveCb` in addition to `big_value_mu_`. This is a non-preempting latch
(`src/server/synchronization.h`) that increments a blocking counter, preventing `Heartbeat()`
from running if `SerializeBucket` preempts (e.g., during large value serialization).

### `seq_cond_` (CondVarAny)

Condition variable used in `HandleFlushData` to ensure records are pushed to the consumer
in sequential order of their `rec_id_`. If fiber A has `id=5` and fiber B has `id=6`, B waits
until A finishes pushing and updates `last_pushed_id_` to 5.
This is needed because fibers are awakened in arbitrary order and reordering flushed chunks breaks
the wire protocol.


## Inefficiencies and Improvement Goals

This section identifies concrete problems in the current serialization design and the
improvements that address them. The [Technical Roadmap](#technical-roadmap) maps these into an ordered execution
plan.

**Hard constraints** (apply to all improvements):
- **Backpressure must be maintained.** A slow consumer must slow down the producer; we cannot
  buffer unboundedly.
- **Bounded serialization memory.** Intermediate buffers must not grow proportionally to the
  dataset size.


### 1. Shard-wide stall under `big_value_mu_`

**Problem.** `big_value_mu_` is a single shard-wide mutex that guards three distinct concerns simultaneously:

1. **Bucket atomicity** — the bucket must not be mutated while `SerializeBucket` iterates it.
2. **Serializer buffer exclusivity** — `serializer_` must not be written to by two fibers.
3. **Journal ordering** — journal entries must not interleave with bucket serialization.

When `consume_fun_` fires under the lock (large value → `PushToConsumerIfNeeded` →
`HandleFlushData`), the mutex is held across blocking I/O (`seq_cond_.wait`,
`consumer_->ConsumeData`). This stalls the entire shard: traversal, mutations, journal writes,
and `OnMoved` all contend on the same lock.

**Why the mutex is needed in `ConsumeJournalChange`.**
Transaction paths are already ordered by `OnDbChange` (it acquires `big_value_mu_` first, so
`ConsumeJournalChange` on the same fiber cannot start while traversal holds the lock). The
mutex matters for [paths that bypass `OnDbChange`](#journal-entries-without-ondbchange) —
inline eviction and heartbeat-driven deletions. Without it, inline eviction could produce:

**Counter-example without the `ConsumeJournalChange` mutex — inline eviction via `PrimeEvictionPolicy::Evict`:**
1. Traversal calls `SerializeBucket(B)` and begins iterating it; the bucket contains key `K`
   (a large hash, serialized element-by-element). The traversal preempts mid-entry via
   `consume_fun_`.
2. While the traversal is preempted, a client command triggers a DashTable insert on a different
   bucket. The insert finds no free slot in its home bucket and calls
   `PrimeEvictionPolicy::Evict`, which selects `K` as the victim.
3. `Evict` removes `K` from the table and — still on the same fiber, inside the DashTable
  insert — calls `journal::RecordEntry(DEL K)` directly, bypassing `OnDbChange`.
4. `ConsumeJournalChange` appends `DEL K` to the shared serializer buffer immediately, even
  though traversal has already emitted only a prefix of `K`'s baseline.
5. Traversal resumes and appends the remaining bytes of `K`'s baseline.

Result: the replica's byte stream contains `[partial baseline of K] [DEL K] [rest of baseline
of K]`. The RDB decoder sees a truncated entry followed by an unexpected journal opcode, or
parses garbage if the lengths happen to align. Even if the `DEL` is parsed out-of-band, the
subsequent baseline bytes reconstruct `K` on the replica, reversing the deletion.

**Goal.** Separate the three concerns so that:
- bucket atomicity uses bucket-level mechanisms (versioning + bucket completion state);
- buffer exclusivity uses per-serializer isolation (each producer owns its buffer);
- journal ordering uses bucket completion state and deferred deletion queues;
- no code path blocks on downstream I/O while holding a shard-wide lock.

**Approach.** See [§5 summary table](#5-summary-mutex-roles-and-their-replacements) for the
full mapping. Key mechanisms: bucket completion state ([§2](#2-imprecise-bucket-completion-tracking)),
separate serializer instances ([§3](#3-shared-serializer-buffer-and-wire-format-coupling)),
and non-preempting chunk production. See Roadmap items 6, 7, 8, 9.

### 2. Imprecise bucket completion tracking

**Problem.** The system has no explicit notion of when a bucket's baseline is *fully emitted*
(see [Delayed Serialization of tiered entities](#delayed-serialization-of-tiered-entities)
for details on how tiered values extend bucket completion beyond `SerializeBucket`'s return).
This creates two issues:

- A journal entry for key K can reach the output buffer (via `ConsumeJournalChange`) before
  K's delayed tiered baseline is drained — violating the
  [ordering invariant](#ordering-invariant) (see PR #6824).
- [Non-transaction journal entries](#journal-entries-without-ondbchange) (expiry, eviction)
  bypass `OnDbChange` entirely. Since there is no bucket completion state to consult, `DEL`
  entries can interleave mid-serialization of the deleted key's baseline.

**Goal.** Make "baseline fully emitted" precise for every bucket — including tiered values —
so that ordering decisions can be expressed through per-bucket state rather than shard-wide mutex exclusion.

**Approach.**
- Introduce a per snapshot instance/bucket state machine:
  `NotVisited` → `Serializing` → `DelayedPending` → `Covered`.
  Each bucket is identified by a stable `BucketIdentity`. A bucket must remain in the
  tracking map (`currently_serialized_: map<BucketIdentity, State>`) until all work completes; otherwise `version >= snapshot_version_` + absent-from-map would falsely read as `Covered`.
  State encoding:

  | State | Encoding | Meaning |
  |-------|----------|---------|
  | **NotVisited** | `version < snapshot_version_`, not in map | Traversal has not reached this bucket |
  | **Serializing** | `version >= snapshot_version_`, in map as `Serializing` | Traversal is iterating this bucket |
  | **DelayedPending** | `version >= snapshot_version_`, in map as `DelayedPending` | Iteration done, tiered entries still pending |
  | **Covered** | `version >= snapshot_version_`, not in map | Baseline fully emitted |

- Associate delayed tiered entries with their originating bucket instead of the global queue.
  Transition to `Covered` only after all delayed entries are flushed.
- **Transaction-driven mutations:** `OnDbChange` blocks (fiber-aware wait) on
  `Serializing`/`DelayedPending` buckets; proceeds immediately on `NotVisited` (serialize
  now) or `Covered` (baseline already emitted). Since `OnDbChange` → mutation →
  `RecordJournal` → `ConsumeJournalChange` is sequential on the mutation fiber, blocking
  `OnDbChange` guarantees baseline-before-journal.
- **Non-transaction deletions (expiry, eviction):** `OnDbChange` is
  [infeasible on these paths](#journal-entries-without-ondbchange). Instead, use a **deferred
  deletion queue**: enqueue the key when the bucket is `Serializing`/`DelayedPending`; drain
  (emit `DEL`) when the bucket transitions to `Covered`. See roadmap item 6 for details.
- **Latency tradeoff:** blocking `OnDbChange` on `DelayedPending` means a mutation fiber can
  stall for the duration of a tiered disk read (see roadmap item 6 for mitigation).

See Roadmap items 3, 5, 6.

### 3. Shared serializer buffer and wire-format coupling

**Problem.** `ConsumeJournalChange` and `SerializeBucket` write to the same `serializer_`
buffer (the "buffer exclusivity" role from [§1](#1-shard-wide-stall-under-big_value_mu_)).
Even with separate buffers, interleaved output from two serializers cannot be demuxed by the
consumer without a framing protocol — a journal entry injected mid-RDB-entry produces an
unparseable byte stream (see the [eviction counter-example](#1-shard-wide-stall-under-big_value_mu_)
for a concrete scenario).

**Goal.** Decouple journal and bucket serialization so they can produce data independently,
without sharing a buffer or requiring a shard-wide lock for output integrity.

**Approach.**
- **Tagged-chunk wire format.** Extend the serialization format with tagged chunks: each
  mid-entry flush produces a chunk tagged with a stream ID. The consumer reassembles same-ID
  chunks before decoding. Small values (single chunk) use the existing format unchanged —
  no overhead. Controlled by a master-side flag (`--serialization_tagged_chunks`).
- **Separate `RdbSerializer` per producer.** Give journal entries and bucket serialization
  their own serializer instances. Each produces tagged chunks independently. With separate
  buffers, `ConsumeJournalChange` no longer needs `big_value_mu_` for buffer exclusivity.
- **Flushing strategy:** small values serialize the entire bucket without preemption; large
  values release the lock between chunks and apply backpressure outside the critical section.
  Bucket contents remain stable across the gap because PIT versioning prevents re-serialization and `OnDbChange` blocking (§1) prevents mutation.

See Roadmap items 4, 7.

### 4. Non-PIT redundant journal traffic

**Problem.** Non-PIT mode (eventual consistency for replication) emits every journal entry regardless of whether the snapshot traversal will cover the mutation. For self-contained entries (`SET`, `DEL`) this is redundant but harmless. For baseline-dependent entries (`HSET`, `LPUSH`, etc.) the system emits both the baseline value and the journal entry for
every mutation, even when the traversal has not yet reached the bucket and will serialize the
post-mutation value.

**Goal.** In non-PIT mode, reduce journal traffic by skipping entries that are guaranteed to
be covered by the traversal, without compromising eventual consistency.

**Approach.** Use the bucket completion state machine (§1) to classify mutations:

- **Self-contained entries** (`SET`, `DEL`, `EXPIRE`): skip for `NotVisited` buckets (traversal will see post-mutation value); emit for `Covered` buckets; emit conservatively for
  `Serializing`/`DelayedPending`. Classification is by **emitted journal command form**, not
  the user-facing command — commands like `JSON.SET` may be self-contained or not depending
  on arguments and must be validated individually.

- **Baseline-dependent entries** (`HSET`, `LPUSH`, `SADD`, `ZADD`, `XADD`, `APPEND`, etc.):
  **SkipBoth** — suppress both baseline serialization and journal entry — when the bucket is
  `NotVisited`/`Serializing`, the mutation is a single-key in-memory update (no delete, no
  rehash, no insert), and no delayed tiered entry is in flight. Otherwise fall back to emit
  journal only or keep both. Each `SliceSnapshot` instance marks suppressed mutations locally;
  `ConsumeJournalChange` skips them without cross-instance coordination.

See Roadmap items 10–15.

### 5. Summary: mutex roles and their replacements

The previous subsections identify `big_value_mu_`'s three roles and the mechanisms that
replace each:

| Mutex role | Replacement | Source |
|-----------|-------------|--------|
| Journal ordering | Bucket completion state + deferred deletion queue | §1 |
| Buffer exclusivity | Separate `RdbSerializer` per producer + tagged chunks | §3 |
| Bucket atomicity (PIT) | Bucket versioning + `OnDbChange` blocking | §1, §2 |
| Bucket atomicity (non-PIT) | Non-preempting chunk production | §2, §3 |

Once all replacements are in place and validated, the mutex can be narrowed per mode and path,
and eventually removed entirely. The roadmap structures this as a sequence of incremental
steps (Phases 0–4), each validated before the next begins.

## Technical Roadmap

The improvements identified above are interdependent. The safest path is to split them into
small, verifiable steps that first improve observability and correctness scaffolding, then
improve PIT and PIT+tiered correctness/robustness, and only after that tackle non-PIT
optimizations and deeper serializer / lock-removal changes. Some of the groundwork —
especially bucket-level completion state — is shared and should be laid early even if the
first consumers are PIT-oriented. Because non-PIT is currently experimental and unused, the
roadmap below does **not** treat current non-PIT behavior as a compatibility constraint. Later
non-PIT phases may simplify, replace, or remove experimental behavior rather than preserving it.

### Phase 0 — Baseline and guardrails

1. **Document current invariants in code comments and tests.**
   - Make the key ordering rules explicit near `SliceSnapshot::OnDbChange`,
     `SliceSnapshot::ConsumeJournalChange`, `RestoreStreamer::OnDbChange`, and
     `DbSlice::FlushChangeToEarlierCallbacks`.
   - Prefer focused replication tests over purely end-to-end hash comparisons. The current
     broad replication suite is useful, but Phase 0 needs tests that fail specifically when an
     ordering invariant is broken.
   - Add focused tests for:
     - PIT: baseline-before-journal for baseline-dependent mutations.
     - tiered values: delayed serialization still preserves baseline-before-journal.
   - Suggested test strategy:
     - **PIT ordering guardrail:** add a test in `tests/dragonfly/replication_test.py` that
       starts full sync with `point_in_time_snapshot=true`, performs a small controlled set of
       baseline-dependent updates during full sync (`HSET`, `LPUSH`, `APPEND`, `XADD`), waits for
       stable sync, and then asserts exact key/value equality for only those keys. The intent is
       to make a baseline-before-journal violation fail on a tiny, debuggable workload.
     - **tiered delayed-entry guardrail:** rehabilitate the currently skipped tiered replication
       test in `tests/dragonfly/tiering_test.py` and make it assert not just final equivalence,
       but that concurrent writes to tiered keys during full sync do not lose updates.
   - Suggested assertions:
     - assert exact values for a small curated key set, not just whole-dataset hashes;
     - assert replica reaches stable sync and catches up via `check_all_replicas_finished`;
     - assert path-activation counters from logs where available (`side_saved`, `moved_saved`);
     - for tricky cases, prefer deterministic key-level checks over probabilistic stress-only
       validation.
   - Suggested scope split:
     - keep the existing large/stress replication tests as coarse regression coverage;
     - add a handful of small, deterministic Phase 0 tests whose only purpose is to guard the
       invariants this roadmap depends on.
   - Goal: freeze the current correctness contract before changing behavior.

2. **Add lightweight observability for snapshot/journal interleavings.**
   - Count how often `ConsumeJournalChange` runs while a bucket is being serialized.
   - Count flushes triggered under `big_value_mu_` versus outside it.
   - Suggested locations for counters / debug stats:
     - increment a counter when `ConsumeJournalChange` acquires the barrier while
       `serialize_bucket_running_` is true;
     - increment separate counters for `HandleFlushData` reached from under `big_value_mu_`
       versus from `PushSerialized` outside the critical section;
   - Suggested exposure:
     - start with log lines in the existing `Exit SnapshotSerializer` / replication progress logs;
     - if the signals become broadly useful, promote them to INFO/stats fields later.
   - Suggested rollout rule:
     - add observability before optimization, and require each new fast path to demonstrate that
       the expected path was actually exercised in tests.
   - Goal: validate which paths are actually hot and which optimizations are worth the risk.

### Phase 1 — PIT and PIT+tiered foundation

3. **Introduce explicit bucket-level completion state.**
   - **Prerequisites:** Phase 0.1–0.2.
   - Implement the per-snapshot-instance state machine described in
     [§1](#1-imprecise-bucket-completion-tracking): `NotVisited` → `Serializing` →
     `DelayedPending` → `Covered`, keyed by `BucketIdentity`.
   - Keep this state entirely instance-local to `SliceSnapshot` / `RestoreStreamer`.
   - Goal: replace vague "bucket iteration finished" reasoning with an explicit state machine
     that will later serve both PIT+tiered correctness and non-PIT decisions.

4. **Extend the wire format with tagged chunks.**
   - **Prerequisites:** none.
   - Implements the tagged-chunk format described in
     [§3](#3-shared-serializer-buffer-and-wire-format-coupling). Entries that may be split
     across preemption points are wrapped in a per-stream-tag envelope; single-chunk entries
     use the existing format unchanged (no overhead).
   - **Wire format:** `RDB_OPCODE_DF_MASK`-style flag bit (`DF_MASK_FLAG_CHUNKED`). When set,
     payload is `stream_tag: uint32, payload_length: uint32, payload: bytes`. Entries without
     the flag are unchanged.
   - **Enablement:** master-side flag (`--serialization_tagged_chunks`), not `DflyVersion`
     (which doesn't apply to DFS backups). The loader detects tagged chunks by the flag bit
     and reassembles transparently.
   - Pure format + loader-side work — no changes to serialization logic or locking. Can be
     developed independently of Phases 0–1.
   - **Scope:** replication and DFS backups. Only legacy `.rdb` format does not need tagged
     chunks (`SnapshotFlush::kDisallow`, no concurrent bucket serialization).
   - Why early: Phase 2 (item 7) needs separate serializers whose interleaved output requires
     tagged chunks for demuxing.
   - Goal: have the wire-format infrastructure ready before Phase 2 needs it.

5. **Associate delayed tiered serialization with bucket state.**
   - **Prerequisites:** 1.3.
   - Address the [tiered completion gap](#delayed-serialization-of-tiered-entities): associate
     `delayed_entries_` with their originating bucket instead of the global queue.
   - Only transition a bucket to `Covered` once its delayed tiered entries are emitted.
   - Goal: make "baseline fully emitted" precise, not just "bucket iteration finished".

6. **Use bucket completion state to harden PIT ordering guarantees.**
   - **Prerequisites:** 1.3 and 1.5.
   - Re-express the PIT ordering rule in terms of bucket completion state, not just mutex
     exclusion and `bucket.version`.
   - For in-memory values, PIT ordering is already sound by construction (sequential
     `OnDbChange` → mutation → `ConsumeJournalChange` on the same fiber). The real gap is
     **tiered delayed entries** (see
     [Delayed Serialization](#delayed-serialization-of-tiered-entities)): a journal entry
     can reach the buffer before the delayed baseline is drained.
   - **`OnDbChange` blocking:** block (fiber-aware wait) when the bucket is `Serializing` or
     `DelayedPending`; proceed on `NotVisited` (serialize now → `Covered`) or `Covered`
     (baseline already emitted). Because `OnDbChange` → mutation → `RecordJournal` →
     `ConsumeJournalChange` is sequential on the mutation fiber, blocking `OnDbChange`
     guarantees baseline-before-journal for all transaction-driven mutations.
   - **Deferred deletion queue** for
     [non-transaction journal paths](#journal-entries-without-ondbchange) (expiry, eviction —
     where `OnDbChange` is infeasible). When a deletion encounters a bucket in
     `Serializing`/`DelayedPending`, enqueue the key into a per-bucket
     `pending_deletions: vector<string>` (bounded by bucket capacity, typically 12–14 slots).
     The traversal fiber drains the queue — emitting deferred `DEL` entries — when
     transitioning the bucket to `Covered`. For `NotVisited`/`Covered` buckets, `DEL` is
     emitted immediately as today. Properties:
     - no blocking, re-entrancy, or preemption on the deletion fiber;
     - baseline-before-journal ordering preserved by construction.
   - After this item, `big_value_mu_` is no longer needed for journal ordering, but is still
     needed for [buffer exclusivity](#3-shared-serializer-buffer-and-wire-format-coupling)
     (items 7–8).
   - **Latency tradeoff:** blocking `OnDbChange` on `DelayedPending` can stall a mutation
     fiber for the duration of a tiered disk read (`Future<io::Result<string>>`). Acceptable
     for correctness; monitor and consider `KeepBoth` fallback if latency is excessive.
   - Use Phase 0 tests to validate PIT+tiered behavior under preemption and backpressure.
   - Goal: make the existing production path easier to reason about before adding new behavior.

### Phase 2 — Reduce PIT blocking and serializer fragility

7. **Give journal and bucket serialization separate `RdbSerializer` instances.**
   - **Prerequisites:** 1.4 and 1.6.
   - NOTE: maybe unnecessary if rely on 1.4.
   - Addresses the [shared buffer problem](#3-shared-serializer-buffer-and-wire-format-coupling)
     and the primary [shard-wide stall hazard](#blocking-under-big_value_mu_).
   - The fix: give journal entries their own `RdbSerializer` instance. Bucket serialization
     and journal serialization never share a buffer. Each produces tagged chunks (item 4)
     that the consumer (replica or DFS loader) reassembles by stream tag.
   - The same separation is needed for **DFS backups** (no journal, but still PIT): once
     per-bucket locks (item 6) replace the shard-wide `big_value_mu_`, two concurrent
     `SerializeBucket` calls can run on different buckets (traversal fiber on bucket A
     preempts mid-entry via `consume_fun_`, `OnDbChange` serializes bucket B). Each call
     needs its own buffer; tagged chunks allow their interleaved output to be reassembled.
   - With separate serializers, `big_value_mu_` is no longer needed for buffer exclusivity.
     `ConsumeJournalChange` writes to its own serializer without acquiring `big_value_mu_`
     at all (journal ordering is already guaranteed by bucket completion state from item 6).
   - The flushing strategy depends on value size:
     - **Small values (typical case):** `consume_fun_` is disabled (or made a no-op) while
       the lock is held. `SerializeBucket` serializes the entire bucket into the bucket
       serializer's buffer without preempting — the buffer grows but stays bounded because
       most buckets contain only small entries. After `SerializeBucket` returns and the lock
       is released, the accumulated buffer is flushed as a tagged chunk outside the lock.
     - **Large values (e.g., a 1 GB set):** the existing `kFlushMidEntry` boundaries become
       lock-release points. After serializing a bounded batch of elements, the lock is
       released, the accumulated chunk is flushed (with backpressure) outside the lock, and
       the lock is re-acquired for the next batch. Bucket contents remain stable across the
       gap because (a) PIT versioning prevents re-serialization and (b) `OnDbChange` blocking
       (item 6) prevents the mutation from committing. Both are required: (a) alone prevents
       double-serialization but not mid-value mutation; (b) alone prevents mutation but not
       concurrent `SerializeBucket` entry.
   - Goal: eliminate blocking under `big_value_mu_` by removing the shared-buffer reason for
     holding it, rather than by restructuring the lock/unlock pattern around the same buffer.

8. **Simplify `rec_id_` / `seq_cond_` ordering once tagged-chunk delivery is proven.**
   - **Prerequisites:** 2.7, 1.4.
   - With tagged chunks support, we may not need a consistent global order between different
     fibers. In that case `rec_id_` / `seq_cond_.wait` become redundant.
   - Remove `rec_id_` / `seq_cond_` only after demonstrating (via tests and observability)
     that we do not corrupt the replication stream.
   - Goal: avoid removing an ordering mechanism before its replacement is demonstrably sound.

9. **Narrow `big_value_mu_` for PIT only after the above is proven.**
   - **Prerequisites:** 2.7–2.8.
   - Keep serialize-before-mutate semantics intact.
   - Remove or narrow mutex roles only where bucket state, serializer isolation, and
     tagged-chunk delivery already provide an equivalent correctness guarantee.
   - Goal: simplify the active production path incrementally, not speculatively.

### Phase 3 — Bring non-PIT onto the new foundation

10. **Add non-PIT-specific guardrails before changing non-PIT behavior.**
    - **Prerequisites:** 1.3 and 1.5.
    - Add focused tests for:
      - self-contained journal entries produce correct final state when baseline is fully
        emitted before or after the journal entry (no mid-entry interleaving);
      - moved items that cross the cursor are not lost;
      - any first non-PIT bucket-state redesign still converges under concurrent full-sync writes.
    - Suggested test strategy:
      - add a dedicated test with `point_in_time_snapshot=false` that mutates only with
        self-contained emitted commands (`SET`, `DEL`, `BITOP` rewritten to `SET`/`DEL`);
      - rehabilitate the currently skipped `test_replication_onmove_flow` instead of replacing
        it; if it is too flaky for CI, first reduce it to a smaller deterministic reproducer that
        still asserts both replica equality and `moved_saved > 0` from snapshot logs;
      - add non-PIT-specific observability such as counting how often `OnMoved` actually
        serializes a bucket and optionally classifying self-contained vs baseline-dependent
        journal entries by emitted command.
    - Goal: avoid touching experimental non-PIT behavior without dedicated guardrails.

11. **Stamp bucket version in non-PIT mode behind a feature flag.**
    - **Prerequisites:** 1.3 and 1.5 and 3.10.
    - Teach non-PIT `SerializeBucket` to call `SetVersion(snapshot_version_)`.
    - Since non-PIT is experimental, prefer the simplest implementation that matches the new
      bucket-state model rather than preserving legacy bookkeeping.
    - Validate that traversal, `OnMoved`, and any remaining bucket-version assumptions remain
      correct under the new design.
    - Goal: align non-PIT with the new foundation, not preserve its old implementation details.

12. **Implement self-contained journal classification in `ConsumeJournalChange`.**
    - **Prerequisites:** 3.11.
    - Classify emitted journal commands as self-contained vs baseline-dependent.
    - Initially use a conservative allowlist (`SET`, `DEL`, rewritten `BITOP`).
    - Skip `big_value_mu_` only for self-contained entries in non-PIT mode.
    - Goal: harvest the simplest safe non-PIT redesign win first, on top of the PIT-hardened
      foundation.

13. **Add instance-local suppression state for `SkipBoth`.**
    - **Prerequisites:** 1.3 and 1.5 and 3.11.
    - Let `OnDbChange` record a local suppression decision for mutations whose effects will be
      covered by future traversal.
    - Let the same snapshot instance's `ConsumeJournalChange` consult and clear that state.
    - Do not introduce shard-wide or cross-instance aggregation.
    - Goal: keep the redesign entirely within the existing per-instance callback pair, without
      carrying forward unnecessary experimental structure.

14. **Implement `SkipBoth` for the narrowest safe mutation subset.**
    - **Prerequisites:** 3.13.
    - Start with single-key, single-bucket, in-memory updates only.
    - Exclude inserts, deletes, rehash-triggering operations, and tiered cases.
    - Require bucket state to be `NotVisited` or `Serializing`.
    - Goal: prove the mechanism on a subset where correctness is easy to reason about.

15. **Expand `SkipBoth` eligibility only after targeted validation.**
    - **Prerequisites:** 3.14.
    - Re-evaluate `DelayedPending` once delayed-entry ownership is explicit.
    - Re-evaluate inserts only if bucket-touch coverage can be proven cheaply.
    - Re-evaluate tiered keys only if suppression can be tied to delayed-entry completion.
    - Goal: expand cautiously instead of generalizing the hard cases upfront.

### Phase 4 — Reassess `big_value_mu_` globally

16. **Narrow the lock's role by mode and path.**
    - **Prerequisites:** 2.9 for PIT changes; 3.12–3.15 for non-PIT changes.
    - PIT: keep only what is still required for serialize-before-mutate correctness.
    - non-PIT: remove it from self-contained journal entries first; then reconsider `OnMoved`
      and traversal interactions once serialization becomes non-preempting.
    - Goal: shrink the lock surface incrementally instead of attempting full removal at once;
      for non-PIT there is no obligation to preserve locking structure that exists only because
      of the experimental implementation.

17. **Attempt full `big_value_mu_` removal only after all prerequisites are in place.**
    - **Prerequisites:** 4.16.
    - Preconditions:
      - non-preempting bounded serialization chunks,
      - precise bucket coverage state,
      - delayed tiered ownership tracked to completion,
      - journal ordering independent of the mutex,
      - tests covering PIT, non-PIT, `OnMoved`, and tiered cases.
    - Goal: ensure lock removal is the final simplification step, not the first risky rewrite.


================================================
FILE: docs/thread-per-core.excalidraw
================================================
{
  "type": "excalidraw",
  "version": 2,
  "source": "https://excalidraw.com",
  "elements": [
    {
      "type": "text",
      "version": 158,
      "versionNonce": 1897755639,
      "isDeleted": false,
      "id": "N2nJ6OaFNRqcFW23SO0u2",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 714.625,
      "y": 507.5390625000001,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 90,
      "height": 20,
      "seed": 1339600844,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676475959,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "I/O thread",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "I/O thread"
    },
    {
      "type": "text",
      "version": 212,
      "versionNonce": 1838113753,
      "isDeleted": false,
      "id": "pZs66qxoJlWQcWuBsvAxk",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 829.125,
      "y": 509.4140625000001,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 90,
      "height": 20,
      "seed": 1172993740,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676475959,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "I/O thread",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "I/O thread"
    },
    {
      "type": "text",
      "version": 223,
      "versionNonce": 1421110391,
      "isDeleted": false,
      "id": "qhrDskacRkr-tNl2Q3atR",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 948.6875,
      "y": 508.02455357142867,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 90,
      "height": 20,
      "seed": 1936794996,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676504307,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "I/O thread",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "I/O thread"
    },
    {
      "type": "rectangle",
      "version": 344,
      "versionNonce": 1641244985,
      "isDeleted": false,
      "id": "jPwIU_a9_nxvuDFAcbzxM",
      "fillStyle": "cross-hatch",
      "strokeWidth": 1,
      "strokeStyle": "dotted",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 712.375,
      "y": 537.2500000000001,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 431,
      "height": 30,
      "seed": 1029717964,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "type": "text",
          "id": "U2-I9a2X4amHnB7NZFWGv"
        }
      ],
      "updated": 1658676541606,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 239,
      "versionNonce": 1717412567,
      "isDeleted": false,
      "id": "U2-I9a2X4amHnB7NZFWGv",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 717.375,
      "y": 542.2500000000001,
      "strokeColor": "#000000",
      "backgroundColor": "transparent",
      "width": 421,
      "height": 20,
      "seed": 1592449524,
      "groupIds": [],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676541606,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "message bus",
      "baseline": 14,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "jPwIU_a9_nxvuDFAcbzxM",
      "originalText": "message bus"
    },
    {
      "type": "rectangle",
      "version": 315,
      "versionNonce": 208875257,
      "isDeleted": false,
      "id": "mBFE2wiT175ZxMSdmWcvQ",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 712.375,
      "y": 305.7916666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 77,
      "height": 192,
      "seed": 352036980,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "type": "text",
          "id": "tK1EcrkpG35slJ07z1dTT"
        }
      ],
      "updated": 1658676546251,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 194,
      "versionNonce": 181803287,
      "isDeleted": false,
      "id": "tK1EcrkpG35slJ07z1dTT",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 717.375,
      "y": 376.7916666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 67,
      "height": 50,
      "seed": 1251432308,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "thread\n1",
      "baseline": 43,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "mBFE2wiT175ZxMSdmWcvQ",
      "originalText": "thread\n1"
    },
    {
      "type": "rectangle",
      "version": 430,
      "versionNonce": 1426120247,
      "isDeleted": false,
      "id": "BY5OdEEKT0Y_DTy9Zgr9C",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 833.375,
      "y": 306.4166666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 77,
      "height": 192,
      "seed": 1621471436,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "sIrssFTnnb9f1o26g1j88",
          "type": "text"
        },
        {
          "type": "text",
          "id": "sIrssFTnnb9f1o26g1j88"
        }
      ],
      "updated": 1658676546251,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 310,
      "versionNonce": 514622649,
      "isDeleted": false,
      "id": "sIrssFTnnb9f1o26g1j88",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 838.375,
      "y": 377.4166666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 67,
      "height": 50,
      "seed": 711168500,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "thread\n2",
      "baseline": 43,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "BY5OdEEKT0Y_DTy9Zgr9C",
      "originalText": "thread\n2"
    },
    {
      "type": "text",
      "version": 76,
      "versionNonce": 1406533463,
      "isDeleted": false,
      "id": "45U617mr0L9ob4mc7Xozt",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 845.375,
      "y": 260.0865384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 53,
      "height": 40,
      "seed": 1285924468,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard\nthread",
      "baseline": 34,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard\nthread"
    },
    {
      "type": "text",
      "version": 85,
      "versionNonce": 2081260953,
      "isDeleted": false,
      "id": "vY-LnNlhD3qWMEtRPoU0t",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 964.9375,
      "y": 260.0865384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 53,
      "height": 40,
      "seed": 817296972,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard\nthread",
      "baseline": 34,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard\nthread"
    },
    {
      "type": "rectangle",
      "version": 458,
      "versionNonce": 190540409,
      "isDeleted": false,
      "id": "xvkm28eoejETjF3M78jpN",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1062.125,
      "y": 310.875,
      "strokeColor": "#000000",
      "backgroundColor": "#fa5252",
      "width": 77,
      "height": 187,
      "seed": 1482008524,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "nSQOBHdmN0bLo5OeoOD0P",
          "type": "text"
        },
        {
          "type": "text",
          "id": "nSQOBHdmN0bLo5OeoOD0P"
        }
      ],
      "updated": 1658676546251,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 337,
      "versionNonce": 2051102103,
      "isDeleted": false,
      "id": "nSQOBHdmN0bLo5OeoOD0P",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1067.125,
      "y": 379.375,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 67,
      "height": 50,
      "seed": 1058179828,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "thread\n4",
      "baseline": 43,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "xvkm28eoejETjF3M78jpN",
      "originalText": "thread\n4"
    },
    {
      "type": "text",
      "version": 156,
      "versionNonce": 1163506521,
      "isDeleted": false,
      "id": "H72xWL9unzb1mQiLvx7L4",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 1074.125,
      "y": 265.7115384615385,
      "strokeColor": "#000000",
      "backgroundColor": "#fab005",
      "width": 53,
      "height": 40,
      "seed": 1704611020,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 16,
      "fontFamily": 1,
      "text": "shard\nthread",
      "baseline": 34,
      "textAlign": "center",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "shard\nthread"
    },
    {
      "type": "rectangle",
      "version": 510,
      "versionNonce": 1046208569,
      "isDeleted": false,
      "id": "jj-MVcNrzcH0DbFFo9noF",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 952.9375,
      "y": 310.1666666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 77,
      "height": 193,
      "seed": 1374694167,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [
        {
          "id": "NxhycN5eOsL0I52k0H-lh",
          "type": "text"
        },
        {
          "id": "NxhycN5eOsL0I52k0H-lh",
          "type": "text"
        },
        {
          "type": "text",
          "id": "NxhycN5eOsL0I52k0H-lh"
        }
      ],
      "updated": 1658676546251,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 391,
      "versionNonce": 1308367831,
      "isDeleted": false,
      "id": "NxhycN5eOsL0I52k0H-lh",
      "fillStyle": "hachure",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 957.9375,
      "y": 381.6666666666667,
      "strokeColor": "#000000",
      "backgroundColor": "#fd7e14",
      "width": 67,
      "height": 50,
      "seed": 617412057,
      "groupIds": [
        "DYa5vdmfX68EvWPAq2Beo"
      ],
      "strokeSharpness": "sharp",
      "boundElements": [],
      "updated": 1658676546251,
      "link": null,
      "locked": false,
      "fontSize": 20,
      "fontFamily": 1,
      "text": "thread\n3",
      "baseline": 43,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "jj-MVcNrzcH0DbFFo9noF",
      "originalText": "thread\n3"
    }
  ],
  "appState": {
    "gridSize": null,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
}


================================================
FILE: docs/transaction.md
================================================
# Life of a transaction

This document describes how Dragonfly transactions provide atomicity and serializability for its multi-key and multi-command operations.

## Definitions

### Serializability

Serializability is an isolation level for database transactions. Serializability describes multiple transactions, where a transaction is usually composed of multiple operations on multiple objects.

Database can executed transactions in parallel (and the operations in parallel). Serializability guarantees the result is the same with, as if the transactions were executed one by one. i.e. to behave like executed in a serial order.

Serializability doesn’t guarantee the resulting serial order respects recency. I.e. the serial order can be different from the order in which transactions were actually executed. E.g. Tx1 begins earlier than Tx2, but the result behaves as if Tx2 executed before Tx1. That is also to say, to satisfy the same Serializability, there can be more than one possible execution schedulings.

### Strict Serializability

Strict serializability means that operations appear to have occurred in some order, consistent with the real-time ordering of those operations; e.g. if operation A completes before operation B begins, then A should appear to precede B in the serialization order.

Strict serializability implies atomicity meaning, a transaction’s sub-operations do not appear to interleave with sub-operations from other transactions. It also implies serializability
by definition (appear in some order...).

Note that simple, single-key operations in Dragonfly are already strictly serializable because in a shared-nothing architecture each shard-thread performs operations on its keys sequentially.
The complexity rises when we need to provide strict-serializability (aka serializability and linearizability) for operations spawning multiple keys.

## Transactions high level overview
Transactions in Dragonfly are orchestrated by an abstract entity, called coordination layer.
In reality, a client connection instance takes on itself the role of a coordinator: it coordinates a transaction every time it drives a redis or memcached command to completion. The algorithm behind Dragonfly transactions is based on the [VLL paper](https://www.cs.umd.edu/~abadi/papers/vldbj-vll.pdf).

Every step within a coordinator is done sequentially. Therefore, it's easier to describe the flow using a sequence diagram. Below is a sequence diagram of a generic transaction consisting of multiple execution steps. In this diagram, the operation it executes touches keys in two different shards: `Shard1` and `Shard2`.

```mermaid
%%{init: {'theme':'base'}}%%
sequenceDiagram
    participant C as Coordinator
    participant S1 as Data Shard 1
    participant S2 as Data Shard 2

    par hop1
    C->>+S1: Schedule
    and
    C->>+S2: Schedule
    S1--)C: Ack
    S2--)C: Ack
    end

    par hop2
    C->>S1: Exec1
    and
    C->>S2: Exec1
    S1--)C: Ack
    S2--)C: Ack
    end
    par hop N+1
    C->>S1: Exec N+Fin
    and
    C->>S2: Exec N+Fin
    S1--)-C: Ack
    S2--)-C: Ack
    end
```

The shared-nothing architecture of Dragonfly does not allow accessing each shard data directly from a coordinator fiber. Instead, the coordinator sends messages to the shards and instructs them what to do at each step. Every time, the coordinator sends a message, it blocks until it gets an answer. We call such interaction a *message hop* or a *hop* in short.

The flow consists of two different phases: *scheduling* a transaction, and *executing* it. The execution phase may consist of one or more hops, depending on the complexity of the operation we model.

*Note, that only the coordinator fiber is blocked. Its thread can still execute other fibers - like processing requests on other connections or handling operations for the shard it owns. This is the advantage of adopting fibers - they allow us to separate the execution context from OS threads.*

## Scheduling a transaction

The transaction initiates with a scheduling hop, during which the coordinator sends to each shard the keys that shards handle. The coordinator sends messages to multiple shards asynchronously but it waits until all shards ack and confirm that the scheduling succeeded before it proceeds to the next steps.

When the scheduling message is processed by a data shard, it adds the transaction to its local transaction queue (tx-queue). In order to provide serializability, i.e. to make sure that all shards order their scheduled transactions in the same order, Dragonfly maintains a global sequence counter that is used to induce a total order for all its transactions.

This global counter is shared by all coordinator entities and is represented by an atomic integer. *This counter may be a source of contention - it breaks the shared nothing model, after all. However, in practice, we have not observed a significant impact on Dragonfly performance due to other optimizations we added. These will be detailed in the [Optimization](#optimizations) section below.

Transactions in tx-queue in each shard are arranged by their sequence counter.

As shown in the snippet below, a shard thread may receive transactions in a different sequence, so a transaction with a smaller id can be added to the tx-queue after a transaction with a larger id. If the scheduling algorithm running on the data shard, can not reorder the last added transaction, it fails the scheduling request. In that case, the coordinator reverts the scheduling operation by removing the tx from the shards, and retries the whole hop again by allocating a new sequence number. In reality the fail-rate of a scheduling attempt is low and the retries are rare (subject to contention on the keys). Note, inconsistent reordering happens when two coordinators try to schedule multi-shard transactions concurrently:

```
C1: enqueue msg to Shard1 to schedule T1
C2: enqueue msg to Shard1 to schedule T2  # enqueued earlier than C1

C1: enqueue msg to Shard2 to schedule T1
C2: enqueue msg to Shard2 to schedule T2 # enqueued later than C1

shard1: pull T2, add it to TxQueue, pull T1, add it to TxQueue
shard2: pull T1, add it to TxQueue, pull T2, add it to TxQueue

TxQueue1: T2, T1  # wrong order
TxQueue2: T1, T2
```


Once the transaction is added to the tx-queue, the shard also marks the tx-keys using the *intent* locks. Those locks do not block the flow of the underlying operation but merely express the intent to touch or modify the key. In reality, they are represented by a map: `lock:str->counter`. If `lock[key] == 2` it means the tx-queue has 2 pending transactions that plan to modify `key`. These intent locks are used for optimizations detailed below and are not required to implement the naive version of VLL algorithm.

Once the scheduling hops converges, it means that the transaction entered the execution phase, in which it never rollbacks, or retries. Once it's been scheduled, VLL guarantees the progress of subsequent execution operations while providing strict-serializability guarantees.

It's important to note that a scheduled transaction does not hold exclusivity on its keys. There could be other transactions that still mutate the keys it touches - these transactions were scheduled earlier and have not finished running yet, or even have not even started running.

## Executing a transaction

Once the transaction is scheduled, the coordinator starts sending the execution messages. We break each command to one or more micro-ops and each operation corresponds to a single message hop.

For example, "MSET" corresponds to a single micro-op "mset" that has the same semantics, but runs in parallel on all the involved shards.

However, "RENAME" requires two micro-ops: fetching the data from two keys, and then the second hop - deleting/writing a key (depending whether the key is a source or a destination).

Once a coordinator sends the micro-op request to all the shards, it waits for an answer. Only when all shards executed the micro-op and return the result, the coordinator is unblocked and it can proceed to the next hop. The coordinator is allowed to process the intermediary responses from the previous hops in order to define the next execution request.

When a coordinator sends an execution request to data shards, it also specifies whether
this execution is the last hop for that command. This is necessary, so that shards could do clean-up operations when running the last execution request: unlocking the keys and removing the transaction from the tx-queue.

The shards always execute transactions at the head of the tx-queue. When the last execution hop for that transaction is executed the transaction is removed from the queue and the next one can be executed. This way we maintain the ordering guarantees specified by the scheduling order of the transactions and we maintain
the serializability of operations across multiple shards.

## Multi-op transactions (Redis transactions)

Redis transactions (MULTI/EXEC sequences) and commands produced by Lua scripts are modelled as consecutive commands within a Dragonfly transaction. In order to avoid ambiguity with terms, we call a Redis transaction - a multi-transaction in Dragonfly.

The multi feature of the transactional framework allows running consecutive commands without rescheduling the transaction for each command as if they are part of one single transaction. This feature is transparent to the commands itself, so no changes are required for them to be used in a multi-transaction.

There are three modes called "multi modes" in which a multi transaction can be executed, each with its own benefits and drawbacks.

__1. Global mode__

The transaction is equivalent to a global transaction with multiple hops. It is scheduled globally and the commands are executed as a series of consequitive hops. This mode is required for global commands (like MOVE) and for accessing undeclared keys in Lua scripts. Otherwise, it should be avoided, because it prevents Dragonfly from running concurrently and thus greatly decreases throughput.

__2. Lock ahead mode__

The transaction is equivalent to a regular transaction with multiple hops. It is scheduled on all keys used by the commands in the transaction block, or Lua script, and the commands are executed as a series of consecutive hops.

__3. Non atomic mode__

All commands are executed as separate transactions making the multi-transaction not atomic. It vastly improves the throughput with contended keys, as locks are acquired only for single commands. This mode is useful for Lua scripts without atomicity requirements.

## Multi-op command squashing

There are two fundamental problems to executing a series of consecutive commands on Dragonfly:
* each command invocation requires an expensive hop
* executing commands sequentially makes no use of our multi-threaded architecture

Luckily we can make one important observation about command sequences. Given a sequence of commands _where each command needs to access only a single shard_, we can conclude that as long as they are part of one atomic transaction:
* each command needs to preserve its order only relative to other commands accessing the same shard
* commands accessing different shards can run in parallel

The basic idea behind command squashing is identifying consecutive series of single-shard commands and separating them by shards, while maintaing their relative order withing each shard. Once the commands are separated, we can execute a single hop on all relevant shards. Within each shard the hop callback will execute one by one only those commands, that assigned to its respective shard. Because all commands are already placed on their relevant threads, no further hops are required and all command callbacks are executed inline.

Reviewing our initial problems, command squashing:
* Allows executing many commands with only one hop
* Allows executing commands in pararllel

## Optimizations
Out of order transactions - TBD

## Blocking commands (BLPOP)

Redis has a rich api with around 200 commands. Few of those commands provide blocking semantics, which allow using Redis as publisher/subscriber broker.

Redis (when running as a single node) is famously single threaded, and all its operations are strictly serializable. In order to build a multi-threaded memory store with the equivalent semantics as Redis, we had to design an algorithm that can parallelize potentially blocking operations and still provide strict serializability guarantees. This section focuses mainly on how to solve this challenge for BLPOP (BRPOP) command since it involves coordinating multiple keys and is considered the more complicated case. Other blocking commands can benefit from the same principles.


### BLPOP spec

BLPOP key1 key2 key3 0

*BLPOP is a blocking list pop primitive. It is the blocking version of LPOP because it blocks the client connection when there are no elements to pop from any of the given lists. An element is popped from the head of the first list that is non-empty, with the given keys being checked in the order that they are given.*

### Non-blocking behavior of BLPOP
When BLPOP is called, if at least one of the specified keys contains a non-empty list, an element is popped from the head of the list and returned to the caller together with the key it was popped from. Keys are checked in the order that they are given. Let's say that the key1 doesn't exist and key2 and key3 hold non-empty lists. Therefore, in the example above, BLPOP returns the element from list2.

### Blocking behavior
If none of the specified keys exist, BLPOP blocks the connection until another client performs a LPUSH or RPUSH operation against one of the keys. Once new data is present on one of the lists, the client returns with the name of the key unblocking it and the popped value.

### Ordering semantics
If a client tries to wait on multiple keys, but at least one key contains elements, the returned key / element pair is the first key from left to right that has one or more elements. In this case the client will not be blocked. So for instance, BLPOP key1 key2 key3 key4 0, assuming that both key2 and key4 are non-empty, will always return an element from key2.

If multiple clients are blocked for the same key, the first client to be served is the one that was waiting longer (the first that was blocked for the key). Once a client is unblocked it does not retain any priority, when it blocks again with the next call to BLPOP, it will be served according to the queue order of clients already waiting for the same key.

When a client is blocking on multiple keys at the same time, and elements are becoming available at the same time in multiple keys (because of a transaction), the client will be unblocked with the first key on the left that received data via push operation (assuming it has enough elements to serve our client, as there could be earlier clients waiting for this key as well).

### BLPOP and transactions
If multiple elements are pushed either via a transaction or via variadic arguments of LPUSH command then BLPOP is waked after that transaction or command completely finished. Specifically, when a client performs
`LPUSH listkey a b c`, then `BLPOP listkey 0` will pop `c`, because `lpush` pushes first `a`, then `b` and then `c` which will be the first one on the left.

If a client executes a transaction that first pushes into a list and then pops from it atomically, then another client blocked on `BLPOP` won’t pop anything, because it waits for the transaction to finish. When BLPOP itself is run in a transaction its blocking behavior is disabled and it returns the “timed-out” response if there is no element to pop.

### Complexity of implementing BLPOP in Dragonfly
The ordering semantics of BLPOP assume total order of the underlying operations. BLPOP must “observe” multiple keys simultaneously in order to determine which one is non-empty in left-to-right order. If there are no keys with items, BLPOP blocks, waits, and “observes” which key is being filled first.

For the single-threaded Redis the order is determined by following the natural execution of operations inside the main execution thread.  However, for a multi-threaded, shared-nothing execution, there is no concept of total order or a global synchronized timeline. For non-blockign scenario, "observing" keys is atomic because we lock the keys when executing a command in Dragonfly.

However with blocking scenario for BLPOP, we do not have a built-in mechanism to determine which key was filled earlier - since, as stated, the concept of total order does not exist for multiple shards.

### Interesing examples to consider:

**Ex1:**
```
client1: blpop X, Y  // blocks
client2: lpush X A
client3: exist X Y
```

Client3 should always return 0.

**Ex2:**

```
client1: BLPOP X Y Z
client2: RPUSH X A
client3: RPUSH X B;  RPUSH Y B
```

**Ex3:**

```
client1: BLPOP X Y Z
client2: RPUSH Z C
client3: RPUSH X A
client4: RPUSH X B; RPUSH Y B
```

### BLPOP Ramblings
There are two cases of how a key can appear and wake a blocking `BLPOP`:

a. with lpush/rpush/rename commands.
b. via multi-transaction.

`(a)` is actually easy to reason about, because those commands operate on a single key and single key operations are strictly serializable in shared-nothing architecture.

With `(b)` we need to consider the case where we have "BLPOP X Y 0" and then a multi-transaction fills both `y` and `x` using multiple "lpush" commands. Luckily, a multi-transaction in Dragonfly introduces a global barrier across all its shards, and it does not allow any other transactions to run as long as it does not finish. So the blocking "blpop" won't be awaken until the multi-transaction finishes its run. By that time the state of the keys will be well defined and "blpop" will be able to choose the first non empty key to pop from.


## Background reading:

### Strict Serializability
Here is a [very nice diagram](https://jepsen.io/consistency) showing how various consistency models relate.

Single node Redis is strictly serializable because all its operation are executed sequentially
and atomically in a single thread.

More formally: following the definition from https://jepsen.io/consistency/models/strict-serializable - due to the single threaded design of Redis, its transactions are executed in a global order, which is consistent with the main thread clock, hence it’s strictly serializable.

Serializability is a global property that given a transaction log, there is an order with which transactions are consistent (the log order is not relevant).

Example of serializable but not linearizable transaction: https://gist.github.com/pbailis/8279494

More material to read:
* [Fauna Serializability vs Linearizability](https://fauna.com/blog/serializability-vs-strict-serializability-the-dirty-secret-of-database-isolation-levels)
* [Jepsen consistency diagrams](https://jepsen.io/consistency)
* [Strict Serializability definition](https://jepsen.io/consistency/models/strict-serializable)
* [Example of serializable but not linearizable schedule](https://gist.github.com/pbailis/8279494)
* [Atomic clocks and distributed databases](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/)
* [Another cockroach article about consistency](https://www.cockroachlabs.com/blog/consistency-model/)
* [Abadi blog](http://dbmsmusings.blogspot.com/)
* [Peter Beilis blog](http://www.bailis.org/blog) (both wrote lots of material on the subject)


================================================
FILE: fuzz/FUZZING.md
================================================
# AFL++ Fuzzing for Dragonfly

## Install AFL++

AFL++ must be built from source with `AFL_PERSISTENT_RECORD` enabled for crash replay.

```bash
sudo apt update
sudo apt install llvm-18-dev clang-18 lld-18 gcc-13-plugin-dev

git clone --depth=1 --branch v4.34c https://github.com/AFLplusplus/AFLplusplus.git
cd AFLplusplus

# Enable AFL_PERSISTENT_RECORD (required for stateful crash replay)
sed -i 's|// #define AFL_PERSISTENT_RECORD|#define AFL_PERSISTENT_RECORD|' include/config.h

make distrib
sudo make install
```

## Prepare System

```bash
sudo afl-system-config
```

`run_fuzzer.sh` also runs these checks automatically (core_pattern, CPU governor).

## Build Dragonfly

```bash
cmake -B build-dbg -DUSE_AFL=ON -DCMAKE_BUILD_TYPE=Debug -GNinja
ninja -C build-dbg dragonfly
```

## Run Fuzzer

```bash
cd fuzz
./run_fuzzer.sh              # RESP protocol (default)
./run_fuzzer.sh memcache     # Memcache text protocol
```

Configuration via environment variables:

| Variable | Default | Description |
|----------|---------|-------------|
| `AFL_PROACTOR_THREADS` | `1` | Server threads (1 = most stable coverage) |
| `AFL_LOOP_LIMIT` | `10000` | Iterations before server restart (= `AFL_PERSISTENT_RECORD`) |
| `BUILD_DIR` | `build-dbg` | Path to build directory |

## Custom Mutators

Each target has a custom AFL++ mutator that operates at the protocol level.
Instead of flipping random bytes (which mostly breaks protocol framing and
gets rejected by the parser), they:

- Parse input into a list of commands
- Mutate at the command/argument level (replace command, change argument,
  insert/remove commands, swap order)
- Serialize back to valid protocol format

| Target | Mutator | Details |
|--------|---------|---------|
| `resp` | `resp_mutator.py` | 150+ Redis commands, wraps in MULTI/EXEC |
| `memcache` | `memcache_mutator.py` | Store/get/meta commands, noreply toggle |

Mutators are loaded automatically by `run_fuzzer.sh`. AFL++'s built-in
byte-level mutations also run alongside them (useful for parser edge cases).

To use only the custom mutator: `export AFL_CUSTOM_MUTATOR_ONLY=1`.

## Crash Replay

Dragonfly uses AFL++ persistent mode — the server accumulates state across
iterations. A crash at iteration N depends on state built by inputs 1..N-1.

`run_fuzzer.sh` syncs `AFL_PERSISTENT_RECORD` with `afl_loop_limit`
so the full state history is always available on crash.

When a crash occurs, AFL++ saves:
```
crashes/id:000000,sig:06,...           # the crashing input
crashes/RECORD:000000,cnt:000000      # first input after server start
crashes/RECORD:000000,cnt:000001      # second input
...
crashes/RECORD:000000,cnt:NNNNNN      # input before the crash
```

### Replay (RESP)

```bash
./build/dragonfly --port 6379 --logtostderr --proactor_threads 1 --dbfilename=""

python3 fuzz/replay_crash.py fuzz/artifacts/resp/default/crashes 000000
```

### Replay (memcache)

```bash
./build/dragonfly --port 6379 --memcached_port=11211 --logtostderr --proactor_threads 1 --dbfilename=""

python3 fuzz/replay_crash.py fuzz/artifacts/memcache/default/crashes 000000 127.0.0.1 11211
```

### Package crash for sharing

```bash
cd fuzz
# RESP
./package_crash.sh 000000
# Memcache
./package_crash.sh 000000 fuzz/artifacts/memcache/default/crashes
```

Creates `crash-000000.tar.gz` containing crash data and `replay_crash.py`.
The recipient runs:

```bash
# RESP
./build/dragonfly --port 6379 --logtostderr --proactor_threads 1 --dbfilename=""
python3 replay_crash.py crashes 000000

# Memcache
./build/dragonfly --port 6379 --memcached_port=11211 --logtostderr --proactor_threads 1 --dbfilename=""
python3 replay_crash.py crashes 000000 127.0.0.1 11211
```

## Seed Corpus

| Target | Directory | Seeds | Coverage |
|--------|-----------|-------|----------|
| `resp` | `seeds/resp/` | 79 | string, list, hash, set, zset, stream, JSON, search, bloom, geo, HLL, bitops, scripting, ACL, pub/sub, transactions, server ops |
| `memcache` | `seeds/memcache/` | 15 | set/get, add/replace, append/prepend, cas, incr/decr, delete, multiget, gat, noreply, meta commands, flush, stats |

To add a new RESP seed:
```
*3
$3
SET
$3
key
$5
value
```

To add a new memcache seed:
```
set mykey 0 0 5
hello
get mykey
```


================================================
FILE: fuzz/dict/memcache.dict
================================================
# Memcache text protocol dictionary for AFL++

# Store commands
"set"
"add"
"replace"
"append"
"prepend"
"cas"

# Retrieval commands
"get"
"gets"
"gat"
"gats"

# Utility commands
"delete"
"incr"
"decr"
"flush_all"
"stats"
"version"
"quit"

# Meta commands
"ms"
"mg"
"md"
"ma"
"mn"
"me"

# Flags/options
"noreply"

# Common keys
"key"
"mykey"
"k1"
"k2"
"k3"
"counter"

# Numbers
"0"
"1"
"5"
"10"
"100"
"1000"
"65535"
"4294967295"
"99999999999"

# Expiry values
"0"
"30"
"3600"
"9999999"

# Line endings
"\x0d\x0a"

# Partial commands for edge cases
"set "
"get "
"delete "
"incr "
"decr "
"cas "
"gat "

# Malformed patterns
"\x0d"
"\x0a"
"\x00"
"\xff"
" "
"  "
""


================================================
FILE: fuzz/dict/resp.dict
================================================
# AFL++ dictionary for RESP protocol
# Dragonfly command keywords and common patterns

# RESP protocol markers
"*"
"$"
"+"
"-"
":"
"\x0d\x0a"

# Common commands - String operations
"GET"
"SET"
"MGET"
"MSET"
"INCR"
"DECR"
"APPEND"
"STRLEN"
"SETEX"
"SETNX"
"GETSET"
"GETRANGE"
"SETRANGE"

# List operations
"LPUSH"
"RPUSH"
"LPOP"
"RPOP"
"LLEN"
"LRANGE"
"LINDEX"
"LSET"
"LTRIM"

# Hash operations
"HSET"
"HGET"
"HMSET"
"HMGET"
"HGETALL"
"HDEL"
"HEXISTS"
"HLEN"
"HKEYS"
"HVALS"
"HINCRBY"

# Set operations
"SADD"
"SREM"
"SMEMBERS"
"SISMEMBER"
"SCARD"
"SINTER"
"SUNION"
"SDIFF"
"SPOP"

# Sorted set operations
"ZADD"
"ZREM"
"ZRANGE"
"ZRANGEBYSCORE"
"ZRANK"
"ZSCORE"
"ZCARD"
"ZCOUNT"
"ZINCRBY"

# Key operations
"DEL"
"EXISTS"
"EXPIRE"
"TTL"
"PERSIST"
"KEYS"
"SCAN"
"TYPE"
"RENAME"
"RENAMENX"

# Transaction commands
"MULTI"
"EXEC"
"DISCARD"
"WATCH"
"UNWATCH"

# Pub/Sub commands
"PUBLISH"
"SUBSCRIBE"
"UNSUBSCRIBE"
"PSUBSCRIBE"
"PUNSUBSCRIBE"

# Stream commands
"XADD"
"XREAD"
"XRANGE"
"XLEN"
"XDEL"
"XTRIM"
"XGROUP"
"XREADGROUP"

# JSON commands
"JSON.SET"
"JSON.GET"
"JSON.DEL"
"JSON.TYPE"
"JSON.NUMINCRBY"
"JSON.ARRAPPEND"
"JSON.ARRLEN"

# Bloom filter commands
"BF.ADD"
"BF.EXISTS"
"BF.RESERVE"
"BF.MADD"
"BF.MEXISTS"

# HyperLogLog commands
"PFADD"
"PFCOUNT"
"PFMERGE"

# Geo commands
"GEOADD"
"GEODIST"
"GEORADIUS"
"GEOSEARCH"

# Server commands
"PING"
"ECHO"
"INFO"
"DBSIZE"
"SELECT"

# Cluster commands
"CLUSTER"
"READONLY"
"READWRITE"

# Common keys for testing
"key"
"mykey"
"key1"
"key2"
"test"
"foo"
"bar"
"user:1"
"session:123"

# Common values
"value"
"hello"
"world"
"123"
"0"
"1"
"-1"

# Number patterns (0, 1, -1 already above)
"100"
"1000"
"-100"

# Special arguments
"NX"
"XX"
"EX"
"PX"
"GT"
"LT"
"WITHSCORES"
"LIMIT"
"COUNT"
"MATCH"

# Small RESP framing patterns (larger patterns removed — AFL++ warned about >33B tokens)
"*1\x0d\x0a$"
"*2\x0d\x0a$"
"*3\x0d\x0a$"

# Scripting commands
"EVAL"
"EVALSHA"
"EVAL_RO"
"EVALSHA_RO"
"SCRIPT"

# Bitfield commands
"BITFIELD"
"BITFIELD_RO"
"BITOP"
"BITCOUNT"
"BITPOS"
"GETBIT"
"SETBIT"

# More sorted set operations
"ZINTER"
"ZUNION"
"ZINTERSTORE"
"ZUNIONSTORE"
"ZPOPMIN"
"ZPOPMAX"
"ZMPOP"

# Edge case numbers
"9223372036854775807"
"-9223372036854775808"
"2147483647"
"-2147483648"
"0.0"
"-0.0"
"inf"
"-inf"
"+inf"
"nan"

# Stream IDs and patterns
"0-0"
"0-*"
"$"
">"
"*"
"MAXLEN"
"MINID"

# JSON paths
"$.."
"$[*]"
"$[-1]"
"$.name"
"$..name"

# RESP protocol edge cases
"*-1\x0d\x0a"
"$-1\x0d\x0a"
"*0\x0d\x0a"
"$0\x0d\x0a\x0d\x0a"

# Lua scripting patterns
"return redis.call"
"redis.pcall"
"KEYS[1]"
"ARGV[1]"

# Bitfield subcommands
"OVERFLOW"
"WRAP"
"SAT"
"FAIL"

# Aggregate options
"AGGREGATE"
"SUM"
"MIN"
"MAX"
"WEIGHTS"

# Binary edge cases
"\x00"
"\xff"
"\x00\x00\x00\x00"

# --- Additional commands for broader coverage ---

# Missing key operations
"COPY"
"SORT"
"SORT_RO"
"UNLINK"
"TOUCH"
"OBJECT"
"RANDOMKEY"
"DUMP"
"RESTORE"
"WAIT"
"EXPIREAT"
"PEXPIRE"
"PEXPIREAT"
"PEXPIRETIME"
"EXPIRETIME"
"PTTL"

# String commands
"GETDEL"
"GETEX"
"INCRBYFLOAT"
"DECRBY"
"INCRBY"
"MSETNX"
"PSETEX"
"SUBSTR"

# List commands
"LPOS"
"LMPOP"
"LMOVE"
"BLMOVE"
"BLMPOP"
"BLPOP"
"BRPOP"
"LPUSHX"
"RPUSHX"
"RPOPLPUSH"

# Set commands
"SRANDMEMBER"
"SMOVE"
"SMISMEMBER"
"SINTERCARD"
"SDIFFSTORE"
"SINTERSTORE"
"SUNIONSTORE"

# Sorted set commands
"ZDIFF"
"ZDIFFSTORE"
"ZLEXCOUNT"
"ZRANGEBYLEX"
"ZRANGESTORE"
"ZRANDMEMBER"
"ZREVRANGE"
"ZREVRANGEBYLEX"
"ZREVRANGEBYSCORE"
"ZREVRANK"
"ZMSCORE"
"ZREMRANGEBYLEX"
"ZREMRANGEBYRANK"
"ZREMRANGEBYSCORE"
"BZMPOP"
"BZPOPMIN"
"BZPOPMAX"

# Hash commands
"HRANDFIELD"
"HSCAN"
"HSETEX"
"HSETNX"
"HSTRLEN"
"HINCRBYFLOAT"
"HEXPIRE"

# Server/client commands
"CLIENT"
"CONFIG"
"MEMORY"
"ACL"
"HELLO"
"COMMAND"
"LATENCY"
"SLOWLOG"
"BGSAVE"
"LASTSAVE"
"ROLE"

# Subcommands
"OBJECT ENCODING"
"OBJECT HELP"
"OBJECT FREQ"
"OBJECT IDLETIME"
"CLIENT SETNAME"
"CLIENT GETNAME"
"CLIENT LIST"
"CLIENT ID"
"CLIENT INFO"
"CONFIG GET"
"CONFIG SET"
"MEMORY USAGE"
"MEMORY DOCTOR"
"ACL LIST"
"ACL WHOAMI"
"ACL SETUSER"
"COMMAND COUNT"
"COMMAND INFO"

# Scan operations (HSCAN already above)
"SSCAN"
"ZSCAN"

# Function/script commands
"FUNCTION"
"FUNCTION LOAD"
"FUNCTION LIST"
"FUNCTION DELETE"

# More JSON commands
"JSON.ARRINSERT"
"JSON.ARRTRIM"
"JSON.ARRPOP"
"JSON.ARRINDEX"
"JSON.OBJKEYS"
"JSON.OBJLEN"
"JSON.STRAPPEND"
"JSON.STRLEN"
"JSON.TOGGLE"
"JSON.CLEAR"
"JSON.MERGE"
"JSON.MGET"
"JSON.MSET"
"JSON.DEBUG"
"JSON.RESP"

# More Geo commands
"GEOPOS"
"GEOHASH"
"GEOSEARCHSTORE"
"GEORADIUSBYMEMBER"

# Search commands
"FT.CREATE"
"FT.SEARCH"
"FT.DROPINDEX"
"FT.INFO"
"FT.ALTER"

# Additional arguments
"REPLACE"
"ABSTTL"
"IDLETIME"
"FREQ"
"LEFT"
"RIGHT"
"BEFORE"
"AFTER"
"BY"
"ASC"
"DESC"
"ALPHA"
"STORE"
"REV"
"BYSCORE"
"BYLEX"
"CH"
"KEEPTTL"
"EXAT"
"PXAT"
"ENCODING"
"REFCOUNT"

# Malformed RESP for edge-case testing
"*-2\x0d\x0a"
"*999999\x0d\x0a"
"$-2\x0d\x0a"
"$999999999\x0d\x0a"
"*\x0d\x0a"
"$\x0d\x0a"
"+\x0d\x0a"
"-\x0d\x0a"
":\x0d\x0a"

# Inline commands (no RESP framing)
"PING\x0d\x0a"
"PING\x0a"
"SET key value\x0d\x0a"
"GET key\x0a"
"QUIT\x0d\x0a"

# More binary patterns
"\xfe\xff\x00\x01"
"\x0d\x0a\x0d\x0a"
"\x0d\x0d\x0a\x0a"
"\x00\x01\x02\x03"

# RESP edge cases (small fragments only)
"$0\x0d\x0a\x0d\x0a"
"$-1\x0d\x0a"


================================================
FILE: fuzz/generate_targeted_seeds.py
================================================
#!/usr/bin/env python3
"""Generate PR-targeted fuzzing inputs from a code diff using an LLM.

Fuzzing terminology used in this file:
  - Seed:  An initial input file for the fuzzer. Each seed is a sequence of
           commands encoded in RESP wire format (see fuzz/seeds/resp/*.resp for examples).
           The fuzzer starts from these seeds and mutates them to explore code paths.
  - Targeted seed:  A seed crafted specifically to exercise code paths changed in a PR.
           We send the PR diff + all existing seeds to an LLM, and it generates new seeds
           that target the changed code.
  - Focus commands:  A list of command names (e.g. ["SET", "GET"]) that the
           AFL++ mutator should prefer. When set, the mutator picks these commands ~70%
           of the time instead of choosing uniformly from all known commands.

Flow:
  1. Read unified diff from stdin, extract changed C++ file paths.
  2. Load all existing seed files so the LLM knows what's already covered.
  3. Call Claude API: send the diff + seeds, get back JSON with command arrays + focus commands.
  4. Encode commands as RESP wire format, write to output dir.

The LLM returns commands as plain arrays (e.g. ["SET", "key", "value"]) and we handle
RESP encoding ourselves — this avoids JSON escaping issues and byte-count mismatches.

When ANTHROPIC_API_KEY is not available (e.g. fork PRs), exits with no output and
the fuzzer runs with the existing seed corpus as-is.

Usage:
    git diff base..HEAD | python3 fuzz/generate_targeted_seeds.py --output-dir /tmp/seeds
"""

import argparse
import glob
import json
import os
import re
import sys

# Max diff lines to send to the LLM (Haiku handles ~200K tokens, so this is generous)
MAX_DIFF_LINES = 20000

LLM_SYSTEM_PROMPT = """\
You are a fuzzing expert for Dragonfly, a Redis-compatible in-memory database written in C++.

Your job: given a code diff and existing seed files, generate NEW fuzzing seeds that \
target the changed code paths. You also return a list of Redis commands to focus on.

## Dragonfly architecture (for context)
- src/server/*_family.cc — command implementations (e.g. string_family.cc has GET/SET/INCR)
- src/server/main_service.cc — command dispatch, MULTI/EXEC
- src/server/db_slice.cc — per-shard key-value storage
- src/facade/redis_parser.cc — RESP protocol parsing
- src/facade/dragonfly_connection.cc — connection handling
- src/core/ — data structures (dash table, dense_set, compact_object, etc.)
- src/server/journal/ — replication journal
- src/server/cluster/ — cluster mode
- src/server/search/ — search module (FT.* commands)
- src/server/tiering/ — SSD tiering

## What to generate
Based on the diff, figure out:
1. What commands are affected (new, modified, or impacted by infrastructure changes)
2. What edge cases the changes introduce (boundary values, empty inputs, error paths)
3. What command sequences would stress the changed code

## Output format
Return valid JSON (no markdown, no explanation):
{
  "focus_commands": ["CMD1", "CMD2", ...],
  "seeds": [
    {
      "name": "pr_something.resp",
      "commands": [
        ["SET", "mykey", "myvalue"],
        ["GET", "mykey"]
      ]
    }
  ]
}

Each "commands" entry is a list of Redis commands. Each command is a list of strings \
(command name + arguments). We handle RESP wire encoding — just give plain strings.

CRITICAL: Output must be valid JSON. Do NOT use code expressions like "x" * 1024 or \
string concatenation. For long values write actual repeated characters inline, e.g. \
"xxxxxxxxxx" (just the literal string). Keep values short (under 100 chars) — \
the fuzzer will mutate and grow them.

Rules for seeds:
- 3-10 commands per seed, forming a logical sequence
- Include setup commands before queries (e.g. SET before GET)
- Test edge cases from the diff: boundary values, empty/huge inputs, type mismatches
- Include at least one seed wrapping commands in MULTI/EXEC
- Generate 3-8 seeds total
- Prefix all names with "pr_"
"""


def extract_changed_files(diff_text):
    """Extract C++/header file paths from a unified diff."""
    files = []
    for match in re.finditer(r"^diff --git a/(.+?) b/(.+?)$", diff_text, re.MULTILINE):
        path = match.group(2)
        if re.search(r"\.(cc|h)$", path):
            files.append(path)
    return sorted(set(files))


def load_example_seeds(seeds_dir):
    """Load ALL existing seed files to show the LLM what's already covered.

    We send every seed so the LLM has full context about existing coverage
    and can generate complementary seeds for new/changed code paths.
    """
    examples = []
    for path in sorted(glob.glob(os.path.join(seeds_dir, "*.resp"))):
        name = os.path.basename(path)
        with open(path) as f:
            examples.append({"name": name, "content": f.read()})
    return examples


def truncate_diff(diff_text, max_lines=MAX_DIFF_LINES):
    """Truncate diff to max_lines."""
    lines = diff_text.splitlines(True)
    if len(lines) <= max_lines:
        return diff_text, len(lines)
    return "".join(lines[:max_lines]), max_lines


def encode_resp(commands):
    """Encode a list of commands as RESP wire format.

    Each command is a list of string arguments, e.g. ["SET", "key", "value"].
    Returns bytes in RESP format: *N\\r\\n$len\\r\\narg\\r\\n...
    """
    result = bytearray()
    for cmd in commands:
        if not cmd:
            continue
        result.extend(b"*%d\r\n" % len(cmd))
        for arg in cmd:
            arg_bytes = arg.encode() if isinstance(arg, str) else arg
            result.extend(b"$%d\r\n%s\r\n" % (len(arg_bytes), arg_bytes))
    return bytes(result)


def call_llm(diff_text, changed_files, example_seeds, api_key, model):
    """Call Claude API to generate targeted seeds from the diff."""
    try:
        import anthropic
    except ImportError:
        print("anthropic package not available", file=sys.stderr)
        return None

    truncated, num_lines = truncate_diff(diff_text)

    # Build examples section — show existing seeds so the LLM knows what's covered
    examples_text = ""
    for ex in example_seeds:
        examples_text += "--- %s ---\n%s\n\n" % (ex["name"], ex["content"].rstrip())

    prompt = (
        "Here are ALL existing seed files (RESP wire format) so you know what's already covered:\n\n"
        "%s\n"
        "Now analyze this diff and generate targeted fuzzing seeds.\n\n"
        "Changed files: %s\n\n"
        "Diff (%d lines):\n```\n%s\n```\n\n"
        "Respond with valid JSON only."
    ) % (examples_text, ", ".join(changed_files), num_lines, truncated)

    client = anthropic.Anthropic(api_key=api_key)
    response = client.messages.create(
        model=model,
        max_tokens=16384,
        system=LLM_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": prompt}],
    )

    text = response.content[0].text.strip()

    # Try to extract JSON from the response (LLMs sometimes wrap in markdown)
    json_match = re.search(r"```(?:json)?\s*\n(.*?)\n```", text, re.DOTALL)
    if json_match:
        text = json_match.group(1)

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try to find the outermost { ... } and parse that
    brace_match = re.search(r"\{.*\}", text, re.DOTALL)
    if brace_match:
        try:
            return json.loads(brace_match.group(0))
        except json.JSONDecodeError:
            pass

    # Log raw response for debugging and raise
    print("Raw LLM response (first 2000 chars):\n%s" % text[:2000], file=sys.stderr)
    raise ValueError("Could not parse LLM response as JSON")


def write_output(output_dir, focus_commands, seeds):
    """Write seed files and focus_commands.json to output directory."""
    os.makedirs(output_dir, exist_ok=True)

    focus_path = os.path.join(output_dir, "focus_commands.json")
    with open(focus_path, "w") as f:
        json.dump(focus_commands, f)
    print("Wrote %d focus commands to %s" % (len(focus_commands), focus_path), file=sys.stderr)

    written = 0
    for seed in seeds:
        name = seed.get("name") or "pr_seed_%d.resp" % written
        if not name.endswith(".resp"):
            name += ".resp"
        path = os.path.join(output_dir, name)
        with open(path, "wb") as f:
            f.write(seed["content"])
        written += 1

    print("Wrote %d seed files to %s" % (written, output_dir), file=sys.stderr)


def main():
    parser = argparse.ArgumentParser(description="Generate targeted fuzzing seeds from a PR diff")
    parser.add_argument(
        "--output-dir",
        default="fuzz/seeds/pr_targeted",
        help="Directory to write seeds and focus_commands.json",
    )
    parser.add_argument(
        "--seeds-dir",
        default=None,
        help="Directory with existing seed files (auto-detected if not set)",
    )
    parser.add_argument(
        "--api-key", default=None, help="Anthropic API key (or set ANTHROPIC_API_KEY env var)"
    )
    parser.add_argument("--model", default="claude-haiku-4-5-20251001", help="Claude model to use")
    args = parser.parse_args()

    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("No ANTHROPIC_API_KEY set, skipping seed generation", file=sys.stderr)
        return

    diff_text = sys.stdin.read()
    if not diff_text.strip():
        print("No diff provided, skipping", file=sys.stderr)
        return

    changed_files = extract_changed_files(diff_text)
    if not changed_files:
        print("No C++ files in diff, skipping", file=sys.stderr)
        return

    print("Changed C++ files: %s" % ", ".join(changed_files), file=sys.stderr)

    # Find seeds directory
    seeds_dir = args.seeds_dir
    if not seeds_dir:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        seeds_dir = os.path.join(script_dir, "seeds", "resp")

    example_seeds = load_example_seeds(seeds_dir)
    print("Loaded %d existing seeds" % len(example_seeds), file=sys.stderr)

    try:
        result = call_llm(diff_text, changed_files, example_seeds, api_key, args.model)
    except Exception as e:
        print("LLM call failed: %s" % e, file=sys.stderr)
        return

    if not result:
        return

    # Extract focus commands
    focus_commands = result.get("focus_commands", [])
    if not isinstance(focus_commands, list):
        focus_commands = []

    # Encode command arrays as RESP and collect valid seeds
    valid_seeds = []
    for s in result.get("seeds", []):
        if not isinstance(s, dict) or "commands" not in s:
            continue
        commands = s["commands"]
        if not isinstance(commands, list) or not commands:
            continue
        # Filter out non-list entries and ensure all args are strings
        clean_commands = []
        for cmd in commands:
            if isinstance(cmd, list) and cmd:
                clean_commands.append([str(arg) for arg in cmd])
        if not clean_commands:
            continue
        content = encode_resp(clean_commands)
        if content:
            valid_seeds.append({"name": s.get("name") or "", "content": content})
        else:
            print("Discarding empty seed: %s" % s.get("name", "?"), file=sys.stderr)

    if not valid_seeds and not focus_commands:
        print("LLM returned no usable output", file=sys.stderr)
        return

    print(
        "Generated %d seeds, %d focus commands" % (len(valid_seeds), len(focus_commands)),
        file=sys.stderr,
    )
    write_output(args.output_dir, focus_commands, valid_seeds)


if __name__ == "__main__":
    main()


================================================
FILE: fuzz/memcache_mutator.py
================================================
"""AFL++ custom mutator for memcache text protocol.

Mutates at the command level instead of random bytes,
keeping memcache protocol framing valid.

Usage:
    export PYTHONPATH=/path/to/dragonfly/fuzz
    export AFL_PYTHON_MODULE=memcache_mutator
    afl-fuzz ...
"""

import random

# fmt: off
# (command, type, min_extra_args, max_extra_args)
# type: "store" = key flags exptime bytes [noreply]\r\ndata\r\n
#       "cas"   = key flags exptime bytes cas_unique [noreply]\r\ndata\r\n
#       "get"   = key [key ...]\r\n
#       "gat"   = exptime key [key ...]\r\n
#       "delta" = key delta [noreply]\r\n
#       "del"   = key [noreply]\r\n
#       "bare"  = \r\n (no args)
#       "meta_store" = key datalen [flags...]\r\ndata\r\n
#       "meta"  = key [flags...]\r\n

COMMANDS = [
    # Store commands
    ("set",     "store"),
    ("add",     "store"),
    ("replace", "store"),
    ("append",  "store"),
    ("prepend", "store"),
    ("cas",     "cas"),
    # Retrieval
    ("get",     "get"),
    ("gets",    "get"),
    ("gat",     "gat"),
    ("gats",    "gat"),
    # Delete / arithmetic
    ("delete",  "del"),
    ("incr",    "delta"),
    ("decr",    "delta"),
    # Utility
    ("flush_all", "bare"),
    ("stats",     "bare"),
    ("version",   "bare"),
    ("quit",      "bare"),
    # Meta commands
    ("ms",      "meta_store"),
    ("mg",      "meta"),
    ("md",      "meta"),
    ("ma",      "meta"),
    ("mn",      "bare"),
    ("me",      "meta"),
]
# fmt: on

KEYS = [b"k", b"key", b"k1", b"k2", b"k3", b"mykey", b"counter", b"buf"]
VALUES = [b"abc", b"hello", b"x", b"", b"0", b"12345", b"\x00\xff", b"a" * 100]
EXPIRY = [b"0", b"10", b"100", b"3600", b"9999999"]
FLAGS = [b"0", b"1", b"255", b"65535", b"4294967295"]
DELTAS = [b"1", b"5", b"10", b"100", b"0", b"99999999999"]
META_FLAGS = [b"T30", b"N10", b"R", b"v", b"h", b"l", b"t", b"c", b"f1", b"q", b"k"]
FUZZ_VALUES = [b"\x00", b"\xff" * 4, b"\r\n", b"A" * 256, b"-1", b"NaN"]


def init(seed):
    random.seed(seed)


def _random_key():
    if random.random() < 0.8:
        return random.choice(KEYS)
    return random.choice(FUZZ_VALUES)


def _random_value():
    if random.random() < 0.7:
        return random.choice(VALUES)
    return random.choice(FUZZ_VALUES)


def _random_command():
    """Generate a single random memcache command."""
    cmd_name, cmd_type = random.choice(COMMANDS)
    cmd = cmd_name.encode() if isinstance(cmd_name, str) else cmd_name

    if cmd_type == "store":
        key = _random_key()
        flags = random.choice(FLAGS)
        expiry = random.choice(EXPIRY)
        value = _random_value()
        noreply = b" noreply" if random.random() < 0.3 else b""
        return (
            cmd
            + b" "
            + key
            + b" "
            + flags
            + b" "
            + expiry
            + b" "
            + str(len(value)).encode()
            + noreply
            + b"\r\n"
            + value
            + b"\r\n"
        )

    elif cmd_type == "cas":
        key = _random_key()
        flags = random.choice(FLAGS)
        expiry = random.choice(EXPIRY)
        value = _random_value()
        cas_id = str(random.randint(0, 99999)).encode()
        noreply = b" noreply" if random.random() < 0.3 else b""
        return (
            cmd
            + b" "
            + key
            + b" "
            + flags
            + b" "
            + expiry
            + b" "
            + str(len(value)).encode()
            + b" "
            + cas_id
            + noreply
            + b"\r\n"
            + value
            + b"\r\n"
        )

    elif cmd_type == "get":
        nkeys = random.randint(1, 4)
        keys = b" ".join(_random_key() for _ in range(nkeys))
        return cmd + b" " + keys + b"\r\n"

    elif cmd_type == "gat":
        expiry = random.choice(EXPIRY)
        nkeys = random.randint(1, 3)
        keys = b" ".join(_random_key() for _ in range(nkeys))
        return cmd + b" " + expiry + b" " + keys + b"\r\n"

    elif cmd_type == "delta":
        key = _random_key()
        delta = random.choice(DELTAS)
        noreply = b" noreply" if random.random() < 0.3 else b""
        return cmd + b" " + key + b" " + delta + noreply + b"\r\n"

    elif cmd_type == "del":
        key = _random_key()
        noreply = b" noreply" if random.random() < 0.3 else b""
        return cmd + b" " + key + noreply + b"\r\n"

    elif cmd_type == "meta_store":
        key = _random_key()
        value = _random_value()
        meta_flags = b" ".join(random.sample(META_FLAGS, random.randint(0, 3)))
        extra = (b" " + meta_flags) if meta_flags else b""
        return (
            cmd + b" " + key + b" " + str(len(value)).encode() + extra + b"\r\n" + value + b"\r\n"
        )

    elif cmd_type == "meta":
        key = _random_key()
        meta_flags = b" ".join(random.sample(META_FLAGS, random.randint(0, 3)))
        extra = (b" " + meta_flags) if meta_flags else b""
        return cmd + b" " + key + extra + b"\r\n"

    else:  # bare
        return cmd + b"\r\n"


def _parse_mc_commands(buf):
    """Best-effort parse of memcache text protocol into list of raw command lines.
    Returns (commands, success) where commands is a list of bytes."""
    commands = []
    data = bytes(buf)
    pos = 0

    while pos < len(data):
        end = data.find(b"\r\n", pos)
        if end < 0:
            break

        line = data[pos:end]
        pos = end + 2

        # Check if this is a store command that has a data block
        parts = line.split(b" ")
        if len(parts) >= 5 and parts[0].lower() in (
            b"set",
            b"add",
            b"replace",
            b"append",
            b"prepend",
            b"cas",
        ):
            try:
                nbytes = int(parts[4])
                if pos + nbytes + 2 <= len(data):
                    value = data[pos : pos + nbytes]
                    pos += nbytes + 2  # skip value + \r\n
                    commands.append((line, value))
                    continue
            except (ValueError, IndexError):
                pass
        elif len(parts) >= 3 and parts[0].lower() == b"ms":
            try:
                nbytes = int(parts[2]) if len(parts) > 2 else int(parts[1])
                if pos + nbytes + 2 <= len(data):
                    value = data[pos : pos + nbytes]
                    pos += nbytes + 2
                    commands.append((line, value))
                    continue
            except (ValueError, IndexError):
                pass

        commands.append((line, None))

    return (commands, len(commands) > 0)


def _commands_to_bytes(commands):
    """Serialize parsed commands back to memcache protocol bytes."""
    parts = []
    for line, value in commands:
        parts.append(line + b"\r\n")
        if value is not None:
            parts.append(value + b"\r\n")
    return b"".join(parts)


def _mutate_commands(commands):
    """Apply random mutations to parsed memcache commands."""
    result = list(commands)

    mutation = random.random()

    if mutation < 0.25 and len(result) > 0:
        # Replace a command entirely
        idx = random.randint(0, len(result) - 1)
        new_cmd = _random_command()
        # Parse the generated command back
        parsed, _ = _parse_mc_commands(new_cmd)
        if parsed:
            result[idx] = parsed[0]

    elif mutation < 0.45 and len(result) > 0:
        # Mutate a key or value in a command
        idx = random.randint(0, len(result) - 1)
        line, value = result[idx]
        parts = line.split(b" ")
        if len(parts) >= 2:
            cmd = parts[0].lower()
            # Mutate the correct key index depending on command
            if cmd in (b"gat", b"gats") and len(parts) >= 3:
                key_idx = random.randint(2, len(parts) - 1)
                parts[key_idx] = _random_key()
            else:
                parts[1] = _random_key()
            if value is not None:
                new_value = _random_value()
                # Update byte count in the header
                length_idx = None
                if cmd == b"ms" and len(parts) >= 3:
                    length_idx = 2
                elif len(parts) >= 5:
                    length_idx = 4
                if length_idx is not None:
                    try:
                        int(parts[length_idx])
                        parts[length_idx] = str(len(new_value)).encode()
                    except ValueError:
                        pass
                value = new_value
            result[idx] = (b" ".join(parts), value)

    elif mutation < 0.6:
        # Insert a new random command
        new_cmd = _random_command()
        parsed, _ = _parse_mc_commands(new_cmd)
        if parsed:
            pos = random.randint(0, len(result))
            result.insert(pos, parsed[0])

    elif mutation < 0.7 and len(result) > 1:
        # Remove a command
        idx = random.randint(0, len(result) - 1)
        result.pop(idx)

    elif mutation < 0.8 and len(result) >= 2:
        # Swap two commands
        i, j = random.sample(range(len(result)), 2)
        result[i], result[j] = result[j], result[i]

    elif mutation < 0.9 and len(result) > 0:
        # Duplicate a command
        idx = random.randint(0, len(result) - 1)
        result.insert(idx + 1, result[idx])

    else:
        # Toggle noreply on a command
        if len(result) > 0:
            idx = random.randint(0, len(result) - 1)
            line, value = result[idx]
            if line.endswith(b" noreply"):
                line = line[:-8]
            else:
                line = line + b" noreply"
            result[idx] = (line, value)

    return result


def fuzz(buf, add_buf, max_size):
    """Main mutation function called by AFL++."""
    commands, ok = _parse_mc_commands(buf)

    if ok and commands:
        mutated = _mutate_commands(commands)
        result = _commands_to_bytes(mutated)
    else:
        n = random.randint(1, 5)
        result = b"".join(_random_command() for _ in range(n))

    if len(result) > max_size:
        result = result[:max_size]
    return bytearray(result)


def havoc_mutation(buf, max_size):
    """Called during havoc stage."""
    commands, ok = _parse_mc_commands(buf)
    if not ok or not commands:
        return bytearray(_random_command()[:max_size])

    mutated = _mutate_commands(commands)
    result = _commands_to_bytes(mutated)
    if len(result) > max_size:
        result = result[:max_size]
    return bytearray(result)


def havoc_mutation_probability():
    return 50


================================================
FILE: fuzz/package_crash.sh
================================================
#!/usr/bin/env bash

set -e

GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m'

print_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }

usage() {
    echo "Usage: $0 <crash_id> [crashes_dir]"
    echo ""
    echo "Packages a crash and its RECORD files into a self-contained archive"
    echo "that can be sent to another developer for reproduction."
    echo ""
    echo "Arguments:"
    echo "  crash_id      Crash ID (e.g. 000000)"
    echo "  crashes_dir   Path to crashes directory (default: fuzz/artifacts/resp/default/crashes)"
    echo ""
    echo "Example:"
    echo "  $0 000000"
    echo "  $0 000001 /path/to/crashes"
    exit 1
}

if [[ $# -lt 1 ]]; then
    usage
fi

CRASH_ID="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CRASHES_DIR="${2:-$SCRIPT_DIR/artifacts/resp/default/crashes}"

if [[ ! -d "$CRASHES_DIR" ]]; then
    print_error "Crashes directory not found: $CRASHES_DIR"
    exit 1
fi

# Find the crash input file
CRASH_FILE=$(find "$CRASHES_DIR" -maxdepth 1 -name "id:${CRASH_ID},*" ! -name "RECORD:*" | head -1)
if [[ -z "$CRASH_FILE" ]]; then
    print_error "Crash input not found for id:${CRASH_ID} in $CRASHES_DIR"
    exit 1
fi

# Count RECORD files
RECORD_COUNT=$(find "$CRASHES_DIR" -maxdepth 1 -name "RECORD:${CRASH_ID},cnt:*" | wc -l)

ARCHIVE_NAME="crash-${CRASH_ID}"
TMPDIR=$(mktemp -d)
DEST="$TMPDIR/$ARCHIVE_NAME"
mkdir -p "$DEST/crashes"

print_info "Packaging crash ${CRASH_ID}..."
print_info "Crash input: $(basename "$CRASH_FILE")"
print_info "RECORD files: ${RECORD_COUNT}"

# Copy crash input and RECORD files into crashes/ subdirectory
cp "$CRASH_FILE" "$DEST/crashes/"
if [[ $RECORD_COUNT -gt 0 ]]; then
    find "$CRASHES_DIR" -maxdepth 1 -name "RECORD:${CRASH_ID},cnt:*" -exec cp {} "$DEST/crashes/" \;
fi

# Copy replay_crash.py
cp "$SCRIPT_DIR/replay_crash.py" "$DEST/"

# Create archive
OUTPUT="$(pwd)/${ARCHIVE_NAME}.tar.gz"
tar -czf "$OUTPUT" -C "$TMPDIR" "$ARCHIVE_NAME"
rm -rf "$TMPDIR"

SIZE=$(du -h "$OUTPUT" | cut -f1)
print_info "Archive created: ${OUTPUT} (${SIZE})"
echo ""
# Detect target from directory structure: artifacts/<target>/default/crashes
TARGET_NAME=$(basename "$(dirname "$(dirname "$CRASHES_DIR")")")
IS_MEMCACHE=false
if [[ "$TARGET_NAME" == "memcache" ]]; then
    IS_MEMCACHE=true
fi

echo "To reproduce:"
echo "  1. Start dragonfly:"
if [[ "$IS_MEMCACHE" == true ]]; then
    echo "     ./build/dragonfly --port 6379 --memcached_port=11211 --logtostderr --proactor_threads 1 --dbfilename=\"\""
else
    echo "     ./build/dragonfly --port 6379 --logtostderr --proactor_threads 1 --dbfilename=\"\""
fi
echo "  2. Extract and replay:"
echo "     tar xzf ${ARCHIVE_NAME}.tar.gz"
echo "     cd ${ARCHIVE_NAME}"
if [[ "$IS_MEMCACHE" == true ]]; then
    echo "     python3 replay_crash.py crashes ${CRASH_ID} 127.0.0.1 11211"
else
    echo "     python3 replay_crash.py crashes ${CRASH_ID}"
fi


================================================
FILE: fuzz/replay_crash.py
================================================
#!/usr/bin/env python3
"""Replays a crash from AFL++ persistent mode RECORD files.

In persistent mode, a crash depends on accumulated server state from all
previous iterations. AFL_PERSISTENT_RECORD saves these as RECORD files.
This script replays them in order against a running Dragonfly instance.

Usage:
    # Start dragonfly in another terminal:
    ./build-dbg/dragonfly --port 6379 --logtostderr --proactor_threads 1

    # Replay crash:
    python3 fuzz/replay_crash.py fuzz/artifacts/resp/default/crashes 000000
"""

import glob
import os
import socket
import sys


def send_input(host, port, data):
    """Send data over TCP. Mirrors SendFuzzInputToServer."""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.settimeout(0.2)
        s.connect((host, port))
    except ConnectionRefusedError:
        print("\033[0;31m[ERROR]\033[0m Connection refused — is Dragonfly running?")
        sys.exit(1)

    try:
        s.sendall(data)
    except Exception:
        pass

    try:
        s.recv(4096)
    except Exception:
        pass
    s.close()


def main():
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <crash_dir> <crash_id> [host] [port]")
        sys.exit(1)

    crash_dir = sys.argv[1]
    crash_id = sys.argv[2]
    host = sys.argv[3] if len(sys.argv) > 3 else "127.0.0.1"
    port = int(sys.argv[4]) if len(sys.argv) > 4 else 6379

    # Find RECORD files sorted by cnt
    pattern = os.path.join(crash_dir, f"RECORD:{crash_id},cnt:*")
    records = sorted(glob.glob(pattern))

    # Find crash input file
    crash_files = [
        f
        for f in glob.glob(os.path.join(crash_dir, f"id:{crash_id},*"))
        if not os.path.basename(f).startswith("RECORD:")
    ]
    if not crash_files:
        print(f"\033[0;31m[ERROR]\033[0m Crash input not found for id:{crash_id}")
        sys.exit(1)

    crash_file = crash_files[0]

    print(f"\033[0;32m[INFO]\033[0m Replaying crash {crash_id} against {host}:{port}")
    print(f"\033[0;32m[INFO]\033[0m RECORD files: {len(records)}")
    print(f"\033[0;32m[INFO]\033[0m Crash file: {crash_file}")
    print()

    # Replay all RECORD inputs
    for i, rec in enumerate(records):
        if i % 1000 == 0:
            print(f"\033[1;33m[REPLAY]\033[0m Progress: {i} / {len(records)}")
        with open(rec, "rb") as f:
            data = f.read()
        send_input(host, port, data)

    # Send the crash input
    print(f"\033[1;33m[REPLAY]\033[0m Sending crash input: {os.path.basename(crash_file)}")
    with open(crash_file, "rb") as f:
        data = f.read()
    send_input(host, port, data)

    print()
    print("\033[0;32m[INFO]\033[0m Replay complete. Check if the Dragonfly process crashed.")
    print(
        "\033[0;32m[INFO]\033[0m If not, the bug may depend on thread timing (non-deterministic)."
    )


if __name__ == "__main__":
    main()


================================================
FILE: fuzz/resp_mutator.py
================================================
"""AFL++ custom mutator for RESP protocol.

Instead of random byte-level mutations (which would break protocol framing and get
rejected by the parser), this mutator operates at the command level: it parses
the input into commands, then randomly replaces/inserts/removes/reorders commands and
arguments while keeping RESP encoding valid. This ensures mutated inputs actually
reach command execution code paths.

Focus commands (optional, set via FUZZ_FOCUS_COMMANDS env var):
    When running PR-targeted fuzzing, generate_targeted_seeds.py produces a list of
    command names affected by the code change. This mutator reads that list and
    picks those commands ~70% of the time, concentrating mutations on the changed code.
    Commands not already in the COMMANDS table are auto-registered with default arity.

Usage:
    export PYTHONPATH=/path/to/dragonfly/fuzz
    export AFL_PYTHON_MODULE=resp_mutator
    export AFL_CUSTOM_MUTATOR_ONLY=1
    afl-fuzz ...
"""

import json
import os
import random
import struct

# fmt: off
# Commands grouped by arity pattern: (name, min_args, max_args)
# min/max are argument counts AFTER the command name itself.
COMMANDS = [
    # String
    (b"GET", 1, 1), (b"SET", 2, 6), (b"MGET", 1, 5), (b"MSET", 2, 10),
    (b"SETNX", 2, 2), (b"SETEX", 3, 3), (b"PSETEX", 3, 3),
    (b"INCR", 1, 1), (b"DECR", 1, 1), (b"INCRBY", 2, 2), (b"DECRBY", 2, 2),
    (b"INCRBYFLOAT", 2, 2), (b"APPEND", 2, 2), (b"STRLEN", 1, 1),
    (b"GETRANGE", 3, 3), (b"SETRANGE", 3, 3), (b"GETSET", 2, 2),
    (b"GETDEL", 1, 1), (b"GETEX", 1, 3), (b"SUBSTR", 3, 3),
    (b"MSETNX", 2, 10),
    # Key
    (b"DEL", 1, 5), (b"UNLINK", 1, 5), (b"EXISTS", 1, 5),
    (b"EXPIRE", 2, 3), (b"EXPIREAT", 2, 3), (b"PEXPIRE", 2, 3),
    (b"PEXPIREAT", 2, 3), (b"PERSIST", 1, 1),
    (b"TTL", 1, 1), (b"PTTL", 1, 1), (b"EXPIRETIME", 1, 1), (b"PEXPIRETIME", 1, 1),
    (b"TYPE", 1, 1), (b"RENAME", 2, 2), (b"RENAMENX", 2, 2),
    (b"COPY", 2, 4), (b"DUMP", 1, 1), (b"TOUCH", 1, 5),
    (b"OBJECT", 2, 2), (b"RANDOMKEY", 0, 0), (b"KEYS", 1, 1),
    (b"SCAN", 1, 5), (b"SORT", 1, 7), (b"SORT_RO", 1, 7),
    # List
    (b"LPUSH", 2, 5), (b"RPUSH", 2, 5), (b"LPOP", 1, 2), (b"RPOP", 1, 2),
    (b"LLEN", 1, 1), (b"LINDEX", 2, 2), (b"LSET", 3, 3),
    (b"LRANGE", 3, 3), (b"LTRIM", 3, 3), (b"LREM", 3, 3),
    (b"LPOS", 2, 6), (b"LMOVE", 4, 4), (b"LMPOP", 2, 4),
    (b"LPUSHX", 2, 5), (b"RPUSHX", 2, 5), (b"RPOPLPUSH", 2, 2),
    (b"BLPOP", 2, 5), (b"BRPOP", 2, 5), (b"BLMOVE", 5, 5), (b"BLMPOP", 3, 5),
    # Hash
    (b"HSET", 3, 9), (b"HGET", 2, 2), (b"HDEL", 2, 5),
    (b"HEXISTS", 2, 2), (b"HLEN", 1, 1), (b"HKEYS", 1, 1),
    (b"HVALS", 1, 1), (b"HGETALL", 1, 1), (b"HINCRBY", 3, 3),
    (b"HINCRBYFLOAT", 3, 3), (b"HMSET", 3, 9), (b"HMGET", 2, 5),
    (b"HSETNX", 3, 3), (b"HSTRLEN", 2, 2), (b"HRANDFIELD", 1, 3),
    (b"HSCAN", 2, 6),
    # Set
    (b"SADD", 2, 5), (b"SREM", 2, 5), (b"SMEMBERS", 1, 1),
    (b"SISMEMBER", 2, 2), (b"SMISMEMBER", 2, 5), (b"SCARD", 1, 1),
    (b"SPOP", 1, 2), (b"SRANDMEMBER", 1, 2), (b"SMOVE", 3, 3),
    (b"SDIFF", 1, 3), (b"SINTER", 1, 3), (b"SUNION", 1, 3),
    (b"SDIFFSTORE", 2, 4), (b"SINTERSTORE", 2, 4), (b"SUNIONSTORE", 2, 4),
    (b"SINTERCARD", 2, 5), (b"SSCAN", 2, 6),
    # Sorted set
    (b"ZADD", 3, 9), (b"ZREM", 2, 5), (b"ZSCORE", 2, 2), (b"ZMSCORE", 2, 5),
    (b"ZRANK", 2, 2), (b"ZREVRANK", 2, 2), (b"ZCARD", 1, 1),
    (b"ZCOUNT", 3, 3), (b"ZLEXCOUNT", 3, 3),
    (b"ZRANGE", 3, 7), (b"ZRANGEBYLEX", 3, 7), (b"ZRANGEBYSCORE", 3, 7),
    (b"ZREVRANGE", 3, 5), (b"ZREVRANGEBYLEX", 3, 7), (b"ZREVRANGEBYSCORE", 3, 7),
    (b"ZRANGESTORE", 4, 8),
    (b"ZINCRBY", 3, 3), (b"ZRANDMEMBER", 1, 3),
    (b"ZPOPMIN", 1, 2), (b"ZPOPMAX", 1, 2),
    (b"BZPOPMIN", 2, 4), (b"BZPOPMAX", 2, 4),
    (b"ZDIFF", 2, 5), (b"ZDIFFSTORE", 3, 5),
    (b"ZMPOP", 2, 4), (b"BZMPOP", 3, 5),
    (b"ZREMRANGEBYRANK", 3, 3), (b"ZREMRANGEBYSCORE", 3, 3),
    (b"ZREMRANGEBYLEX", 3, 3),
    (b"ZSCAN", 2, 6),
    # Stream
    (b"XADD", 3, 9), (b"XLEN", 1, 1), (b"XRANGE", 3, 5),
    (b"XREVRANGE", 3, 5), (b"XREAD", 3, 7), (b"XTRIM", 2, 4),
    (b"XDEL", 2, 5), (b"XINFO", 2, 3), (b"XACK", 3, 5),
    (b"XGROUP", 3, 6), (b"XREADGROUP", 5, 9),
    (b"XAUTOCLAIM", 4, 6), (b"XCLAIM", 4, 8),
    # HyperLogLog
    (b"PFADD", 1, 5), (b"PFCOUNT", 1, 3), (b"PFMERGE", 2, 4),
    # Geo
    (b"GEOADD", 4, 10), (b"GEODIST", 3, 4), (b"GEOPOS", 2, 5),
    (b"GEOHASH", 2, 5), (b"GEOSEARCH", 4, 10), (b"GEOSEARCHSTORE", 5, 11),
    # Pub/Sub
    (b"SUBSCRIBE", 1, 3), (b"PUBLISH", 2, 2), (b"PSUBSCRIBE", 1, 3),
    # Transaction
    (b"MULTI", 0, 0), (b"EXEC", 0, 0), (b"DISCARD", 0, 0),
    (b"WATCH", 1, 3), (b"UNWATCH", 0, 0),
    # Script
    (b"EVAL", 2, 6), (b"EVALSHA", 2, 6), (b"EVALRO", 2, 6),
    # JSON
    (b"JSON.SET", 3, 4), (b"JSON.GET", 1, 4), (b"JSON.DEL", 1, 2),
    (b"JSON.TYPE", 1, 2), (b"JSON.NUMINCRBY", 3, 3),
    (b"JSON.ARRAPPEND", 3, 6), (b"JSON.ARRLEN", 1, 2),
    (b"JSON.ARRINSERT", 4, 6), (b"JSON.ARRTRIM", 4, 4),
    (b"JSON.ARRPOP", 1, 3), (b"JSON.ARRINDEX", 3, 5),
    (b"JSON.OBJKEYS", 1, 2), (b"JSON.OBJLEN", 1, 2),
    (b"JSON.STRAPPEND", 2, 3), (b"JSON.STRLEN", 1, 2),
    (b"JSON.TOGGLE", 2, 2), (b"JSON.CLEAR", 1, 2),
    (b"JSON.MERGE", 3, 3), (b"JSON.MGET", 2, 5),
    # Bloom filter
    (b"BF.ADD", 2, 2), (b"BF.EXISTS", 2, 2), (b"BF.MADD", 2, 5),
    (b"BF.MEXISTS", 2, 5), (b"BF.RESERVE", 3, 5),
    # Server
    (b"PING", 0, 1), (b"ECHO", 1, 1), (b"SELECT", 1, 1),
    (b"DBSIZE", 0, 0), (b"INFO", 0, 1),
    (b"CONFIG", 2, 3), (b"CLIENT", 1, 3), (b"COMMAND", 0, 2),
    (b"MEMORY", 1, 2), (b"ACL", 1, 5),
    (b"MONITOR", 0, 0), (b"RESET", 0, 0), (b"HELLO", 0, 5),
    (b"WAIT", 2, 2), (b"BGSAVE", 0, 1),
    (b"OBJECT", 2, 2), (b"LATENCY", 1, 2), (b"SLOWLOG", 1, 2),
    # Bitops
    (b"SETBIT", 3, 3), (b"GETBIT", 2, 2), (b"BITCOUNT", 1, 4),
    (b"BITOP", 3, 5), (b"BITPOS", 2, 5), (b"BITFIELD", 2, 8),
    # Search
    (b"FT.CREATE", 3, 15), (b"FT.SEARCH", 2, 10), (b"FT.DROPINDEX", 1, 2),
    (b"FT.INFO", 1, 1), (b"FT.ALTER", 3, 8),
    # Throttle
    (b"CL.THROTTLE", 5, 5),
]
# fmt: on

KEYS = [b"k", b"key", b"k1", b"k2", b"k3", b"src", b"dst", b"mylist", b"myset", b"myhash"]
VALUES = [b"v", b"val", b"hello", b"0", b"1", b"-1", b"100", b"3.14", b"", b"a b"]
SPECIAL = [b"*", b"?", b"[", b"NX", b"XX", b"EX", b"PX", b"GT", b"LT", b"KEEPTTL"]
JSON_VALUES = [b'{"a":1}', b"[1,2,3]", b'"str"', b"42", b"null", b"true"]
JSON_PATHS = [b"$", b"$.a", b"$.*", b"$.arr[0]", b"."]
SCORE_VALUES = [b"0", b"1", b"-inf", b"+inf", b"(1", b"(5", b"3.14"]
STREAM_IDS = [b"*", b"0-0", b"1-1", b"$", b">"]

# Fuzzy values: binary junk, edge cases
FUZZ_VALUES = [
    b"\x00",
    b"\xff" * 4,
    b"\r\n",
    b"$-1\r\n",
    b"*0\r\n",
    b"A" * 256,
    b"-1",
    b"99999999999",
    b"NaN",
    b"inf",
]

# Focus commands: when set via FUZZ_FOCUS_COMMANDS env var (JSON list of command names),
# the mutator will prefer these commands ~70% of the time. Used by PR fuzzing to
# concentrate mutations on commands affected by the code change.
_FOCUS_COMMANDS = []
_FOCUS_WEIGHT = 0.7

_focus_env = os.environ.get("FUZZ_FOCUS_COMMANDS", "")
if _focus_env:
    try:
        raw = json.loads(_focus_env)
        if isinstance(raw, str):
            raw = [raw]
        if isinstance(raw, list):
            _focus_names = {s.strip().upper() for s in raw if isinstance(s, str) and s.strip()}
        else:
            _focus_names = set()
        _FOCUS_COMMANDS = [c for c in COMMANDS if c[0].decode().upper() in _focus_names]
        # Add unknown commands (e.g. newly added in a PR) with default arity
        _known = {c[0].decode().upper() for c in COMMANDS}
        for name in _focus_names - _known:
            entry = (name.encode(), 1, 3)
            COMMANDS.append(entry)
            _FOCUS_COMMANDS.append(entry)
    except (json.JSONDecodeError, TypeError, ValueError):
        pass


def _pick_command():
    """Pick a command tuple, preferring focus commands when available."""
    if _FOCUS_COMMANDS and random.random() < _FOCUS_WEIGHT:
        return random.choice(_FOCUS_COMMANDS)
    return random.choice(COMMANDS)


def init(seed):
    random.seed(seed)


def _encode_resp(*args):
    """Encode a list of args into RESP array."""
    parts = [b"*%d\r\n" % len(args)]
    for a in args:
        if not isinstance(a, bytes):
            a = str(a).encode()
        parts.append(b"$%d\r\n%s\r\n" % (len(a), a))
    return b"".join(parts)


def _random_arg():
    """Generate a random argument value."""
    r = random.random()
    if r < 0.3:
        return random.choice(KEYS)
    if r < 0.55:
        return random.choice(VALUES)
    if r < 0.7:
        return random.choice(SPECIAL)
    if r < 0.8:
        return random.choice(FUZZ_VALUES)
    if r < 0.85:
        return random.choice(JSON_VALUES)
    if r < 0.9:
        return random.choice(JSON_PATHS)
    if r < 0.95:
        return random.choice(SCORE_VALUES)
    return random.choice(STREAM_IDS)


def _random_command():
    """Generate a single random RESP command."""
    cmd_name, min_args, max_args = _pick_command()
    nargs = random.randint(min_args, max_args)
    args = [cmd_name] + [_random_arg() for _ in range(nargs)]
    return _encode_resp(*args)


def _parse_resp_commands(buf):
    """Best-effort parse of RESP buffer into list of commands (each is list of bytes).
    Returns (commands, success). On parse failure returns ([], False)."""
    commands = []
    pos = 0
    data = bytes(buf)

    while pos < len(data):
        # Skip whitespace/newlines
        while pos < len(data) and data[pos : pos + 1] in (b"\r", b"\n", b" "):
            pos += 1
        if pos >= len(data):
            break

        if data[pos : pos + 1] != b"*":
            return ([], False)

        # Parse *N\r\n
        end = data.find(b"\r\n", pos)
        if end < 0:
            return ([], False)
        try:
            nargs = int(data[pos + 1 : end])
        except ValueError:
            return ([], False)
        pos = end + 2

        args = []
        for _ in range(nargs):
            if pos >= len(data) or data[pos : pos + 1] != b"$":
                return ([], False)
            end = data.find(b"\r\n", pos)
            if end < 0:
                return ([], False)
            try:
                slen = int(data[pos + 1 : end])
            except ValueError:
                return ([], False)
            pos = end + 2
            if slen < 0:
                args.append(b"")
                continue
            if pos + slen + 2 > len(data):
                return ([], False)
            args.append(data[pos : pos + slen])
            pos += slen + 2

        if args:
            commands.append(args)

    return (commands, True)


def _mutate_commands(commands):
    """Apply random mutations to a list of parsed commands."""
    result = list(commands)

    mutation = random.random()

    if mutation < 0.2 and len(result) > 0:
        # Replace a random command entirely
        idx = random.randint(0, len(result) - 1)
        cmd_name, min_args, max_args = _pick_command()
        nargs = random.randint(min_args, max_args)
        result[idx] = [cmd_name] + [_random_arg() for _ in range(nargs)]

    elif mutation < 0.4 and len(result) > 0:
        # Mutate an argument of a random command
        idx = random.randint(0, len(result) - 1)
        cmd = list(result[idx])
        if len(cmd) > 1:
            arg_idx = random.randint(1, len(cmd) - 1)
            cmd[arg_idx] = _random_arg()
            result[idx] = cmd

    elif mutation < 0.55:
        # Insert a new random command
        pos = random.randint(0, len(result))
        cmd_name, min_args, max_args = _pick_command()
        nargs = random.randint(min_args, max_args)
        result.insert(pos, [cmd_name] + [_random_arg() for _ in range(nargs)])

    elif mutation < 0.65 and len(result) > 1:
        # Remove a random command
        idx = random.randint(0, len(result) - 1)
        result.pop(idx)

    elif mutation < 0.75 and len(result) >= 2:
        # Swap two commands
        i, j = random.sample(range(len(result)), 2)
        result[i], result[j] = result[j], result[i]

    elif mutation < 0.85 and len(result) > 0:
        # Duplicate a command
        idx = random.randint(0, len(result) - 1)
        result.insert(idx + 1, list(result[idx]))

    elif mutation < 0.92 and len(result) > 0:
        # Wrap some commands in MULTI/EXEC
        start = random.randint(0, len(result) - 1)
        end = random.randint(start + 1, min(start + 5, len(result)))
        result.insert(start, [b"MULTI"])
        result.insert(end + 1, [b"EXEC"])

    else:
        # Add extra argument to a random command
        if len(result) > 0:
            idx = random.randint(0, len(result) - 1)
            result[idx] = list(result[idx]) + [_random_arg()]

    return result


def _commands_to_resp(commands):
    """Serialize list of commands back to RESP bytes."""
    parts = []
    for cmd in commands:
        parts.append(_encode_resp(*cmd))
    return b"".join(parts)


def fuzz(buf, add_buf, max_size):
    """Main mutation function called by AFL++."""
    # Try to parse the input as RESP
    commands, ok = _parse_resp_commands(buf)

    if ok and commands:
        # Parsed successfully — mutate at command level
        mutated = _mutate_commands(commands)
        result = _commands_to_resp(mutated)
    else:
        # Could not parse — generate random commands from scratch
        n = random.randint(1, 5)
        result = b"".join(_random_command() for _ in range(n))

    if len(result) > max_size:
        result = result[:max_size]

    return bytearray(result)


def havoc_mutation(buf, max_size):
    """Called during havoc stage — single small mutation."""
    commands, ok = _parse_resp_commands(buf)
    if not ok or not commands:
        return bytearray(_random_command()[:max_size])

    # Single small mutation
    mutated = _mutate_commands(commands)
    result = _commands_to_resp(mutated)
    if len(result) > max_size:
        result = result[:max_size]
    return bytearray(result)


def havoc_mutation_probability():
    """How often our havoc_mutation is called vs AFL++'s built-in mutations."""
    return 50


================================================
FILE: fuzz/run_fuzzer.sh
================================================
#!/usr/bin/env bash

set -e

GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m'

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Target: "resp" (default) or "memcache"
TARGET="${1:-resp}"
BUILD_DIR="${BUILD_DIR:-$PROJECT_ROOT/build-dbg}"
FUZZ_DIR="$SCRIPT_DIR"
OUTPUT_DIR="${OUTPUT_DIR:-$FUZZ_DIR/artifacts/$TARGET}"
CORPUS_DIR="${CORPUS_DIR:-$FUZZ_DIR/corpus/$TARGET}"
SEEDS_DIR="${SEEDS_DIR:-$FUZZ_DIR/seeds/$TARGET}"
DICT_FILE="${DICT_FILE:-$FUZZ_DIR/dict/$TARGET.dict}"
TIMEOUT="5000"
FUZZ_TARGET="$BUILD_DIR/dragonfly"
AFL_PROACTOR_THREADS="${AFL_PROACTOR_THREADS:-1}"

# Persistent record: restart server every N iterations and record the last N inputs.
# This ensures that on crash, ALL inputs that built the current server state are available
# for replay. Without this, state from earlier iterations is lost and crashes become
# non-reproducible. Max recommended by AFL++: 10000.
AFL_LOOP_LIMIT="${AFL_LOOP_LIMIT:-10000}"

print_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

print_note() {
    echo -e "${BLUE}[NOTE]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

check_requirements() {
    if [[ ! -f "${FUZZ_TARGET}" ]]; then
        print_warning "Dragonfly not found at ${FUZZ_TARGET}"
        print_warning "Build with: -DUSE_AFL=ON"
        exit 1
    fi

    if [[ "$TARGET" != "resp" && "$TARGET" != "memcache" ]]; then
        print_warning "Unknown target: $TARGET (use 'resp' or 'memcache')"
        exit 1
    fi
}

setup_directories() {
    print_info "Setting up directories..."
    mkdir -p "${OUTPUT_DIR}"
    mkdir -p "${CORPUS_DIR}"

    if [[ -z "$(ls -A "$CORPUS_DIR" 2>/dev/null)" ]]; then
        if [[ -d "${SEEDS_DIR}" ]] && [[ -n "$(ls -A "${SEEDS_DIR}" 2>/dev/null)" ]]; then
            print_info "Copying seeds to corpus..."
            cp "${SEEDS_DIR}"/* "${CORPUS_DIR}/" 2>/dev/null || true
        else
            print_warning "No seeds found, creating minimal seed"
            if [[ "$TARGET" == "memcache" ]]; then
                printf 'version\r\n' > "${CORPUS_DIR}/version"
            else
                echo -e '*1\r\n$4\r\nPING\r\n' > "${CORPUS_DIR}/ping"
            fi
        fi
    fi
}

show_config() {
    echo ""
    print_info "AFL++ Persistent Mode Configuration:"
    echo "  Target:           ${TARGET}"
    echo "  Binary:           ${FUZZ_TARGET}"
    echo "  Corpus:           ${CORPUS_DIR}"
    echo "  Output:           ${OUTPUT_DIR}"
    echo "  Dictionary:       ${DICT_FILE}"
    echo "  Timeout:          ${TIMEOUT}ms"
    echo "  Proactor threads: ${AFL_PROACTOR_THREADS}"
    echo "  Loop limit:      ${AFL_LOOP_LIMIT} (= AFL_PERSISTENT_RECORD)"
    echo ""
    print_note "Fuzzing integrated in dragonfly (USE_AFL + persistent mode)"
    print_note "Usage: ./run_fuzzer.sh [resp|memcache]"
    print_note "To change proactor threads: export AFL_PROACTOR_THREADS=N (default: 1)"
    print_note "To change loop limit: export AFL_LOOP_LIMIT=N (default: 10000)"
    echo ""
}

run_fuzzer() {
    print_info "Starting AFL++ persistent mode fuzzing (target: $TARGET)..."
    print_info "Press Ctrl+C to stop"
    echo ""

    AFL_CMD=(
        afl-fuzz
        -o "${OUTPUT_DIR}"
        -t "${TIMEOUT}"
        -m 4096
        -i "${CORPUS_DIR}"
    )

    if [[ -f "${DICT_FILE}" ]]; then
        AFL_CMD+=(-x "${DICT_FILE}")
    fi

    AFL_CMD+=(
        --
        "${FUZZ_TARGET}"
        --port=6379
        --logtostderr
        --proactor_threads=${AFL_PROACTOR_THREADS}
        --afl_loop_limit=${AFL_LOOP_LIMIT}
        --bind=0.0.0.0
        --bind=::
        --dbfilename=""
        --omit_basic_usage
        --rename_command=SHUTDOWN=
        --rename_command=DEBUG=
        --rename_command=FLUSHALL=
        --rename_command=FLUSHDB=
        --max_bulk_len=1048576
    )

    if [[ "$TARGET" == "memcache" ]]; then
        AFL_CMD+=(--memcached_port=11211 --afl_target_port=11211)
    fi

    print_info "Running: ${AFL_CMD[*]}"
    echo ""

    cd "${OUTPUT_DIR}"

    # Run AFL++ - fuzzing integrated in dragonfly via USE_AFL
    # AFL_HANG_TMOUT: Only consider it a hang if no response for 60 seconds
    # This prevents false positives from slow but legitimate operations
    export AFL_HANG_TMOUT=60000

    # Dragonfly has ~350K edges, default AFL++ bitmap is 64KB (massive collisions).
    # Use 512KB bitmap to reduce hash collisions and improve stability.
    export AFL_MAP_SIZE=524288

    # Record the last N inputs before a crash for replay.
    # Synced with afl_loop_limit so the full server state history is always captured.
    export AFL_PERSISTENT_RECORD=${AFL_LOOP_LIMIT}

    # Even with 1 proactor thread, some coverage instability is expected.
    # Tell AFL++ to continue despite unstable coverage — don't bail on flaky edges.
    export AFL_IGNORE_PROBLEMS=1

    # More aggressive havoc mutations from the start — don't wait for deterministic
    # stages to finish. Useful for protocol fuzzing where random mutations find new paths.
    export AFL_EXPAND_HAVOC_NOW=1

    # Custom protocol mutator — mutates at command/argument level
    # instead of random bytes, keeping protocol framing valid.
    export PYTHONPATH="$FUZZ_DIR"
    if [[ "$TARGET" == "memcache" ]]; then
        export AFL_PYTHON_MODULE=memcache_mutator
    else
        export AFL_PYTHON_MODULE=resp_mutator
    fi

    exec "${AFL_CMD[@]}"
}

main() {
    check_requirements
    setup_directories
    show_config
    run_fuzzer
}

main "$@"


================================================
FILE: fuzz/seeds/memcache/add_replace.mc
================================================
set key1 0 0 3
abc
add key2 0 0 3
def
replace key1 0 0 3
xyz


================================================
FILE: fuzz/seeds/memcache/append_prepend.mc
================================================
set buf 0 0 5
hello
append buf 0 0 6
 world
prepend buf 0 0 4
say
get buf


================================================
FILE: fuzz/seeds/memcache/cas.mc
================================================
set mykey 0 0 3
abc
gets mykey
cas mykey 0 0 3 1
xyz


================================================
FILE: fuzz/seeds/memcache/delete.mc
================================================
set key1 0 0 1
a
set key2 0 0 1
b
delete key1
delete key2 noreply
get key1


================================================
FILE: fuzz/seeds/memcache/expiry.mc
================================================
set exp1 0 10 3
abc
set exp2 0 0 3
def
set exp3 0 9999999 3
ghi
get exp1 exp2 exp3


================================================
FILE: fuzz/seeds/memcache/flags.mc
================================================
set f1 0 0 3
abc
set f2 1 0 3
def
set f3 65535 0 3
ghi
set f4 4294967295 0 3
jkl
gets f1 f2 f3 f4


================================================
FILE: fuzz/seeds/memcache/flush.mc
================================================
set a 0 0 1
x
set b 0 0 1
y
flush_all
get a b


================================================
FILE: fuzz/seeds/memcache/gat.mc
================================================
set mykey 0 100 5
hello
gat 200 mykey
gats 300 mykey


================================================
FILE: fuzz/seeds/memcache/incr_decr.mc
================================================
set counter 0 0 1
0
incr counter 1
incr counter 10
decr counter 5
get counter


================================================
FILE: fuzz/seeds/memcache/large_value.mc
================================================
set big 0 0 100
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
get big


================================================
FILE: fuzz/seeds/memcache/meta_commands.mc
================================================
ms mykey 5
hello
mg mykey
md mykey
ma counter
mn


================================================
FILE: fuzz/seeds/memcache/multiget.mc
================================================
set k1 0 0 1
a
set k2 0 0 1
b
set k3 0 0 1
c
get k1 k2 k3
gets k1 k2 k3


================================================
FILE: fuzz/seeds/memcache/noreply.mc
================================================
set key1 0 0 3 noreply
abc
add key2 0 0 3 noreply
def
replace key1 0 0 3 noreply
xyz
incr counter 1 noreply
delete key2 noreply


================================================
FILE: fuzz/seeds/memcache/set_get.mc
================================================
set mykey 0 0 5
hello
get mykey


================================================
FILE: fuzz/seeds/memcache/stats_version.mc
================================================
stats
version
quit


================================================
FILE: fuzz/seeds/resp/acl.resp
================================================
*2
$3
ACL
$6
WHOAMI
*2
$3
ACL
$4
LIST


================================================
FILE: fuzz/seeds/resp/acl_ops.resp
================================================
*2
$3
ACL
$6
WHOAMI
*2
$3
ACL
$4
LIST
*2
$3
ACL
$5
USERS
*3
$3
ACL
$3
CAT
*2
$3
ACL
$7
GENPASS
*2
$7
COMMAND
$5
COUNT
*2
$7
COMMAND
$4
DOCS


================================================
FILE: fuzz/seeds/resp/acl_ops2.resp
================================================
*2
$3
ACL
$6
WHOAMI
*2
$3
ACL
$4
LIST
*2
$3
ACL
$5
USERS
*3
$3
ACL
$3
CAT
$6
string
*2
$3
ACL
$7
GENPASS
*4
$3
ACL
$7
SETUSER
$8
testuser
$2
on
*3
$3
ACL
$7
GETUSER
$8
testuser
*4
$3
ACL
$6
DRYRUN
$8
testuser
$3
GET
$3
key
*2
$3
ACL
$4
HELP
*3
$3
ACL
$3
LOG
$5
RESET
*3
$3
ACL
$7
DELUSER
$8
testuser
*2
$3
ACL
$4
SAVE
*2
$3
ACL
$4
LOAD
*3
$4
AUTH
$8
testuser
$8
password
*2
$7
COMMAND
$5
COUNT
*3
$7
COMMAND
$4
INFO
$3
GET
*2
$7
COMMAND
$4
DOCS
$3
SET
*2
$7
COMMAND
$4
LIST


================================================
FILE: fuzz/seeds/resp/bf_add.resp
================================================
*4
$10
BF.RESERVE
$7
mybloom
$4
0.01
$4
1000
*3
$6
BF.ADD
$7
mybloom
$5
item1
*3
$9
BF.EXISTS
$7
mybloom
$5
item1


================================================
FILE: fuzz/seeds/resp/bitfield.resp
================================================
*6
$8
BITFIELD
$3
key
$3
GET
$2
u8
$1
0


================================================
FILE: fuzz/seeds/resp/bitfield_ops.resp
================================================
*8
$8
BITFIELD
$2
bk
$3
SET
$3
u8
$1
0
$3
200
$3
GET
$3
u8
$1
0
*5
$8
BITFIELD
$2
bk
$6
INCRBY
$3
u8
$1
0
$2
10
*6
$8
BITFIELD
$2
bk
$8
OVERFLOW
$3
SAT
$6
INCRBY
$3
u8
$1
0
$3
100
*5
$5
BITOP
$3
AND
$4
dest
$2
bk
$2
bk
*3
$6
BITPOS
$2
bk
$1
1


================================================
FILE: fuzz/seeds/resp/bitops.resp
================================================
*4
$6
SETBIT
$2
bk
$1
7
$1
1
*3
$6
GETBIT
$2
bk
$1
0
*2
$8
BITCOUNT
$2
bk


================================================
FILE: fuzz/seeds/resp/bloom_ops.resp
================================================
*4
$10
BF.RESERVE
$2
bf
$4
0.01
$4
1000
*3
$6
BF.ADD
$2
bf
$5
item1
*3
$9
BF.EXISTS
$2
bf
$5
item1
*5
$7
BF.MADD
$2
bf
$5
item2
$5
item3
$5
item4
*5
$10
BF.MEXISTS
$2
bf
$5
item1
$5
item2
$5
itemX


================================================
FILE: fuzz/seeds/resp/client.resp
================================================
*3
$6
CLIENT
$7
SETNAME
$10
testclient
*2
$6
CLIENT
$7
GETNAME
*2
$6
CLIENT
$4
LIST


================================================
FILE: fuzz/seeds/resp/config.resp
================================================
*3
$6
CONFIG
$3
GET
$9
maxmemory


================================================
FILE: fuzz/seeds/resp/copy.resp
================================================
*3
$3
SET
$3
src
$5
hello
*3
$4
COPY
$3
src
$3
dst


================================================
FILE: fuzz/seeds/resp/del.resp
================================================
*2
$3
DEL
$3
key


================================================
FILE: fuzz/seeds/resp/eval.resp
================================================
*3
$4
EVAL
$26
return redis.call("PING")
$0


================================================
FILE: fuzz/seeds/resp/expire_ops.resp
================================================
*3
$3
SET
$2
ek
$3
val
*3
$6
EXPIRE
$2
ek
$3
300
*2
$3
TTL
$2
ek
*2
$4
PTTL
$2
ek
*2
$10
EXPIRETIME
$2
ek
*2
$11
PEXPIRETIME
$2
ek
*3
$8
EXPIREAT
$2
ek
$10
9999999999
*2
$7
PERSIST
$2
ek
*3
$7
PEXPIRE
$2
ek
$6
300000
*3
$9
PEXPIREAT
$2
ek
$13
9999999999000
*2
$5
TOUCH
$2
ek


================================================
FILE: fuzz/seeds/resp/function.resp
================================================
*3
$8
FUNCTION
$4
LOAD
$56
#!lua name=mylib
redis.register_function('myfunc', function() return 1 end)
*2
$8
FUNCTION
$4
LIST


================================================
FILE: fuzz/seeds/resp/function_ops.resp
================================================
*3
$8
FUNCTION
$4
LOAD
$56
#!lua name=mylib
redis.register_function('myfunc', function() return 1 end)
*2
$8
FUNCTION
$4
LIST
*3
$8
FUNCTION
$6
DELETE
$5
mylib


================================================
FILE: fuzz/seeds/resp/generic_ops.resp
================================================
*3
$3
SET
$2
gk
$3
val
*2
$4
TYPE
$2
gk
*2
$6
EXISTS
$2
gk
*3
$6
EXPIRE
$2
gk
$3
300
*2
$3
TTL
$2
gk
*2
$4
PTTL
$2
gk
*2
$10
EXPIRETIME
$2
gk
*3
$7
PEXPIRE
$2
gk
$6
300000
*2
$11
PEXPIRETIME
$2
gk
*2
$7
PERSIST
$2
gk
*3
$4
COPY
$2
gk
$3
gk2
*3
$6
RENAME
$3
gk2
$3
gk3
*2
$4
DUMP
$3
gk3
*2
$6
UNLINK
$3
gk3
*2
$4
KEYS
$1
*
*3
$4
SCAN
$1
0
$5
COUNT
$1
5
*2
$9
RANDOMKEY
*2
$6
DBSIZE
*2
$4
TIME
*3
$6
SELECT
$1
0
*5
$4
SORT
$2
gk
$2
BY
$6
nosort
$5
ALPHA


================================================
FILE: fuzz/seeds/resp/generic_ops2.resp
================================================
*3
$3
SET
$3
gk1
$3
val
*3
$3
SET
$3
gk2
$3
val
*2
$2
DEL
$3
gk1
*2
$3
GET
$3
gk2
*2
$3
TTL
$3
gk2
*3
$8
RENAMENX
$3
gk2
$3
gk3
*2
$4
ECHO
$5
hello
*3
$5
STICK
$3
gk3
*2
$5
TOUCH
$3
gk3
*2
$4
TYPE
$3
gk3
*3
$4
MOVE
$3
gk3
$1
1
*2
$7
SORT_RO
$3
gk3
*3
$3
SET
$3
gk4
$3
val
*4
$7
RESTORE
$3
gk5
$1
0
$5
dummy


================================================
FILE: fuzz/seeds/resp/geo_ops.resp
================================================
*8
$6
GEOADD
$2
gk
$9
13.361389
$9
38.115556
$7
Palermo
$9
15.087269
$9
37.502669
$7
Catania
*5
$7
GEODIST
$2
gk
$7
Palermo
$7
Catania
$2
km
*3
$7
GEOHASH
$2
gk
$7
Palermo
*3
$6
GEOPOS
$2
gk
$7
Palermo
*7
$9
GEOSEARCH
$2
gk
$9
FROMLONLAT
$2
15
$2
37
$6
BYRADIUS
$3
200
$2
km
*6
$10
GEORADIUS
$2
gk
$2
15
$2
37
$3
200
$2
km


================================================
FILE: fuzz/seeds/resp/geo_ops2.resp
================================================
*11
$6
GEOADD
$3
gx1
$9
13.361389
$9
38.115556
$7
Palermo
$9
15.087269
$9
37.502669
$7
Catania
$9
2.349014
$9
48.864716
$5
Paris
*7
$10
GEORADIUS
$3
gx1
$2
15
$2
37
$3
200
$2
km
*6
$19
GEORADIUSBYMEMBER
$3
gx1
$7
Palermo
$3
200
$2
km
*7
$13
GEORADIUS_RO
$3
gx1
$2
15
$2
37
$3
200
$2
km
*6
$22
GEORADIUSBYMEMBER_RO
$3
gx1
$7
Palermo
$3
200
$2
km
*9
$9
GEOSEARCH
$3
gx1
$10
FROMLONLAT
$2
15
$2
37
$6
BYRADIUS
$3
200
$2
km
$3
ASC
*10
$14
GEOSEARCHSTORE
$4
gdst
$3
gx1
$10
FROMLONLAT
$2
15
$2
37
$6
BYRADIUS
$3
200
$2
km


================================================
FILE: fuzz/seeds/resp/geoadd.resp
================================================
*5
$6
GEOADD
$5
mygeo
$9
13.361389
$9
38.115556
$7
Palermo
*5
$7
GEODIST
$5
mygeo
$7
Palermo
$7
Catania
$2
km


================================================
FILE: fuzz/seeds/resp/get.resp
================================================
*2
$3
GET
$3
key


================================================
FILE: fuzz/seeds/resp/getdel.resp
================================================
*3
$3
SET
$1
k
$1
v
*2
$6
GETDEL
$1
k


================================================
FILE: fuzz/seeds/resp/hash_ops.resp
================================================
*8
$4
HSET
$2
hh
$2
f1
$2
v1
$2
f2
$2
v2
$2
f3
$2
10
*3
$4
HGET
$2
hh
$2
f1
*4
$5
HMGET
$2
hh
$2
f1
$2
f2
*2
$7
HGETALL
$2
hh
*2
$5
HKEYS
$2
hh
*2
$5
HVALS
$2
hh
*2
$4
HLEN
$2
hh
*3
$7
HEXISTS
$2
hh
$2
f1
*3
$7
HSTRLEN
$2
hh
$2
f1
*4
$7
HINCRBY
$2
hh
$2
f3
$1
5
*4
$12
HINCRBYFLOAT
$2
hh
$2
f3
$3
1.5
*3
$10
HRANDFIELD
$2
hh
$1
2
*4
$6
HSETNX
$2
hh
$4
newf
$4
newv
*3
$4
HDEL
$2
hh
$2
f2
*3
$5
HSCAN
$2
hh
$1
0


================================================
FILE: fuzz/seeds/resp/hash_ops2.resp
================================================
*6
$4
HSET
$3
hx1
$2
f1
$2
v1
$2
f2
$2
v2
*4
$5
HMSET
$3
hx1
$2
f3
$2
v3
*4
$6
HSETNX
$3
hx1
$6
newkey
$5
newvl
*4
$7
HSTRLEN
$3
hx1
$2
f1
*3
$12
HINCRBYFLOAT
$3
hx1
$2
f1
$3
1.5
*3
$9
HRANDFIELD
$3
hx1
$1
2
*5
$6
HSETEX
$3
hx1
$3
300
$2
f4
$2
v4
*4
$7
HEXPIRE
$3
hx1
$3
300
$2
f4


================================================
FILE: fuzz/seeds/resp/hll_ops.resp
================================================
*5
$5
PFADD
$4
hll1
$1
a
$1
b
$1
c
*4
$5
PFADD
$4
hll2
$1
c
$1
d
$1
e
*2
$7
PFCOUNT
$4
hll1
*3
$7
PFCOUNT
$4
hll1
$4
hll2
*4
$7
PFMERGE
$4
hll3
$4
hll1
$4
hll2


================================================
FILE: fuzz/seeds/resp/hset.resp
================================================
*4
$4
HSET
$4
hash
$5
field
$5
value


================================================
FILE: fuzz/seeds/resp/json.resp
================================================
*4
$8
JSON.SET
$3
doc
$1
$
$15
{"name":"test"}


================================================
FILE: fuzz/seeds/resp/json_ops.resp
================================================
*4
$8
JSON.SET
$2
jk
$1
$
$52
{"name":"test","age":30,"tags":["a","b"],"nested":{"x":1}}
*3
$8
JSON.GET
$2
jk
$1
$
*3
$9
JSON.TYPE
$2
jk
$1
$
*3
$10
JSON.STRLEN
$2
jk
$6
$.name
*3
$11
JSON.OBJLEN
$2
jk
$1
$
*3
$11
JSON.OBJKEYS
$2
jk
$1
$
*3
$10
JSON.ARRLEN
$2
jk
$6
$.tags
*4
$13
JSON.ARRAPPEND
$2
jk
$6
$.tags
$3
"c"
*5
$13
JSON.ARRINSERT
$2
jk
$6
$.tags
$1
0
$3
"z"
*4
$11
JSON.ARRPOP
$2
jk
$6
$.tags
$2
-1
*5
$12
JSON.ARRTRIM
$2
jk
$6
$.tags
$1
0
$1
2
*4
$12
JSON.ARRINDEX
$2
jk
$6
$.tags
$3
"a"
*3
$14
JSON.NUMINCRBY
$2
jk
$5
$.age
$1
1
*3
$14
JSON.NUMMULTBY
$2
jk
$5
$.age
$1
2
*4
$12
JSON.STRAPPEND
$2
jk
$6
$.name
$4
"_x"
*3
$11
JSON.TOGGLE
$2
jk
$6
$.tags
*3
$10
JSON.CLEAR
$2
jk
$6
$.tags
*3
$8
JSON.DEL
$2
jk
$8
$.nested
*3
$9
JSON.RESP
$2
jk
$1
$


================================================
FILE: fuzz/seeds/resp/json_ops2.resp
================================================
*4
$8
JSON.SET
$3
jm1
$1
$
$13
{"a":1,"b":2}
*4
$8
JSON.SET
$3
jm2
$1
$
$13
{"a":3,"c":4}
*3
$9
JSON.MGET
$3
jm1
$3
jm2
$1
$
*4
$9
JSON.MSET
$3
jm1
$3
$.a
$1
9
*4
$10
JSON.MERGE
$3
jm1
$1
$
$9
{"d":"new"}
*3
$10
JSON.DEBUG
$6
MEMORY
$3
jm1
$1
$
*3
$10
JSON.FORGET
$3
jm2
$3
$.c


================================================
FILE: fuzz/seeds/resp/list_blocking.resp
================================================
*5
$5
RPUSH
$3
lb1
$1
a
$1
b
$1
c
*5
$5
RPUSH
$3
lb2
$1
x
$1
y
$1
z
*3
$10
RPOPLPUSH
$3
lb1
$3
lb2
*5
$5
LMOVE
$3
lb1
$3
lb2
$4
LEFT
$5
RIGHT
*4
$5
LMPOP
$1
2
$3
lb1
$3
lb2
$4
LEFT
*4
$5
LPUSH
$3
bq1
$1
1
$1
2
*3
$5
BLPOP
$3
bq1
$1
1
*3
$5
BRPOP
$3
bq1
$1
1
*5
$6
BLMOVE
$3
lb1
$3
lb2
$4
LEFT
$5
RIGHT
$1
1
*5
$6
BLMPOP
$1
1
$1
1
$3
lb1
$4
LEFT


================================================
FILE: fuzz/seeds/resp/list_ops.resp
================================================
*5
$5
RPUSH
$2
ll
$1
a
$1
b
$1
c
*3
$6
LPUSHX
$2
ll
$1
x
*3
$6
RPUSHX
$2
ll
$1
z
*2
$4
LLEN
$2
ll
*4
$6
LRANGE
$2
ll
$1
0
$2
-1
*3
$6
LINDEX
$2
ll
$1
2
*5
$7
LINSERT
$2
ll
$6
BEFORE
$1
b
$4
new1
*4
$4
LSET
$2
ll
$1
0
$4
head
*4
$5
LTRIM
$2
ll
$1
0
$1
4
*4
$4
LREM
$2
ll
$1
1
$1
a
*2
$4
LPOP
$2
ll
*2
$4
RPOP
$2
ll
*5
$5
RPUSH
$2
l2
$1
1
$1
2
$1
3
*4
$5
LMOVE
$2
ll
$2
l2
$4
LEFT


================================================
FILE: fuzz/seeds/resp/lmpop.resp
================================================
*5
$5
RPUSH
$6
mylist
$1
a
$1
b
$1
c
*4
$5
LMPOP
$1
1
$6
mylist
$4
LEFT


================================================
FILE: fuzz/seeds/resp/lpos.resp
================================================
*7
$5
RPUSH
$6
mylist
$1
a
$1
b
$1
c
$1
a
$1
d
*3
$4
LPOS
$6
mylist
$1
a


================================================
FILE: fuzz/seeds/resp/lpush.resp
================================================
*3
$5
LPUSH
$4
list
$4
item


================================================
FILE: fuzz/seeds/resp/memory.resp
================================================
*3
$3
SET
$5
mykey
$9
somevalue
*3
$6
MEMORY
$5
USAGE
$5
mykey


================================================
FILE: fuzz/seeds/resp/monitor.resp
================================================
*1
$7
MONITOR


================================================
FILE: fuzz/seeds/resp/mset.resp
================================================
*5
$4
MSET
$1
a
$1
1
$1
b
$1
2
*3
$4
MGET
$1
a
$1
b


================================================
FILE: fuzz/seeds/resp/multi_type_pipeline.resp
================================================
*3
$3
SET
$2
pk
$5
hello
*5
$5
RPUSH
$2
pl
$1
a
$1
b
$1
c
*4
$4
HSET
$2
ph
$1
f
$1
v
*4
$4
SADD
$2
ps
$1
x
$1
y
*6
$4
ZADD
$2
pz
$1
1
$1
a
$1
2
$1
b
*5
$4
XADD
$2
px
$1
*
$1
k
$1
v
*4
$8
JSON.SET
$2
pj
$1
$
$13
{"a":1,"b":2}
*2
$4
TYPE
$2
pk
*2
$4
TYPE
$2
pl
*2
$4
TYPE
$2
ph
*2
$4
TYPE
$2
ps
*2
$4
TYPE
$2
pz
*2
$4
TYPE
$2
px
*8
$3
DEL
$2
pk
$2
pl
$2
ph
$2
ps
$2
pz
$2
px
$2
pj


================================================
FILE: fuzz/seeds/resp/object.resp
================================================
*3
$3
SET
$5
mykey
$3
val
*3
$6
OBJECT
$8
ENCODING
$5
mykey


================================================
FILE: fuzz/seeds/resp/pfadd.resp
================================================
*5
$5
PFADD
$4
hll1
$1
a
$1
b
$1
c
*2
$7
PFCOUNT
$4
hll1


================================================
FILE: fuzz/seeds/resp/ping.resp
================================================
*1
$4
PING


================================================
FILE: fuzz/seeds/resp/pipeline.resp
================================================
*1
$4
PING
*3
$3
SET
$1
a
$1
1
*2
$4
INCR
$1
a
*2
$3
GET
$1
a
*2
$3
DEL
$1
a


================================================
FILE: fuzz/seeds/resp/pubsub_ops.resp
================================================
*3
$7
PUBLISH
$4
chan
$5
hello
*2
$6
PUBSUB
$8
CHANNELS
*3
$6
PUBSUB
$6
NUMSUB
$4
chan


================================================
FILE: fuzz/seeds/resp/pubsub_ops2.resp
================================================
*3
$7
PUBLISH
$5
chan1
$3
msg
*3
$7
PUBLISH
$5
chan2
$4
msg2
*2
$6
PUBSUB
$8
CHANNELS
*3
$6
PUBSUB
$6
NUMSUB
$5
chan1
*2
$6
PUBSUB
$8
NUMPAT
*2
$9
SUBSCRIBE
$5
chan1
*2
$11
UNSUBSCRIBE
$5
chan1
*2
$10
PSUBSCRIBE
$5
chan*
*2
$12
PUNSUBSCRIBE
$5
chan*
*2
$10
SSUBSCRIBE
$5
chan1
*3
$8
SPUBLISH
$5
chan1
$4
smsg


================================================
FILE: fuzz/seeds/resp/rename.resp
================================================
*3
$3
SET
$3
foo
$5
hello
*3
$6
RENAME
$3
foo
$3
bar


================================================
FILE: fuzz/seeds/resp/rpoplpush.resp
================================================
*3
$5
LPUSH
$3
src
$1
a
*3
$5
LPUSH
$3
src
$1
b
*3
$9
RPOPLPUSH
$3
src
$3
dst


================================================
FILE: fuzz/seeds/resp/sadd.resp
================================================
*3
$4
SADD
$3
set
$6
member


================================================
FILE: fuzz/seeds/resp/scan_hscan.resp
================================================
*6
$4
HSET
$1
h
$2
f1
$2
v1
$2
f2
$2
v2
*3
$5
HSCAN
$1
h
$1
0


================================================
FILE: fuzz/seeds/resp/script_ops.resp
================================================
*3
$4
EVAL
$28
return redis.call('PING')
$1
0
*4
$4
EVAL
$44
return redis.call('SET', KEYS[1], ARGV[1])
$1
1
$2
ek
$2
ev
*4
$7
EVAL_RO
$37
return redis.call('GET', KEYS[1])
$1
1
$2
ek
*2
$6
SCRIPT
$5
FLUSH


================================================
FILE: fuzz/seeds/resp/script_ops2.resp
================================================
*4
$4
EVAL
$44
return redis.call('SET', KEYS[1], ARGV[1])
$1
1
$3
esk
$5
esval
*4
$7
EVAL_RO
$37
return redis.call('GET', KEYS[1])
$1
1
$3
esk
*3
$6
SCRIPT
$5
FLUSH
$5
ASYNC
*3
$6
SCRIPT
$6
EXISTS
$40
e0e1f9fabfc9d4800c877a703b823ac0578ff831
*4
$8
EVALSHA
$40
e0e1f9fabfc9d4800c877a703b823ac0578ff831
$1
0
*4
$11
EVALSHA_RO
$40
e0e1f9fabfc9d4800c877a703b823ac0578ff831
$1
0


================================================
FILE: fuzz/seeds/resp/sdiffstore.resp
================================================
*4
$4
SADD
$2
s1
$1
a
$1
b
*3
$4
SADD
$2
s2
$1
b
*4
$10
SDIFFSTORE
$3
dst
$2
s1
$2
s2


================================================
FILE: fuzz/seeds/resp/search_ops.resp
================================================
*8
$9
FT.CREATE
$5
myidx
$2
ON
$4
HASH
$6
SCHEMA
$5
title
$4
TEXT
$5
score
$7
NUMERIC
*3
$7
FT.INFO
$5
myidx
*8
$4
HSET
$4
doc1
$5
title
$5
hello
$5
score
$1
1
*8
$4
HSET
$4
doc2
$5
title
$5
world
$5
score
$1
2
*3
$9
FT.SEARCH
$5
myidx
$5
hello
*5
$9
FT.SEARCH
$5
myidx
$1
*
$5
LIMIT
$1
0
$1
5
*2
$8
FT._LIST
*3
$12
FT.DROPINDEX
$5
myidx


================================================
FILE: fuzz/seeds/resp/search_ops2.resp
================================================
*8
$9
FT.CREATE
$5
idx2
$2
ON
$4
HASH
$6
PREFIX
$1
1
$4
doc:
$6
SCHEMA
$5
title
$4
TEXT
$5
score
$7
NUMERIC
*4
$4
HSET
$5
doc:1
$5
title
$5
hello
$5
score
$1
1
*4
$4
HSET
$5
doc:2
$5
title
$5
world
$5
score
$1
2
*3
$9
FT.SEARCH
$5
idx2
$5
hello
*7
$9
FT.SEARCH
$5
idx2
$1
*
$6
SORTBY
$5
score
$5
LIMIT
$1
0
$1
1
*2
$7
FT.INFO
$5
idx2
*5
$8
FT.ALTER
$5
idx2
$6
SCHEMA
$3
ADD
$3
tag
$3
TAG
*3
$9
FT.CONFIG
$3
GET
$1
*
*3
$9
FT.CONFIG
$3
SET
$14
MAXSEARCHRESULTS
$5
10000
*6
$12
FT.SYNUPDATE
$5
idx2
$2
g1
$5
hello
$2
hi
$3
hey
*2
$10
FT.SYNDUMP
$5
idx2
*3
$12
FT.AGGREGATE
$5
idx2
$1
*
*2
$10
FT.TAGVALS
$5
idx2
$3
tag
*2
$12
FT.DROPINDEX
$5
idx2


================================================
FILE: fuzz/seeds/resp/server_ops.resp
================================================
*2
$4
INFO
$6
server
*2
$4
INFO
$6
memory
*2
$4
INFO
$11
replication
*1
$6
DBSIZE
*3
$6
CLIENT
$7
SETNAME
$4
fuzz
*2
$6
CLIENT
$7
GETNAME
*2
$6
CLIENT
$2
ID
*2
$6
CLIENT
$4
INFO
*3
$6
CONFIG
$3
GET
$9
maxmemory
*2
$4
ROLE
*2
$7
LASTSAVE
*3
$6
MEMORY
$5
USAGE
$4
nokey
*2
$7
SLOWLOG
$3
LEN
*2
$7
LATENCY
$6
LATEST
*3
$5
HELLO
$1
2


================================================
FILE: fuzz/seeds/resp/server_ops2.resp
================================================
*2
$4
INFO
$3
all
*2
$6
CLIENT
$4
LIST
*3
$6
CLIENT
$4
INFO
*2
$7
CLUSTER
$4
INFO
*2
$7
CLUSTER
$5
MYID
*2
$7
CLUSTER
$5
SLOTS
*1
$8
READONLY
*1
$9
READWRITE
*2
$7
SLOWLOG
$3
GET
*2
$7
LATENCY
$7
HISTORY
$5
event
*2
$6
MEMORY
$6
DOCTOR
*2
$6
MEMORY
$5
STATS
*3
$5
HELLO
$1
3
*4
$4
DFLY
$7
CLUSTER
$6
CONFIG
$2
{}
*2
$1
QUIT


================================================
FILE: fuzz/seeds/resp/set.resp
================================================
*3
$3
SET
$3
key
$5
value


================================================
FILE: fuzz/seeds/resp/set_ops.resp
================================================
*6
$4
SADD
$2
s1
$1
a
$1
b
$1
c
$1
d
*5
$4
SADD
$2
s2
$1
c
$1
d
$1
e
*2
$5
SCARD
$2
s1
*2
$8
SMEMBERS
$2
s1
*3
$9
SISMEMBER
$2
s1
$1
a
*4
$10
SMISMEMBER
$2
s1
$1
a
$1
z
*3
$4
SREM
$2
s1
$1
d
*3
$5
SMOVE
$2
s1
$2
s2
$1
a
*3
$6
SUNION
$2
s1
$2
s2
*3
$5
SINTER
$2
s1
$2
s2
*3
$5
SDIFF
$2
s1
$2
s2
*4
$11
SUNIONSTORE
$4
sdst
$2
s1
$2
s2
*4
$11
SINTERSTORE
$4
idst
$2
s1
$2
s2
*4
$10
SDIFFSTORE
$4
ddst
$2
s1
$2
s2
*4
$10
SINTERCARD
$1
2
$2
s1
$2
s2
*3
$4
SPOP
$2
s1
$1
1
*3
$5
SSCAN
$2
s2
$1
0


================================================
FILE: fuzz/seeds/resp/set_ops2.resp
================================================
*4
$4
SADD
$3
sx1
$1
a
$1
b
*4
$6
SADDEX
$3
sx1
$3
300
$1
c


================================================
FILE: fuzz/seeds/resp/smove.resp
================================================
*3
$4
SADD
$3
src
$1
a
*3
$4
SADD
$3
dst
$1
b
*4
$5
SMOVE
$3
src
$3
dst
$1
a


================================================
FILE: fuzz/seeds/resp/sort.resp
================================================
*4
$5
LPUSH
$4
list
$1
3
$1
1
*3
$5
LPUSH
$4
list
$1
2
*4
$4
SORT
$4
list
$5
STORE
$6
sorted


================================================
FILE: fuzz/seeds/resp/srandmember.resp
================================================
*7
$4
SADD
$5
myset
$1
a
$1
b
$1
c
$1
d
$1
e
*3
$11
SRANDMEMBER
$5
myset
$1
3


================================================
FILE: fuzz/seeds/resp/stream_ops.resp
================================================
*5
$4
XADD
$2
st
$1
*
$1
k
$1
v
*5
$4
XADD
$2
st
$1
*
$1
k
$2
v2
*5
$4
XADD
$2
st
$1
*
$1
k
$2
v3
*2
$4
XLEN
$2
st
*4
$6
XRANGE
$2
st
$1
-
$1
+
*4
$9
XREVRANGE
$2
st
$1
+
$1
-
*4
$5
XTRIM
$2
st
$6
MAXLEN
$1
2
*4
$6
XGROUP
$6
CREATE
$2
st
$2
g1
$1
0
*7
$10
XREADGROUP
$5
GROUP
$2
g1
$2
c1
$7
STREAMS
$2
st
$1
>
*4
$4
XACK
$2
st
$2
g1
$3
0-1
*4
$8
XPENDING
$2
st
$2
g1
$1
-
$1
+
$2
10
*4
$5
XINFO
$6
STREAM
$2
st
*3
$6
XSETID
$2
st
$3
0-5


================================================
FILE: fuzz/seeds/resp/stream_ops2.resp
================================================
*5
$4
XADD
$3
sx1
$1
*
$1
k
$2
v1
*5
$4
XADD
$3
sx1
$1
*
$1
k
$2
v2
*5
$4
XADD
$3
sx1
$1
*
$1
k
$2
v3
*4
$6
XGROUP
$6
CREATE
$3
sx1
$3
sg1
$1
0
*7
$10
XREADGROUP
$5
GROUP
$3
sg1
$2
c1
$7
STREAMS
$3
sx1
$1
>
*5
$6
XCLAIM
$3
sx1
$3
sg1
$2
c1
$1
0
$3
0-1
*6
$10
XAUTOCLAIM
$3
sx1
$3
sg1
$2
c1
$1
0
$3
0-0
*3
$4
XDEL
$3
sx1
$3
0-1


================================================
FILE: fuzz/seeds/resp/string_ops.resp
================================================
*3
$3
SET
$2
sk
$5
hello
*3
$6
APPEND
$2
sk
$6
_world
*2
$6
STRLEN
$2
sk
*4
$8
GETRANGE
$2
sk
$1
0
$1
4
*4
$8
SETRANGE
$2
sk
$1
6
$3
foo
*3
$5
SETEX
$3
sk2
$2
60
$4
temp
*3
$6
PSETEX
$3
sk3
$5
60000
$4
temp
*3
$5
SETNX
$3
sk4
$3
new
*3
$6
GETSET
$2
sk
$3
old
*6
$4
MSET
$2
m1
$2
v1
$2
m2
$2
v2
*3
$4
MGET
$2
m1
$2
m2
*3
$3
SET
$2
ci
$1
0
*2
$4
INCR
$2
ci
*2
$4
DECR
$2
ci
*3
$6
INCRBY
$2
ci
$2
10
*3
$6
DECRBY
$2
ci
$1
5
*3
$12
INCRBYFLOAT
$2
ci
$3
1.5
*2
$6
GETDEL
$2
m2
*4
$5
GETEX
$2
m1
$2
EX
$2
60


================================================
FILE: fuzz/seeds/resp/string_ops2.resp
================================================
*4
$5
MSETNX
$2
nx1
$2
v1
$2
nx2
$2
v2
*3
$7
PREPEND
$2
nx1
$3
pre
*3
$6
SUBSTR
$2
nx1
$1
0
$1
3
*2
$6
DIGEST
$2
nx1
*4
$5
SETEX
$2
sx
$1
3
$3
val
*4
$6
PSETEX
$2
px
$4
3000
$3
val
*3
$5
GETEX
$2
sx
$2
EX
$1
5
*3
$6
APPEND
$2
sx
$4
_end
*3
$8
SETRANGE
$2
sx
$1
0
$3
NEW
*2
$6
GETDEL
$2
px


================================================
FILE: fuzz/seeds/resp/subscribe.resp
================================================
*2
$9
SUBSCRIBE
$9
mychannel


================================================
FILE: fuzz/seeds/resp/throttle.resp
================================================
*6
$11
CL.THROTTLE
$6
myrate
$2
10
$2
30
$2
60
$1
1


================================================
FILE: fuzz/seeds/resp/transaction.resp
================================================
*1
$5
MULTI
*3
$3
SET
$1
a
$1
1
*1
$4
EXEC


================================================
FILE: fuzz/seeds/resp/transaction_ops2.resp
================================================
*3
$3
SET
$2
tk
$3
val
*1
$5
WATCH
$2
tk
*1
$5
MULTI
*3
$3
SET
$2
tk
$4
new1
*1
$7
DISCARD
*1
$7
UNWATCH
*1
$5
MULTI
*3
$3
SET
$2
tk
$4
new2
*1
$4
EXEC


================================================
FILE: fuzz/seeds/resp/watch.resp
================================================
*2
$5
WATCH
$1
a
*1
$5
MULTI
*3
$3
SET
$1
a
$1
1
*1
$4
EXEC


================================================
FILE: fuzz/seeds/resp/watch_multi.resp
================================================
*2
$5
WATCH
$1
k
*1
$5
MULTI
*3
$3
SET
$1
k
$1
1
*1
$4
EXEC


================================================
FILE: fuzz/seeds/resp/xadd.resp
================================================
*5
$4
XADD
$6
stream
$1
*
$5
field
$5
value


================================================
FILE: fuzz/seeds/resp/xread.resp
================================================
*5
$5
XREAD
$5
COUNT
$1
1
$7
STREAMS
$6
stream
$1
0


================================================
FILE: fuzz/seeds/resp/zadd.resp
================================================
*5
$4
ZADD
$4
zset
$1
1
$6
member


================================================
FILE: fuzz/seeds/resp/zmpop.resp
================================================
*8
$4
ZADD
$5
myzst
$1
1
$1
a
$1
2
$1
b
$1
3
$1
c
*4
$5
ZMPOP
$1
1
$5
myzst
$3
MIN


================================================
FILE: fuzz/seeds/resp/zrangebyscore.resp
================================================
*5
$13
ZRANGEBYSCORE
$4
zset
$4
-inf
$4
+inf
$10
WITHSCORES


================================================
FILE: fuzz/seeds/resp/zset_ops.resp
================================================
*8
$4
ZADD
$2
z1
$1
1
$1
a
$1
2
$1
b
$1
3
$1
c
*6
$4
ZADD
$2
z2
$1
2
$1
b
$1
4
$1
d
*3
$7
ZINCRBY
$2
z1
$1
5
$1
a
*3
$5
ZSCORE
$2
z1
$1
a
*4
$7
ZMSCORE
$2
z1
$1
a
$1
c
*2
$5
ZCARD
$2
z1
*4
$6
ZCOUNT
$2
z1
$4
-inf
$4
+inf
*3
$5
ZRANK
$2
z1
$1
b
*3
$8
ZREVRANK
$2
z1
$1
b
*4
$6
ZRANGE
$2
z1
$1
0
$2
-1
*4
$9
ZREVRANGE
$2
z1
$1
0
$2
-1
*4
$12
ZRANGEBYLEX
$2
z1
$1
-
$1
+
*5
$12
ZRANGEBYSCORE
$2
z1
$1
1
$1
3
$10
WITHSCORES
*4
$15
ZREMRANGEBYRANK
$2
z2
$1
0
$1
0
*4
$16
ZREMRANGEBYSCORE
$2
z2
$1
0
$1
2
*3
$7
ZPOPMIN
$2
z1
$1
1
*3
$7
ZPOPMAX
$2
z1
$1
1
*3
$6
ZUNION
$1
2
$2
z1
$2
z2
*4
$11
ZUNIONSTORE
$4
zdst
$1
2
$2
z1
*3
$5
ZSCAN
$2
z1
$1
0
*3
$11
ZRANDMEMBER
$2
z1
$1
2


================================================
FILE: fuzz/seeds/resp/zset_ops2.resp
================================================
*8
$4
ZADD
$3
za1
$1
1
$1
a
$1
2
$1
b
$1
3
$1
c
*8
$4
ZADD
$3
za2
$1
2
$1
b
$1
4
$1
d
$1
5
$1
e
*4
$6
ZINTER
$1
2
$3
za1
$3
za2
*4
$11
ZINTERSTORE
$4
zint
$1
2
$3
za1
$3
za2
*5
$10
ZINTERCARD
$1
2
$3
za1
$3
za2
*4
$5
ZDIFF
$1
2
$3
za1
$3
za2
*4
$10
ZDIFFSTORE
$5
zdiff
$1
2
$3
za1
*3
$4
ZREM
$3
za2
$1
d
*4
$14
ZREMRANGEBYLEX
$3
za1
$3
[a]
$3
[b]
*6
$11
ZRANGESTORE
$5
zrngs
$3
za1
$1
0
$2
-1
$7
BYSCORE
*4
$9
ZLEXCOUNT
$3
za1
$1
-
$1
+
*6
$15
ZREVRANGEBYSCORE
$3
za1
$4
+inf
$4
-inf
$10
WITHSCORES
$5
LIMIT
$1
0
$1
2
*4
$13
ZREVRANGEBYLEX
$3
za1
$1
+
$1
-


================================================
FILE: fuzz/triage_crashes.sh
================================================
#!/usr/bin/env bash
# Triage AFL++ crash artifacts: replay each crash against a fresh Dragonfly
# instance and report whether it's confirmed or a false positive.
#
# Usage:
#   ./fuzz/triage_crashes.sh <dragonfly_binary> <mode> <crashes.zip>
#
#   dragonfly_binary  Path to Dragonfly binary
#   mode              Protocol: 'resp' or 'memcache'
#   crashes.zip       .zip downloaded from CI artifacts (contains crash-*.tar.gz files)
#
# Examples:
#   ./fuzz/triage_crashes.sh ./build-dbg/dragonfly resp fuzz-long-resp-crashes-35.zip
#   ./fuzz/triage_crashes.sh ./build-dbg/dragonfly memcache fuzz-long-memcache-crashes-35.zip

set -euo pipefail

# ─── Colors ───────────────────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# ─── Config ───────────────────────────────────────────────────────────────────
RESP_PORT=6379
MC_PORT=11211
STARTUP_TIMEOUT=5   # seconds to wait for Dragonfly to accept connections
POST_REPLAY_WAIT=3  # seconds to wait after replay for Dragonfly to crash

print_info()  { echo -e "${GREEN}[INFO]${NC}  $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
print_warn()  { echo -e "${YELLOW}[WARN]${NC}  $1"; }

usage() {
    echo -e "${BOLD}Usage:${NC} $0 <dragonfly_binary> <mode> <crashes.zip>"
    echo ""
    echo "  dragonfly_binary  Path to Dragonfly binary"
    echo "  mode              Protocol: 'resp' or 'memcache'"
    echo "  crashes.zip       .zip downloaded from CI artifacts"
    echo ""
    echo "Examples:"
    echo "  $0 ./build-dbg/dragonfly resp fuzz-long-resp-crashes-35.zip"
    echo "  $0 ./build-dbg/dragonfly memcache fuzz-long-memcache-crashes-35.zip"
    exit 1
}

# ─── Args ─────────────────────────────────────────────────────────────────────
if [[ $# -lt 3 ]]; then
    usage
fi

DRAGONFLY_BIN="$(realpath "$1")"
MODE="$2"
CRASHES_ZIP="$(realpath "$3")"

if [[ ! -f "$DRAGONFLY_BIN" ]]; then
    print_error "Dragonfly binary not found: $DRAGONFLY_BIN"
    exit 1
fi
if [[ "$MODE" != "resp" && "$MODE" != "memcache" ]]; then
    print_error "Mode must be 'resp' or 'memcache', got: $MODE"
    exit 1
fi
if [[ ! -f "$CRASHES_ZIP" ]]; then
    print_error "Crashes zip not found: $CRASHES_ZIP"
    exit 1
fi
if [[ "$CRASHES_ZIP" != *.zip ]]; then
    print_error "Expected a .zip file (CI artifact), got: $CRASHES_ZIP"
    exit 1
fi

# ─── Working directory ────────────────────────────────────────────────────────
WORK_DIR=$(mktemp -d /tmp/triage_XXXXXX)
DF_PID=""
cleanup() {
    [[ -n "$DF_PID" ]] && kill -9 "$DF_PID" 2>/dev/null || true
    rm -rf "$WORK_DIR"
}
trap cleanup EXIT INT TERM

# ─── Extract zip ──────────────────────────────────────────────────────────────
print_info "Extracting $(basename "$CRASHES_ZIP")..."
unzip -q "$CRASHES_ZIP" -d "$WORK_DIR/input"
CRASHES_DIR="$WORK_DIR/input"

# ─── Find crash archives ──────────────────────────────────────────────────────
mapfile -t CRASH_ARCHIVES < <(find "$CRASHES_DIR" -name 'crash-*.tar.gz' | sort)
TOTAL=${#CRASH_ARCHIVES[@]}

if [[ $TOTAL -eq 0 ]]; then
    print_error "No crash-*.tar.gz files found in: $CRASHES_DIR"
    exit 1
fi

print_info "Found $TOTAL crash archive(s)  mode=$MODE  binary=$DRAGONFLY_BIN"
echo ""

# ─── Locate replay_crash.py ───────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPLAY_SCRIPT="$SCRIPT_DIR/replay_crash.py"
if [[ ! -f "$REPLAY_SCRIPT" ]]; then
    print_error "replay_crash.py not found at: $REPLAY_SCRIPT"
    print_error "Run this script from the repository root or fuzz/ directory."
    exit 1
fi

# ─── Helpers ──────────────────────────────────────────────────────────────────
# Wait until a TCP port accepts connections
wait_for_port() {
    local host="$1" port="$2" timeout_sec="$3"
    local deadline=$((SECONDS + timeout_sec))
    while [[ $SECONDS -lt $deadline ]]; do
        if (>/dev/tcp/"$host"/"$port") 2>/dev/null; then
            return 0
        fi
        sleep 0.2
    done
    return 1
}

# Wait until a TCP port stops accepting connections
wait_port_free() {
    local port="$1" timeout_sec="${2:-5}"
    local deadline=$((SECONDS + timeout_sec))
    while [[ $SECONDS -lt $deadline ]]; do
        if ! (>/dev/tcp/127.0.0.1/"$port") 2>/dev/null; then
            return 0
        fi
        sleep 0.2
    done
    return 1
}

# Show crash info from glog log directory.
show_crash_log() {
    local log_dir="$1"
    local fatal_link="$log_dir/dragonfly.FATAL"

    if [[ -f "$fatal_link" ]]; then
        # Skip the 4-line glog file header, show crash message + stack trace
        sed -n '5,$p' "$fatal_link" | head -40 | sed 's/^/    /'
        return
    fi

    # No FATAL file — fall back to tail of INFO log
    local info_log
    info_log=$(ls -t "$log_dir"/dragonfly.*.log.INFO.* 2>/dev/null | head -1 || true)
    if [[ -n "$info_log" ]]; then
        echo "    (no FATAL log — last INFO log lines:)"
        tail -20 "$info_log" | sed 's/^/    /'
    else
        echo "    (no log files found in $log_dir)"
    fi
}

# ─── Main loop ────────────────────────────────────────────────────────────────
CONFIRMED=0
FALSE_POSITIVE=0
FAILED=0

for CRASH_ARCHIVE in "${CRASH_ARCHIVES[@]}"; do
    CRASH_NAME=$(basename "$CRASH_ARCHIVE" .tar.gz)   # crash-000000
    CRASH_ID="${CRASH_NAME#crash-}"                    # 000000
    IDX=$((CONFIRMED + FALSE_POSITIVE + FAILED + 1))

    echo -e "${CYAN}${BOLD}─── [$IDX/$TOTAL] Crash ${CRASH_ID} ───${NC}"

    # Extract this crash archive
    EXTRACT_DIR="$WORK_DIR/current_crash"
    rm -rf "$EXTRACT_DIR"
    mkdir -p "$EXTRACT_DIR"
    tar -xzf "$CRASH_ARCHIVE" -C "$EXTRACT_DIR"

    CRASH_DATA_DIR="$EXTRACT_DIR/${CRASH_NAME}/crashes"
    if [[ ! -d "$CRASH_DATA_DIR" ]]; then
        print_warn "Expected directory not found: $CRASH_DATA_DIR — skipping"
        FAILED=$((FAILED + 1))
        echo ""
        continue
    fi

    # Kill any leftover process on the port from a previous iteration
    if (>/dev/tcp/127.0.0.1/"$RESP_PORT") 2>/dev/null; then
        print_warn "Port $RESP_PORT still in use — waiting..."
        wait_port_free "$RESP_PORT" 5 || {
            print_error "Port $RESP_PORT still blocked after 5s — cannot start Dragonfly"
            FAILED=$((FAILED + 1))
            echo ""
            continue
        }
    fi

    # Start Dragonfly — use --log_dir so glog writes to separate per-level files
    # (dragonfly.FATAL symlink is created on crash and contains the fatal message)
    LOG_DIR="$WORK_DIR/logs_${CRASH_ID}"
    mkdir -p "$LOG_DIR"

    # Mirror the exact flags used by run_fuzzer.sh so replay runs in the same
    # server configuration as when the crash was found.
    # Missing rename_command flags are the most common cause of false positives:
    # if FLUSHALL/FLUSHDB/SHUTDOWN are not disabled, they execute during replay,
    # wiping state or shutting down the server before the crash can trigger.
    DF_ARGS=(
        --port "$RESP_PORT"
        --log_dir="$LOG_DIR"
        --proactor_threads 1
        --dbfilename=""
        --omit_basic_usage
        --rename_command=SHUTDOWN=
        --rename_command=DEBUG=
        --rename_command=FLUSHALL=
        --rename_command=FLUSHDB=
        --max_bulk_len=1048576
    )
    [[ "$MODE" == "memcache" ]] && DF_ARGS+=(--memcached_port="$MC_PORT")

    "$DRAGONFLY_BIN" "${DF_ARGS[@]}" >/dev/null 2>&1 &
    DF_PID=$!

    if ! wait_for_port 127.0.0.1 "$RESP_PORT" "$STARTUP_TIMEOUT"; then
        print_error "Dragonfly did not start within ${STARTUP_TIMEOUT}s (crash $CRASH_ID)"
        kill -9 "$DF_PID" 2>/dev/null || true
        wait "$DF_PID" 2>/dev/null && true || true
        DF_PID=""
        FAILED=$((FAILED + 1))
        echo ""
        continue
    fi
    # In memcache mode also verify the memcache listener is up before replaying
    if [[ "$MODE" == "memcache" ]] && ! wait_for_port 127.0.0.1 "$MC_PORT" 3; then
        print_error "Memcache port $MC_PORT not ready (crash $CRASH_ID)"
        kill -9 "$DF_PID" 2>/dev/null || true
        wait "$DF_PID" 2>/dev/null && true || true
        DF_PID=""
        FAILED=$((FAILED + 1))
        echo ""
        continue
    fi

    # Replay the crash
    REPLAY_PORT="$RESP_PORT"
    [[ "$MODE" == "memcache" ]] && REPLAY_PORT="$MC_PORT"

    if ! python3 "$REPLAY_SCRIPT" \
            "$CRASH_DATA_DIR" "$CRASH_ID" 127.0.0.1 "$REPLAY_PORT" \
            >/dev/null 2>&1; then
        print_warn "Replay script failed for crash $CRASH_ID — skipping"
        kill -9 "$DF_PID" 2>/dev/null || true
        wait "$DF_PID" 2>/dev/null && true || true
        DF_PID=""
        FAILED=$((FAILED + 1))
        echo ""
        continue
    fi

    # Wait for Dragonfly to die (poll every 100ms)
    DIED=false
    for _ in $(seq 1 $((POST_REPLAY_WAIT * 10))); do
        if ! kill -0 "$DF_PID" 2>/dev/null; then
            DIED=true
            break
        fi
        sleep 0.1
    done

    if ! $DIED; then
        echo -e "  ${YELLOW}FALSE POSITIVE${NC} — Dragonfly alive after replay"
        FALSE_POSITIVE=$((FALSE_POSITIVE + 1))
        kill -9 "$DF_PID" 2>/dev/null || true
        wait "$DF_PID" 2>/dev/null && true || true
        DF_PID=""
    else
        # Capture signal without triggering set -e (assignment always exits 0)
        wait "$DF_PID" 2>/dev/null && EXIT_CODE=0 || EXIT_CODE=$?
        DF_PID=""
        # Sanity check: exit code > 128 means killed by signal; otherwise not a signal death
        if [[ $EXIT_CODE -le 128 ]]; then
            echo -e "  ${YELLOW}FALSE POSITIVE${NC} — Dragonfly exited cleanly (code $EXIT_CODE)"
            FALSE_POSITIVE=$((FALSE_POSITIVE + 1))
            echo ""
            continue
        fi
        SIGNAL=$((EXIT_CODE - 128))
        CONFIRMED=$((CONFIRMED + 1))

        if [[ $SIGNAL -eq 6 ]]; then
            echo -e "  ${RED}CONFIRMED${NC} — SIGABRT (signal 6) — assertion / LOG(FATAL)"
            show_crash_log "$LOG_DIR"
        elif [[ $SIGNAL -eq 11 ]]; then
            echo -e "  ${RED}CONFIRMED${NC} — SIGSEGV (signal 11) — segmentation fault"
            show_crash_log "$LOG_DIR"
        else
            echo -e "  ${RED}CONFIRMED${NC} — signal $SIGNAL (exit code $EXIT_CODE)"
            show_crash_log "$LOG_DIR"
        fi
    fi
    echo ""
done

# ─── Summary ──────────────────────────────────────────────────────────────────
echo -e "${CYAN}${BOLD}═══ Triage Summary ═══${NC}"
printf "  %-18s %d\n" "Total:" "$TOTAL"
printf "  ${RED}%-18s %d${NC}\n" "Confirmed:" "$CONFIRMED"
printf "  ${YELLOW}%-18s %d${NC}\n" "False positive:" "$FALSE_POSITIVE"
[[ $FAILED -gt 0 ]] && printf "  ${RED}%-18s %d${NC}\n" "Failed/skipped:" "$FAILED"

# Exit 1 if any confirmed crashes found
[[ $CONFIRMED -gt 0 ]] && exit 1
exit 0


================================================
FILE: go.work
================================================
go 1.24.0

toolchain go1.24.7

use (
	./contrib/charts/dragonfly
	./tools/replay
)


================================================
FILE: go.work.sum
================================================
cel.dev/expr v0.16.1/go.mod h1:AsGA5zb3WruAEQeQng1RZdGEXmBj0jvMWh6l5SnNuC8=
cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U=
cloud.google.com/go/auth v0.10.2/go.mod h1:xxA5AqpDrvS+Gkmo9RqrGGRh6WSNKKOXhY3zNOr38tI=
cloud.google.com/go/auth/oauth2adapt v0.2.5/go.mod h1:AlmsELtlEBnaNTL7jCj8VQFLy6mbZv0s4Q7NGBeQ5E8=
cloud.google.com/go/cloudbuild v1.19.0/go.mod h1:ZGRqbNMrVGhknIIjwASa6MqoRTOpXIVMSI+Ew5DMPuY=
cloud.google.com/go/compute v1.19.1/go.mod h1:6ylj3a05WF8leseCdIf77NK0g1ey+nj5IKd5/kvShxE=
cloud.google.com/go/compute/metadata v0.5.2/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k=
cloud.google.com/go/iam v1.2.2/go.mod h1:0Ys8ccaZHdI1dEUilwzqng/6ps2YB6vRsjIe00/+6JY=
cloud.google.com/go/longrunning v0.6.2/go.mod h1:k/vIs83RN4bE3YCswdXC5PFfWVILjm3hpEUlSko4PiI=
cloud.google.com/go/monitoring v1.21.2/go.mod h1:hS3pXvaG8KgWTSz+dAdyzPrGUYmi2Q+WFX8g2hqVEZU=
cloud.google.com/go/storage v1.47.0/go.mod h1:Ks0vP374w0PW6jOUameJbapbQKXqkjGd/OJRp2fb9IQ=
github.com/Azure/azure-sdk-for-go v51.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.17.0/go.mod h1:XCW7KnZet0Opnr7HccfUw1PLc4CjHqpcaxW8DHklNkQ=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.7.0/go.mod h1:9kIvujWAA58nmPmWB1m23fyWic1kYZMxD9CxaWn4Qpg=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0/go.mod h1:iZDifYGJTIgIIkYRNWPENUnqx6bJ2xnSDFI2tjwZNuY=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/appcontainers/armappcontainers/v3 v3.0.0/go.mod h1:LDN3sr8FJ36sY6ZmMes6Q2vHJ+5r1aFsE3wEo7VbXJg=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0/go.mod h1:5kakwfW5CjC9KK+Q4wjXAg+ShuIm2mBMua0ZFj2C8PE=
github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24=
github.com/Azure/go-autorest/autorest v0.11.20/go.mod h1:o3tqFY+QR40VOlk+pV4d77mORO64jOXSgEnPQgLK6JY=
github.com/Azure/go-autorest/autorest/adal v0.9.13/go.mod h1:W/MM4U6nLxnIskrw4UwWzlHfGjwUS50aOsc/I3yuU8M=
github.com/Azure/go-autorest/autorest/azure/auth v0.5.8/go.mod h1:kxyKZTSfKh8OVFWPAgOgQ/frrJgeYQJPyR5fLFmXko4=
github.com/Azure/go-autorest/autorest/azure/cli v0.4.2/go.mod h1:7qkJkT+j6b+hIpzMOwPChJhTqS8VbsqqgULzMNRugoM=
github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74=
github.com/Azure/go-autorest/autorest/to v0.4.0/go.mod h1:fE8iZBn7LQR7zH/9XU2NcPR4o9jEImooCeWJcYV/zLE=
github.com/Azure/go-autorest/autorest/validation v0.3.1/go.mod h1:yhLgjC0Wda5DYXl6JAsWyUe4KVNffhoDhG0zVzUMo3E=
github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8=
github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU=
github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI=
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.24.1/go.mod h1:itPGVDKf9cC/ov4MdvJ2QZ0khw4bfoo9jzwTJlaxy2k=
github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.48.1/go.mod h1:jyqM3eLpJ3IbIFDTKVz2rF9T/xWGW0rIriGwnz8l9Tk=
github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.48.1/go.mod h1:viRWSEhtMZqz1rhwmOVKkWl6SwmVowfL9O2YR5gI2PE=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
github.com/apparentlymart/go-textseg/v13 v13.0.0/go.mod h1:ZK2fH7c4NqDTLtiYLvIkEghdlcqw7yxLeM89kiTRPUo=
github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4=
github.com/aws/aws-lambda-go v1.47.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A=
github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d/go.mod h1:6QX/PXZ00z/TKoufEY6K/a0k6AhaJrQKdFe6OfVXsa4=
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bradleyfalzon/ghinstallation v1.1.1/go.mod h1:vyCmHTciHx/uuyN82Zc3rXN3X2KTK8nUTCrTMwAhcug=
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
github.com/containerd/stargz-snapshotter/estargz v0.14.3/go.mod h1:KY//uOCIkSuNAHhJogcZtrNHdKrA99/FCCRjE3HD36o=
github.com/denisenkom/go-mssqldb v0.12.3/go.mod h1:k0mtMFOnU+AihqFxPMiF05rtiDrorD1Vrm1KEz5hxDo=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE=
github.com/docker/cli v27.1.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
github.com/docker/docker-credential-helpers v0.7.0/go.mod h1:rETQfLdHNT3foU5kuNkFR1R1V12OJRRO5lzt2D1b5X0=
github.com/envoyproxy/go-control-plane v0.13.0/go.mod h1:GRaKG3dwvFoTg4nj7aXdZnvMg4d7nvT/wl9WgVXn3Q8=
github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4=
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe/go.mod h1:8vg3r2VgvsThLBIFL93Qb5yWzgyZWhEmBwUJWevAkK0=
github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EOqtpKwwwHI=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/gonvenience/wrap v1.1.2/go.mod h1:GiryBSXoI3BAAhbWD1cZVj7RZmtiu0ERi/6R6eJfslI=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/go-containerregistry v0.20.2/go.mod h1:z38EKdKh4h7IP2gSfUUqEvalZBqs6AoLeWfUy34nQC8=
github.com/google/go-github/v29 v29.0.2/go.mod h1:CHKiKKPHJ0REzfwc14QMklvtHwCveD0PxlMjLlzAM5E=
github.com/google/go-github/v44 v44.1.0/go.mod h1:iWn00mWcP6PRWHhXm0zuFJ8wbEjE5AGO5D5HXYM4zgw=
github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA=
github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
github.com/googleapis/gax-go/v2 v2.14.0/go.mod h1:lhBCnjdLrWRaPvLWhmc8IS24m9mr07qSYnHncrgo+zk=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
github.com/hashicorp/go-getter v1.7.5/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744=
github.com/hashicorp/go-getter/v2 v2.2.3/go.mod h1:hp5Yy0GMQvwWVUmwLs3ygivz1JSLI323hdIE9J9m7TY=
github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I=
github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA=
github.com/hashicorp/terraform-json v0.23.0/go.mod h1:MHdXbBAbSg0GvzuWazEGKAn/cyNfIB7mN6y7KJN6y2c=
github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/jinzhu/copier v0.0.0-20190924061706-b57f9002281a/go.mod h1:yL958EeXv8Ylng6IfnvG4oflryUi3vgA3xPs9hmII1s=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jstemmer/go-junit-report v1.0.0/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
github.com/miekg/dns v1.1.62/go.mod h1:mvDlcItzm+br7MToIKqkglaGhlFMHJ9DTNNWONWXbNQ=
github.com/mitchellh/go-testing-interface v1.14.1/go.mod h1:gfgS7OtZj6MA4U1UrDRp04twqAjfvlZyCfX3sDjEym8=
github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0-rc3/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8=
github.com/oracle/oci-go-sdk v7.1.0+incompatible/go.mod h1:VQb79nF8Z2cwLkLS35ukwStZIg5F66tcBccjip/j888=
github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/slack-go/slack v0.15.0/go.mod h1:hlGi5oXA+Gt+yWTPP0plCdRKmjsDxecdHxYQdlMQKOw=
github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
github.com/tmccombs/hcl2json v0.6.4/go.mod h1:+ppKlIW3H5nsAsZddXPy2iMyvld3SHxyjswOZhavRDk=
github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
github.com/urfave/cli v1.22.16 h1:MH0k6uJxdwdeWQTwhSO42Pwr4YLrNLwBtg1MRgTqPdQ=
github.com/urfave/cli v1.22.16/go.mod h1:EeJR6BKodywf4zciqrdw6hpCPk68JO9z5LazXZMn5Po=
github.com/vbatts/tar-split v0.11.3/go.mod h1:9QlHN18E+fEH7RdG+QAJJcuya3rqT7eXSTY7wGrAokY=
github.com/zclconf/go-cty v1.15.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/detectors/gcp v1.29.0/go.mod h1:GW2aWZNwR2ZxDLdv8OyC2G8zkRoQBuURgV7RPQgcPoU=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0/go.mod h1:B9yO6b04uB80CzjedvewuqDhxJxi11s7/GtiGa8bAjI=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8=
go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8=
go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8=
go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok=
go.opentelemetry.io/otel/sdk/metric v1.29.0/go.mod h1:6zZLdCl2fkauYoZIOn/soQIDSWFmNSRcICarHfuhNJQ=
go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI=
golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY=
golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
google.golang.org/api v0.206.0/go.mod h1:BtB8bfjTYIrai3d8UyvPmV9REGgox7coh+ZRwm0b+W8=
google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/genproto v0.0.0-20241113202542-65e8d215514f/go.mod h1:Q5m6g8b5KaFFzsQFIGdJkSJDGeJiybVenoYFMMa3ohI=
google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28/go.mod h1:dguCy7UOdZhTvLzDyt15+rOrawrpM4q7DD9dQ1P11P4=
google.golang.org/genproto/googleapis/rpc v0.0.0-20241104194629-dd2ea8efbc28/go.mod h1:GX3210XPVPUjJbTUbvwI8f2IpZDMZuPJWDzDuebbviI=
google.golang.org/grpc v1.67.1/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA=
google.golang.org/grpc/stats/opentelemetry v0.0.0-20240907200651-3ffb98b2c93a/go.mod h1:9i1T9n4ZinTUZGgzENMi8MDDgbGC5mqTS75JAv6xN3A=
gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=
k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=


================================================
FILE: patches/mimalloc-v2.2.4/0_base.patch
================================================
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ce084f6..00eba70c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.16)
 project(libmimalloc C CXX)
 
 set(CMAKE_C_STANDARD 11)
@@ -44,7 +44,38 @@ option(MI_WIN_USE_FLS       "Use Fiber local storage on Windows to detect thread
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
 option(MI_USE_LIBATOMIC     "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF)
 
-include(CheckLinkerFlag)    # requires cmake 3.18
+function(CHECK_LINKER_FLAG _lang _flag _var)
+  get_property (_supported_languages GLOBAL PROPERTY ENABLED_LANGUAGES)
+  if (NOT _lang IN_LIST _supported_languages)
+    message (SEND_ERROR "check_linker_flag: ${_lang}: unknown language.")
+    return()
+  endif()
+  include (Check${_lang}SourceCompiles)
+  set(CMAKE_REQUIRED_LINK_OPTIONS "${_flag}")
+  # Normalize locale during test compilation.
+  set(_locale_vars LC_ALL LC_MESSAGES LANG)
+  foreach(v IN LISTS _locale_vars)
+    set(_locale_vars_saved_${v} "$ENV{${v}}")
+    set(ENV{${v}} C)
+  endforeach()
+  if (_lang MATCHES "^(C|CXX)$")
+    set (_source "int main() { return 0; }")
+  elseif (_lang STREQUAL "Fortran")
+    set (_source "       program test\n       stop\n       end program")
+  elseif (_lang MATCHES "^(OBJC|OBJCXX)$")
+    set (_source "#ifndef __OBJC__\n#  error \"Not an Objective-C++ compiler\"\n#endif\nint main(void) { return 0; }")
+  else()
+    message (SEND_ERROR "check_linker_flag: ${_lang}: unsupported language.")
+    return()
+  endif()
+  set(_common_patterns "")
+  check_c_source_compiles("${_source}" ${_var} ${_common_patterns})
+  foreach(v IN LISTS _locale_vars)
+    set(ENV{${v}} ${_locale_vars_saved_${v}})
+  endforeach()
+  set(${_var} "${${_var}}" PARENT_SCOPE)
+endfunction()
+
 include(CheckIncludeFiles)
 include(GNUInstallDirs)
 include("cmake/mimalloc-config-version.cmake")
diff --git a/src/alloc.c b/src/alloc.c
index 0fed5e75..870f8d10 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -670,6 +670,24 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
 }
 #endif
 
+bool mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio) mi_attr_noexcept {
+  mi_page_t* page = _mi_ptr_page(p);   // get the page that this belongs to
+
+  mi_heap_t* page_heap = (mi_heap_t*)(mi_atomic_load_acquire(&(page)->xheap));
+
+  // the heap id matches and it is not a full page
+  if (mi_likely(page_heap == heap && page->flags.x.in_full == 0)) {
+    // first in the list, meaning it's the head of page queue, thus being used for malloc
+    if (page->prev == NULL)
+      return false;
+
+    // this page belong to this heap and is not first in the page queue. Lets check its
+    // utilization.
+    return page->used <= (unsigned)(page->capacity * ratio);
+  }
+  return false;
+}
+
 // ------------------------------------------------------
 // ensure explicit external inline definitions are emitted!
 // ------------------------------------------------------


================================================
FILE: patches/mimalloc-v2.2.4/1_add_stat_type.patch
================================================
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index a15d9cba..ee822ca9 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -682,4 +682,23 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
 #define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)
 #define mi_heap_stat_adjust_decrease(heap,stat,amount)  mi_stat_adjust_decrease( (heap)->tld->stats.stat, amount)
 
+#define MI_DFLY_PAGE_BELOW_THRESHOLD 1
+#define MI_DFLY_PAGE_FULL 2
+#define MI_DFLY_HEAP_MISMATCH 4
+#define MI_DFLY_PAGE_USED_FOR_MALLOC 8
+
+typedef struct mi_page_usage_stats_s {
+  uintptr_t page_address;
+  size_t block_size;
+  uint16_t capacity;
+  uint16_t reserved;
+  uint16_t used;
+  // Collects the current state of page as returned by mi_heap_page_is_underutilized
+  // 0th bit set: page usage is below threshold: MI_DFLY_PAGE_BELOW_THRESHOLD
+  // 1st bit set: the page is full: MI_DFLY_PAGE_FULL
+  // 2nd bit set: the page heap did not match the heap requested: MI_DFLY_HEAP_MISMATCH
+  // 3rd bit set: that the page is currently used for malloc operations: MI_DFLY_PAGE_USED_FOR_MALLOC
+  uint8_t flags;
+} mi_page_usage_stats_t;
+
 #endif


================================================
FILE: patches/mimalloc-v2.2.4/2_return_stat.patch
================================================
diff --git a/src/alloc.c b/src/alloc.c
index 893f3094..88318d0e 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -676,22 +676,45 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo
 }
 #endif
 
-bool mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio) mi_attr_noexcept {
-  mi_page_t* page = _mi_ptr_page(p);   // get the page that this belongs to
+mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t *heap, void *p, float ratio,
+                                                    bool return_detailed_stats) mi_attr_noexcept {
+  mi_page_t *page = _mi_ptr_page(p); // get the page that this belongs to
+  mi_heap_t *page_heap = (mi_heap_t *) (mi_atomic_load_acquire(&(page)->xheap));
+
+  if (!return_detailed_stats) {
+    mi_page_usage_stats_t result = {.flags = 0};
+    if (mi_likely(page_heap == heap && page->flags.x.in_full == 0)) {
+      if (page->prev != NULL && page->used <= (unsigned) (page->capacity * ratio))
+        result.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    }
+    return result;
+  }
+
+  mi_page_usage_stats_t result = {
+    .page_address = (uintptr_t) page,
+    .block_size = page->block_size,
+    .capacity = page->capacity,
+    .reserved = page->reserved,
+    .used = page->used,
+    .flags = 0,
+  };

-  mi_heap_t* page_heap = (mi_heap_t*)(mi_atomic_load_acquire(&(page)->xheap));
+  if (page->flags.x.in_full == 1) {
+    result.flags |= MI_DFLY_PAGE_FULL;
+  }
+
+  if (page_heap != heap) {
+    result.flags |= MI_DFLY_HEAP_MISMATCH;
+  }

-  // the heap id matches and it is not a full page
-  if (mi_likely(page_heap == heap && page->flags.x.in_full == 0)) {
-    // first in the list, meaning it's the head of page queue, thus being used for malloc
-    if (page->prev == NULL)
-      return false;
+  if (page->prev == NULL) {
+    result.flags |= MI_DFLY_PAGE_USED_FOR_MALLOC;
+  }

-    // this page belong to this heap and is not first in the page queue. Lets check its
-    // utilization.
-    return page->used <= (unsigned)(page->capacity * ratio);
+  if (result.flags == 0 && result.used <= (unsigned) (result.capacity * ratio)) {
+    result.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
   }
-  return false;
+  return result;
 }
 
 // ------------------------------------------------------


================================================
FILE: patches/mimalloc-v2.2.4/3_track_full_size.patch
================================================
commit e0cda4eb4a54cfcd33afcd5fbd7ecd86510ac4f9
Author: Roman Gershman <romange@gmail.com>
Date:   Wed Sep 3 23:30:34 2025 +0300

    chore: track comitted size of full pages in a heap
    
    Signed-off-by: Roman Gershman <romange@gmail.com>

diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index a15d9cba..34d99a94 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -559,9 +559,10 @@ struct mi_heap_s {
   uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
   uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
   mi_random_ctx_t       random;                              // random number context used for secure allocation
-  size_t                page_count;                          // total number of pages in the `pages` queues.
-  size_t                page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
-  size_t                page_retired_max;                    // largest retired index into the `pages` array.
+  uint32_t              page_count;                          // total number of pages in the `pages` queues.
+  uint16_t              page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
+  uint16_t              page_retired_max;                    // largest retired index into the `pages` array.
+  size_t                full_page_size;                      // total size of pages residing in MI_BIN_FULL bin.
   long                  generic_count;                       // how often is `_mi_malloc_generic` called?
   long                  generic_collect_count;               // how often is `_mi_malloc_generic` called without collecting?
   mi_heap_t*            next;                                // list of heaps per thread
diff --git a/src/init.c b/src/init.c
index 3fc8b033..61ee4c76 100644
--- a/src/init.c
+++ b/src/init.c
@@ -118,6 +118,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   { {0}, {0}, 0, true }, // random
   0,                // page count
   MI_BIN_FULL, 0,   // page retired min/max
+  0,                // full page size
   0, 0,             // generic count
   NULL,             // next
   false,            // can reclaim
@@ -167,6 +168,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = {
   { {0x846ca68b}, {0}, 0, true },  // random
   0,                // page count
   MI_BIN_FULL, 0,   // page retired min/max
+  0,                // full page size
   0, 0,             // generic count
   NULL,             // next heap
   false,            // can reclaim
diff --git a/src/page-queue.c b/src/page-queue.c
index c719b626..524b09d8 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -232,6 +232,10 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
   page->next = NULL;
   page->prev = NULL;
   // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
+  if (mi_page_queue_is_full(queue)) {
+    mi_assert_internal(heap->full_page_size >= mi_page_block_size(page) * page->capacity);
+    heap->full_page_size -= mi_page_block_size(page) * page->capacity;
+  }
   mi_page_set_in_full(page,false);
 }
 
@@ -246,6 +250,9 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
                       (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) ||
                         (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
 
+  if (mi_page_queue_is_full(queue)) {
+    heap->full_page_size += mi_page_block_size(page) * page->capacity;
+  }
   mi_page_set_in_full(page, mi_page_queue_is_full(queue));
   // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
   page->next = queue->first;
@@ -339,6 +346,12 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t*
     }
   }
 
+  if (mi_page_queue_is_full(to)) {
+    heap->full_page_size += mi_page_block_size(page) * page->capacity;
+  } else if (mi_page_queue_is_full(from)) {
+    mi_assert_internal(heap->full_page_size >= mi_page_block_size(page) * page->capacity);
+    heap->full_page_size -= mi_page_block_size(page) * page->capacity;
+  }
   mi_page_set_in_full(page, mi_page_queue_is_full(to));
 }
 

================================================
FILE: patches/mimalloc-v2.2.4/4_fix_heap_collect.patch
================================================
diff --git a/src/heap.c b/src/heap.c
index f96e60d0..5cb7c1ff 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -24,7 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file
 typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
 
 // Visit all pages in a heap; returns `false` if break was called.
-static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
+static bool mi_heap_visit_pages(mi_heap_t* heap, size_t max_q_id,  heap_page_visitor_fun* fn, void* arg1, void* arg2)
 {
   if (heap==NULL || heap->page_count==0) return 0;
 
@@ -34,7 +34,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
   size_t count = 0;
   #endif
 
-  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+  for (size_t i = 0; i <= max_q_id; i++) {
     mi_page_queue_t* pq = &heap->pages[i];
     mi_page_t* page = pq->first;
     while(page != NULL) {
@@ -47,7 +47,6 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
       page = next; // and continue
     }
   }
-  mi_assert_internal(count == total);
   return true;
 }
 
@@ -67,7 +66,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
 #if MI_DEBUG>=3
 static bool mi_heap_is_valid(mi_heap_t* heap) {
   mi_assert_internal(heap!=NULL);
-  mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
+  mi_heap_visit_pages(heap, MI_BIN_FULL, &mi_heap_page_is_valid, NULL, NULL);
   return true;
 }
 #endif
@@ -149,7 +148,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
 
   // if abandoning, mark all pages to no longer add to delayed_free
   if (collect == MI_ABANDON) {
-    mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
+    mi_heap_visit_pages(heap, MI_BIN_FULL, &mi_heap_page_never_delayed_free, NULL, NULL);
   }
 
   // free all current thread delayed blocks.
@@ -160,7 +159,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   _mi_heap_collect_retired(heap, force);

   // collect all pages owned by this thread
-  mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
+  mi_heap_visit_pages(heap, collect == MI_NORMAL ? MI_BIN_HUGE : MI_BIN_FULL, &mi_heap_page_collect, &collect, NULL);
   mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
 
   // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list)
@@ -368,7 +367,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
 }
 
 void _mi_heap_destroy_pages(mi_heap_t* heap) {
-  mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
+  mi_heap_visit_pages(heap, MI_BIN_FULL, &_mi_heap_page_destroy, NULL, NULL);
   mi_heap_reset_pages(heap);
 }
 
@@ -539,7 +538,7 @@ bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
   if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
   if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false;  // only aligned pointers
   bool found = false;
-  mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
+  mi_heap_visit_pages(heap, MI_BIN_FULL, &mi_heap_page_check_owned, (void*)p, &found);
   return found;
 }
 
@@ -705,7 +704,7 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa
 // Visit all heap pages as areas
 static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
   if (visitor == NULL) return false;
-  return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
+  return mi_heap_visit_pages((mi_heap_t*)heap, MI_BIN_FULL, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
 }
 
 // Just to pass arguments


================================================
FILE: pyproject.toml
================================================
[tool.black]
line-length = 100
include = '\.py$'
extend-exclude = '''
/(
    | .git
    | .__pycache__
    | build-dbg
    | build-opt
    | helio
)/
'''


================================================
FILE: src/.gitignore
================================================
server/version.cc

================================================
FILE: src/CMakeLists.txt
================================================
option(ENABLE_GIT_VERSION "Build with Git metadata" OFF)

option(WITH_SIMSIMD "Enable SimSIMD vector optimizations" OFF)
option(SIMSIMD_NATIVE_F16 "Enable native float16 support in SimSIMD" OFF)
option(WITH_SEARCH "Enable compilation of search module" ON)

if ("${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD")
  set(DFLY_TOOLS_MAKE "gmake")
else()
  set(DFLY_TOOLS_MAKE "make")
endif()

function(cur_gen_dir out_dir)
  file(RELATIVE_PATH _rel_folder "${PROJECT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")

  set(_tmp_dir ${ROOT_GEN_DIR}/${_rel_folder})
  set(${out_dir} ${_tmp_dir} PARENT_SCOPE)
  file(MAKE_DIRECTORY ${_tmp_dir})
endfunction()

set(ROOT_GEN_DIR ${CMAKE_SOURCE_DIR}/genfiles)
file(MAKE_DIRECTORY ${ROOT_GEN_DIR})
include_directories(${ROOT_GEN_DIR}/src)

function(gen_bison name)
  GET_FILENAME_COMPONENT(_in ${name}.y ABSOLUTE)
  cur_gen_dir(gen_dir)
  # add_library(${lib_name} ${gen_dir}/${name}.cc)
  set(full_path_cc ${gen_dir}/${name}.cc ${gen_dir}/${name}.hh)

  ADD_CUSTOM_COMMAND(
           OUTPUT ${full_path_cc}
           COMMAND mkdir -p ${gen_dir}
           COMMAND bison --language=c++ -o ${gen_dir}/${name}.cc ${name}.y -Wconflicts-sr
           DEPENDS ${_in}
           WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
           COMMENT "Generating parser from ${name}.y" VERBATIM)
 set_source_files_properties(${name}.cc ${name}_base.h PROPERTIES GENERATED TRUE)
endfunction()


Message(STATUS "THIRD_PARTY_LIB_DIR ${THIRD_PARTY_LIB_DIR}")

include(external_libs.cmake)

if(ENABLE_GIT_VERSION)
    include(GetGitRevisionDescription.cmake)
    get_git_head_revision(GIT_REFSPEC GIT_SHA1)
    git_local_changes(GIT_CLEAN_DIRTY)
    if("${GIT_CLEAN_DIRTY}" STREQUAL "DIRTY")
        set(GIT_CLEAN_DIRTY "-dirty")
        else()
        set(GIT_CLEAN_DIRTY "")
    endif()
    Message(STATUS "GIT_SHA1 ${GIT_SHA1}")
    git_describe(GIT_VER --always)
    Message(STATUS "GIT_VER ${GIT_VER}")
    string(TIMESTAMP PRJ_BUILD_TIME "%Y-%m-%d %H:%M:%S" UTC)
else(ENABLE_GIT_VERSION)
    set(GIT_VER "dev")
    set(GIT_SHA1 "0000000")
    set(GIT_CLEAN_DIRTY "-dev")
    set(PRJ_BUILD_TIME "bigbang")
endif(ENABLE_GIT_VERSION)


function(gen_flex name)
  GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
  cur_gen_dir(gen_dir)

  ADD_CUSTOM_COMMAND(
           OUTPUT ${gen_dir}/${name}.cc ${gen_dir}/${name}.h
           COMMAND mkdir -p ${gen_dir}

           COMMAND ${REFLEX} -o ${gen_dir}/${name}.cc  --unicode --header-file=${gen_dir}/${name}.h
                             --bison-complete  --bison-locations  ${_in}
           DEPENDS ${_in} reflex_project
           WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
           COMMENT "Generating lexer from ${name}.lex" VERBATIM)

  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc
                              PROPERTIES GENERATED TRUE)
endfunction()

# the output file resides in the build directory.
configure_file(server/version.cc.in "${CMAKE_CURRENT_SOURCE_DIR}/server/version.cc" @ONLY)

add_subdirectory(redis)
add_subdirectory(core)
add_subdirectory(facade)
add_subdirectory(server)


================================================
FILE: src/GetGitRevisionDescription.cmake
================================================
# - Returns a version string from Git
#
# These functions force a re-configure on each git commit so that you can
# trust the values of the variables in your build system.
#
#  get_git_head_revision(<refspecvar> <hashvar> [ALLOW_LOOKING_ABOVE_CMAKE_SOURCE_DIR])
#
# Returns the refspec and sha hash of the current head revision
#
#  git_describe(<var> [<additional arguments to git describe> ...])
#
# Returns the results of git describe on the source tree, and adjusting
# the output so that it tests false if an error occurs.
#
#  git_describe_working_tree(<var> [<additional arguments to git describe> ...])
#
# Returns the results of git describe on the working tree (--dirty option),
# and adjusting the output so that it tests false if an error occurs.
#
#  git_get_exact_tag(<var> [<additional arguments to git describe> ...])
#
# Returns the results of git describe --exact-match on the source tree,
# and adjusting the output so that it tests false if there was no exact
# matching tag.
#
#  git_local_changes(<var>)
#
# Returns either "CLEAN" or "DIRTY" with respect to uncommitted changes.
# Uses the return code of "git diff-index --quiet HEAD --".
# Does not regard untracked files.
#
# Requires CMake 2.6 or newer (uses the 'function' command)
#
# Original Author:
# 2009-2020 Ryan Pavlik <ryan.pavlik@gmail.com> <abiryan@ryand.net>
# http://academic.cleardefinition.com
#
# Copyright 2009-2013, Iowa State University.
# Copyright 2013-2020, Ryan Pavlik
# Copyright 2013-2020, Contributors
# SPDX-License-Identifier: BSL-1.0
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)

if(__get_git_revision_description)
    return()
endif()
set(__get_git_revision_description YES)

# We must run the following at "include" time, not at function call time,
# to find the path to this module rather than the path to a calling list file
get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)

# Function _git_find_closest_git_dir finds the next closest .git directory
# that is part of any directory in the path defined by _start_dir.
# The result is returned in the parent scope variable whose name is passed
# as variable _git_dir_var. If no .git directory can be found, the
# function returns an empty string via _git_dir_var.
#
# Example: Given a path C:/bla/foo/bar and assuming C:/bla/.git exists and
# neither foo nor bar contain a file/directory .git. This wil return
# C:/bla/.git
#
function(_git_find_closest_git_dir _start_dir _git_dir_var)
    set(cur_dir "${_start_dir}")
    set(git_dir "${_start_dir}/.git")
    while(NOT EXISTS "${git_dir}")
        # .git dir not found, search parent directories
        set(git_previous_parent "${cur_dir}")
        get_filename_component(cur_dir "${cur_dir}" DIRECTORY)
        if(cur_dir STREQUAL git_previous_parent)
            # We have reached the root directory, we are not in git
            set(${_git_dir_var}
                ""
                PARENT_SCOPE)
            return()
        endif()
        set(git_dir "${cur_dir}/.git")
    endwhile()
    set(${_git_dir_var}
        "${git_dir}"
        PARENT_SCOPE)
endfunction()

function(get_git_head_revision _refspecvar _hashvar)
    _git_find_closest_git_dir("${CMAKE_CURRENT_SOURCE_DIR}" GIT_DIR)

    if("${ARGN}" STREQUAL "ALLOW_LOOKING_ABOVE_CMAKE_SOURCE_DIR")
        set(ALLOW_LOOKING_ABOVE_CMAKE_SOURCE_DIR TRUE)
    else()
        set(ALLOW_LOOKING_ABOVE_CMAKE_SOURCE_DIR FALSE)
    endif()
    if(NOT "${GIT_DIR}" STREQUAL "")
        file(RELATIVE_PATH _relative_to_source_dir "${CMAKE_SOURCE_DIR}"
             "${GIT_DIR}")
        if("${_relative_to_source_dir}" MATCHES "[.][.]" AND NOT ALLOW_LOOKING_ABOVE_CMAKE_SOURCE_DIR)
            # We've gone above the CMake root dir.
            set(GIT_DIR "")
        endif()
    endif()
    if("${GIT_DIR}" STREQUAL "")
        set(${_refspecvar}
            "GITDIR-NOTFOUND"
            PARENT_SCOPE)
        set(${_hashvar}
            "GITDIR-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()

    # Check if the current source dir is a git submodule or a worktree.
    # In both cases .git is a file instead of a directory.
    #
    if(NOT IS_DIRECTORY ${GIT_DIR})
        # The following git command will return a non empty string that
        # points to the super project working tree if the current
        # source dir is inside a git submodule.
        # Otherwise the command will return an empty string.
        #
        execute_process(
            COMMAND "${GIT_EXECUTABLE}" rev-parse
                    --show-superproject-working-tree
            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
            OUTPUT_VARIABLE out
            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
        if(NOT "${out}" STREQUAL "")
            # If out is empty, GIT_DIR/CMAKE_CURRENT_SOURCE_DIR is in a submodule
            file(READ ${GIT_DIR} submodule)
            string(REGEX REPLACE "gitdir: (.*)$" "\\1" GIT_DIR_RELATIVE
                                 ${submodule})
            string(STRIP ${GIT_DIR_RELATIVE} GIT_DIR_RELATIVE)
            get_filename_component(SUBMODULE_DIR ${GIT_DIR} PATH)
            get_filename_component(GIT_DIR ${SUBMODULE_DIR}/${GIT_DIR_RELATIVE}
                                   ABSOLUTE)
            set(HEAD_SOURCE_FILE "${GIT_DIR}/HEAD")
        else()
            # GIT_DIR/CMAKE_CURRENT_SOURCE_DIR is in a worktree
            file(READ ${GIT_DIR} worktree_ref)
            # The .git directory contains a path to the worktree information directory
            # inside the parent git repo of the worktree.
            #
            string(REGEX REPLACE "gitdir: (.*)$" "\\1" git_worktree_dir
                                 ${worktree_ref})
            string(STRIP ${git_worktree_dir} git_worktree_dir)
            _git_find_closest_git_dir("${git_worktree_dir}" GIT_DIR)
            set(HEAD_SOURCE_FILE "${git_worktree_dir}/HEAD")
        endif()
    else()
        set(HEAD_SOURCE_FILE "${GIT_DIR}/HEAD")
    endif()
    set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
    if(NOT EXISTS "${GIT_DATA}")
        file(MAKE_DIRECTORY "${GIT_DATA}")
    endif()

    if(NOT EXISTS "${HEAD_SOURCE_FILE}")
        return()
    endif()
    set(HEAD_FILE "${GIT_DATA}/HEAD")
    configure_file("${HEAD_SOURCE_FILE}" "${HEAD_FILE}" COPYONLY)

    configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
                   "${GIT_DATA}/grabRef.cmake" @ONLY)
    include("${GIT_DATA}/grabRef.cmake")

    set(${_refspecvar}
        "${HEAD_REF}"
        PARENT_SCOPE)
    set(${_hashvar}
        "${HEAD_HASH}"
        PARENT_SCOPE)
endfunction()

function(git_describe _var)
    if(NOT GIT_FOUND)
        find_package(Git QUIET)
    endif()
    get_git_head_revision(refspec hash)
    if(NOT GIT_FOUND)
        set(${_var}
            "GIT-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()
    if(NOT hash)
        set(${_var}
            "HEAD-HASH-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()

    # TODO sanitize
    #if((${ARGN}" MATCHES "&&") OR
    #	(ARGN MATCHES "||") OR
    #	(ARGN MATCHES "\\;"))
    #	message("Please report the following error to the project!")
    #	message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! Passed arguments ${ARGN}")
    #endif()

    #message(STATUS "Arguments to execute_process: ${ARGN}")

    execute_process(
        COMMAND "${GIT_EXECUTABLE}" describe --tags --always ${hash} ${ARGN}
        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
        RESULT_VARIABLE res
        OUTPUT_VARIABLE out
        ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(NOT res EQUAL 0)
        set(out "${out}-${res}-NOTFOUND")
    endif()

    set(${_var}
        "${out}"
        PARENT_SCOPE)
endfunction()

function(git_describe_working_tree _var)
    if(NOT GIT_FOUND)
        find_package(Git QUIET)
    endif()
    if(NOT GIT_FOUND)
        set(${_var}
            "GIT-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()

    execute_process(
        COMMAND "${GIT_EXECUTABLE}" describe --dirty ${ARGN}
        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
        RESULT_VARIABLE res
        OUTPUT_VARIABLE out
        ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(NOT res EQUAL 0)
        set(out "${out}-${res}-NOTFOUND")
    endif()

    set(${_var}
        "${out}"
        PARENT_SCOPE)
endfunction()

function(git_get_exact_tag _var)
    git_describe(out --exact-match ${ARGN})
    set(${_var}
        "${out}"
        PARENT_SCOPE)
endfunction()

function(git_local_changes _var)
    if(NOT GIT_FOUND)
        find_package(Git QUIET)
    endif()
    get_git_head_revision(refspec hash)
    if(NOT GIT_FOUND)
        set(${_var}
            "GIT-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()
    if(NOT hash)
        set(${_var}
            "HEAD-HASH-NOTFOUND"
            PARENT_SCOPE)
        return()
    endif()

    execute_process(
        COMMAND "${GIT_EXECUTABLE}" diff-index --quiet HEAD --
        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
        RESULT_VARIABLE res
        OUTPUT_VARIABLE out
        ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(res EQUAL 0)
        set(${_var}
            "CLEAN"
            PARENT_SCOPE)
    else()
        set(${_var}
            "DIRTY"
            PARENT_SCOPE)
    endif()
endfunction()


================================================
FILE: src/GetGitRevisionDescription.cmake.in
================================================
#
# Internal file for GetGitRevisionDescription.cmake
#
# Requires CMake 2.6 or newer (uses the 'function' command)
#
# Original Author:
# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
# http://academic.cleardefinition.com
# Iowa State University HCI Graduate Program/VRAC
#
# Copyright 2009-2012, Iowa State University
# Copyright 2011-2015, Contributors
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
# SPDX-License-Identifier: BSL-1.0

set(HEAD_HASH)

file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)

string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)
if(HEAD_CONTENTS MATCHES "ref")
	# named branch
	string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
	if(EXISTS "@GIT_DIR@/${HEAD_REF}")
		configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
	else()
		configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY)
		file(READ "@GIT_DATA@/packed-refs" PACKED_REFS)
		if(${PACKED_REFS} MATCHES "([0-9a-z]*) ${HEAD_REF}")
			set(HEAD_HASH "${CMAKE_MATCH_1}")
		endif()
	endif()
else()
	# detached HEAD
	configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
endif()

if(NOT HEAD_HASH)
	file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
	string(STRIP "${HEAD_HASH}" HEAD_HASH)
endif()


================================================
FILE: src/common/arg_range.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <string_view>
#include <variant>

#include "base/iterator.h"

namespace cmn {

using ArgSlice = absl::Span<const std::string_view>;
using OwnedArgSlice = absl::Span<const std::string>;

inline std::string_view ToSV(std::string_view slice) {
  return slice;
}

inline std::string_view ToSV(const std::string& slice) {
  return slice;
}

inline std::string_view ToSV(std::string&& slice) = delete;

constexpr auto kToSV = [](auto&& v) { return ToSV(std::forward<decltype(v)>(v)); };

struct ArgRange {
  ArgRange(ArgRange&&) = default;
  ArgRange(const ArgRange&) = default;
  ArgRange(ArgRange& range) : ArgRange((const ArgRange&)range) {
  }

  template <typename T, std::enable_if_t<!std::is_same_v<ArgRange, T>, bool> = true>
  ArgRange(T&& span) : span(std::forward<T>(span)) {  // NOLINT google-explicit-constructor)
  }

  size_t Size() const {
    return std::visit([](const auto& span) { return span.size(); }, span);
  }

  auto Range() const {
    return base::it::Wrap(kToSV, span);
  }

  auto begin() const {
    return Range().first;
  }

  auto end() const {
    return Range().second;
  }

  std::string_view operator[](size_t idx) const {
    return std::visit([idx](const auto& span) -> std::string_view { return span[idx]; }, span);
  }

  std::variant<ArgSlice, OwnedArgSlice> span;
};

}  // namespace cmn


================================================
FILE: src/common/backed_args.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/inlined_vector.h>

#include <cstdint>
#include <string_view>

namespace cmn {

class BackedArguments {
  constexpr static size_t kLenCap = 5;
  constexpr static size_t kStorageCap = 88;

 public:
  using value_type = std::string_view;

  BackedArguments() {
  }

  class iterator {
   public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = std::string_view;
    using difference_type = std::ptrdiff_t;
    using pointer = const std::string_view*;
    using reference = std::string_view;

    iterator(const BackedArguments* ba, size_t index) : ba_(ba), index_(index) {
    }

    iterator& operator++() {
      ++index_;
      return *this;
    }

    iterator& operator--() {
      --index_;
      return *this;
    }

    iterator& operator+=(int delta) {
      index_ += delta;
      return *this;
    }

    iterator operator+(int delta) const {
      iterator res(*this);
      res += delta;
      return res;
    }

    ptrdiff_t operator-(iterator other) const {
      return ptrdiff_t(index_) - ptrdiff_t(other.index_);
    }

    bool operator==(const iterator& other) const {
      return index_ == other.index_ && ba_ == other.ba_;
    }

    bool operator!=(const iterator& other) const {
      return !(*this == other);
    }

    std::string_view operator*() const {
      return ba_->at(index_);
    }

   private:
    const BackedArguments* ba_;
    size_t index_;
  };

  // Construct the arguments from iterator range.
  // TODO: In general we could get away without the len argument,
  // but that would require fixing base::it::CompoundIterator to support subtraction.
  // Similarly, I wish that CompoundIterator supported the -> operator.
  template <typename I> BackedArguments(I begin, I end, size_t len) {
    Assign(begin, end, len);
  }

  template <typename I> void Assign(I begin, I end, size_t len);

  void Reserve(size_t arg_cnt, size_t total_size) {
    offsets_.reserve(arg_cnt);
    storage_.reserve(total_size);
  }

  size_t HeapMemory() const {
    size_t s1 = offsets_.capacity() <= kLenCap ? 0 : offsets_.capacity() * sizeof(uint32_t);
    size_t s2 = storage_.capacity() <= kStorageCap ? 0 : storage_.capacity();
    return s1 + s2;
  }

  void SwapArgs(cmn::BackedArguments& other) {
    offsets_.swap(other.offsets_);
    storage_.swap(other.storage_);
  }

  // The capacity is chosen so that we allocate a fully utilized (128 bytes) block.
  using StorageType = absl::InlinedVector<char, kStorageCap>;

  std::string_view Front() const {
    return std::string_view{storage_.data(), elem_len(0)};
  }

  size_t size() const {
    return offsets_.size();
  }

  bool empty() const {
    return offsets_.empty();
  }

  size_t elem_len(size_t i) const {
    return elem_capacity(i) - 1;
  }

  size_t elem_capacity(size_t i) const {
    uint32_t next_offs = i + 1 >= offsets_.size() ? storage_.size() : offsets_[i + 1];
    return next_offs - offsets_[i];
  }

  std::string_view at(uint32_t index) const {
    uint32_t offset = offsets_[index];
    return std::string_view{storage_.data() + offset, elem_len(index)};
  }

  char* data(uint32_t index) {
    uint32_t offset = offsets_[index];
    return storage_.data() + offset;
  }

  std::string_view operator[](uint32_t index) const {
    return at(index);
  }

  iterator begin() const {
    return {this, 0};
  }

  iterator end() const {
    return {this, offsets_.size()};
  }

  void clear() {
    // Clear the contents without deallocating memory. clear() deallocates inlined_vector.
    offsets_.resize(0);
    storage_.resize(0);
  }

  std::string_view back() const {
    assert(size() > 0);
    return at(size() - 1);
  }

  // Reserves space for additional argument of given length at the end.
  void PushArg(size_t len) {
    size_t old_size = storage_.size();
    offsets_.push_back(old_size);
    storage_.resize(old_size + len + 1);
  }

  void PushArg(std::string_view arg) {
    PushArg(arg.size());
    char* dest = storage_.data() + offsets_.back();
    if (arg.size() > 0)
      memcpy(dest, arg.data(), arg.size());
    dest[arg.size()] = '\0';
  }

  void PopArg() {
    uint32_t last_offs = offsets_.back();
    offsets_.pop_back();
    storage_.resize(last_offs);
  }

 protected:
  absl::InlinedVector<uint32_t, kLenCap> offsets_;
  StorageType storage_;
};

static_assert(sizeof(BackedArguments) == 128);

template <typename I> void BackedArguments::Assign(I begin, I end, size_t len) {
  offsets_.resize(len);
  size_t total_size = 0;
  unsigned idx = 0;
  for (auto it = begin; it != end; ++it) {
    offsets_[idx++] = total_size;
    total_size += (*it).size() + 1;  // +1 for '\0'
  }
  storage_.resize(total_size);

  // Reclaim memory if we have too much allocated.
  if (storage_.capacity() > kStorageCap && total_size < storage_.capacity() / 2)
    storage_.shrink_to_fit();

  char* next = storage_.data();
  for (auto it = begin; it != end; ++it) {
    size_t sz = (*it).size();
    if (sz > 0) {
      memcpy(next, (*it).data(), sz);
    }
    next[sz] = '\0';
    next += sz + 1;
  }
}

}  // namespace cmn


================================================
FILE: src/common/heap_size.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

// This file provides utilities to *estimate* heap memory usage of classes.
// The main function exposed here is HeapSize() (with various overloads).
// It supports simple structs (returns 0), std::string (returns capacity if it's larger than SSO)
// and common containers, such as std::vector, std::deque, absl::flat_hash_map and unique_ptr.
//
// Example usage:
// absl::flat_hash_map<std::string, std::vector<std::unique_ptr<int>>> m;
// ...
// size_t size = HeapSize(m);

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/container/inlined_vector.h>
#include <absl/types/span.h>

#include <deque>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>

namespace cmn {

namespace heap_size_detail {

template <class, class = void> struct has_marked_stackonly : std::false_type {};

template <class T>
struct has_marked_stackonly<T, std::void_t<typename T::is_stackonly>> : std::true_type {};

template <typename T> constexpr bool StackOnlyType() {
  return std::is_trivial_v<T> || std::is_same_v<T, std::string_view> ||
         has_marked_stackonly<T>::value;
}

template <typename T, typename = void> struct has_used_mem : std::false_type {};

template <typename T>
struct has_used_mem<T, std::void_t<decltype(&T::UsedMemory)>> : std::true_type {};

template <typename Container> size_t AccumulateContainer(const Container& c);
}  // namespace heap_size_detail

inline size_t HeapSize(const std::string& s) {
  constexpr size_t kSmallStringOptSize = 15;
  return s.capacity() > kSmallStringOptSize ? s.capacity() : 0UL;
}

template <typename T, std::enable_if_t<heap_size_detail::has_used_mem<T>::value, bool> = true>
size_t HeapSize(const T& t) {
  return t.UsedMemory();
}

template <typename T, std::enable_if_t<heap_size_detail::StackOnlyType<T>(), bool> = true>
size_t HeapSize(const T& t) {
  return 0;
}

template <typename T> size_t HeapSize(absl::Span<T>) {
  return 0;
}

// Declare first, so that we can use these "recursively"
template <typename T> size_t HeapSize(const std::vector<T>& v);
template <typename T> size_t HeapSize(const std::unique_ptr<T>& t);
template <typename T> size_t HeapSize(const std::deque<T>& d);
template <typename T1, typename T2> size_t HeapSize(const std::pair<T1, T2>& p);
template <typename T, size_t N> size_t HeapSize(const absl::InlinedVector<T, N>& v);
template <typename K, typename V> size_t HeapSize(const absl::flat_hash_map<K, V>& m);
template <typename K> size_t HeapSize(const absl::flat_hash_set<K>& s);

template <typename T> size_t HeapSize(const std::unique_ptr<T>& t) {
  if (t == nullptr) {
    return 0;
  } else {
    return sizeof(T) + HeapSize(*t);
  }
}

template <typename T> size_t HeapSize(const std::vector<T>& v) {
  return (v.capacity() * sizeof(T)) + heap_size_detail::AccumulateContainer(v);
}

template <typename T> size_t HeapSize(const std::deque<T>& d) {
  return (d.size() * sizeof(T)) + heap_size_detail::AccumulateContainer(d);
}

template <typename T1, typename T2> size_t HeapSize(const std::pair<T1, T2>& p) {
  return HeapSize(p.first) + HeapSize(p.second);
}

template <typename T, size_t N> size_t HeapSize(const absl::InlinedVector<T, N>& v) {
  size_t size = 0;
  if (v.capacity() > N) {
    size += v.capacity() * sizeof(T);
  }
  size += heap_size_detail::AccumulateContainer(v);
  return size;
}

template <typename K, typename V> size_t HeapSize(const absl::flat_hash_map<K, V>& m) {
  size_t size = m.capacity() * sizeof(typename absl::flat_hash_map<K, V>::value_type);

  if constexpr (!heap_size_detail::StackOnlyType<K>() || !heap_size_detail::StackOnlyType<V>()) {
    for (const auto& kv : m) {
      size += HeapSize(kv);
    }
  }

  return size;
}

template <typename K> size_t HeapSize(const absl::flat_hash_set<K>& s) {
  size_t size = s.capacity() * sizeof(typename absl::flat_hash_set<K>::value_type);

  if constexpr (!heap_size_detail::StackOnlyType<K>()) {
    for (const auto& k : s) {
      size += HeapSize(k);
    }
  }

  return size;
}

namespace heap_size_detail {
template <typename Container> size_t AccumulateContainer(const Container& c) {
  size_t size = 0;

  if constexpr (!heap_size_detail::StackOnlyType<typename Container::value_type>()) {
    for (const auto& e : c) {
      size += HeapSize(e);
    }
  }

  return size;
}
}  // namespace heap_size_detail

}  // namespace cmn


================================================
FILE: src/common/string_or_view.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>
#include <string_view>
#include <variant>

namespace cmn {

class StringOrView {
 public:
  static StringOrView FromString(std::string s) {
    StringOrView sov;
    sov.val_ = std::move(s);
    return sov;
  }

  static StringOrView FromView(std::string_view sv) {
    StringOrView sov;
    sov.val_ = sv;
    return sov;
  }

  StringOrView() = default;
  StringOrView(const StringOrView& o) = default;
  StringOrView(StringOrView&& o) = default;
  StringOrView& operator=(const StringOrView& o) = default;
  StringOrView& operator=(StringOrView&& o) = default;

  bool operator==(const StringOrView& o) const {
    return *this == o.view();
  }

  bool operator==(std::string_view o) const {
    return view() == o;
  }

  bool operator!=(const StringOrView& o) const {
    return *this != o.view();
  }

  bool operator!=(std::string_view o) const {
    return !(*this == o);
  }

  std::string_view view() const {
    return visit([](const auto& s) -> std::string_view { return s; }, val_);
  }

  friend std::ostream& operator<<(std::ostream& o, const StringOrView& key) {
    return o << key.view();
  }

  // Make hashable
  template <typename H> friend H AbslHashValue(H h, const StringOrView& c) {
    return H::combine(std::move(h), c.view());
  }

  // If the key is backed by a string_view, replace it with a string with the same value
  void MakeOwned() {
    if (std::holds_alternative<std::string_view>(val_))
      val_ = std::string{std::get<std::string_view>(val_)};
  }

  // Move out of value as string
  std::string Take() && {
    MakeOwned();
    return std::move(std::get<std::string>(val_));
  }

  std::string* GetMutable() {
    MakeOwned();
    return &std::get<std::string>(val_);
  }

  bool empty() const {
    return visit([](const auto& s) { return s.empty(); }, val_);
  }

 private:
  std::variant<std::string_view, std::string> val_;
};

}  // namespace cmn


================================================
FILE: src/core/CMakeLists.txt
================================================
find_library(LIB_PCRE2 NAMES pcre2-8)
if(LIB_PCRE2)
  set(PCRE2_LIB ${LIB_PCRE2})
else()
  message(STATUS "pcre2-8 not found. Building without PCRE2 support.")
  set(PCRE2_LIB "")
endif()

find_library(LIB_RE2 NAMES re2)
if(LIB_RE2)
  set(RE2_LIB ${LIB_RE2})
else()
  message(STATUS "re2 not found. Building without RE2 support.")
  set(RE2_LIB "")
endif()

if (WITH_SEARCH)
  add_subdirectory(search)
else()
  add_library(dfly_search_core INTERFACE)
endif()

add_subdirectory(json)
add_subdirectory(page_usage)

add_library(dfly_core allocation_tracker.cc bloom.cc topk.cc compact_object.cc cms.cc dense_set.cc
    dragonfly_core.cc extent_tree.cc huff_coder.cc
    interpreter.cc glob_matcher.cc mi_memory_resource.cc qlist.cc dict_builder.cc sds_utils.cc
    segment_allocator.cc score_map.cc small_string.cc sorted_map.cc task_queue.cc
    tx_queue.cc string_set.cc string_map.cc tiering_types.cc top_keys.cc
    detail/bitpacking.cc detail/listpack_wrap.cc detail/listpack.cc
    oah_entry.cc)

cxx_link(dfly_core base dfly_search_core dfly_page_usage fibers2 jsonpath
    absl::flat_hash_map absl::str_format absl::random_random redis_lib
    TRDP::lua lua_modules
    OpenSSL::Crypto TRDP::dconv TRDP::lz4 TRDP::hdr_histogram)

add_executable(dash_bench dash_bench.cc)
cxx_link(dash_bench dfly_core redis_test_lib)

helio_cxx_test(dfly_core_test dfly_core TRDP::fast_float ${PCRE2_LIB} ${RE2_LIB} LABELS DFLY)
helio_cxx_test(compact_object_test dfly_core LABELS DFLY)
helio_cxx_test(extent_tree_test dfly_core LABELS DFLY)
helio_cxx_test(dash_test dfly_core file redis_test_lib DATA testdata/ids.txt.zst LABELS DFLY)
helio_cxx_test(interpreter_test dfly_core LABELS DFLY)

helio_cxx_test(string_set_test dfly_core LABELS DFLY)
helio_cxx_test(string_map_test dfly_core LABELS DFLY)
helio_cxx_test(oah_set_test dfly_core LABELS DFLY)
helio_cxx_test(sorted_map_test dfly_core redis_test_lib LABELS DFLY)
helio_cxx_test(bptree_set_test dfly_core LABELS DFLY)
helio_cxx_test(linear_search_map_test dfly_core LABELS DFLY)
helio_cxx_test(score_map_test dfly_core LABELS DFLY)
helio_cxx_test(flatbuffers_test dfly_core TRDP::flatbuffers LABELS DFLY)
helio_cxx_test(bloom_test dfly_core LABELS DFLY)
helio_cxx_test(allocation_tracker_test dfly_core absl::random_random LABELS DFLY)
helio_cxx_test(qlist_test dfly_core DATA testdata/list.txt.zst LABELS DFLY)
helio_cxx_test(listpack_test dfly_core redis_lib LABELS DFLY)
helio_cxx_test(zstd_test dfly_core TRDP::zstd LABELS DFLY)
helio_cxx_test(dict_builder_test dfly_core LABELS DFLY)
helio_cxx_test(top_keys_test dfly_core LABELS DFLY)
helio_cxx_test(topk_test dfly_core LABELS DFLY)
helio_cxx_test(page_usage_stats_test dfly_core LABELS DFLY)
helio_cxx_test(cms_test dfly_core LABELS DFLY)
helio_cxx_test(memory_test TRDP::mimalloc2 LABELS DFLY)

if(LIB_PCRE2)
  target_compile_definitions(dfly_core_test PRIVATE USE_PCRE2=1)
  # target_compile_definitions(dfly_core PUBLIC USE_PCRE2=1)
endif()

if(LIB_RE2)
  target_compile_definitions(dfly_core_test PRIVATE USE_RE2)
endif()


================================================
FILE: src/core/allocation_tracker.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/allocation_tracker.h"

#include "absl/random/random.h"
#include "base/logging.h"
#include "util/fibers/stacktrace.h"

namespace dfly {
namespace {
thread_local AllocationTracker g_tracker;
thread_local absl::InsecureBitGen g_bitgen;

bool CanCallVlog(std::string_view trace) {
  // GLOG fails when logging while flushing the current log under a mutex
  return trace.find("LogMessage::Flush") == std::string::npos;
}

}  // namespace

AllocationTracker& AllocationTracker::Get() {
  return g_tracker;
}

bool AllocationTracker::Add(const TrackingInfo& info) {
  if (tracking_.size() >= tracking_.capacity()) {
    return false;
  }

  tracking_.push_back(info);

  UpdateAbsSizes();

  return true;
}

bool AllocationTracker::Remove(size_t lower_bound, size_t upper_bound) {
  size_t before_size = tracking_.size();

  tracking_.erase(std::remove_if(tracking_.begin(), tracking_.end(),
                                 [&](const TrackingInfo& info) {
                                   return info.lower_bound == lower_bound &&
                                          info.upper_bound == upper_bound;
                                 }),
                  tracking_.end());

  UpdateAbsSizes();

  return before_size != tracking_.size();
}

void AllocationTracker::Clear() {
  tracking_.clear();
}

absl::Span<const AllocationTracker::TrackingInfo> AllocationTracker::GetRanges() const {
  return absl::MakeConstSpan(tracking_);
}

void AllocationTracker::ProcessNew(void* ptr, size_t size) {
  if (size < abs_min_size_ || size > abs_max_size_) {
    return;
  }

  if (inside_tracker_) {
    return;
  }

  // Prevent endless recursion, in case logging allocates memory
  inside_tracker_ = true;
  for (const auto& band : tracking_) {
    if (size > band.upper_bound || size < band.lower_bound) {
      continue;
    }

    // Micro optimization: in case sample_odds == 1.0 - do not draw a random number
    if (band.sample_odds != 1.0 && absl::Uniform(g_bitgen, 0.0, 1.0) >= band.sample_odds) {
      continue;
    }

    size_t usable = mi_usable_size(ptr);
    std::string trace = util::fb2::GetStacktrace();

    if (CanCallVlog(trace)) {
      DCHECK_GE(usable, size);
      LOG(INFO) << "Allocating " << usable << " bytes (" << ptr << "). Stack: " << trace;
    }

    break;
  }
  inside_tracker_ = false;
}

void AllocationTracker::ProcessDelete(void* ptr) {
  if (inside_tracker_) {
    return;
  }

  inside_tracker_ = true;
  // we partially handle deletes, specifically when specifying a single range with
  // 100% sampling rate.
  if (tracking_.size() == 1 && tracking_.front().sample_odds == 1) {
    size_t usable = mi_usable_size(ptr);
    if (usable <= tracking_.front().upper_bound && usable >= tracking_.front().lower_bound) {
      std::string trace = util::fb2::GetStacktrace();
      LOG_IF(INFO, CanCallVlog(trace)) << "Deallocating " << usable << " bytes (" << ptr << ")\n"
                                       << trace;
    }
  }
  inside_tracker_ = false;
}

void AllocationTracker::UpdateAbsSizes() {
  abs_min_size_ = 0;
  abs_max_size_ = 0;
  for (const auto& tracker : tracking_) {
    abs_min_size_ = std::min(abs_min_size_, tracker.lower_bound);
    abs_max_size_ = std::max(abs_max_size_, tracker.upper_bound);
  }
}

}  // namespace dfly


================================================
FILE: src/core/allocation_tracker.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/inlined_vector.h>
#include <mimalloc.h>

#include <cstddef>

namespace dfly {

// Allows "tracking" of memory allocations by size bands. Tracking is naive in that it only prints
// the stack trace of the memory allocation, if matched by size & sampling criteria.
// Supports up to 4 different bands in parallel.
//
// Thread-local. Must be configured in all relevant threads separately.
//
// #define INJECT_ALLOCATION_TRACKER before #include exactly once to override new/delete
class AllocationTracker {
 public:
  struct TrackingInfo {
    size_t lower_bound = 0;
    size_t upper_bound = 0;
    double sample_odds = 0.0;
  };

  // Returns a thread-local reference.
  static AllocationTracker& Get();

  // Will track memory allocations in range [lower, upper]. Sample odds must be between [0, 1],
  // where 1 means all allocations are tracked and 0 means none.
  bool Add(const TrackingInfo& info);

  // Removes all tracking exactly matching lower_bound and upper_bound.
  // Returns true if the tracking range [lower_bound, upper_bound] was removed
  // and false, otherwise.
  bool Remove(size_t lower_bound, size_t upper_bound);

  // Clears *all* tracking.
  void Clear();

  absl::Span<const TrackingInfo> GetRanges() const;

  void ProcessNew(void* ptr, size_t size);
  void ProcessDelete(void* ptr);

 private:
  void UpdateAbsSizes();

  absl::InlinedVector<TrackingInfo, 4> tracking_;
  bool inside_tracker_ = false;
  size_t abs_min_size_ = 0;
  size_t abs_max_size_ = 0;
};

}  // namespace dfly

#ifdef INJECT_ALLOCATION_TRACKER
// Code here is copied from mimalloc-new-delete, and modified to add tracking
void operator delete(void* p) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free(p);
};
void operator delete[](void* p) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free(p);
};

void operator delete(void* p, const std::nothrow_t&) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free(p);
}
void operator delete[](void* p, const std::nothrow_t&) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free(p);
}

void* operator new(std::size_t n) noexcept(false) {
  auto v = mi_new(n);
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
void* operator new[](std::size_t n) noexcept(false) {
  auto v = mi_new(n);
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}

void* operator new(std::size_t n, const std::nothrow_t& tag) noexcept {
  (void)(tag);
  auto v = mi_new_nothrow(n);
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept {
  (void)(tag);
  auto v = mi_new_nothrow(n);
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}

#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
void operator delete(void* p, std::size_t n) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_size(p, n);
};
void operator delete[](void* p, std::size_t n) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_size(p, n);
};
#endif

#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
void operator delete(void* p, std::align_val_t al) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_aligned(p, static_cast<size_t>(al));
}
void operator delete[](void* p, std::align_val_t al) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_aligned(p, static_cast<size_t>(al));
}
void operator delete(void* p, std::size_t n, std::align_val_t al) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_size_aligned(p, n, static_cast<size_t>(al));
};
void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_size_aligned(p, n, static_cast<size_t>(al));
};
void operator delete(void* p, std::align_val_t al, const std::nothrow_t&) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_aligned(p, static_cast<size_t>(al));
}
void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept {
  dfly::AllocationTracker::Get().ProcessDelete(p);
  mi_free_aligned(p, static_cast<size_t>(al));
}

void* operator new(std::size_t n, std::align_val_t al) noexcept(false) {
  auto v = mi_new_aligned(n, static_cast<size_t>(al));
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
void* operator new[](std::size_t n, std::align_val_t al) noexcept(false) {
  auto v = mi_new_aligned(n, static_cast<size_t>(al));
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
void* operator new(std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept {
  auto v = mi_new_aligned_nothrow(n, static_cast<size_t>(al));
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept {
  auto v = mi_new_aligned_nothrow(n, static_cast<size_t>(al));
  dfly::AllocationTracker::Get().ProcessNew(v, n);
  return v;
}
#endif
#endif  // INJECT_ALLOCATION_TRACKER


================================================
FILE: src/core/allocation_tracker_test.cc
================================================
#include <absl/strings/match.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <string>
#include <vector>

#include "base/gtest.h"
#include "base/logging.h"

#define INJECT_ALLOCATION_TRACKER
#include "core/allocation_tracker.h"

namespace dfly {
namespace {
using namespace std;
using namespace testing;

class LogSink : public google::LogSink {
 public:
  void send(google::LogSeverity severity, const char* full_filename, const char* base_filename,
            int line, const struct tm* tm_time, const char* message, size_t message_len) override {
    logs_.push_back(string(message, message_len));
  }

  const vector<string>& GetLogs() const {
    return logs_;
  }

  void Clear() {
    logs_.clear();
  }

 private:
  vector<string> logs_;
};

class AllocationTrackerTest : public Test {
 protected:
  AllocationTrackerTest() {
    google::AddLogSink(&log_sink_);
  }

  ~AllocationTrackerTest() {
    google::RemoveLogSink(&log_sink_);
    AllocationTracker::Get().Clear();
  }

  vector<string> GetLogsDelta() {
    auto logs = log_sink_.GetLogs();
    log_sink_.Clear();
    return logs;
  }

  void Allocate(size_t s) {
    CHECK(buffer_.empty());
    buffer_.resize(s);  // allocate 1mb before setting up tracking
  }

  void Deallocate() {
    buffer_.clear();
    // Force deallocation
    buffer_.shrink_to_fit();
  }

 private:
  LogSink log_sink_;
  string buffer_;
};

TEST_F(AllocationTrackerTest, UnusedTracker) {
  Allocate(1'000'000);  // allocate 1mb before setting up tracking
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();
}

TEST_F(AllocationTrackerTest, UsedTracker) {
  AllocationTracker::Get().Add(
      {.lower_bound = 1'000'000, .upper_bound = 2'000'000, .sample_odds = 1.0});
  Allocate(1'000'000);  // allocate 1mb before setting up tracking
  EXPECT_THAT(GetLogsDelta(), Contains(HasSubstr("Allocating")));
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Deallocating"))));
  Deallocate();
  EXPECT_THAT(GetLogsDelta(), Contains(HasSubstr("Deallocating")));

  // Allocate below threshold
  Allocate(100'000);
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Deallocating"))));

  // Allocate above threshold
  Allocate(10'000'000);
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Deallocating"))));

  // Remove allocator - stops logging
  EXPECT_TRUE(AllocationTracker::Get().Remove(1'000'000, 2'000'000));
  Allocate(1'000'000);  // allocate 1mb before setting up tracking
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Deallocating"))));
}

TEST_F(AllocationTrackerTest, MultipleRanges) {
  AllocationTracker::Get().Add(
      {.lower_bound = 1'000'000, .upper_bound = 2'000'000, .sample_odds = 1.0});
  AllocationTracker::Get().Add(
      {.lower_bound = 100'000'000, .upper_bound = 200'000'000, .sample_odds = 1.0});

  // Below all ranges
  Allocate(100'000);
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();

  // Between ranges
  Allocate(10'000'000);
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();

  // Above all ranges
  Allocate(500'000'000);
  EXPECT_THAT(GetLogsDelta(), Not(Contains(HasSubstr("Allocating"))));
  Deallocate();

  // First range
  Allocate(1'000'000);
  EXPECT_THAT(GetLogsDelta(), Contains(HasSubstr("Allocating")));
  Deallocate();

  // Second range
  Allocate(100'000'000);
  EXPECT_THAT(GetLogsDelta(), Contains(HasSubstr("Allocating")));
  Deallocate();
}

TEST_F(AllocationTrackerTest, Sampling) {
  // Statistically, 80% of logs should be logged
  AllocationTracker::Get().Add(
      {.lower_bound = 1'000'000, .upper_bound = 2'000'000, .sample_odds = 0.8});

  const int kIterations = 10'000;
  for (int i = 0; i < kIterations; ++i) {
    Allocate(1'000'000);
    Deallocate();
  }

  int allocations = 0;
  int deallocations = 0;
  for (const string& s : GetLogsDelta()) {
    if (absl::StrContains(s, "Allocating")) {
      ++allocations;
    }
    if (absl::StrContains(s, "Deallocating")) {
      ++deallocations;
    }
  }

  EXPECT_GE(allocations, kIterations * 0.7);
  EXPECT_LE(allocations, kIterations * 0.9);
  EXPECT_EQ(deallocations, 0);  // we only track deletions when sample_odds == 1.0
}

}  // namespace
}  // namespace dfly


================================================
FILE: src/core/bloom.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/bloom.h"

#include <absl/base/internal/endian.h>
#include <absl/numeric/bits.h>
#include <xxhash.h>

#include <algorithm>
#include <cmath>

#include "base/logging.h"

namespace dfly {

using namespace std;

namespace {

XXH128_hash_t Hash(string_view str) {
  return XXH3_128bits_withSeed(str.data(), str.size(), 0xc6a4a7935bd1e995ULL);  // murmur2 seed
}

uint64_t GetMask(unsigned log) {
  return (1ULL << log) - 1;
}

uint64_t BitIndex(uint64_t low, uint64_t hi, unsigned i, uint64_t mask) {
  return (low + hi * i) & mask;
}

constexpr double kDenom = M_LN2 * M_LN2;
constexpr double kSBFErrorFactor = 0.5;

double BPE(double fp_prob) {
  return -log(fp_prob) / kDenom;
}

}  // namespace

Bloom::~Bloom() {
  CHECK(bf_ == nullptr);
}

Bloom::Bloom(Bloom&& o) noexcept : hash_cnt_(o.hash_cnt_), bit_log_(o.bit_log_), bf_(o.bf_) {
  o.bf_ = nullptr;
}

void Bloom::Init(uint64_t entries, double fp_prob, PMR_NS::memory_resource* heap) {
  CHECK(bf_ == nullptr);
  CHECK(fp_prob > 0 && fp_prob < 1);

  if (fp_prob > 0.5)
    fp_prob = 0.5;
  double bpe = BPE(fp_prob);

  hash_cnt_ = ceil(M_LN2 * bpe);

  uint64_t bits = uint64_t(ceil(entries * bpe));
  if (bits < 512) {
    bits = 512;
  }
  bits = absl::bit_ceil(bits);  // make it power of 2.

  uint64_t length = bits / 8;
  bf_ = (uint8_t*)heap->allocate(length);
  memset(bf_, 0, length);
  bit_log_ = absl::countr_zero(bits);
}

void Bloom::Init(uint8_t* blob, size_t len, unsigned hash_cnt) {
  DCHECK_EQ(len * 8, absl::bit_ceil(len * 8));  // must be power of two.
  CHECK(bf_ == nullptr);
  hash_cnt_ = hash_cnt;
  bf_ = blob;
  bit_log_ = absl::countr_zero(len * 8);
}

void Bloom::Destroy(PMR_NS::memory_resource* resource) {
  resource->deallocate(CHECK_NOTNULL(bf_), bitlen() / 8);
  bf_ = nullptr;
}

bool Bloom::Exists(std::string_view str) const {
  XXH128_hash_t hash = Hash(str);
  uint64_t fp[2] = {hash.low64, hash.high64};

  return Exists(fp);
}

bool Bloom::Exists(const uint64_t fp[2]) const {
  uint64_t mask = GetMask(bit_log_);
  for (unsigned i = 0; i < hash_cnt_; ++i) {
    uint64_t index = BitIndex(fp[0], fp[1], i, mask);
    if (!IsSet(index))
      return false;
  }
  return true;
}

bool Bloom::Add(std::string_view str) {
  XXH128_hash_t hash = Hash(str);
  uint64_t fp[2] = {hash.low64, hash.high64};
  return Add(fp);
}

bool Bloom::Add(const uint64_t fp[2]) {
  uint64_t mask = GetMask(bit_log_);

  unsigned changes = 0;
  for (uint64_t i = 0; i < hash_cnt_; i++) {
    uint64_t index = BitIndex(fp[0], fp[1], i, mask);
    changes += Set(index);
  }

  return changes != 0;
}

size_t Bloom::Capacity(double fp_prob) const {
  if (fp_prob > 0.5)
    fp_prob = 0.5;
  double bpe = BPE(fp_prob);
  return floor(bitlen() / bpe);
}

inline bool Bloom::IsSet(size_t bit_idx) const {
  uint64_t byte_idx = bit_idx / 8;
  bit_idx %= 8;  // index within the byte
  uint8_t b = bf_[byte_idx];
  return (b & (1 << bit_idx)) != 0;
}

inline bool Bloom::Set(size_t bit_idx) {
  uint64_t byte_idx = bit_idx / 8;
  bit_idx %= 8;

  uint8_t b = bf_[byte_idx];
  bf_[byte_idx] |= (1 << bit_idx);
  return bf_[byte_idx] != b;
}

///////////////////////////////////////////////////////////////////////////////
// SBF implementation
///////////////////////////////////////////////////////////////////////////////
SBF::SBF(uint64_t initial_capacity, double fp_prob, double grow_factor, PMR_NS::memory_resource* mr)
    : filters_(1, mr), grow_factor_(grow_factor), fp_prob_(fp_prob * kSBFErrorFactor) {
  filters_.front().Init(initial_capacity, fp_prob_, mr);
  max_capacity_ = filters_.front().Capacity(fp_prob_);
}

SBF::SBF(double grow_factor, double fp_prob, size_t max_capacity, size_t prev_size,
         size_t current_size, PMR_NS::memory_resource* mr)
    : filters_(mr),
      grow_factor_(grow_factor),
      fp_prob_(fp_prob),
      prev_size_(prev_size),
      current_size_(current_size),
      max_capacity_(max_capacity) {
}

SBF::~SBF() {
  PMR_NS::memory_resource* mr = filters_.get_allocator().resource();
  for (auto& f : filters_)
    f.Destroy(mr);
}

SBF& SBF::operator=(SBF&& src) noexcept {
  filters_.clear();
  filters_.swap(src.filters_);
  grow_factor_ = src.grow_factor_;
  fp_prob_ = src.fp_prob_;
  current_size_ = src.current_size_;
  max_capacity_ = src.max_capacity_;

  return *this;
}

void SBF::AddFilter(const std::string& blob, unsigned hash_cnt) {
  PMR_NS::memory_resource* mr = filters_.get_allocator().resource();
  uint8_t* ptr = (uint8_t*)mr->allocate(blob.size(), 1);
  memcpy(ptr, blob.data(), blob.size());
  filters_.emplace_back().Init(ptr, blob.size(), hash_cnt);
}

bool SBF::Add(std::string_view str) {
  DCHECK_LT(current_size_, max_capacity_);

  XXH128_hash_t hash = Hash(str);
  uint64_t fp[2] = {hash.low64, hash.high64};

  auto exists = [fp](const Bloom& b) { return b.Exists(fp); };

  // Check for all the previous filters whether the item exists.
  if (any_of(next(filters_.crbegin()), filters_.crend(), exists)) {
    return false;
  }

  if (!filters_.back().Add(fp))
    return false;

  ++current_size_;

  // Based on the paper, the optimal fill ratio for SBF is 50%.
  // Lets add a new slice if we reach it.
  if (current_size_ >= max_capacity_) {
    fp_prob_ *= kSBFErrorFactor;
    filters_.emplace_back().Init(max_capacity_ * grow_factor_, fp_prob_,
                                 filters_.get_allocator().resource());
    current_size_ = 0;
    max_capacity_ = filters_.back().Capacity(fp_prob_);
  }

  return true;
}

bool SBF::Exists(std::string_view str) const {
  XXH128_hash_t hash = Hash(str);
  uint64_t fp[2] = {hash.low64, hash.high64};

  auto exists = [fp](const Bloom& b) { return b.Exists(fp); };

  return any_of(filters_.crbegin(), filters_.crend(), exists);
}

size_t SBF::MallocUsed() const {
  size_t res = filters_.capacity() * sizeof(Bloom);
  for (const auto& b : filters_) {
    res += (b.bitlen() / 8);
  }
  res += sizeof(SBF);

  return res;
}

}  // namespace dfly


================================================
FILE: src/core/bloom.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <string_view>
#include <vector>

#include "base/pmr/memory_resource.h"

namespace dfly {

/// Bloom filter based on the design of https://github.com/jvirkki/libbloom
class Bloom {
 public:
  Bloom() = default;
  Bloom(const Bloom&) = delete;
  Bloom& operator=(const Bloom&) = delete;

  // Note, that Destroy() must be called before calling the d'tor
  ~Bloom();

  // Initializes a new Bloom object
  // entries - entries are silently rounded up to the minimum capacity.
  // fp_prob - False-positive probability of collision. Must be in (0, 1) range.
  // heap
  void Init(uint64_t entries, double fp_prob, PMR_NS::memory_resource* resource);

  // Direct initializer. len*8 must be power of 2.
  void Init(uint8_t* blob, size_t len, unsigned hash_cnt);

  // Destroys the object, must be called before destructing the object.
  // resource - resource with which the object was initialized.
  void Destroy(PMR_NS::memory_resource* resource);

  Bloom(Bloom&& o) noexcept;

  bool Exists(std::string_view str) const;

  // Equivalent to the Exist above but accepts two fingerprints of the item.
  bool Exists(const uint64_t fp[2]) const;

  // Adds an item to the bloom filter.
  // Returns true if element was not present and was added,
  // false - if element (or a collision) had already been added previously.
  bool Add(std::string_view str);
  bool Add(const uint64_t fp[2]);

  size_t bitlen() const {
    return 1ULL << bit_log_;
  }

  // Max element capacity for this bloom filter.
  // Note that capacity is floor(bit_len / bpe), where bpe (bits per element) is
  // derived from fp_prob.
  size_t Capacity(double fp_prob) const;

  std::string_view data() const {
    return std::string_view{reinterpret_cast<const char*>(bf_), bitlen() / 8};
  }

  unsigned hash_cnt() const {
    return hash_cnt_;
  }

 private:
  bool IsSet(size_t index) const;
  bool Set(size_t index);  // return true if bit was set (i.e was 0 before)

  uint8_t hash_cnt_ = 0;
  uint8_t bit_log_ = 0;    // log of bit length of the filter. bit length is always power of 2.
  uint8_t* bf_ = nullptr;  // pointer to the blob.
};

/**
 * @brief Scalable bloom filter.
 * Based on https://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf
 * Please note that for SBF, the original paper assumes partitioning of bit space into K
 * disjoint segments where K is number of hash functions. This is done to reduce index collisions.
 * We do not do this, because we use power of 2 bit lengths.
 * TODO: to test the actual rate of this filter.
 */
class SBF {
 public:
  SBF(uint64_t initial_capacity, double fp_prob, double grow_factor, PMR_NS::memory_resource* mr);
  SBF(const SBF&) = delete;

  // C'tor used for loading persisted filters into SBF.
  // Should be followed by AddFilter.
  SBF(double grow_factor, double fp_prob, size_t max_capacity, size_t prev_size,
      size_t current_size, PMR_NS::memory_resource* mr);
  ~SBF();

  SBF& operator=(SBF&& src) noexcept;

  void AddFilter(const std::string& blob, unsigned hash_cnt);

  bool Add(std::string_view str);
  bool Exists(std::string_view str) const;

  size_t current_size() const {
    return current_size_;
  }

  size_t prev_size() const {
    return prev_size_;
  }

  double grow_factor() const {
    return grow_factor_;
  }

  // expected fp probability for the current filter.
  double fp_probability() const {
    return fp_prob_;
  }

  uint32_t num_filters() const {
    return filters_.size();
  }

  std::string_view data(size_t idx) const {
    return filters_[idx].data();
  }

  unsigned hashfunc_cnt(size_t idx) const {
    return filters_[idx].hash_cnt();
  }

  // max capacity of the current filter.
  size_t max_capacity() const {
    return max_capacity_;
  }

  size_t MallocUsed() const;

 private:
  // multiple filters from the smallest to the largest.
  std::vector<Bloom, PMR_NS::polymorphic_allocator<Bloom>> filters_;
  double grow_factor_;
  double fp_prob_;
  size_t prev_size_ = 0;
  size_t current_size_ = 0;
  size_t max_capacity_;
};

}  // namespace dfly


================================================
FILE: src/core/bloom_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/bloom.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>

#include "base/gtest.h"

namespace dfly {

using namespace std;

class BloomTest : public ::testing::Test {
 protected:
  BloomTest() {
    bloom_.Init(1000, 0.001, PMR_NS::get_default_resource());
  }

  ~BloomTest() {
    bloom_.Destroy(PMR_NS::get_default_resource());
  }

  Bloom bloom_;
};

TEST_F(BloomTest, Basic) {
  EXPECT_FALSE(bloom_.Exists(string_view{}));
  EXPECT_TRUE(bloom_.Add(string_view{}));
  EXPECT_TRUE(bloom_.Exists(string_view{}));
  EXPECT_FALSE(bloom_.Add(string_view{}));

  vector<string> values;
  for (unsigned i = 0; i < 100; ++i) {
    values.push_back(absl::StrCat("val", i));
  }

  for (const auto& val : values) {
    EXPECT_FALSE(bloom_.Exists(val));
    EXPECT_TRUE(bloom_.Add(val));
    EXPECT_TRUE(bloom_.Exists(val));
    EXPECT_FALSE(bloom_.Add(val));
  }
}

TEST_F(BloomTest, ErrorBound) {
  size_t max_capacity = bloom_.Capacity(0.001);
  for (unsigned i = 0; i < max_capacity; ++i) {
    ASSERT_FALSE(bloom_.Exists(absl::StrCat("item", i)));
  }

  unsigned collisions = 0;
  for (unsigned i = 0; i < max_capacity; ++i) {
    if (!bloom_.Add(absl::StrCat("item", i))) {
      ++collisions;
    }
  }

  EXPECT_EQ(collisions, 0) << max_capacity;
}

TEST_F(BloomTest, Extreme) {
  Bloom b2;

  // Init with unreasonable large error probability.
  b2.Init(10, 0.999, PMR_NS::get_default_resource());

  EXPECT_EQ(512, b2.bitlen());  // minimal bit length, even though requested smaller capacity.
  EXPECT_LT(b2.Capacity(0.999), 512);  // make sure our element capacity is smaller.
  b2.Destroy(PMR_NS::get_default_resource());
}

TEST_F(BloomTest, SBF) {
  SBF sbf(10, 0.001, 2, PMR_NS::get_default_resource());

  unsigned collisions = 0;
  constexpr unsigned kNumElems = 2000000;
  for (unsigned i = 0; i < kNumElems; ++i) {
    if (!sbf.Add(absl::StrCat("item", i))) {
      ++collisions;
    }
  }

  // TODO: to revisit the math for deriving number of hash functions for each filter
  // according the the SBF paper.
  EXPECT_LE(collisions, kNumElems * 0.008);
}

static void BM_BloomExist(benchmark::State& state) {
  constexpr size_t kCapacity = 1U << 22;
  Bloom bloom;
  bloom.Init(kCapacity, 0.001, PMR_NS::get_default_resource());
  for (size_t i = 0; i < kCapacity * 0.8; ++i) {
    bloom.Add(absl::StrCat("val", i));
  }
  unsigned i = 0;
  char buf[32];
  memset(buf, 'x', sizeof(buf));
  string_view sv{buf, sizeof(buf)};
  while (state.KeepRunning()) {
    absl::numbers_internal::FastIntToBuffer(i, buf);
    bloom.Exists(sv);
  }
  bloom.Destroy(PMR_NS::get_default_resource());
}
BENCHMARK(BM_BloomExist);

}  // namespace dfly


================================================
FILE: src/core/bptree_set.h
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <functional>
#include <optional>

#include "core/detail/bptree_internal.h"
#include "core/detail/stateless_allocator.h"

namespace dfly {

template <typename T> struct DefaultCompareTo {
  int operator()(const T& a, const T& b) const {
    std::less<T> cmp;
    return cmp(a, b) ? -1 : (cmp(b, a) ? 1 : 0);
  }
};

template <typename T> struct BPTreePolicy {
  using KeyT = T;

  // The three way comparator that should accept a query ( or key) on the left, and the key
  // on the right.
  using KeyCompareTo = DefaultCompareTo<T>;
};

template <typename T, typename Policy = BPTreePolicy<T>> class BPTree {
  BPTree(const BPTree&) = delete;
  BPTree& operator=(const BPTree&) = delete;

  using BPTreeNode = detail::BPTreeNode<T>;
  using BPTreePath = detail::BPTreePath<T>;

 public:
  using KeyT = typename Policy::KeyT;

  BPTree(PMR_NS::memory_resource* mr = PMR_NS::get_default_resource()) : mr_(mr) {
  }

  ~BPTree() {
    Clear();
  }

  // true if inserted, false if skipped.
  bool Insert(KeyT item);

  bool Contains(KeyT item) const;

  bool Delete(KeyT item);

  std::optional<uint32_t> GetRank(KeyT item, bool reverse = false) const;

  size_t Height() const {
    return height_;
  }

  size_t Size() const {
    return count_;  // number of items in the tree
  }

  bool Empty() const {
    return count_ == 0;
  }

  size_t NodeCount() const {
    // number of nodes in the tree (usually, order of magnitude smaller than Size()).
    return num_nodes_;
  }

  void Clear();

  const BPTreeNode* DEBUG_root() const {
    return root_;
  }

  BPTreePath FromRank(uint32_t rank) const {
    BPTreePath path;
    ToRank(rank, &path);
    return path;
  }

  /// @brief Iterates over all items in the range [rank_start, rank_end] by rank.
  /// @param rank_start
  /// @param rank_end - inclusive.
  /// @param cb - callback to be called for each item in the range.
  ///             Should return false to stop iteration.
  bool Iterate(uint32_t rank_start, uint32_t rank_end, std::function<bool(KeyT)> cb) const;

  /// @brief Iterates over all items in the range [rank_start, rank_end] by rank in reverse order.
  /// @param rank_start
  /// @param rank_end
  /// @param cb - callback to be called for each item in the range.
  ///             Should return false to stop iteration.
  bool IterateReverse(uint32_t rank_start, uint32_t rank_end, std::function<bool(KeyT)> cb) const;

  /// @brief Returns the path to the first item in the tree for which comp(q, key) >= 0.
  /// @param item
  /// @return the path if such item exists, empty path otherwise.
  template <typename Q> BPTreePath GEQ(Q&& query) const;

  /// @brief Returns the path to the largest item in the tree such that comp(q, key) <= 0.
  /// @param key
  /// @return the path if such item exists, empty path otherwise.
  template <typename Q> BPTreePath LEQ(Q&& query) const;

  /// @brief Deletes the element pointed by path.
  /// @param path
  void Delete(BPTreePath path);

  /// @brief Forces an update to the key. Assumes key has the same value.
  /// Replaces old with new_obj.
  void ForceUpdate(KeyT old, KeyT new_obj);

 private:
  BPTreeNode* CreateNode(bool leaf);

  void DestroyNode(BPTreeNode* node);

  void InsertToFullLeaf(KeyT item, const BPTreePath& path);

  // Returns true if insertion was handled by rebalancing.
  bool RebalanceLeafAndInsert(const BPTreePath& path, unsigned parent_depth, KeyT item,
                              unsigned insert_pos);

  void IncreaseSubtreeCounts(const BPTreePath& path, unsigned depth, int32_t delta);

  // Charts the path towards key. Returns true if key is found.
  // In that case comp(q, path->Last().first->Key(path->Last().second)) == 0.
  // Fills the tree path not including the key itself. In case key was not found,
  // returns the path to the item that is greater than the key.
  template <typename Q> bool Locate(Q&& q, BPTreePath* path) const;

  // Sets the tree path to item at specified rank. Rank is 0-based and must be less than Size().
  // returns the index of the key in the last node of the path.
  void ToRank(uint32_t rank, BPTreePath* path) const;

  BPTreeNode* root_ = nullptr;  // root node or NULL if empty tree
  uint32_t count_ = 0;          // number of items in tree
  uint32_t height_ = 0;         // height of tree from root to leaf
  uint32_t num_nodes_ = 0;      // number of nodes in tree
  PMR_NS::memory_resource* mr_;
};

template <typename T, typename Policy> bool BPTree<T, Policy>::Contains(KeyT item) const {
  BPTreePath path;
  bool found = Locate(item, &path);
  return found;
}

template <typename T, typename Policy> void BPTree<T, Policy>::Clear() {
  if (!root_)
    return;

  BPTreePath path;
  BPTreeNode* node = root_;

  auto deep_left = [&](unsigned pos) {
    do {
      path.Push(node, pos);
      node = node->Child(pos);
      pos = 0;
    } while (!node->IsLeaf());
  };

  if (!root_->IsLeaf())
    deep_left(0);

  while (true) {
    DestroyNode(node);

    if (path.Depth() == 0) {
      break;
    }
    node = path.Last().first;
    unsigned pos = path.Last().second;
    path.Pop();
    if (pos < node->NumItems()) {
      deep_left(pos + 1);
    }
  }
  root_ = nullptr;
  height_ = count_ = 0;
}

template <typename T, typename Policy> bool BPTree<T, Policy>::Insert(KeyT item) {
  if (!root_) {
    root_ = CreateNode(true);
    root_->InitSingle(item);
    count_ = height_ = 1;

    return true;
  }

  BPTreePath path;
  bool found = Locate(item, &path);

  if (found) {
    return false;
  }

  assert(path.Depth() > 0u);

  BPTreeNode* leaf = path.Last().first;
  assert(leaf->IsLeaf());

  if (leaf->NumItems() == detail::BPNodeLayout<T>::kMaxLeafKeys) {
    InsertToFullLeaf(item, path);
  } else {
    unsigned pos = path.Last().second;
    leaf->LeafInsert(pos, item);
    if (path.Depth() > 1)
      IncreaseSubtreeCounts(path, path.Depth() - 2, 1);
  }
  count_++;
  return true;
}

template <typename T, typename Policy> bool BPTree<T, Policy>::Delete(KeyT item) {
  if (!root_)
    return false;

  BPTreePath path;
  bool found = Locate(item, &path);
  if (!found)
    return false;

  Delete(path);
  return true;
}

template <typename T, typename Policy>
std::optional<uint32_t> BPTree<T, Policy>::GetRank(KeyT item, bool reverse) const {
  if (!root_)
    return std::nullopt;

  BPTreePath path;
  bool found = Locate(item, &path);
  if (!found)
    return std::nullopt;

  if (reverse) {
    return count_ - path.Rank() - 1;
  }

  return path.Rank();
}

template <typename T, typename Policy>
template <typename Q>
bool BPTree<T, Policy>::Locate(Q&& q, BPTreePath* path) const {
  assert(root_);
  BPTreeNode* node = root_;
  typename Policy::KeyCompareTo cmp;
  auto cmp_cb = [&](const KeyT& key) { return cmp(q, key); };

  while (true) {
    typename BPTreeNode::SearchResult res = node->BSearch(cmp_cb);
    path->Push(node, res.index);
    if (res.found) {
      return true;
    }
    assert(res.index <= node->NumItems());

    if (node->IsLeaf()) {
      break;
    }
    node = node->Child(res.index);
  }
  return false;
}

template <typename T, typename Policy>
void BPTree<T, Policy>::InsertToFullLeaf(KeyT item, const BPTreePath& path) {
  using Layout = detail::BPNodeLayout<T>;
  using Comp [[maybe_unused]] = typename Policy::KeyCompareTo;

  assert(path.Depth() > 0u);

  BPTreeNode* node = path.Last().first;
  assert(node->IsLeaf() && node->AvailableSlotCount() == 0);

  unsigned insert_pos = path.Last().second;
  unsigned level = path.Depth() - 1;
  if (level > 0 && RebalanceLeafAndInsert(path, level - 1, item, insert_pos)) {
    // Update the tree count of the ascendants.
    IncreaseSubtreeCounts(path, level - 1, 1);
    return;
  }

  KeyT median;
  BPTreeNode* right = CreateNode(true);
  node->Split(right, &median);

  assert(node->NumItems() < Layout::kMaxLeafKeys);

  if (insert_pos <= node->NumItems()) {
    assert(Comp()(item, median) < 0);
    node->LeafInsert(insert_pos, item);
  } else {
    assert(Comp()(item, median) > 0);
    right->LeafInsert(insert_pos - node->NumItems() - 1, item);
  }

  // we must add the newly created `right` to the parent and update its tree count.
  while (level > 0) {
    --level;
    // level up, now node is parent.
    node = path.Node(level);
    unsigned pos = path.Position(level);  // position of the child node in parent.

    assert(!node->IsLeaf() && pos <= node->NumItems());
    assert(right);

    // Terminal case: Node is not full so we can just add `right` to it.
    if (node->NumItems() < Layout::kMaxInnerKeys) {
      // We do not update the subtree count of the node here because the surpus of another item
      // resulted with the additional key in this node.
      node->InnerInsert(pos, median, right);
      node->IncreaseTreeCount(1);
      right = nullptr;
      break;
    }

    // We need to insert right into a node as position pos. Node is full so we must handle it
    // either via rebalancing "node" or via its splitting. Rebalancing is a better case, we try
    // it first.
    if (level > 0) {
      // see if we can rebalance node (right's parent) via node's parent.
      BPTreeNode* parent = path.Node(level - 1);
      unsigned parent_pos = path.Position(level - 1);
      assert(parent->Child(parent_pos) == node);

      auto [new_node, inner_pos] = parent->RebalanceChild(parent_pos, pos);
      if (new_node) {
        // we rebalanced inner_full so we can insert (median, right) and stop propagating.
        new_node->InnerInsert(inner_pos, median, right);

        if (new_node != node) {
          // Fix subtree counts if right was migrated to the sibling.
          node->IncreaseTreeCount(-right->TreeCount());
          new_node->IncreaseTreeCount(right->TreeCount() + 1);
        } else {
          node->IncreaseTreeCount(1);
        }
        right = nullptr;
        break;
      }
    }

    // node is not rebalanced, so we need to split it.
    BPTreeNode* next_right = CreateNode(false);
    KeyT next_median;
    node->Split(next_right, &next_median);
    assert(node->NumItems() < Layout::kMaxInnerKeys);

    if (pos <= node->NumItems()) {
      assert(Comp()(median, next_median) < 0);

      node->InnerInsert(pos, median, right);
      node->IncreaseTreeCount(1);
    } else {
      assert(Comp()(median, next_median) > 0);

      next_right->InnerInsert(pos - node->NumItems() - 1, median, right);

      // Fix tree counts.
      node->IncreaseTreeCount(-right->TreeCount());
      next_right->IncreaseTreeCount(right->TreeCount() + 1);
    }
    right = next_right;
    median = next_median;
  }

  if (right) {
    assert(level == 0);
    BPTreeNode* new_root = CreateNode(false);
    new_root->InitSingle(median);
    new_root->SetChild(0, root_);
    new_root->SetChild(1, right);
    new_root->SetTreeCount(root_->TreeCount() + right->TreeCount() + 1);
    root_ = new_root;
    height_++;
  } else {
    if (level > 0) {
      IncreaseSubtreeCounts(path, level - 1, 1);
    }
  }
}

template <typename T, typename Policy>
bool BPTree<T, Policy>::RebalanceLeafAndInsert(const BPTreePath& path, unsigned parent_depth,
                                               KeyT item, unsigned insert_pos) {
  BPTreeNode* parent = path.Node(parent_depth);
  unsigned pos = path.Position(parent_depth);

  std::pair<BPTreeNode*, unsigned> rebalance_res = parent->RebalanceChild(pos, insert_pos);
  if (rebalance_res.first) {
    rebalance_res.first->LeafInsert(rebalance_res.second, item);
    return true;
  }
  return false;
}

template <typename T, typename Policy>
void BPTree<T, Policy>::IncreaseSubtreeCounts(const BPTreePath& path, unsigned depth,
                                              int32_t delta) {
  for (int i = depth; i >= 0; --i) {
    BPTreeNode* node = path.Node(i);
    node->IncreaseTreeCount(delta);
  }
}

template <typename T, typename Policy>
bool BPTree<T, Policy>::Iterate(uint32_t rank_start, uint32_t rank_end,
                                std::function<bool(KeyT)> cb) const {
  if (rank_start >= Size())
    return true;

  assert(rank_start <= rank_end);

  BPTreePath path;
  ToRank(rank_start, &path);
  for (uint32_t i = rank_start; i <= rank_end; ++i) {
    if (!cb(path.Terminal()))
      return false;

    if (!path.Next())
      return true;
  }
  return true;
}

template <typename T, typename Policy>
bool BPTree<T, Policy>::IterateReverse(uint32_t rank_start, uint32_t rank_end,
                                       std::function<bool(KeyT)> cb) const {
  assert(rank_start <= rank_end && rank_end < count_);

  BPTreePath path;
  ToRank(count_ - 1 - rank_start, &path);
  for (uint32_t i = rank_start; i <= rank_end; ++i) {
    if (!cb(path.Terminal()))
      return false;

    path.Prev();
  }
  return true;
}

template <typename T, typename Policy>
void BPTree<T, Policy>::ToRank(uint32_t rank, BPTreePath* path) const {
  assert(root_ && rank < count_);
  BPTreeNode* node = root_;

  if (rank + 1 == count_) {
    // Corner case where we search for the node on the right.
    while (!node->IsLeaf()) {
      path->Push(node, node->NumItems());
      node = node->Child(node->NumItems());
    }
    path->Push(node, node->NumItems() - 1);
    return;
  }

  while (!node->IsLeaf()) {
    // handle common corner case of search of left-most node, and avoid counting sub-tree count.
    if (rank == 0) {
      path->Push(node, 0);
      node = node->Child(0);
      continue;
    }

    for (unsigned i = 0; i <= node->NumItems(); ++i) {
      uint32_t subtree_cnt = node->GetChildTreeCount(i);
      if (subtree_cnt > rank) {
        path->Push(node, i);
        node = node->Child(i);
        break;
      }
      assert(i < node->NumItems());
      rank -= subtree_cnt;
      if (rank == 0) {
        path->Push(node, i);
        return;
      }
      --rank;
    }
  }

  assert(node->IsLeaf());
  assert(rank < node->NumItems());
  path->Push(node, rank);
}

template <typename T, typename Policy>
template <typename Q>
auto BPTree<T, Policy>::GEQ(Q&& query) const -> BPTreePath {
  BPTreePath path;

  bool res = Locate(query, &path);

  // if we did not find the item and the path does not lead to any key in the node,
  // adjust the path to point to the next key in the tree.
  // In case we are past all items in the tree, Next() will collapse to the empty path.
  if (!res && path.Last().second >= path.Last().first->NumItems()) {
    path.Next();
  }

  return path;
}

template <typename T, typename Policy>
template <typename Q>
auto BPTree<T, Policy>::LEQ(Q&& query) const -> BPTreePath {
  BPTreePath path;
  bool res = Locate(query, &path);

  if (!res) {  // fix the result in case the path leads to key greater than item.
    path.Prev();
  }

  return path;
}

template <typename T, typename Policy>
detail::BPTreeNode<T>* BPTree<T, Policy>::CreateNode(bool leaf) {
  num_nodes_++;
  void* ptr = mr_->allocate(detail::kBPNodeSize, 8);
  BPTreeNode* node = new (ptr) BPTreeNode(leaf);

  return node;
}

template <typename T, typename Policy> void BPTree<T, Policy>::Delete(BPTreePath path) {
  using Comp [[maybe_unused]] = typename Policy::KeyCompareTo;

  BPTreeNode* node = path.Last().first;
  unsigned key_pos = path.Last().second;

  // Remove the key from the node.
  if (node->IsLeaf()) {
    node->ShiftLeft(key_pos);  // shift left everything after key_pos.
  } else {
    // We can not remove the item from the inner node because it also serves as a separator.
    // Therefore, we swap it the rightmost key in the left subtree and pop from there instead.
    path.DigRight();

    BPTreeNode* leaf = path.Last().first;
    assert(Comp()(leaf->Key(leaf->NumItems() - 1), node->Key(key_pos)) < 0);

    // set a new separator.
    node->SetKey(key_pos, leaf->Key(leaf->NumItems() - 1));
    leaf->LeafEraseRight();  // pop the rightmost key from the leaf.
    node = leaf;
  }
  count_--;

  assert(node->IsLeaf());

  // go up the tree and rebalance if number of items in the node is less
  // than low limit. We either merge or rebalance nodes.
  while (node->NumItems() < node->MinItems()) {
    if (node == root_) {
      if (node->NumItems() == 0) {
        // terminal case, we reached the root - and it has either a single child (0 delimiters)
        // or no children at all (leaf). The former is more common case: the tree can only shrink
        // through the root.
        if (node->IsLeaf()) {
          assert(count_ == 0u);
          root_ = nullptr;
        } else {
          root_ = root_->Child(0);
        }
        --height_;
        DestroyNode(node);
      }
      return;
    }

    // The node has a parent. Pop the node from the path and try rebalance it via its parent.
    assert(path.Depth() > 0u);
    path.Pop();

    BPTreeNode* parent = path.Last().first;
    unsigned pos = path.Last().second;
    assert(parent->Child(pos) == node);
    node = parent->MergeOrRebalanceChild(pos);

    parent->IncreaseTreeCount(-1);

    if (node == nullptr)  // succeeded to merge/rebalance without the need to propagate.
      break;

    DestroyNode(node);

    // assert(parent->TreeCount() == parent->DEBUG_TreeCount());
    node = parent;
  }

  if (path.Depth() >= 2) {
    IncreaseSubtreeCounts(path, path.Depth() - 2, -1);
  }
}

template <typename T, typename Policy> void BPTree<T, Policy>::DestroyNode(BPTreeNode* node) {
  void* ptr = node;
  mr_->deallocate(ptr, detail::kBPNodeSize, 8);
  num_nodes_--;
}

template <typename T, typename Policy> void BPTree<T, Policy>::ForceUpdate(KeyT old, KeyT new_obj) {
  BPTreePath path;
  [[maybe_unused]] bool found = Locate(old, &path);

  assert(path.Depth() > 0u);
  assert(found);

  BPTreeNode* node = path.Last().first;
  node->SetKey(path.Last().second, new_obj);
}

}  // namespace dfly


================================================
FILE: src/core/bptree_set_test.cc
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/bptree_set.h"

#include <absl/container/btree_set.h>
#include <gmock/gmock.h>
#include <mimalloc.h>

#include <random>

extern "C" {
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

#include "base/gtest.h"
#include "base/init.h"
#include "base/logging.h"
#include "core/mi_memory_resource.h"

using namespace std;

namespace dfly {

namespace {

template <typename Node, typename Policy>
bool ValidateNode(const Node* node, typename Node::KeyT ubound) {
  typename Policy::KeyCompareTo cmp;

  for (unsigned i = 1; i < node->NumItems(); ++i) {
    if (cmp(node->Key(i - 1), node->Key(i)) > -1)
      return false;
  }

  if (!node->IsLeaf()) {
    unsigned mask = 0;
    uint32_t subtree_cnt = node->NumItems();
    for (unsigned i = 0; i <= node->NumItems(); ++i) {
      mask |= (1 << node->Child(i)->IsLeaf());
      DCHECK_EQ(node->Child(i)->DEBUG_TreeCount(), node->Child(i)->TreeCount());
      subtree_cnt += node->Child(i)->TreeCount();
    }
    if (mask == 3)
      return false;

    if (subtree_cnt != node->TreeCount()) {
      LOG(ERROR) << "Expected " << subtree_cnt << " got " << node->TreeCount();
      return false;
    }
  }

  return cmp(node->Key(node->NumItems() - 1), ubound) == -1;
}

struct ZsetPolicy {
  struct KeyT {
    double d;
    sds s;
  };

  struct KeyCompareTo {
    int operator()(const KeyT& left, const KeyT& right) {
      if (left.d < right.d)
        return -1;
      if (left.d > right.d)
        return 1;

      // Note that sdscmp can return values outside of [-1, 1] range.
      return sdscmp(left.s, right.s);
    }
  };
};

using SDSTree = BPTree<ZsetPolicy::KeyT, ZsetPolicy>;

}  // namespace

class BPTreeSetTest : public ::testing::Test {
  using Node = detail::BPTreeNode<uint64_t>;

 protected:
  static constexpr size_t kNumElems = 7000;

  BPTreeSetTest() : mi_alloc_(mi_heap_get_backing()), bptree_(&mi_alloc_) {
  }
  static void SetUpTestSuite() {
  }

  void FillTree(unsigned start, unsigned factor) {
    for (unsigned i = start; i < kNumElems; ++i) {
      bptree_.Insert(i * factor);
    }
  }

  void FillTree(unsigned factor = 1) {
    FillTree(0, factor);
  }

  bool Validate();

  MiMemoryResource mi_alloc_;
  BPTree<uint64_t> bptree_;
  mt19937 generator_{1};
};

bool BPTreeSetTest::Validate() {
  auto* root = bptree_.DEBUG_root();
  if (!root)
    return true;

  // node, upper bound
  vector<pair<const Node*, uint64_t>> stack;

  stack.emplace_back(root, UINT64_MAX);

  while (!stack.empty()) {
    const Node* node = stack.back().first;
    uint64_t ubound = stack.back().second;
    stack.pop_back();

    if (!ValidateNode<Node, BPTreePolicy<uint64_t>>(node, ubound))
      return false;

    if (!node->IsLeaf()) {
      for (unsigned i = 0; i < node->NumItems(); ++i) {
        stack.emplace_back(node->Child(i), node->Key(i));
      }
      stack.emplace_back(node->Child(node->NumItems()), ubound);
    }
  }
  return true;
}

TEST_F(BPTreeSetTest, BPtreeInsert) {
  for (unsigned i = 1; i < 7000; ++i) {
    ASSERT_TRUE(bptree_.Insert(i));
    ASSERT_EQ(i, bptree_.Size());
    ASSERT_EQ(i - 1, bptree_.GetRank(i));
    // ASSERT_TRUE(Validate()) << i;
  }
  ASSERT_TRUE(Validate());

  ASSERT_GT(mi_alloc_.used(), 56000u);
  ASSERT_LT(mi_alloc_.used(), 66000u);

  for (unsigned i = 1; i < 7000; ++i) {
    ASSERT_TRUE(bptree_.Contains(i));
  }

  bptree_.Clear();
  ASSERT_EQ(mi_alloc_.used(), 0u);

  uniform_int_distribution<uint64_t> dist(0, 100000);
  for (unsigned i = 0; i < 20000; ++i) {
    bptree_.Insert(dist(generator_));
    // ASSERT_TRUE(Validate()) << i;
  }
  ASSERT_TRUE(Validate());
  ASSERT_GT(mi_alloc_.used(), 10000u);
  LOG(INFO) << bptree_.Height() << " " << bptree_.Size();

  bptree_.Clear();
  ASSERT_EQ(mi_alloc_.used(), 0u);

  for (unsigned i = 20000; i > 1; --i) {
    bptree_.Insert(i);
  }
  ASSERT_TRUE(Validate());
  for (unsigned i = 2; i <= 20000; ++i) {
    ASSERT_EQ(i - 2, bptree_.GetRank(i));
  }

  LOG(INFO) << bptree_.Height() << " " << bptree_.Size();
  ASSERT_GT(mi_alloc_.used(), 20000 * 8);
  ASSERT_LT(mi_alloc_.used(), 20000 * 10);
  bptree_.Clear();
  ASSERT_EQ(mi_alloc_.used(), 0u);
}

TEST_F(BPTreeSetTest, Delete) {
  for (unsigned i = 31; i > 10; --i) {
    bptree_.Insert(i);
  }

  for (unsigned i = 1; i < 10; ++i) {
    ASSERT_FALSE(bptree_.Delete(i));
  }

  for (unsigned i = 11; i < 32; ++i) {
    ASSERT_TRUE(bptree_.Delete(i));
  }
  ASSERT_EQ(mi_alloc_.used(), 0u);
  ASSERT_EQ(bptree_.Size(), 0u);

  FillTree();

  ASSERT_GT(bptree_.NodeCount(), 2u);
  unsigned sz = bptree_.Size();
  for (unsigned i = 0; i < kNumElems; ++i) {
    --sz;
    ASSERT_EQ(bptree_.GetRank(kNumElems - 1), sz);

    ASSERT_TRUE(bptree_.Delete(i));
    ASSERT_EQ(bptree_.Size(), sz);
    // ASSERT_TRUE(Validate()) << i;
  }

  ASSERT_EQ(mi_alloc_.used(), 0u);
  ASSERT_EQ(bptree_.Size(), 0u);
  ASSERT_EQ(bptree_.Height(), 0u);
  ASSERT_EQ(bptree_.NodeCount(), 0u);

  FillTree(2);
  for (unsigned i = 0; i < 20000; ++i) {
    unsigned val = generator_() % 15000;
    bool res = bptree_.Delete(val);

    if (val % 2 == 1) {
      ASSERT_FALSE(res);
    }
    if (res) {
      ASSERT_TRUE(Validate());
    }
  }
}

TEST_F(BPTreeSetTest, Iterate) {
  FillTree(2);

  unsigned cnt = 0;
  bool res = bptree_.Iterate(31, 543, [&](uint64_t val) {
    if ((31 + cnt) * 2 != val)
      return false;
    ++cnt;
    return true;
  });
  ASSERT_EQ(543 - 31 + 1, cnt);
  ASSERT_TRUE(res);

  for (unsigned j = 0; j < 10; ++j) {
    cnt = 0;
    unsigned from = generator_() % kNumElems;
    unsigned to = from + generator_() % (kNumElems - from);
    res = bptree_.Iterate(from, to, [&](uint64_t val) {
      if ((from + cnt) * 2 != val)
        return false;
      ++cnt;
      return true;
    });

    ASSERT_EQ(to - from + 1, cnt);
    ASSERT_TRUE(res);
  }
}

TEST_F(BPTreeSetTest, Ranges) {
  FillTree(2);

  auto path = bptree_.GEQ(31);
  EXPECT_EQ(32, path.Terminal());

  path = bptree_.GEQ(32);
  EXPECT_EQ(32, path.Terminal());

  path = bptree_.GEQ(13998);
  EXPECT_EQ(13998, path.Terminal());

  path = bptree_.LEQ(14000);
  EXPECT_EQ(13998, path.Terminal());

  path = bptree_.GEQ(14000);
  EXPECT_EQ(0, path.Depth());

  ASSERT_TRUE(bptree_.Delete(0));
  path = bptree_.GEQ(0);
  EXPECT_EQ(2, path.Terminal());

  path = bptree_.LEQ(1);
  EXPECT_TRUE(path.Empty());
}

TEST_F(BPTreeSetTest, HalfRanges) {
  FillTree(1, 3);  // 3, 6, 9 ...
  auto path = bptree_.FromRank(bptree_.Size() - 1);
  uint64_t val = path.Terminal();
  for (unsigned i = 0; i <= val; ++i) {
    path = bptree_.GEQ(i);
    ASSERT_FALSE(path.Empty()) << i;
  }
  path = bptree_.GEQ(val + 1);
  ASSERT_TRUE(path.Empty());

  for (unsigned i = 3; i <= val + 10; ++i) {
    path = bptree_.LEQ(i);
    ASSERT_FALSE(path.Empty()) << i;
  }
  path = bptree_.LEQ(2);
  ASSERT_TRUE(path.Empty());
}

#if 0
TEST_F(BPTreeSetTest, MemoryUsage) {
  zskiplist* zsl = zslCreate();
  std::vector<sds> sds_vec;

  constexpr size_t kLength = 3000;
  for (size_t i = 0; i < kLength; ++i) {
    sds_vec.push_back(sdsnew("f"));
  }
  size_t sz_before = zmalloc_used_memory_tl;
  LOG(INFO) << "zskiplist before: " << sz_before << " bytes";

  for (size_t i = 0; i < sds_vec.size(); ++i) {
    zslInsert(zsl, i, sds_vec[i]);
  }
  LOG(INFO) << "zskiplist takes: " << double(zmalloc_used_memory_tl - sz_before) / sds_vec.size()
            << " bytes per entry";
  zslFree(zsl);

  sds_vec.clear();
  for (size_t i = 0; i < kLength; ++i) {
    sds_vec.push_back(sdsnew("f"));
  }

  MiMemoryResource mi_alloc(mi_heap_get_backing());
  using AllocType = PMR_NS::polymorphic_allocator<std::pair<double, sds>>;
  AllocType alloc(&mi_alloc);
  absl::btree_set<pair<double, sds>, std::greater<pair<double, sds>>, AllocType> btree(alloc);

  ASSERT_EQ(0, mi_alloc.used());
  for (size_t i = 0; i < sds_vec.size(); ++i) {
    btree.emplace(i, sds_vec[i]);
  }
  ASSERT_GT(mi_alloc.used(), 0u);
  LOG(INFO) << "abseil btree: " << double(mi_alloc.used()) / sds_vec.size() << " bytes per entry";
  btree.clear();

  ASSERT_EQ(0, mi_alloc.used());
  SDSTree df_tree(&mi_alloc);
  for (size_t i = 0; i < sds_vec.size(); ++i) {
    btree.emplace(i, sds_vec[i]);
    VLOG(1) << "df btree: " << i << " " << double(mi_alloc.used()) / btree.size()
            << " bytes per entry";
  }
  ASSERT_GT(mi_alloc.used(), 0u);
  LOG(INFO) << "df btree: " << double(mi_alloc.used()) / sds_vec.size() << " bytes per entry";
}
#endif

TEST_F(BPTreeSetTest, InsertSDS) {
  vector<ZsetPolicy::KeyT> vals;
  for (unsigned i = 0; i < 256; ++i) {
    sds s = sdsempty();

    s = sdscatfmt(s, "a%u", i);
    vals.emplace_back(ZsetPolicy::KeyT{.d = 1000, .s = s});
  }

  SDSTree tree(&mi_alloc_);
  for (size_t i = 0; i < vals.size(); ++i) {
    ASSERT_TRUE(tree.Insert(vals[i]));
  }

  for (auto v : vals) {
    sdsfree(v.s);
  }
}

TEST_F(BPTreeSetTest, ReverseIterate) {
  vector<ZsetPolicy::KeyT> vals;
  for (int i = -1000; i < 1000; ++i) {
    sds s = sdsempty();

    s = sdscatfmt(s, "a%u", i);
    vals.emplace_back(ZsetPolicy::KeyT{.d = (double)i, .s = s});
  }

  SDSTree tree(&mi_alloc_);
  for (auto v : vals) {
    ASSERT_TRUE(tree.Insert(v));
    {
      double score = 0;
      tree.IterateReverse(0, 0, [&score](auto i) {
        score = i.d;
        return false;
      });
      EXPECT_EQ(score, v.d);
    }
    {
      double score = 0;
      tree.Iterate(0, 0, [&score](auto i) {
        score = i.d;
        return false;
      });
      EXPECT_EQ(score, vals[0].d);
    }
  }

  vector<int> res;
  tree.IterateReverse(0, 1, [&](auto i) {
    res.push_back(i.d);
    return true;
  });
  EXPECT_THAT(res, testing::ElementsAre(999, 998));

  for (auto v : vals) {
    sdsfree(v.s);
  }
}

static string RandomString(mt19937& rand, unsigned len) {
  const string_view alpanum = "1234567890abcdefghijklmnopqrstuvwxyz";
  string ret;
  ret.reserve(len);

  for (size_t i = 0; i < len; ++i) {
    ret += alpanum[rand() % alpanum.size()];
  }

  return ret;
}

std::vector<ZsetPolicy::KeyT> GenerateRandomPairs(unsigned len) {
  mt19937 dre(10);
  std::vector<ZsetPolicy::KeyT> vals(len, ZsetPolicy::KeyT{});
  for (unsigned i = 0; i < len; ++i) {
    vals[i].d = dre();
    vals[i].s = sdsnew(RandomString(dre, 10).c_str());
  }
  return vals;
}

static void BM_FindRandomBPTree(benchmark::State& state) {
  unsigned iters = state.range(0);
  std::vector<ZsetPolicy::KeyT> vals = GenerateRandomPairs(iters);
  SDSTree bptree;
  for (unsigned i = 0; i < iters; ++i) {
    bptree.Insert(vals[i]);
  }

  unsigned i = 0;
  while (state.KeepRunningBatch(10)) {
    for (unsigned j = 0; j < 10; ++j) {
      benchmark::DoNotOptimize(bptree.GEQ(vals[i]));
      ++i;
      if (vals.size() == i)
        i = 0;
    }
  }
  for (const auto v : vals) {
    sdsfree(v.s);
  }
}
BENCHMARK(BM_FindRandomBPTree)->Arg(1024)->Arg(1 << 16)->Arg(1 << 20);

#if 0
static void BM_FindRandomZSL(benchmark::State& state) {
  zskiplist* zsl = zslCreate();
  unsigned iters = state.range(0);
  std::vector<ZsetPolicy::KeyT> vals = GenerateRandomPairs(iters);
  for (unsigned i = 0; i < iters; ++i) {
    zslInsert(zsl, vals[i].d, sdsdup(vals[i].s));
  }

  zrangespec spec;
  spec.maxex = 0;
  spec.minex = 0;

  unsigned i = 0;
  while (state.KeepRunningBatch(10)) {
    for (unsigned j = 0; j < 10; ++j) {
      spec.min = vals[i].d;
      spec.max = spec.min;
      benchmark::DoNotOptimize(zslFirstInRange(zsl, &spec));

      ++i;
      if (vals.size() == i)
        i = 0;
    }
  }

  zslFree(zsl);

  for (const auto v : vals) {
    sdsfree(v.s);
  }
}
BENCHMARK(BM_FindRandomZSL)->Arg(1024)->Arg(1 << 16)->Arg(1 << 20);
#endif

void RegisterBPTreeBench() {
  auto* tlh = mi_heap_get_backing();
  init_zmalloc_threadlocal(tlh);
};

REGISTER_MODULE_INITIALIZER(Bptree, RegisterBPTreeBench());

TEST_F(BPTreeSetTest, ForceUpdate) {
  struct Policy {
    // Similar to how it's used in SortedMap just a little simpler.
    using KeyT = int*;

    struct KeyCompareTo {
      int operator()(KeyT a, KeyT b) const {
        if (*a < *b)
          return -1;
        if (*a > *b)
          return 1;
        return 0;
      }
    };
  };

  auto gen_vector = []() {
    std::vector<std::unique_ptr<int>> tmp;
    for (size_t i = 0; i < 1000; ++i) {
      tmp.push_back(std::make_unique<int>(i));
    }
    return tmp;
  };

  std::vector<std::unique_ptr<int>> original = gen_vector();
  std::vector<std::unique_ptr<int>> modified = gen_vector();

  BPTree<int*, Policy> bptree;
  for (auto& item : original) {
    bptree.Insert(item.get());
  }

  for (auto& item : modified) {
    bptree.ForceUpdate(item.get(), item.get());
  }

  original.clear();
  size_t index = 0;
  bptree.Iterate(0, 1000, [&](int* ptr) {
    EXPECT_EQ(modified[index].get(), ptr);
    ++index;
    return true;
  });
}

}  // namespace dfly


================================================
FILE: src/core/cms.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/cms.h"

#include <xxhash.h>

#include <algorithm>
#include <cmath>
#include <limits>

#include "base/logging.h"

namespace dfly {
namespace {

uint32_t Offset(uint64_t h1, uint64_t h2, uint32_t row, uint32_t width) {
  uint32_t idx = static_cast<uint32_t>((h1 + (row * h2)) % width);
  return row * width + idx;
}

}  // namespace

CMS::CMS(uint32_t width, uint32_t depth, PMR_NS::memory_resource* mr)
    : width_(width), depth_(depth), mr_(mr) {
  size_t len = NumCounters();
  counters_ = static_cast<int64_t*>(mr_->allocate(len * sizeof(int64_t), alignof(int64_t)));
  std::fill_n(counters_, len, 0);
}

CMS::~CMS() {
  if (counters_) {
    mr_->deallocate(counters_, NumCounters() * sizeof(int64_t), alignof(int64_t));
  }
}

CMS::CMS(CMS&& other) noexcept
    : width_(other.width_),
      depth_(other.depth_),
      mr_(other.mr_),
      count_(other.count_),
      counters_(other.counters_) {
  other.width_ = 0;
  other.depth_ = 0;
  other.count_ = 0;
  other.counters_ = nullptr;
}

CMS& CMS::operator=(CMS&& other) noexcept {
  if (this != &other) {
    if (counters_) {
      mr_->deallocate(counters_, NumCounters() * sizeof(int64_t), alignof(int64_t));
    }
    width_ = other.width_;
    depth_ = other.depth_;
    mr_ = other.mr_;
    count_ = other.count_;
    counters_ = other.counters_;
    other.width_ = 0;
    other.depth_ = 0;
    other.count_ = 0;
    other.counters_ = nullptr;
  }
  return *this;
}

CMS::CMS(ErrorRateTag /*tag*/, double error, double probability, PMR_NS::memory_resource* mr)
    : CMS(static_cast<uint32_t>(std::ceil(M_E / error)),
          static_cast<uint32_t>(std::ceil(std::log(1.0 / probability))), mr) {
}

int64_t CMS::IncrBy(std::string_view item, int64_t increment) {
  count_ += increment;

  int64_t min_count = std::numeric_limits<int64_t>::max();
  XXH128_hash_t hash = XXH3_128bits(item.data(), item.size());
  uint64_t h1 = hash.low64;
  uint64_t h2 = hash.high64;

  for (uint32_t row = 0; row < depth_; ++row) {
    uint32_t offset = Offset(h1, h2, row, width_);
    counters_[offset] += increment;
    min_count = std::min(min_count, counters_[offset]);
  }

  return min_count;
}

int64_t CMS::Query(std::string_view item) const {
  XXH128_hash_t hash = XXH3_128bits(item.data(), item.size());
  uint64_t h1 = hash.low64;
  uint64_t h2 = hash.high64;

  int64_t min_count = std::numeric_limits<int64_t>::max();
  for (uint32_t row = 0; row < depth_; ++row) {
    uint32_t offset = Offset(h1, h2, row, width_);
    min_count = std::min(min_count, counters_[offset]);
  }

  return min_count;
}

bool CMS::MergeFrom(const CMS& other, int64_t weight) {
  if (width_ != other.width_ || depth_ != other.depth_) {
    return false;
  }

  for (size_t i = 0; i < NumCounters(); ++i) {
    counters_[i] += other.counters_[i] * weight;
  }

  count_ += other.count_ * weight;
  return true;
}

void CMS::Reset() {
  std::fill_n(counters_, NumCounters(), 0);
  count_ = 0;
}

void CMS::Load(int64_t total_incr_count, const int64_t* data) {
  count_ = total_incr_count;
  std::copy_n(data, NumCounters(), counters_);
}

}  // namespace dfly


================================================
FILE: src/core/cms.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <string_view>

#include "base/pmr/memory_resource.h"

namespace dfly {

/// Count-Min Sketch implementation compatible with Redis CMS commands.
class CMS {
 public:
  // Create a CMS with given width and depth dimensions.
  // width: number of counters per row
  // depth: number of rows (hash functions)
  CMS(uint32_t width, uint32_t depth, PMR_NS::memory_resource* mr);

  CMS(const CMS&) = delete;
  CMS& operator=(const CMS&) = delete;

  CMS(CMS&& other) noexcept;
  CMS& operator=(CMS&& other) noexcept;

  ~CMS();

  // Tag type to disambiguate CMS construction by error rate and probability.
  struct ErrorRateTag {};

  // Create a CMS from error rate and probability parameters.
  // error: relative error (e.g. 0.01 for 1%), must be in (0, 1).
  // probability: probability of exceeding the error, must be in (0, 1).
  // width = ceil(e / error), depth = ceil(ln(1 / probability)).
  CMS(ErrorRateTag, double error, double probability, PMR_NS::memory_resource* mr);

  // Increment the count for an item by the given value.
  // Returns the new estimated count for the item.
  int64_t IncrBy(std::string_view item, int64_t increment);

  // Query the estimated count for an item.
  int64_t Query(std::string_view item) const;

  // Merge another CMS into this one with the given weight.
  // The other CMS must have the same dimensions.
  // Returns false if dimensions don't match.
  bool MergeFrom(const CMS& other, int64_t weight = 1);

  // Reset all counters and total count to zero.
  void Reset();

  // Load serialized counter state. data must have exactly NumCounters() elements.
  void Load(int64_t total_incr_count, const int64_t* data);

  // Accessors for CMS properties
  uint32_t width() const {
    return width_;
  }

  uint32_t depth() const {
    return depth_;
  }

  // Total count of all IncrBy operations (used by CMS.INFO).
  int64_t total_count() const {
    return count_;
  }

  // Memory usage in bytes
  size_t MallocUsed() const {
    return NumCounters() * sizeof(int64_t);
  }

  size_t NumCounters() const {
    return static_cast<size_t>(width_) * depth_;
  }

  const int64_t* Data() const {
    return counters_;
  }

 private:
  uint32_t width_;
  uint32_t depth_;
  PMR_NS::memory_resource* mr_ = nullptr;
  int64_t count_ = 0;  // Total count of all IncrBy operations
  int64_t* counters_ = nullptr;
};

}  // namespace dfly


================================================
FILE: src/core/cms_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/cms.h"

#include <absl/strings/str_cat.h>

#include <cmath>

#include "base/gtest.h"

namespace dfly {

using namespace std;

class CMSTest : public ::testing::Test {
 protected:
  CMSTest() : cms_(CMS(1000, 5, PMR_NS::get_default_resource())) {
  }

  CMS cms_;
};

// A freshly created CMS must return 0 for any item.
TEST_F(CMSTest, InitialCountIsZero) {
  EXPECT_EQ(cms_.Query("nonexistent"), 0);
  EXPECT_EQ(cms_.Query(""), 0);
  EXPECT_EQ(cms_.Query("anything"), 0);
}

// Use width=1 so every item maps to column 0, exercising all counters.
// This catches initialization bugs (e.g. counters not zeroed).
TEST(CMSBasic, InitialCountIsZeroSmall) {
  CMS cms(1, 1, PMR_NS::get_default_resource());
  EXPECT_EQ(cms.Query("x"), 0);
  EXPECT_EQ(cms.Query("y"), 0);
}

TEST(CMSBasic, IncrBySmall) {
  CMS cms(1, 1, PMR_NS::get_default_resource());
  EXPECT_EQ(cms.IncrBy("a", 3), 3);
  // width=1 means all items collide; "b" should also return 3.
  EXPECT_EQ(cms.Query("b"), 3);
}

// Inspired by fakeredis test_cms_create: initbyprob computes correct dimensions.
TEST(CMSBasic, InitByProb) {
  CMS cms(CMS::ErrorRateTag{}, 0.01, 0.01, PMR_NS::get_default_resource());

  // width = ceil(e / 0.01) = ceil(271.8..) = 272
  EXPECT_EQ(cms.width(), static_cast<uint32_t>(std::ceil(M_E / 0.01)));
  // depth = ceil(ln(1/0.01)) = ceil(4.605..) = 5
  EXPECT_EQ(cms.depth(), static_cast<uint32_t>(std::ceil(std::log(100.0))));
  EXPECT_EQ(cms.Query("anything"), 0);
}

// Inspired by fakeredis test_cms_incrby: multiple items, incremental updates.
TEST_F(CMSTest, IncrByMultipleItems) {
  EXPECT_EQ(cms_.IncrBy("foo", 3), 3);
  cms_.IncrBy("foo", 4);
  cms_.IncrBy("bar", 1);

  EXPECT_GE(cms_.Query("foo"), 7);
  EXPECT_GE(cms_.Query("bar"), 1);
  EXPECT_EQ(cms_.Query("noexist"), 0);
}

TEST_F(CMSTest, BasicIncrBy) {
  int64_t count = cms_.IncrBy("foo", 5);
  EXPECT_EQ(count, 5);

  count = cms_.IncrBy("foo", 3);
  EXPECT_EQ(count, 8);

  EXPECT_EQ(cms_.Query("foo"), 8);
}

TEST_F(CMSTest, QueryReturnsMinimum) {
  cms_.IncrBy("a", 10);
  cms_.IncrBy("b", 20);

  // CMS can overestimate, but never underestimate.
  EXPECT_GE(cms_.Query("a"), 10);
  EXPECT_GE(cms_.Query("b"), 20);
}

TEST_F(CMSTest, NeverUnderestimates) {
  for (int i = 0; i < 500; ++i) {
    string key = absl::StrCat("item", i);
    cms_.IncrBy(key, i + 1);
  }

  for (int i = 0; i < 500; ++i) {
    string key = absl::StrCat("item", i);
    EXPECT_GE(cms_.Query(key), i + 1) << "Underestimate for " << key;
  }
}

TEST_F(CMSTest, UnseenItemIsZero) {
  cms_.IncrBy("known", 100);
  // With width=1000 and depth=5 and only one item inserted, collisions are unlikely.
  EXPECT_LE(cms_.Query("unknown"), 5);
}

TEST_F(CMSTest, Dimensions) {
  EXPECT_EQ(cms_.width(), 1000u);
  EXPECT_EQ(cms_.depth(), 5u);
}

TEST_F(CMSTest, MallocUsed) {
  EXPECT_EQ(cms_.MallocUsed(), 1000u * 5 * sizeof(int64_t));
}

// Inspired by fakeredis test_cms_merge: basic merge of two sketches.
TEST_F(CMSTest, MergeFrom) {
  CMS other(1000, 5, PMR_NS::get_default_resource());
  cms_.IncrBy("foo", 3);
  other.IncrBy("foo", 4);
  other.IncrBy("bar", 1);

  EXPECT_TRUE(cms_.MergeFrom(other));
  EXPECT_GE(cms_.Query("foo"), 7);
  EXPECT_GE(cms_.Query("bar"), 1);
}

TEST_F(CMSTest, MergeFromWithWeight) {
  CMS other(1000, 5, PMR_NS::get_default_resource());
  other.IncrBy("x", 5);

  cms_.IncrBy("x", 10);
  EXPECT_TRUE(cms_.MergeFrom(other, 3));
  // 10 + 5*3 = 25
  EXPECT_GE(cms_.Query("x"), 25);
}

TEST_F(CMSTest, MergeDimensionMismatch) {
  CMS other(500, 5, PMR_NS::get_default_resource());
  EXPECT_FALSE(cms_.MergeFrom(other));

  CMS other2(1000, 3, PMR_NS::get_default_resource());
  EXPECT_FALSE(cms_.MergeFrom(other2));
}

// Inspired by fakeredis test_cms_info: merge multiple sources with weights, verify counts.
// Mirrors the exact sequence: C=A+B, C+=A*1+B*2, C+=A*2+B*3, then check info.count.
TEST(CMSBasic, MergeMultipleWithWeights) {
  auto* mr = PMR_NS::get_default_resource();
  CMS a(1000, 5, mr);
  CMS b(1000, 5, mr);
  CMS c(1000, 5, mr);

  a.IncrBy("foo", 5);
  a.IncrBy("bar", 3);
  a.IncrBy("baz", 9);

  b.IncrBy("foo", 2);
  b.IncrBy("bar", 3);
  b.IncrBy("baz", 1);

  EXPECT_EQ(a.Query("foo"), 5);
  EXPECT_EQ(a.Query("bar"), 3);
  EXPECT_EQ(a.Query("baz"), 9);
  EXPECT_EQ(b.Query("foo"), 2);
  EXPECT_EQ(b.Query("bar"), 3);
  EXPECT_EQ(b.Query("baz"), 1);

  // C = A*1 + B*1
  EXPECT_TRUE(c.MergeFrom(a));
  EXPECT_TRUE(c.MergeFrom(b));
  EXPECT_EQ(c.Query("foo"), 7);
  EXPECT_EQ(c.Query("bar"), 6);
  EXPECT_EQ(c.Query("baz"), 10);

  // C += A*1 + B*2
  EXPECT_TRUE(c.MergeFrom(a, 1));
  EXPECT_TRUE(c.MergeFrom(b, 2));
  EXPECT_EQ(c.Query("foo"), 16);
  EXPECT_EQ(c.Query("bar"), 15);
  EXPECT_EQ(c.Query("baz"), 21);

  // C += A*2 + B*3
  EXPECT_TRUE(c.MergeFrom(a, 2));
  EXPECT_TRUE(c.MergeFrom(b, 3));
  EXPECT_EQ(c.Query("foo"), 32);
  EXPECT_EQ(c.Query("bar"), 30);
  EXPECT_EQ(c.Query("baz"), 42);
}

// Inspired by fakeredis test_cms_info: verify count tracks total of all IncrBy operations.
TEST(CMSBasic, CountTracking) {
  auto* mr = PMR_NS::get_default_resource();
  CMS a(1000, 5, mr);

  EXPECT_EQ(a.total_count(), 0);

  a.IncrBy("foo", 5);
  a.IncrBy("bar", 3);
  a.IncrBy("baz", 9);
  // total_count = 5 + 3 + 9 = 17 (matches fakeredis test_cms_info assertion)
  EXPECT_EQ(a.total_count(), 17);
}

// Inspired by fakeredis test_cms_info: count is updated by MergeFrom.
TEST(CMSBasic, CountAfterMerge) {
  auto* mr = PMR_NS::get_default_resource();
  CMS a(1000, 5, mr);
  CMS b(1000, 5, mr);
  CMS c(1000, 5, mr);

  a.IncrBy("foo", 5);
  a.IncrBy("bar", 3);
  a.IncrBy("baz", 9);
  EXPECT_EQ(a.total_count(), 17);

  b.IncrBy("foo", 2);
  b.IncrBy("bar", 3);
  b.IncrBy("baz", 1);
  EXPECT_EQ(b.total_count(), 6);

  // C = A + B -> total_count = 17 + 6 = 23
  c.MergeFrom(a);
  c.MergeFrom(b);
  EXPECT_EQ(c.total_count(), 23);

  // C += A*1 + B*2 -> total_count = 23 + 17*1 + 6*2 = 52
  // (matches fakeredis test_cms_merge_fail assertion: count == 52)
  c.MergeFrom(a, 1);
  c.MergeFrom(b, 2);
  EXPECT_EQ(c.total_count(), 52);
}

TEST_F(CMSTest, MoveConstruct) {
  cms_.IncrBy("foo", 42);
  CMS moved(std::move(cms_));

  EXPECT_EQ(moved.Query("foo"), 42);
  EXPECT_EQ(moved.width(), 1000u);
  EXPECT_EQ(moved.depth(), 5u);
}

TEST_F(CMSTest, MoveAssign) {
  cms_.IncrBy("foo", 42);
  CMS other(500, 3, PMR_NS::get_default_resource());
  other = std::move(cms_);

  EXPECT_EQ(other.Query("foo"), 42);
  EXPECT_EQ(other.width(), 1000u);
  EXPECT_EQ(other.depth(), 5u);
}

}  // namespace dfly


================================================
FILE: src/core/collection_entry.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/strings/str_cat.h>

#include <cstddef>
#include <string>
#include <string_view>

namespace dfly {

// Stores either:
// - A single long long value (longval) when value = nullptr
// - A single char* (value) when value != nullptr
struct CollectionEntry {
  CollectionEntry(const char* value, size_t length) : value_{value}, length_{length} {
  }
  explicit CollectionEntry(long long longval) : value_{nullptr}, longval_{longval} {
  }

  CollectionEntry(const CollectionEntry&) = default;
  CollectionEntry& operator=(const CollectionEntry&) = default;

  std::string ToString() const {
    if (value_)
      return {value_, length_};
    else
      return absl::StrCat(longval_);
  }

  bool IsString() const {
    return value_ != nullptr;
  }

  bool is_int() const {
    return value_ == nullptr;
  }

  const char* data() const {
    return value_;
  }

  size_t size() const {
    return length_;
  }

  long long as_long() const {
    return longval_;
  }

  // Assumes value is not null.
  std::string_view view() const {
    return {value_, length_};
  }

  // compatibility method
  std::string to_string() const {
    return ToString();
  }

  // compatibility method
  long long ival() const {
    return longval_;
  }

  bool operator==(std::string_view sv) const;
  friend bool operator==(std::string_view sv, const CollectionEntry& entry) {
    return entry == sv;
  }

 private:
  const char* value_;
  union {
    size_t length_;
    long long longval_;
  };
};

inline bool CollectionEntry::operator==(std::string_view sv) const {
  if (value_ == nullptr) {
    char buf[absl::numbers_internal::kFastToBufferSize];
    char* end = absl::numbers_internal::FastIntToBuffer(longval_, buf);
    return sv == std::string_view(buf, end - buf);
  }
  return view() == sv;
}

}  // namespace dfly


================================================
FILE: src/core/compact_object.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/compact_object.h"

// #define XXH_INLINE_ALL
#include <xxhash.h>

#include <array>

extern "C" {
#include "redis/intset.h"
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/sds.h"
#include "redis/stream.h"
#include "redis/util.h"
#include "redis/zmalloc.h"  // for non-string objects.
}
#include <absl/strings/str_cat.h>
#include <absl/strings/strip.h>

#include "base/flags.h"
#include "base/logging.h"
#include "base/pod_array.h"
#include "core/bloom.h"
#include "core/cms.h"
#include "core/detail/bitpacking.h"
#include "core/huff_coder.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/qlist.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "core/tiering_types.h"
#include "core/topk.h"

ABSL_FLAG(bool, experimental_flat_json, false, "If true uses flat json implementation.");
ABSL_FLAG(bool, disable_json_defragmentation, false, "If true disable json object defragmentation");

namespace dfly {
using namespace std;
using detail::ascii_len;
using detail::binpacked_len;
using MemoryResource = detail::RobjWrapper::MemoryResource;

namespace {

constexpr XXH64_hash_t kHashSeed = 24061983;
constexpr size_t kAlignSize = 8u;

size_t UpdateSize(size_t size, int64_t update) {
  int64_t result = static_cast<int64_t>(size) + update;
  if (result < 0) {
    DCHECK(false) << "Can't decrease " << size << " from " << -update;
    LOG_EVERY_T(ERROR, 30) << "Can't decrease " << size << " from " << -update;
  }
  return result;
}

inline void FreeObjSet(unsigned encoding, void* ptr, MemoryResource* mr) {
  switch (encoding) {
    case kEncodingStrMap2: {
      CompactObj::DeleteMR<StringSet>(ptr);
      break;
    }

    case kEncodingIntSet:
      zfree((void*)ptr);
      break;
    default:
      LOG(FATAL) << "Unknown set encoding type";
  }
}

void FreeList(unsigned encoding, void* ptr, MemoryResource* mr) {
  if (encoding == kEncodingListPack) {
    lpFree((uint8_t*)ptr);
    return;
  }
  CHECK_EQ(encoding, kEncodingQL2);
  CompactObj::DeleteMR<QList>(ptr);
}

size_t MallocUsedSet(unsigned encoding, void* ptr) {
  switch (encoding) {
    case kEncodingStrMap2: {
      StringSet* ss = (StringSet*)ptr;
      return ss->ObjMallocUsed() + ss->SetMallocUsed() + zmalloc_usable_size(ptr);
    }
    case kEncodingIntSet:
      return intsetBlobLen((intset*)ptr);
  }

  LOG(DFATAL) << "Unknown set encoding type " << encoding;
  return 0;
}

size_t MallocUsedHSet(unsigned encoding, void* ptr) {
  switch (encoding) {
    case kEncodingListPack:
      return zmalloc_usable_size(reinterpret_cast<uint8_t*>(ptr));
    case kEncodingStrMap2: {
      StringMap* sm = (StringMap*)ptr;
      return sm->ObjMallocUsed() + sm->SetMallocUsed() + zmalloc_usable_size(ptr);
    }
  }
  LOG(DFATAL) << "Unknown set encoding type " << encoding;
  return 0;
}

size_t MallocUsedZSet(unsigned encoding, void* ptr) {
  switch (encoding) {
    case OBJ_ENCODING_LISTPACK:
      return zmalloc_usable_size(reinterpret_cast<uint8_t*>(ptr));
    case OBJ_ENCODING_SKIPLIST: {
      detail::SortedMap* ss = (detail::SortedMap*)ptr;
      return ss->MallocSize() + zmalloc_usable_size(ptr);  // DictMallocSize(zs->dict);
    }
  }
  LOG(DFATAL) << "Unknown set encoding type " << encoding;
  return 0;
}

/* This is a helper function with the goal of estimating the memory
 * size of a radix tree that is used to store Stream IDs.
 *
 * Note: to guess the size of the radix tree is not trivial, so we
 * approximate it considering 16 bytes of data overhead for each
 * key (the ID), and then adding the number of bare nodes, plus some
 * overhead due by the data and child pointers. This secret recipe
 * was obtained by checking the average radix tree created by real
 * workloads, and then adjusting the constants to get numbers that
 * more or less match the real memory usage.
 *
 * Actually the number of nodes and keys may be different depending
 * on the insertion speed and thus the ability of the radix tree
 * to compress prefixes. */
size_t streamRadixTreeMemoryUsage(rax* rax) {
  size_t size = sizeof(*rax);
  size = rax->numele * sizeof(streamID);
  size += rax->numnodes * sizeof(raxNode);
  /* Add a fixed overhead due to the aux data pointer, children, ... */
  size += rax->numnodes * sizeof(long) * 30;
  return size;
}

size_t MallocUsedStream(stream* s) {
  size_t asize = sizeof(*s);
  asize += streamRadixTreeMemoryUsage(s->rax);

  /* Now we have to add the listpacks. The last listpack is often non
   * complete, so we estimate the size of the first N listpacks, and
   * use the average to compute the size of the first N-1 listpacks, and
   * finally add the real size of the last node. */
  raxIterator ri;
  raxStart(&ri, s->rax);
  raxSeek(&ri, "^", NULL, 0);
  size_t lpsize = 0, samples = 0;
  while (raxNext(&ri)) {
    uint8_t* lp = (uint8_t*)ri.data;
    /* Use the allocated size, since we overprovision the node initially. */
    lpsize += zmalloc_size(lp);
    samples++;
  }
  if (s->rax->numele <= samples) {
    asize += lpsize;
  } else {
    if (samples)
      lpsize /= samples; /* Compute the average. */
    asize += lpsize * (s->rax->numele - 1);
    /* No need to check if seek succeeded, we enter this branch only
     * if there are a few elements in the radix tree. */
    raxSeek(&ri, "$", NULL, 0);
    raxNext(&ri);
    /* Use the allocated size, since we overprovision the node initially. */
    asize += zmalloc_size(ri.data);
  }
  raxStop(&ri);

  /* Consumer groups also have a non trivial memory overhead if there
   * are many consumers and many groups, let's count at least the
   * overhead of the pending entries in the groups and consumers
   * PELs. */
  if (s->cgroups) {
    raxStart(&ri, s->cgroups);
    raxSeek(&ri, "^", NULL, 0);
    while (raxNext(&ri)) {
      streamCG* cg = (streamCG*)ri.data;
      asize += sizeof(*cg);
      asize += streamRadixTreeMemoryUsage(cg->pel);
      asize += sizeof(streamNACK) * raxSize(cg->pel);

      /* For each consumer we also need to add the basic data
       * structures and the PEL memory usage. */
      raxIterator cri;
      raxStart(&cri, cg->consumers);
      raxSeek(&cri, "^", NULL, 0);
      while (raxNext(&cri)) {
        const streamConsumer* consumer = (const streamConsumer*)cri.data;
        asize += sizeof(*consumer);
        asize += sdslen(consumer->name);
        asize += streamRadixTreeMemoryUsage(consumer->pel);
        /* Don't count NACKs again, they are shared with the
         * consumer group PEL. */
      }
      raxStop(&cri);
    }
    raxStop(&ri);
  }
  return asize;
}

inline void FreeObjHash(unsigned encoding, void* ptr) {
  switch (encoding) {
    case kEncodingStrMap2:
      CompactObj::DeleteMR<StringMap>(ptr);
      break;
    case kEncodingListPack:
      lpFree((uint8_t*)ptr);
      break;
    default:
      LOG(FATAL) << "Unknown hset encoding type " << encoding;
  }
}

inline void FreeObjZset(unsigned encoding, void* ptr) {
  switch (encoding) {
    case OBJ_ENCODING_SKIPLIST:
      CompactObj::DeleteMR<detail::SortedMap>(ptr);
      break;
    case OBJ_ENCODING_LISTPACK:
      zfree(ptr);
      break;
    default:
      LOG(FATAL) << "Unknown sorted set encoding" << encoding;
  }
}

pair<void*, bool> DefragStrMap2(StringMap* sm, PageUsage* page_usage) {
  bool realloced = false;

  for (auto it = sm->begin(); it != sm->end(); ++it)
    realloced |= it.ReallocIfNeeded(page_usage);

  return {sm, realloced};
}

pair<void*, bool> DefragListPack(uint8_t* lp, PageUsage* page_usage) {
  if (!page_usage->IsPageForObjectUnderUtilized(lp))
    return {lp, false};

  size_t lp_bytes = lpBytes(lp);
  uint8_t* replacement = lpNew(lpBytes(lp));
  memcpy(replacement, lp, lp_bytes);
  lpFree(lp);

  return {replacement, true};
}

pair<void*, bool> DefragIntSet(intset* is, PageUsage* page_usage) {
  if (!page_usage->IsPageForObjectUnderUtilized(is))
    return {is, false};

  const size_t blob_len = intsetBlobLen(is);
  intset* replacement = (intset*)zmalloc(blob_len);
  memcpy(replacement, is, blob_len);

  zfree(is);
  return {replacement, true};
}

pair<void*, bool> DefragSortedMap(detail::SortedMap* sm, PageUsage* page_usage) {
  const bool reallocated = sm->DefragIfNeeded(page_usage);
  return {sm, reallocated};
}

pair<void*, bool> DefragStrSet(StringSet* ss, PageUsage* page_usage) {
  bool realloced = false;

  for (auto it = ss->begin(); it != ss->end(); ++it)
    realloced |= it.ReallocIfNeeded(page_usage);

  return {ss, realloced};
}

// Iterates over allocations of internal hash data structures and re-allocates
// them if their pages are underutilized.
// Returns pointer to new object ptr and whether any re-allocations happened.
pair<void*, bool> DefragHash(unsigned encoding, void* ptr, PageUsage* page_usage) {
  switch (encoding) {
    // Listpack is stored as a single contiguous array
    case kEncodingListPack: {
      return DefragListPack((uint8_t*)ptr, page_usage);
    }

    // StringMap supports re-allocation of it's internal nodes
    case kEncodingStrMap2: {
      return DefragStrMap2((StringMap*)ptr, page_usage);
    }

    default:
      ABSL_UNREACHABLE();
  }
}

pair<void*, bool> DefragSet(unsigned encoding, void* ptr, PageUsage* page_usage) {
  switch (encoding) {
    // Int sets have flat storage
    case kEncodingIntSet: {
      return DefragIntSet((intset*)ptr, page_usage);
    }

    case kEncodingStrMap2: {
      return DefragStrSet((StringSet*)ptr, page_usage);
    }

    default:
      ABSL_UNREACHABLE();
  }
}

pair<void*, bool> DefragZSet(unsigned encoding, void* ptr, PageUsage* page_usage) {
  switch (encoding) {
    // Listpack is stored as a single contiguous array
    case OBJ_ENCODING_LISTPACK: {
      return DefragListPack((uint8_t*)ptr, page_usage);
    }

    // SKIPLIST really means ScoreMap
    case OBJ_ENCODING_SKIPLIST: {
      return DefragSortedMap((detail::SortedMap*)ptr, page_usage);
    }

    default:
      ABSL_UNREACHABLE();
  }
}

pair<void*, bool> DefragList(unsigned encoding, void* ptr, PageUsage* page_usage) {
  if (encoding == kEncodingListPack) {
    return DefragListPack((uint8_t*)ptr, page_usage);
  }
  auto* qlist_ptr = static_cast<QList*>(ptr);
  bool reallocated = qlist_ptr->DefragIfNeeded(page_usage);
  return {ptr, reallocated};
}

inline void FreeObjStream(void* ptr) {
  freeStream((stream*)ptr);
}

inline const uint8_t* to_byte(const void* s) {
  return reinterpret_cast<const uint8_t*>(s);
}

static_assert(binpacked_len(7) == 7);
static_assert(binpacked_len(8) == 7);
static_assert(binpacked_len(15) == 14);
static_assert(binpacked_len(16) == 14);
static_assert(binpacked_len(17) == 15);
static_assert(binpacked_len(18) == 16);
static_assert(binpacked_len(19) == 17);
static_assert(binpacked_len(20) == 18);
static_assert(ascii_len(14) == 16);
static_assert(ascii_len(15) == 17);
static_assert(ascii_len(16) == 18);
static_assert(ascii_len(17) == 19);

struct Huffman {
  HuffmanEncoder encoder;
  HuffmanDecoder decoder;
};

struct TL {
  MemoryResource* local_mr = PMR_NS::get_default_resource();
  base::PODArray<uint8_t> tmp_buf;
  string tmp_str;
  size_t small_str_bytes;
  Huffman huff_keys, huff_string_values;
  uint64_t huff_encode_total = 0, huff_encode_success = 0;  // success/total metrics.

  const HuffmanDecoder& GetHuffmanDecoder(uint8_t huffman_domain) const {
    return huffman_domain == CompactObj::HUFF_KEYS ? huff_keys.decoder : huff_string_values.decoder;
  }
};

thread_local TL tl;

constexpr bool kUseAsciiEncoding = true;

}  // namespace

static_assert(sizeof(CompactObj) == 18);

namespace detail {

size_t RobjWrapper::MallocUsed(bool slow) const {
  if (!inner_obj_)
    return 0;

  switch (type_) {
    case OBJ_STRING:
      CHECK_EQ(OBJ_ENCODING_RAW, encoding_);
      return InnerObjMallocUsed();
    case OBJ_LIST:
      if (encoding_ == kEncodingListPack) {
        return zmalloc_usable_size(inner_obj_);
      }
      return ((QList*)inner_obj_)->MallocUsed(slow);
    case OBJ_SET:
      return MallocUsedSet(encoding_, inner_obj_);
    case OBJ_HASH:
      return MallocUsedHSet(encoding_, inner_obj_);
    case OBJ_ZSET:
      return MallocUsedZSet(encoding_, inner_obj_);
    case OBJ_STREAM:
      return slow ? MallocUsedStream((stream*)inner_obj_) : sz_;

    default:
      LOG(FATAL) << "Not supported " << type_;
  }

  return 0;
}

size_t RobjWrapper::Size() const {
  switch (type_) {
    case OBJ_STRING:
      DCHECK_EQ(OBJ_ENCODING_RAW, encoding_);
      return sz_;
    case OBJ_LIST:
      if (encoding_ == kEncodingListPack) {
        return lpLength((uint8_t*)inner_obj_);
      }
      return ((QList*)inner_obj_)->Size();
    case OBJ_ZSET: {
      switch (encoding_) {
        case OBJ_ENCODING_SKIPLIST: {
          SortedMap* ss = (SortedMap*)inner_obj_;
          return ss->Size();
        }
        case OBJ_ENCODING_LISTPACK:
          return lpLength((uint8_t*)inner_obj_) / 2;
        default:
          LOG(FATAL) << "Unknown sorted set encoding" << encoding_;
      }
    }
    case OBJ_SET:
      switch (encoding_) {
        case kEncodingIntSet: {
          intset* is = (intset*)inner_obj_;
          return intsetLen(is);
        }
        case kEncodingStrMap2: {
          StringSet* ss = (StringSet*)inner_obj_;
          return ss->UpperBoundSize();
        }
        default:
          LOG(FATAL) << "Unexpected encoding " << encoding_;
      };
    case OBJ_HASH:
      switch (encoding_) {
        case kEncodingListPack: {
          uint8_t* lp = (uint8_t*)inner_obj_;
          return lpLength(lp) / 2;
        } break;

        case kEncodingStrMap2: {
          StringMap* sm = (StringMap*)inner_obj_;
          return sm->UpperBoundSize();
        }
        default:
          LOG(FATAL) << "Unexpected encoding " << encoding_;
      }
    case OBJ_STREAM:
      // Size mean malloc bytes for streams
      return sz_;
    default:;
  }
  return 0;
}

void RobjWrapper::Free(MemoryResource* mr) {
  if (!inner_obj_)
    return;
  DVLOG(1) << "RobjWrapper::Free " << inner_obj_;

  switch (type_) {
    case OBJ_STRING:
      DVLOG(2) << "Freeing string object";
      DCHECK_EQ(OBJ_ENCODING_RAW, encoding_);
      mr->deallocate(inner_obj_, 0, 8);  // we do not keep the allocated size.
      break;
    case OBJ_LIST:
      FreeList(encoding_, inner_obj_, mr);
      break;
    case OBJ_SET:
      FreeObjSet(encoding_, inner_obj_, mr);
      break;
    case OBJ_ZSET:
      FreeObjZset(encoding_, inner_obj_);
      break;
    case OBJ_HASH:
      FreeObjHash(encoding_, inner_obj_);
      break;
    case OBJ_MODULE:
      LOG(FATAL) << "Unsupported OBJ_MODULE type";
      break;
    case OBJ_STREAM:
      FreeObjStream(inner_obj_);
      break;
    default:
      LOG(FATAL) << "Unknown object type";
      break;
  }
  Set(nullptr, 0);
}

uint64_t RobjWrapper::HashCode() const {
  switch (type_) {
    case OBJ_STRING:
      DCHECK_EQ(OBJ_ENCODING_RAW, encoding());
      {
        auto str = AsView();
        return XXH3_64bits_withSeed(str.data(), str.size(), kHashSeed);
      }
      break;
    default:
      LOG(FATAL) << "Unsupported type for hashcode " << type_;
  }
  return 0;
}

bool RobjWrapper::Equal(const RobjWrapper& ow) const {
  if (ow.type_ != type_ || ow.encoding_ != encoding_)
    return false;

  if (type_ == OBJ_STRING) {
    DCHECK_EQ(OBJ_ENCODING_RAW, encoding());
    return AsView() == ow.AsView();
  }
  LOG(FATAL) << "Unsupported type " << type_;
  return false;
}

bool RobjWrapper::Equal(string_view sv) const {
  if (type() != OBJ_STRING)
    return false;

  DCHECK_EQ(OBJ_ENCODING_RAW, encoding());
  return AsView() == sv;
}

void RobjWrapper::SetString(string_view s, MemoryResource* mr) {
  type_ = OBJ_STRING;
  encoding_ = OBJ_ENCODING_RAW;

  if (s.size() > sz_) {
    size_t cur_cap = InnerObjMallocUsed();
    if (s.size() > cur_cap) {
      MakeInnerRoom(cur_cap, s.size(), mr);
    }
    memcpy(inner_obj_, s.data(), s.size());
    sz_ = s.size();
  }
}

void RobjWrapper::ReserveString(size_t size, MemoryResource* mr) {
  CHECK_EQ(inner_obj_, nullptr);
  type_ = OBJ_STRING;
  encoding_ = OBJ_ENCODING_RAW;
  MakeInnerRoom(0, size, mr);
}

void RobjWrapper::AppendString(string_view s, MemoryResource* mr) {
  size_t cur_cap = InnerObjMallocUsed();
  CHECK(cur_cap >= sz_ + s.size()) << cur_cap << " " << sz_ << " " << s.size();
  memcpy(reinterpret_cast<uint8_t*>(inner_obj_) + sz_, s.data(), s.size());
  sz_ += s.size();
}

void RobjWrapper::SetSize(uint64_t size) {
  sz_ = size;
}

bool RobjWrapper::DefragIfNeeded(PageUsage* page_usage) {
  auto do_defrag = [this, &page_usage](auto defrag_fun) mutable {
    auto [new_ptr, realloced] = defrag_fun(encoding_, inner_obj_, page_usage);
    inner_obj_ = new_ptr;
    return realloced;
  };

  if (type() == OBJ_STRING) {
    if (page_usage->IsPageForObjectUnderUtilized(inner_obj())) {
      ReallocateString(tl.local_mr);
      return true;
    }
  } else if (type() == OBJ_HASH) {
    return do_defrag(DefragHash);
  } else if (type() == OBJ_SET) {
    return do_defrag(DefragSet);
  } else if (type() == OBJ_ZSET) {
    return do_defrag(DefragZSet);
  } else if (type() == OBJ_LIST) {
    return do_defrag(DefragList);
  }

  page_usage->RecordNotSupported();
  return false;
}

void RobjWrapper::ReallocateString(MemoryResource* mr) {
  DCHECK_EQ(type(), OBJ_STRING);
  void* old_ptr = inner_obj_;
  inner_obj_ = mr->allocate(sz_, kAlignSize);
  memcpy(inner_obj_, old_ptr, sz_);
  mr->deallocate(old_ptr, 0, kAlignSize);
}

void RobjWrapper::Init(unsigned type, unsigned encoding, void* inner) {
  type_ = type;
  encoding_ = encoding;
  Set(inner, 0);
}

inline size_t RobjWrapper::InnerObjMallocUsed() const {
  return zmalloc_size(inner_obj_);
}

void RobjWrapper::MakeInnerRoom(size_t current_cap, size_t desired, MemoryResource* mr) {
  if (current_cap * 2 > desired) {
    if (desired < SDS_MAX_PREALLOC)
      desired *= 2;
    else
      desired += SDS_MAX_PREALLOC;
  }

  void* newp = mr->allocate(desired, kAlignSize);
  if (sz_) {
    memcpy(newp, inner_obj_, sz_);
  }

  if (current_cap) {
    mr->deallocate(inner_obj_, current_cap, kAlignSize);
  }
  inner_obj_ = newp;
}

}  // namespace detail

uint32_t JsonEnconding() {
  thread_local uint32_t json_enc =
      absl::GetFlag(FLAGS_experimental_flat_json) ? kEncodingJsonFlat : kEncodingJsonCons;
  return json_enc;
}

using namespace std;

auto CompactObj::GetStatsThreadLocal() -> Stats {
  Stats res;
  res.small_string_bytes = tl.small_str_bytes;
  res.huff_encode_total = tl.huff_encode_total;
  res.huff_encode_success = tl.huff_encode_success;
  return res;
}

void CompactObj::InitThreadLocal(MemoryResource* mr) {
  tl.local_mr = mr;
  tl.tmp_buf = base::PODArray<uint8_t>{mr};
}

bool CompactObj::InitHuffmanThreadLocal(HuffmanDomain domain, std::string_view hufftable) {
  string err_msg;

  Huffman* huffman = nullptr;
  switch (domain) {
    case HUFF_KEYS:
      huffman = &tl.huff_keys;
      break;
    case HUFF_STRING_VALUES:
      huffman = &tl.huff_string_values;
      break;
  }

  // We do not allow overriding the existing huffman table once it is set.
  if (huffman->encoder.valid()) {
    return false;
  }

  if (!huffman->encoder.Load(hufftable, &err_msg)) {
    LOG(DFATAL) << "Failed to load huffman table: " << err_msg;
    return false;
  }

  if (!huffman->decoder.Load(hufftable, &err_msg)) {
    LOG(DFATAL) << "Failed to load huffman table: " << err_msg;
    return false;
  }
  return true;
}

CompactObj::~CompactObj() {
  if (HasAllocated()) {
    Free();
  }
}

CompactObj& CompactObj::operator=(CompactObj&& o) noexcept {
  DCHECK(&o != this);
  DCHECK_EQ(is_key_, o.is_key_);

  SetMeta(o.taglen_, o.mask_);  // frees own previous resources
  encoding_ = o.encoding_;
  memcpy(&u_, &o.u_, sizeof(u_));

  o.taglen_ = 0;  // forget all data
  o.encoding_ = 0;
  o.mask_ = 0;
  return *this;
}

size_t CompactObj::Size() const {
  auto decoded_str_size = [this](size_t raw_size, uint8_t first_byte) {
    DCHECK_EQ(ObjType(), OBJ_STRING);
    return GetStrEncoding().DecodedSize(raw_size, first_byte);
  };

  if (IsInline())
    return decoded_str_size(taglen_, u_.inline_str[0]);

  switch (taglen_) {
    case SMALL_TAG:
      return decoded_str_size(u_.small_str.size(), u_.small_str.first_byte());
    case EXTERNAL_TAG:
      if (ObjType() == OBJ_STRING)
        return decoded_str_size(u_.ext_ptr.serialized_size, GetFirstByte());
      else
        return u_.ext_ptr.serialized_size;
    case ROBJ_TAG:
      if (size_t size = u_.r_obj.Size(); u_.r_obj.type() != OBJ_STRING)
        return size;
      else
        return decoded_str_size(size, *(uint8_t*)u_.r_obj.inner_obj());
    case INT_TAG:
      return absl::AlphaNum(u_.ival).size();
    case SDS_TTL_TAG:
      return decoded_str_size(sdslen(u_.sds_ttl.sds_ptr), u_.sds_ttl.sds_ptr[0]);
    case JSON_TAG:
      if (JsonEnconding() == kEncodingJsonFlat)
        return u_.json_obj.flat.json_len;
      else
        return u_.json_obj.cons.json_ptr->size();
    case SBF_TAG:
      return u_.sbf->current_size();
    case CMS_TAG:
      return 0;
    case TOPK_TAG:
      return u_.topk->Size();
    default:
      LOG(DFATAL) << "Should not reach " << int(taglen_);
      return 0;
  }
}

uint64_t CompactObj::HashCode() const {
  DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!";

  if (encoding_ == NONE_ENC) {
    if (IsInline()) {
      return XXH3_64bits_withSeed(u_.inline_str, taglen_, kHashSeed);
    }

    switch (taglen_) {
      case SMALL_TAG:
        return u_.small_str.HashCode();
      case ROBJ_TAG:
        return u_.r_obj.HashCode();
      case INT_TAG: {
        absl::AlphaNum an(u_.ival);
        return XXH3_64bits_withSeed(an.data(), an.size(), kHashSeed);
      }
      case SDS_TTL_TAG:
        return XXH3_64bits_withSeed(u_.sds_ttl.sds_ptr, sdslen(u_.sds_ttl.sds_ptr), kHashSeed);
    }
  }

  DCHECK(encoding_);

  if (IsInline()) {
    // Buffer must accommodate maximum decompressed size from inline storage
    // Highly compressible data can achieve ~8x compression (e.g., repeated character)
    // kInlineLen (16 bytes) compressed -> up to 128 bytes decompressed
    char buf[kInlineLen * 8];
    size_t decoded_len = GetStrEncoding().Decode(string_view{u_.inline_str, taglen_}, buf);
    return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
  }

  string_view sv = GetSlice(&tl.tmp_str);
  return XXH3_64bits_withSeed(sv.data(), sv.size(), kHashSeed);
}

uint64_t CompactObj::HashCode(string_view str) {
  return XXH3_64bits_withSeed(str.data(), str.size(), kHashSeed);
}

CompactObjType CompactObj::ObjType() const {
  if (IsInline() || taglen_ == INT_TAG || taglen_ == SMALL_TAG || taglen_ == SDS_TTL_TAG)
    return OBJ_STRING;

  if (taglen_ == EXTERNAL_TAG) {
    switch (static_cast<ExternalRep>(u_.ext_ptr.representation)) {
      case ExternalRep::STRING:
        return OBJ_STRING;
      case ExternalRep::SERIALIZED_MAP:
        return OBJ_HASH;
    };
  }

  if (taglen_ == ROBJ_TAG)
    return u_.r_obj.type();

  if (taglen_ == JSON_TAG) {
    return OBJ_JSON;
  }

  if (taglen_ == SBF_TAG) {
    return OBJ_SBF;
  }

  if (taglen_ == CMS_TAG) {
    return OBJ_CMS;
  }

  if (taglen_ == TOPK_TAG) {
    return OBJ_TOPK;
  }

  LOG(FATAL) << "TBD " << int(taglen_);
  return kInvalidCompactObjType;
}

unsigned CompactObj::Encoding() const {
  switch (taglen_) {
    case ROBJ_TAG:
      return u_.r_obj.encoding();
    case INT_TAG:
      return OBJ_ENCODING_INT;
    default:
      return OBJ_ENCODING_RAW;
  }
}

void CompactObj::InitRobj(CompactObjType type, unsigned encoding, void* obj) {
  DCHECK_NE(type, OBJ_STRING);
  SetMeta(ROBJ_TAG, mask_);
  u_.r_obj.Init(type, encoding, obj);
}

void CompactObj::SetInt(int64_t val) {
  DCHECK(!IsExternal());

  if (INT_TAG != taglen_) {
    SetMeta(INT_TAG, mask_);
    encoding_ = NONE_ENC;
  }

  u_.ival = val;
}

std::optional<int64_t> CompactObj::TryGetInt() const {
  if (taglen_ != INT_TAG)
    return std::nullopt;
  int64_t val = u_.ival;
  return val;
}

auto CompactObj::GetJson() const -> JsonType* {
  if (ObjType() == OBJ_JSON) {
    DCHECK_EQ(JsonEnconding(), kEncodingJsonCons);
    return u_.json_obj.cons.json_ptr;
  }
  return nullptr;
}

void CompactObj::SetJson(JsonType&& j) {
  if (taglen_ == JSON_TAG && JsonEnconding() == kEncodingJsonCons) {
    DCHECK(u_.json_obj.cons.json_ptr != nullptr);  // must be allocated
    u_.json_obj.cons.json_ptr->swap(j);
    DCHECK(jsoncons::is_trivial_storage(u_.json_obj.cons.json_ptr->storage_kind()) ||
           u_.json_obj.cons.json_ptr->get_allocator().resource() == tl.local_mr);

    // We do not set bytes_used as this is needed. Consider the two following cases:
    // 1. old json contains 50 bytes. The delta for new one is 50, so the total bytes
    // the new json occupies is 100.
    // 2. old json contains 100 bytes. The delta for new one is -50, so the total bytes
    // the new json occupies is 50.
    // Both of the cases are covered in SetJsonSize and JsonMemTracker. See below.
    return;
  }

  SetMeta(JSON_TAG);
  u_.json_obj.cons.json_ptr = AllocateMR<JsonType>(std::move(j));

  // With trivial storage json_ptr->get_allocator() throws an exception.
  DCHECK(jsoncons::is_trivial_storage(u_.json_obj.cons.json_ptr->storage_kind()) ||
         u_.json_obj.cons.json_ptr->get_allocator().resource() == tl.local_mr);
  u_.json_obj.cons.bytes_used = 0;
}

void CompactObj::SetJsonSize(int64_t size) {
  if (taglen_ == JSON_TAG && JsonEnconding() == kEncodingJsonCons) {
    // JSON.SET or if mem hasn't changed from a JSON op then we just update.
    int64_t result = static_cast<int64_t>(u_.json_obj.cons.bytes_used) + size;
    if (result < 1) {
      LOG_EVERY_T(ERROR, 20) << "JSON size underflow: " << u_.json_obj.cons.bytes_used << " + "
                             << size << " = " << result;
      u_.json_obj.cons.bytes_used = 1;
    } else {
      u_.json_obj.cons.bytes_used = static_cast<size_t>(result);
    }
  }
}

void CompactObj::AddStreamSize(int64_t size) {
  if (size < 0) {
    // We might have a negative size. For example, if we remove a consumer,
    // the tracker will report a negative net (since we deallocated),
    // so the object now consumes less memory than it did before. This DCHECK
    // is for fanity and to catch any potential issues with our tracking approach.
    DCHECK(static_cast<int64_t>(u_.r_obj.Size()) >= size);
  }
  u_.r_obj.SetSize((u_.r_obj.Size() + size));
}

void CompactObj::SetJson(const uint8_t* buf, size_t len) {
  SetMeta(JSON_TAG);
  u_.json_obj.flat.flat_ptr = (uint8_t*)tl.local_mr->allocate(len, kAlignSize);
  memcpy(u_.json_obj.flat.flat_ptr, buf, len);
  u_.json_obj.flat.json_len = len;
}

void CompactObj::SetSBF(uint64_t initial_capacity, double fp_prob, double grow_factor) {
  if (taglen_ == SBF_TAG) {  // already json
    *u_.sbf = SBF(initial_capacity, fp_prob, grow_factor, tl.local_mr);
  } else {
    SetMeta(SBF_TAG);
    u_.sbf = AllocateMR<SBF>(initial_capacity, fp_prob, grow_factor, tl.local_mr);
  }
}

SBF* CompactObj::GetSBF() const {
  DCHECK_EQ(SBF_TAG, taglen_);
  return u_.sbf;
}

void CompactObj::SetCMS(uint32_t width, uint32_t depth) {
  if (taglen_ == CMS_TAG) {
    *u_.cms = CMS(width, depth, tl.local_mr);
  } else {
    SetMeta(CMS_TAG);
    u_.cms = AllocateMR<CMS>(width, depth, tl.local_mr);
  }
}

CMS* CompactObj::GetCMS() const {
  DCHECK_EQ(CMS_TAG, taglen_);
  return u_.cms;
}

void CompactObj::SetTOPK(uint32_t k, uint32_t width, uint32_t depth, double decay) {
  if (taglen_ == TOPK_TAG) {
    *u_.topk = TOPK(memory_resource(), k, width, depth, decay);
  } else {
    SetMeta(TOPK_TAG);
    u_.topk = AllocateMR<TOPK>(memory_resource(), k, width, depth, decay);
  }
}

TOPK* CompactObj::GetTOPK() const {
  DCHECK_EQ(TOPK_TAG, taglen_);
  return u_.topk;
}

void CompactObj::SetString(std::string_view str) {
  CHECK(!IsExternal());
  encoding_ = NONE_ENC;

  // Trying auto-detection heuristics first.
  if (str.size() <= 20) {
    long long ival;
    static_assert(sizeof(long long) == 8);

    // We use redis string2ll to be compatible with Redis.
    if (string2ll(str.data(), str.size(), &ival)) {
      SetMeta(INT_TAG, mask_);
      u_.ival = ival;

      return;
    }

    if (str.size() <= kInlineLen) {
      SetMeta(str.size(), mask_);
      if (!str.empty())
        memcpy(u_.inline_str, str.data(), str.size());
      return;
    }
  }

  EncodeString(str);
}

void CompactObj::ReserveString(size_t size) {
  encoding_ = NONE_ENC;
  SetMeta(ROBJ_TAG, mask_);

  u_.r_obj.ReserveString(size, tl.local_mr);
}

void CompactObj::AppendString(std::string_view str) {
  u_.r_obj.AppendString(str, tl.local_mr);
}

string_view CompactObj::GetSlice(string* scratch) const {
  CHECK(!IsExternal());

  if (encoding_) {
    GetString(scratch);
    return *scratch;
  }

  if (IsInline()) {
    return string_view{u_.inline_str, taglen_};
  }

  if (taglen_ == INT_TAG) {
    absl::AlphaNum an(u_.ival);
    scratch->assign(an.Piece());

    return *scratch;
  }

  // no encoding.
  if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    return u_.r_obj.AsView();
  }

  if (taglen_ == SMALL_TAG) {
    u_.small_str.Get(scratch);
    return *scratch;
  }

  if (taglen_ == SDS_TTL_TAG) {
    return u_.sds_ttl.view();
  }

  LOG(FATAL) << "Bad tag " << int(taglen_);

  return string_view{};
}

bool CompactObj::DefragIfNeeded(PageUsage* page_usage) {
  static const bool disable_json_defragmentation =
      absl::GetFlag(FLAGS_disable_json_defragmentation);

  if (OmitDefrag()) {
    page_usage->RecordNotRequired();
    return false;
  }

  switch (taglen_) {
    case ROBJ_TAG:
      // currently only these object types are supported for this operation
      if (u_.r_obj.inner_obj() != nullptr) {
        return u_.r_obj.DefragIfNeeded(page_usage);
      }
      return false;
    case SMALL_TAG:
      return u_.small_str.DefragIfNeeded(page_usage);
    case JSON_TAG:
      if (disable_json_defragmentation) {
        return false;
      }
      return u_.json_obj.DefragIfNeeded(page_usage);
    case SDS_TTL_TAG:
      if (page_usage->IsPageForObjectUnderUtilized(u_.sds_ttl.sds_ptr)) {
        size_t len = sdslen(u_.sds_ttl.sds_ptr);
        char* new_sds = sdsnewlen(u_.sds_ttl.sds_ptr, len);
        sdsfree(u_.sds_ttl.sds_ptr);
        u_.sds_ttl.sds_ptr = new_sds;
        return true;
      }
      return false;
    case INT_TAG:
      page_usage->RecordNotRequired();
      // this is not relevant in this case
      return false;
    case EXTERNAL_TAG:
      page_usage->RecordNotRequired();
      return false;
    default:
      page_usage->RecordNotRequired();
      // This is the case when the object is at inline_str
      return false;
  }
}

bool CompactObj::HasAllocated() const {
  if (IsRef() || taglen_ == INT_TAG || IsInline() || taglen_ == EXTERNAL_TAG ||
      (taglen_ == ROBJ_TAG && u_.r_obj.inner_obj() == nullptr))
    return false;

  DCHECK(taglen_ == ROBJ_TAG || taglen_ == SMALL_TAG || taglen_ == JSON_TAG || taglen_ == SBF_TAG ||
         taglen_ == CMS_TAG || taglen_ == SDS_TTL_TAG || taglen_ == TOPK_TAG);
  return true;
}

bool CompactObj::TagAllowsEmptyValue() const {
  const auto type = ObjType();
  return type == OBJ_JSON || type == OBJ_STREAM || type == OBJ_STRING || type == OBJ_SBF ||
         type == OBJ_CMS || type == OBJ_TOPK || type == OBJ_SET;
}

void __attribute__((noinline)) CompactObj::GetString(string* res) const {
  res->resize(Size());
  GetString(res->data());
}

void CompactObj::GetString(char* dest) const {
  CHECK(!IsExternal());

  if (IsInline()) {
    GetStrEncoding().Decode({u_.inline_str, taglen_}, dest);
    return;
  }

  if (taglen_ == INT_TAG) {
    absl::AlphaNum an(u_.ival);
    memcpy(dest, an.data(), an.size());
    return;
  }

  if (encoding_) {
    StrEncoding str_encoding = GetStrEncoding();
    string_view decode_blob = GetEncodedBlob(str_encoding, dest);

    str_encoding.Decode(decode_blob, dest);
    return;
  }

  // no encoding.
  if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    memcpy(dest, u_.r_obj.inner_obj(), u_.r_obj.Size());
    return;
  }

  if (taglen_ == SDS_TTL_TAG) {
    memcpy(dest, u_.sds_ttl.sds_ptr, sdslen(u_.sds_ttl.sds_ptr));
    return;
  }

  if (taglen_ == SMALL_TAG)
    return u_.small_str.Get(dest);

  LOG(FATAL) << "Bad tag " << int(taglen_);
}

void CompactObj::SetExternal(size_t offset, uint32_t sz, ExternalRep rep) {
  uint8_t first_byte = 0;
  if (encoding_ == HUFFMAN_ENC) {
    CHECK(rep == ExternalRep::STRING);
    first_byte = GetFirstByte();
  }
  SetMeta(EXTERNAL_TAG, mask_);

  u_.ext_ptr.is_cool = 0;
  u_.ext_ptr.representation = static_cast<uint8_t>(rep);
  u_.ext_ptr.first_byte = first_byte;
  u_.ext_ptr.page_offset = offset % 4096;
  u_.ext_ptr.serialized_size = sz;
  u_.ext_ptr.offload.page_index = offset / 4096;
}

CompactObj::ExternalRep CompactObj::GetExternalRep() const {
  DCHECK(IsExternal());
  return static_cast<CompactObj::ExternalRep>(u_.ext_ptr.representation);
}

void CompactObj::SetCool(size_t offset, uint32_t sz, ExternalRep rep,
                         tiering::TieredCoolRecord* record) {
  encoding_ = record->value.encoding_;
  SetMeta(EXTERNAL_TAG, record->value.mask_);

  u_.ext_ptr.is_cool = 1;
  u_.ext_ptr.representation = static_cast<uint8_t>(rep);
  u_.ext_ptr.page_offset = offset % 4096;
  u_.ext_ptr.serialized_size = sz;
  u_.ext_ptr.cool_record = record;
}

auto CompactObj::GetCool() const -> CoolItem {
  DCHECK(IsExternal() && u_.ext_ptr.is_cool);

  CoolItem res;
  res.page_offset = u_.ext_ptr.page_offset;
  res.serialized_size = u_.ext_ptr.serialized_size;
  res.record = u_.ext_ptr.cool_record;
  return res;
}

void CompactObj::Freeze(size_t offset, size_t sz) {
  SetExternal(offset, sz, GetExternalRep());
}

std::pair<size_t, size_t> CompactObj::GetExternalSlice() const {
  DCHECK_EQ(EXTERNAL_TAG, taglen_);
  auto& ext = u_.ext_ptr;
  size_t offset = ext.page_offset;
  offset += size_t(ext.is_cool ? ext.cool_record->page_index : ext.offload.page_index) * 4096;
  return {offset, size_t(u_.ext_ptr.serialized_size)};
}

string_view CompactObj::GetEncodedBlob(StrEncoding str_encoding, char* opt_dest) const {
  if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    return u_.r_obj.AsView();
  } else if (IsInline()) {
    return {u_.inline_str, taglen_};
  } else if (taglen_ == SDS_TTL_TAG) {
    return u_.sds_ttl.view();
  }

  CHECK_EQ(taglen_, SMALL_TAG);
  auto& ss = u_.small_str;
  char* copy_dest = nullptr;
  if (opt_dest && str_encoding.enc_ != HUFFMAN_ENC) {
    // Write to rightmost location of dest buffer to leave some bytes for inline unpacking
    size_t decoded_len = str_encoding.DecodedSize(ss.size(), ss.first_byte());
    copy_dest = opt_dest + (decoded_len - ss.size());
  } else {
    tl.tmp_buf.resize(ss.size());
    copy_dest = reinterpret_cast<char*>(tl.tmp_buf.data());
  }
  ss.Get(copy_dest);
  return {copy_dest, ss.size()};
}

void CompactObj::Materialize(std::string_view blob, bool is_raw) {
  CHECK(IsExternal()) << int(taglen_);
  DCHECK_EQ(u_.ext_ptr.representation, static_cast<uint8_t>(ExternalRep::STRING));
  DCHECK_GT(blob.size(), kInlineLen);  // There are no mutable commands that shrink strings

  if (is_raw) {
    if (SmallString::CanAllocate(blob.size())) {
      SetMeta(SMALL_TAG, mask_);
      tl.small_str_bytes += u_.small_str.Assign(blob);
    } else {
      SetMeta(ROBJ_TAG, mask_);
      u_.r_obj.SetString(blob, tl.local_mr);
    }
  } else {
    encoding_ = NONE_ENC;  // reset encoding
    EncodeString(blob);
  }
}

void CompactObj::Reset() {
  if (HasAllocated()) {
    Free();
  }
  taglen_ = 0;
  encoding_ = 0;
  mask_ = 0;
}

uint8_t CompactObj::GetFirstByte() const {
  DCHECK_EQ(ObjType(), OBJ_STRING);

  if (IsInline()) {
    return u_.inline_str[0];
  }

  if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    return *(uint8_t*)u_.r_obj.inner_obj();
  }

  if (taglen_ == SMALL_TAG) {
    return u_.small_str.first_byte();
  }

  if (taglen_ == SDS_TTL_TAG) {
    return u_.sds_ttl.sds_ptr[0];
  }

  if (taglen_ == EXTERNAL_TAG) {
    if (u_.ext_ptr.is_cool) {
      const CompactObj& cooled_obj = u_.ext_ptr.cool_record->value;
      return cooled_obj.GetFirstByte();
    }
    return u_.ext_ptr.first_byte;
  }

  LOG(DFATAL) << "Bad tag " << int(taglen_);
  return 0;
}

bool CompactObj::GetByteAtIndex(size_t idx, uint8_t* res) const {
  CHECK(!IsExternal());
  DCHECK_EQ(ObjType(), OBJ_STRING);

  if (encoding_) {
    StrEncoding str_encoding = GetStrEncoding();
    string_view decode_blob = GetEncodedBlob(str_encoding, nullptr);

    if (!str_encoding.DecodeByte(decode_blob, idx, res)) {
      VLOG(1) << "Offset out of bounds for encoded string: " << idx
              << " >= " << str_encoding.DecodedSize(decode_blob.size(), decode_blob[0]);
      *res = 0;
      return false;
    }
    return true;
  }

  // No encoding, we can directly access the byte at index.
  string_view sv = GetSlice(&tl.tmp_str);
  if (idx >= sv.size()) {
    VLOG(1) << "Offset out of bounds: " << idx << " >= " << sv.size();
    *res = 0;
    return false;
  }
  *res = sv[idx];
  return true;
}

std::pair<bool, bool> CompactObj::SetByteAtIndex(size_t idx, uint8_t val) {
  CHECK(!IsExternal());
  DCHECK_EQ(ObjType(), OBJ_STRING);

  // Inline string without encoding: modify directly.
  if (IsInline() && !encoding_) {
    if (idx >= taglen_) {
      VLOG(1) << "Offset out of bounds for inline string: " << idx << " >= " << int(taglen_);
      return {false, false};
    }
    u_.inline_str[idx] = val;
    return {true, true};
  }

  // SDS_TTL_TAG raw string without encoding: modify directly.
  if (taglen_ == SDS_TTL_TAG && !encoding_) {
    size_t len = sdslen(u_.sds_ttl.sds_ptr);
    if (idx >= len) {
      return {false, false};
    }
    u_.sds_ttl.sds_ptr[idx] = val;
    return {true, true};
  }

  // ROBJ_TAG raw string without encoding: modify the underlying buffer directly.
  if (taglen_ == ROBJ_TAG && !encoding_) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    if (idx >= u_.r_obj.Size()) {
      VLOG(1) << "Offset out of bounds for raw string: " << idx << " >= " << u_.r_obj.Size();
      return {false, false};
    }
    reinterpret_cast<char*>(u_.r_obj.inner_obj())[idx] = val;
    return {true, true};
  }

  // For ASCII encoded ROBJ strings we can modify the underlying buffer directly.
  if (encoding_ && (encoding_ == ASCII1_ENC || encoding_ == ASCII2_ENC) && taglen_ == ROBJ_TAG &&
      absl::ascii_isascii(val)) {
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    auto* buf = reinterpret_cast<uint8_t*>(u_.r_obj.inner_obj());
    size_t decoded_len = GetStrEncoding().DecodedSize(u_.r_obj.Size(), buf[0]);
    if (idx >= decoded_len) {
      VLOG(1) << "Offset out of bounds for ASCII encoded string: " << idx << " >= " << decoded_len;
      return {false, false};
    }
    detail::ascii_pack_byte(buf, decoded_len, idx, val);
    return {true, true};
  }

  // For other encoded strings, INT_TAG, SMALL_TAG we need to decode, modify, and re-encode.
  string str;
  GetString(&str);
  if (idx >= str.size()) {
    VLOG(1) << "Offset out of bounds: " << idx << " >= " << str.size();
    return {false, false};
  }
  str[idx] = val;
  SetString(str);
  return {true, false};
}

// Frees all resources if owns.
void CompactObj::Free() {
  DCHECK(HasAllocated());

  if (taglen_ == ROBJ_TAG) {
    u_.r_obj.Free(tl.local_mr);
  } else if (taglen_ == SMALL_TAG) {
    tl.small_str_bytes -= u_.small_str.MallocUsed();
    u_.small_str.Free();
  } else if (taglen_ == JSON_TAG) {
    DVLOG(1) << "Freeing JSON object";
    if (JsonEnconding() == kEncodingJsonCons) {
      DeleteMR<JsonType>(u_.json_obj.cons.json_ptr);
    } else {
      tl.local_mr->deallocate(u_.json_obj.flat.flat_ptr, u_.json_obj.flat.json_len, kAlignSize);
    }
  } else if (taglen_ == SBF_TAG) {
    DeleteMR<SBF>(u_.sbf);
  } else if (taglen_ == TOPK_TAG) {
    DeleteMR<TOPK>(u_.topk);
  } else if (taglen_ == CMS_TAG) {
    DeleteMR<CMS>(u_.cms);
  } else if (taglen_ == SDS_TTL_TAG) {
    sdsfree(u_.sds_ttl.sds_ptr);
  } else {
    LOG(FATAL) << "Unsupported tag " << int(taglen_);
  }

  memset(u_.inline_str, 0, kInlineLen);
}

size_t CompactObj::MallocUsed(bool slow) const {
  if (!HasAllocated())
    return 0;

  if (taglen_ == ROBJ_TAG) {
    return u_.r_obj.MallocUsed(slow);
  }

  if (taglen_ == JSON_TAG) {
    // TODO fix this once we fully support flat json
    // This is here because accessing a union field that is not active
    // is UB.
    if (JsonEnconding() == kEncodingJsonFlat) {
      return 0;
    }
    return u_.json_obj.cons.bytes_used;
  }

  if (taglen_ == SMALL_TAG) {
    return u_.small_str.MallocUsed();
  }

  if (taglen_ == SBF_TAG) {
    return u_.sbf->MallocUsed();
  }

  if (taglen_ == CMS_TAG) {
    return u_.cms->MallocUsed();
  }

  if (taglen_ == SDS_TTL_TAG) {
    return sdsAllocSize(u_.sds_ttl.sds_ptr);
  }

  if (taglen_ == TOPK_TAG) {
    return u_.topk->MallocUsed();
  }

  LOG(DFATAL) << "should not reach";
  return 0;
}

// TODO: we need this operator ONLY because we search in prime-table based on the ExpireKey
// which is a reference to the CompactKey. Therefore operator== currently works
// specifically for this particular use-case.
// So once we remove the expire table, we can remove this operator too.
// In addition - we MUST remove AsRef/IsRef api as well as it will break
// once we start using SetExpireTime/ClearExpireTime methods.
// All in all, we will free up two additional bits.
bool CompactKey::operator==(const CompactKey& o) const {
  DCHECK(taglen_ != JSON_TAG && o.taglen_ != JSON_TAG) << "cannot use JSON type to check equal";

  // Cross-tag/encoding comparison: fall back to decoded string comparison for OBJ_STRING.
  // This handles e.g. SDS_TTL_TAG vs ROBJ_TAG/inline/INT_TAG with same logical content.
  if (taglen_ != o.taglen_ || encoding_ != o.encoding_) {
    if (ObjType() == OBJ_STRING && o.ObjType() == OBJ_STRING) {
      std::string tmp;
      return *this == o.GetSlice(&tmp);
    }
    return false;
  }

  if (taglen_ == ROBJ_TAG)
    return u_.r_obj.Equal(o.u_.r_obj);

  if (taglen_ == INT_TAG)
    return u_.ival == o.u_.ival;

  if (taglen_ == SMALL_TAG)
    return u_.small_str.Equal(o.u_.small_str);

  if (taglen_ == SDS_TTL_TAG)
    return u_.sds_ttl.view() == o.u_.sds_ttl.view();

  DCHECK(IsInline() && o.IsInline());

  return memcmp(u_.inline_str, o.u_.inline_str, taglen_) == 0;
}

bool CompactObj::CmpNonInline(std::string_view sv) const {
  DCHECK_GT(taglen_, kInlineLen);
  switch (taglen_) {
    case INT_TAG:
      return absl::AlphaNum(u_.ival).Piece() == sv;
    case ROBJ_TAG:
      return u_.r_obj.Equal(sv);
    case SMALL_TAG:
      return u_.small_str.Equal(sv);
    case SDS_TTL_TAG:
      return u_.sds_ttl.view() == sv;
    default:
      break;
  }
  return false;
}

bool CompactObj::CmpEncoded(string_view sv) const {
  DCHECK(encoding_);

  if (encoding_ == HUFFMAN_ENC) {
    size_t sz = Size();
    if (sv.size() != sz)
      return false;

    if (IsInline()) {
      // Buffer must accommodate maximum decompressed size from inline storage (~8x compression)
      constexpr size_t kMaxHuffLen = kInlineLen * 8;
      if (sz <= kMaxHuffLen) {
        char buf[kMaxHuffLen];
        auto domain = is_key_ ? HUFF_KEYS : HUFF_STRING_VALUES;
        const auto& decoder = tl.GetHuffmanDecoder(domain);
        CHECK(decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, sz, buf));
        return sv == string_view(buf, sz);
      }
    }
    tl.tmp_str.resize(sz);
    GetString(tl.tmp_str.data());
    return sv == tl.tmp_str;
  }

  size_t encode_len = binpacked_len(sv.size());
  if (IsInline()) {
    if (encode_len != taglen_)
      return false;

    char buf[kInlineLen * 2];
    detail::ascii_unpack(to_byte(u_.inline_str), sv.size(), buf);

    return sv == string_view(buf, sv.size());
  }

  if (taglen_ == ROBJ_TAG) {
    if (u_.r_obj.type() != OBJ_STRING)
      return false;

    if (u_.r_obj.Size() != encode_len)
      return false;

    if (!detail::validate_ascii_fast(sv.data(), sv.size()))
      return false;

    return detail::compare_packed(to_byte(u_.r_obj.inner_obj()), sv.data(), sv.size());
  }

  if (taglen_ == SDS_TTL_TAG) {
    size_t sds_len = sdslen(u_.sds_ttl.sds_ptr);
    if (sds_len != encode_len)
      return false;

    if (!detail::validate_ascii_fast(sv.data(), sv.size()))
      return false;

    return detail::compare_packed(to_byte(u_.sds_ttl.sds_ptr), sv.data(), sv.size());
  }

  if (taglen_ == JSON_TAG) {
    return false;  // cannot compare json with string
  }

  if (taglen_ == SMALL_TAG) {
    if (u_.small_str.size() != encode_len)
      return false;

    if (!detail::validate_ascii_fast(sv.data(), sv.size()))
      return false;

    // We need to compare an unpacked sv with 2 packed parts.
    // To compare easily ascii with binary we would need to split ascii at 8 bytes boundaries
    // so that we could pack it into complete binary bytes (8 ascii chars produce 7 bytes).
    // I choose a minimal 16 byte prefix:
    // 1. sv must be longer than 16 if we are here (at least 18 actually).
    // 2. 16 chars produce 14 byte blob that should cover the first slice (10 bytes) and 4 bytes
    //    of the second slice.
    // 3. I assume that the first slice is less than 14 bytes which is correct since small string
    //    has only 9-10 bytes in its inline prefix storage.
    DCHECK_GT(sv.size(), 16u);  // we would not be in SMALL_TAG, otherwise.

    auto slice = u_.small_str.Get();
    DCHECK_LT(slice[0].size(), 14u);

    uint8_t tmpbuf[14];
    detail::ascii_pack(sv.data(), 16, tmpbuf);

    // Compare the first slice.
    if (memcmp(slice[0].data(), tmpbuf, slice[0].size()) != 0)
      return false;

    // Compare the prefix of the second slice.
    size_t pref_len = 14 - slice[0].size();

    if (memcmp(slice[1].data(), tmpbuf + slice[0].size(), pref_len) != 0)
      return false;

    // We verified that the first 16 chars (or 14 bytes) are equal.
    // Lets verify the rest - suffix of the second slice and the suffix of sv.
    return detail::compare_packed(to_byte(slice[1].data() + pref_len), sv.data() + 16,
                                  sv.size() - 16);
  }
  LOG(FATAL) << "Unsupported tag " << int(taglen_);
  return false;
}

void CompactObj::EncodeString(string_view str) {
  DCHECK_GT(str.size(), kInlineLen);
  DCHECK_EQ(NONE_ENC, encoding_);

  string_view encoded = str;
  bool huff_encoded = false;

  // We chose such length that we can store the decoded length delta into 1 byte.
  // The maximum huffman compression is 1/8, so 288 / 8 = 36.
  // 288 - 36 = 252, which is smaller than 256.
  // TODO: introduce variable length huffman length.
  constexpr unsigned kMaxHuffLen = 288;

  // For sizes 17, 18 we would like to test ascii encoding first as it's more efficient.
  // And if it succeeds we can squash into the inline buffer.
  bool is_ascii =
      kUseAsciiEncoding && str.size() < 19 && detail::validate_ascii_fast(str.data(), str.size());

  // if !is_ascii, we try huffman encoding next.
  if (!is_ascii && str.size() <= kMaxHuffLen) {
    auto& huffman = is_key_ ? tl.huff_keys : tl.huff_string_values;
    if (huffman.encoder.valid()) {
      unsigned dest_len = huffman.encoder.CompressedBound(str.size());
      // 1 byte for storing the size delta.
      tl.tmp_buf.resize(1 + dest_len);
      string err_msg;
      ++tl.huff_encode_total;
      bool res = huffman.encoder.Encode(str, tl.tmp_buf.data() + 1, &dest_len, &err_msg);
      if (res) {
        // we accept huffman encoding only if it is:
        // 1. smaller than the original string by 20%
        // 2. allows us to store the encoded string in the inline buffer
        if (dest_len && (dest_len < kInlineLen || (dest_len + dest_len / 5) < str.size())) {
          huff_encoded = true;
          tl.huff_encode_success++;
          encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), dest_len + 1};
          unsigned delta = str.size() - dest_len;
          DCHECK_LT(delta, 256u);
          tl.tmp_buf[0] = static_cast<uint8_t>(delta);
          encoding_ = HUFFMAN_ENC;
          if (encoded.size() <= kInlineLen) {
            SetMeta(encoded.size(), mask_);
            memcpy(u_.inline_str, tl.tmp_buf.data(), encoded.size());
            return;
          }
        }
      } else {
        // Should not happen, means we have an internal buf.
        LOG(DFATAL) << "Failed to encode string with huffman: " << err_msg;
      }
    }
  }

  // Finally we try ascii encoding for longer strings if we have not encoded them with huffman.
  if (kUseAsciiEncoding && !is_ascii && str.size() >= 19 && !huff_encoded) {
    is_ascii = detail::validate_ascii_fast(str.data(), str.size());
  }

  if (is_ascii) {
    size_t encode_len = binpacked_len(str.size());
    size_t rev_len = ascii_len(encode_len);

    if (rev_len == str.size()) {
      encoding_ = ASCII2_ENC;  // str hits its highest bound.
    } else {
      CHECK_EQ(str.size(), rev_len - 1) << "Bad ascii encoding for len " << str.size();
      encoding_ = ASCII1_ENC;  // str is shorter than its highest bound.
    }

    tl.tmp_buf.resize(encode_len);
    detail::ascii_pack_simd2(str.data(), str.size(), tl.tmp_buf.data());
    encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};

    if (encoded.size() <= kInlineLen) {
      SetMeta(encoded.size(), mask_);
      detail::ascii_pack(str.data(), str.size(), reinterpret_cast<uint8_t*>(u_.inline_str));

      return;
    }
  }

  DCHECK_GT(encoded.size(), kInlineLen);

  if (SmallString::CanAllocate(encoded.size())) {
    if (taglen_ == SMALL_TAG)
      tl.small_str_bytes -= u_.small_str.MallocUsed();
    else
      SetMeta(SMALL_TAG, mask_);

    tl.small_str_bytes += u_.small_str.Assign(encoded);
    return;
  }

  SetMeta(ROBJ_TAG, mask_);
  u_.r_obj.SetString(encoded, tl.local_mr);
}

std::array<std::string_view, 2> CompactObj::GetRawString() const {
  DCHECK(!IsExternal());

  if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
    return {u_.r_obj.AsView(), {}};
  }

  if (taglen_ == SMALL_TAG) {
    return u_.small_str.Get();
  }

  if (taglen_ == SDS_TTL_TAG) {
    return {u_.sds_ttl.view(), {}};
  }

  LOG(FATAL) << "Unsupported tag for GetRawString(): " << int(taglen_);
  return {};
}

MemoryResource* CompactObj::memory_resource() {
  return tl.local_mr;
}

string_view CompactObj::SdsTtlString::view() const {
  return string_view{sds_ptr, sdslen(sds_ptr)};
}

bool CompactObj::JsonConsT::DefragIfNeeded(PageUsage* page_usage) {
  const MiMemoryResource* mr = static_cast<MiMemoryResource*>(memory_resource());

  const int64_t before = static_cast<int64_t>(mr->used());
  DCHECK_GE(before, 0) << "Memory usage is more than int64_t max value";

  bool did_defragment = Defragment(*json_ptr, page_usage);

  const int64_t after = static_cast<int64_t>(mr->used());
  DCHECK_GE(after, 0) << "Memory usage is more than int64_t max value";

  if (const int64_t delta = after - before; delta != 0) {
    bytes_used = UpdateSize(bytes_used, delta);
  }

  return did_defragment;
}

bool CompactObj::FlatJsonT::DefragIfNeeded(PageUsage* page_usage) {
  if (uint8_t* old = flat_ptr; page_usage->IsPageForObjectUnderUtilized(old)) {
    const uint32_t size = json_len;
    flat_ptr = static_cast<uint8_t*>(tl.local_mr->allocate(size, kAlignSize));
    memcpy(flat_ptr, old, size);
    tl.local_mr->deallocate(old, size, kAlignSize);
    return true;
  }

  return false;
}

bool CompactObj::JsonWrapper::DefragIfNeeded(PageUsage* page_usage) {
  if (JsonEnconding() == kEncodingJsonCons) {
    return cons.DefragIfNeeded(page_usage);
  }

  return flat.DefragIfNeeded(page_usage);
}

constexpr std::pair<CompactObjType, std::string_view> kObjTypeToString[] = {
    {OBJ_STRING, "string"sv},  {OBJ_LIST, "list"sv},     {OBJ_SET, "set"sv},
    {OBJ_ZSET, "zset"sv},      {OBJ_HASH, "hash"sv},     {OBJ_STREAM, "stream"sv},
    {OBJ_KEY, "key"sv},  // pseudo-type used for memory tracking
    {OBJ_JSON, "ReJSON-RL"sv}, {OBJ_SBF, "MBbloom--"sv}, {OBJ_CMS, "CMSk-TYPE"sv},
    {OBJ_TOPK, "TopK-TYPE"sv}};

std::string_view ObjTypeToString(CompactObjType type) {
  for (auto& p : kObjTypeToString) {
    if (type == p.first) {
      return p.second;
    }
  }

  LOG(DFATAL) << "Unsupported type " << type;
  return "Invalid type"sv;
}

CompactObjType ObjTypeFromString(std::string_view sv) {
  for (auto& p : kObjTypeToString) {
    if (absl::EqualsIgnoreCase(sv, p.second)) {
      return p.first;
    }
  }
  return kInvalidCompactObjType;
}

void CompactKey::SetExpireTime(uint64_t abs_ms) {
  DCHECK(!IsRef() && !IsExternal());

  // Already SDS_TTL_TAG — update TTL in place.
  if (taglen_ == SDS_TTL_TAG) {
    u_.sds_ttl.exp_ms = abs_ms;
    return;
  }

  char* new_sds = nullptr;

  if (IsInline()) {
    new_sds = sdsnewlen(u_.inline_str, taglen_);
    // encoding_ preserved as-is.
  } else if (taglen_ == INT_TAG) {
    absl::AlphaNum an(u_.ival);
    new_sds = sdsnewlen(an.data(), an.size());
    encoding_ = NONE_ENC;
  } else if (taglen_ == SMALL_TAG) {
    size_t total = u_.small_str.size();
    new_sds = sdsnewlen(nullptr, total);
    u_.small_str.Get(new_sds);
    tl.small_str_bytes -= u_.small_str.MallocUsed();
    u_.small_str.Free();
  } else if (taglen_ == ROBJ_TAG) {
    CHECK_EQ(OBJ_STRING, u_.r_obj.type());
    auto view = u_.r_obj.AsView();
    new_sds = sdsnewlen(view.data(), view.size());
    u_.r_obj.Free(tl.local_mr);
  } else {
    LOG(FATAL) << "Unexpected tag for SetExpireTime: " << int(taglen_);
  }

  u_.sds_ttl.sds_ptr = new_sds;
  u_.sds_ttl.exp_ms = abs_ms;
  taglen_ = SDS_TTL_TAG;
  mask_bits_.expire = 1;
}

bool CompactKey::ClearExpireTime() {
  if (taglen_ != SDS_TTL_TAG)
    return false;
  DCHECK(!IsRef() && !IsExternal());

  string decoded;
  GetString(&decoded);
  SetMeta(0, mask_);
  encoding_ = NONE_ENC;
  mask_bits_.expire = 0;

  SetString(decoded);
  return true;
}

uint64_t CompactKey::GetExpireTime() const {
  if (taglen_ != SDS_TTL_TAG)
    return 0;
  DCHECK(!IsRef() && !IsExternal());
  return u_.sds_ttl.exp_ms;
}

size_t CompactObj::StrEncoding::DecodedSize(string_view blob) const {
  return DecodedSize(blob.size(), blob[0]);
}

size_t CompactObj::StrEncoding::DecodedSize(size_t blob_size, uint8_t first_byte) const {
  switch (enc_) {
    case NONE_ENC:
      return blob_size;
    case ASCII1_ENC:
    case ASCII2_ENC:
      return ascii_len(blob_size) - (enc_ == ASCII1_ENC);
    case HUFFMAN_ENC:
      return blob_size + int(first_byte) - 1;
  };
  return 0;
}

size_t CompactObj::StrEncoding::Decode(std::string_view blob, char* dest) const {
  if (blob.empty())
    return 0;
  size_t decoded_len = DecodedSize(blob);
  switch (enc_) {
    case NONE_ENC:
      memcpy(dest, blob.data(), blob.size());
      break;
    case ASCII1_ENC:
    case ASCII2_ENC:
      detail::ascii_unpack(reinterpret_cast<const uint8_t*>(blob.data()), decoded_len, dest);
      break;
    case HUFFMAN_ENC: {
      auto domain = is_key_ ? HUFF_KEYS : HUFF_STRING_VALUES;
      const auto& decoder = tl.GetHuffmanDecoder(domain);
      decoder.Decode(blob.substr(1), decoded_len, dest);
      break;
    }
  };
  return decoded_len;
}

bool CompactObj::StrEncoding::DecodeByte(std::string_view blob, size_t idx, uint8_t* dest) const {
  if (blob.empty()) {
    return false;
  }
  size_t decoded_len = DecodedSize(blob);
  if (idx >= decoded_len) {
    return false;
  }
  switch (enc_) {
    case NONE_ENC:
      *dest = blob[idx];
      break;
    case ASCII1_ENC:
    case ASCII2_ENC:
      *dest = detail::ascii_unpack_byte(reinterpret_cast<const uint8_t*>(blob.data()), decoded_len,
                                        idx);
      break;
    case HUFFMAN_ENC: {
      std::string decoded_huff_string(decoded_len, 0);
      auto domain = is_key_ ? HUFF_KEYS : HUFF_STRING_VALUES;
      const auto& decoder = tl.GetHuffmanDecoder(domain);
      decoder.Decode(blob.substr(1), decoded_len, decoded_huff_string.data());
      *dest = decoded_huff_string[idx];
      break;
    }
  };
  return true;
}

StringOrView CompactObj::StrEncoding::Decode(std::string_view blob) const {
  switch (enc_) {
    case NONE_ENC:
      return StringOrView::FromView(blob);
    default: {
      string out;
      out.resize(DecodedSize(blob));
      Decode(blob, out.data());
      return StringOrView::FromString(std::move(out));
    }
  }
  return {};
}

/* Create a new stream data structure. */
stream* streamNew() {
  stream* s = (stream*)zmalloc(sizeof(stream));
  s->rax = raxNew();
  s->length = 0;
  s->first_id.ms = 0;
  s->first_id.seq = 0;
  s->last_id.ms = 0;
  s->last_id.seq = 0;
  s->max_deleted_entry_id.seq = 0;
  s->max_deleted_entry_id.ms = 0;
  s->entries_added = 0;
  s->cgroups = NULL; /* Created on demand to save memory when not used. */
  return s;
}

/* Free a consumer and associated data structures. Note that this function
 * will not reassign the pending messages associated with this consumer
 * nor will delete them from the stream, so when this function is called
 * to delete a consumer, and not when the whole stream is destroyed, the caller
 * should do some work before. */
static void streamFreeConsumer(streamConsumer* sc) {
  raxFree(sc->pel); /* No value free callback: the PEL entries are shared
                       between the consumer and the main stream PEL. */
  sdsfree(sc->name);
  zfree(sc);
}

/* Used for generic free functions. */
static void streamFreeConsumerVoid(void* sc) {
  streamFreeConsumer((streamConsumer*)sc);
}

/* Used for generic free functions. */
static void streamFreeCGVoid(void* cg_) {
  streamCG* cg = (streamCG*)cg_;
  raxFreeWithCallback(cg->pel, zfree);
  raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid);
  zfree(cg);
}

static void lpFreeVoid(void* lp) {
  lpFree((uint8_t*)lp);
}

/* Free a stream, including the listpacks stored inside the radix tree. */
void freeStream(stream* s) {
  raxFreeWithCallback(s->rax, lpFreeVoid);
  if (s->cgroups)
    raxFreeWithCallback(s->cgroups, streamFreeCGVoid);
  zfree(s);
}

}  // namespace dfly


================================================
FILE: src/core/compact_object.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/base/internal/endian.h>

#include <optional>
#include <type_traits>

#include "base/pmr/memory_resource.h"
#include "common/string_or_view.h"
#include "core/json/json_object.h"
#include "core/mi_memory_resource.h"
#include "core/small_string.h"

typedef struct stream stream;

namespace dfly {

namespace tiering {
struct TieredCoolRecord;
}

constexpr unsigned kEncodingIntSet = 0;
constexpr unsigned kEncodingStrMap2 = 2;  // for set/map encodings of strings using DenseSet
constexpr unsigned kEncodingQL2 = 1;
constexpr unsigned kEncodingListPack = 3;
constexpr unsigned kEncodingJsonCons = 0;
constexpr unsigned kEncodingJsonFlat = 1;

class SBF;
class TOPK;
class CMS;
class PageUsage;

using cmn::StringOrView;
namespace detail {

// redis objects or blobs of upto 4GB size.
class RobjWrapper {
 public:
  using MemoryResource = PMR_NS::memory_resource;

  RobjWrapper() : sz_(0), type_(0), encoding_(0) {
  }

  size_t MallocUsed(bool slow) const;

  uint64_t HashCode() const;
  bool Equal(const RobjWrapper& ow) const;
  bool Equal(std::string_view sv) const;
  size_t Size() const;
  void Free(MemoryResource* mr);

  void SetString(std::string_view s, MemoryResource* mr);
  void ReserveString(size_t size, MemoryResource* mr);
  void AppendString(std::string_view s, MemoryResource* mr);
  // Used when sz_ is used to denote memory usage
  void SetSize(uint64_t size);
  void Init(unsigned type, unsigned encoding, void* inner);

  unsigned type() const {
    return type_;
  }
  unsigned encoding() const {
    return encoding_;
  }
  void* inner_obj() const {
    return inner_obj_;
  }

  void set_inner_obj(void* ptr) {
    inner_obj_ = ptr;
  }

  std::string_view AsView() const {
    return std::string_view{reinterpret_cast<char*>(inner_obj_), sz_};
  }

  // Try reducing memory fragmentation by re-allocating values from underutilized pages.
  // Returns true if re-allocated.
  bool DefragIfNeeded(PageUsage* page_usage);

 private:
  void ReallocateString(MemoryResource* mr);

  size_t InnerObjMallocUsed() const;
  void MakeInnerRoom(size_t current_cap, size_t desired, MemoryResource* mr);

  void Set(void* p, size_t s) {
    inner_obj_ = p;
    sz_ = s;
  }

  void* inner_obj_ = nullptr;

  // semantics depend on the type. For OBJ_STRING it's string length.
  uint64_t sz_ : 56;

  uint64_t type_ : 4;
  uint64_t encoding_ : 4;
} __attribute__((packed));

static_assert(sizeof(RobjWrapper) == 16);

}  // namespace detail

using CompactObjType = unsigned;

constexpr CompactObjType kInvalidCompactObjType = std::numeric_limits<CompactObjType>::max();

uint32_t JsonEnconding();

class CompactObj {
  static constexpr unsigned kInlineLen = 16;

  void operator=(const CompactObj&) = delete;
  CompactObj(const CompactObj&) = delete;

 protected:
  // 0-16 is reserved for inline lengths of string type.
  enum TagEnum : uint8_t {
    INT_TAG = 17,
    SMALL_TAG = 18,
    ROBJ_TAG = 19,
    EXTERNAL_TAG = 20,
    JSON_TAG = 21,
    SBF_TAG = 22,
    CMS_TAG = 23,
    SDS_TTL_TAG = 24,
    TOPK_TAG = 25,
  };

  // String encoding types.
  // With ascii compression it compresses 8 bytes to 7 but also 7 to 7.
  // Therefore, in order to know the original length we introduce 2 states that
  // correct the length upon decoding. ASCII1_ENC rounds down the decoded length,
  // while ASCII2_ENC rounds it up. See DecodedLen implementation for more info.
  enum EncodingEnum : uint8_t {
    NONE_ENC = 0,
    ASCII1_ENC = 1,
    ASCII2_ENC = 2,
    HUFFMAN_ENC = 3,
  };

 public:
  // Utility class for working with different string encodings (ascii, huffman, etc)
  struct StrEncoding {
    size_t DecodedSize(std::string_view blob) const;         // Size of decoded blob
    size_t Decode(std::string_view blob, char* dest) const;  // Decode into dest, return size
    StringOrView Decode(std::string_view blob) const;
    // Decode a byte at offset into dest. Return true if decoded successfully,
    // false if idx is out of bounds.
    bool DecodeByte(std::string_view blob, size_t idx, uint8_t* dest) const;

   private:
    friend class CompactObj;
    explicit StrEncoding(uint8_t enc, bool is_key)
        : enc_(static_cast<EncodingEnum>(enc)), is_key_(is_key) {
    }

    size_t DecodedSize(size_t compr_size, uint8_t first_byte) const;

    EncodingEnum enc_;
    bool is_key_;
  };

  using MemoryResource = detail::RobjWrapper::MemoryResource;

  // Different representations of external values
  enum class ExternalRep : uint8_t {
    STRING,         // OBJ_STRING, Basic representation with various string encodings
    SERIALIZED_MAP  // OBJ_HASH, Serialized map
  };

  explicit CompactObj(bool is_key)
      : is_key_{is_key}, taglen_{0}, encoding_{0} {  // default - empty string
  }

  CompactObj(std::string_view str, bool is_key) : CompactObj(is_key) {
    SetString(str);
  }

  CompactObj(CompactObj&& cs) noexcept : CompactObj(cs.is_key_) {
    operator=(std::move(cs));
  };

  ~CompactObj();

  CompactObj& operator=(CompactObj&& o) noexcept;

  // Returns object size depending on the semantics.
  // For strings - returns the length of the string.
  // For containers - returns number of elements in the container.
  size_t Size() const;

  bool IsRef() const {
    return mask_bits_.ref;
  }

  std::string_view GetSlice(std::string* scratch) const;

  std::string ToString() const {
    std::string res;
    GetString(&res);
    return res;
  }

  uint64_t HashCode() const;
  static uint64_t HashCode(std::string_view str);

  bool HasFlag() const {
    return mask_bits_.mc_flag;
  }

  void SetFlag(bool e) {
    mask_bits_.mc_flag = e;
  }

  bool WasTouched() const {
    return mask_bits_.touched;
  }

  void SetTouched(bool e) {
    mask_bits_.touched = e;
  }

  bool DefragIfNeeded(PageUsage* page_usage);

  void SetOmitDefrag(bool v) {
    mask_bits_.omit_defrag = v;
  }

  bool OmitDefrag() const {
    return mask_bits_.omit_defrag;
  }

  bool HasStashPending() const {
    return mask_bits_.io_pending;
  }

  void SetStashPending(bool b) {
    mask_bits_.io_pending = b;
  }

  bool IsSticky() const {
    return mask_bits_.sticky;
  }

  void SetSticky(bool e) {
    mask_bits_.sticky = e;
  }

  unsigned Encoding() const;
  CompactObjType ObjType() const;

  void* RObjPtr() const {
    return u_.r_obj.inner_obj();
  }

  void SetRObjPtr(void* ptr) {
    u_.r_obj.Init(u_.r_obj.type(), u_.r_obj.encoding(), ptr);
  }

  // takes ownership over obj_inner.
  // type should not be OBJ_STRING.
  void InitRobj(CompactObjType type, unsigned encoding, void* obj_inner);

  // For STR object.
  void SetInt(int64_t val);
  std::optional<int64_t> TryGetInt() const;

  void GetString(std::string* res) const;

  void SetString(std::string_view str);
  void ReserveString(size_t size);
  void AppendString(std::string_view str);

  // Will set this to hold OBJ_JSON, after that it is safe to call GetJson
  // NOTE: in order to avid copy which can be expensive in this case,
  // you need to move an object that created with the function JsonFromString
  // into here, no copying is allowed!
  void SetJson(JsonType&& j);
  void SetJson(const uint8_t* buf, size_t len);
  // Adjusts the size used by json
  void SetJsonSize(int64_t size);
  // Adjusts the size used by a stream
  void AddStreamSize(int64_t size);

  // pre condition - the type here is OBJ_JSON and was set with SetJson
  JsonType* GetJson() const;

  void SetSBF(SBF* sbf) {
    SetMeta(SBF_TAG);
    u_.sbf = sbf;
  }

  void SetSBF(uint64_t initial_capacity, double fp_prob, double grow_factor);
  SBF* GetSBF() const;

  void SetTOPK(TOPK* topk) {
    SetMeta(TOPK_TAG);
    u_.topk = topk;
  }

  void SetTOPK(uint32_t k, uint32_t width, uint32_t depth, double decay);
  TOPK* GetTOPK() const;

  void SetCMS(CMS* cms) {
    SetMeta(CMS_TAG);
    u_.cms = cms;
  }

  void SetCMS(uint32_t width, uint32_t depth);
  CMS* GetCMS() const;

  // dest must have at least Size() bytes available
  void GetString(char* dest) const;

  bool IsExternal() const {
    return taglen_ == EXTERNAL_TAG;
  }

  // returns true if the value is stored in the cooling storage. Cooling storage has an item both
  // on disk and in memory.
  bool IsCool() const {
    assert(IsExternal());
    return u_.ext_ptr.is_cool;
  }

  void SetExternal(size_t offset, uint32_t sz, ExternalRep rep);
  ExternalRep GetExternalRep() const;

  // Switches to empty, non-external string.
  // Preserves all the attributes.
  void RemoveExternal() {
    encoding_ = NONE_ENC;
    SetMeta(0, mask_);
  }

  // Assigns a cooling record to the object together with its external slice.
  void SetCool(size_t offset, uint32_t serialized_size, ExternalRep rep,
               tiering::TieredCoolRecord* record);

  struct CoolItem {
    uint16_t page_offset;
    size_t serialized_size;
    tiering::TieredCoolRecord* record;
  };

  // Prerequisite: IsCool() is true.
  // Returns the external data of the object incuding its ColdRecord.
  CoolItem GetCool() const;

  // Prequisite: IsCool() is true.
  // Keeps cool record only as external value and discard in-memory part.
  void Freeze(size_t offset, size_t sz);

  std::pair<size_t, size_t> GetExternalSlice() const;

  // Injects either the the raw string (extracted with GetRawString()) or the usual string
  // back to the compact object. In the latter case, encoding is performed.
  // Precondition: The object must be in the EXTERNAL state.
  // Postcondition: The object is an in-memory string.
  void Materialize(std::string_view str, bool is_raw);

  // Returns the approximation of memory used by the object.
  // If slow is true, may use more expensive methods to calculate the precise size.
  size_t MallocUsed(bool slow = false) const;

  // Resets the object to empty state (string).
  void Reset();

  bool IsInline() const {
    return taglen_ <= kInlineLen;
  }

  uint8_t GetFirstByte() const;
  // Returns true if the byte was decoded successfully, false if idx is out of bounds.
  bool GetByteAtIndex(size_t idx, uint8_t* res) const;
  // Returns a pair of booleans: {success, in_place}. success is false if offset is out of bounds
  // in_place is true if the byte was set without needing to rewrite the string.
  std::pair<bool, bool> SetByteAtIndex(size_t idx, uint8_t val);

  struct Stats {
    size_t small_string_bytes = 0;
    uint64_t huff_encode_total = 0, huff_encode_success = 0;
  };

  static Stats GetStatsThreadLocal();
  static void InitThreadLocal(MemoryResource* mr);

  enum HuffmanDomain : uint8_t {
    HUFF_KEYS = 0,
    HUFF_STRING_VALUES = 1,
    // TODO: add more domains.
  };

  static bool InitHuffmanThreadLocal(HuffmanDomain domain, std::string_view hufftable);
  static MemoryResource* memory_resource();  // thread-local.

  template <typename T, typename... Args> static T* AllocateMR(Args&&... args) {
    void* ptr = memory_resource()->allocate(sizeof(T), alignof(T));
    if constexpr (std::is_constructible_v<T, decltype(memory_resource())> && sizeof...(args) == 0)
      return new (ptr) T{memory_resource()};
    else
      return new (ptr) T{std::forward<Args>(args)...};
  }

  template <typename T> static void DeleteMR(void* ptr) {
    T* t = (T*)ptr;
    t->~T();
    memory_resource()->deallocate(ptr, sizeof(T), alignof(T));
  }

  // Return raw (non-decoded) string as two views. First is guaranteed to be non-empty.
  // Precondition: the object is a non-inline string.
  std::array<std::string_view, 2> GetRawString() const;

  StrEncoding GetStrEncoding() const {
    return StrEncoding{encoding_, is_key_};
  }

  bool HasAllocated() const;

  bool TagAllowsEmptyValue() const;

  uint8_t Tag() const {
    return taglen_;
  }

 private:
  // Returns a string_view corresponding to the serialized encoded blob.
  // If opt_dest is provided, it may be used to decode directly into the destination buffer.
  std::string_view GetEncodedBlob(StrEncoding str_encoding, char* opt_dest) const;

 protected:
  void EncodeString(std::string_view str);

  // Requires: HasAllocated() - true.
  void Free();

  bool CmpEncoded(std::string_view sv) const;
  bool CmpNonInline(std::string_view sv) const;

  void SetMeta(uint8_t taglen, uint8_t mask = 0) {
    if (HasAllocated()) {
      Free();
    } else {
      memset(u_.inline_str, 0, kInlineLen);
    }
    taglen_ = taglen;
    mask_ = mask;
  }

  struct ExternalPtr {
    uint32_t serialized_size;
    uint16_t page_offset;  // 0 for multi-page blobs. != 0 for small blobs.
    uint8_t is_cool : 1;
    uint8_t representation : 2;  // See ExternalRep
    uint8_t is_reserved : 5;
    uint8_t first_byte;

    // We do not have enough space in the common area to store page_index together with
    // cool_record pointer. Therefore, we moved this field into TieredCoolRecord itself.
    struct Offload {
      uint32_t page_index;
      uint32_t reserved;
    };

    union {
      Offload offload;
      tiering::TieredCoolRecord* cool_record;
    };
  } __attribute__((packed));
  static_assert(sizeof(ExternalPtr) == 16);

  struct SdsTtlString {
    char* sds_ptr;    // SDS string (length via sdslen)
    uint64_t exp_ms;  // absolute expiry time in ms

    std::string_view view() const;
  } __attribute__((packed));

  struct JsonConsT {
    JsonType* json_ptr;
    size_t bytes_used;

    bool DefragIfNeeded(PageUsage* page_usage);
  };

  struct FlatJsonT {
    uint32_t json_len;
    uint8_t* flat_ptr;

    bool DefragIfNeeded(PageUsage* page_usage);
  };

  struct JsonWrapper {
    union {
      JsonConsT cons;
      FlatJsonT flat;
    };

    bool DefragIfNeeded(PageUsage* page_usage);
  };

  // Union of different representations
  union U {
    char inline_str[kInlineLen];

    SmallString small_str;
    detail::RobjWrapper r_obj;

    // using 'packed' to reduce alignment of U to 1.
    JsonWrapper json_obj __attribute__((packed));
    SBF* sbf __attribute__((packed));
    TOPK* topk __attribute__((packed));
    CMS* cms __attribute__((packed));
    int64_t ival __attribute__((packed));
    ExternalPtr ext_ptr;
    SdsTtlString sds_ttl;

    U() : r_obj() {
    }
  } u_;

  static_assert(sizeof(u_) == 16);

  union {
    uint8_t mask_ = 0;
    struct {
      uint8_t ref : 1;      // Mark objects that don't own their allocation.
      uint8_t expire : 1;   // Mark objects that have expiry timestamp assigned.
      uint8_t mc_flag : 1;  // Marks keys that have memcache flags assigned.

      // IO_PENDING is set when the tiered storage has issued an i/o request to save the value.
      // It is cleared when the io request finishes or is cancelled.
      uint8_t io_pending : 1;
      uint8_t sticky : 1;

      // TOUCHED used to determin which items are hot/cold.
      // by checking if the item was touched from the last time we
      // reached this item while travering the database to set items as cold.
      // https://junchengyang.com/publication/nsdi24-SIEVE.pdf
      uint8_t touched : 1;  // used to mark keys that were accessed.

      uint8_t omit_defrag : 1;  // mark object to skip defragmentation.
    } mask_bits_;
  };

  // TODO: use c++20 bitfield initializers
  const bool is_key_ : 1;
  uint8_t taglen_ : 5;    // Either length of inline string or tag of type
  uint8_t encoding_ : 2;  // Encoding of string values
};

struct CompactKey : public CompactObj {
  CompactKey() : CompactObj(true) {
  }

  explicit CompactKey(std::string_view str) : CompactObj{str, true} {
  }

  CompactKey AsRef() const {
    CompactKey res;
    memcpy(&res.u_, &u_, sizeof(u_));
    res.encoding_ = encoding_;
    res.taglen_ = taglen_;
    res.mask_ = mask_;
    res.mask_bits_.ref = 1;

    return res;
  }

  bool HasExpire() const {
    return mask_bits_.expire;
  }

  void SetExpire(bool e) {
    mask_bits_.expire = e;
  }

  // Embed expire time directly in the key by converting to SDS_TTL_TAG.
  void SetExpireTime(uint64_t abs_ms);

  // Remove embedded expire time and convert back to optimal string form.
  bool ClearExpireTime();

  // Read the embedded expire time.
  // Returns 0 if there is no embedded expire time, otherwise
  // returns the absolute expire time in ms.
  uint64_t GetExpireTime() const;

  CompactKey& operator=(std::string_view sv) noexcept {
    SetString(sv);
    return *this;
  }

  bool operator==(const CompactKey& o) const;

  bool operator==(std::string_view sl) const;

  bool operator!=(std::string_view sl) const {
    return !(*this == sl);
  }

  friend bool operator!=(const CompactKey& lhs, const CompactKey& rhs) {
    return !(lhs == rhs);
  }

  friend bool operator==(std::string_view sl, const CompactKey& o) {
    return o.operator==(sl);
  }
};

inline bool CompactKey::operator==(std::string_view sv) const {
  if (encoding_)
    return CmpEncoded(sv);

  if (IsInline()) {
    return std::string_view{u_.inline_str, taglen_} == sv;
  }
  return CmpNonInline(sv);
}

struct CompactValue : public CompactObj {
  CompactValue() : CompactObj(false) {
  }

  explicit CompactValue(std::string_view str) : CompactObj{str, false} {
  }
};

std::string_view ObjTypeToString(CompactObjType type);

// Returns kInvalidCompactObjType if sv is not a valid type.
CompactObjType ObjTypeFromString(std::string_view sv);

stream* streamNew();
void freeStream(stream* s);

}  // namespace dfly


================================================
FILE: src/core/compact_object_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/compact_object.h"

#include <absl/functional/overload.h>
#include <absl/strings/str_cat.h>
#include <gtest/gtest.h>
#include <mimalloc.h>
#include <xxhash.h>

#include <cstddef>
#include <random>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/bitpacking.h"
#include "core/huff_coder.h"
#include "core/mi_memory_resource.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/string_map.h"
#include "core/string_set.h"

extern "C" {
#include "redis/intset.h"
#include "redis/redis_aux.h"
#include "redis/stream.h"
#include "redis/zmalloc.h"
}

namespace dfly {

XXH64_hash_t kSeed = 24061983;
constexpr size_t kRandomStartIndex = 24;
constexpr size_t kRandomStep = 26;
constexpr float kUnderUtilizedRatio = 1.0f;  // ensure that we would detect
using namespace std;
using namespace jsoncons;
using namespace jsoncons::jsonpath;

void PrintTo(const CompactObj& cobj, std::ostream* os) {
  if (cobj.ObjType() == OBJ_STRING) {
    *os << "'" << cobj.ToString() << "' ";
    return;
  }
  *os << "cobj: [" << cobj.ObjType() << "]";
}

// This is for the mimalloc test - being able to find an address in memory
// where we have memory underutilzation
// see issue number 448 (https://github.com/dragonflydb/dragonfly/issues/448)
std::vector<void*> AllocateForTest(int size, std::size_t allocate_size, int factor1 = 1,
                                   int factor2 = 1) {
  const int kAllocRandomChangeSize = 13;  // just some random value
  std::vector<void*> ptrs;
  for (int index = 0; index < size; index++) {
    auto alloc_size =
        index % kAllocRandomChangeSize == 0 ? allocate_size * factor1 : allocate_size * factor2;
    auto heap_alloc = mi_heap_get_backing();
    void* ptr = mi_heap_malloc(heap_alloc, alloc_size);
    ptrs.push_back(ptr);
  }
  return ptrs;
}

bool HasUnderutilizedMemory(const std::vector<void*>& ptrs, float ratio) {
  PageUsage page_usage{CollectPageStats::NO, ratio};
  auto it = std::find_if(ptrs.begin(), ptrs.end(), [&](auto p) {
    int r = p && page_usage.IsPageForObjectUnderUtilized(p);
    return r > 0;
  });
  return it != ptrs.end();
}

// Go over ptrs vector and free memory at locations every "steps".
// This is so that we will trigger the under utilization - some
// pages will have "holes" in them and we are expecting to find these pages.
void DeallocateAtRandom(size_t steps, std::vector<void*>* ptrs) {
  for (size_t i = kRandomStartIndex; i < ptrs->size(); i += steps) {
    mi_free(ptrs->at(i));
    ptrs->at(i) = nullptr;
  }
}

static void InitThreadStructs() {
  auto* tlh = mi_heap_get_backing();
  init_zmalloc_threadlocal(tlh);
  SmallString::InitThreadLocal(tlh);
  thread_local MiMemoryResource mi_resource(tlh);
  CompactObj::InitThreadLocal(&mi_resource);
  InitTLStatelessAllocMR(&mi_resource);
};

static void CheckEverythingDeallocated() {
  mi_heap_collect(mi_heap_get_backing(), true);

  auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                     size_t block_size, void* arg) {
    LOG(ERROR) << "Unfreed allocations: block_size " << block_size
               << ", allocated: " << area->used * block_size;
    return true;
  };

  mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit,
                       nullptr);
}

class CompactObjectTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    InitRedisTables();  // to initialize server struct.

    InitThreadStructs();
  }

  static void TearDownTestSuite() {
    CheckEverythingDeallocated();
    CleanupStatelessAllocMR();
  }

  CompactValue cobj_;
  CompactKey ckey_;
  string tmp_;
};

TEST_F(CompactObjectTest, WastedMemoryDetection) {
  size_t allocated = 0, commited = 0, wasted = 0;
  // By setting the threshold to high value we are expecting
  // To find locations where we have wasted memory
  float ratio = 0.8;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, 0);
  EXPECT_EQ(commited, 0);
  EXPECT_EQ(wasted, (commited - allocated));

  std::size_t allocated_mem = 64;
  auto* myheap = mi_heap_get_backing();

  void* p1 = mi_heap_malloc(myheap, 64);

  void* ptrs_end[50];
  for (size_t i = 0; i < 50; ++i) {
    ptrs_end[i] = mi_heap_malloc(myheap, 128);
    allocated_mem += 128;
  }

  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, allocated_mem);
  EXPECT_GT(commited, allocated_mem);
  EXPECT_EQ(wasted, (commited - allocated));
  void* ptr[50];
  // allocate 50
  for (size_t i = 0; i < 50; ++i) {
    ptr[i] = mi_heap_malloc(myheap, 256);
    allocated_mem += 256;
  }

  // At this point all the blocks has committed > 0 and used > 0
  // and since we expecting to find these locations, the size of
  // wasted == commited memory - allocated memory.
  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, allocated_mem);
  EXPECT_GT(commited, allocated_mem);
  EXPECT_EQ(wasted, (commited - allocated));

  // free 50/50 -
  for (size_t i = 0; i < 50; ++i) {
    mi_free(ptr[i]);
    allocated_mem -= 256;
  }

  // After all the memory at block size 256 is free, we would have commited there
  // but the used is expected to be 0, so the number now is different from the
  // case above
  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, allocated_mem);
  EXPECT_GT(commited, allocated_mem);
  // since we release all 256 memory block, it should not be counted
  EXPECT_EQ(wasted, (commited - allocated));
  for (size_t i = 0; i < 50; ++i) {
    mi_free(ptrs_end[i]);
  }
  mi_free(p1);

  // Now that its all freed, we are not expecting to have any wasted memory any more
  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, 0);
  EXPECT_GT(commited, allocated);
  EXPECT_EQ(wasted, (commited - allocated));

  mi_collect(false);
}

TEST_F(CompactObjectTest, WastedMemoryDontCount) {
  // The commited memory per blocks are:
  // 64bit => 4K
  // 128bit => 8k
  // 256 => 16k
  // and so on, which mean every n * sizeof(ptr) ^ 2 == 2^11*2*(n-1) (where n starts with 1)
  constexpr std::size_t kExpectedFor256MemWasted = 0x4000;  // memory block 256
  auto* myheap = mi_heap_get_backing();

  size_t allocated = 0, commited = 0, wasted = 0;
  // By setting the threshold to a very low number
  // we don't expect to find and locations where memory is wasted
  float ratio = 0.01;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  EXPECT_EQ(allocated, 0);
  EXPECT_EQ(commited, 0);
  EXPECT_EQ(wasted, (commited - allocated));

  std::size_t allocated_mem = 64;

  void* p1 = mi_heap_malloc(myheap, 64);

  void* ptrs_end[50];
  for (size_t i = 0; i < 50; ++i) {
    ptrs_end[i] = mi_heap_malloc(myheap, 128);
    (void)p1;
    allocated_mem += 128;
  }

  void* ptr[50];

  // allocate 50
  for (size_t i = 0; i < 50; ++i) {
    ptr[i] = mi_heap_malloc(myheap, 256);
    allocated_mem += 256;
  }
  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);
  // Threshold is low so we are not expecting any wasted memory to be found.
  EXPECT_EQ(allocated, allocated_mem);
  EXPECT_GT(commited, allocated_mem);
  EXPECT_EQ(wasted, 0);

  // free 50/50 -
  for (size_t i = 0; i < 50; ++i) {
    mi_free(ptr[i]);
    allocated_mem -= 256;
  }
  allocated = commited = wasted = 0;
  zmalloc_get_allocator_wasted_blocks(ratio, &allocated, &commited, &wasted);

  EXPECT_EQ(allocated, allocated_mem);
  EXPECT_GT(commited, allocated_mem);
  // We will detect only wasted memory for block size of
  // 256 - and all of it is wasted.
  EXPECT_EQ(wasted, kExpectedFor256MemWasted);
  // Threshold is low so we are not expecting any wasted memory to be found.
  for (size_t i = 0; i < 50; ++i) {
    mi_free(ptrs_end[i]);
  }
  mi_free(p1);

  mi_collect(false);
}

TEST_F(CompactObjectTest, NonInline) {
  string s(22, 'a');
  CompactKey obj{s};

  uint64_t expected_val = XXH3_64bits_withSeed(s.data(), s.size(), kSeed);
  EXPECT_EQ(18261733907982517826UL, expected_val);
  EXPECT_EQ(expected_val, obj.HashCode());
  EXPECT_EQ(s, obj);

  s.assign(25, 'b');
  obj.SetString(s);
  EXPECT_EQ(s, obj);
  EXPECT_EQ(s.size(), obj.Size());
}

TEST_F(CompactObjectTest, InlineAsciiEncoded) {
  string s = "key:0000000000000";
  uint64_t expected_val = XXH3_64bits_withSeed(s.data(), s.size(), kSeed);
  CompactValue obj{s};
  EXPECT_EQ(expected_val, obj.HashCode());
  EXPECT_EQ(s.size(), obj.Size());
}

TEST_F(CompactObjectTest, Int) {
  ckey_.SetString("0");
  EXPECT_EQ(0, ckey_.TryGetInt());
  EXPECT_EQ(1, ckey_.Size());
  EXPECT_EQ(ckey_, "0");
  EXPECT_EQ("0", ckey_.GetSlice(&tmp_));
  EXPECT_EQ(OBJ_STRING, ckey_.ObjType());
}

TEST_F(CompactObjectTest, Expire) {
  CompactKey key;
  key.SetExpire(true);
  key.SetString("42");
  EXPECT_EQ(8181779779123079347, key.HashCode());
  EXPECT_EQ(OBJ_ENCODING_INT, key.Encoding());
  EXPECT_EQ(2, key.Size());
  EXPECT_TRUE(key.HasExpire());
}

TEST_F(CompactObjectTest, SdsTtlTag) {
  // 1. Inline key + SetTtl
  {
    CompactKey key("hello");
    ASSERT_TRUE(key.IsInline());
    uint64_t hash_before = key.HashCode();

    key.SetExpireTime(1000);
    EXPECT_TRUE(key.HasExpire());
    EXPECT_EQ(1000, key.GetExpireTime());
    EXPECT_EQ(hash_before, key.HashCode());
    EXPECT_TRUE(key == string_view("hello"));
    EXPECT_EQ(5, key.Size());
    EXPECT_EQ(OBJ_STRING, key.ObjType());

    string slice;
    EXPECT_EQ("hello", key.GetSlice(&slice));
    EXPECT_GT(key.MallocUsed(), 0u);
  }

  // 2. INT_TAG key + SetTtl
  {
    CompactKey key("42");
    ASSERT_TRUE(key.TryGetInt().has_value());
    uint64_t hash_before = key.HashCode();

    key.SetExpireTime(2000);
    EXPECT_TRUE(key.HasExpire());
    EXPECT_EQ(2000, key.GetExpireTime());
    EXPECT_TRUE(key == string_view("42"));
    EXPECT_EQ(hash_before, key.HashCode());
    // No longer INT_TAG — TryGetInt should return nullopt.
    EXPECT_FALSE(key.TryGetInt().has_value());
  }

  // 3. SMALL_TAG key + SetTtl
  {
    string s(64, 'x');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    CompactKey key(s);
    uint64_t hash_before = key.HashCode();

    key.SetExpireTime(3000);
    EXPECT_TRUE(key.HasExpire());
    EXPECT_EQ(3000, key.GetExpireTime());
    EXPECT_TRUE(key == string_view(s));
    EXPECT_EQ(hash_before, key.HashCode());
    EXPECT_EQ(s.size(), key.Size());
  }

  // 4. ROBJ_TAG key + SetExpireTime
  {
    string s(512, 'z');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = static_cast<char>(128 + (i % 128));
    CompactKey key(s);
    uint64_t hash_before = key.HashCode();

    key.SetExpireTime(4000);
    EXPECT_TRUE(key.HasExpire());
    EXPECT_EQ(4000, key.GetExpireTime());
    EXPECT_TRUE(key == string_view(s));
    EXPECT_EQ(hash_before, key.HashCode());
    EXPECT_EQ(s.size(), key.Size());
  }

  // 5. ExpireTime update in-place
  {
    CompactKey key("hello");
    key.SetExpireTime(1000);
    EXPECT_EQ(1000, key.GetExpireTime());

    key.SetExpireTime(2000);
    EXPECT_EQ(2000, key.GetExpireTime());
    EXPECT_TRUE(key == string_view("hello"));
  }

  // 6. ClearTtl (inline recovery)
  {
    CompactKey key("hello");
    key.SetExpireTime(1000);
    EXPECT_TRUE(key.ClearExpireTime());

    EXPECT_FALSE(key.HasExpire());
    EXPECT_TRUE(key.IsInline());
    EXPECT_TRUE(key == string_view("hello"));
  }

  // 7. ClearTtl (INT recovery)
  {
    CompactKey key("42");
    key.SetExpireTime(1000);
    EXPECT_TRUE(key.ClearExpireTime());
    EXPECT_FALSE(key.HasExpire());
    EXPECT_TRUE(key.TryGetInt().has_value());
    EXPECT_EQ(42, key.TryGetInt().value());
  }

  // 8. ClearTtl (SMALL recovery)
  {
    string s(64, 'x');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    CompactKey key(s);
    key.SetExpireTime(1000);
    EXPECT_TRUE(key.ClearExpireTime());
    EXPECT_FALSE(key.HasExpire());
    EXPECT_TRUE(key == string_view(s));
  }

  // 9. Move semantics
  {
    CompactKey a("test");
    a.SetExpireTime(100);
    CompactKey b(std::move(a));
    EXPECT_TRUE(b.HasExpire());
    EXPECT_EQ(100, b.GetExpireTime());
    EXPECT_TRUE(b == string_view("test"));
  }

  // 10. Free/destructor — just verify no leaks (TearDown catches them).
  {
    CompactKey key("hello");
    key.SetExpireTime(5000);
  }

  // 11. Cross-tag operator== (SDS_TTL_TAG vs inline/INT_TAG).
  {
    CompactKey a("hello");
    CompactKey b("hello");
    b.SetExpireTime(999);
    // b is SDS_TTL_TAG, a is inline — must compare equal as OBJ_STRING.
    EXPECT_TRUE(a == b);
    EXPECT_TRUE(b == a);

    CompactKey c("42");
    CompactKey d("42");
    d.SetExpireTime(1);
    EXPECT_TRUE(c == d);
    EXPECT_TRUE(d == c);

    // Different content must not compare equal.
    CompactKey e("world");
    e.SetExpireTime(1);
    EXPECT_FALSE(a == e);
  }
}

TEST_F(CompactObjectTest, MediumString) {
  string tmp(511, 'b');

  cobj_.SetString(tmp);
  EXPECT_EQ(tmp.size(), cobj_.Size());

  cobj_.SetString(tmp);
  EXPECT_EQ(tmp.size(), cobj_.Size());
  cobj_.Reset();

  tmp.assign(27463, 'c');
  cobj_.SetString(tmp);
  EXPECT_EQ(27463, cobj_.Size());
}

TEST_F(CompactObjectTest, AsciiUtil) {
  std::string_view data{"aaaaaabb"};
  uint8_t buf[32];

  char outbuf[32] = "xxxxxxxxxxxxxx";
  detail::ascii_pack_simd(data.data(), 7, buf);
  detail::ascii_unpack_simd(buf, 7, outbuf);

  ASSERT_EQ('x', outbuf[7]) << outbuf;
  std::string_view actual{outbuf, 7};
  ASSERT_EQ(data.substr(0, 7), actual);

  string data3;
  for (unsigned i = 0; i < 13; ++i) {
    data3.append("12345678910");
  }
  string act_str(data3.size(), 'y');
  std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
  detail::ascii_pack_simd2(data3.data(), data3.size(), binvec.data());
  detail::ascii_unpack_simd(binvec.data(), data3.size(), act_str.data());

  ASSERT_EQ(data3, act_str);
}

TEST_F(CompactObjectTest, AsciiPackByte) {
  // Test ascii_pack_byte and ascii_unpack_byte for correctness.
  for (size_t len : {8, 16, 24, 31, 32, 33, 64, 100}) {
    string original(len, 'a');
    for (size_t i = 0; i < len; ++i)
      original[i] = 'A' + (i % 26);

    size_t packed_len = detail::binpacked_len(len);
    vector<uint8_t> packed(packed_len);
    detail::ascii_pack(original.data(), len, packed.data());

    // Verify initial pack/unpack round-trip at byte level.
    for (size_t i = 0; i < len; ++i) {
      uint8_t got = detail::ascii_unpack_byte(packed.data(), len, i);
      ASSERT_EQ(static_cast<uint8_t>(original[i]), got) << "len=" << len << " offset=" << i;
    }

    // Now set each byte to a different value via ascii_pack_byte, verify round-trip.
    for (size_t i = 0; i < len; ++i) {
      uint8_t new_val = 'a' + ((i + 3) % 26);

      // Pack the full string, then modify one byte.
      vector<uint8_t> modified(packed);
      detail::ascii_pack_byte(modified.data(), len, i, new_val);

      // The modified byte should read back correctly.
      uint8_t got = detail::ascii_unpack_byte(modified.data(), len, i);
      EXPECT_EQ(new_val, got) << "len=" << len << " set offset=" << i;

      // All other bytes should be unchanged.
      for (size_t j = 0; j < len; ++j) {
        if (j == i)
          continue;
        uint8_t other = detail::ascii_unpack_byte(modified.data(), len, j);
        EXPECT_EQ(static_cast<uint8_t>(original[j]), other)
            << "len=" << len << " set offset=" << i << " check offset=" << j;
      }
    }

    // Test setting all bytes to zero (edge case: clearing bits).
    {
      vector<uint8_t> zeroed(packed);
      string expected = original;
      for (size_t i = 0; i < len; ++i) {
        detail::ascii_pack_byte(zeroed.data(), len, i, 0);
        expected[i] = '\0';
      }
      for (size_t i = 0; i < len; ++i) {
        uint8_t got = detail::ascii_unpack_byte(zeroed.data(), len, i);
        EXPECT_EQ(0, got) << "len=" << len << " zero check offset=" << i;
      }
    }

    // Test setting all bytes to 0x7F (all bits set in 7-bit ASCII).
    {
      vector<uint8_t> maxed(packed);
      for (size_t i = 0; i < len; ++i) {
        detail::ascii_pack_byte(maxed.data(), len, i, 0x7F);
      }
      for (size_t i = 0; i < len; ++i) {
        uint8_t got = detail::ascii_unpack_byte(maxed.data(), len, i);
        EXPECT_EQ(0x7F, got) << "len=" << len << " max check offset=" << i;
      }
    }
  }
}

TEST_F(CompactObjectTest, IntSet) {
  intset* is = intsetNew();
  cobj_.InitRobj(OBJ_SET, kEncodingIntSet, is);

  EXPECT_EQ(0, cobj_.Size());
  is = (intset*)cobj_.RObjPtr();
  uint8_t success = 0;

  is = intsetAdd(is, 10, &success);
  EXPECT_EQ(1, success);
  is = intsetAdd(is, 10, &success);
  EXPECT_EQ(0, success);
  cobj_.SetRObjPtr(is);

  EXPECT_GT(cobj_.MallocUsed(), 0);
}

TEST_F(CompactObjectTest, ZSet) {
  // unrelated, checking that sds static encoding works.
  // it is used in zset special strings.
  char kMinStrData[] =
      "\110"
      "minstring";
  EXPECT_EQ(9, sdslen(kMinStrData + 1));

  cobj_.InitRobj(OBJ_ZSET, OBJ_ENCODING_LISTPACK, lpNew(0));

  EXPECT_EQ(OBJ_ZSET, cobj_.ObjType());
  EXPECT_EQ(OBJ_ENCODING_LISTPACK, cobj_.Encoding());
}

TEST_F(CompactObjectTest, Hash) {
  uint8_t* lp = lpNew(0);
  lp = lpAppend(lp, reinterpret_cast<const uint8_t*>("foo"), 3);
  lp = lpAppend(lp, reinterpret_cast<const uint8_t*>("barrr"), 5);
  cobj_.InitRobj(OBJ_HASH, kEncodingListPack, lp);
  EXPECT_EQ(OBJ_HASH, cobj_.ObjType());
  EXPECT_EQ(1, cobj_.Size());
}

TEST_F(CompactObjectTest, SBF) {
  cobj_.SetSBF(1000, 0.001, 2);
  EXPECT_EQ(cobj_.ObjType(), OBJ_SBF);
  EXPECT_GT(cobj_.MallocUsed(), 0);
}

TEST_F(CompactObjectTest, MimallocUnderutilzation) {
  // We are testing with the same object size allocation here
  // This test is for https://github.com/dragonflydb/dragonfly/issues/448
  size_t allocation_size = 94;
  int count = 2000;
  std::vector<void*> ptrs = AllocateForTest(count, allocation_size);
  bool found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_FALSE(found);
  DeallocateAtRandom(kRandomStep, &ptrs);
  found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_TRUE(found);
  for (auto* ptr : ptrs) {
    mi_free(ptr);
  }
}

TEST_F(CompactObjectTest, MimallocUnderutilzationDifferentSizes) {
  // This test uses different objects sizes to cover more use cases
  // related to issue https://github.com/dragonflydb/dragonfly/issues/448
  size_t allocation_size = 97;
  int count = 2000;
  int mem_factor_1 = 3;
  int mem_factor_2 = 2;
  std::vector<void*> ptrs = AllocateForTest(count, allocation_size, mem_factor_1, mem_factor_2);
  bool found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_FALSE(found);
  DeallocateAtRandom(kRandomStep, &ptrs);
  found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_TRUE(found);
  for (auto* ptr : ptrs) {
    mi_free(ptr);
  }
}

TEST_F(CompactObjectTest, MimallocUnderutilzationWithRealloc) {
  // This test is checking underutilzation with reallocation as well as deallocation
  // of the memory - see issue https://github.com/dragonflydb/dragonfly/issues/448
  size_t allocation_size = 102;
  int count = 2000;
  int mem_factor_1 = 4;
  int mem_factor_2 = 1;

  std::vector<void*> ptrs = AllocateForTest(count, allocation_size, mem_factor_1, mem_factor_2);
  bool found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_FALSE(found);
  DeallocateAtRandom(kRandomStep, &ptrs);

  //  This is another case, where we are filling the "gaps" by doing re-allocations
  //  in this case, since we are not setting all the values back it should still have
  //  places that are not used. Plus since we are not looking at the first page
  //  other pages should be underutilized.
  for (size_t i = kRandomStartIndex; i < ptrs.size(); i += kRandomStep) {
    if (!ptrs[i]) {
      ptrs[i] = mi_heap_malloc(mi_heap_get_backing(), allocation_size);
    }
  }
  found = HasUnderutilizedMemory(ptrs, kUnderUtilizedRatio);
  ASSERT_TRUE(found);
  for (auto* ptr : ptrs) {
    mi_free(ptr);
  }
}

TEST_F(CompactObjectTest, JsonTypeTest) {
  using namespace jsoncons;
  // This test verify that we can set a json type
  // and that we "know", it JSON and not a string
  std::string_view json_str = R"(
    {"firstName":"John","lastName":"Smith","age":27,"weight":135.25,"isAlive":true,
    "address":{"street":"21 2nd Street","city":"New York","state":"NY","zipcode":"10021-3100"},
    "phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":"646 555-4567"}],
    "children":[],"spouse":null}
  )";
  std::optional<JsonType> json_option2 =
      ParseJsonUsingShardHeap(R"({"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}})");

  cobj_.SetString(json_str);
  ASSERT_TRUE(cobj_.ObjType() == OBJ_STRING);  // we set this as a string
  JsonType* failed_json = cobj_.GetJson();
  ASSERT_TRUE(failed_json == nullptr);
  ASSERT_TRUE(cobj_.ObjType() == OBJ_STRING);
  std::optional<JsonType> json_option = ParseJsonUsingShardHeap(json_str);
  ASSERT_TRUE(json_option.has_value());
  cobj_.SetJson(std::move(json_option.value()));
  ASSERT_TRUE(cobj_.ObjType() == OBJ_JSON);  // and now this is a JSON type
  JsonType* json = cobj_.GetJson();
  ASSERT_TRUE(json != nullptr);
  ASSERT_TRUE(json->contains("firstName"));
  // set second object make sure that we don't have any memory issue
  ASSERT_TRUE(json_option2.has_value());
  cobj_.SetJson(std::move(json_option2.value()));
  ASSERT_TRUE(cobj_.ObjType() == OBJ_JSON);  // still is a JSON type
  json = cobj_.GetJson();
  ASSERT_TRUE(json != nullptr);
  ASSERT_TRUE(json->contains("b"));
  ASSERT_FALSE(json->contains("firstName"));
  std::optional<JsonType> set_array = ParseJsonUsingShardHeap("");
  // now set it to string again
  cobj_.SetString(R"({"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}})");
  ASSERT_TRUE(cobj_.ObjType() == OBJ_STRING);  // we set this as a string
  failed_json = cobj_.GetJson();
  ASSERT_TRUE(failed_json == nullptr);
}

TEST_F(CompactObjectTest, JsonTypeWithPathTest) {
  std::string_view books_json =
      R"({"books":[{
            "category": "fiction",
            "title" : "A Wild Sheep Chase",
            "author" : "Haruki Murakami"
        },{
            "category": "fiction",
            "title" : "The Night Watch",
            "author" : "Sergei Lukyanenko"
        },{
            "category": "fiction",
            "title" : "The Comedians",
            "author" : "Graham Greene"
        },{
            "category": "memoir",
            "title" : "The Night Watch",
            "author" : "Phillips, David Atlee"
        }]})";
  std::optional<JsonType> json_array = ParseJsonUsingShardHeap(books_json);
  ASSERT_TRUE(json_array.has_value());
  cobj_.SetJson(std::move(json_array.value()));
  ASSERT_TRUE(cobj_.ObjType() == OBJ_JSON);  // and now this is a JSON type
  auto f = [](const auto& /*path*/, JsonType& book) {
    if (book.at("category") == "memoir" && !book.contains("price")) {
      book.try_emplace("price", 140.0);
    }
  };
  JsonType* json = cobj_.GetJson();
  ASSERT_TRUE(json != nullptr);
  auto allocator_set = jsoncons::combine_allocators(json->get_allocator());
  jsonpath::json_replace(allocator_set, *json, "$.books[*]"sv, f);

  // Check whether we've changed the entry for json in place
  // we should have prices only for memoir books
  JsonType* json2 = cobj_.GetJson();
  ASSERT_TRUE(json != nullptr);
  ASSERT_TRUE(json->contains("books"));
  for (auto&& book : (*json2)["books"].array_range()) {
    // make sure that we add prices only to "memoir"
    if (book.at("category") == "memoir") {
      ASSERT_TRUE(book.contains("price"));
    } else {
      ASSERT_FALSE(book.contains("price"));
    }
  }
}

// Test listpack defragmentation.
// StringMap has built-in defragmantation that is tested in its own test suite.
TEST_F(CompactObjectTest, DefragHash) {
  auto build_str = [](size_t i) { return string(111, 'v') + to_string(i); };

  vector<uint8_t*> lps(10'00);

  for (size_t i = 0; i < lps.size(); i++) {
    uint8_t* lp = lpNew(100);
    for (size_t j = 0; j < 100; j++) {
      auto s = build_str(j);
      lp = lpAppend(lp, reinterpret_cast<const unsigned char*>(s.data()), s.length());
    }
    DCHECK_EQ(lpLength(lp), 100u);
    lps[i] = lp;
  }

  for (size_t i = 0; i < lps.size(); i++) {
    if (i % 10 == 0)
      continue;
    lpFree(lps[i]);
  }

  // Find a listpack that is located on a underutilized page
  uint8_t* target_lp = nullptr;
  PageUsage page_usage{CollectPageStats::NO, 0.8};
  for (size_t i = 0; i < lps.size(); i += 10) {
    if (page_usage.IsPageForObjectUnderUtilized(lps[i]))
      target_lp = lps[i];
  }
  CHECK_NE(target_lp, nullptr);

  // Trigger re-allocation
  cobj_.InitRobj(OBJ_HASH, kEncodingListPack, target_lp);
  ASSERT_TRUE(cobj_.DefragIfNeeded(&page_usage));

  // Check the pointer changes as the listpack needed defragmentation
  auto lp = (uint8_t*)cobj_.RObjPtr();
  EXPECT_NE(lp, target_lp) << "must have changed due to realloc";

  uint8_t* fptr = lpFirst(lp);
  for (size_t i = 0; i < 100; i++) {
    int64_t len;
    auto* s = lpGet(fptr, &len, nullptr);

    string_view sv{reinterpret_cast<const char*>(s), static_cast<uint64_t>(len)};
    EXPECT_EQ(sv, build_str(i));

    fptr = lpNext(lp, fptr);
  }

  for (size_t i = 0; i < lps.size(); i += 10) {
    if (lps[i] != target_lp)
      lpFree(lps[i]);
  }
}

TEST_F(CompactObjectTest, DefragSet) {
  // This is still not implemented
  StringSet* s = CompactObj::AllocateMR<StringSet>();
  s->Add("str");
  cobj_.InitRobj(OBJ_SET, kEncodingStrMap2, s);
  PageUsage page_usage{CollectPageStats::NO, 0.8};
  ASSERT_FALSE(cobj_.DefragIfNeeded(&page_usage));
}

TEST_F(CompactObjectTest, StrEncodingAndMaterialize) {
  for (bool ascii : {true, false}) {
    for (size_t len : {64, 128, 256, 512, 1024}) {
      string test_str(len, 'a');
      for (size_t i = 0; i < len; i++)
        test_str[i] = char('a' + (i % 10));
      if (!ascii)
        test_str.push_back(char(200));  // non-ascii

      CompactValue obj;
      obj.SetString(test_str);

      // Test StrEncoding helper
      auto strs = obj.GetRawString();
      string raw_str = string{strs[0]} + string{strs[1]};
      CompactObj::StrEncoding enc = obj.GetStrEncoding();
      EXPECT_EQ(test_str, enc.Decode(raw_str).Take());

      // Test Materialize
      obj.SetExternal(0, 0, CompactObj::ExternalRep::STRING);  // dummy values
      obj.Materialize(raw_str, true);
      EXPECT_EQ(test_str, obj.ToString());

      // Restore from external again, but not as a raw value
      obj.SetExternal(0, 0, CompactObj::ExternalRep::STRING);
      auto test_str2 = test_str + "updated";
      obj.Materialize(test_str2, false);
      EXPECT_EQ(obj.ToString(), test_str2);
    }
  }
}

TEST_F(CompactObjectTest, ExternalRepresentation) {
  {
    CompactValue obj;
    obj.SetString("test");
    obj.SetExternal(0, 4, CompactObj::ExternalRep::STRING);
    EXPECT_EQ(obj.ObjType(), OBJ_STRING);
  }
  {
    StringMap sm{};
    CompactValue obj;
    obj.SetRObjPtr(&sm);
    obj.SetExternal(0, 4, CompactObj::ExternalRep::SERIALIZED_MAP);
    EXPECT_EQ(obj.ObjType(), OBJ_HASH);
  }
}

TEST_F(CompactObjectTest, AsanTriggerReadOverflow) {
  cobj_.SetString(string(32, 'a'));
  auto dest = make_unique<char[]>(32);
  cobj_.GetString(dest.get());
}

TEST_F(CompactObjectTest, lpGetInteger) {
  int64_t val = -1;
  uint8_t* lp = lpNew(0);
  for (int j = 0; j < 60; ++j) {
    lp = lpAppendInteger(lp, val);
    val *= 2;
  }
  val = 1;
  for (int j = 0; j < 600; ++j) {
    string str(j * 500, 'a');
    lp = lpAppend(lp, reinterpret_cast<const uint8_t*>(str.data()), str.size());
  }
  uint8_t* ptr = lpFirst(lp);
  while (ptr) {
    int64_t len1, len2;
    uint8_t* val1 = lpGet(ptr, &len1, nullptr);
    int res = lpGetInteger(ptr, &len2);
    if (res) {
      ASSERT_EQ(len1, len2);
      ASSERT_TRUE(val1 == NULL);
    } else {
      ASSERT_TRUE(val1 != NULL);
    }
    ptr = lpNext(lp, ptr);
  }
  lpFree(lp);
}

static void BuildEncoderAB(HuffmanEncoder* encoder) {
  array<unsigned, 256> hist;
  hist.fill(1);
  hist['a'] = 100;
  hist['b'] = 50;
  CHECK(encoder->Build(hist.data(), hist.size() - 1, nullptr));
}

TEST_F(CompactObjectTest, Huffman) {
  HuffmanEncoder encoder;
  BuildEncoderAB(&encoder);
  string bindata = encoder.Export();

  for (CompactObj::HuffmanDomain domain : {CompactObj::HUFF_KEYS, CompactObj::HUFF_STRING_VALUES}) {
    ASSERT_TRUE(CompactObj::InitHuffmanThreadLocal(domain, bindata));
    for (unsigned i = 30; i < 2048; i += 10) {
      string data(i, 'a');

      variant<CompactKey, CompactValue> obj_backing;
      if (domain)
        obj_backing = CompactValue{};
      auto& cobj = visit([&](auto& co) -> CompactObj& { return co; }, obj_backing);

      visit([&](auto& co) { co.SetString(data); }, obj_backing);
      bool malloc_used = i >= 60;
      ASSERT_EQ(malloc_used, cobj.MallocUsed() > 0) << i;
      ASSERT_EQ(data.size(), cobj.Size());
      ASSERT_EQ(CompactObj::HashCode(data), cobj.HashCode());

      string actual;
      cobj.GetString(&actual);
      EXPECT_EQ(data, actual);
      visit(absl::Overload{[&](CompactKey& co) { EXPECT_EQ(co, data); }, [&](CompactValue& co) {}},
            obj_backing);
    }
  }
}

TEST_F(CompactObjectTest, GetByteAtOffset) {
  // Inline string (INLINE_TAG)
  {
    string s = "hello";
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(s[i], res) << "inline offset " << i;
    }
  }

  // Integer-encoded string (INT_TAG)
  {
    cobj_.SetString("12345");
    string expected = "12345";
    for (size_t i = 0; i < expected.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(expected[i], res) << "int offset " << i;
    }
  }

  //  ASCII string with SMALL_TAG
  {
    string s(64, 'x');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "long ascii offset " << i;
    }
  }

  // Non-ASCII string with SMALL_TAG
  {
    string s(64, '\xC0');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = static_cast<char>(128 + (i % 128));
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "non-ascii offset " << i;
    }
  }

  // ASCII string ROBJ_TAG
  {
    string s(512, 'z');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'A' + (i % 26);
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "medium offset " << i;
    }
  }

  // Non-ASCII string ROBJ_TAG
  {
    string s(512, 'z');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = static_cast<char>(128 + (i % 128));
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "medium offset " << i;
    }
  }

  cobj_.Reset();
}

TEST_F(CompactObjectTest, SetByteAtOffset) {
  // Inline string (INLINE_TAG)
  {
    string s = "abcde";
    cobj_.SetString(s);
    for (size_t i = 0; i < s.size(); ++i) {
      std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(i, 'Z');
      EXPECT_TRUE(res_set_byte.first);
      EXPECT_TRUE(res_set_byte.second);
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ('Z', res) << "inline set offset " << i;
    }
    // All bytes should now be 'Z'
    string result;
    cobj_.GetString(&result);
    EXPECT_EQ(string(5, 'Z'), result);
  }

  // Integer-encoded string (INT_TAG)
  {
    cobj_.SetString("999");
    std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(0, 'x');
    EXPECT_TRUE(res_set_byte.first);
    // We didn't modify in-place, SetString is called
    EXPECT_FALSE(res_set_byte.second);
    string result;
    cobj_.GetString(&result);
    EXPECT_EQ("x99", result);
  }

  // ASCII string with SMALL_TAG
  {
    string s(64, 'a');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    cobj_.SetString(s);

    // Modify every 10th byte
    for (size_t i = 0; i < s.size(); i += 10) {
      std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(i, '!');
      EXPECT_TRUE(res_set_byte.first);
      EXPECT_FALSE(res_set_byte.second);
      s[i] = '!';
    }

    // Verify all bytes
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "long ascii set offset " << i;
    }
  }

  // Non-ASCII string with SMALL_TAG
  {
    string s(64, '\x80');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = static_cast<char>(128 + (i % 128));
    cobj_.SetString(s);

    std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(63, 0xFF);
    EXPECT_TRUE(res_set_byte.first);
    EXPECT_FALSE(res_set_byte.second);
    s[63] = '\xFF';

    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "non-ascii set offset " << i;
    }
  }

  // ASCII string with ROBJ_TAG
  {
    string s(512, 'a');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    cobj_.SetString(s);

    // Modify every 10th byte
    for (size_t i = 0; i < s.size(); i += 10) {
      std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(i, '!');
      EXPECT_TRUE(res_set_byte.first);
      EXPECT_TRUE(res_set_byte.second);
      s[i] = '!';
    }

    // Verify all bytes
    for (size_t i = 0; i < s.size(); ++i) {
      uint8_t res = 0;
      EXPECT_TRUE(cobj_.GetByteAtIndex(i, &res));
      EXPECT_EQ(static_cast<uint8_t>(s[i]), res) << "long ascii set offset " << i;
    }
  }

  // ASCII string with ROBJ_TAG modified to non-ASCII
  {
    string s(512, 'a');
    for (size_t i = 0; i < s.size(); ++i)
      s[i] = 'a' + (i % 26);
    cobj_.SetString(s);

    // Modify in-place ascii packed string
    std::pair<bool, bool> res_set_byte = cobj_.SetByteAtIndex(0, 'A');
    EXPECT_TRUE(res_set_byte.first);
    EXPECT_TRUE(res_set_byte.second);

    // Adding non-ascii byte modification should still succeed, but not in-place
    res_set_byte = cobj_.SetByteAtIndex(255, 0xFF);
    EXPECT_TRUE(res_set_byte.first);
    EXPECT_FALSE(res_set_byte.second);

    // Modification of non-ascii ROBJ string should succeed and in-place
    res_set_byte = cobj_.SetByteAtIndex(511, 'C');
    EXPECT_TRUE(res_set_byte.first);
    EXPECT_TRUE(res_set_byte.second);

    uint8_t res;
    EXPECT_TRUE(cobj_.GetByteAtIndex(0, &res));
    EXPECT_EQ('A', res);
    EXPECT_TRUE(cobj_.GetByteAtIndex(255, &res));
    EXPECT_EQ(0xFF, res);
    EXPECT_TRUE(cobj_.GetByteAtIndex(511, &res));
    EXPECT_EQ('C', res);
  }

  // Out-of-bounds access should be handled gracefully.
  {
    string s = "abc";
    cobj_.SetString(s);
    // SetByteAtIndex: index equal to size() is out-of-bounds.
    auto res_pair = cobj_.SetByteAtIndex(s.size(), 'X');
    EXPECT_FALSE(res_pair.first);
    EXPECT_FALSE(res_pair.second);
    // GetByteAtIndex: out-of-bounds should set result to 0.
    uint8_t res = 123;  // sentinel non-zero value
    EXPECT_FALSE(cobj_.GetByteAtIndex(s.size(), &res));
    EXPECT_EQ(0u, res);
  }

  cobj_.Reset();
}

static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
  const char* end = ascii + len;

  unsigned i = 0;
  while (ascii + 8 <= end) {
    for (i = 0; i < 7; ++i) {
      *bin++ = (ascii[0] >> i) | (ascii[1] << (7 - i));
      ++ascii;
    }
    ++ascii;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (ascii < end) {
    *bin++ = *ascii++;
  }
}

static void BM_PackNaive(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  while (state.KeepRunning()) {
    ascii_pack_naive(val.data(), val.size(), buf);
  }
}
BENCHMARK(BM_PackNaive);

static void BM_Pack(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  while (state.KeepRunning()) {
    detail::ascii_pack(val.data(), val.size(), buf);
  }
}
BENCHMARK(BM_Pack);

static void BM_PackSimd(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  while (state.KeepRunning()) {
    detail::ascii_pack_simd(val.data(), val.size(), buf);
  }
}
BENCHMARK(BM_PackSimd);

static void BM_PackSimd2(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  while (state.KeepRunning()) {
    detail::ascii_pack_simd2(val.data(), val.size(), buf);
  }
}
BENCHMARK(BM_PackSimd2);

static void BM_Unpack(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  detail::ascii_pack(val.data(), val.size(), buf);

  while (state.KeepRunning()) {
    detail::ascii_unpack(buf, val.size(), val.data());
  }
}
BENCHMARK(BM_Unpack);

static void BM_UnpackSimd(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  detail::ascii_pack(val.data(), val.size(), buf);

  while (state.KeepRunning()) {
    detail::ascii_unpack_simd(buf, val.size(), val.data());
  }
}
BENCHMARK(BM_UnpackSimd);

static void BM_LpCompare(benchmark::State& state) {
  std::mt19937_64 rd;
  uint8_t* lp = lpNew(0);
  for (unsigned i = 0; i < 100; ++i) {
    lp = lpAppendInteger(lp, rd() % (1ULL << 48));
  }

  string val = absl::StrCat(1ULL << 49);
  while (state.KeepRunning()) {
    uint8_t* elem = lpLast(lp);
    while (elem) {
      lpCompare(elem, reinterpret_cast<const uint8_t*>(val.data()), val.size());
      elem = lpPrev(lp, elem);
    }
  }
  lpFree(lp);
}
BENCHMARK(BM_LpCompare);

static void BM_LpCompareInt(benchmark::State& state) {
  std::mt19937_64 rd;
  uint8_t* lp = lpNew(0);
  for (unsigned i = 0; i < 100; ++i) {
    lp = lpAppendInteger(lp, rd() % (1ULL << 48));
  }

  int64_t val = 1ULL << 49;
  while (state.KeepRunning()) {
    uint8_t* elem = lpLast(lp);
    int64_t sz;
    while (elem) {
      DCHECK_NE(0xFF, *elem);
      lpGetInteger(elem, &sz);
      int res = sz == val;
      benchmark::DoNotOptimize(res);
      elem = lpPrev(lp, elem);
    }
  }
  lpFree(lp);
}
BENCHMARK(BM_LpCompareInt);

static void BM_LpGet(benchmark::State& state) {
  unsigned version = state.range(0);
  uint8_t* lp = lpNew(0);
  int64_t val = -1;
  for (unsigned i = 0; i < 60; ++i) {
    lp = lpAppendInteger(lp, val);
    val *= 2;
  }

  while (state.KeepRunning()) {
    uint8_t* elem = lpLast(lp);
    int64_t ival;
    if (version == 1) {
      while (elem) {
        unsigned char* value = lpGet(elem, &ival, NULL);
        benchmark::DoNotOptimize(value);
        elem = lpPrev(lp, elem);
      }
    } else {
      while (elem) {
        int res = lpGetInteger(elem, &ival);
        benchmark::DoNotOptimize(res);
        elem = lpPrev(lp, elem);
      }
    }
  }
  lpFree(lp);
}
BENCHMARK(BM_LpGet)->Arg(1)->Arg(2);

extern "C" int lpStringToInt64(const char* s, unsigned long slen, int64_t* value);

static void BM_LpString2Int(benchmark::State& state) {
  int version = state.range(0);
  std::mt19937_64 rd;
  vector<string> values;
  for (unsigned i = 0; i < 1000; ++i) {
    int64_t val = rd();
    values.push_back(absl::StrCat(val));
  }

  int64_t ival = 0;
  while (state.KeepRunning()) {
    for (const auto& val : values) {
      int res = version == 1 ? lpStringToInt64(val.data(), val.size(), &ival)
                             : absl::SimpleAtoi(val, &ival);
      benchmark::DoNotOptimize(res);
    }
  }
}
BENCHMARK(BM_LpString2Int)->Arg(1)->Arg(2);

}  // namespace dfly


================================================
FILE: src/core/dash.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <vector>

#include "absl/random/random.h"
#include "base/pmr/memory_resource.h"
#include "core/dash_internal.h"

namespace dfly {

// DASH: Dynamic And Scalable Hashing.

template <typename _Key, typename _Value, typename Policy>
class DashTable : public detail::DashTableBase {
  DashTable(const DashTable&) = delete;
  DashTable& operator=(const DashTable&) = delete;

  using Base = detail::DashTableBase;
  using SegmentType = detail::Segment<_Key, _Value, Policy>;
  using SegmentIterator = typename SegmentType::Iterator;

 public:
  using Key_t = _Key;
  using Value_t = _Value;
  using Segment_t = SegmentType;

  //! Total number of buckets in a segment (including stash).
  static constexpr double kTaxAmount = SegmentType::kTaxSize;
  static constexpr size_t kSegBytes = sizeof(SegmentType);

  // How many bytes the non-stash part is taking.
  static constexpr size_t kSegRegularBytes =
      kSegBytes - (SegmentType::kStashBucketNum * SegmentType::kBucketSz);

  static constexpr size_t kSegCapacity = SegmentType::capacity();
  static constexpr size_t kSlotNum = SegmentType::kSlotNum;
  static constexpr size_t kBucketNum = SegmentType::kBucketNum;

  // if IsSingleBucket is true - iterates only over a single bucket.
  template <bool IsConst, bool IsSingleBucket = false> class Iterator;

  using const_iterator = Iterator<true>;
  using iterator = Iterator<false>;

  using const_bucket_iterator = Iterator<true, true>;
  using bucket_iterator = Iterator<false, true>;
  using Cursor = detail::DashCursor;

  struct HotBuckets {
    static constexpr size_t kRegularBuckets = 4;
    static constexpr size_t kNumBuckets = kRegularBuckets + SegmentType::kStashBucketNum;

    struct ByType {
      bucket_iterator regular_buckets[kRegularBuckets];
      bucket_iterator stash_buckets[SegmentType::kStashBucketNum];
    };

    union Probes {
      ByType by_type;
      bucket_iterator arr[kNumBuckets];

      Probes() : arr() {
      }
    } probes;

    // id must be in the range [0, kNumBuckets).
    bucket_iterator at(unsigned id) const {
      return probes.arr[id];
    }

    unsigned num_buckets;
    // key_hash of a key that we try to insert.
    // I use it as pseudo-random number in my gc/eviction heuristics.
    uint64_t key_hash;
  };

  struct DefaultEvictionPolicy {
    static constexpr bool can_gc = false;
    static constexpr bool can_evict = false;

    bool CanGrow(const DashTable&) {
      return true;
    }

    void OnMove(Cursor source, Cursor dest) {
    }

    void RecordSplit(SegmentType* segment) {
    }
    /*
       /// Required interface in case can_gc is true
       // Returns number of garbage collected items deleted. 0 - means nothing has been
       // deleted.
       unsigned GarbageCollect(const EvictionBuckets& eb, DashTable* me) const {
         return 0;
       }

       // Required interface in case can_gc is true
       // returns number of items evicted from the table.
       // 0 means - nothing has been evicted.
       unsigned Evict(const EvictionBuckets& eb, DashTable* me) {
         return 0;
       }
   */
  };

  DashTable(size_t capacity_log = 1, const Policy& policy = Policy{},
            PMR_NS::memory_resource* mr = PMR_NS::get_default_resource());
  ~DashTable();

  void Reserve(size_t size);

  // false for duplicate, true if inserted.
  template <typename U, typename V> std::pair<iterator, bool> Insert(U&& key, V&& value) {
    DefaultEvictionPolicy policy;
    return InsertInternal(std::forward<U>(key), std::forward<V>(value), policy,
                          InsertMode::kInsertIfNotFound);
  }

  template <typename U, typename V, typename EvictionPolicy>
  std::pair<iterator, bool> Insert(U&& key, V&& value, EvictionPolicy& ev) {
    return InsertInternal(std::forward<U>(key), std::forward<V>(value), ev,
                          InsertMode::kInsertIfNotFound);
  }

  template <typename U, typename V> iterator InsertNew(U&& key, V&& value) {
    DefaultEvictionPolicy policy;
    return InsertNew(std::forward<U>(key), std::forward<V>(value), policy);
  }

  template <typename U, typename V, typename EvictionPolicy>
  iterator InsertNew(U&& key, V&& value, EvictionPolicy& ev) {
    return InsertInternal(std::forward<U>(key), std::forward<V>(value), ev,
                          InsertMode::kForceInsert)
        .first;
  }

  template <typename U> const_iterator Find(U&& key) const;
  template <typename U> iterator Find(U&& key);

  // Prefetches the memory where the key would resize into the cache.
  template <typename U> void Prefetch(U&& key) const;

  // Find first entry with given key hash that evaulates to true on pred.
  // Pred accepts either (const key&) or (const key&, const value&)
  template <typename Pred> iterator FindFirst(uint64_t key_hash, Pred&& pred);

  // it must be valid.
  void Erase(iterator it);

  size_t Erase(const Key_t& k);

  iterator begin() {
    iterator it{this, 0, 0, 0};
    it.Seek2Occupied();
    return it;
  }

  const_iterator cbegin() const {
    const_iterator it{this, 0, 0, 0};
    it.Seek2Occupied();
    return it;
  }

  iterator end() const {
    return iterator{};
  }
  const_iterator cend() const {
    return const_iterator{};
  }

  using Base::depth;
  using Base::Empty;
  using Base::size;
  using Base::unique_segments;

  // Direct access to the segment for debugging purposes.
  Segment_t* GetSegment(unsigned segment_id) {
    return segment_[segment_id];
  }

  // - If there is no buddy for segment_id return segment_id.
  //   Otherwise, return buddy_id.
  // - A buddy is a sibling segment that was created from the
  //   same parent during split and can be merged back together.
  //   It's the adjacent subtree of the same depth.
  unsigned FindBuddyId(unsigned segment_id) {
    auto* seg = GetSegment(segment_id);
    uint8_t depth = seg->local_depth();

    if (depth <= 1) {
      return segment_id;
    }

    const size_t bit_pos = global_depth_ - depth;
    const size_t buddy_idx = segment_id ^ (1u << bit_pos);
    assert(buddy_idx < segment_.size());

    auto* buddy = GetSegment(buddy_idx);
    // There is no adjacent subtree of the same depth
    if (buddy->local_depth() != depth) {
      return segment_id;
    }

    return buddy_idx;
  }

  // - Moves all items from `buddy_id` to `keep_id` (merges the two segments).
  //   After merge completes, `buddy_id` segment is deleted.
  // - Return true if the two segments merged successfully.
  // - If an insertion fails we rollback and abort the merge (return false).
  // - Merge can run only if there are no active snapshots.
  // - Prefer calling this function only when the combined size of both segments
  //   than x * segment_capacity. With x: 0 < x < 0.25 as statistically this won't
  //   trigger rollbacks.
  bool Merge(unsigned keep_id, unsigned buddy_id) {
    auto* keep = GetSegment(keep_id);
    auto* buddy = GetSegment(buddy_id);

    assert((keep->local_depth() == buddy->local_depth()));
    // assert((keep->SlowSize() + buddy->SlowSize() < (0.25 * buddy->capacity())));
    assert(keep->local_depth() != 1);
    assert(keep != buddy);
    assert(keep_id < buddy_id);  // Callers must iterate low to high to ensure correct orientation

    // Don't merge below initial_depth to maintain Clear() invariant
    // After merge, keep will have depth-1, which determines unique_segments
    uint8_t depth_after_merge = keep->local_depth() - 1;
    if (depth_after_merge < initial_depth_) {
      return false;
    }

    bool should_rollback = false;

    // Decrease depth (merge back to parent)
    keep->set_local_depth(keep->local_depth() - 1);

    // Move all items from buddy to keep
    buddy->TraverseAll([&](const auto& it) {
      if (should_rollback) {
        return;
      }

      uint64_t hash = DoHash(buddy->Key(it.index, it.slot));

      auto& src_bucket = buddy->GetBucket(it.index);
      auto res =
          keep->InsertUniq(std::move(src_bucket.key[it.slot]), std::move(src_bucket.value[it.slot]),
                           hash, false, [](auto&&...) {});

      if (!res.found()) {
        should_rollback = true;
        return;
      }

      // Clear the slot in buddy so rollback can reuse the space
      src_bucket.Delete(it.slot);
    });

    if (should_rollback) {
      auto hash_fn = [this](const auto& k) { return policy_.HashFn(k); };
      keep->Split(hash_fn, buddy, [](auto&&...) {});

      return false;
    }

    // Same as Split()
    uint32_t buddy_chunk_size = 1u << (global_depth_ - buddy->local_depth());
    uint32_t buddy_start = buddy_id & ~(buddy_chunk_size - 1u);
    for (size_t i = buddy_start; i < buddy_start + buddy_chunk_size; ++i) {
      segment_[i] = keep;
    }

    // Free buddy segment
    PMR_NS::polymorphic_allocator<SegmentType> pa(segment_.get_allocator());
    using alloc_traits = std::allocator_traits<decltype(pa)>;
    alloc_traits::destroy(pa, buddy);
    alloc_traits::deallocate(pa, buddy, 1);

    // Decrement unique segment counter
    --unique_segments_;
    bucket_count_ -= keep->num_buckets();

    return true;
  }

  size_t GetSegmentCount() const {
    return segment_.size();
  }

  size_t NextSeg(size_t sid) const {
    size_t delta = (1u << (global_depth_ - segment_[sid]->local_depth()));
    return sid + delta;
  }

  template <typename U> uint64_t DoHash(const U& k) const {
    return policy_.HashFn(k);
  }

  // Flat memory usage (allocated) of the table, not including the the memory allocated
  // by the hosted objects.
  size_t mem_usage() const {
    return segment_.capacity() * sizeof(void*) + sizeof(SegmentType) * unique_segments_;
  }

  // Returns the total number of buckets in the table, in contrast to capacity() which
  // returns the total number of slots.
  size_t bucket_count() const {
    return bucket_count_;
  }

  // Overall capacity of the table (including stash buckets) in number of keys.
  size_t capacity() const {
    return bucket_count() * kSlotNum;
  }

  double load_factor() const {
    return double(size()) / capacity();
  }

  static constexpr unsigned LargestBucketId() {
    return SegmentType::kBucketNum + SegmentType::kStashBucketNum - 1;
  }

  // Gets a random cursor based on the available segments and buckets.
  // Returns: cursor with a random position
  Cursor GetRandomCursor(absl::BitGen* bitgen);

  // Traverses over a single logical bucket in table and calls cb(iterator) 0 or more
  // times. if cursor=0 starts traversing from the beginning, otherwise continues from where it
  // stopped. returns 0 if the supplied cursor reached end of traversal. Traverse iterates at bucket
  // logical granularity, which means for each non-empty bucket it calls cb per each entry in the
  // logical bucket before returning. Unlike begin/end interface, traverse is stable during table
  // mutations. It guarantees that if key exists (1)at the beginning of traversal, (2) stays in the
  // table during the traversal, then Traverse() will eventually reach it even when the table
  // shrinks or grows. Returns: cursor that is guaranteed to be less than 2^40.
  template <typename Cb> Cursor Traverse(Cursor curs, Cb&& cb);

  // Traverses over physical buckets. It calls cb once for each bucket by passing a bucket iterator.
  // if cursor=0 starts traversing from the beginning, otherwise continues from where
  // it stopped. returns 0 if the supplied cursor reached end of traversal.
  // Unlike Traverse, TraverseBuckets calls cb once on bucket iterator and not on each entry in
  // bucket. TraverseBuckets is stable during table mutations. It guarantees traversing all buckets
  // that existed at the beginning of traversal.
  template <typename Cb> Cursor TraverseBuckets(Cursor curs, Cb&& cb);

  // Traverses over a single bucket in table and calls cb(iterator). The traverse order will be
  // segment by segment over physical backets.
  // traverse by segment order does not guarantees coverage if the table grows/shrinks, it is useful
  // when formal full coverage is not critically important.
  template <typename Cb> Cursor TraverseBySegmentOrder(Cursor curs, Cb&& cb);

  // Discards slots information.
  static const_bucket_iterator BucketIt(const_iterator it) {
    return const_bucket_iterator{it.owner_, it.seg_id_, it.bucket_id_, 0};
  }

  // Seeks to the first occupied slot if exists in the bucket.
  const_bucket_iterator BucketIt(unsigned segment_id, unsigned bucket_id) const {
    return const_bucket_iterator{this, segment_id, uint8_t(bucket_id)};
  }

  bucket_iterator BucketIt(unsigned segment_id, unsigned bucket_id) {
    return bucket_iterator{this, segment_id, uint8_t(bucket_id)};
  }

  iterator GetIterator(unsigned segment_id, unsigned bucket_id, unsigned slot_id) {
    return iterator{this, segment_id, uint8_t(bucket_id), uint8_t(slot_id)};
  }

  const_bucket_iterator CursorToBucketIt(Cursor c) const {
    return const_bucket_iterator{this, c.segment_id(global_depth_), c.bucket_id(), 0};
  }
  bucket_iterator CursorToBucketIt(Cursor c) {
    return bucket_iterator{this, c.segment_id(global_depth_), c.bucket_id(), 0};
  }

  // Capture Version Change. Runs cb(it) on every bucket! (not entry) in the table whose version
  // would potentially change upon insertion of 'k'.
  // In practice traversal is limited to a single segment. The operation is read-only and
  // simulates insertion process. 'cb' must accept bucket_iterator.
  // Note: the interface a bit hacky.
  // The functions call cb on physical buckets with version smaller than ver_threshold that
  // due to entry movements might update its version to version greater than ver_threshold.
  //
  // These are not const functions because they send non-const iterators that allow
  // updating contents/versions of the passed iterators.
  template <typename U, typename Cb>
  void CVCUponInsert(uint64_t ver_threshold, const U& key, Cb&& cb);

  template <typename Cb> void CVCUponBump(uint64_t ver_threshold, const_iterator it, Cb&& cb);

  void Clear();

  // Returns true if an element was deleted i.e the rightmost slot was busy.
  bool ShiftRight(bucket_iterator it);

  template <typename BumpPolicy> iterator BumpUp(iterator it, BumpPolicy& bp) {
    SegmentIterator seg_it = segment_[it.seg_id_]->BumpUp(
        it.bucket_id_, it.slot_id_, DoHash(it->first), bp,
        [&](uint32_t segment_id, detail::PhysicalBid from, detail::PhysicalBid to) {
          // OnMove is used to notify policy about the items moves across buckets.
          bp.OnMove(Cursor{global_depth_, segment_id, from}, Cursor{global_depth_, segment_id, to});
        });

    return iterator{this, it.seg_id_, seg_it.index, seg_it.slot};
  }

  uint64_t garbage_collected() const {
    return garbage_collected_;
  }

  uint64_t stash_unloaded() const {
    return stash_unloaded_;
  }

 private:
  enum class InsertMode {
    kInsertIfNotFound,
    kForceInsert,
  };

  Cursor AdvanceCursorBucketOrder(Cursor cursor);

  template <typename U, typename V, typename EvictionPolicy>
  std::pair<iterator, bool> InsertInternal(U&& key, V&& value, EvictionPolicy& policy,
                                           InsertMode mode);

  void IncreaseDepth(unsigned new_depth);
  template <typename EvictionPolicy> void Split(uint32_t seg_id, EvictionPolicy& ev);

  // Segment directory contains multiple segment pointers, some of them pointing to
  // the same object. IterateDistinct goes over all distinct segments in the table.
  template <typename Cb> void IterateDistinct(Cb&& cb);

  template <typename K> auto EqPred(const K& key) const {
    return [p = &policy_, &key](const auto& probe) -> bool { return p->Equal(probe, key); };
  }

  SegmentType* ConstructSegment(uint8_t depth, uint32_t id) {
    auto* mr = segment_.get_allocator().resource();
    PMR_NS::polymorphic_allocator<SegmentType> pa(mr);
    SegmentType* res = pa.allocate(1);
    pa.construct(res, depth, id, mr);  //   new SegmentType(depth);
    bucket_count_ += res->num_buckets();
    return res;
  }

  Policy policy_;
  std::vector<SegmentType*, PMR_NS::polymorphic_allocator<SegmentType*>> segment_;

  uint64_t garbage_collected_ = 0;
  uint64_t stash_unloaded_ = 0;
};  // DashTable

template <typename _Key, typename _Value, typename Policy>
template <bool IsConst, bool IsSingleBucket>
class DashTable<_Key, _Value, Policy>::Iterator {
  using Owner = std::conditional_t<IsConst, const DashTable, DashTable>;

  Owner* owner_;
  uint32_t seg_id_;
  detail::PhysicalBid bucket_id_;
  uint8_t slot_id_;

  friend class DashTable;

  Iterator(Owner* me, uint32_t seg_id, detail::PhysicalBid bid, uint8_t sid)
      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(sid) {
  }

  Iterator(Owner* me, uint32_t seg_id, detail::PhysicalBid bid)
      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(0) {
    Seek2Occupied();
  }

 public:
  using iterator_category = std::forward_iterator_tag;
  using difference_type = std::ptrdiff_t;
  using IteratorPairType =
      std::conditional_t<IsConst, detail::IteratorPair<const Key_t, const Value_t>,
                         detail::IteratorPair<Key_t, Value_t>>;

  // Copy constructor from iterator to const_iterator.
  template <bool TIsConst = IsConst, bool TIsSingleB,
            typename std::enable_if<TIsConst>::type* = nullptr>
  Iterator(const Iterator<!TIsConst, TIsSingleB>& other) noexcept
      : owner_(other.owner_),
        seg_id_(other.seg_id_),
        bucket_id_(other.bucket_id_),
        slot_id_(other.slot_id_) {
  }

  // Copy constructor from iterator to bucket_iterator and vice versa.
  template <bool TIsSingle>
  Iterator(const Iterator<IsConst, TIsSingle>& other) noexcept
      : owner_(other.owner_),
        seg_id_(other.seg_id_),
        bucket_id_(other.bucket_id_),
        slot_id_(IsSingleBucket ? 0 : other.slot_id_) {
    // if this - is a bucket_iterator - we reset slot_id to the first occupied space.
    if constexpr (IsSingleBucket) {
      Seek2Occupied();
    }
  }

  Iterator() : owner_(nullptr), seg_id_(0), bucket_id_(0), slot_id_(0) {
  }

  Iterator(const Iterator& other) = default;

  Iterator(Iterator&& other) = default;

  Iterator& operator=(const Iterator& other) = default;
  Iterator& operator=(Iterator&& other) = default;

  // pre
  Iterator& operator++() {
    ++slot_id_;
    Seek2Occupied();
    return *this;
  }

  Iterator& operator+=(int delta) {
    slot_id_ += delta;
    Seek2Occupied();
    return *this;
  }

  Iterator& AdvanceIfNotOccupied() {
    if (!IsOccupied()) {
      this->operator++();
    }
    return *this;
  }

  IteratorPairType operator->() const {
    auto* seg = owner_->segment_[seg_id_];
    return {seg->Key(bucket_id_, slot_id_), seg->Value(bucket_id_, slot_id_)};
  }

  // Make it self-contained. Does not need container::end().
  bool is_done() const {
    return owner_ == nullptr;
  }

  bool IsOccupied() const {
    return (seg_id_ < owner_->segment_.size()) &&
           ((owner_->segment_[seg_id_]->IsBusy(bucket_id_, slot_id_)));
  }

  Owner& owner() const {
    return *owner_;
  }

  template <bool B = Policy::kUseVersion> std::enable_if_t<B, uint64_t> GetVersion() const {
    assert(owner_ && seg_id_ < owner_->segment_.size());
    return owner_->segment_[seg_id_]->GetVersion(bucket_id_);
  }

  template <bool B = Policy::kUseVersion> std::enable_if_t<B> SetVersion(uint64_t v) {
    return owner_->segment_[seg_id_]->SetVersion(bucket_id_, v);
  }

  friend bool operator==(const Iterator& lhs, const Iterator& rhs) {
    if (lhs.owner_ == nullptr && rhs.owner_ == nullptr)
      return true;
    return lhs.owner_ == rhs.owner_ && lhs.seg_id_ == rhs.seg_id_ &&
           lhs.bucket_id_ == rhs.bucket_id_ && lhs.slot_id_ == rhs.slot_id_;
  }

  friend bool operator!=(const Iterator& lhs, const Iterator& rhs) {
    return !(lhs == rhs);
  }

  // Bucket resolution cursor that is safe to use with insertions/removals.
  // Serves as a hint really to the placement of the original item, i.e. the item
  // could have moved.
  detail::DashCursor bucket_cursor() const {
    return detail::DashCursor(owner_->global_depth_, seg_id_, bucket_id_);
  }

  detail::PhysicalBid bucket_id() const {
    return bucket_id_;
  }

  // Returns the unique address of the physical bucket as an integer.
  // Stable for the lifetime of a serialization (mutations that could trigger
  // segment splits are blocked while a snapshot version is registered).
  uintptr_t bucket_address() const {
    assert(owner_ && seg_id_ < owner_->segment_.size());
    return reinterpret_cast<uintptr_t>(&owner_->segment_[seg_id_]->GetBucket(bucket_id_));
  }

  unsigned slot_id() const {
    return slot_id_;
  }

  unsigned segment_id() const {
    return seg_id_;
  }

 private:
  void Seek2Occupied();
};  // Iterator

/**
  _____                 _                           _        _   _
 |_   _|               | |                         | |      | | (_)
   | |  _ __ ___  _ __ | | ___ _ __ ___   ___ _ __ | |_ __ _| |_ _  ___  _ __
   | | | '_ ` _ \| '_ \| |/ _ \ '_ ` _ \ / _ \ '_ \| __/ _` | __| |/ _ \| '_ \
  _| |_| | | | | | |_) | |  __/ | | | | |  __/ | | | || (_| | |_| | (_) | | | |
 |_____|_| |_| |_| .__/|_|\___|_| |_| |_|\___|_| |_|\__\__,_|\__|_|\___/|_| |_|
                 | |
                 |_|

**/

template <typename _Key, typename _Value, typename Policy>
template <bool IsConst, bool IsSingleBucket>
void DashTable<_Key, _Value, Policy>::Iterator<IsConst, IsSingleBucket>::Seek2Occupied() {
  if (owner_ == nullptr)
    return;
  assert(seg_id_ < owner_->segment_.size());

  if constexpr (IsSingleBucket) {
    const auto& b = owner_->segment_[seg_id_]->GetBucket(bucket_id_);
    uint32_t mask = b.GetBusy() >> slot_id_;
    if (mask) {
      int slot = __builtin_ctz(mask);
      slot_id_ += slot;
      return;
    }
  } else {
    while (seg_id_ < owner_->segment_.size()) {
      auto seg_it = owner_->segment_[seg_id_]->FindValidStartingFrom(bucket_id_, slot_id_);
      if (seg_it.found()) {
        bucket_id_ = seg_it.index;
        slot_id_ = seg_it.slot;
        return;
      }
      seg_id_ = owner_->NextSeg(seg_id_);
      bucket_id_ = slot_id_ = 0;
    }
  }
  owner_ = nullptr;
}

template <typename _Key, typename _Value, typename Policy>
DashTable<_Key, _Value, Policy>::DashTable(size_t capacity_log, const Policy& policy,
                                           PMR_NS::memory_resource* mr)
    : Base(capacity_log), policy_(policy), segment_(mr) {
  segment_.resize(unique_segments_);

  // I assume we have enough memory to create the initial table and do not check allocations.
  for (uint32_t i = 0; i < segment_.size(); ++i) {
    segment_[i] = ConstructSegment(global_depth_, i);  //   new SegmentType(global_depth_);
  }
}

template <typename _Key, typename _Value, typename Policy>
DashTable<_Key, _Value, Policy>::~DashTable() {
  Clear();
  auto* resource = segment_.get_allocator().resource();
  PMR_NS::polymorphic_allocator<SegmentType> pa(resource);
  using alloc_traits = std::allocator_traits<decltype(pa)>;

  IterateDistinct([&](SegmentType* seg) {
    alloc_traits::destroy(pa, seg);
    alloc_traits::deallocate(pa, seg, 1);
    return false;
  });
}

template <typename _Key, typename _Value, typename Policy>
template <typename U, typename Cb>
void DashTable<_Key, _Value, Policy>::CVCUponInsert(uint64_t ver_threshold, const U& key, Cb&& cb) {
  uint64_t key_hash = DoHash(key);
  uint32_t seg_id = SegmentId(key_hash);
  assert(seg_id < segment_.size());
  const SegmentType* target = segment_[seg_id];

  uint8_t bids[2];
  unsigned num_touched = target->CVCOnInsert(ver_threshold, key_hash, bids);
  if (num_touched < UINT16_MAX) {
    for (unsigned i = 0; i < num_touched; ++i) {
      cb(bucket_iterator{this, seg_id, bids[i]});
    }
    return;
  }

  // Segment is full, we need to return the whole segment, because it can be split
  // and its entries can be reshuffled into different buckets.
  for (uint8_t i = 0; i < target->num_buckets(); ++i) {
    if (target->GetVersion(i) < ver_threshold && !target->GetBucket(i).IsEmpty()) {
      cb(bucket_iterator{this, seg_id, i});
    }
  }
}

template <typename _Key, typename _Value, typename Policy>
template <typename Cb>
void DashTable<_Key, _Value, Policy>::CVCUponBump(uint64_t ver_upperbound, const_iterator it,
                                                  Cb&& cb) {
  uint64_t key_hash = DoHash(it->first);
  uint32_t seg_id = it.segment_id();
  assert(seg_id < segment_.size());
  const SegmentType* target = segment_[seg_id];

  uint8_t bids[3];
  unsigned num_touched =
      target->CVCOnBump(ver_upperbound, it.bucket_id(), it.slot_id(), key_hash, bids);

  for (unsigned i = 0; i < num_touched; ++i) {
    cb(bucket_iterator{this, seg_id, bids[i]});
  }
}

template <typename _Key, typename _Value, typename Policy>
void DashTable<_Key, _Value, Policy>::Clear() {
  auto cb = [this](SegmentType* seg) {
    seg->TraverseAll([this, seg](const SegmentIterator& it) {
      policy_.DestroyKey(seg->Key(it.index, it.slot));
      policy_.DestroyValue(seg->Value(it.index, it.slot));
    });
    seg->Clear();
    return false;
  };

  IterateDistinct(cb);
  size_ = 0;

  // Consider the following case: table with 8 segments overall, 4 distinct.
  // S1, S1, S1, S1, S2, S3, S4, S4
  /* This corresponds to the tree:
            R
          /  \
        S1   /\
            /\ S4
           S2 S3
     We want to collapse this tree into, say, 2 segment directory.
     That means we need to keep S1, S2 but delete S3, S4.
     That means, we need to move representative segments until we reached the desired size
     and then erase all other distinct segments.
  **********/
  if (global_depth_ > initial_depth_) {
    PMR_NS::polymorphic_allocator<SegmentType> pa(segment_.get_allocator());
    using alloc_traits = std::allocator_traits<decltype(pa)>;

    size_t dest = 0, src = 0;
    size_t new_size = (1 << initial_depth_);
    bucket_count_ = 0;
    while (src < segment_.size()) {
      auto* seg = segment_[src];
      size_t next_src = NextSeg(src);  // must do before because NextSeg is dependent on seg.
      if (dest < new_size) {
        seg->set_local_depth(initial_depth_);
        bucket_count_ += seg->num_buckets();
        segment_[dest++] = seg;
      } else {
        alloc_traits::destroy(pa, seg);
        alloc_traits::deallocate(pa, seg, 1);
      }

      src = next_src;
    }

    global_depth_ = initial_depth_;
    unique_segments_ = new_size;
    segment_.resize(new_size);
  }
}

template <typename _Key, typename _Value, typename Policy>
bool DashTable<_Key, _Value, Policy>::ShiftRight(bucket_iterator it) {
  auto* seg = segment_[it.seg_id_];

  typename Segment_t::Hash_t hash_val = 0;
  auto& bucket = seg->GetBucket(it.bucket_id_);

  if (bucket.GetBusy() & (1 << (kSlotNum - 1))) {
    it.slot_id_ = kSlotNum - 1;
    hash_val = DoHash(it->first);
    policy_.DestroyKey(it->first);
    policy_.DestroyValue(it->second);
  }

  bool deleted = seg->ShiftRight(it.bucket_id_, hash_val);
  size_ -= unsigned(deleted);

  return deleted;
}

template <typename _Key, typename _Value, typename Policy>
template <typename Cb>
void DashTable<_Key, _Value, Policy>::IterateDistinct(Cb&& cb) {
  size_t i = 0;
  while (i < segment_.size()) {
    auto* seg = segment_[i];
    size_t next_id = NextSeg(i);
    if (cb(seg))
      break;
    i = next_id;
  }
}

template <typename _Key, typename _Value, typename Policy>
template <typename U>
auto DashTable<_Key, _Value, Policy>::Find(U&& key) const -> const_iterator {
  uint64_t key_hash = DoHash(key);
  uint32_t seg_id = SegmentId(key_hash);  // seg_id takes up global_depth_ high bits.

  // Hash structure is like this: [SSUUUUBF], where S is segment id, U - unused,
  // B - bucket id and F is a fingerprint. Segment id is needed to identify the correct segment.
  // Once identified, the segment instance uses the lower part of hash to locate the key.
  // It uses 8 least significant bits for a fingerprint and few more bits for bucket id.
  if (auto seg_it = segment_[seg_id]->FindIt(key_hash, EqPred(key)); seg_it.found()) {
    return {this, seg_id, seg_it.index, seg_it.slot};
  }
  return {};
}

template <typename _Key, typename _Value, typename Policy>
template <typename U>
auto DashTable<_Key, _Value, Policy>::Find(U&& key) -> iterator {
  return FindFirst(DoHash(key), EqPred(key));
}

template <typename _Key, typename _Value, typename Policy>
template <typename U>
void DashTable<_Key, _Value, Policy>::Prefetch(U&& key) const {
  uint64_t key_hash = DoHash(key);
  uint32_t seg_id = SegmentId(key_hash);
  segment_[seg_id]->Prefetch(key_hash);
}

template <typename _Key, typename _Value, typename Policy>
template <typename Pred>
auto DashTable<_Key, _Value, Policy>::FindFirst(uint64_t key_hash, Pred&& pred) -> iterator {
  uint32_t seg_id = SegmentId(key_hash);
  if (auto seg_it = segment_[seg_id]->FindIt(key_hash, pred); seg_it.found()) {
    return {this, seg_id, seg_it.index, seg_it.slot};
  }
  return {};
}

template <typename _Key, typename _Value, typename Policy>
size_t DashTable<_Key, _Value, Policy>::Erase(const Key_t& key) {
  uint64_t key_hash = DoHash(key);
  size_t x = SegmentId(key_hash);
  auto* target = segment_[x];
  auto it = target->FindIt(key_hash, EqPred(key));
  if (!it.found())
    return 0;

  policy_.DestroyKey(target->Key(it.index, it.slot));
  policy_.DestroyValue(target->Value(it.index, it.slot));
  target->Delete(it, key_hash);
  --size_;

  return 1;
}

template <typename _Key, typename _Value, typename Policy>
void DashTable<_Key, _Value, Policy>::Erase(iterator it) {
  auto* target = segment_[it.seg_id_];
  uint64_t key_hash = DoHash(it->first);
  SegmentIterator sit{it.bucket_id_, it.slot_id_};

  policy_.DestroyKey(it->first);
  policy_.DestroyValue(it->second);

  target->Delete(sit, key_hash);
  --size_;
}

template <typename _Key, typename _Value, typename Policy>
void DashTable<_Key, _Value, Policy>::Reserve(size_t size) {
  if (size <= capacity())
    return;

  size_t sg_floor = (size - 1) / SegmentType::capacity();
  if (sg_floor < segment_.size()) {
    return;
  }
  assert(sg_floor > 1u);
  unsigned new_depth = 1 + (63 ^ __builtin_clzll(sg_floor));

  IncreaseDepth(new_depth);
}

template <typename _Key, typename _Value, typename Policy>
template <typename U, typename V, typename EvictionPolicy>
auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, EvictionPolicy& ev,
                                                     InsertMode mode) -> std::pair<iterator, bool> {
  uint64_t key_hash = DoHash(key);
  uint32_t target_seg_id = SegmentId(key_hash);

  while (true) {
    // Keep last global_depth_ msb bits of the hash.
    assert(target_seg_id < segment_.size());
    SegmentType* target = segment_[target_seg_id];

    // Load heap allocated segment data - to avoid TLB miss when accessing the bucket.
    __builtin_prefetch(target, 0, 1);

    typename SegmentType::Iterator it;
    bool res = true;
    unsigned num_buckets = target->num_buckets();

    auto move_cb = [&](uint32_t segment_id, detail::PhysicalBid from, detail::PhysicalBid to) {
      // OnMove is used to notify policy about the move of items across buckets.
      ev.OnMove(Cursor{global_depth_, segment_id, from}, Cursor{global_depth_, segment_id, to});
    };

    if (mode == InsertMode::kForceInsert) {
      it =
          target->InsertUniq(std::forward<U>(key), std::forward<V>(value), key_hash, true, move_cb);
      res = it.found();
    } else {
      std::tie(it, res) = target->Insert(std::forward<U>(key), std::forward<V>(value), key_hash,
                                         EqPred(key), move_cb);
    }

    if (res) {  // success
      // in case segment bucket count changed, we need to update total bucket count.
      bucket_count_ += (target->num_buckets() - num_buckets);
      ++size_;
      return std::make_pair(iterator{this, target_seg_id, it.index, it.slot}, true);
    }

    /*duplicate insert, insertion failure*/
    if (it.found()) {
      return std::make_pair(iterator{this, target_seg_id, it.index, it.slot}, false);
    }

    bool consider_throw = true;

    // At this point we must split the segment.
    // try garbage collect or evict.
    if constexpr (EvictionPolicy::can_evict || EvictionPolicy::can_gc) {
      // Try gc.
      uint8_t bid[HotBuckets::kRegularBuckets];
      SegmentType::FillProbeArray(key_hash, bid);
      HotBuckets hotspot;
      hotspot.key_hash = key_hash;

      for (unsigned j = 0; j < HotBuckets::kRegularBuckets; ++j) {
        hotspot.probes.by_type.regular_buckets[j] = bucket_iterator{this, target_seg_id, bid[j]};
      }

      for (unsigned i = 0; i < SegmentType::kStashBucketNum; ++i) {
        hotspot.probes.by_type.stash_buckets[i] =
            bucket_iterator{this, target_seg_id, uint8_t(Policy::kBucketNum + i), 0};
      }
      hotspot.num_buckets = HotBuckets::kNumBuckets;

      // The difference between gc and eviction is that gc can be applied even if
      // the table can grow since we throw away logically deleted items.
      // For eviction to be applied we should reach the growth limit.
      if constexpr (EvictionPolicy::can_gc) {
        unsigned res = ev.GarbageCollect(hotspot, this);
        garbage_collected_ += res;
        if (res) {
          // We succeeded to gc. Lets continue with the momentum.
          // In terms of API abuse it's an awful hack, just to see if it works.
          /*unsigned start = (bid[HotBuckets::kNumBuckets - 1] + 1) % kLogicalBucketNum;
          for (unsigned i = 0; i < HotBuckets::kNumBuckets; ++i) {
            uint8_t id = (start + i) % kLogicalBucketNum;
            buckets.probes.arr[i] = bucket_iterator{this, target_seg_id, id};
          }
          garbage_collected_ += ev.GarbageCollect(buckets, this);
          */
          continue;
        }
      }

      auto hash_fn = [this](const auto& k) { return policy_.HashFn(k); };
      unsigned moved = target->UnloadStash(hash_fn, move_cb);
      if (moved > 0) {
        stash_unloaded_ += moved;
        continue;
      }

      // We evict only if our policy says we can not grow
      if constexpr (EvictionPolicy::can_evict) {
        bool can_grow = ev.CanGrow(*this);
        if (can_grow) {
          consider_throw = false;
        } else {
          unsigned res = ev.Evict(hotspot, this);
          if (res)
            continue;
        }
      }
    }

    if (consider_throw && !ev.CanGrow(*this)) {
      throw std::bad_alloc{};
    }

    // Split the segment.
    if (target->local_depth() == global_depth_) {
      IncreaseDepth(global_depth_ + 1);

      target_seg_id = SegmentId(key_hash);
      assert(target_seg_id < segment_.size() && segment_[target_seg_id] == target);
    }

    ev.RecordSplit(target);
    Split(target_seg_id, ev);
  }

  return std::make_pair(iterator{}, false);
}

template <typename _Key, typename _Value, typename Policy>
void DashTable<_Key, _Value, Policy>::IncreaseDepth(unsigned new_depth) {
  assert(!segment_.empty());
  assert(new_depth > global_depth_);
  size_t prev_sz = segment_.size();
  size_t repl_cnt = 1ul << (new_depth - global_depth_);
  segment_.resize(1ul << new_depth);

  for (int i = prev_sz - 1; i >= 0; --i) {
    size_t offs = i * repl_cnt;
    std::fill(segment_.begin() + offs, segment_.begin() + offs + repl_cnt, segment_[i]);
    segment_[i]->set_segment_id(offs);  // update segment id.
  }
  global_depth_ = new_depth;
}

template <typename _Key, typename _Value, typename Policy>
template <typename EvictionPolicy>
void DashTable<_Key, _Value, Policy>::Split(uint32_t seg_id, EvictionPolicy& ev) {
  SegmentType* source = segment_[seg_id];

  uint32_t chunk_size = 1u << (global_depth_ - source->local_depth());
  uint32_t start_idx = seg_id & (~(chunk_size - 1));
  assert(segment_[start_idx] == source && segment_[start_idx + chunk_size - 1] == source);
  uint32_t target_id = start_idx + chunk_size / 2;
  SegmentType* target = ConstructSegment(source->local_depth() + 1, target_id);

  auto hash_fn = [this](const auto& k) { return policy_.HashFn(k); };

  // remove current segment bucket count.
  bucket_count_ -= (source->num_buckets() + target->num_buckets());

  source->Split(
      std::move(hash_fn), target,
      [&](uint32_t segment_from, detail::PhysicalBid from, uint32_t segment_to,
          detail::PhysicalBid to) {
        // OnMove is used to notify eviction policy about the moves across
        // buckets/segments during the split.
        ev.OnMove(Cursor{global_depth_, segment_from, from}, Cursor{global_depth_, segment_to, to});
      });

  // add back the updated bucket count.
  bucket_count_ += (target->num_buckets() + source->num_buckets());
  ++unique_segments_;

  for (size_t i = target_id; i < start_idx + chunk_size; ++i) {
    segment_[i] = target;
  }
}

template <typename _Key, typename _Value, typename Policy>
template <typename Cb>
auto DashTable<_Key, _Value, Policy>::TraverseBySegmentOrder(Cursor curs, Cb&& cb) -> Cursor {
  uint32_t sid = curs.segment_id(global_depth_);
  assert(sid < segment_.size());
  SegmentType* s = segment_[sid];
  assert(s);
  uint8_t bid = curs.bucket_id();

  auto dt_cb = [&](const SegmentIterator& it) { cb(iterator{this, sid, it.index, it.slot}); };
  s->TraverseBucket(bid, std::move(dt_cb));

  ++bid;
  if (SegmentType::OutOfRange(bid)) {
    sid = NextSeg(sid);
    if (sid >= segment_.size()) {
      return Cursor::end();
    }
    bid = 0;
  }

  return Cursor{global_depth_, sid, bid};
}

template <typename _Key, typename _Value, typename Policy>
auto DashTable<_Key, _Value, Policy>::GetRandomCursor(absl::BitGen* bitgen) -> Cursor {
  uint32_t sid = absl::Uniform<uint32_t>(*bitgen, 0, segment_.size());
  uint8_t bid = absl::Uniform<uint8_t>(*bitgen, 0, Policy::kBucketNum);

  return Cursor{global_depth_, sid, bid};
}

template <typename _Key, typename _Value, typename Policy>
template <typename Cb>
auto DashTable<_Key, _Value, Policy>::Traverse(Cursor curs, Cb&& cb) -> Cursor {
  uint32_t sid = curs.segment_id(global_depth_);
  uint8_t bid = curs.bucket_id();

  // Test validity of the cursor.
  if (bid >= Policy::kBucketNum || sid >= segment_.size())
    return Cursor::end();

  auto hash_fun = [this](const auto& k) { return policy_.HashFn(k); };

  bool fetched = false;

  // We fix bid and go over all segments. Once we reach the end we increase bid and repeat.
  do {
    SegmentType* s = segment_[sid];
    assert(s);

    auto dt_cb = [&](const SegmentIterator& it) { cb(iterator{this, sid, it.index, it.slot}); };

    fetched = s->TraverseLogicalBucket(bid, hash_fun, std::move(dt_cb));
    sid = NextSeg(sid);
    if (sid >= segment_.size()) {
      sid = 0;
      ++bid;

      if (bid >= Policy::kBucketNum)
        return Cursor::end();
    }
  } while (!fetched);

  return Cursor{global_depth_, sid, bid};
}

template <typename _Key, typename _Value, typename Policy>
auto DashTable<_Key, _Value, Policy>::AdvanceCursorBucketOrder(Cursor cursor) -> Cursor {
  // We fix bid and go over all segments. Once we reach the end we increase bid and repeat.
  uint32_t sid = cursor.segment_id(global_depth_);
  uint8_t bid = cursor.bucket_id();
  sid = NextSeg(sid);
  if (sid >= segment_.size()) {
    sid = 0;
    ++bid;

    if (SegmentType::OutOfRange(bid))
      return Cursor::end();
  }
  return Cursor{global_depth_, sid, bid};
}

template <typename _Key, typename _Value, typename Policy>
template <typename Cb>
auto DashTable<_Key, _Value, Policy>::TraverseBuckets(Cursor cursor, Cb&& cb) -> Cursor {
  if (SegmentType::OutOfRange(cursor.bucket_id()))  // sanity.
    return Cursor::end();

  constexpr uint32_t kMaxIterations = 8;
  bool invoked = false;

  for (uint32_t i = 0; i < kMaxIterations; ++i) {
    uint32_t sid = cursor.segment_id(global_depth_);
    uint8_t bid = cursor.bucket_id();
    SegmentType* s = segment_[sid];
    assert(s);
    if (bid < s->num_buckets()) {
      const auto& bucket = s->GetBucket(bid);
      if (bucket.GetBusy()) {  // Invoke callback only if bucket has elements.
        cb(BucketIt(sid, bid));
        invoked = true;
      }
    }
    cursor = AdvanceCursorBucketOrder(cursor);
    if (invoked || !cursor)  // Break end of traversal or callback invoked.
      return cursor;
  }
  return cursor;
}

}  // namespace dfly


================================================
FILE: src/core/dash_bench.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/base/internal/cycleclock.h>
#include <absl/container/flat_hash_map.h>
#include <mimalloc.h>

#include "base/hash.h"
#include "base/histogram.h"
#include "base/init.h"
#include "core/dash.h"

extern "C" {
#include "redis/dict.h"
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

using namespace std;

ABSL_FLAG(uint32_t, n, 100000, "num items");
ABSL_FLAG(string, type, "dash", "");
ABSL_FLAG(bool, sds, false, "If true, uses sds as primary key");

namespace dfly {

static uint64_t dictSdsHash(const void* key) {
  return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
}

static int dictSdsKeyCompare(dict*, const void* key1, const void* key2) {
  int l1, l2;

  l1 = sdslen((sds)key1);
  l2 = sdslen((sds)key2);
  if (l1 != l2)
    return 0;
  return memcmp(key1, key2, l1) == 0;
}

static dictType SdsDict = {
    dictSdsHash,       /* hash function */
    NULL,              /* key dup */
    NULL,              /* val dup */
    dictSdsKeyCompare, /* key compare */
    NULL,
    // dictSdsDestructor, /* key destructor */
    NULL, /* val destructor */
    NULL,
};

struct UInt64Policy {
  enum { kSlotNum = 12, kBucketNum = 64, kStashBucketNum = 2 };
  static constexpr bool kUseVersion = false;

  static uint64_t HashFn(uint64_t v) {
    return XXH3_64bits(&v, sizeof(v));
  }

  template <typename U> static void DestroyValue(const U&) {
  }
  template <typename U> static void DestroyKey(const U&) {
  }

  template <typename U, typename V> static bool Equal(U&& u, V&& v) {
    return u == v;
  }
};

struct SdsDashPolicy {
  enum { kSlotNum = 14, kBucketNum = 56, kStashBucketNum = 4 };
  static constexpr bool kUseVersion = false;

  static uint64_t HashFn(sds u) {
    return XXH3_64bits(reinterpret_cast<const uint8_t*>(u), sdslen(u));
  }

  static uint64_t HashFn(std::string_view u) {
    return XXH3_64bits(u.data(), u.size());
  }

  static void DestroyKey(sds s) {
    sdsfree(s);
  }

  static void DestroyValue(uint64_t) {
  }

  static bool Equal(sds u1, sds u2) {
    return dictSdsKeyCompare(nullptr, u1, u2) == 0;
  }

  static bool Equal(sds u1, std::string_view u2) {
    return u2 == std::string_view{u1, sdslen(u1)};
  }
};

using Dash64 = DashTable<uint64_t, uint64_t, UInt64Policy>;
using DashSds = DashTable<sds, uint64_t, SdsDashPolicy>;

using absl::GetFlag;

inline void Sample(int64_t start, int64_t end, base::Histogram* hist) {
  hist->Add((end - start) / 100);
}

Dash64 udt;
DashSds sds_dt;
base::Histogram hist;

#define USE_TIME 1

int64_t GetNow() {
#if USE_TIME
  return absl::GetCurrentTimeNanos();
#else
  return absl::base_internal::CycleClock::Now();
#endif
}

#if defined(__i386__) || defined(__amd64__)
#define LFENCE __asm__ __volatile__("lfence")
#else
#define LFENCE __asm__ __volatile__("ISB")
#endif

absl::flat_hash_map<uint64_t, uint64_t> mymap;

void BenchFlat(uint64_t num) {
  for (uint64_t i = 0; i < num; ++i) {
    time_t start = GetNow();
    mymap.emplace(i, 0);
    LFENCE;

    time_t end = GetNow();
    Sample(start, end, &hist);
  }
}

void BenchDash(uint64_t num) {
  for (uint64_t i = 0; i < num; ++i) {
    time_t start = GetNow();
    udt.Insert(i, 0);
    LFENCE;

    time_t end = GetNow();
    Sample(start, end, &hist);
  }
}

inline sds Prefix() {
  return sdsnew("xxxxxxxxxxxxxxxxxxxxxxx");
}

void BenchDashSds(uint64_t num) {
  sds key = sdscatsds(Prefix(), sdsfromlonglong(0));
  for (uint64_t i = 0; i < num; ++i) {
    time_t start = GetNow();
    sds_dt.Insert(key, 0);
    time_t end = GetNow();
    Sample(start, end, &hist);

    key = sdscatsds(Prefix(), sdsfromlonglong(i + 1));
  }
}

static uint64_t callbackHash(const void* key) {
  return XXH64(&key, sizeof(key), 0);
}

static dictType IntDict = {callbackHash, NULL, NULL, NULL, NULL, NULL, NULL};

dict* redis_dict = nullptr;

void BenchDict(uint64_t num) {
  redis_dict = dictCreate(&IntDict);

  for (uint64_t i = 0; i < num; ++i) {
    time_t start = GetNow();
    dictAdd(redis_dict, (void*)i, nullptr);
    LFENCE;
    time_t end = GetNow();
    Sample(start, end, &hist);
  }
}

void BenchDictSds() {
  uint64_t num = GetFlag(FLAGS_n);

  sds key = sdscat(Prefix(), sdsfromlonglong(0));
  redis_dict = dictCreate(&SdsDict);

  for (uint64_t i = 0; i < num; ++i) {
    time_t start = GetNow();
    dictAdd(redis_dict, key, nullptr);
    time_t end = GetNow();
    Sample(start, end, &hist);

    key = sdscatsds(Prefix(), sdsfromlonglong(i + 1));
  }
}

}  // namespace dfly

using namespace dfly;

int main(int argc, char* argv[]) {
  MainInitGuard guard(&argc, &argv);

  init_zmalloc_threadlocal(mi_heap_get_backing());

  string table_type = GetFlag(FLAGS_type);

  bool is_sds = GetFlag(FLAGS_sds);
  uint64_t start = absl::GetCurrentTimeNanos();
  uint64_t num = GetFlag(FLAGS_n);

  if (table_type == "dash") {
    if (is_sds) {
      BenchDashSds(num);
    } else {
      BenchDash(num);
    }
  } else if (table_type == "dict") {
    if (is_sds) {
      BenchDictSds();
    } else {
      BenchDict(num);
    }
  } else if (table_type == "flat") {
    BenchFlat(num);
  } else {
    LOG(FATAL) << "Unknown type " << table_type;
  }

  CONSOLE_INFO << "latencies histogram (jiffies, 100ns):\n" << hist.ToString();
  uint64_t delta = (absl::GetCurrentTimeNanos() - start) / 1000000;
  CONSOLE_INFO << "Took " << delta << " ms";

  return 0;
}


================================================
FILE: src/core/dash_internal.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/base/internal/endian.h>

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <type_traits>

#include "base/pmr/memory_resource.h"
#include "core/sse_port.h"

namespace dfly {
namespace detail {

template <unsigned NUM_SLOTS> class SlotBitmap {
  static_assert(NUM_SLOTS > 0 && NUM_SLOTS <= 28);
  static constexpr bool SINGLE = NUM_SLOTS <= 14;
  static constexpr unsigned kLen = SINGLE ? 1 : 2;
  static constexpr unsigned kAllocMask = (1u << NUM_SLOTS) - 1;
  static constexpr unsigned kBitmapLenMask = (1 << 4) - 1;

 public:
  // probe - true means the entry is probing, i.e. not owning.
  // probe=true GetProbe returns index of probing entries, i.e. hosted but not owned by this bucket.
  // probe=false - mask of owning entries
  uint32_t GetProbe(bool probe) const {
    if constexpr (SINGLE)
      return ((val_[0].d >> 4) & kAllocMask) ^ ((!probe) * kAllocMask);
    else
      return (val_[1].d & kAllocMask) ^ ((!probe) * kAllocMask);
  }

  // GetBusy returns the busy mask.
  uint32_t GetBusy() const {
    return SINGLE ? val_[0].d >> 18 : val_[0].d;
  }

  bool IsFull() const {
    return Size() == NUM_SLOTS;
  }

  unsigned Size() const {
    return SINGLE ? (val_[0].d & kBitmapLenMask) : __builtin_popcount(val_[0].d);
  }

  // Precondition: Must have empty slot
  // returns result in [0, NUM_SLOTS) range.
  int FindEmptySlot() const {
    uint32_t mask = ~(GetBusy());

    // returns the index for first set bit (FindLSBSetNonZero). mask must be non-zero.
    int slot = __builtin_ctz(mask);
    assert(slot < int(NUM_SLOTS));
    return slot;
  }

  // mask is NUM_SLOTS bits saying which slots needs to be freed (1 - should clear).
  void ClearSlots(uint32_t mask);

  void Clear() {
    if (SINGLE) {
      val_[0].d = 0;
    } else {
      val_[0].d = val_[1].d = 0;
    }
  }

  void ClearSlot(unsigned index);
  void SetSlot(unsigned index, bool probe);

  // cell 0 corresponds to first lsb bit in the busy mask, hence we need to shift left
  // the bitmap in order to shift right the cell-array.
  // Returns true if discarded the last slot (i.e. it was busy).
  bool ShiftLeft();

  void Swap(unsigned slot_a, unsigned slot_b);

 private:
  // SINGLE:
  //   val_[0] is [14 bit- busy][14bit-probing, whether the key does not belong to this
  //   bucket][4bit-count]
  // kLen == 2:
  //  val_[0] is 28 bit busy
  //  val_[1] is 28 bit probing
  //  count is implemented via popcount of val_[0].
  struct Unaligned {
    // Apparently with wrapping struct we can persuade compiler to declare an unaligned int.
    // https://stackoverflow.com/questions/19915303/packed-qualifier-ignored
    uint32_t d __attribute__((packed, aligned(1)));

    Unaligned() : d(0) {
    }
  };

  Unaligned val_[kLen];
};  // SlotBitmap

template <unsigned NUM_SLOTS> class BucketBase {
  // We can not allow more than 4 stash fps because we hold stash positions in single byte
  // stash_pos_ variable that uses 2 bits per stash bucket to point which bucket holds that fp.
  // Hence we can point at most from 4 fps to 4 stash buckets.
  // If any of those limits need to be raised we should increase stash_pos_ similarly to how we did
  // with SlotBitmap.
  static constexpr unsigned kStashFpLen = 4;
  static constexpr unsigned kStashPresentBit = 1 << 4;

  using FpArray = std::array<uint8_t, NUM_SLOTS>;
  using StashFpArray = std::array<uint8_t, kStashFpLen>;

 public:
  using SlotId = uint8_t;
  static constexpr SlotId kNanSlot = 255;

  bool IsFull() const {
    return Size() == NUM_SLOTS;
  }

  bool IsEmpty() const {
    return GetBusy() == 0;
  }

  unsigned Size() const {
    return slotb_.Size();
  }

  void Delete(SlotId sid) {
    slotb_.ClearSlot(sid);
  }

  unsigned Find(uint8_t fp_hash, bool probe) const {
    unsigned mask = CompareFP(fp_hash) & GetBusy();
    return mask & GetProbe(probe);
  }

  uint8_t Fp(unsigned i) const {
    assert(i < finger_arr_.size());
    return finger_arr_[i];
  }

  void SetStashPtr(unsigned stash_pos, uint8_t meta_hash, BucketBase* next);

  // returns 0 if stash was cleared from this bucket, 1 if it was cleared from next bucket.
  unsigned UnsetStashPtr(uint8_t fp_hash, unsigned stash_pos, BucketBase* next);

  // probe - true means the entry is probing, i.e. not owning.
  // probe=true GetProbe returns index of probing entries, i.e. hosted but not owned by this bucket.
  // probe=false - mask of owning entries
  uint32_t GetProbe(bool probe) const {
    return slotb_.GetProbe(probe);
  }

  // GetBusy returns the busy mask.
  uint32_t GetBusy() const {
    return slotb_.GetBusy();
  }

  bool IsBusy(unsigned slot) const {
    return (GetBusy() & (1u << slot)) != 0;
  }

  // mask is saying which slots needs to be freed (1 - should clear).
  void ClearSlots(uint32_t mask) {
    slotb_.ClearSlots(mask);
  }

  void Clear() {
    slotb_.Clear();
  }

  void ClearStashPtrs() {
    stash_busy_ = 0;
    stash_pos_ = 0;
    stash_probe_mask_ = 0;
    overflow_count_ = 0;
  }

  bool HasStash() const {
    return stash_busy_ & kStashPresentBit;
  }

  void SetHash(unsigned slot_id, uint8_t meta_hash, bool probe);

  bool HasStashOverflow() const {
    return overflow_count_ > 0;
  }

  // func accepts an fp_index in range [0, kStashFpLen) and
  // stash position [0, STASH_BUCKET_NUM) that with fingerprint=fp. func must return
  // a slot id if it found whatever it searched for when iterating or kNanSlot to continue.
  // IterateStash returns: first - stash position [0, STASH_BUCKET_NUM), second - slot id
  // pointing to that stash.
  template <typename F>
  std::pair<unsigned, SlotId> IterateStash(uint8_t fp, bool is_probe, F&& func) const;

  void Swap(unsigned slot_a, unsigned slot_b) {
    slotb_.Swap(slot_a, slot_b);
    std::swap(finger_arr_[slot_a], finger_arr_[slot_b]);
  }

 protected:
  uint32_t CompareFP(uint8_t fp) const;
  bool ShiftRight();

  // Returns true if stash_pos was stored, false overwise
  bool SetStash(uint8_t fp, unsigned stash_pos, bool probe);
  bool ClearStash(uint8_t fp, unsigned stash_pos, bool probe);

  SlotBitmap<NUM_SLOTS> slotb_;  // allocation bitmap + pointer bitmap + counter

  /*only use the first 14 bytes, can be accelerated by
    SSE instruction,0-13 for finger, 14-17 for overflowed*/
  FpArray finger_arr_;
  StashFpArray stash_arr_;

  uint8_t stash_busy_ = 0;  // kStashFpLen+1 bits are used
  uint8_t stash_pos_ = 0;   // 4x2 bits for pointing to stash bucket.

  // stash_probe_mask_ indicates whether the overflow fingerprint is for the neighbour (1)
  // or for this bucket (0). kStashFpLen bits are used.
  uint8_t stash_probe_mask_ = 0;

  // number of overflowed items stored in stash buckets that do not have fp hashes.
  uint8_t overflow_count_ = 0;
};  // BucketBase

static_assert(sizeof(BucketBase<12>) == 24);
static_assert(alignof(BucketBase<14>) == 1);
static_assert(alignof(BucketBase<12>) == 1);

// Optional version support as part of DashTable.
// This works like this: each slot has 2 bytes for version and a bucket has another 6.
// therefore all slots in the bucket shared the same 6 high bytes of 8-byte version.
// In order to achieve this we store high6(max{version(entry)}) for every entry.
// Hence our version control may have false positives, i.e. signal that an entry has changed
// when in practice its neighbour incremented the high6 part of its bucket.
template <unsigned NUM_SLOTS> class VersionedBB : public BucketBase<NUM_SLOTS> {
  using Base = BucketBase<NUM_SLOTS>;

 public:
  // one common version per bucket.
  void SetVersion(uint64_t version);

  uint64_t GetVersion() const {
    uint64_t c = absl::little_endian::Load64(version_);
    // c |= low_[slot_id];
    return c;
  }

  void UpdateVersion(uint64_t version) {
    uint64_t c = std::max(GetVersion(), version);
    absl::little_endian::Store64(version_, c);
  }

  void Clear() {
    Base::Clear();
    // low_.fill(0);
    memset(version_, 0, sizeof(version_));
  }

  bool ShiftRight() {
    bool res = Base::ShiftRight();
    return res;
  }

  void Swap(unsigned slot_a, unsigned slot_b) {
    Base::Swap(slot_a, slot_b);
  }

 private:
  uint8_t version_[8] = {0};
};

static_assert(alignof(VersionedBB<14>) == 1);
static_assert(sizeof(VersionedBB<12>) == 12 * 2 + 8);
static_assert(sizeof(VersionedBB<14>) <= 14 * 2 + 8);

// Segment - static-hashtable of size kSlotNum*(kBucketNum + kStashBucketNum).
struct DefaultSegmentPolicy {
  static constexpr unsigned kSlotNum = 12;
  static constexpr unsigned kBucketNum = 64;
  static constexpr bool kUseVersion = true;
};

using PhysicalBid = uint8_t;
using LogicalBid = uint8_t;

template <typename KeyType, typename ValueType, typename Policy = DefaultSegmentPolicy>
class Segment {
 public:
  static constexpr unsigned kSlotNum = Policy::kSlotNum;
  static constexpr unsigned kBucketNum = Policy::kBucketNum;
  static constexpr unsigned kStashBucketNum = 4;
  static constexpr bool kUseVersion = Policy::kUseVersion;

 private:
  static_assert(kBucketNum + kStashBucketNum < 255);
  static constexpr unsigned kFingerBits = 8;

  using BucketType = std::conditional_t<kUseVersion, VersionedBB<kSlotNum>, BucketBase<kSlotNum>>;

  struct Bucket : public BucketType {
    using BucketType::kNanSlot;
    using typename BucketType::SlotId;

    KeyType key[kSlotNum];
    ValueType value[kSlotNum];

    template <typename U, typename V>
    void Insert(uint8_t slot, U&& u, V&& v, uint8_t meta_hash, bool probe) {
      assert(slot < kSlotNum);

      key[slot] = std::forward<U>(u);
      value[slot] = std::forward<V>(v);

      this->SetHash(slot, meta_hash, probe);
    }

    // Returns slot id if insertion is successful, -1 if no free slots are found.
    template <typename U, typename V>
    int TryInsertToBucket(U&& key, V&& value, uint8_t meta_hash, bool probe) {
      if (this->IsFull()) {
        return -1;  // no free space in the bucket.
      }

      int slot = this->slotb_.FindEmptySlot();
      assert(slot >= 0);
      Insert(slot, std::forward<U>(key), std::forward<V>(value), meta_hash, probe);
      return slot;
    }

    template <typename Pred> SlotId FindByFp(uint8_t fp_hash, bool probe, Pred&& pred) const;

    bool ShiftRight();

    void Swap(unsigned slot_a, unsigned slot_b) {
      BucketType::Swap(slot_a, slot_b);
      std::swap(key[slot_a], key[slot_b]);
      std::swap(value[slot_a], value[slot_b]);
    }

    template <typename This, typename Cb> void ForEachSlotImpl(This obj, Cb&& cb) const {
      uint32_t mask = this->GetBusy();
      uint32_t probe_mask = this->GetProbe(true);

      for (unsigned j = 0; j < kSlotNum; ++j) {
        if (mask & 1) {
          cb(obj, j, probe_mask & 1);
        }
        mask >>= 1;
        probe_mask >>= 1;
      }
    }

    // calls for each busy slot: cb(iterator, probe)
    template <typename Cb> void ForEachSlot(Cb&& cb) const {
      ForEachSlotImpl(this, std::forward<Cb&&>(cb));
    }

    // calls for each busy slot: cb(iterator, probe)
    template <typename Cb> void ForEachSlot(Cb&& cb) {
      ForEachSlotImpl(this, std::forward<Cb&&>(cb));
    }
  };  // class Bucket

  static constexpr PhysicalBid kNanBid = 0xFF;
  using SlotId = typename BucketType::SlotId;

 public:
  struct Iterator {
    PhysicalBid index;  // bucket index
    uint8_t slot;

    Iterator() : index(kNanBid), slot(BucketType::kNanSlot) {
    }

    Iterator(PhysicalBid bi, uint8_t sid) : index(bi), slot(sid) {
    }

    bool found() const {
      return index != kNanBid;
    }
  };

  struct Stats {
    size_t neighbour_probes = 0;
    size_t stash_probes = 0;
    size_t stash_overflow_probes = 0;
  };

  static constexpr size_t kFpMask = (1 << kFingerBits) - 1;

  using Value_t = ValueType;
  using Key_t = KeyType;
  using Hash_t = uint64_t;

  explicit Segment(size_t depth, uint32_t id, PMR_NS::memory_resource* mr)
      : local_depth_(depth), segment_id_(id), mr_(mr) {
  }

  ~Segment() {
    Clear();
  }

  Segment(const Segment&) = delete;
  Segment& operator=(const Segment&) = delete;

  // Returns (iterator, true) if insert succeeds,
  // (iterator, false) for duplicate and (invalid-iterator, false) if it's full
  template <typename K, typename V, typename Pred, typename OnMoveCb>
  std::pair<Iterator, bool> Insert(K&& key, V&& value, Hash_t key_hash, Pred&& pred,
                                   OnMoveCb&& on_move_cb);

  template <typename HashFn, typename OnMoveCb>
  void Split(HashFn&& hfunc, Segment* dest, OnMoveCb&& on_move_cb);

  void Delete(const Iterator& it, Hash_t key_hash);

  void Clear();  // clears the segment.

  size_t SlowSize() const;

  static constexpr size_t capacity() {
    return kMaxSize;
  }

  static constexpr bool OutOfRange(PhysicalBid bid) {
    return bid >= kBucketNum + kStashBucketNum;
  }

  size_t local_depth() const {
    return local_depth_;
  }

  void set_local_depth(uint32_t depth) {
    local_depth_ = depth;
  }

  template <bool UV = kUseVersion>
  std::enable_if_t<UV, uint64_t> GetVersion(PhysicalBid bid) const {
    return GetBucket(bid).GetVersion();
  }

  template <bool UV = kUseVersion> std::enable_if_t<UV> SetVersion(PhysicalBid bid, uint64_t v) {
    return GetBucket(bid).SetVersion(v);
  }

  // Traverses over Segment's bucket bid and calls cb(const Iterator& it) 0 or more times
  // for each slot in the bucket. returns false if bucket is empty.
  // Please note that `it` will not necessary point to bid due to probing and stash buckets
  // containing items that should have been resided in bid.
  template <typename Cb, typename HashFn>
  bool TraverseLogicalBucket(LogicalBid bid, HashFn&& hfun, Cb&& cb) const;

  // Cb  accepts (const Iterator&).
  template <typename Cb> void TraverseAll(Cb&& cb) const;

  // Traverses over Segment's bucket bid and calls cb(Iterator& it)
  // for each slot in the bucket. The iteration goes over a physical bucket.
  template <typename Cb> void TraverseBucket(PhysicalBid bid, Cb&& cb);

  // Used in test.
  unsigned NumProbingBuckets() const {
    unsigned res = 0;
    for (PhysicalBid i = 0; i < kBucketNum; ++i) {
      res += (bucket_[i].GetProbe(true) != 0);
    }
    return res;
  };

  const Bucket& GetBucket(PhysicalBid i) const {
    return bucket_[i];
  }

  Bucket& GetBucket(PhysicalBid i) {
    return bucket_[i];
  }

  bool IsBusy(PhysicalBid bid, unsigned slot) const {
    return GetBucket(bid).GetBusy() & (1U << slot);
  }

  Key_t& Key(PhysicalBid bid, unsigned slot) {
    assert(IsBusy(bid, slot));
    return GetBucket(bid).key[slot];
  }

  const Key_t& Key(PhysicalBid bid, unsigned slot) const {
    assert(IsBusy(bid, slot));
    return GetBucket(bid).key[slot];
  }

  Value_t& Value(PhysicalBid bid, unsigned slot) {
    assert(IsBusy(bid, slot));
    return GetBucket(bid).value[slot];
  }

  const Value_t& Value(PhysicalBid bid, unsigned slot) const {
    assert(IsBusy(bid, slot));
    return GetBucket(bid).value[slot];
  }

  // fill bucket ids that may be used probing for this key_hash.
  // The order is: exact, neighbour buckets.
  static void FillProbeArray(Hash_t key_hash, uint8_t dest[4]) {
    dest[1] = HomeIndex(key_hash);
    dest[0] = PrevBid(dest[1]);
    dest[2] = NextBid(dest[1]);
    dest[3] = NextBid(dest[2]);
  }

  // Find item with given key hash and truthy predicate
  template <typename Pred> Iterator FindIt(Hash_t key_hash, Pred&& pred) const;
  void Prefetch(Hash_t key_hash) const;

  // Returns valid iterator if succeeded or invalid if not (it's full).
  // Requires: key should be not present in the segment.
  // if spread is true, tries to spread the load between neighbour and home buckets,
  // otherwise chooses home bucket first.
  // TODO: I am actually not sure if spread optimization is helpful. Worth checking
  // whether we get higher occupancy rates when using it.
  template <typename U, typename V, typename OnMoveCb>
  Iterator InsertUniq(U&& key, V&& value, Hash_t key_hash, bool spread, OnMoveCb&& on_move_cb);

  // capture version change in case of insert.
  // Returns ids of buckets whose version would cross ver_threshold upon insertion of key_hash
  // into the segment.
  // Returns UINT16_MAX if segment is full. Otherwise, returns number of touched bucket ids (1 or 2)
  // if the insertion would happen. The ids are put into bid array that should have at least 2
  // spaces.
  template <bool UV = kUseVersion>
  std::enable_if_t<UV, unsigned> CVCOnInsert(uint64_t ver_threshold, Hash_t key_hash,
                                             PhysicalBid bid[2]) const;

  // Returns bucket ids whose versions will change as a result of bumping up the item
  // Can return upto 3 buckets.
  template <bool UV = kUseVersion>
  std::enable_if_t<UV, unsigned> CVCOnBump(uint64_t ver_threshold, unsigned bid, unsigned slot,
                                           Hash_t hash, PhysicalBid result_bid[3]) const;

  // Finds a valid entry going from specified indices up.
  Iterator FindValidStartingFrom(PhysicalBid bid, unsigned slot) const;

  // Shifts all slots in the bucket right.
  // Returns true if the last slot was busy and the entry has been deleted.
  bool ShiftRight(PhysicalBid bid, Hash_t right_hashval) {
    if (bid >= kBucketNum) {  // Stash
      constexpr auto kLastSlotMask = 1u << (kSlotNum - 1);
      if (GetBucket(bid).GetBusy() & kLastSlotMask)
        RemoveStashReference(bid - kBucketNum, right_hashval);
    }

    return bucket_[bid].ShiftRight();
  }

  // Bumps up this entry making it more "important" for the eviction policy.
  template <typename BumpPolicy, typename OnMoveCb>
  Iterator BumpUp(PhysicalBid bid, SlotId slot, Hash_t key_hash, const BumpPolicy& ev,
                  OnMoveCb&& cb);

  // Tries to move stash entries back to their normal buckets (exact or neighbour).
  // Returns number of entries that succeeded to unload.
  // Important! Affects versions of the moved items and the items in the destination
  // buckets.
  template <typename HFunc, typename OnMoveCb> unsigned UnloadStash(HFunc&& hfunc, OnMoveCb&& cb);

  unsigned num_buckets() const {
    return kBucketNum + kStashBucketNum;
  }

  uint32_t segment_id() const {
    return segment_id_;
  }

  // needed only when DashTable grows its segment table.
  void set_segment_id(uint32_t new_id) {
    segment_id_ = new_id;
  }

 private:
  static_assert(sizeof(Iterator) == 2);

  static LogicalBid HomeIndex(Hash_t hash) {
    return (hash >> kFingerBits) % kBucketNum;
  }

  static LogicalBid NextBid(LogicalBid bid) {
    return bid < kBucketNum - 1 ? bid + 1 : 0;
  }

  static LogicalBid PrevBid(LogicalBid bid) {
    return bid ? bid - 1 : kBucketNum - 1;
  }

  // if own_items is true it means we try to move owned item to probing bucket.
  // if own_items false it means we try to move non-owned item from probing bucket back to its host.
  int MoveToOther(bool own_items, unsigned from, unsigned to);

  // dry-run version of MoveToOther.
  bool CheckIfMovesToOther(bool own_items, unsigned from, unsigned to) const;

  /*both clear this bucket and its neighbor bucket*/
  void RemoveStashReference(unsigned stash_pos, Hash_t key_hash);

  // returns a valid iterator if succeeded.
  Iterator TryMoveFromStash(unsigned stash_id, unsigned stash_slot_id, Hash_t key_hash);

  const static unsigned kTotalBuckets = kBucketNum + kStashBucketNum;
  static_assert(kTotalBuckets < 0xFF);

  Bucket bucket_[kTotalBuckets];
  uint8_t local_depth_;
  uint32_t segment_id_;  // segment id in the table.
  PMR_NS::memory_resource* mr_ = nullptr;

 public:
  static constexpr size_t kBucketSz = sizeof(Bucket);
  static constexpr size_t kMaxSize = (kBucketNum + kStashBucketNum) * kSlotNum;
  static constexpr double kTaxSize =
      (double(sizeof(Segment)) / kMaxSize) - sizeof(Key_t) - sizeof(Value_t);

#ifdef ENABLE_DASH_STATS
  mutable Stats stats;
#endif
};  // Segment

class DashTableBase {
 public:
  explicit DashTableBase(uint32_t gd)
      : unique_segments_(1 << gd), initial_depth_(gd), global_depth_(gd) {
  }

  DashTableBase(const DashTableBase&) = delete;
  DashTableBase& operator=(const DashTableBase&) = delete;

  uint32_t unique_segments() const {
    return unique_segments_;
  }

  uint16_t depth() const {
    return global_depth_;
  }

  size_t size() const {
    return size_;
  }

  size_t Empty() const {
    return size_ == 0;
  }

 protected:
  uint32_t SegmentId(size_t hash) const {
    if (global_depth_) {
      return hash >> (64 - global_depth_);
    }

    return 0;
  }

  size_t size_ = 0;
  uint32_t unique_segments_ = 0, bucket_count_ = 0;
  uint8_t initial_depth_;
  uint8_t global_depth_;
};  // DashTableBase

template <typename KeyType, typename ValueType> class IteratorPair {
 public:
  IteratorPair(KeyType& k, ValueType& v) : first(k), second(v) {
  }

  IteratorPair* operator->() {
    return this;
  }

  const IteratorPair* operator->() const {
    return this;
  }

  KeyType& first;
  ValueType& second;
};

// Represents a cursor that points to a bucket in dash table.
// One major difference with iterator is that the cursor survives dash table resizes and
// will always point to the most appropriate segment with the same bucket.
// It uses 40 lsb bits out of 64 assuming that number of segments does not cross 4B.
// It's a reasonable assumption in shared nothing architecture when we usually have no more than
// 32GB per CPU. Each segment spawns hundreds of entries so we can not grow segment table
// to billions.
class DashCursor {
 public:
  explicit DashCursor(uint64_t token = 0) : val_(token) {
  }

  DashCursor(uint8_t depth, uint32_t seg_id, PhysicalBid bid)
      : val_((uint64_t(seg_id) << (40 - depth)) | bid) {
  }

  static DashCursor end() {
    return DashCursor{};
  }

  PhysicalBid bucket_id() const {
    return val_ & 0xFF;
  }

  // segment_id is padded to the left of 32 bit region:
  // | segment_id......| bucket_id
  // 40                8          0
  // By using depth we take most significant bits of segment_id if depth has decreased
  // since the cursor has been created, or extend the least significant bits with zeros,
  // if depth was increased.
  uint32_t segment_id(uint8_t depth) const {
    return val_ >> (40 - depth);
  }

  uint64_t token() const {
    return val_;
  }

  explicit operator bool() const {
    return val_ != 0;
  }

 private:
  uint64_t val_;
};

/***********************************************************
 * Implementation section.
 */

template <unsigned NUM_SLOTS> void SlotBitmap<NUM_SLOTS>::SetSlot(unsigned index, bool probe) {
  if constexpr (SINGLE) {
    assert(((val_[0].d >> (index + 18)) & 1) == 0);
    val_[0].d |= (1 << (index + 18));
    val_[0].d |= (unsigned(probe) << (index + 4));

    assert((val_[0].d & kBitmapLenMask) < NUM_SLOTS);
    ++val_[0].d;
    assert(__builtin_popcount(val_[0].d >> 18) == (val_[0].d & kBitmapLenMask));
  } else {
    assert(((val_[0].d >> index) & 1) == 0);
    val_[0].d |= (1u << index);
    val_[1].d |= (unsigned(probe) << index);
  }
}

template <unsigned NUM_SLOTS> void SlotBitmap<NUM_SLOTS>::ClearSlot(unsigned index) {
  assert(Size() > 0);
  if constexpr (SINGLE) {
    uint32_t new_bitmap = val_[0].d & (~(1u << (index + 18))) & (~(1u << (index + 4)));
    new_bitmap -= 1;
    val_[0].d = new_bitmap;
  } else {
    uint32_t mask = 1u << index;
    val_[0].d &= ~mask;
    val_[1].d &= ~mask;
  }
}

template <unsigned NUM_SLOTS> bool SlotBitmap<NUM_SLOTS>::ShiftLeft() {
  constexpr uint32_t kBusyLastSlot = (kAllocMask >> 1) + 1;
  bool res;
  if constexpr (SINGLE) {
    constexpr uint32_t kShlMask = kAllocMask - 1;  // reset lsb
    res = (val_[0].d & (kBusyLastSlot << 18)) != 0;
    uint32_t l = (val_[0].d << 1) & (kShlMask << 4);
    uint32_t p = (val_[0].d << 1) & (kShlMask << 18);
    val_[0].d = __builtin_popcount(p) | l | p;
  } else {
    res = (val_[0].d & kBusyLastSlot) != 0;
    val_[0].d <<= 1;
    val_[0].d &= kAllocMask;
    val_[1].d <<= 1;
    val_[1].d &= kAllocMask;
  }
  return res;
}

template <unsigned NUM_SLOTS> void SlotBitmap<NUM_SLOTS>::ClearSlots(uint32_t mask) {
  if (SINGLE) {
    uint32_t count = __builtin_popcount(mask);
    assert(count <= (val_[0].d & 0xFF));
    mask = (mask << 4) | (mask << 18);
    val_[0].d &= ~mask;
    val_[0].d -= count;
  } else {
    val_[0].d &= ~mask;
    val_[1].d &= ~mask;
  }
}

template <unsigned NUM_SLOTS> void SlotBitmap<NUM_SLOTS>::Swap(unsigned slot_a, unsigned slot_b) {
  if (slot_a > slot_b)
    std::swap(slot_a, slot_b);

  if constexpr (SINGLE) {
    uint32_t a = (val_[0].d << (slot_b - slot_a)) ^ val_[0].d;
    uint32_t bm = (1 << (slot_b + 4)) | (1 << (slot_b + 18));
    a &= bm;
    a |= (a >> (slot_b - slot_a));
    val_[0].d ^= a;
  } else {
    uint32_t a = (val_[0].d << (slot_b - slot_a)) ^ val_[0].d;
    a &= (1 << slot_b);
    a |= (a >> (slot_b - slot_a));
    val_[0].d ^= a;

    a = (val_[1].d << (slot_b - slot_a)) ^ val_[1].d;
    a &= (1 << slot_b);
    a |= (a >> (slot_b - slot_a));
    val_[1].d ^= a;
  }
}

/*
___  _  _ ____ _  _ ____ ___    ___  ____ ____ ____
|__] |  | |    |_/  |___  |     |__] |__| [__  |___
|__] |__| |___ | \_ |___  |     |__] |  | ___] |___

*/

template <unsigned NUM_SLOTS>
bool BucketBase<NUM_SLOTS>::ClearStash(uint8_t fp, unsigned stash_pos, bool probe) {
  auto cb = [stash_pos, this](unsigned i, unsigned pos) -> SlotId {
    if (pos == stash_pos) {
      stash_busy_ &= (~(1u << i));
      stash_probe_mask_ &= (~(1u << i));
      stash_pos_ &= (~(3u << (i * 2)));

      assert(0u == ((stash_pos_ >> (i * 2)) & 3));
      return 0;
    }
    return kNanSlot;
  };

  std::pair<unsigned, SlotId> res = IterateStash(fp, probe, std::move(cb));
  return res.second != kNanSlot;
}

template <unsigned NUM_SLOTS>
void BucketBase<NUM_SLOTS>::SetHash(unsigned slot_id, uint8_t meta_hash, bool probe) {
  assert(slot_id < finger_arr_.size());

  finger_arr_[slot_id] = meta_hash;
  slotb_.SetSlot(slot_id, probe);
}

template <unsigned NUM_SLOTS>
bool BucketBase<NUM_SLOTS>::SetStash(uint8_t fp, unsigned stash_pos, bool probe) {
  // stash_busy_ is never 0xFFFFF so it's safe to run __builtin_ctz below.
  unsigned free_slot = __builtin_ctz(~stash_busy_);
  if (free_slot >= kStashFpLen)
    return false;

  stash_arr_[free_slot] = fp;
  stash_busy_ |= (1u << free_slot);  // set the overflow slot

  // stash_probe_mask_ specifies which records relate to other bucket.
  stash_probe_mask_ |= (unsigned(probe) << free_slot);

  // 2 bits denote the bucket index.
  free_slot *= 2;
  stash_pos_ &= (~(3 << free_slot));       // clear (can be removed?)
  stash_pos_ |= (stash_pos << free_slot);  // and set
  return true;
}

template <unsigned NUM_SLOTS>
void BucketBase<NUM_SLOTS>::SetStashPtr(unsigned stash_pos, uint8_t meta_hash, BucketBase* next) {
  assert(stash_pos < 4);

  // we use only kStashFpLen fp slots for handling stash buckets,
  // therefore if all those slots are used we try neighbor (probing bucket) as a fallback to point
  // to stash buckets. otherwise we increment overflow count.
  // if overflow is incremented we will need to check all the stash buckets when looking for a key,
  //  otherwise we can use overflow_index_ to find the the stash bucket efficiently.
  if (!SetStash(meta_hash, stash_pos, false)) {
    if (!next->SetStash(meta_hash, stash_pos, true)) {
      overflow_count_++;
    }
  }
  stash_busy_ |= kStashPresentBit;
}

template <unsigned NUM_SLOTS>
unsigned BucketBase<NUM_SLOTS>::UnsetStashPtr(uint8_t fp_hash, unsigned stash_pos,
                                              BucketBase* next) {
  /*also needs to ensure that this meta_hash must belongs to other bucket*/
  bool clear_success = ClearStash(fp_hash, stash_pos, false);
  unsigned res = 0;

  if (!clear_success) {
    clear_success = next->ClearStash(fp_hash, stash_pos, true);
    res += clear_success;
  }

  if (!clear_success) {
    assert(overflow_count_ > 0);
    overflow_count_--;
  }

  // kStashPresentBit helps with summarizing all the stash states into a single binary flag.
  // We need it because of the next, though if we make sure to move stash pointers upon split/delete
  // towards the owner we should not reach the state where mask1 == 0 but mask2 &
  // next->stash_probe_mask_ != 0.
  unsigned mask1 = stash_busy_ & (kStashPresentBit - 1);
  unsigned mask2 = next->stash_busy_ & (kStashPresentBit - 1);

  if (((mask1 & (~stash_probe_mask_)) == 0) && (overflow_count_ == 0) &&
      ((mask2 & next->stash_probe_mask_) == 0)) {
    stash_busy_ &= ~kStashPresentBit;
  }

  return res;
}

#ifdef __s390x__
template <unsigned NUM_SLOTS> uint32_t BucketBase<NUM_SLOTS>::CompareFP(uint8_t fp) const {
  static_assert(FpArray{}.size() <= 16);
  vector unsigned char v1;

  // Replicate 16 times fp to key_data.
  for (int i = 0; i < 16; i++) {
    v1[i] = fp;
  }

  // Loads 16 bytes of src into seg_data.
  vector unsigned char v2 = vec_load_len(finger_arr_.data(), 16);

  // compare 1-byte vectors seg_data and key_data, dst[i] := ( a[i] == b[i] ) ? 0xFF : 0.
  vector bool char rv_mask = vec_cmpeq(v1, v2);

  // collapses 16 msb bits from each byte in rv_mask into mask.
  int mask = 0;
  for (int i = 0; i < 16; i++) {
    if (rv_mask[i]) {
      mask |= 1 << i;
    }
  }

  return mask;
}
#else
template <unsigned NUM_SLOTS> uint32_t BucketBase<NUM_SLOTS>::CompareFP(uint8_t fp) const {
  static_assert(FpArray{}.size() <= 16);

  // Replicate 16 times fp to key_data.
  const __m128i key_data = _mm_set1_epi8(fp);

  // Loads 16 bytes of src into seg_data.
  __m128i seg_data = mm_loadu_si128(reinterpret_cast<const __m128i*>(finger_arr_.data()));

  // compare 16-byte vectors seg_data and key_data, dst[i] := ( a[i] == b[i] ) ? 0xFF : 0.
  __m128i rv_mask = _mm_cmpeq_epi8(seg_data, key_data);

  // collapses 16 msb bits from each byte in rv_mask into mask.
  int mask = _mm_movemask_epi8(rv_mask);

  // Note: Last 2 operations can be combined in skylake with _mm_cmpeq_epi8_mask.
  return mask;
}
#endif

// Bucket slot array goes from left to right: [x, x, ...]
// Shift right vacates the first slot on the left by shifting all the elements right and
// possibly deleting the last one on the right.
template <unsigned NUM_SLOTS> bool BucketBase<NUM_SLOTS>::ShiftRight() {
  for (int i = NUM_SLOTS - 1; i > 0; --i) {
    finger_arr_[i] = finger_arr_[i - 1];
  }

  // confusing but correct - slot bit mask LSB corresponds to left part of slot array.
  // therefore, we shift left slot mask.
  bool res = slotb_.ShiftLeft();
  assert(slotb_.FindEmptySlot() == 0);
  return res;
}

template <unsigned NUM_SLOTS>
template <typename F>
auto BucketBase<NUM_SLOTS>::IterateStash(uint8_t fp, bool is_probe, F&& func) const
    -> ::std::pair<unsigned, SlotId> {
  unsigned om = is_probe ? stash_probe_mask_ : ~stash_probe_mask_;
  unsigned ob = stash_busy_;

  for (unsigned i = 0; i < kStashFpLen; ++i) {
    if ((ob & 1) && (stash_arr_[i] == fp) && (om & 1)) {
      unsigned pos = (stash_pos_ >> (i * 2)) & 3;
      auto sid = func(i, pos);
      if (sid != BucketBase::kNanSlot) {
        return std::pair<unsigned, SlotId>(pos, sid);
      }
    }
    ob >>= 1;
    om >>= 1;
  }
  return {0, BucketBase::kNanSlot};
}

template <unsigned NUM_SLOTS> void VersionedBB<NUM_SLOTS>::SetVersion(uint64_t version) {
  absl::little_endian::Store64(version_, version);
}

/*
____ ____ ____ _  _ ____ _  _ ___
[__  |___ | __ |\/| |___ |\ |  |
___] |___ |__] |  | |___ | \|  |

*/

// for clang ignore -Wunused-lambda-capture
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-lambda-capture"
#endif

template <typename Key, typename Value, typename Policy>
template <typename Pred>
auto Segment<Key, Value, Policy>::Bucket::FindByFp(uint8_t fp_hash, bool probe, Pred&& pred) const
    -> SlotId {
  unsigned mask = this->Find(fp_hash, probe);
  if (!mask)
    return kNanSlot;

  unsigned delta = __builtin_ctz(mask);
  mask >>= delta;
  for (unsigned i = delta; i < kSlotNum; ++i) {
    // Filterable just by key
    if constexpr (std::is_invocable_v<Pred, const Key_t&>) {
      if ((mask & 1) && pred(key[i]))
        return i;
    }

    // Filterable by key and value
    if constexpr (std::is_invocable_v<Pred, const Key_t&, const Value_t&>) {
      if ((mask & 1) && pred(key[i], value[i]))
        return i;
    }

    mask >>= 1;
  };

  return kNanSlot;
}

template <typename Key, typename Value, typename Policy>
bool Segment<Key, Value, Policy>::Bucket::ShiftRight() {
  bool res = BucketType::ShiftRight();
  for (int i = kSlotNum - 1; i > 0; i--) {
    std::swap(key[i], key[i - 1]);
    std::swap(value[i], value[i - 1]);
  }
  return res;
}

// stash_pos is index of the stash bucket, in the range of [0, STASH_BUCKET_NUM).
template <typename Key, typename Value, typename Policy>
void Segment<Key, Value, Policy>::RemoveStashReference(unsigned stash_pos, Hash_t key_hash) {
  LogicalBid y = HomeIndex(key_hash);
  uint8_t fp_hash = key_hash & kFpMask;
  auto* target = &bucket_[y];
  auto* next = &bucket_[NextBid(y)];

  target->UnsetStashPtr(fp_hash, stash_pos, next);
}

template <typename Key, typename Value, typename Policy>
auto Segment<Key, Value, Policy>::TryMoveFromStash(unsigned stash_id, unsigned stash_slot_id,
                                                   Hash_t key_hash) -> Iterator {
  LogicalBid bid = HomeIndex(key_hash);
  uint8_t hash_fp = key_hash & kFpMask;
  PhysicalBid stash_bid = kBucketNum + stash_id;
  auto& key = Key(stash_bid, stash_slot_id);
  auto& value = Value(stash_bid, stash_slot_id);

  int reg_slot = bucket_[bid].TryInsertToBucket(std::forward<Key_t>(key),
                                                std::forward<Value_t>(value), hash_fp, false);

  if (reg_slot < 0) {
    bid = NextBid(bid);
    reg_slot = bucket_[bid].TryInsertToBucket(std::forward<Key_t>(key),
                                              std::forward<Value_t>(value), hash_fp, true);
  }

  if (reg_slot >= 0) {
    if constexpr (kUseVersion) {
      // We maintain the invariant for the physical bucket by updating the version when
      // the entries move between buckets.
      uint64_t ver = bucket_[stash_bid].GetVersion();
      bucket_[bid].UpdateVersion(ver);
    }
    RemoveStashReference(stash_id, key_hash);
    return Iterator{bid, SlotId(reg_slot)};
  }

  return Iterator{};
}

template <typename Key, typename Value, typename Policy>
template <typename U, typename V, typename Pred, typename OnMoveCb>
auto Segment<Key, Value, Policy>::Insert(U&& key, V&& value, Hash_t key_hash, Pred&& pred,
                                         OnMoveCb&& on_move_cb) -> std::pair<Iterator, bool> {
  Iterator it = FindIt(key_hash, pred);
  if (it.found()) {
    return std::make_pair(it, false); /* duplicate insert*/
  }

  it = InsertUniq(std::forward<U>(key), std::forward<V>(value), key_hash, true,
                  std::forward<OnMoveCb>(on_move_cb));

  return std::make_pair(it, it.found());
}

template <typename Key, typename Value, typename Policy>
template <typename Pred>
auto Segment<Key, Value, Policy>::FindIt(Hash_t key_hash, Pred&& pred) const -> Iterator {
  LogicalBid bidx = HomeIndex(key_hash);
  const Bucket& target = bucket_[bidx];

  // It helps a bit (10% on my home machine) and more importantly, it does not hurt
  // since we are going to access this memory in a bit.
  __builtin_prefetch(&target);

  uint8_t fp_hash = key_hash & kFpMask;
  SlotId sid = target.FindByFp(fp_hash, false, pred);
  if (sid != BucketType::kNanSlot) {
    return Iterator{bidx, sid};
  }

  LogicalBid nid = NextBid(bidx);
  const Bucket& probe = GetBucket(nid);

  sid = probe.FindByFp(fp_hash, true, pred);

#ifdef ENABLE_DASH_STATS
  stats.neighbour_probes++;
#endif

  if (sid != BucketType::kNanSlot) {
    return Iterator{nid, sid};
  }

  if (!target.HasStash()) {
    return Iterator{};
  }

  auto stash_cb = [&](unsigned overflow_index, PhysicalBid pos) -> SlotId {
    assert(pos < kStashBucketNum);

    pos += kBucketNum;
    const Bucket& bucket = bucket_[pos];
    return bucket.FindByFp(fp_hash, false, pred);
  };

  if (target.HasStashOverflow()) {
#ifdef ENABLE_DASH_STATS
    stats.stash_overflow_probes++;
#endif

    for (unsigned i = 0; i < kStashBucketNum; ++i) {
      auto sid = stash_cb(0, i);
      if (sid != BucketType::kNanSlot) {
        return Iterator{PhysicalBid(kBucketNum + i), sid};
      }
    }

    // We exit because we searched through all stash buckets anyway, no need to use overflow fps.
    return Iterator{};
  }

#ifdef ENABLE_DASH_STATS
  stats.stash_probes++;
#endif

  auto stash_res = target.IterateStash(fp_hash, false, stash_cb);
  if (stash_res.second != BucketType::kNanSlot) {
    return Iterator{PhysicalBid(kBucketNum + stash_res.first), stash_res.second};
  }

  stash_res = probe.IterateStash(fp_hash, true, stash_cb);
  if (stash_res.second != BucketType::kNanSlot) {
    return Iterator{PhysicalBid(kBucketNum + stash_res.first), stash_res.second};
  }
  return Iterator{};
}

template <typename Key, typename Value, typename Policy>
void Segment<Key, Value, Policy>::Prefetch(Hash_t key_hash) const {
  LogicalBid bidx = HomeIndex(key_hash);
  const Bucket& target = bucket_[bidx];

  // Prefetch the home bucket that might hold the key with high probability.
  __builtin_prefetch(&target, 0, 1);
}

template <typename Key, typename Value, typename Policy>
template <typename Cb>
void Segment<Key, Value, Policy>::TraverseAll(Cb&& cb) const {
  for (uint8_t i = 0; i < kTotalBuckets; ++i) {
    bucket_[i].ForEachSlot([&](auto*, SlotId slot, bool) { cb(Iterator{i, slot}); });
  }
}

template <typename Key, typename Value, typename Policy> void Segment<Key, Value, Policy>::Clear() {
  for (unsigned i = 0; i < kTotalBuckets; ++i) {
    bucket_[i].Clear();
    bucket_[i].ClearStashPtrs();
  }
}

template <typename Key, typename Value, typename Policy>
void Segment<Key, Value, Policy>::Delete(const Iterator& it, Hash_t key_hash) {
  assert(it.found());

  auto& b = bucket_[it.index];

  if (it.index >= kBucketNum) {
    RemoveStashReference(it.index - kBucketNum, key_hash);
  }

  b.Delete(it.slot);
}

// Split items from the left segment to the right during the growth phase.
// right segment will have all the items with lsb at local_depth ==1 .
template <typename Key, typename Value, typename Policy>
template <typename HFunc, typename MoveCb>
void Segment<Key, Value, Policy>::Split(HFunc&& hfn, Segment* dest_right, MoveCb&& on_move_cb) {
  ++local_depth_;
  dest_right->local_depth_ = local_depth_;

  // versioning does not work when entries move across buckets.
  // we need to setup rules on how we do that
  // do_versioning();
  auto is_mine = [this](Hash_t hash) { return (hash >> (64 - local_depth_) & 1) == 0; };

  auto update_version = [dest_right](const Bucket& src, PhysicalBid dest_id) {
    (void)dest_id;
    if constexpr (kUseVersion) {
      // Maintaining consistent versioning.
      uint64_t ver = src.GetVersion();
      dest_right->bucket_[dest_id].UpdateVersion(ver);
    }
  };

  for (unsigned i = 0; i < kBucketNum; ++i) {
    uint32_t invalid_mask = 0;

    auto cb = [&](auto* bucket, unsigned slot, bool probe) {
      auto& key = bucket->key[slot];
      Hash_t hash = hfn(key);

      // we extract local_depth bits from the left part of the hash. Since we extended local_depth,
      // we added an additional bit to the right, therefore we need to look at lsb of the extract.
      if (is_mine(hash))
        return;  // keep this key in the source

      invalid_mask |= (1u << slot);

      // We pass dummy callback because we are not interested to track movements in the newly
      // created segment.
      Iterator it = dest_right->InsertUniq(std::forward<Key_t>(bucket->key[slot]),
                                           std::forward<Value_t>(bucket->value[slot]), hash, false,
                                           [](auto&&...) {});

      // we move items residing in a regular bucket to a new segment.
      // Note 1: in case we are somehow attacked with items that after the split
      // will go into the same segment, we may have a problem.
      // It is highly unlikely that this happens with real world data.
      // Note 2: Dragonfly replication is in fact is such unlikely attack. Since we go over
      // the source table in a special order (go over all the segments for bucket 0,
      // then for all the segments for bucket 1 etc), what happens is that the rdb stream is full
      // of items with the same bucket id, say 0. Lots of items will go to the initial segment
      // into bucket 0, which will become full, then bucket 1 will get full,
      // and then the 4 stash buckets in the segment. Then the segment will have to split even
      // though only 6 buckets are used just because of this
      // extreme skewness of keys distribution. When a segment splits, we will still
      // have items going into bucket 0 in the new segment. To alleviate this effect we usually
      // reserve dash table to have enough segments during full sync to avoid handling those
      // ill-formed splits.
      // TODO: To protect ourselves again such situations we should use random seed
      // for our dash hash function, thus avoiding the case where someone, on purpose or due to
      // selective bias will be able to hit our dashtable with items with the same bucket id.
      assert(it.found());
      update_version(*bucket, it.index);
      on_move_cb(segment_id_, i, dest_right->segment_id_, it.index);
    };

    bucket_[i].ForEachSlot(std::move(cb));
    bucket_[i].ClearSlots(invalid_mask);
  }

  for (unsigned i = 0; i < kStashBucketNum; ++i) {
    uint32_t invalid_mask = 0;
    PhysicalBid bid = kBucketNum + i;
    Bucket& stash = bucket_[bid];

    auto cb = [&](auto* bucket, unsigned slot, bool probe) {
      auto& key = bucket->key[slot];
      Hash_t hash = hfn(key);

      if (is_mine(hash)) {
        // If the entry stays in the same segment we try to unload it back to the regular bucket.
        Iterator it = TryMoveFromStash(i, slot, hash);
        if (it.found()) {
          invalid_mask |= (1u << slot);
          on_move_cb(segment_id_, i, segment_id_, it.index);
        }

        return;
      }

      invalid_mask |= (1u << slot);
      auto it = dest_right->InsertUniq(std::forward<Key_t>(bucket->key[slot]),
                                       std::forward<Value_t>(bucket->value[slot]), hash, false,
                                       /* not interested in these movements */ [](auto&&...) {});
      (void)it;
      assert(it.index != kNanBid);
      update_version(*bucket, it.index);
      on_move_cb(segment_id_, i, dest_right->segment_id_, it.index);

      // Remove stash reference pointing to stash bucket i.
      RemoveStashReference(i, hash);
    };

    stash.ForEachSlot(std::move(cb));
    stash.ClearSlots(invalid_mask);
  }
}

template <typename Key, typename Value, typename Policy>
int Segment<Key, Value, Policy>::MoveToOther(bool own_items, unsigned from_bid, unsigned to_bid) {
  assert(from_bid < kBucketNum && to_bid < kBucketNum);
  auto& src = bucket_[from_bid];
  uint32_t mask = src.GetProbe(!own_items);
  if (mask == 0) {
    return -1;
  }

  int src_slot = __builtin_ctz(mask);
  int dst_slot = bucket_[to_bid].TryInsertToBucket(std::forward<Key_t>(src.key[src_slot]),
                                                   std::forward<Value_t>(src.value[src_slot]),
                                                   src.Fp(src_slot), own_items);
  if (dst_slot < 0)
    return -1;

  // We never decrease the version of the entry.
  if constexpr (kUseVersion) {
    auto& dst = bucket_[to_bid];
    dst.UpdateVersion(src.GetVersion());
  }

  src.Delete(src_slot);

  return src_slot;
}

template <typename Key, typename Value, typename Policy>
bool Segment<Key, Value, Policy>::CheckIfMovesToOther(bool own_items, unsigned from,
                                                      unsigned to) const {
  const auto& src = GetBucket(from);
  uint32_t mask = src.GetProbe(!own_items);
  if (mask == 0) {
    return false;
  }

  const auto& dest = GetBucket(to);
  return dest.IsFull() ? false : true;
}

template <typename Key, typename Value, typename Policy>
template <typename U, typename V, typename OnMoveCb>
auto Segment<Key, Value, Policy>::InsertUniq(U&& key, V&& value, Hash_t key_hash, bool spread,
                                             OnMoveCb&& on_move_cb) -> Iterator {
  const uint8_t bid = HomeIndex(key_hash);
  const uint8_t nid = NextBid(bid);

  Bucket& target = bucket_[bid];
  Bucket& neighbor = bucket_[nid];
  Bucket* insert_first = &target;

  uint8_t meta_hash = key_hash & kFpMask;
  unsigned ts = target.Size(), ns = neighbor.Size();
  bool probe = false;

  if (spread && ts > ns) {
    insert_first = &neighbor;
    probe = true;
  }

  int slot = insert_first->TryInsertToBucket(std::forward<U>(key), std::forward<V>(value),
                                             meta_hash, probe);

  if (slot >= 0) {
    return Iterator{PhysicalBid(insert_first - bucket_), uint8_t(slot)};
  }

  if (!spread) {
    int slot =
        neighbor.TryInsertToBucket(std::forward<U>(key), std::forward<V>(value), meta_hash, true);
    if (slot >= 0) {
      return Iterator{nid, uint8_t(slot)};
    }
  }

  int displace_index = MoveToOther(true, nid, NextBid(nid));
  if (displace_index >= 0) {
    neighbor.Insert(displace_index, std::forward<U>(key), std::forward<V>(value), meta_hash, true);
    on_move_cb(segment_id_, nid, NextBid(nid));
    return Iterator{nid, uint8_t(displace_index)};
  }

  unsigned prev_idx = PrevBid(bid);
  displace_index = MoveToOther(false, bid, prev_idx);
  if (displace_index >= 0) {
    target.Insert(displace_index, std::forward<U>(key), std::forward<V>(value), meta_hash, false);
    on_move_cb(segment_id_, bid, prev_idx);
    return Iterator{bid, uint8_t(displace_index)};
  }

  // we balance stash fill rate  by starting from y % STASH_BUCKET_NUM.
  for (unsigned i = 0; i < kStashBucketNum; ++i) {
    unsigned stash_pos = (bid + i) % kStashBucketNum;

    int stash_slot = bucket_[kBucketNum + stash_pos].TryInsertToBucket(
        std::forward<U>(key), std::forward<V>(value), meta_hash, false);
    if (stash_slot >= 0) {
      target.SetStashPtr(stash_pos, meta_hash, &neighbor);
      return Iterator{PhysicalBid(kBucketNum + stash_pos), uint8_t(stash_slot)};
    }
  }

  return Iterator{};
}

template <typename Key, typename Value, typename Policy>
template <bool UV>
std::enable_if_t<UV, unsigned> Segment<Key, Value, Policy>::CVCOnInsert(uint64_t ver_threshold,
                                                                        Hash_t key_hash,
                                                                        uint8_t bid_res[2]) const {
  const LogicalBid bid = HomeIndex(key_hash);
  const LogicalBid nid = NextBid(bid);

  const Bucket& target = GetBucket(bid);
  const Bucket& neighbor = GetBucket(nid);
  uint8_t first = target.Size() > neighbor.Size() ? nid : bid;

  const Bucket& bfirst = bucket_[first];
  if (!bfirst.IsFull()) {
    unsigned cnt = 0;
    if (!bfirst.IsEmpty() && bfirst.GetVersion() < ver_threshold) {
      bid_res[cnt++] = first;
    }
    return cnt;
  }

  // both nid and bid are full.
  const LogicalBid after_next = NextBid(nid);

  auto do_fun = [this, ver_threshold, &bid_res](auto bid, auto nid) {
    unsigned cnt = 0;
    // We could tighten the checks here and below because
    // if nid is less than ver_threshold, than nid won't be affected and won't cross
    // ver_threshold as well.
    if (GetBucket(bid).GetVersion() < ver_threshold)
      bid_res[cnt++] = bid;

    if (!GetBucket(nid).IsEmpty() && GetBucket(nid).GetVersion() < ver_threshold)
      bid_res[cnt++] = nid;
    return cnt;
  };

  if (CheckIfMovesToOther(true, nid, after_next)) {
    return do_fun(nid, after_next);
  }

  const uint8_t prev_bid = PrevBid(bid);
  if (CheckIfMovesToOther(false, bid, prev_bid)) {
    return do_fun(bid, prev_bid);
  }

  // Important to repeat exactly the insertion logic of InsertUnique.
  for (unsigned i = 0; i < kStashBucketNum; ++i) {
    PhysicalBid stash_bid = kBucketNum + ((bid + i) % kStashBucketNum);
    const Bucket& stash = GetBucket(stash_bid);
    if (!stash.IsFull()) {
      unsigned cnt = 0;
      if (!stash.IsEmpty() && stash.GetVersion() < ver_threshold)
        bid_res[cnt++] = stash_bid;

      return cnt;
    }
  }

  return UINT16_MAX;
}

template <typename Key, typename Value, typename Policy>
template <bool UV>
std::enable_if_t<UV, unsigned> Segment<Key, Value, Policy>::CVCOnBump(uint64_t ver_threshold,
                                                                      unsigned bid, unsigned slot,
                                                                      Hash_t hash,
                                                                      uint8_t result_bid[3]) const {
  if (bid < kBucketNum) {
    // Right now we do not migrate entries from nid to bid, only from stash to normal buckets.
    // The reason for this is that CVCOnBump implementation swaps the slots of the same bucket
    // so there is no further action needed.
    return 0;
  }

  // Stash case.
  // There are three actors (interesting buckets). The stash bucket, the target bucket and its
  // adjacent bucket (probe). To understand the code below consider the cases in CVCOnBump:
  // 1. If the bid is not a stash bucket, then just swap the slots of the target.
  // 2. If there is empty space in target or probe bucket insert the slot there and remove
  //    it from the stash bucket.
  // 3. If there is no empty space then we need to swap slots with either the target or the probe
  //    bucket. Furthermore, if the target or the probe have one of their stash bits reference the
  //    stash, then the stash bit entry is cleared. In total 2 buckets are modified.
  // Case 1 is handled by the if statement above and cases 2 and 3 below. We should return via
  // result_bid all the buckets(with version less than threshold) that CVCOnBump will modify.
  // Note, that for case 2 & 3 we might return an extra bucket id even though this bucket was not
  // changed. An example of that is TryMoveFromStash which will first try to insert on the target
  // bucket and if that fails it will retry with the probe bucket. Since we don't really know
  // which of the two we insert to we are pesimistic and assume that both of them got modified. I
  // suspect we could optimize this out by looking at the fingerprints but for now I care about
  // correctness and returning the correct modified buckets. Besides, we are on a path of updating
  // the version anyway which will assert that the bucket won't be send again during snapshotting.
  unsigned result = 0;
  if (bucket_[bid].GetVersion() < ver_threshold) {
    result_bid[result++] = bid;
  }
  const uint8_t target_bid = HomeIndex(hash);
  result_bid[result++] = target_bid;
  const uint8_t probing_bid = NextBid(target_bid);
  result_bid[result++] = probing_bid;

  return result;
}

template <typename Key, typename Value, typename Policy>
template <typename Cb>
void Segment<Key, Value, Policy>::TraverseBucket(PhysicalBid bid, Cb&& cb) {
  assert(bid < kTotalBuckets);

  const Bucket& b = GetBucket(bid);
  b.ForEachSlot([&](auto* bucket, uint8_t slot, bool probe) { cb(Iterator{bid, slot}); });
}

template <typename Key, typename Value, typename Policy>
template <typename Cb, typename HashFn>
bool Segment<Key, Value, Policy>::TraverseLogicalBucket(LogicalBid bid, HashFn&& hfun,
                                                        Cb&& cb) const {
  assert(bid < kBucketNum);

  const Bucket& b = bucket_[bid];
  bool found = false;
  if (b.GetProbe(false)) {  // Check items that this bucket owns.
    b.ForEachSlot([&](auto* bucket, SlotId slot, bool probe) {
      if (!probe) {
        found = true;
        cb(Iterator{bid, slot});
      }
    });
  }

  uint8_t nid = NextBid(bid);
  const Bucket& next = GetBucket(nid);

  // check for probing entries in the next bucket, i.e. those that should reside in b.
  if (next.GetProbe(true)) {
    next.ForEachSlot([&](auto* bucket, SlotId slot, bool probe) {
      if (probe) {
        found = true;
        assert(HomeIndex(hfun(bucket->key[slot])) == bid);
        cb(Iterator{nid, slot});
      }
    });
  }

  // Finally go over stash buckets and find those entries that belong to b.
  if (b.HasStash()) {
    // do not bother with overflow fps. Just go over all the stash buckets.
    for (uint8_t j = kBucketNum; j < kTotalBuckets; ++j) {
      const auto& stashb = bucket_[j];
      stashb.ForEachSlot([&](auto* bucket, SlotId slot, bool probe) {
        if (HomeIndex(hfun(bucket->key[slot])) == bid) {
          found = true;
          cb(Iterator{j, slot});
        }
      });
    }
  }

  return found;
}

template <typename Key, typename Value, typename Policy>
size_t Segment<Key, Value, Policy>::SlowSize() const {
  size_t res = 0;
  for (unsigned i = 0; i < kTotalBuckets; ++i) {
    res += bucket_[i].Size();
  }
  return res;
}

template <typename Key, typename Value, typename Policy>
auto Segment<Key, Value, Policy>::FindValidStartingFrom(PhysicalBid bid, unsigned slot) const
    -> Iterator {
  while (bid < kTotalBuckets) {
    uint32_t mask = bucket_[bid].GetBusy();
    mask >>= slot;
    if (mask) {
      return Iterator(bid, slot + __builtin_ctz(mask));
    }
    ++bid;
    slot = 0;
  }
  return Iterator{};
}

template <typename Key, typename Value, typename Policy>
template <typename BumpPolicy, typename OnMoveCb>
auto Segment<Key, Value, Policy>::BumpUp(uint8_t bid, SlotId slot, Hash_t key_hash,
                                         const BumpPolicy& bp, OnMoveCb&& on_move_cb) -> Iterator {
  auto& from = GetBucket(bid);

  if (!bp.CanBump(from.key[slot])) {
    return Iterator{bid, slot};
  }

  if (bid < kBucketNum) {
    // non stash case.
    if (slot > 0 && bp.CanBump(from.key[slot - 1])) {
      from.Swap(slot - 1, slot);
      return Iterator{bid, uint8_t(slot - 1)};
    }
    // TODO: We could promote further, by swapping probing bucket with its previous one.
    return Iterator{bid, slot};
  }

  // stash bucket
  // We swap the item with the item in the "normal" bucket in the last slot.
  unsigned stash_pos = bid - kBucketNum;

  // If we have an empty space for some reason just unload the stash entry.
  if (Iterator it = TryMoveFromStash(stash_pos, slot, key_hash); it.found()) {
    // TryMoveFromStash handles versions internally.
    from.Delete(slot);
    on_move_cb(segment_id_, bid, it.index);
    return it;
  }

  uint8_t target_bid = HomeIndex(key_hash);
  uint8_t nid = NextBid(target_bid);
  uint8_t fp_hash = key_hash & kFpMask;
  assert(fp_hash == from.Fp(slot));

  // determine which bucket one we gonna swap.
  // we swap with the bucket the references the stash entry, not necessary its owning
  // bucket.
  auto& target = bucket_[target_bid];
  auto& next = bucket_[nid];

  // bucket_offs - 0 if exact bucket, 1 if neighbour
  unsigned bucket_offs = target.UnsetStashPtr(fp_hash, stash_pos, &next);
  uint8_t swap_bid = (target_bid + bucket_offs) % kBucketNum;
  auto& swapb = bucket_[swap_bid];

  constexpr unsigned kLastSlot = kSlotNum - 1;
  assert(swapb.GetBusy() & (1 << kLastSlot));

  // Don't move sticky items back to the stash because they're not evictable
  // TODO: search for first swappable item
  if (!bp.CanBump(swapb.key[kLastSlot])) {
    target.SetStashPtr(stash_pos, fp_hash, &next);
    return Iterator{bid, slot};
  }

  uint8_t swap_fp = swapb.Fp(kLastSlot);

  // is_probing for the existing entry in swapb. It's unrelated to bucket_offs,
  // i.e. it could be true even if bucket_offs is 0.
  bool is_probing = swapb.GetProbe(true) & (1 << kLastSlot);

  // swap keys, values and fps. update slots meta.
  std::swap(from.key[slot], swapb.key[kLastSlot]);
  std::swap(from.value[slot], swapb.value[kLastSlot]);
  from.Delete(slot);
  from.SetHash(slot, swap_fp, false);

  swapb.Delete(kLastSlot);
  swapb.SetHash(kLastSlot, fp_hash, bucket_offs == 1);

  // update versions.
  if constexpr (kUseVersion) {
    uint64_t from_ver = from.GetVersion();
    uint64_t swap_ver = swapb.GetVersion();
    if (from_ver < swap_ver) {
      from.SetVersion(swap_ver);
    } else {
      swapb.SetVersion(from_ver);
    }
  }

  // update ptr for swapped items
  if (is_probing) {
    LogicalBid prev_bid = PrevBid(swap_bid);
    auto& prevb = bucket_[prev_bid];
    prevb.SetStashPtr(stash_pos, swap_fp, &swapb);
  } else {
    // stash_ptr resides in the current or the next bucket.
    LogicalBid next_bid = NextBid(swap_bid);
    swapb.SetStashPtr(stash_pos, swap_fp, bucket_ + next_bid);
  }

  on_move_cb(segment_id_, bid, swap_bid);
  on_move_cb(segment_id_, swap_bid, bid);
  return Iterator{swap_bid, kLastSlot};
}

template <typename Key, typename Value, typename Policy>
template <typename HFunc, typename OnMoveCb>
unsigned Segment<Key, Value, Policy>::UnloadStash(HFunc&& hfunc, OnMoveCb&& on_move_cb) {
  unsigned moved = 0;

  for (unsigned i = 0; i < kStashBucketNum; ++i) {
    unsigned bid = kBucketNum + i;
    Bucket& stash = bucket_[bid];
    uint32_t invalid_mask = 0;

    auto cb = [&](auto* bucket, unsigned slot, bool probe) {
      auto& key = bucket->key[slot];
      Hash_t hash = hfunc(key);
      Iterator res = TryMoveFromStash(i, slot, hash);
      if (res.found()) {
        ++moved;
        invalid_mask |= (1u << slot);
        on_move_cb(segment_id_, i, res.index);
      }
    };

    stash.ForEachSlot(cb);
    stash.ClearSlots(invalid_mask);
  }

  return moved;
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/dash_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <cstdint>
#define ENABLE_DASH_STATS

#include <absl/container/flat_hash_map.h>
#include <absl/strings/str_cat.h>
#include <mimalloc.h>

#include <functional>
#include <set>

#include "base/gtest.h"
#include "base/hash.h"
#include "base/logging.h"
#include "base/zipf_gen.h"
#include "core/dash.h"
#include "io/file.h"
#include "io/line_reader.h"

extern "C" {
#include "redis/dict.h"
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

#if defined(__clang__)
#pragma clang diagnostic ignored "-Wunused-const-variable"
#endif

namespace dfly {

static uint64_t callbackHash(const void* key) {
  return XXH64(&key, sizeof(key), 0);
}

template <typename K> auto EqTo(const K& key) {
  return [&key](const auto& probe) { return key == probe; };
}

static dictType IntDict = {callbackHash, NULL, NULL, NULL, NULL, NULL, NULL};

static uint64_t dictSdsHash(const void* key) {
  return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
}

static int dictSdsKeyCompare(dict*, const void* key1, const void* key2) {
  int l1, l2;

  l1 = sdslen((sds)key1);
  l2 = sdslen((sds)key2);
  if (l1 != l2)
    return 0;
  return memcmp(key1, key2, l1) == 0;
}

static dictType SdsDict = {
    dictSdsHash,       /* hash function */
    NULL,              /* key dup */
    NULL,              /* val dup */
    dictSdsKeyCompare, /* key compare */
    NULL,
    // dictSdsDestructor, /* key destructor */
    NULL, /* val destructor */
    NULL,
};

using namespace std;
struct Buf24 {
  char buf[20];
  uint32_t index;

  Buf24(uint32_t i = 0) : index(i) {
  }
};

struct BasicDashPolicy {
  enum { kSlotNum = 12, kBucketNum = 64 };
  static constexpr bool kUseVersion = false;

  template <typename U> static void DestroyValue(const U&) {
  }
  template <typename U> static void DestroyKey(const U&) {
  }

  template <typename U, typename V> static bool Equal(U&& u, V&& v) {
    return u == v;
  }
};
struct UInt64Policy : public BasicDashPolicy {
  static uint64_t HashFn(uint64_t v) {
    return XXH3_64bits(&v, sizeof(v));
  }
};

class CappedResource final : public PMR_NS::memory_resource {
 public:
  explicit CappedResource(size_t cap) : cap_(cap) {
  }

  size_t used() const {
    return used_;
  }

 private:
  void* do_allocate(std::size_t size, std::size_t align) {
    if (used_ + size > cap_)
      throw std::bad_alloc{};

    void* res = PMR_NS::get_default_resource()->allocate(size, align);
    used_ += size;

    return res;
  }

  void do_deallocate(void* ptr, std::size_t size, std::size_t align) {
    used_ -= size;
    PMR_NS::get_default_resource()->deallocate(ptr, size, align);
  }

  bool do_is_equal(const PMR_NS::memory_resource& o) const noexcept {
    return this == &o;
  }

  size_t cap_;
  size_t used_ = 0;
};

using Segment = detail::Segment<uint64_t, Buf24>;
using Dash64 = DashTable<uint64_t, uint64_t, UInt64Policy>;

struct RelaxedBumpPolicy {
  bool CanBump(uint64_t key) const {
    return true;
  }
  void OnMove(Dash64::Cursor source, Dash64::Cursor dest) {
  }
};

constexpr auto kSegTax = Segment::kTaxSize;
constexpr size_t kMaxSize = Segment::kMaxSize;
constexpr size_t kSegSize = sizeof(Segment);

class DashTest : public testing::Test {
 protected:
  static void SetUpTestSuite() {
    init_zmalloc_threadlocal(mi_heap_get_backing());
  }

  DashTest() : segment_(1, 0, PMR_NS::get_default_resource()) {
  }

  bool Find(Segment::Key_t key, Segment::Value_t* val) const {
    uint64_t hash = dt_.DoHash(key);

    auto it = segment_.FindIt(hash, EqTo(key));
    if (!it.found())
      return false;
    *val = segment_.Value(it.index, it.slot);
    return true;
  }

  bool Contains(Segment::Key_t key) const {
    uint64_t hash = dt_.DoHash(key);
    auto it = segment_.FindIt(hash, EqTo(key));
    return it.found();
  }

  set<Segment::Key_t> FillSegment(unsigned bid);

  Segment segment_;
  Dash64 dt_;
};

set<Segment::Key_t> DashTest::FillSegment(unsigned bid) {
  std::set<Segment::Key_t> keys;
  for (Segment::Key_t key = 0; key < 1000000u; ++key) {
    uint64_t hash = dt_.DoHash(key);
    unsigned bi = (hash >> 8) % Segment::kBucketNum;
    if (bi != bid)
      continue;
    uint8_t fp = hash & 0xFF;
    if (fp > 2)  // limit fps considerably to find interesting cases.
      continue;
    auto [it, success] = segment_.Insert(key, 0, hash, EqTo(key), [](auto&&...) {});
    if (!success) {
      LOG(INFO) << "Stopped at " << key;
      break;
    }
    CHECK(it.found());
    keys.insert(key);
  }

  return keys;
}

TEST_F(DashTest, Hash) {
  for (uint64_t i = 0; i < 100; ++i) {
    uint64_t hash = dt_.DoHash(i);
    if (hash >> 63) {
      VLOG(1) << "i " << i << ", Hash " << hash;
    }
  }
}

TEST_F(DashTest, SlotBitmap) {
  detail::SlotBitmap<14> slot;
  slot.SetSlot(1, true);
  slot.SetSlot(5, false);
  EXPECT_EQ(34, slot.GetBusy());
  EXPECT_EQ(2, slot.GetProbe(true));
}

TEST_F(DashTest, Basic) {
  Segment::Key_t key = 0;
  Segment::Value_t val = 0;
  uint64_t hash = dt_.DoHash(key);

  EXPECT_TRUE(segment_.Insert(key, val, hash, EqTo(key), [](auto&&...) {}).second);
  auto [it, res] = segment_.Insert(key, val, hash, EqTo(key), [](auto&&...) {});
  EXPECT_TRUE(!res && it.found());

  EXPECT_TRUE(Find(key, &val));
  EXPECT_EQ(0, val.index);

  EXPECT_FALSE(Find(1, &val));
  EXPECT_EQ(1, segment_.SlowSize());

  unsigned has_called = 0;
  auto cb = [&](const auto& it) { ++has_called; };

  auto hfun = &UInt64Policy::HashFn;

  auto cursor = segment_.TraverseLogicalBucket((hash >> 8) % Segment::kBucketNum, hfun, cb);
  ASSERT_EQ(1, has_called);
  ASSERT_EQ(0, segment_.TraverseLogicalBucket(cursor, hfun, cb));
  ASSERT_EQ(1, has_called);
  EXPECT_EQ(0, segment_.GetVersion(0));
}

TEST_F(DashTest, Segment) {
  std::unique_ptr<Segment> seg(new Segment(1, 0, PMR_NS::get_default_resource()));

#ifndef __APPLE__
  LOG(INFO) << "Segment size " << sizeof(Segment)
            << " malloc size: " << malloc_usable_size(seg.get());
#endif

  set<Segment::Key_t> keys = FillSegment(0);

  EXPECT_TRUE(segment_.GetBucket(0).IsFull() && segment_.GetBucket(1).IsFull());
  for (size_t i = 2; i < Segment::kBucketNum; ++i) {
    EXPECT_EQ(0, segment_.GetBucket(i).Size());
  }
  EXPECT_EQ(6 * Segment::kSlotNum, keys.size());
  EXPECT_EQ(6 * Segment::kSlotNum, segment_.SlowSize());

  auto hfun = &UInt64Policy::HashFn;
  unsigned has_called = 0;

  auto cb = [&](const Segment::Iterator& it) {
    ++has_called;
    ASSERT_EQ(1, keys.count(segment_.Key(it.index, it.slot)));
  };

  segment_.TraverseAll(cb);
  ASSERT_EQ(keys.size(), has_called);

  ASSERT_TRUE(segment_.GetBucket(Segment::kBucketNum).IsFull());
  std::array<uint64_t, Segment::kSlotNum * 2> arr;
  uint64_t* next = arr.begin();
  for (unsigned i = Segment::kBucketNum; i < Segment::kBucketNum + 2; ++i) {
    const auto* k = &segment_.Key(i, 0);
    next = std::copy(k, k + Segment::kSlotNum, next);
  }

  for (auto k : arr) {
    auto hash = hfun(k);
    auto it = segment_.FindIt(hash, [&k](const auto& probe) { return k == probe; });
    ASSERT_TRUE(it.found());
    segment_.Delete(it, hash);
  }
  EXPECT_EQ(4 * Segment::kSlotNum, segment_.SlowSize());
  ASSERT_FALSE(Contains(arr.front()));
}

TEST_F(DashTest, SegmentFull) {
  std::equal_to<> eq;
  for (Segment::Key_t key = 8000; key < 15000u; ++key) {
    uint64_t hash = dt_.DoHash(key);
    bool res = segment_.Insert(key, 0, hash, eq, [](auto&&...) {}).second;
    if (!res) {
      LOG(INFO) << "Stopped at " << key;
      break;
    }
  }
  EXPECT_GT(segment_.SlowSize(), Segment::capacity() * 0.85);

  LOG(INFO) << "Utilization " << double(segment_.SlowSize()) / Segment::capacity()
            << " num probing buckets: " << segment_.NumProbingBuckets();

  LOG(INFO) << "NB: " << segment_.stats.neighbour_probes << " SP: " << segment_.stats.stash_probes
            << " SOP: " << segment_.stats.stash_overflow_probes;
  segment_.stats.neighbour_probes = segment_.stats.stash_overflow_probes =
      segment_.stats.stash_probes = 0;
  for (Segment::Key_t key = 0; key < 10000u; ++key) {
    Contains(key);
  }
  LOG(INFO) << segment_.stats.neighbour_probes << " " << segment_.stats.stash_probes << " "
            << segment_.stats.stash_overflow_probes;

  uint32_t busy = segment_.GetBucket(0).GetBusy();
  uint32_t probe = segment_.GetBucket(0).GetProbe(true);

  EXPECT_EQ((1 << 12) - 1, busy);  // Size 12
  EXPECT_EQ(539, probe);           // verified by running since the test is deterministic.

  unsigned keys[12] = {8045, 8085, 8217, 8330, 8337, 8381, 8432, 8506, 8587, 8605, 8612, 8725};
  for (unsigned i = 0; i < 12; ++i) {
    ASSERT_EQ(keys[i], segment_.Key(0, i));
  }
}

TEST_F(DashTest, FirstStash) {
  constexpr unsigned kRegularCapacity = Segment::kBucketNum * Segment::kSlotNum;
  unsigned less_seventy = 0;
  for (unsigned j = 0; j < 100; ++j) {
    unsigned num_items = 0;
    for (unsigned i = 0; i < 1000; ++i) {
      uint64_t key = i + j * 2000;
      uint64_t hash = dt_.DoHash(key);
      auto [it, inserted] = segment_.Insert(key, 0, hash, equal_to<>{}, [](auto&&...) {});
      ASSERT_TRUE(inserted);
      if (it.index >= Segment::kBucketNum) {  // stash iterator
        break;
      }
      ++num_items;
    }
    segment_.Clear();

    // With high probability, we can expect 66% of the keys added without stashes.
    ASSERT_GT(num_items, kRegularCapacity * 0.66);
    if (num_items < kRegularCapacity * 0.7) {
      ++less_seventy;
    }
  }
  LOG(INFO) << "Less than 70% of keys in regular buckets: " << less_seventy;
}

TEST_F(DashTest, Split) {
  // fills segment with maximum keys that must reside in bucket id 0.
  set<Segment::Key_t> keys = FillSegment(0);
  Segment::Value_t val;
  Segment s2{2, 0, PMR_NS::get_default_resource()};  // segment with local depth 2.

  segment_.Split(&UInt64Policy::HashFn, &s2, [](auto&...) {});
  unsigned sum[2] = {0};
  for (auto key : keys) {
    auto eq = [key](const auto& probe) { return key == probe; };
    auto it1 = segment_.FindIt(dt_.DoHash(key), eq);
    auto it2 = s2.FindIt(dt_.DoHash(key), eq);
    ASSERT_NE(it1.found(), it2.found()) << key;

    sum[0] += it1.found();
    sum[1] += it2.found();
  }

  ASSERT_EQ(segment_.SlowSize(), sum[0]);
  EXPECT_EQ(s2.SlowSize(), sum[1]);
  EXPECT_EQ(keys.size(), sum[0] + sum[1]);
  EXPECT_EQ(6 * Segment::kSlotNum, keys.size());
}

TEST_F(DashTest, Merge) {
  constexpr size_t kNumItems = 4000;
  std::vector<uint64_t> keys;

  for (uint64_t i = 0; i < kNumItems; ++i) {
    auto [it, inserted] = dt_.Insert(i, i);
    if (inserted) {
      keys.push_back(i);
    }
  }

  EXPECT_EQ(dt_.depth(), 3);

  // keep only ~5%
  size_t keys_to_keep = keys.size() * 0.05;

  for (size_t i = keys_to_keep; i < keys.size(); ++i) {
    dt_.Erase(keys[i]);
  }

  keys.resize(keys_to_keep);

  EXPECT_EQ(dt_.unique_segments(), 8);
  size_t dir_size = dt_.GetSegmentCount();

  // Iteratively merge segments until all reach depth 1
  // Use multiple passes since merging changes buddy relationships
  while (true) {
    bool merged_any = false;

    for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
      auto* seg = dt_.GetSegment(seg_id);

      size_t local_depth = seg->local_depth();
      if (local_depth == 1)
        continue;

      size_t buddy_id = dt_.FindBuddyId(seg_id);
      if (buddy_id == seg_id)
        continue;

      // Skip if seg_id > buddy_id to avoid processing the same pair twice
      // (FindBuddyId is symmetric, so we see each pair from both directions)
      if (seg_id > buddy_id)
        continue;

      auto* buddy = dt_.GetSegment(buddy_id);

      // Preconditions to merge: (< 25% of capacity)
      size_t combined_size = seg->SlowSize() + buddy->SlowSize();
      size_t safe_threshold = static_cast<size_t>(0.25 * seg->capacity());

      if (combined_size <= safe_threshold) {
        dt_.Merge(seg_id, buddy_id);
        merged_any = true;
      }
    }

    if (!merged_any)
      break;
  }
  EXPECT_EQ(dt_.unique_segments(), 2);
  for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
    auto* seg = dt_.GetSegment(seg_id);
    EXPECT_EQ(seg->local_depth(), 1);
  }

  for (size_t key : keys) {
    EXPECT_EQ(dt_.Find(key).is_done(), false);
  }
  EXPECT_EQ(dt_.bucket_count(), (Segment::kBucketNum + Segment::kStashBucketNum) * 2);
}

TEST_F(DashTest, MergeFailureRollback) {
  std::vector<uint64_t> all_keys;
  std::vector<uint64_t> keep_keys;
  std::vector<uint64_t> buddy_keys;

  // Insert enough items to create 4 segments (depth 2) and fill them more
  for (uint64_t i = 0; i < 5000; ++i) {
    auto [it, inserted] = dt_.Insert(i, i);
    if (inserted) {
      all_keys.push_back(i);
    }
  }

  EXPECT_GE(dt_.depth(), 2);

  unsigned sid = 0;
  size_t buddy_id = dt_.FindBuddyId(sid);
  EXPECT_NE(buddy_id, sid);

  auto* src = dt_.GetSegment(sid);
  auto* buddy = dt_.GetSegment(buddy_id);

  for (uint64_t key : all_keys) {
    auto it = dt_.Find(key);
    if (!it.is_done()) {
      uint64_t hash = dt_.DoHash(key);
      uint32_t seg_id = hash >> (64 - dt_.depth());

      if (seg_id == 0) {
        keep_keys.push_back(key);
      } else if (seg_id == buddy_id) {
        buddy_keys.push_back(key);
      }
    }
  }

  size_t total_size_before = dt_.size();

  bool merge_succeeded = dt_.Merge(sid, buddy_id);

  EXPECT_EQ(dt_.size(), total_size_before);

  // Bucket layout might change after rollback. We only get data parity, not
  // a complete layout rollback.
  // For example, InsertUniq can displace existing items in the keep segment
  // to make room for items being moved from buddy.
  // After rollback, src and buddy pointers should still be valid
  for (auto key : keep_keys) {
    uint64_t hash = dt_.DoHash(key);
    auto it = src->FindIt(hash, EqTo(key));
    EXPECT_TRUE(it.found());
  }

  for (auto key : buddy_keys) {
    uint64_t hash = dt_.DoHash(key);
    auto it = buddy->FindIt(hash, EqTo(key));
    EXPECT_TRUE(it.found());
  }

  EXPECT_FALSE(merge_succeeded);
}

// Verify that FindBuddyId is symmetric: if FindBuddyId(x) = y, then FindBuddyId(y) = x.
TEST_F(DashTest, FindBuddySymmetry) {
  for (uint64_t i = 0; i < 4000; ++i) {
    dt_.Insert(i, i);
  }

  EXPECT_GE(dt_.depth(), 3);
  size_t dir_size = dt_.GetSegmentCount();

  for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
    auto* seg = dt_.GetSegment(seg_id);
    if (seg->local_depth() == 1)
      continue;

    size_t buddy_id = dt_.FindBuddyId(seg_id);
    if (buddy_id == seg_id)
      continue;

    // Symmetry check
    size_t reverse_buddy_id = dt_.FindBuddyId(buddy_id);
    EXPECT_EQ(reverse_buddy_id, seg_id)
        << "FindBuddyId not symmetric: FindBuddyId(" << seg_id << ")=" << buddy_id
        << " but FindBuddyId(" << buddy_id << ")=" << reverse_buddy_id;
  }
}

// Verify dt_.size() is unchanged after merge (items moved, not deleted).
TEST_F(DashTest, MergePreservesSize) {
  for (uint64_t i = 0; i < 4000; ++i) {
    dt_.Insert(i, i);
  }

  // Delete most keys to make merge feasible
  for (uint64_t i = 200; i < 4000; ++i) {
    dt_.Erase(i);
  }

  size_t size_before = dt_.size();
  size_t dir_size = dt_.GetSegmentCount();

  // Do one merge pass
  for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
    auto* seg = dt_.GetSegment(seg_id);
    if (seg->local_depth() == 1)
      continue;

    size_t buddy_id = dt_.FindBuddyId(seg_id);
    if (buddy_id == seg_id || seg_id > buddy_id)
      continue;

    auto* buddy = dt_.GetSegment(buddy_id);
    size_t combined_size = seg->SlowSize() + buddy->SlowSize();
    if (combined_size <= static_cast<size_t>(0.25 * seg->capacity())) {
      bool merged = dt_.Merge(seg_id, buddy_id);
      if (merged) {
        // Size must be unchanged after each merge
        EXPECT_EQ(dt_.size(), size_before)
            << "size changed after merging seg_id=" << seg_id << " buddy_id=" << buddy_id;
      }
    }
  }
}

// After merging, verify all remaining keys are still findable via dt_.Find().
// This tests that directory routing is correct after merge.
TEST_F(DashTest, MergeKeyLookupConsistency) {
  constexpr size_t kNumItems = 4000;
  std::vector<uint64_t> all_keys;

  for (uint64_t i = 0; i < kNumItems; ++i) {
    auto [it, inserted] = dt_.Insert(i, i);
    if (inserted)
      all_keys.push_back(i);
  }

  // Keep only ~10% of keys
  size_t keep_count = all_keys.size() / 10;
  for (size_t i = keep_count; i < all_keys.size(); ++i) {
    dt_.Erase(all_keys[i]);
  }
  all_keys.resize(keep_count);

  size_t dir_size = dt_.GetSegmentCount();

  // Merge all eligible pairs
  bool merged_any = true;
  while (merged_any) {
    merged_any = false;
    for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
      auto* seg = dt_.GetSegment(seg_id);
      if (seg->local_depth() == 1)
        continue;

      size_t buddy_id = dt_.FindBuddyId(seg_id);
      if (buddy_id == seg_id || seg_id > buddy_id)
        continue;

      auto* buddy = dt_.GetSegment(buddy_id);
      size_t combined_size = seg->SlowSize() + buddy->SlowSize();
      if (combined_size <= static_cast<size_t>(0.25 * seg->capacity())) {
        if (dt_.Merge(seg_id, buddy_id)) {
          merged_any = true;
        }
      }
    }
  }

  // All remaining keys must be findable via the table-level Find
  for (uint64_t key : all_keys) {
    auto it = dt_.Find(key);
    EXPECT_FALSE(it.is_done()) << "Key " << key << " not found after merge";
  }
}

// Test that after merging to depth 1, inserting more keys works correctly —
// the table can split again and all data remains intact.
TEST_F(DashTest, MergeAndGrow) {
  constexpr size_t kPhase1 = 4000;
  std::vector<uint64_t> surviving_keys;

  for (uint64_t i = 0; i < kPhase1; ++i) {
    dt_.Insert(i, i);
  }

  // Delete enough to enable merge
  size_t keep_count = kPhase1 / 20;  // ~5%
  for (uint64_t i = keep_count; i < kPhase1; ++i) {
    dt_.Erase(i);
  }
  for (uint64_t i = 0; i < keep_count; ++i) {
    surviving_keys.push_back(i);
  }

  size_t dir_size = dt_.GetSegmentCount();
  bool merged_any = true;
  while (merged_any) {
    merged_any = false;
    for (size_t seg_id = 0; seg_id < dir_size; seg_id++) {
      auto* seg = dt_.GetSegment(seg_id);
      if (seg->local_depth() == 1)
        continue;

      size_t buddy_id = dt_.FindBuddyId(seg_id);
      if (buddy_id == seg_id || seg_id > buddy_id)
        continue;

      auto* buddy = dt_.GetSegment(buddy_id);
      size_t combined = seg->SlowSize() + buddy->SlowSize();
      if (combined <= static_cast<size_t>(0.25 * seg->capacity())) {
        dt_.Merge(seg_id, buddy_id);
        merged_any = true;
      }
    }
  }

  EXPECT_EQ(dt_.unique_segments(), 2);

  // Now insert a new batch — the table should grow (split) again
  constexpr size_t kPhase2 = 3000;
  for (uint64_t i = kPhase1; i < kPhase1 + kPhase2; ++i) {
    auto [it, inserted] = dt_.Insert(i, i);
    if (inserted)
      surviving_keys.push_back(i);
  }

  EXPECT_GT(dt_.depth(), 1);

  // ALL surviving keys must be findable after growth
  for (uint64_t key : surviving_keys) {
    auto it = dt_.Find(key);
    EXPECT_FALSE(it.is_done()) << "Key " << key << " lost after merge+grow";
  }
}

// Verify that after merging, all directory entries that span the merged
// segment range point to the same segment object (the kept one).
TEST_F(DashTest, MergeDirectoryConsistency) {
  // Insert enough for depth 2 (4 segments)
  for (uint64_t i = 0; i < 2000; ++i) {
    dt_.Insert(i, i);
  }

  EXPECT_GE(dt_.depth(), 2);

  // Delete most items to enable merge
  for (uint64_t i = 50; i < 2000; ++i) {
    dt_.Erase(i);
  }

  unsigned keep_id = 0;
  unsigned buddy_id = dt_.FindBuddyId(0);

  if (buddy_id == 0) {
    // No buddy for segment 0 - try segment 2
    keep_id = 2;
    buddy_id = dt_.FindBuddyId(2);
  }

  // Only proceed if we found a mergeable buddy pair
  if (buddy_id != keep_id) {
    auto* keep = dt_.GetSegment(keep_id);
    auto* buddy = dt_.GetSegment(buddy_id);

    if (keep->local_depth() == buddy->local_depth() && keep->local_depth() > 1 &&
        keep_id < buddy_id) {
      uint8_t depth = keep->local_depth();
      size_t combined = keep->SlowSize() + buddy->SlowSize();

      if (combined <= static_cast<size_t>(0.25 * keep->capacity())) {
        bool merged = dt_.Merge(keep_id, buddy_id);
        ASSERT_TRUE(merged);

        // After merge, all dir entries that covered buddy must now point to keep
        auto* kept_seg = dt_.GetSegment(keep_id);
        uint32_t chunk_size = 1u << (dt_.depth() - (depth - 1));
        uint32_t start = keep_id & ~(chunk_size - 1u);

        for (size_t i = start; i < start + chunk_size; ++i) {
          EXPECT_EQ(dt_.GetSegment(i), kept_seg)
              << "Directory entry " << i << " does not point to merged segment";
        }
      }
    }
  }
}

// Test merging a table with global_depth > local_depth (aliased directory entries).
// When a segment at depth D < global_depth is merged with its buddy,
// the merged segment at depth D-1 should span the correct directory range.
TEST_F(DashTest, MergeWithAliasedEntries) {
  // Create depth-3 table (8 dir entries), then merge two depth-3 pairs to get depth-2 segments
  // alongside other depth-3 segments. This creates aliased entries.
  for (uint64_t i = 0; i < 4000; ++i) {
    dt_.Insert(i, i);
  }

  EXPECT_EQ(dt_.depth(), 3);

  // Delete most items
  for (uint64_t i = 200; i < 4000; ++i) {
    dt_.Erase(i);
  }

  // Merge segments 0 and 1 (both at depth 3) -> depth 2 segment spanning entries {0,1}
  auto* seg0 = dt_.GetSegment(0);
  auto* seg1 = dt_.GetSegment(1);

  if (seg0->local_depth() == 3 && seg1->local_depth() == 3) {
    size_t combined = seg0->SlowSize() + seg1->SlowSize();
    size_t threshold = static_cast<size_t>(0.25 * seg0->capacity());

    if (combined <= threshold) {
      bool ok = dt_.Merge(0, 1);
      ASSERT_TRUE(ok);

      // Now segment at entries 0 and 1 is the same depth-2 object
      EXPECT_EQ(dt_.GetSegment(0), dt_.GetSegment(1));
      EXPECT_EQ(dt_.GetSegment(0)->local_depth(), 2);

      // global_depth should still be 3
      EXPECT_EQ(dt_.depth(), 3);

      // Entries 2 and 3 should still be distinct depth-3 segments
      EXPECT_NE(dt_.GetSegment(2), dt_.GetSegment(3));

      // Since entries 2 and 3 are still at depth 3 (not yet merged into a depth-2 segment),
      // the true buddy of the depth-2 segment {0,1} does NOT yet exist.
      // FindBuddyId computes: bit_pos = global_depth(3) - local_depth(2) = 1
      //   FindBuddyId(0) -> buddy_idx = 0^2 = 2, GetSegment(2)->local_depth() = 3 != 2 -> returns 0
      //   FindBuddyId(1) -> buddy_idx = 1^2 = 3, GetSegment(3)->local_depth() = 3 != 2 -> returns 1
      // Both aliased entries correctly report "no buddy" (returning themselves).
      EXPECT_EQ(dt_.FindBuddyId(0), 0u)
          << "No buddy exists for depth-2 segment when entries 2,3 are still depth-3";
      EXPECT_EQ(dt_.FindBuddyId(1), 1u)
          << "Aliased entry 1 of same depth-2 segment also finds no buddy";

      // Now merge entries 2 and 3 to create a second depth-2 segment covering {2,3}
      auto* seg2 = dt_.GetSegment(2);
      auto* seg3 = dt_.GetSegment(3);
      if (seg2 != seg3) {
        size_t combined23 = seg2->SlowSize() + seg3->SlowSize();
        if (combined23 <= static_cast<size_t>(0.25 * seg2->capacity())) {
          bool ok23 = dt_.Merge(2, 3);
          if (ok23) {
            // Now both {0,1} and {2,3} are depth-2 segments — they ARE buddies
            // FindBuddyId(0): bit_pos=1, buddy_idx=0^2=2, GetSegment(2)->local_depth()=2 == 2 -> 2
            // FindBuddyId(2): bit_pos=1, buddy_idx=2^2=0, GetSegment(0)->local_depth()=2 == 2 -> 0
            EXPECT_EQ(dt_.FindBuddyId(0), 2u)
                << "After both pairs merged to depth-2, FindBuddyId(0)=2";
            EXPECT_EQ(dt_.FindBuddyId(2), 0u) << "FindBuddyId(2) should return 0 (symmetric)";
            // Aliased entry 1 looks for buddy at 1^2=3
            EXPECT_EQ(dt_.FindBuddyId(1), 3u) << "FindBuddyId(1) returns 3 (alias buddy)";
          }
        }
      }
    }
  }
}

// Test that FindBuddyId resolves to the same buddy *instance* for all alias ids in a stripe.
//
// When global_depth > local_depth a segment is referenced by a contiguous "stripe" of
// stripe_size = 2^(global_depth - local_depth) directory entries that all point to the
// same segment object.
// The canonical id is the stripe's first entry (lowest index).
//
// FindBuddyId(alias) computes:
//   depth    = GetSegment(alias)->local_depth()    // reads from the instance, same for all
//   bit_pos  = global_depth - depth                // same for every alias in the stripe
//   buddy_ix = alias ^ (1 << bit_pos)              // XOR differs per alias
//
// For a stripe starting at canonical id C (i.e. C is a multiple of stripe_size):
//   alias k = C + k  (0 <= k < stripe_size)
//   buddy_ix(k) = (C + k) ^ (1 << bit_pos)
//              = C ^ (1 << bit_pos) + k    (because k < stripe_size = 1<<bit_pos, so k
//                                           does not interfere with bit bit_pos)
//
// buddy_ix(k) and buddy_ix(0) differ by k, which is still within the buddy stripe
// (a stripe of the same size starting at C ^ (1<<bit_pos)).  Therefore
// GetSegment(buddy_ix(k)) returns the same buddy instance for all k.
//
// In other words: FindBuddyId returns *different id values* for different alias ids,
// but all those ids are aliases of the *same buddy segment instance*.
TEST_F(DashTest, FindBuddyIdCanonicalForStripe) {
  // Fill enough to force global_depth >= 3, giving segments at local_depth 3.
  for (uint64_t i = 0; i < 8000; ++i) {
    dt_.Insert(i, i);
  }
  ASSERT_GE(dt_.depth(), 3u);

  // Erase most items so segments are sparse enough to merge.
  for (uint64_t i = 100; i < 8000; ++i) {
    dt_.Erase(i);
  }

  // To get a real buddy we must merge TWO adjacent pairs at the same depth.
  // After merging pair A (keep_a, buddy_a) the kept segment drops to depth d-1,
  // but its buddy stripe still has the old depth d, so FindBuddyId returns self.
  // Only after merging the adjacent pair B (keep_b, buddy_b) to d-1 as well do
  // the two resulting stripes become buddies of each other.
  //
  // We find four consecutive canonical segments at the same depth d > 2 and merge
  // pairs (0,1) and (2,3) within that group.
  unsigned keep_a = UINT_MAX, bud_a = UINT_MAX, keep_b = UINT_MAX, bud_b = UINT_MAX;
  for (size_t i = 0; i < dt_.GetSegmentCount();) {
    auto* s0 = dt_.GetSegment(i);
    uint8_t d = s0->local_depth();
    if (d <= 2) {
      i = dt_.NextSeg(i);
      continue;
    }
    size_t i1 = dt_.NextSeg(i);
    if (i1 >= dt_.GetSegmentCount())
      break;
    size_t i2 = dt_.NextSeg(i1);
    if (i2 >= dt_.GetSegmentCount())
      break;
    size_t i3 = dt_.NextSeg(i2);
    if (i3 >= dt_.GetSegmentCount())
      break;

    auto* s1 = dt_.GetSegment(i1);
    auto* s2 = dt_.GetSegment(i2);
    auto* s3 = dt_.GetSegment(i3);
    size_t cap = s0->capacity();
    if (s1->local_depth() == d && s2->local_depth() == d && s3->local_depth() == d &&
        s0->SlowSize() + s1->SlowSize() <= static_cast<size_t>(0.25 * cap) &&
        s2->SlowSize() + s3->SlowSize() <= static_cast<size_t>(0.25 * cap)) {
      keep_a = static_cast<unsigned>(i);
      bud_a = static_cast<unsigned>(i1);
      keep_b = static_cast<unsigned>(i2);
      bud_b = static_cast<unsigned>(i3);
      break;
    }
    i = dt_.NextSeg(i);
  }

  ASSERT_NE(keep_a, UINT_MAX);
  ASSERT_TRUE(dt_.Merge(keep_a, bud_a));
  ASSERT_TRUE(dt_.Merge(keep_b, bud_b));

  // After both merges:
  //   - segment at keep_a has local_depth = d-1, aliased by stripe {keep_a, keep_a+1}
  //   - segment at keep_b has local_depth = d-1, aliased by stripe {keep_b, keep_b+1}
  //   - The two stripes are buddies of each other (same depth, adjacent subtrees).
  auto* seg_a = dt_.GetSegment(keep_a);
  uint8_t new_depth = seg_a->local_depth();
  ASSERT_GE(new_depth, 2u);  // depth<=1 guard in FindBuddyId must not fire

  size_t stripe_size = 1u << (dt_.depth() - new_depth);
  size_t stripe_start = keep_a & ~(stripe_size - 1);

  // FindBuddyId from the canonical id of stripe A must resolve to seg_b.
  auto* seg_b = dt_.GetSegment(keep_b);
  unsigned canonical_bid = dt_.FindBuddyId(static_cast<unsigned>(stripe_start));
  ASSERT_EQ(dt_.GetSegment(canonical_bid), seg_b)
      << "FindBuddyId from canonical id must resolve to the buddy segment";

  EXPECT_EQ(stripe_size, 2);
  for (size_t k = 0; k < stripe_size; ++k) {
    size_t alias = stripe_start + k;
    EXPECT_EQ(dt_.GetSegment(alias), seg_a) << "Directory entry " << alias << " must alias seg_a";

    unsigned bid = dt_.FindBuddyId(static_cast<unsigned>(alias));
    // Different alias -> different buddy id value, but same buddy instance.
    EXPECT_EQ(bid, canonical_bid + k)
        << "FindBuddyId(" << alias << ") should equal canonical_bid + " << k;
    EXPECT_EQ(dt_.GetSegment(bid), seg_b)
        << "FindBuddyId(" << alias << ") must resolve to seg_b for all aliases";
    // Stripe B is at higher indices than stripe A (Merge requires keep_id < buddy_id).
    EXPECT_GT(bid, alias);
  }
}

// Test that NextSeg is correct when called with the canonical (first) id of a stripe,
// and documents the expected behavior for non-canonical (middle-of-stripe) ids.
//
// NextSeg(sid) computes:
//   delta = 1 << (global_depth - segment_[sid]->local_depth())
//   return sid + delta
//
// For the canonical (first) id of a stripe, sid is already aligned to a multiple of
// delta, so sid + delta is exactly the first id of the next stripe — correct.
//
// For a non-canonical id sid = canonical + k  (0 < k < delta), the result is
//   (canonical + k) + delta
// which lands k positions into the next stripe, not at its start.
TEST_F(DashTest, NextSegCanonicalBehavior) {
  // Build a table large enough for global_depth >= 2.
  for (uint64_t i = 0; i < 2000; ++i) {
    dt_.Insert(i, i);
  }
  ASSERT_GE(dt_.depth(), 2u);

  // NextSeg from id 0 always uses canonical ids (0 is always canonical).
  // Verify it visits every distinct segment exactly once by comparing against
  // unique_segments() which is maintained as a counter by Insert/Merge.
  size_t visited = 0;
  for (size_t i = 0; i < dt_.GetSegmentCount(); i = dt_.NextSeg(i)) {
    ++visited;
  }
  EXPECT_EQ(visited, dt_.unique_segments())
      << "NextSeg traversal from id 0 (canonical) must visit each unique segment once";

  // Erase most entries and merge to create a stripe (local_depth < global_depth).
  for (uint64_t i = 100; i < 2000; ++i) {
    dt_.Erase(i);
  }

  // Find and perform a merge to produce a stripe.
  for (size_t i = 0; i < dt_.GetSegmentCount(); i = dt_.NextSeg(i)) {
    auto* seg = dt_.GetSegment(i);
    if (seg->local_depth() <= 1)
      continue;
    size_t next = dt_.NextSeg(i);
    if (next >= dt_.GetSegmentCount())
      break;
    auto* buddy = dt_.GetSegment(next);
    if (buddy->local_depth() == seg->local_depth() &&
        seg->SlowSize() + buddy->SlowSize() <= static_cast<size_t>(0.25 * seg->capacity())) {
      bool ok = dt_.Merge(static_cast<unsigned>(i), static_cast<unsigned>(next));
      if (ok)
        break;
    }
  }

  // After a potential merge, re-verify that canonical traversal is consistent.
  size_t manual2 = 0;
  for (size_t i = 0; i < dt_.GetSegmentCount(); i = dt_.NextSeg(i)) {
    ++manual2;
  }
  EXPECT_EQ(manual2, dt_.unique_segments())
      << "After merge, canonical NextSeg traversal must still match unique_segments()";

  // Show the non-canonical case: for any stripe of size > 1, NextSeg from a non-first
  // alias does NOT land on the start of the next stripe.
  for (size_t i = 0; i < dt_.GetSegmentCount(); i = dt_.NextSeg(i)) {
    auto* seg = dt_.GetSegment(i);
    size_t delta = 1u << (dt_.depth() - seg->local_depth());
    if (delta <= 1)
      continue;  // no stripe aliases for this segment

    // i is canonical; i+1 is a non-canonical alias of the same segment.
    size_t non_canonical = i + 1;
    ASSERT_LT(non_canonical, i + delta) << "non_canonical must still be within the stripe";

    // NextSeg from the non-canonical id lands at (non_canonical + delta), which is
    // one position past the start of the next stripe — demonstrating the offset.
    size_t next_from_canonical = dt_.NextSeg(i);          // i + delta  (correct)
    size_t next_from_alias = dt_.NextSeg(non_canonical);  // i+1+delta  (offset by 1)
    EXPECT_EQ(next_from_alias, next_from_canonical + 1)
        << "NextSeg from a non-canonical alias is offset by the same amount as the alias "
           "itself; callers must always use canonical (stripe-start) ids";
    break;  // one example is sufficient to document the behavior
  }
}

TEST_F(DashTest, BumpUp) {
  set<Segment::Key_t> keys = FillSegment(0);
  constexpr unsigned kFirstStashId = Segment::kBucketNum;
  constexpr unsigned kSecondStashId = Segment::kBucketNum + 1;
  constexpr unsigned kSlotNum = Segment::kSlotNum;

  EXPECT_TRUE(segment_.GetBucket(0).IsFull());
  EXPECT_TRUE(segment_.GetBucket(1).IsFull());
  EXPECT_TRUE(segment_.GetBucket(kFirstStashId).IsFull());
  EXPECT_TRUE(segment_.GetBucket(kSecondStashId).IsFull());

  // Segment::Iterator it{kFirstStashId, 1};
  Segment::Key_t key = segment_.Key(1, 2);  // key at bucket 1, slot 2
  uint8_t touched_bid[3];

  uint64_t hash = dt_.DoHash(key);

  segment_.Delete(Segment::Iterator{1, 2}, hash);
  EXPECT_FALSE(segment_.GetBucket(1).IsFull());

  segment_.SetVersion(kFirstStashId, 1);
  key = segment_.Key(kFirstStashId, 5);
  hash = dt_.DoHash(key);

  EXPECT_EQ(2, segment_.CVCOnBump(1, kFirstStashId, 5, hash, touched_bid));
  EXPECT_EQ(touched_bid[0], 0);
  EXPECT_EQ(touched_bid[1], 1);

  // Bump up
  std::vector<std::pair<uint8_t, uint8_t>> moved_buckets;
  auto move_cb = [&moved_buckets](uint32_t /* segment_id */, uint8_t a, uint8_t b) {
    moved_buckets.emplace_back(a, b);
  };
  segment_.BumpUp(kFirstStashId, 5, hash, RelaxedBumpPolicy{}, move_cb);

  // expect the key to move
  EXPECT_TRUE(segment_.GetBucket(1).IsFull());
  EXPECT_FALSE(segment_.GetBucket(kFirstStashId).IsFull());
  EXPECT_EQ(segment_.Key(1, 2), key);
  EXPECT_EQ(moved_buckets.size(), 1);
  EXPECT_EQ(moved_buckets.at(0).first, kFirstStashId);
  EXPECT_EQ(moved_buckets.at(0).second, 1);
  moved_buckets.clear();

  EXPECT_TRUE(Contains(key));

  // 9 is just a random slot id.
  key = segment_.Key(kSecondStashId, 9);
  hash = dt_.DoHash(key);

  EXPECT_EQ(3, segment_.CVCOnBump(2, kSecondStashId, 9, hash, touched_bid));
  EXPECT_EQ(touched_bid[0], kSecondStashId);
  // Bumpup will move the key to either its original bucket or a probing bucket.
  // Since we can't determine the exact bucket before calling bumpup, CVCOnBump
  // returns both the original bucket and the probing bucket.
  EXPECT_EQ(touched_bid[1], 0);
  EXPECT_EQ(touched_bid[2], 1);

  auto it = segment_.BumpUp(kSecondStashId, 9, hash, RelaxedBumpPolicy{}, move_cb);
  ASSERT_TRUE(key == segment_.Key(0, kSlotNum - 1) || key == segment_.Key(1, kSlotNum - 1));
  EXPECT_TRUE(segment_.GetBucket(kSecondStashId).IsFull());
  EXPECT_TRUE(Contains(key));
  EXPECT_TRUE(segment_.Key(kSecondStashId, 9));
  EXPECT_EQ(moved_buckets.size(), 2);
  EXPECT_EQ(moved_buckets.at(0).first, kSecondStashId);
  EXPECT_EQ(moved_buckets.at(0).second, it.index);
  EXPECT_EQ(moved_buckets.at(1).first, it.index);
  EXPECT_EQ(moved_buckets.at(1).second, kSecondStashId);
}

TEST_F(DashTest, BumpPolicy) {
  struct RestrictedBumpPolicy {
    bool CanBump(uint64_t key) const {
      return false;
    }
    void OnMove(Dash64::Cursor source, Dash64::Cursor dest) {
    }
  };

  set<Segment::Key_t> keys = FillSegment(0);
  constexpr unsigned kFirstStashId = Segment::kBucketNum;

  EXPECT_TRUE(segment_.GetBucket(0).IsFull());
  EXPECT_TRUE(segment_.GetBucket(1).IsFull());
  EXPECT_TRUE(segment_.GetBucket(kFirstStashId).IsFull());

  // check items are immovable in bucket
  Segment::Key_t key = segment_.Key(1, 2);
  uint64_t hash = dt_.DoHash(key);
  segment_.BumpUp(1, 2, hash, RestrictedBumpPolicy{}, [](auto&&...) {});
  EXPECT_EQ(key, segment_.Key(1, 2));

  // check items don't swap from stash
  key = segment_.Key(kFirstStashId, 2);
  hash = dt_.DoHash(key);
  segment_.BumpUp(kFirstStashId, 2, hash, RestrictedBumpPolicy{}, [](auto&&...) {});
  EXPECT_EQ(key, segment_.Key(kFirstStashId, 2));
}

TEST_F(DashTest, Insert2) {
  uint64_t k = 1191;
  ASSERT_EQ(2019837007031366716, UInt64Policy::HashFn(k));

  Dash64 dt;
  for (unsigned i = 0; i < 2000; ++i) {
    dt.Insert(i, 0);
  }
}

TEST_F(DashTest, InsertOOM) {
  CappedResource resource(1 << 15);
  Dash64 dt{1, UInt64Policy{}, &resource};

  ASSERT_THROW(
      {
        for (size_t i = 0; i < (1 << 14); ++i) {
          dt.Insert(i, 0);
        }
      },
      bad_alloc);
}

struct Item {
  char buf[24];
};

constexpr size_t ItemAlign = alignof(Item);

struct MyBucket : public detail::BucketBase<16> {
  Item key[14];
};

constexpr size_t kMySz = sizeof(MyBucket);
constexpr size_t kBBSz = sizeof(detail::BucketBase<16>);

TEST_F(DashTest, Custom) {
  using ItemSegment = detail::Segment<Item, uint64_t>;
  constexpr double kTax = ItemSegment::kTaxSize;
  constexpr size_t kMaxSize = ItemSegment::kMaxSize;
  constexpr size_t kSegSize = sizeof(ItemSegment);
  constexpr size_t kBuckSz = ItemSegment::kBucketSz;
  (void)kTax;
  (void)kMaxSize;
  (void)kSegSize;
  (void)kBuckSz;

  ItemSegment seg{2, 0, PMR_NS::get_default_resource()};

  auto eq = [v = Item{1, 1}](auto u) { return v.buf[0] == u.buf[0] && v.buf[1] == u.buf[1]; };
  auto it = seg.FindIt(42, eq);
  ASSERT_FALSE(it.found());
}

TEST_F(DashTest, FindByValue) {
  using ItemSegment = detail::Segment<Item, uint64_t>;
  auto no_op_cb = [](auto&&...) {};

  // Insert three different values with the same hash
  ItemSegment segment{2, 0, PMR_NS::get_default_resource()};
  segment.Insert(
      Item{1}, 1, 42, [](const auto& pred) { return pred.buf[0] == 1; }, no_op_cb);
  segment.Insert(
      Item{2}, 2, 42, [](const auto& pred) { return pred.buf[0] == 2; }, no_op_cb);
  segment.Insert(
      Item{3}, 3, 42, [](const auto& pred) { return pred.buf[0] == 3; }, no_op_cb);

  // We should be able to find the middle one by value
  auto it = segment.FindIt(42, [](const auto& key, const auto& value) { return value == 2; });
  EXPECT_TRUE(it.found());
  EXPECT_EQ(segment.Value(it.index, it.slot), 2);
}

TEST_F(DashTest, Reserve) {
  unsigned bc = dt_.capacity();
  for (unsigned i = 0; i <= bc * 2; ++i) {
    dt_.Reserve(i);
    ASSERT_GE((1 << dt_.depth()) * Dash64::kSegCapacity, i);
  }
}

TEST_F(DashTest, Insert) {
  constexpr size_t kNumItems = 10000;
  double sum = 0;
  for (size_t i = 0; i < kNumItems; ++i) {
    dt_.Insert(i, i);
    double u = (dt_.size() * 100.0) / (dt_.unique_segments() * Segment::capacity());

    sum += u;
    VLOG(1) << "Num items " << dt_.size() << ", load factor " << u << ", size per entry "
            << double(dt_.mem_usage()) / dt_.size();
  }
  EXPECT_EQ(kNumItems, dt_.size());
  LOG(INFO) << "Average load factor is " << sum / kNumItems;

  for (size_t i = 0; i < kNumItems; ++i) {
    Dash64::const_iterator it = dt_.Find(i);
    ASSERT_TRUE(it != dt_.end());

    ASSERT_EQ(it->second, i);
    ASSERT_LE(dt_.load_factor(), 1) << i;
  }

  for (size_t i = kNumItems; i < kNumItems * 10; ++i) {
    Dash64::const_iterator it = dt_.Find(i);
    ASSERT_TRUE(it == dt_.end());
  }

  EXPECT_EQ(kNumItems, dt_.size());
  EXPECT_EQ(1, dt_.Erase(0));
  EXPECT_EQ(0, dt_.Erase(0));
  EXPECT_EQ(kNumItems - 1, dt_.size());

  auto it = dt_.begin();
  ASSERT_FALSE(it.is_done());
  auto some_val = it->second;
  dt_.Erase(it);
  ASSERT_TRUE(dt_.Find(some_val).is_done());
}

TEST_F(DashTest, Traverse) {
  constexpr auto kNumItems = 50;
  for (size_t i = 0; i < kNumItems; ++i) {
    dt_.Insert(i, i);
  }

  Dash64::Cursor cursor;
  vector<unsigned> nums;
  auto tr_cb = [&](Dash64::iterator it) {
    nums.push_back(it->first);
    VLOG(1) << it.bucket_id() << " " << it.slot_id() << " " << it->first;
  };

  do {
    cursor = dt_.Traverse(cursor, tr_cb);
  } while (cursor);
  sort(nums.begin(), nums.end());
  nums.resize(unique(nums.begin(), nums.end()) - nums.begin());
  ASSERT_EQ(kNumItems, nums.size());
  EXPECT_EQ(0, nums[0]);
  EXPECT_EQ(kNumItems - 1, nums.back());
}

TEST_F(DashTest, TraverseSegmentOrder) {
  constexpr auto kNumItems = 50;
  for (size_t i = 0; i < kNumItems; ++i) {
    dt_.Insert(i, i);
  }

  vector<unsigned> nums;
  auto tr_cb = [&](Dash64::iterator it) {
    nums.push_back(it->first);
    VLOG(1) << it.bucket_id() << " " << it.slot_id() << " " << it->first;
  };

  Dash64::Cursor cursor;
  do {
    cursor = dt_.TraverseBySegmentOrder(cursor, tr_cb);
  } while (cursor);

  sort(nums.begin(), nums.end());
  nums.resize(unique(nums.begin(), nums.end()) - nums.begin());
  ASSERT_EQ(kNumItems, nums.size());
  EXPECT_EQ(0, nums[0]);
  EXPECT_EQ(kNumItems - 1, nums.back());
}

TEST_F(DashTest, TraverseBucketOrder) {
  constexpr auto kNumItems = 18000;
  for (size_t i = 0; i < kNumItems; ++i) {
    dt_.Insert(i, i);
  }
  for (size_t i = 0; i < kNumItems; ++i) {
    dt_.Erase(i);
  }
  constexpr auto kSparseItems = kNumItems / 50;
  for (size_t i = 0; i < kSparseItems; ++i) {  // create sparse table
    dt_.Insert(i, i);
  }

  vector<unsigned> nums;
  auto tr_cb = [&](Dash64::bucket_iterator it) {
    VLOG(1) << "call cb";
    while (!it.is_done()) {
      nums.push_back(it->first);
      VLOG(1) << it.bucket_id() << " " << it.slot_id() << " " << it->first;
      ++it;
    }
  };

  Dash64::Cursor cursor;
  do {
    cursor = dt_.TraverseBuckets(cursor, tr_cb);
  } while (cursor);

  sort(nums.begin(), nums.end());
  nums.resize(unique(nums.begin(), nums.end()) - nums.begin());
  ASSERT_EQ(kSparseItems, nums.size());
  EXPECT_EQ(0, nums[0]);
  EXPECT_EQ(kSparseItems - 1, nums.back());
}

struct TestEvictionPolicy {
  static constexpr bool can_evict = true;
  static constexpr bool can_gc = false;

  explicit TestEvictionPolicy(unsigned max_cap) : max_capacity(max_cap) {
  }

  bool CanGrow(const Dash64& tbl) const {
    return tbl.capacity() < max_capacity;
  }
  void OnMove(Dash64::Cursor source, Dash64::Cursor dest) {
  }

  void RecordSplit(Dash64::Segment_t*) {
  }

  unsigned Evict(const Dash64::HotBuckets& hotb, Dash64* me) const {
    if (!evict_enabled)
      return 0;

    auto it = hotb.probes.by_type.regular_buckets[0];
    unsigned res = 0;
    for (; !it.is_done(); ++it) {
      LOG(INFO) << "Deleting " << it->first;
      me->Erase(it);
      ++res;
    }

    return res;
  }

  bool evict_enabled = false;
  unsigned max_capacity;
};

TEST_F(DashTest, Eviction) {
  TestEvictionPolicy ev(1540);

  size_t num = 0;
  auto loop = [&] {
    for (; num < 5000; ++num) {
      dt_.Insert(num, 0, ev);
    }
  };

  ASSERT_THROW(loop(), bad_alloc);
  ASSERT_LT(num, 5000);
  ASSERT_EQ(2, dt_.unique_segments());
  EXPECT_LT(dt_.size(), ev.max_capacity);
  LOG(INFO) << "size is " << dt_.size();

  set<uint64_t> keys;
  Dash64::bucket_iterator bit = dt_.begin();
  unsigned last_slot = 0;
  while (!bit.is_done()) {
    keys.insert(bit->first);
    last_slot = bit.slot_id();
    ++bit;
  }
  ASSERT_LT(last_slot, Dash64::kSlotNum);

  bit = dt_.begin();
  dt_.ShiftRight(bit);
  bit = dt_.begin();
  size_t sz = 0;
  while (!bit.is_done()) {
    EXPECT_EQ(1, keys.count(bit->first));
    ++sz;
    ++bit;
  }
  EXPECT_EQ(sz, keys.size());

  while (!dt_.GetSegment(0)->GetBucket(0).IsFull()) {
    try {
      dt_.Insert(num++, 0, ev);
    } catch (bad_alloc&) {
    }
  }

  // Now the bucket is full.
  keys.clear();
  uint64_t last_key = dt_.GetSegment(0)->Key(0, Dash64::kSlotNum - 1);
  for (Dash64::bucket_iterator bit = dt_.begin(); !bit.is_done(); ++bit) {
    keys.insert(bit->first);
  }

  bit = dt_.begin();
  dt_.ShiftRight(bit);
  bit = dt_.begin();
  sz = 0;

  while (!bit.is_done()) {
    EXPECT_NE(last_key, bit->first);
    EXPECT_EQ(1, keys.count(bit->first));
    ++sz;
    ++bit;
  }
  EXPECT_EQ(sz + 1, keys.size());

  ev.evict_enabled = true;
  unsigned bucket_cnt = dt_.bucket_count();
  auto [it, res] = dt_.Insert(num, 0, ev);
  EXPECT_TRUE(res);
  EXPECT_EQ(bucket_cnt, dt_.bucket_count());
}

struct VersionPolicy : public BasicDashPolicy {
  static constexpr bool kUseVersion = true;

  static uint64_t HashFn(int v) {
    return XXH3_64bits(&v, sizeof(v));
  }
};

using VersionDT = DashTable<int, int, VersionPolicy>;
TEST_F(DashTest, Version) {
  VersionDT dt;
  auto [it, inserted] = dt.Insert(1, 1);

  EXPECT_EQ(0, it.GetVersion());
  it.SetVersion(5);
  EXPECT_EQ(5, it.GetVersion());

  dt.Clear();
  ASSERT_EQ(0, dt.size());
  ASSERT_EQ(2, dt.unique_segments());
  ASSERT_EQ(136, dt.bucket_count());
  constexpr int kNum = 68000;
  for (int i = 0; i < kNum; ++i) {
    auto it = dt.Insert(i, 0).first;
    it.SetVersion(i + 65000);
    if (i) {
      auto p = dt.Find(i - 1);
      ASSERT_GE(p.GetVersion(), i - 1 + 65000) << i;
    }
  }

  unsigned items = 0;
  for (auto it = dt.begin(); it != dt.end(); ++it) {
    ASSERT_FALSE(it.is_done());
    ASSERT_GE(it.GetVersion(), it->first + 65000)
        << it.segment_id() << " " << it.bucket_id() << " " << it.slot_id();
    ++items;
  }
  ASSERT_EQ(kNum, items);
}

TEST_F(DashTest, CVCUponInsert) {
  VersionDT dt;
  auto [it, added] = dt.Insert(10, 20);  // added to slot 0
  ASSERT_TRUE(added);

  int i = 11;
  while (true) {
    auto [it2, added] = dt.Insert(i, 30);
    if (it2.bucket_id() == it.bucket_id() && it2.segment_id() == it.segment_id()) {
      ASSERT_EQ(1, it2.slot_id());

      break;
    }
    ++i;
  }

  // freed slot 0 but the bucket still has i at slot 1.
  dt.Erase(10);

  auto cb = [](VersionDT::bucket_iterator bit) {
    LOG(INFO) << "sid: " << bit.segment_id() << " " << bit.bucket_id();
    while (!bit.is_done()) {
      LOG(INFO) << "key: " << bit->first;
      ++bit;
    }
  };
  dt.CVCUponInsert(1, i, cb);
}

TEST_F(DashTest, CVCUponInsertStress) {
  VersionDT dt;
  for (int i = 0; i < 5000; ++i) {
    dt.CVCUponInsert(1, i, [](VersionDT::bucket_iterator) {
      // empty callback
    });
    dt.Insert(i, 0);
  }
}

struct A {
  int a = 0;
  unsigned moved = 0;

  A(int i = 0) : a(i) {
  }
  A(const A&) = delete;
  A(A&& o) : a(o.a), moved(o.moved + 1) {
    o.a = -1;
  }

  A& operator=(const A&) = delete;
  A& operator=(A&& o) noexcept {
    o.moved = o.moved + 1;
    a = o.a;
    o.a = -1;
    return *this;
  }

  bool operator==(const A& o) const {
    return o.a == a;
  }
};

struct ADashPolicy : public BasicDashPolicy {
  static uint64_t HashFn(const A& a) {
    auto val = XXH3_64bits(&a.a, sizeof(a.a));
    return val;
  }
};

TEST_F(DashTest, Moveable) {
  using DType = DashTable<A, A, ADashPolicy>;

  DType table{1};
  ASSERT_TRUE(table.Insert(A{1}, A{2}).second);
  ASSERT_FALSE(table.Insert(A{1}, A{3}).second);
  EXPECT_EQ(1, table.size());
  table.Clear();
  EXPECT_EQ(0, table.size());
}

struct SdsDashPolicy {
  enum { kSlotNum = 12, kBucketNum = 64, kStashBucketNum = 2 };
  static constexpr bool kUseVersion = false;

  static uint64_t HashFn(sds u) {
    return XXH3_64bits(reinterpret_cast<const uint8_t*>(u), sdslen(u));
  }

  static uint64_t HashFn(std::string_view u) {
    return XXH3_64bits(u.data(), u.size());
  }

  static void DestroyValue(uint64_t) {
  }
  static void DestroyKey(sds s) {
    sdsfree(s);
  }

  static bool Equal(sds u1, sds u2) {
    return dictSdsKeyCompare(nullptr, u1, u2) == 0;
  }

  static bool Equal(sds u1, std::string_view u2) {
    return u2 == std::string_view{u1, sdslen(u1)};
  }
};

TEST_F(DashTest, Sds) {
  DashTable<sds, uint64_t, SdsDashPolicy> dt;

  sds foo = sdscatlen(sdsempty(), "foo", 3);
  dt.Insert(foo, 0);
  // dt.Insert(std::string_view{"bar"}, 1);
}

struct BlankPolicy : public BasicDashPolicy {
  static uint64_t HashFn(uint64_t v) {
    return v;
  }
};

// The bug was that for very rare cases when during segment splitting we move all the items
// into a new segment, not every item finds a place.
TEST_F(DashTest, SplitBug) {
  DashTable<uint64_t, uint64_t, BlankPolicy> table;
  string path = base::ProgramRunfile("testdata/ids.txt.zst");
  io::Result<io::Source*> src = io::OpenUncompressed(path);
  ASSERT_TRUE(src) << src.error();

  io::LineReader lr(*src, TAKE_OWNERSHIP);
  string_view line;
  uint64_t val;
  while (lr.Next(&line)) {
    CHECK(absl::SimpleHexAtoi(line, &val)) << line;
    table.Insert(val, 0);
  }
  EXPECT_EQ(746, table.size());
}

/**
 ______     _      _   _               _______        _
|  ____|   (_)    | | (_)             |__   __|      | |
| |____   ___  ___| |_ _  ___  _ __      | | ___  ___| |_ ___
|  __\ \ / / |/ __| __| |/ _ \| '_ \     | |/ _ \/ __| __/ __|
| |___\ V /| | (__| |_| | (_) | | | |    | |  __/\__ \ |_\__ \
|______\_/ |_|\___|\__|_|\___/|_| |_|    |_|\___||___/\__|___/
 *
 */
struct EvictParams {
  bool use_bumpups;
  double zipf_param;

  string PrintTo() const {
    string name = absl::StrCat(use_bumpups ? "" : "no", "bumps");
    absl::StrAppend(&name, unsigned(zipf_param * 1000));

    return name;
  }
};

string PrintParams(const testing::TestParamInfo<EvictParams>& info) {
  return info.param.PrintTo();
}

struct U64DashPolicy {
  enum { kSlotNum = 14, kBucketNum = 64, kStashBucketNum = 4 };
  static constexpr bool kUseVersion = false;

  static void DestroyValue(uint64_t) {
  }
  static void DestroyKey(uint64_t) {
  }

  static bool Equal(uint64_t u, uint64_t v) {
    return u == v;
  }

  static uint64_t HashFn(uint64_t v) {
    return XXH3_64bits(&v, sizeof(v));
  }
};

using U64Dash = DashTable<uint64_t, unsigned, U64DashPolicy>;

struct SimpleEvictPolicy {
  static constexpr bool can_gc = false;
  static constexpr bool can_evict = true;

  bool CanGrow(const U64Dash& tbl) {
    return tbl.capacity() + U64Dash::kSegCapacity < max_capacity;
  }

  void OnMove(U64Dash::Cursor source, U64Dash::Cursor dest) {
  }

  void RecordSplit(U64Dash::Segment_t* segment) {
  }

  // Required interface in case can_gc is true
  // returns number of items evicted from the table.
  // 0 means - nothing has been evicted.
  unsigned Evict(const U64Dash::HotBuckets& hotb, U64Dash* me) {
    constexpr unsigned kBucketNum = U64Dash::HotBuckets::kNumBuckets;

    uint32_t bid = hotb.key_hash % kBucketNum;

    unsigned slot_index = (hotb.key_hash >> 32) % U64Dash::kSlotNum;

    for (unsigned i = 0; i < kBucketNum; ++i) {
      auto it = hotb.at((bid + i) % kBucketNum);
      it += slot_index;

      if (it.is_done())
        continue;

      me->Erase(it);
      ++evicted;

      return 1;
    }
    return 0;
  }

  size_t max_capacity = SIZE_MAX;
  unsigned evicted = 0;
  // default_random_engine rand_eng_{42};
};

struct ShiftRightPolicy {
  absl::flat_hash_map<uint64_t, unsigned> evicted;
  size_t max_capacity = SIZE_MAX;
  unsigned evicted_sum = 0;

  static constexpr bool can_gc = false;
  static constexpr bool can_evict = true;

  bool CanGrow(const U64Dash& tbl) {
    return tbl.capacity() + U64Dash::kSegCapacity < max_capacity;
  }

  void RecordSplit(U64Dash::Segment_t* segment) {
  }

  void OnMove(U64Dash::Cursor source, U64Dash::Cursor dest) {
  }

  unsigned Evict(const U64Dash::HotBuckets& hotb, U64Dash* me) {
    constexpr unsigned kNumStashBuckets = ABSL_ARRAYSIZE(hotb.probes.by_type.stash_buckets);

    unsigned stash_pos = hotb.key_hash % kNumStashBuckets;
    auto stash_it = hotb.probes.by_type.stash_buckets[stash_pos];
    stash_it += (U64Dash::kSlotNum - 1);  // go to the last slot.

    uint64_t k = stash_it->first;
    DVLOG(1) << "Deleting key " << k << " from " << unsigned(stash_it.bucket_id()) << "/"
             << stash_it.slot_id();
    evicted[k]++;

    CHECK(me->ShiftRight(stash_it));
    ++evicted_sum;

    return 1;
  };
};

class EvictionPolicyTest : public testing::TestWithParam<EvictParams> {
 protected:
  template <typename Policy> void FillUniform(unsigned max_range, Policy& policy);

  uint64_t Rand() {
    return zipf_ ? zipf_->Next(rand_eng_) : udist_(rand_eng_);
  }

  void SetUp() final {
    if (GetParam().zipf_param > 0)
      zipf_.emplace(0, 15000, GetParam().zipf_param);
    else {
      uniform_int_distribution<uint64_t>::param_type p{0, 15000};
      udist_.param(p);
    }
  }

  default_random_engine rand_eng_{42};
  U64Dash dt_;
  std::optional<base::ZipfianGenerator> zipf_;
  uniform_int_distribution<uint64_t> udist_;
};

template <typename Policy>
void EvictionPolicyTest::FillUniform(unsigned max_range, Policy& policy) {
  std::uniform_int_distribution<uint64_t> dist(0, max_range - 1);
  for (unsigned i = 0; i < 100000; ++i) {
    auto [it, res] = dt_.Insert(dist(rand_eng_), 0, policy);
    if (!res && it.is_done())  // filled up till the capacity limit
      break;
  }
  LOG(INFO) << dt_.size();
}

TEST_P(EvictionPolicyTest, HitRate) {
  CHECK_LT(GetParam().zipf_param, 1);
  SimpleEvictPolicy ev_policy;
  ev_policy.max_capacity = 3000;
  FillUniform(15000, ev_policy);

  unsigned hits = 0;
  for (unsigned i = 0; i < 150000; ++i) {
    auto [it, res] = dt_.Insert(Rand(), 0, ev_policy);
    CHECK(!it.is_done());
    if (!res) {
      ++hits;
    }
  }
  LOG(INFO) << "Zipf: " << GetParam().zipf_param << ", hits " << hits << " evictions "
            << ev_policy.evicted;
}

TEST_P(EvictionPolicyTest, HitRateZipf) {
  base::ZipfianGenerator gen(1, 15000, 0.9);
  SimpleEvictPolicy ev_policy;
  ev_policy.max_capacity = 3000;

  FillUniform(15000, ev_policy);

  bool use_bumps = GetParam().use_bumpups;

  unsigned hits = 0;
  for (unsigned i = 0; i < 150000; ++i) {
    uint64_t key = Rand();
    auto [it, res] = dt_.Insert(key, 0, ev_policy);
    CHECK(!it.is_done());
    if (res) {
      DVLOG(1) << "Inserted new key " << key << " to bucket " << it.bucket_id() << " slot "
               << it.slot_id();
    } else {
      if (use_bumps) {
        RelaxedBumpPolicy policy;
        dt_.BumpUp(it, policy);
      }

      ++hits;
    }
  }
  LOG(INFO) << "Zipf: " << GetParam().PrintTo() << " hits " << hits << " evictions "
            << ev_policy.evicted;
}

TEST_P(EvictionPolicyTest, HitRateZipfShr) {
  ShiftRightPolicy ev_policy;
  ev_policy.max_capacity = 3000;

  FillUniform(15000, ev_policy);

  unsigned hits = 0;
  unsigned inserted_evicted = 0;
  bool use_bumps = GetParam().use_bumpups;
  for (unsigned i = 0; i < 150000; ++i) {
    unsigned key = Rand();

    auto [it, res] = dt_.Insert(key, 0, ev_policy);
    if (!it.is_done()) {
      if (res) {
        DVLOG(1) << "Inserted new key " << key << " to bucket " << it.bucket_id() << " slot "
                 << it.slot_id();
        if (ev_policy.evicted.contains(key)) {
          ++inserted_evicted;
        }
      } else {
        if (use_bumps) {
          RelaxedBumpPolicy policy;
          dt_.BumpUp(it, policy);
          DVLOG(1) << "Bump up key " << key << " " << it.bucket_id() << " slot " << it.slot_id();
        } else {
          DVLOG(1) << "Hit on key " << key;
        }
        ++hits;
      }
    }
  }

  vector<pair<unsigned, uint64_t>> freq_evicted;
  for (const auto& k_v : ev_policy.evicted) {
    freq_evicted.emplace_back(k_v.second, k_v.first);
  }
  sort(freq_evicted.rbegin(), freq_evicted.rend());

  LOG(INFO) << "Params " << GetParam().PrintTo() << " hits " << hits << " evictions "
            << ev_policy.evicted_sum << " "
            << "reinserted " << inserted_evicted;
  unsigned num_outs = 0;
  for (const auto& k_v : freq_evicted) {
    LOG(INFO) << "Evicted " << k_v.first << " : " << k_v.second;
    if (++num_outs > 100 || k_v.first < 5)
      break;
  }
}

INSTANTIATE_TEST_SUITE_P(Eviction, EvictionPolicyTest,
                         testing::Values(EvictParams{false, 0}, EvictParams{false, 0.9},
                                         EvictParams{true, 0.9}),
                         PrintParams);

// Benchmarks
static void BM_Insert(benchmark::State& state) {
  unsigned count = state.range(0);

  size_t next = 0;
  while (state.KeepRunning()) {
    Dash64 dt;

    for (unsigned i = 0; i < count; ++i) {
      dt.Insert(next++, 0);
    }
  }
}
BENCHMARK(BM_Insert)->Arg(10000)->Arg(100000)->Arg(1000000);

struct NoDestroySdsPolicy : public SdsDashPolicy {
  static void DestroyKey(sds s) {
  }
};

static void BM_StringInsert(benchmark::State& state) {
  unsigned count = state.range(0);

  std::vector<sds> strs(count);
  for (unsigned i = 0; i < count; ++i) {
    strs[i] = sdscatprintf(sdsempty(), "key__%x", 100 + i);
  }

  while (state.KeepRunning()) {
    DashTable<sds, uint64_t, NoDestroySdsPolicy> dt;

    for (unsigned i = 0; i < count; ++i) {
      dt.Insert(strs[i], 0);
    }
  }

  for (sds s : strs) {
    sdsfree(s);
  }
}
BENCHMARK(BM_StringInsert)->Arg(1000)->Arg(10000)->Arg(100000);

static void BM_FindExisting(benchmark::State& state) {
  unsigned count = state.range(0);

  Dash64 dt;
  for (unsigned i = 0; i < count; ++i) {
    dt.Insert(i, 0);
  }

  size_t next = 0;
  while (state.KeepRunning()) {
    for (unsigned i = 0; i < 100; ++i) {
      dt.Find(next++);
    }
  }
}
BENCHMARK(BM_FindExisting)->Arg(1000000)->Arg(2000000);

// dict memory usage is in [32*n + 8*n, 32*n + 16*n], or
// per entry usage is [40, 48].
static void BM_RedisDictFind(benchmark::State& state) {
  unsigned count = state.range(0);
  dict* d = dictCreate(&IntDict);

  for (unsigned i = 0; i < count; ++i) {
    size_t key = i;
    dictAdd(d, (void*)key, nullptr);
  }

  size_t next = 0;
  while (state.KeepRunning()) {
    for (size_t i = 0; i < 100; ++i) {
      size_t k = next++;
      dictFind(d, (void*)k);
    }
  }
  dictRelease(d);
}
BENCHMARK(BM_RedisDictFind)->Arg(1000000)->Arg(2000000);

// dict memory usage is in [32*n + 8*n, 32*n + 16*n], or
// per entry usage is [40, 48].
static void BM_RedisDictInsert(benchmark::State& state) {
  unsigned count = state.range(0);
  size_t next = 0;
  while (state.KeepRunning()) {
    dict* d = dictCreate(&IntDict);
    for (unsigned i = 0; i < count; ++i) {
      dictAdd(d, (void*)next, nullptr);
      ++next;
    }
    dictRelease(d);
  }
}
BENCHMARK(BM_RedisDictInsert)->Arg(10000)->Arg(100000)->Arg(1000000);

static void BM_RedisStringInsert(benchmark::State& state) {
  unsigned count = state.range(0);
  std::vector<sds> strs(count);
  for (unsigned i = 0; i < count; ++i) {
    strs[i] = sdscatprintf(sdsempty(), "key__%x", 100 + i);
  }

  while (state.KeepRunning()) {
    dict* d = dictCreate(&SdsDict);
    for (unsigned i = 0; i < count; ++i) {
      dictAdd(d, strs[i], nullptr);
    }
    dictRelease(d);
  }

  for (sds s : strs) {
    sdsfree(s);
  }
}
BENCHMARK(BM_RedisStringInsert)->Arg(1000)->Arg(10000)->Arg(100000);

}  // namespace dfly


================================================
FILE: src/core/dense_set.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/dense_set.h"

#include <absl/numeric/bits.h>

#include <cstddef>
#include <cstdint>
#include <stack>
#include <type_traits>
#include <vector>

#include "absl/random/distributions.h"
#include "absl/random/random.h"
#include "base/logging.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {
using namespace std;

constexpr size_t kMinSizeShift = 2;
constexpr size_t kMinSize = 1 << kMinSizeShift;
constexpr bool kAllowDisplacements = true;

thread_local absl::InsecureBitGen tl_bit_gen;

#define PREFETCH_READ(x) __builtin_prefetch(x, 0, 1)

DenseSet::IteratorBase::IteratorBase(const DenseSet* owner, bool is_end)
    : owner_(const_cast<DenseSet*>(owner)), curr_entry_(nullptr) {
  curr_list_ = is_end ? owner_->entries_.end() : owner_->entries_.begin();

  // Even if `is_end` is `false`, the list can be empty.
  if (curr_list_ == owner->entries_.end()) {
    curr_entry_ = nullptr;
    owner_ = nullptr;
  } else {
    curr_entry_ = &(*curr_list_);
    owner->ExpireIfNeeded(nullptr, curr_entry_);

    // find the first non null entry
    if (curr_entry_->IsEmpty()) {
      Advance();
    }
  }
}

void DenseSet::IteratorBase::SetExpiryTime(uint32_t ttl_sec) {
  DensePtr* ptr = curr_entry_->IsLink() ? curr_entry_->AsLink() : curr_entry_;
  void* src = ptr->GetObject();
  if (!HasExpiry()) {
    const size_t old_size = owner_->ObjectAllocSize(ptr->Raw());
    void* new_obj = owner_->ObjectClone(src, false, true);
    ptr->SetObject(new_obj);

    const size_t new_size = owner_->ObjectAllocSize(ptr->Raw());

    // Important: we set the ttl bit on the wrapping pointer.
    curr_entry_->SetTtl(true);
    owner_->ObjDelete(src, false);
    src = new_obj;

    // Because setting TTL requires an extra 4 bytes for the key, the allocated size may push the
    // object into a different mi-malloc page category (e.g. 16 byte page -> 32 byte page). This
    // results in increased reporting in ObjAllocSize.
    //
    // If this size increase is not accounted for, it will cause an overflow in
    // DenseSet::AddOrReplaceObj due to subtracting larger size from smaller and the type of
    // obj_malloc_used_ being size_t.
    if (old_size != new_size) {
      owner_->DecreaseMallocUsed(old_size);
      owner_->IncreaseMallocUsed(new_size);
    }
  }
  owner_->ObjUpdateExpireTime(src, ttl_sec);
}

void DenseSet::IteratorBase::Advance() {
  bool step_link = false;
  DCHECK(curr_entry_);

  if (curr_entry_->IsLink()) {
    DenseLinkKey* plink = curr_entry_->AsLink();
    if (!owner_->ExpireIfNeeded(curr_entry_, &plink->next) || curr_entry_->IsLink()) {
      curr_entry_ = &plink->next;
      step_link = true;
    }
  }

  if (!step_link) {
    DCHECK(curr_list_ != owner_->entries_.end());
    do {
      ++curr_list_;
      if (curr_list_ == owner_->entries_.end()) {
        curr_entry_ = nullptr;
        owner_ = nullptr;
        return;
      }
      owner_->ExpireIfNeeded(nullptr, &(*curr_list_));
    } while (curr_list_->IsEmpty());
    DCHECK(curr_list_ != owner_->entries_.end());
    curr_entry_ = &(*curr_list_);
  }
  DCHECK(!curr_entry_->IsEmpty());
}

DenseSet::DenseSet() {
  static_assert(sizeof(entries_) == 24);
}

DenseSet::~DenseSet() {
  // We can not call Clear from the base class because it internally calls ObjDelete which is
  // a virtual function. Therefore, destructor of the derived classes must clean up the table.
  CHECK(entries_.empty());
}

size_t DenseSet::PushFront(DenseSet::ChainVectorIterator it, void* data, bool has_ttl) {
  // if this is an empty list assign the value to the empty placeholder pointer
  DCHECK(!it->IsDisplaced());
  if (it->IsEmpty()) {
    it->SetObject(data);
  } else {
    // otherwise make a new link and connect it to the front of the list
    it->SetLink(NewLink(data, *it));
  }

  if (has_ttl) {
    it->SetTtl(true);
    expiration_used_ = true;
  }
  return ObjectAllocSize(data);
}

void DenseSet::PushFront(DenseSet::ChainVectorIterator it, DenseSet::DensePtr ptr) {
  DVLOG(2) << "PushFront to " << distance(entries_.begin(), it) << ", "
           << ObjectAllocSize(ptr.GetObject());
  DCHECK(!it->IsDisplaced());

  if (it->IsEmpty()) {
    it->SetObject(ptr.GetObject());
    if (ptr.HasTtl()) {
      it->SetTtl(true);
      expiration_used_ = true;
    }
    if (ptr.IsLink()) {
      FreeLink(ptr.AsLink());
    }
  } else if (ptr.IsLink()) {
    // if the pointer is already a link then no allocation needed.
    *ptr.Next() = *it;
    *it = ptr;
    DCHECK(!it->AsLink()->next.IsEmpty());
  } else {
    DCHECK(ptr.IsObject());

    // allocate a new link if needed and copy the pointer to the new link
    it->SetLink(NewLink(ptr.Raw(), *it));
    if (ptr.HasTtl()) {
      it->SetTtl(true);
      expiration_used_ = true;
    }
    DCHECK(!it->AsLink()->next.IsEmpty());
  }
}

auto DenseSet::PopPtrFront(DenseSet::ChainVectorIterator it) -> DensePtr {
  if (it->IsEmpty()) {
    return DensePtr{};
  }

  DensePtr front = *it;

  // if this is an object, then it's also the only record in this chain.
  // therefore, we should just reset DensePtr.
  if (it->IsObject()) {
    it->Reset();
  } else {
    DCHECK(it->IsLink());
    DenseLinkKey* link = it->AsLink();
    *it = link->next;
  }

  return front;
}

uint32_t DenseSet::ClearStep(uint32_t start, uint32_t count) {
  constexpr unsigned kArrLen = 32;
  ClearItem arr[kArrLen];
  unsigned len = 0;

  size_t end = min<size_t>(entries_.size(), start + count);
  for (size_t i = start; i < end; ++i) {
    DensePtr& ptr = entries_[i];
    if (ptr.IsEmpty())
      continue;

    auto& dest = arr[len++];
    dest.has_ttl = ptr.HasTtl();

    PREFETCH_READ(ptr.Raw());
    if (ptr.IsObject()) {
      dest.obj = ptr.Raw();
      dest.ptr.Reset();
    } else {
      dest.ptr = ptr;
      dest.obj = nullptr;
    }
    ptr.Reset();
    if (len == kArrLen) {
      ClearBatch(kArrLen, arr);
      len = 0;
    }
  }

  ClearBatch(len, arr);

  if (size_ == 0) {
    entries_.clear();
    num_links_ = 0;
    obj_malloc_used_ = 0;
    expiration_used_ = false;
  }
  return end;
}

bool DenseSet::Equal(DensePtr dptr, const void* ptr, uint32_t cookie) const {
  if (dptr.IsEmpty()) {
    return false;
  }

  return ObjEqual(dptr.GetObject(), ptr, cookie);
}

void DenseSet::CloneBatch(unsigned len, CloneItem* items, DenseSet* other) const {
  // We handle a batch of items to minimize data dependencies when accessing memory for a single
  // item. We prefetch the memory for entire batch before actually reading data from any of the
  // elements.

  auto clone = [this](void* obj, bool has_ttl, DenseSet* other) {
    // The majority of the CPU is spent in this block.
    void* new_obj = other->ObjectClone(obj, has_ttl, false);
    uint64_t hash = this->Hash(obj, 0);
    other->AddUnique(new_obj, has_ttl, hash);
  };

  while (len) {
    unsigned dest_id = 0;
    // we walk "len" linked lists in parallel, and prefetch their next, obj pointers
    // before actually processing them.
    for (unsigned i = 0; i < len; ++i) {
      auto& src = items[i];
      if (src.obj) {
        clone(src.obj, src.has_ttl, other);
        src.obj = nullptr;
      }

      if (src.ptr.IsEmpty()) {
        continue;
      }

      if (src.ptr.IsObject()) {
        clone(src.ptr.Raw(), src.has_ttl, other);
      } else {
        auto& dest = items[dest_id++];
        DenseLinkKey* link = src.ptr.AsLink();
        dest.obj = link->Raw();
        DCHECK(!link->HasTtl());

        // ttl is attached to the wrapping pointer.
        dest.has_ttl = src.ptr.HasTtl();
        dest.ptr = link->next;
        PREFETCH_READ(dest.ptr.Raw());
        PREFETCH_READ(dest.obj);
      }
    }

    // update the length of the batch for the next iteration.
    len = dest_id;
  }
}

void DenseSet::ClearBatch(unsigned len, ClearItem* items) {
  while (len) {
    unsigned dest_id = 0;
    // we walk "len" linked lists in parallel, and prefetch their next, obj pointers
    // before actually processing them.
    for (unsigned i = 0; i < len; ++i) {
      auto& src = items[i];
      if (src.obj) {
        ObjDelete(src.obj, src.has_ttl);
        --size_;
        src.obj = nullptr;
      }

      if (src.ptr.IsEmpty())
        continue;

      if (src.ptr.IsObject()) {
        ObjDelete(src.ptr.Raw(), src.has_ttl);
        --size_;
      } else {
        auto& dest = items[dest_id++];
        DenseLinkKey* link = src.ptr.AsLink();
        DCHECK(!link->HasTtl());
        dest.obj = link->Raw();
        dest.has_ttl = src.ptr.HasTtl();
        dest.ptr = link->next;
        PREFETCH_READ(dest.ptr.Raw());
        PREFETCH_READ(dest.obj);
        FreeLink(link);
      }
    }

    // update the length of the batch for the next iteration.
    len = dest_id;
  }
}
bool DenseSet::NoItemBelongsBucket(uint32_t bid) const {
  auto& entries = const_cast<DenseSet*>(this)->entries_;
  DensePtr* curr = &entries[bid];
  ExpireIfNeeded(nullptr, curr);
  if (!curr->IsEmpty() && !curr->IsDisplaced()) {
    return false;
  }

  if (bid + 1 < entries_.size()) {
    DensePtr* right_bucket = &entries[bid + 1];
    ExpireIfNeeded(nullptr, right_bucket);
    if (!right_bucket->IsEmpty() && right_bucket->IsDisplaced() &&
        right_bucket->GetDisplacedDirection() == 1)
      return false;
  }

  if (bid > 0) {
    DensePtr* left_bucket = &entries[bid - 1];
    ExpireIfNeeded(nullptr, left_bucket);
    if (!left_bucket->IsEmpty() && left_bucket->IsDisplaced() &&
        left_bucket->GetDisplacedDirection() == -1)
      return false;
  }
  return true;
}

auto DenseSet::FindEmptyAround(uint32_t bid) -> ChainVectorIterator {
  ExpireIfNeeded(nullptr, &entries_[bid]);

  if (entries_[bid].IsEmpty()) {
    return entries_.begin() + bid;
  }

  if (!kAllowDisplacements) {
    return entries_.end();
  }

  if (bid + 1 < entries_.size()) {
    auto it = next(entries_.begin(), bid + 1);
    ExpireIfNeeded(nullptr, &(*it));
    if (it->IsEmpty())
      return it;
  }

  if (bid) {
    auto it = next(entries_.begin(), bid - 1);
    ExpireIfNeeded(nullptr, &(*it));
    if (it->IsEmpty())
      return it;
  }

  return entries_.end();
}

void DenseSet::Reserve(size_t sz) {
  sz = std::max<size_t>(sz, kMinSize);

  sz = absl::bit_ceil(sz);
  if (sz > entries_.size()) {
    size_t prev_size = entries_.size();
    entries_.resize(sz);
    capacity_log_ = absl::bit_width(sz) - 1;
    Grow(prev_size);
  }
}

void DenseSet::ShrinkBucket(size_t bucket_idx) {
  // Take the entire bucket to avoid infinite loop when new_bid == bucket_idx
  DensePtr bucket = entries_[bucket_idx];
  entries_[bucket_idx].Reset();

  // Process the taken bucket chain
  while (!bucket.IsEmpty()) {
    // Pop front from local chain
    DensePtr dptr = bucket;
    bucket = bucket.IsObject() ? DensePtr{} : bucket.AsLink()->next;

    void* obj = dptr.GetObject();
    bool has_ttl = dptr.HasTtl();

    // Free link unconditionally - PushFront will create new one if needed
    if (dptr.IsLink()) {
      FreeLink(dptr.AsLink());
    }

    if (has_ttl && ObjExpireTime(obj) <= time_now_) {
      ObjDelete(obj, true);
      --size_;
      continue;
    }

    uint32_t new_bid = BucketId(obj, 0);
    DVLOG(2) << " Shrink: Moving from " << bucket_idx << " to " << new_bid;
    PushFront(entries_.begin() + new_bid, obj, has_ttl);
  }
}

void DenseSet::Shrink(size_t new_size) {
  DCHECK(absl::has_single_bit(new_size));
  DCHECK_GE(new_size, kMinSize);
  DCHECK_LT(new_size, entries_.size());

  size_t prev_size = entries_.size();
  capacity_log_ = absl::bit_width(new_size) - 1;

  // Process from low to high (opposite of Grow).
  // This prevents double-processing: when moving elements from bucket i to bucket j < i,
  // bucket j has already been processed, so the element won't be processed again.
  for (size_t i = 0; i < prev_size; ++i) {
    ShrinkBucket(i);
  }

  entries_.resize(new_size);
}

void DenseSet::Fill(DenseSet* other) const {
  DCHECK(other->entries_.empty());

  other->Reserve(UpperBoundSize());

  constexpr unsigned kArrLen = 32;
  CloneItem arr[kArrLen];
  unsigned len = 0;

  for (auto it = entries_.begin(); it != entries_.end(); ++it) {
    DensePtr ptr = *it;

    if (ptr.IsEmpty())
      continue;

    auto& item = arr[len++];
    item.has_ttl = ptr.HasTtl();

    if (ptr.IsObject()) {
      item.ptr.Reset();
      item.obj = ptr.Raw();
      PREFETCH_READ(item.obj);
    } else {
      item.ptr = ptr;
      item.obj = nullptr;
      PREFETCH_READ(item.ptr.Raw());
    }

    if (len == kArrLen) {
      CloneBatch(kArrLen, arr, other);
      len = 0;
    }
  }
  CloneBatch(len, arr, other);
}

void DenseSet::Grow(size_t prev_size) {
  DensePtr first;

  // Corner case. Usually elements are moved to higher buckets during rehashing.
  // By moving upper elements first we make sure that there are no displaced elements
  // when we move the lower elements.
  // However the (displaced) elements at bucket_id=1 can move to bucket 0, and
  // bucket 0 can host displaced elements from bucket 1. To avoid this situation, we
  // stash the displaced element from bucket 0 and move it to the correct bucket at the end.
  if (entries_.front().IsDisplaced()) {
    first = PopPtrFront(entries_.begin());
  }

  // perform rehashing of items in the array, chain by chain.
  for (long i = prev_size - 1; i >= 0; --i) {
    DensePtr* curr = &entries_[i];
    DensePtr* prev = nullptr;

    do {
      if (ExpireIfNeeded(prev, curr)) {
        // if curr has disappeared due to expiry and prev was converted from Link to a
        // regular DensePtr
        if (prev && !prev->IsLink())
          break;
      }

      if (curr->IsEmpty())
        break;
      void* ptr = curr->GetObject();

      DCHECK(ptr != nullptr && ObjectAllocSize(ptr));

      uint32_t bid = BucketId(ptr, 0);

      // if the item does not move from the current chain, ensure
      // it is not marked as displaced and move to the next item in the chain
      if (bid == i) {
        curr->ClearDisplaced();
        prev = curr;
        curr = curr->Next();
        if (curr == nullptr)
          break;
      } else {
        // if the entry is in the wrong chain remove it and
        // add it to the correct chain. This will also correct
        // displaced entries
        auto dest = entries_.begin() + bid;
        DensePtr dptr = *curr;

        if (curr->IsObject()) {
          if (prev) {
            DCHECK(prev->IsLink());

            DenseLinkKey* plink = prev->AsLink();
            DCHECK(&plink->next == curr);

            // we want to make *prev a DensePtr instead of DenseLink and we
            // want to deallocate the link.
            DensePtr tmp = DensePtr::From(plink);

            // Important to transfer the ttl flag.
            tmp.SetTtl(prev->HasTtl());
            DCHECK(ObjectAllocSize(tmp.GetObject()));

            FreeLink(plink);
            // we deallocated the link, curr is invalid now.
            curr = nullptr;
            *prev = tmp;
          } else {
            // prev == nullptr
            curr->Reset();  // reset the root placeholder.
          }
        } else {
          // !curr.IsObject
          *curr = *dptr.Next();
          DCHECK(!curr->IsEmpty());
        }

        DVLOG(2) << " Pushing to " << bid << " " << dptr.GetObject();
        DCHECK_EQ(BucketId(dptr.GetObject(), 0), bid);
        PushFront(dest, dptr);
      }
    } while (curr);
  }
  if (!first.IsEmpty()) {
    uint32_t bid = BucketId(first.GetObject(), 0);
    PushFront(entries_.begin() + bid, first);
  }
}

// Assumes that the object does not exist in the set.
void DenseSet::AddUnique(void* obj, bool has_ttl, uint64_t hashcode) {
  if (entries_.empty()) {
    capacity_log_ = kMinSizeShift;
    entries_.resize(kMinSize);
  }

  uint32_t bucket_id = BucketId(hashcode);

  DCHECK_LT(bucket_id, entries_.size());

  // Try insert into flat surface first. Also handle the grow case
  // if utilization is too high.
  for (unsigned j = 0; j < 2; ++j) {
    ChainVectorIterator list = FindEmptyAround(bucket_id);
    if (list != entries_.end()) {
      obj_malloc_used_ += PushFront(list, obj, has_ttl);
      if (std::distance(entries_.begin(), list) != bucket_id) {
        list->SetDisplaced(std::distance(entries_.begin() + bucket_id, list));
      }
      ++size_;
      return;
    }

    if (size_ < entries_.size()) {
      break;
    }

    size_t prev_size = entries_.size();
    entries_.resize(prev_size * 2);
    ++capacity_log_;

    Grow(prev_size);
    bucket_id = BucketId(hashcode);
  }

  DCHECK(!entries_[bucket_id].IsEmpty());

  /**
   * Since the current entry is not empty, it is either a valid chain
   * or there is a displaced node here. In the latter case it is best to
   * move the displaced node to its correct bucket. However there could be
   * a displaced node there and so forth. Keep to avoid having to keep a stack
   * of displacements we can keep track of the current displaced node, add it
   * to the correct chain, and if the correct chain contains a displaced node
   * unlink it and repeat the steps
   */

  DensePtr to_insert(obj);
  if (has_ttl) {
    to_insert.SetTtl(true);
    expiration_used_ = true;
  }

  while (!entries_[bucket_id].IsEmpty() && entries_[bucket_id].IsDisplaced()) {
    DensePtr unlinked = PopPtrFront(entries_.begin() + bucket_id);

    PushFront(entries_.begin() + bucket_id, to_insert);
    to_insert = unlinked;
    bucket_id -= unlinked.GetDisplacedDirection();
  }

  DCHECK_EQ(BucketId(to_insert.GetObject(), 0), bucket_id);
  ChainVectorIterator list = entries_.begin() + bucket_id;
  PushFront(list, to_insert);
  obj_malloc_used_ += ObjectAllocSize(obj);
  DCHECK(!entries_[bucket_id].IsDisplaced());

  ++size_;
}

void DenseSet::Prefetch(uint64_t hash) {
  uint32_t bid = BucketId(hash);
  PREFETCH_READ(&entries_[bid]);
}

auto DenseSet::Find2(const void* ptr, uint32_t bid, uint32_t cookie)
    -> tuple<size_t, DensePtr*, DensePtr*> {
  DCHECK_LT(bid, entries_.size());

  DensePtr* curr = &entries_[bid];
  ExpireIfNeeded(nullptr, curr);

  if (Equal(*curr, ptr, cookie)) {
    return {bid, nullptr, curr};
  }

  // first look for displaced nodes since this is quicker than iterating a potential long chain
  if (bid > 0) {
    curr = &entries_[bid - 1];
    if (curr->IsDisplaced() && curr->GetDisplacedDirection() == -1) {
      ExpireIfNeeded(nullptr, curr);

      if (Equal(*curr, ptr, cookie)) {
        return {bid - 1, nullptr, curr};
      }
    }
  }

  if (bid + 1 < entries_.size()) {
    curr = &entries_[bid + 1];
    if (curr->IsDisplaced() && curr->GetDisplacedDirection() == 1) {
      ExpireIfNeeded(nullptr, curr);

      if (Equal(*curr, ptr, cookie)) {
        return {bid + 1, nullptr, curr};
      }
    }
  }

  // if the node is not displaced, search the correct chain
  DensePtr* prev = &entries_[bid];
  curr = prev->Next();
  while (curr != nullptr) {
    ExpireIfNeeded(prev, curr);

    if (Equal(*curr, ptr, cookie)) {
      return {bid, prev, curr};
    }
    prev = curr;
    curr = curr->Next();
  }

  // not in the Set
  return {0, nullptr, nullptr};
}

void* DenseSet::Delete(DensePtr* prev, DensePtr* ptr, bool detach) {
  void* obj = nullptr;

  if (ptr->IsObject()) {
    obj = ptr->Raw();
    ptr->Reset();
    if (prev) {
      DCHECK(prev->IsLink());

      DenseLinkKey* plink = prev->AsLink();
      DensePtr tmp = DensePtr::From(plink);
      // Transfer TTL flag
      tmp.SetTtl(prev->HasTtl());
      DCHECK(ObjectAllocSize(tmp.GetObject()));

      FreeLink(plink);
      *prev = tmp;
      DCHECK(!prev->IsLink());
    }
  } else {
    DCHECK(ptr->IsLink());

    DenseLinkKey* link = ptr->AsLink();
    obj = link->Raw();
    *ptr = link->next;
    FreeLink(link);
  }

  obj_malloc_used_ -= ObjectAllocSize(obj);
  --size_;

  if (detach) {
    return obj;
  }
  ObjDelete(obj, false);
  return nullptr;
}

DenseSet::ChainVectorIterator DenseSet::GetRandomChain() {
  if (entries_.empty() || size_ == 0) {
    return entries_.end();
  }

  size_t offset = absl::Uniform<size_t>(tl_bit_gen, 0u, entries_.size());

  // Start at random position and scan linearly with wrap-around
  auto it = entries_.begin() + offset;
  for (size_t n = 0; n < entries_.size(); n++) {
    // Check IsEmpty first to avoid ExpireIfNeeded overhead on empty buckets
    if (!it->IsEmpty()) {
      ExpireIfNeeded(nullptr, &*it);
      if (!it->IsEmpty()) {
        return it;
      }
    }

    if (++it == entries_.end()) {
      it = entries_.begin();
    }
  }

  return entries_.end();
}

DenseSet::IteratorBase DenseSet::GetRandomIterator() {
  ChainVectorIterator chain_it = GetRandomChain();
  if (chain_it == entries_.end())
    return IteratorBase{};

  DensePtr* ptr = &*chain_it;
  while (ptr->IsLink() && absl::Bernoulli(tl_bit_gen, 0.5)) {
    DensePtr* next = ptr->Next();
    if (ExpireIfNeeded(ptr, next))  // stop if we break the chain with expiration
      break;
    ptr = next;
  }

  return IteratorBase{(DenseSet*)this, chain_it, ptr};
}

void* DenseSet::PopInternal() {
  auto bucket_iter = GetRandomChain();  // Find first non empty chain
  if (bucket_iter == entries_.end())
    return nullptr;

  // unlink the first node in the first non-empty chain
  obj_malloc_used_ -= ObjectAllocSize(bucket_iter->GetObject());

  DensePtr front = PopPtrFront(bucket_iter);
  void* ret = front.GetObject();

  if (front.IsLink()) {
    FreeLink(front.AsLink());
  }

  --size_;
  return ret;
}

void* DenseSet::AddOrReplaceObj(void* obj, bool has_ttl) {
  uint64_t hc = Hash(obj, 0);

  DensePtr* dptr = entries_.empty() ? nullptr : Find(obj, BucketId(hc), 0).second;
  if (dptr) {  // replace existing object.
    // A bit confusing design: ttl bit is located on the wrapping pointer,
    // therefore we must set ttl bit before unrapping below.
    dptr->SetTtl(has_ttl);

    if (dptr->IsLink())  // unwrap the pointer.
      dptr = dptr->AsLink();

    void* res = dptr->Raw();
    const size_t res_sz = ObjectAllocSize(res);
    DCHECK_GE(obj_malloc_used_, res_sz);
    obj_malloc_used_ -= res_sz;
    obj_malloc_used_ += ObjectAllocSize(obj);

    dptr->SetObject(obj);

    return res;
  }

  AddUnique(obj, has_ttl, hc);
  return nullptr;
}

/**
 * stable scanning api. has the same guarantees as redis scan command.
 * we avoid doing bit-reverse by using a different function to derive a bucket id
 * from hash values. By using msb part of hash we make it "stable" with respect to
 * rehashes. For example, with table log size 4 (size 16), entries in bucket id
 * 1110 come from hashes 1110XXXXX.... When a table grows to log size 5,
 * these entries can move either to 11100 or 11101. So if we traversed with our cursor
 * range [0000-1110], it's guaranteed that in grown table we do not need to cover again
 * [00000-11100]. Similarly with shrinkage, if a table is shrunk to log size 3,
 * keys from 1110 and 1111 will move to bucket 111. Again, it's guaranteed that we
 * covered the range [000-111] (all keys in that case).
 * Returns: next cursor or 0 if reached the end of scan.
 * cursor = 0 - initiates a new scan.
 */

uint32_t DenseSet::Scan(uint32_t cursor, const ItemCb& cb) const {
  // empty set
  if (capacity_log_ == 0) {
    return 0;
  }

  uint32_t entries_idx = cursor >> (32 - capacity_log_);

  auto& entries = const_cast<DenseSet*>(this)->entries_;

  // First find the bucket to scan, skip empty buckets.
  // A bucket is empty if the current index is empty and the data is not displaced
  // to the right or to the left.
  while (entries_idx < entries_.size() && NoItemBelongsBucket(entries_idx)) {
    ++entries_idx;
  }

  if (entries_idx == entries_.size()) {
    return 0;
  }

  DensePtr* curr = &entries[entries_idx];

  // Check home bucket
  if (!curr->IsEmpty() && !curr->IsDisplaced()) {
    // scanning add all entries in a given chain
    while (true) {
      cb(curr->GetObject());
      if (!curr->IsLink())
        break;

      DensePtr* mcurr = const_cast<DensePtr*>(curr);

      if (ExpireIfNeeded(mcurr, &mcurr->AsLink()->next) && !mcurr->IsLink()) {
        break;
      }
      curr = &curr->AsLink()->next;
    }
  }

  // Check if the bucket on the left belongs to the home bucket.
  if (entries_idx > 0) {
    DensePtr* left_bucket = &entries[entries_idx - 1];
    ExpireIfNeeded(nullptr, left_bucket);

    if (left_bucket->IsDisplaced() &&
        left_bucket->GetDisplacedDirection() == -1) {  // left of the home bucket
      cb(left_bucket->GetObject());
    }
  }

  // move to the next index for the next scan and check if we are done
  ++entries_idx;
  if (entries_idx >= entries_.size()) {
    return 0;
  }

  // Check if the bucket on the right belongs to the home bucket.
  DensePtr* right_bucket = &entries[entries_idx];
  ExpireIfNeeded(nullptr, right_bucket);

  if (right_bucket->IsDisplaced() &&
      right_bucket->GetDisplacedDirection() == 1) {  // right of the home bucket
    cb(right_bucket->GetObject());
  }

  return entries_idx << (32 - capacity_log_);
}

auto DenseSet::NewLink(void* data, DensePtr next) -> DenseLinkKey* {
  using LinkAllocator = StatelessAllocator<DenseLinkKey>;

  LinkAllocator la;
  DenseLinkKey* lk = la.allocate(1);
  la.construct(lk);

  lk->next = next;
  lk->SetObject(data);
  ++num_links_;

  return lk;
}

bool DenseSet::ExpireIfNeededInternal(DensePtr* prev, DensePtr* node) const {
  DCHECK(node != nullptr);
  DCHECK(node->HasTtl());

  bool deleted = false;
  do {
    uint32_t obj_time = ObjExpireTime(node->GetObject());
    if (obj_time > time_now_) {
      break;
    }

    // updates the *node to next item if relevant or resets it to empty.
    const_cast<DenseSet*>(this)->Delete(prev, node);
    deleted = true;
  } while (node->HasTtl());

  return deleted;
}

void DenseSet::CollectExpired() {
  // Simply iterating over all items will remove expired
  auto it = IteratorBase(this, false);
  while (it.curr_entry_ != nullptr) {
    it.Advance();
  }
}

size_t DenseSet::SizeSlow() {
  CollectExpired();
  return size_;
}

}  // namespace dfly


================================================
FILE: src/core/dense_set.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <type_traits>
#include <vector>

#include "core/detail/stateless_allocator.h"

namespace dfly {

// DenseSet is a nice but over-optimized data-structure. Probably is not worth it in the first
// place but sometimes the OCD kicks in and one can not resist.
// The advantage of it over redis-dict is smaller meta-data waste.
// dictEntry is 24 bytes, i.e it uses at least 32N bytes where N is the expected length.
// dict requires to allocate dictEntry per each addition in addition to the supplied key.
// It also wastes space in case of a set because it stores a value pointer inside dictEntry.
// To summarize:
// 100% utilized dict uses N*24 + N*8 = 32N bytes not including the key space.
// for 75% utilization (1/0.75 buckets): N*1.33*8 + N*24 = 35N
//
// This class uses 8 bytes per bucket (similarly to dictEntry*) but it used it for both
// links and keys. For most cases, we remove the need for another redirection layer
// and just store the key, so no "dictEntry" allocations occur.
// For those cells that require chaining, the bucket is
// changed in run-time to represent a linked chain.
// Additional feature - in order to to reduce collisions, we insert items into
// neighbour cells but only if they are empty (not chains). This way we reduce the number of
// empty (unused) spaces at full utilization from 36% to ~21%.
// 100% utilized table requires: N*8 + 0.2N*16 = 11.2N bytes or ~20 bytes savings.
// 75% utilization: N*1.33*8 + 0.12N*16 = 13N or ~22 bytes savings per record.
// with potential replacements of hset/zset data structures.
// static_assert(sizeof(dictEntry) == 24);

class DenseSet {
  struct DenseLinkKey;
  // we can assume that high 12 bits of user address space
  // can be used for tagging. At most 52 bits of address are reserved for
  // some configurations, and usually it's 48 bits.
  // https://docs.kernel.org/arch/arm64/memory.html
  static constexpr size_t kLinkBit = 1ULL << 52;
  static constexpr size_t kDisplaceBit = 1ULL << 53;
  static constexpr size_t kDisplaceDirectionBit = 1ULL << 54;
  static constexpr size_t kTtlBit = 1ULL << 55;
  static constexpr size_t kTagMask = 4095ULL << 52;  // we reserve 12 high bits.

  class DensePtr {
   public:
    explicit DensePtr(void* p = nullptr) : ptr_(p) {
    }

    // Imports the object with its metadata except the link bit that is reset.
    static DensePtr From(DenseLinkKey* o) {
      DensePtr res;
      res.ptr_ = (void*)(o->uptr() & (~kLinkBit));
      return res;
    }

    uint64_t uptr() const {
      return uint64_t(ptr_);
    }

    bool IsObject() const {
      return (uptr() & kLinkBit) == 0;
    }

    bool IsLink() const {
      return (uptr() & kLinkBit) != 0;
    }

    bool HasTtl() const {
      return (uptr() & kTtlBit) != 0;
    }

    bool IsEmpty() const {
      return ptr_ == nullptr;
    }

    void* Raw() const {
      return (void*)(uptr() & ~kTagMask);
    }

    bool IsDisplaced() const {
      return (uptr() & kDisplaceBit) == kDisplaceBit;
    }

    void SetLink(DenseLinkKey* lk) {
      ptr_ = (void*)(uintptr_t(lk) | kLinkBit);
    }

    void SetDisplaced(int direction) {
      ptr_ = (void*)(uptr() | kDisplaceBit);
      if (direction == 1) {
        ptr_ = (void*)(uptr() | kDisplaceDirectionBit);
      }
    }

    void ClearDisplaced() {
      ptr_ = (void*)(uptr() & ~(kDisplaceBit | kDisplaceDirectionBit));
    }

    // returns 1 if the displaced node is right of the correct bucket and -1 if it is left
    int GetDisplacedDirection() const {
      return (uptr() & kDisplaceDirectionBit) == kDisplaceDirectionBit ? 1 : -1;
    }

    void SetTtl(bool b) {
      if (b)
        ptr_ = (void*)(uptr() | kTtlBit);
      else
        ptr_ = (void*)(uptr() & (~kTtlBit));
    }

    void Reset() {
      ptr_ = nullptr;
    }

    void* GetObject() const {
      if (IsObject()) {
        return Raw();
      }

      return AsLink()->Raw();
    }

    // Sets pointer but preserves tagging info
    void SetObject(void* obj) {
      assert(IsObject());
      ptr_ = (void*)((uptr() & kTagMask) | (uintptr_t(obj) & ~kTagMask));
    }

    DenseLinkKey* AsLink() {
      return (DenseLinkKey*)Raw();
    }

    const DenseLinkKey* AsLink() const {
      return (const DenseLinkKey*)Raw();
    }

    DensePtr* Next() {
      if (!IsLink()) {
        return nullptr;
      }

      return &AsLink()->next;
    }

    const DensePtr* Next() const {
      if (!IsLink()) {
        return nullptr;
      }

      return &AsLink()->next;
    }

   private:
    void* ptr_ = nullptr;
  };

  struct DenseLinkKey : public DensePtr {
    DensePtr next;  // could be LinkKey* or Object *.
  };

  static_assert(sizeof(DensePtr) == sizeof(uintptr_t));
  static_assert(sizeof(DenseLinkKey) == 2 * sizeof(uintptr_t));

 protected:
  using DensePtrAllocator = StatelessAllocator<DensePtr>;
  using ChainVectorIterator = std::vector<DensePtr, DensePtrAllocator>::iterator;
  using ChainVectorConstIterator = std::vector<DensePtr, DensePtrAllocator>::const_iterator;

  class IteratorBase {
    friend class DenseSet;

   public:
    IteratorBase(DenseSet* owner, ChainVectorIterator list_it, DensePtr* e)
        : owner_(owner), curr_list_(list_it), curr_entry_(e) {
    }

    // returns the expiry time of the current entry or UINT32_MAX if no ttl is set.
    uint32_t ExpiryTime() const {
      return curr_entry_->HasTtl() ? owner_->ObjExpireTime(curr_entry_->GetObject()) : UINT32_MAX;
    }

    void SetExpiryTime(uint32_t ttl_sec);

    bool HasExpiry() const {
      return curr_entry_->HasTtl();
    }

   protected:
    IteratorBase() : owner_(nullptr), curr_entry_(nullptr) {
    }

    IteratorBase(const DenseSet* owner, bool is_end);

    void Advance();

    DenseSet* owner_;
    ChainVectorIterator curr_list_;
    DensePtr* curr_entry_;
  };

 public:
  static constexpr uint32_t kMaxBatchLen = 32;

  explicit DenseSet();
  virtual ~DenseSet();

  void Clear() {
    ClearStep(0, entries_.size());
  }

  // Returns the next bucket index that should be cleared.
  // Returns BucketCount when all objects are erased.
  uint32_t ClearStep(uint32_t start, uint32_t count);

  // Returns the number of elements in the map. Note that it might be that some of these elements
  // have expired and can't be accessed.
  size_t UpperBoundSize() const {
    return size_;
  }

  // Returns an accurate size, post-expiration. O(n).
  size_t SizeSlow();

  bool Empty() const {
    return size_ == 0;
  }

  size_t BucketCount() const {
    return entries_.size();
  }

  size_t ObjMallocUsed() const {
    return obj_malloc_used_;
  }

  size_t SetMallocUsed() const {
    return entries_.capacity() * sizeof(DensePtr) + num_links_ * sizeof(DenseLinkKey);
  }

  using ItemCb = std::function<void(const void*)>;

  uint32_t Scan(uint32_t cursor, const ItemCb& cb) const;
  void Reserve(size_t sz);

  // Shrinks the table to the specified size. The size must be a power of 2,
  // >= kMinSize, and >= current number of elements.
  // This method should be called explicitly when memory reclamation is needed.
  void Shrink(size_t new_size);

  void Fill(DenseSet* other) const;

  // set an abstract time that allows expiry.
  void set_time(uint32_t val) {
    time_now_ = val;
  }

  uint32_t time_now() const {
    return time_now_;
  }

  bool ExpirationUsed() const {
    return expiration_used_;
  }

 protected:
  // Virtual functions to be implemented for generic data
  virtual uint64_t Hash(const void* obj, uint32_t cookie) const = 0;
  virtual bool ObjEqual(const void* left, const void* right, uint32_t right_cookie) const = 0;
  virtual size_t ObjectAllocSize(const void* obj) const = 0;
  virtual uint32_t ObjExpireTime(const void* obj) const = 0;
  virtual void ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) = 0;
  virtual void ObjDelete(void* obj, bool has_ttl) const = 0;
  virtual void* ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const = 0;

  void CollectExpired();

  bool EraseInternal(void* obj, uint32_t cookie) {
    auto [prev, found] = Find(obj, BucketId(obj, cookie), cookie);
    if (found) {
      Delete(prev, found);
      return true;
    }
    return false;
  }

  // Like EraseInternal but returns the detached object instead of deleting it.
  // Returns nullptr if the object was not found.
  void* DetachInternal(void* obj, uint32_t cookie) {
    auto [prev, found] = Find(obj, BucketId(obj, cookie), cookie);
    if (found) {
      return Delete(prev, found, true);
    }
    return nullptr;
  }

  void* FindInternal(const void* obj, uint64_t hashcode, uint32_t cookie) const;

  IteratorBase FindIt(const void* ptr, uint32_t cookie) {
    if (Empty())
      return IteratorBase{};

    auto [bid, _, curr] = Find2(ptr, BucketId(ptr, cookie), cookie);
    if (curr) {
      return IteratorBase(this, entries_.begin() + bid, curr);
    }
    return IteratorBase{};
  }

  // Get iterator to start of random non-empty chain (bucket)
  ChainVectorIterator GetRandomChain();

  // Wrap RandomChain() into iterator and advance with reservoir sampling
  IteratorBase GetRandomIterator();

  void* PopInternal();

  void IncreaseMallocUsed(size_t delta) {
    obj_malloc_used_ += delta;
  }

  void DecreaseMallocUsed(size_t delta) {
    obj_malloc_used_ -= delta;
  }

  // Returns the previous object if it has been replaced.
  // nullptr, if obj was added.
  void* AddOrReplaceObj(void* obj, bool has_ttl);

  // Assumes that the object does not exist in the set.
  void AddUnique(void* obj, bool has_ttl, uint64_t hashcode);

  void Prefetch(uint64_t hash);

 private:
  DenseSet(const DenseSet&) = delete;
  DenseSet& operator=(DenseSet&) = delete;

  bool Equal(DensePtr dptr, const void* ptr, uint32_t cookie) const;

  struct CloneItem {
    DensePtr ptr;
    void* obj = nullptr;
    bool has_ttl = false;
  };

  void CloneBatch(unsigned len, CloneItem* items, DenseSet* other) const;

  using ClearItem = CloneItem;
  void ClearBatch(unsigned len, ClearItem* items);

  uint32_t BucketId(uint64_t hash) const {
    assert(capacity_log_ > 0);
    return hash >> (64 - capacity_log_);
  }

  uint32_t BucketId(const void* ptr, uint32_t cookie) const {
    return BucketId(Hash(ptr, cookie));
  }

  // return a ChainVectorIterator (a.k.a iterator) or end if there is an empty chain found
  ChainVectorIterator FindEmptyAround(uint32_t bid);

  // Return if bucket has no item which is not displaced and right/left bucket has no displaced item
  // belong to given bid
  bool NoItemBelongsBucket(uint32_t bid) const;
  void Grow(size_t prev_size);

  // ============ Pseudo Linked List Functions for interacting with Chains ==================
  size_t PushFront(ChainVectorIterator, void* obj, bool has_ttl);
  void PushFront(ChainVectorIterator, DensePtr);

  DensePtr PopPtrFront(ChainVectorIterator);

  // ============ Pseudo Linked List in DenseSet end ==================

  // returns (prev, item) pair. If item is root, then prev is null.
  std::pair<DensePtr*, DensePtr*> Find(const void* ptr, uint32_t bid, uint32_t cookie) {
    auto [_, p, c] = Find2(ptr, bid, cookie);
    return {p, c};
  }

  // returns bid and (prev, item) pair. If item is root, then prev is null.
  std::tuple<size_t, DensePtr*, DensePtr*> Find2(const void* ptr, uint32_t bid, uint32_t cookie);

  DenseLinkKey* NewLink(void* data, DensePtr next);

  inline void FreeLink(DenseLinkKey* plink) {
    // deallocate the link if it is no longer a link as it is now in an empty list
    DensePtrAllocator::resource()->deallocate(plink, sizeof(DenseLinkKey), alignof(DenseLinkKey));
    --num_links_;
  }

  // Returns true if *node was deleted.
  bool ExpireIfNeeded(DensePtr* prev, DensePtr* node) const {
    if (node->HasTtl()) {
      return ExpireIfNeededInternal(prev, node);
    }
    return false;
  }

  bool ExpireIfNeededInternal(DensePtr* prev, DensePtr* node) const;

  // Deletes the object pointed by ptr and removes it from the set.
  // If ptr is a link then it will be deleted internally.
  // If detach is true, returns the raw object instead of calling ObjDelete.
  void* Delete(DensePtr* prev, DensePtr* ptr, bool detach = false);

  // Processes a single bucket during Shrink, relocating elements as needed.
  void ShrinkBucket(size_t bucket_idx);

  std::vector<DensePtr, DensePtrAllocator> entries_;

  mutable size_t obj_malloc_used_ = 0;
  mutable uint32_t size_ = 0;       // number of elements in the set.
  mutable uint32_t num_links_ = 0;  // number of links in the set.
  unsigned capacity_log_ = 0;

  uint32_t time_now_ = 0;

  mutable bool expiration_used_ = false;
};

inline void* DenseSet::FindInternal(const void* obj, uint64_t hashcode, uint32_t cookie) const {
  if (entries_.empty())
    return nullptr;

  uint32_t bid = BucketId(hashcode);
  DensePtr* ptr = const_cast<DenseSet*>(this)->Find(obj, bid, cookie).second;
  return ptr ? ptr->GetObject() : nullptr;
}

}  // namespace dfly


================================================
FILE: src/core/detail/bitpacking.cc
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "src/core/detail/bitpacking.h"

#include <absl/base/internal/endian.h>

#include "base/logging.h"
#include "core/sse_port.h"

using namespace std;

namespace dfly {

namespace detail {

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC optimize("Ofast")
#endif

static inline uint64_t Compress8x7bit(uint64_t x) {
  x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
  x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
  x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);

  return x;
}

#if defined(__SSE3__) || defined(__aarch64__)
static inline pair<const char*, uint8_t*> simd_variant1_pack(const char* ascii, const char* end,
                                                             uint8_t* bin) {
  __m128i val, rpart, lpart;

  // Skips 8th byte (indexc 7) in the lower 8-byte part.
  const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);

  // Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
  while (ascii <= end) {
    val = mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii));

    /*
    x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
    x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
    x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
    */

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x007F007F007F007F));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x7F007F007F007F00));
    val = _mm_or_si128(_mm_srli_epi64(lpart, 1), rpart);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x00003FFF00003FFF));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x3FFF00003FFF0000));
    val = _mm_or_si128(_mm_srli_epi64(lpart, 2), rpart);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x0FFFFFFF00000000));
    val = _mm_or_si128(_mm_srli_epi64(lpart, 4), rpart);

    val = _mm_shuffle_epi8(val, control);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bin), val);
    bin += 14;
    ascii += 16;
  }

  return make_pair(ascii, bin);
}

static inline pair<const char*, uint8_t*> simd_variant2_pack(const char* ascii, const char* end,
                                                             uint8_t* bin) {
  // Skips 8th byte (indexc 7) in the lower 8-byte part.
  const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);

  __m128i val, rpart, lpart;

  // Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
  while (ascii <= end) {
    val = mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii));

    /*
    x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
    x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
    x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
    */
    val = _mm_maddubs_epi16(_mm_set1_epi16(0x8001), val);
    val = _mm_madd_epi16(_mm_set1_epi32(0x40000001), val);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x0FFFFFFF00000000));
    val = _mm_or_si128(_mm_srli_epi64(lpart, 4), rpart);

    val = _mm_shuffle_epi8(val, control);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bin), val);
    bin += 14;
    ascii += 16;
  }
  return make_pair(ascii, bin);
}

#endif

// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
// See https://github.com/lemire/fastvalidate-utf-8/
// The function returns true (1) if all chars passed in src are
// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
#ifdef __s390x__
bool validate_ascii_fast(const char* src, size_t len) {
  size_t i = 0;

  // Initialize a vector in which all the elements are set to zero.
  vector unsigned char has_error = vec_splat_s8(0);
  if (len >= 16) {
    for (; i <= len - 16; i += 16) {
      // Load 16 bytes from buffer into a vector.
      vector unsigned char current_bytes = vec_load_len((signed char*)(src + i), 16);
      // Perform a bitwise OR operation between the current and the previously loaded contents.
      has_error = vec_orc(has_error, current_bytes);
    }
  }

  // Initialize a vector in which all the elements are set to an invalid ASCII value.
  vector unsigned char rep_invalid_values = vec_splat_s8(0x80);

  // Perform bitwise AND-complement operation between two vectors.
  vector unsigned char andc_result = vec_andc(rep_invalid_values, has_error);

  // Tests whether any of corresponding elements of the given vectors are not equal.
  // After the bitwise operation, both vectors should be equal if ASCII values.
  if (!vec_all_eq(rep_invalid_values, andc_result)) {
    return false;
  }

  for (; i < len; i++) {
    if (src[i] & 0x80) {
      return false;
    }
  }

  return true;
}
#else
bool validate_ascii_fast(const char* src, size_t len) {
  size_t i = 0;
  __m128i has_error = _mm_setzero_si128();
  if (len >= 16) {
    for (; i <= len - 16; i += 16) {
      __m128i current_bytes = mm_loadu_si128((const __m128i*)(src + i));
      has_error = _mm_or_si128(has_error, current_bytes);
    }
  }
  int error_mask = _mm_movemask_epi8(has_error);

  char tail_has_error = 0;
  for (; i < len; i++) {
    tail_has_error |= src[i];
  }
  error_mask |= (tail_has_error & 0x80);

  return !error_mask;
}
#endif

// len must be at least 16
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
  uint64_t val;
  const char* end = ascii + len;

  while (ascii + 8 <= end) {
    val = absl::little_endian::Load64(ascii);
    uint64_t dest = (val & 0xFF);
    for (unsigned i = 1; i <= 7; ++i) {
      val >>= 1;
      dest |= (val & (0x7FUL << 7 * i));
    }
    memcpy(bin, &dest, 7);
    bin += 7;
    ascii += 8;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (ascii < end) {
    *bin++ = *ascii++;
  }
}

void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
  uint64_t val;
  const char* end = ascii + len;

  while (ascii + 8 <= end) {
    val = absl::little_endian::Load64(ascii);
    val = Compress8x7bit(val);
    memcpy(bin, &val, 7);
    bin += 7;
    ascii += 8;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (ascii < end) {
    *bin++ = *ascii++;
  }
}

// The algo - do in parallel what ascii_pack does on two uint64_t integers
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
#if defined(__SSE3__) || defined(__aarch64__)
  // I leave out 16 bytes in addition to 16 that we load in the loop
  // because we store into bin full 16 bytes instead of 14. To prevent data
  // overwrite we finish loop one iteration earlier.
  const char* end = ascii + len - 32;

  tie(ascii, bin) = simd_variant1_pack(ascii, end, bin);

  end += 32;  // Bring back end.
  DCHECK(ascii < end);
  ascii_pack(ascii, end - ascii, bin);
#else
  ascii_pack(ascii, len, bin);
#endif
}

void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) {
#if defined(__SSE3__) || defined(__aarch64__)
  // I leave out 16 bytes in addition to 16 that we load in the loop
  // because we store into bin full 16 bytes instead of 14. To prevent data
  // overwrite we finish loop one iteration earlier.
  const char* end = ascii + len - 32;

  // on arm var
#if defined(__aarch64__)
  tie(ascii, bin) = simd_variant1_pack(ascii, end, bin);
#else
  tie(ascii, bin) = simd_variant2_pack(ascii, end, bin);
#endif

  end += 32;  // Bring back end.
  DCHECK(ascii < end);
  ascii_pack(ascii, end - ascii, bin);
#else
  ascii_pack(ascii, len, bin);
#endif
}

// unpacks 8->7 encoded blob back to ascii.
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
// left than we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
  constexpr uint8_t kM = 0x7F;
  uint8_t p = 0;
  unsigned i = 0;

  while (ascii_len >= 8) {
    for (i = 0; i < 7; ++i) {
      uint8_t src = *bin;  // keep on stack in case we unpack inplace.
      *ascii++ = (p >> (8 - i)) | ((src << i) & kM);
      p = src;
      ++bin;
    }

    ascii_len -= 8;
    *ascii++ = p >> 1;
  }

  DCHECK_LT(ascii_len, 8u);
  for (i = 0; i < ascii_len; ++i) {
    *ascii++ = *bin++;
  }
}

uint8_t ascii_unpack_byte(const uint8_t* bin, size_t ascii_len, size_t idx) {
  DCHECK(idx < ascii_len) << "Index oob for ascii byte unpacking: " << idx << " >= " << ascii_len;
  const size_t packed_groups = ascii_len / 8;
  const size_t group = idx / 8;
  const size_t idx_in_group = idx % 8;

  // Tail bytes (after the last full 8-char group) are stored unpacked.
  if (group >= packed_groups) {
    return bin[packed_groups * 7 + idx_in_group];
  }

  // Unpack ascii group and return byte at idx.
  char buf[8];
  ascii_unpack(bin + group * 7, 8, buf);
  return buf[idx_in_group];
}

void ascii_pack_byte(uint8_t* bin, size_t ascii_len, size_t idx, uint8_t val) {
  DCHECK(idx < ascii_len) << "Index oob for ascii byte packing: " << idx << " >= " << ascii_len;
  DCHECK_LT(val, 128u) << "Only 7-bit ASCII values can be packed";

  const size_t packed_groups = ascii_len / 8;
  const size_t group = idx / 8;
  const size_t idx_in_group = idx % 8;

  // Tail bytes (after the last full 8-char group) are stored unpacked.
  if (group >= packed_groups) {
    bin[packed_groups * 7 + idx_in_group] = val;
    return;
  }

  // Unpack ascii group and return, modify byte at idx and pack back.
  uint8_t* group_bin = bin + group * 7;
  char buf[8];
  ascii_unpack(group_bin, 8, buf);
  buf[idx_in_group] = val;
  ascii_pack(buf, 8, group_bin);
}

// See CompactObjectTest.AsanTriggerReadOverflow for more details.
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii) {
#if defined(__SSE3__) || defined(__aarch64__)

  if (ascii_len < 18) {  // ascii_len >=18 means bin length >=16.
    ascii_unpack(bin, ascii_len, ascii);
    return;
  }

  __m128i val, rpart, lpart;

  // we read 16 bytes from bin even when we need only 14 bytes.
  // So for last iteration we may access 2 bytes outside of the bin buffer.
  // To prevent this we need to round down the length of the bin buffer but since we
  // limit by ascii_len we reduce the ascii_len by two before computing number of iterations.
  size_t simd_len = ((ascii_len - 2) / 16) * 16;
  const char* end = ascii + simd_len;

  // shifts the second 7-byte blob to the left.
  const __m128i control = _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, -1, 6, 5, 4, 3, 2, 1, 0);

  while (ascii < end) {
    val = mm_loadu_si128(reinterpret_cast<const __m128i*>(bin));
    val = _mm_shuffle_epi8(val, control);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x00FFFFFFF0000000));
    val = _mm_or_si128(_mm_slli_epi64(lpart, 4), rpart);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x00003FFF00003FFF));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0xFFFFC000FFFFC000));
    val = _mm_or_si128(_mm_slli_epi64(lpart, 2), rpart);

    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x007F007F007F007F));
    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x7F807F807F807F80));
    val = _mm_or_si128(_mm_slli_epi64(lpart, 1), rpart);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(ascii), val);
    ascii += 16;
    bin += 14;
  }

  ascii_len -= simd_len;
  if (ascii_len)
    ascii_unpack(bin, ascii_len, ascii);
#else
  ascii_unpack(bin, ascii_len, ascii);
#endif
}

// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
  unsigned i = 0;
  bool res = true;
  const char* end = ascii + ascii_len;

  while (ascii + 8 <= end) {
    for (i = 0; i < 7; ++i) {
      uint8_t conv = (ascii[0] >> i) | (ascii[1] << (7 - i));
      res &= (conv == *packed);
      ++ascii;
      ++packed;
    }

    if (!res)
      return false;

    ++ascii;
  }

  while (ascii < end) {
    if (*ascii++ != *packed++) {
      return false;
    }
  }

  return true;
}

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

}  // namespace detail

}  // namespace dfly


================================================
FILE: src/core/detail/bitpacking.h
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstddef>
#include <cstdint>

namespace dfly {

namespace detail {

bool validate_ascii_fast(const char* src, size_t len);

// unpacks 8->7 encoded blob back to ascii.
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
// left than we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii);

// Access a single byte in a 7-bit ASCII-packed string without unpacking the entire buffer.
// These helpers read/write the ASCII byte at logical position `idx` in the unpacked string
// directly from/into the packed `bin` representation.
// It's up to caller to verify:
// `1. idx` must be less than `ascii_len` to avoid out-of-bounds access.
// 2. `ascii` must be less than 128 (7-bit ASCII) for packing.
uint8_t ascii_unpack_byte(const uint8_t* bin, size_t ascii_len, size_t idx);
void ascii_pack_byte(uint8_t* bin, size_t ascii_len, size_t idx, uint8_t ascii);

// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
void ascii_pack2(const char* ascii, size_t len, uint8_t* bin);

// SIMD implementation 1 of ascii_pack.
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin);

// SIMD implementation 2 of ascii_pack.
void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin);

bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len);

// maps ascii len to 7-bit packed length. Each 8 bytes are converted to 7 bytes.
inline constexpr size_t binpacked_len(size_t ascii_len) {
  return (ascii_len * 7 + 7) / 8; /* rounded up */
}

// converts 7-bit packed length back to ascii length. Note that this conversion
// is not accurate since it maps 7 bytes to 8 bytes (rounds up), while we may have
// 7 byte strings converted to 7 byte as well.
inline constexpr size_t ascii_len(size_t bin_len) {
  return (bin_len * 8) / 7;
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/detail/bptree_internal.h
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

namespace dfly {

template <typename T, typename Policy> class BPTree;

namespace detail {

// Internal classes related to B+tree implementation. The design is largely based on the
// implementation of absl::bPtree_map/set.
// The motivation for replacing zskiplist - significant size reduction:
//   we reduce the metadata overhead per record from 45 bytes in zskiplist to just a
//   few bytes with b-tree. The trick is using significantly large nodes (256 bytes) so that
//   their overhead is negligible compared to the items they store.
//   Why not use absl::bPtree_set? We must support Rank tree functionality that
//   absl does not supply.
//   Hacking into absl is not a simple task, implementing our own tree is easier.
// Below some design decisions:
// 1. We use predefined node size of 256 bytes and derive number of items in each node from it.
//    Inner nodes have less items than leaf nodes because they also need to store child pointers.
// 2. BPTreeNode does not predeclare fields besides the 8 bytes metadata - everything else is
//    calculated at run-time and has dedicated accessors (similarly to absl). This allows
//    dense and efficient representation of tree nodes.
// 3. We assume that we store small items (8, 16 bytes) which will have a large branching
//    factor (248/16), meaning the tree will stay shallow even for sizes reaching billion nodes.
// 4. We do not store parent pointer like in absl tree. Instead we use BPTreePath to store
//    hierarchy of parent nodes. That should reduce our overhead even further by few bits per item.
// 5. We assume we store trivially copyable types - this reduces the
//    complexity of the generics in the code.
// 6. We support pmr memory resource. This allows us to use pluggable heaps.
//
// TODO: (all the ideas taken from absl implementation)
//       1. to introduce slices when removing items from the tree (avoid shifts).
//       2. to avoid merging/rebalancing when removing max/min items from the tree.
//       3. Small tree optimization: when the tree is small with a single root node, we can
//          allocate less then 256 bytes (special case) to avoid relative blowups in memory for
//          small trees.

constexpr uint16_t kBPNodeSize = 256;

/**
 * @brief The BPNodeLayout class is a helper class that defines the layout of the B+tree node.
 *        The inner node looks like this:
 *        | 4 bytes metadata | keys ... | 4 bytes tree-count | children nodes |
 *        The leaf node looks like this:
 *        | 4 bytes metadata | keys ... |
 *
 * @tparam T
 */
template <typename T> class BPNodeLayout {
  static_assert(std::is_trivially_copyable<T>::value, "KeyT must be triviall copyable");

  static constexpr uint16_t kKeyOffset = 4;                  // 4 bytes for metadata
  static constexpr uint16_t kSubTreeLen = sizeof(uint32_t);  // 4 bytes for count.
 public:
  static constexpr uint16_t kKeySize = sizeof(T);
  static constexpr uint16_t kMaxLeafKeys = (kBPNodeSize - kKeyOffset) / kKeySize;
  static constexpr uint16_t kMinLeafKeys = kMaxLeafKeys / 2;

  // internal node:
  // x slots, (x+1) children: x * kKeySize + (x+1) * sizeof(BPTreeNode*) = x * (kKeySize + 8) + 8
  // x = (kBPNodeSize - kInnerKeyOffset - 8) / (kKeySize + 8)
  static constexpr uint16_t kMaxInnerKeys =
      (kBPNodeSize - sizeof(void*) - kKeyOffset - kSubTreeLen) / (kKeySize + sizeof(void*));
  static constexpr uint16_t kMinInnerKeys = kMaxInnerKeys / 2;

  using KeyT = T;

  // The class is constructed inside a block of memory of size kBPNodeSize.
  // Only BPTree can create it, hence it can access the memory outside its fields.
  static uint8_t* KeyPtr(unsigned index, void* node) {
    return reinterpret_cast<uint8_t*>(node) + kKeyOffset + kKeySize * index;
  }

  static const uint8_t* KeyPtr(unsigned index, const void* node) {
    return reinterpret_cast<const uint8_t*>(node) + kKeyOffset + kKeySize * index;
  }

  static uint8_t* TreeCountPtr(void* node) {
    return reinterpret_cast<uint8_t*>(node) + kKeyOffset + kKeySize * kMaxInnerKeys;
  }

  static const uint8_t* TreeCountPtr(const void* node) {
    return reinterpret_cast<const uint8_t*>(node) + kKeyOffset + kKeySize * kMaxInnerKeys;
  }

  static uint8_t* ChildrenStart(void* node) {
    return TreeCountPtr(node) + kSubTreeLen;
  }

  static const uint8_t* ChildrenStart(const void* node) {
    return TreeCountPtr(node) + kSubTreeLen;
  }

  static_assert(kMaxLeafKeys < 128);
};

template <typename T> class BPTreeNode {
  template <typename K, typename Policy> friend class ::dfly::BPTree;

  BPTreeNode(const BPTreeNode&) = delete;
  BPTreeNode& operator=(const BPTreeNode&) = delete;

  BPTreeNode(bool leaf) : num_items_(0), leaf_(leaf) {
  }

  using Layout = BPNodeLayout<T>;

 public:
  using KeyT = T;

  void InitSingle(T key) {
    SetKey(0, key);
    num_items_ = 1;
  }

  KeyT Key(unsigned index) const {
    KeyT res;
    memcpy(&res, Layout::KeyPtr(index, this), sizeof(KeyT));
    return res;
  }

  void SetKey(size_t index, KeyT item) {
    uint8_t* slot = Layout::KeyPtr(index, this);
    memcpy(slot, &item, sizeof(KeyT));
  }

  bool IsLeaf() const {
    return leaf_;
  }

  struct SearchResult {
    uint16_t index;
    bool found;
  };

  // Searches for key in the node using binary search.
  // Returns SearchResult with index of the smallest key for which comp(key) >=0.
  // comp: is a three way comparator.
  template <typename Comp> SearchResult BSearch(Comp&& comp) const;

  void Split(BPTreeNode* right, KeyT* median);

  unsigned NumItems() const {
    return num_items_;
  }

  unsigned AvailableSlotCount() const {
    return MaxItems() - num_items_;
  }

  unsigned MaxItems() const {
    return IsLeaf() ? Layout::kMaxLeafKeys : Layout::kMaxInnerKeys;
  }

  unsigned MinItems() const {
    return IsLeaf() ? Layout::kMinLeafKeys : Layout::kMinInnerKeys;
  }

  // Returns the overall number of iterms for a subtree rooted at this node.
  // Equals to NumItems() for leaf nodes and GetInnerTreeCount() for inner nodes.
  uint32_t TreeCount() const {
    return IsLeaf() ? NumItems() : GetInnerTreeCount();
  }

  void ShiftRight(unsigned index);
  void ShiftLeft(unsigned index, bool child_step_right = false);

  void LeafEraseRight() {
    assert(IsLeaf() && num_items_ > 0);
    --num_items_;
  }

  // Inserts item into a leaf node.
  // Assumes: the node is IsLeaf() and has some space.
  void LeafInsert(unsigned index, KeyT item) {
    assert(IsLeaf() && NumItems() < MaxItems());
    InsertItem(index, item);
  }

  void Validate(KeyT upper_bound) const;

  //
  // Below is the inner node API
  //

  BPTreeNode* Child(unsigned i) {
    BPTreeNode* res;
    memcpy(&res, Layout::ChildrenStart(this) + sizeof(BPTreeNode*) * i, sizeof(BPTreeNode*));
    return res;
  }

  const BPTreeNode* Child(unsigned i) const {
    BPTreeNode* res;
    memcpy(&res, Layout::ChildrenStart(this) + sizeof(BPTreeNode*) * i, sizeof(BPTreeNode*));
    return res;
  }

  void SetChild(unsigned i, BPTreeNode* child) {
    memcpy(Layout::ChildrenStart(this) + sizeof(BPTreeNode*) * i, &child, sizeof(BPTreeNode*));
  }

  // TODO: instead of storing counts at nodes we could keep at parent level
  //       along the children array. Unfortunately, this complicates implementation of the tree,
  //       so we will do it after the whole functionality is completed.
  uint32_t GetChildTreeCount(unsigned i) {
    return Child(i)->TreeCount();
  }

  void SetChildTreeCount(unsigned i, uint32_t cnt) {
    Child(i)->SetTreeCount(cnt);
  }

  void IncreaseTreeCount(int32_t delta) {
    uint32_t cnt = GetInnerTreeCount();
    cnt += delta;
    memcpy(Layout::TreeCountPtr(this), &cnt, sizeof(uint32_t));
  }

  // Rebalance a full child at position pos, at which we tried to insert at insert_pos.
  // Returns the node and the position to insert into if rebalancing succeeded.
  // Returns nullptr if rebalancing did not succeed.
  std::pair<BPTreeNode*, unsigned> RebalanceChild(unsigned pos, unsigned insert_pos);

  // We do not update tree count and it is done on the caller side.
  // Inserts item into a inner node at position pos and adds `child` at position pos+1.
  void InnerInsert(unsigned index, KeyT item, BPTreeNode* child) {
    InsertItem(index, item);
    SetChild(index + 1, child);
  }

  // Tries to merge the child at position pos with its sibling.
  // If we did not succeed to merge, we try to rebalance.
  // Returns retired BPTreeNode* if children got merged and this parent node's children
  // count decreased, otherwise, we return nullptr (rebalanced).
  BPTreeNode* MergeOrRebalanceChild(unsigned pos);

  uint32_t DEBUG_TreeCount() const {
    uint32_t res = NumItems();
    if (!IsLeaf()) {
      for (unsigned i = 0; i <= NumItems(); ++i) {
        res += Child(i)->DEBUG_TreeCount();
      }
    }
    return res;
  }

 private:
  void SetTreeCount(uint32_t cnt) {
    assert(!IsLeaf());
    memcpy(Layout::TreeCountPtr(this), &cnt, sizeof(uint32_t));
  }

  void RebalanceChildToLeft(unsigned child_pos, unsigned count);
  void RebalanceChildToRight(unsigned child_pos, unsigned count);

  void MergeFromRight(KeyT key, BPTreeNode* right);

  void InsertItem(unsigned index, KeyT item) {
    assert(index <= num_items_);

    ShiftRight(index);
    SetKey(index, item);
  }

  uint32_t GetInnerTreeCount() const {
    assert(!IsLeaf());
    uint32_t res;
    memcpy(&res, Layout::TreeCountPtr(this), sizeof(uint32_t));
    return res;
  }

  struct {
    uint32_t num_items_ : 7;
    uint32_t leaf_ : 1;
    uint32_t : 24;
  };
};

// Contains parent/index pairs. Meaning that node0->Child(index0) == node1.
template <typename T> class BPTreePath {
  static constexpr unsigned kMaxDepth = 16;

 public:
  void Push(BPTreeNode<T>* node, unsigned pos) {
    assert(depth_ < kMaxDepth);
    assert(depth_ == 0 || !record_[depth_ - 1].node->IsLeaf());
    record_[depth_].node = node;
    record_[depth_].pos = pos;
    depth_++;
  }

  unsigned Depth() const {
    return depth_;
  }

  void Clear() {
    depth_ = 0;
  }

  bool Empty() const {
    return depth_ == 0;
  }

  std::pair<BPTreeNode<T>*, unsigned> Last() const {
    assert(depth_ > 0u);
    return {record_[depth_ - 1].node, record_[depth_ - 1].pos};
  }

  BPTreeNode<T>* Node(unsigned i) const {
    assert(i < depth_);
    return record_[i].node;
  }

  unsigned Position(unsigned i) const {
    assert(i < depth_);
    return record_[i].pos;
  }

  void Pop() {
    assert(depth_ > 0u);
    depth_--;
  }

  bool HasValidTerminal() const {
    return depth_ > 0u && Last().second < Last().first->NumItems();
  }

  T Terminal() const {
    assert(Last().second < Last().first->NumItems());
    return Last().first->Key(Last().second);
  }

  /// @brief Returns the rank of the path's terminal item.
  /// Requires that the path is valid and has a terminal item.
  uint32_t Rank() const;

  /// @brief Advances the path to the next item.
  /// @return true if succeeded, false if reached the end.
  bool Next();

  /// @brief Advances the path to the previous item.
  /// @return true if succeeded, false if reached the end.
  bool Prev();

  // Extend the path to the leaf by always taking the rightmost child.
  void DigRight();

 private:
  struct Record {
    BPTreeNode<T>* node;
    unsigned pos;
  };

  std::array<Record, kMaxDepth> record_;
  unsigned depth_ = 0;
};

// Returns the position of the first item whose key is greater or equal than key.
// if all items are smaller than key, returns num_items_.
template <typename T>
template <typename Comp>
auto BPTreeNode<T>::BSearch(Comp&& cmp_op) const -> SearchResult {
  uint16_t lo = 0;
  uint16_t hi = num_items_;
  assert(hi > 0);

  // optimization: check the last item first.
  int cmp_res = cmp_op(Key(hi - 1));
  if (cmp_res >= 0) {
    return cmp_res > 0 ? SearchResult{.index = hi, .found = false}
                       : SearchResult{.index = uint16_t(hi - 1), .found = true};
  }

  // key < Key(hi - 1)

  --hi;
  while (lo < hi) {
    uint16_t mid = (lo + hi) >> 1;
    assert(mid < hi);

    KeyT item = Key(mid);

    int cmp_res = cmp_op(item);
    if (cmp_res == 0) {
      return SearchResult{.index = mid, .found = true};
    }

    if (cmp_res < 0) {
      hi = mid;
    } else {
      lo = mid + 1;  // we never return indices upto mid because they are strictly less than key.
    }
  }
  assert(lo == hi);

  return {.index = hi, .found = false};
}

template <typename T> void BPTreeNode<T>::ShiftRight(unsigned index) {
  unsigned num_items_to_shift = num_items_ - index;
  if (num_items_to_shift > 0) {
    uint8_t* ptr = Layout::KeyPtr(index, this);
    memmove(ptr + Layout::kKeySize, ptr, num_items_to_shift * Layout::kKeySize);

    if (!IsLeaf()) {
      uint8_t* src = Layout::ChildrenStart(this) + index * sizeof(BPTreeNode*);
      uint8_t* dest = src + sizeof(BPTreeNode*);
      memmove(dest, src, (num_items_to_shift + 1) * sizeof(BPTreeNode*));
    }
  }
  num_items_++;
}

template <typename T> void BPTreeNode<T>::ShiftLeft(unsigned index, bool child_step_right) {
  assert(index < num_items_);

  unsigned num_items_to_shift = num_items_ - index - 1;
  if (num_items_to_shift > 0) {
    memmove(Layout::KeyPtr(index, this), Layout::KeyPtr(index + 1, this),
            num_items_to_shift * Layout::kKeySize);
    if (!leaf_) {
      index += unsigned(child_step_right);
      num_items_to_shift = num_items_ - index;
      if (num_items_to_shift > 0) {
        uint8_t* dest = Layout::ChildrenStart(this) + index * sizeof(BPTreeNode*);
        uint8_t* src = dest + sizeof(BPTreeNode*);
        memmove(dest, src, num_items_to_shift * sizeof(BPTreeNode*));
      }
    }
  }
  num_items_--;
}

/***
 *  Rebalances the (full) child at position pos with its sibling. `this` node is an inner node.
 *  It first tried to rebalance (move items) from the full child to its left sibling. If the left
 *  sibling does not have enough space, it tries to rebalance to the right sibling. The caller
 *  passes the original position of the item it tried to insert into the full child. In case the
 *  rebalance succeeds the function returns the new node and the position to insert into. Otherwise,
 *  it returns result.first == nullptr.
 */
template <typename T>
std::pair<BPTreeNode<T>*, unsigned> BPTreeNode<T>::RebalanceChild(unsigned pos,
                                                                  unsigned insert_pos) {
  unsigned to_move = 0;
  BPTreeNode* node = Child(pos);

  if (pos > 0) {
    BPTreeNode* left = Child(pos - 1);
    unsigned dest_free = left->AvailableSlotCount();
    if (dest_free > 0) {
      // We bias rebalancing based on the position being inserted. If we're
      // inserting at the end of the right node then we bias rebalancing to
      // fill up the left node.
      if (insert_pos == node->NumItems()) {
        to_move = dest_free;
        assert(to_move < node->NumItems());
      } else if (dest_free > 1) {
        // we move less than left free capacity which leaves as some space in the node.
        to_move = dest_free / 2;
      }

      if (to_move) {
        unsigned dest_old_count = left->NumItems();
        RebalanceChildToLeft(pos, to_move);
        assert(node->AvailableSlotCount() == to_move);
        if (insert_pos < to_move) {
          assert(left->AvailableSlotCount() > 0u);       // we did not fill up the left node.
          insert_pos = dest_old_count + insert_pos + 1;  // +1 because we moved the separator.
          node = left;
        } else {
          insert_pos -= to_move;
        }

        return {node, insert_pos};
      }
    }
  }

  if (pos < NumItems()) {
    BPTreeNode* right = Child(pos + 1);
    unsigned dest_free = right->AvailableSlotCount();
    if (dest_free > 0) {
      if (insert_pos == 0) {
        to_move = dest_free;
        assert(to_move < node->NumItems());
      } else if (dest_free > 1) {
        to_move = dest_free / 2;
      }

      if (to_move) {
        RebalanceChildToRight(pos, to_move);
        if (insert_pos > node->NumItems()) {
          insert_pos -= (node->NumItems() + 1);
          node = right;
        }
        return {node, insert_pos};
      }
    }
  }
  return {nullptr, 0};
}

template <typename T> void BPTreeNode<T>::RebalanceChildToLeft(unsigned child_pos, unsigned count) {
  assert(child_pos > 0u);
  BPTreeNode* src = Child(child_pos);
  BPTreeNode* dest = Child(child_pos - 1);
  assert(src->NumItems() >= count);
  assert(count >= 1u);
  assert(dest->AvailableSlotCount() >= count);

  unsigned dest_items = dest->NumItems();

  // Move the delimiting value to the left node.
  dest->SetKey(dest_items, Key(child_pos - 1));

  // Copy src keys [0, count-1] to dest keys [dest_items+1, dest_items+count].
  for (unsigned i = 1; i < count; ++i) {
    dest->SetKey(dest_items + i, src->Key(i - 1));
  }

  SetKey(child_pos - 1, src->Key(count - 1));

  // Shift the values in the right node to their correct position.
  for (unsigned i = count; i < src->NumItems(); ++i) {
    src->SetKey(i - count, src->Key(i));
  }

  if (!src->IsLeaf()) {
    // Move the child pointers from the right to the left node.
    uint32_t src_move_count = 0;
    for (unsigned i = 0; i < count; ++i) {
      src_move_count += src->GetChildTreeCount(i);
      dest->SetChild(1 + dest->NumItems() + i, src->Child(i));
    }

    uint32_t dest_tree_count = GetChildTreeCount(child_pos - 1);
    uint32_t src_tree_count = GetChildTreeCount(child_pos);
    SetChildTreeCount(child_pos - 1, dest_tree_count + src_move_count + count);
    SetChildTreeCount(child_pos, src_tree_count - src_move_count - count);

    for (unsigned i = count; i <= src->NumItems(); ++i) {
      src->SetChild(i - count, src->Child(i));
      src->SetChild(i, NULL);
    }
  }

  // Fixup the counts on the src and dest nodes.
  dest->num_items_ += count;
  src->num_items_ -= count;
}

template <typename T>
void BPTreeNode<T>::RebalanceChildToRight(unsigned child_pos, unsigned count) {
  assert(child_pos < NumItems());
  BPTreeNode* src = Child(child_pos);
  BPTreeNode* dest = Child(child_pos + 1);

  assert(src->NumItems() >= count);
  assert(count >= 1u);
  assert(dest->AvailableSlotCount() >= count);

  unsigned dest_items = dest->NumItems();

  assert(dest_items > 0u);

  // Shift the values in the right node to their correct position.
  for (int i = dest_items - 1; i >= 0; --i) {
    dest->SetKey(i + count, dest->Key(i));
  }

  // Move the delimiting value to the left node and the new delimiting value
  // from the right node.
  KeyT new_delim = src->Key(src->NumItems() - count);
  for (unsigned i = 1; i < count; ++i) {
    unsigned src_id = src->NumItems() - count + i;
    dest->SetKey(i - 1, src->Key(src_id));
  }
  // Move parent's delimiter to destination and update it with new delimiter.
  dest->SetKey(count - 1, Key(child_pos));
  SetKey(child_pos, new_delim);

  if (!src->IsLeaf()) {
    // Shift child pointers in the right node to their correct position.
    for (int i = dest_items; i >= 0; --i) {
      dest->SetChild(i + count, dest->Child(i));
    }

    // Move child pointers from the left node to the right.
    uint32_t src_move_count = 0;
    for (unsigned i = 0; i < count; ++i) {
      unsigned src_id = src->NumItems() - (count - 1) + i;
      src_move_count += src->Child(src_id)->TreeCount();
      dest->SetChild(i, src->Child(src_id));
      src->SetChild(src_id, NULL);
    }

    uint32_t dest_tree_count = GetChildTreeCount(child_pos + 1);
    uint32_t src_tree_count = GetChildTreeCount(child_pos);
    SetChildTreeCount(child_pos + 1, dest_tree_count + src_move_count + count);
    SetChildTreeCount(child_pos, src_tree_count - src_move_count - count);
  }

  // Fixup the counts on the src and dest nodes.
  dest->num_items_ += count;
  src->num_items_ -= count;
}

template <typename T> BPTreeNode<T>* BPTreeNode<T>::MergeOrRebalanceChild(unsigned pos) {
  BPTreeNode* node = Child(pos);
  BPTreeNode* left = nullptr;

  assert(NumItems() >= 1u);
  assert(node->NumItems() < node->MinItems());

  if (pos > 0) {
    left = Child(pos - 1);
    if (left->NumItems() + 1 + node->NumItems() <= left->MaxItems()) {
      left->MergeFromRight(Key(pos - 1), node);
      ShiftLeft(pos - 1, true);
      return node;
    }
  }

  if (pos < NumItems()) {
    BPTreeNode* right = Child(pos + 1);
    if (node->NumItems() + 1 + right->NumItems() <= right->MaxItems()) {
      node->MergeFromRight(Key(pos), right);
      ShiftLeft(pos, true);
      return right;
    }

    // Try rebalancing with our right sibling.
    // TODO: don't perform rebalancing if
    // we deleted the first element from node and the node is not
    // empty. This is a small optimization for the common pattern of deleting
    // from the front of the tree.
    if (true) {
      unsigned to_move = (right->NumItems() - node->NumItems()) / 2;
      assert(to_move < right->NumItems());

      RebalanceChildToLeft(pos + 1, to_move);
      return nullptr;
    }
  }

  assert(left);

  if (left) {
    // Try rebalancing with our left sibling.
    // TODO: don't perform rebalancing if we deleted the last element from node and the
    // node is not empty. This is a small optimization for the common pattern of deleting
    // from the back of the tree.
    if (true) {
      unsigned to_move = (left->NumItems() - node->NumItems()) / 2;
      assert(to_move < left->NumItems());
      RebalanceChildToRight(pos - 1, to_move);
      return nullptr;
    }
  }
  return nullptr;
}

// splits the node into two nodes. The left node is the current node and the right node is
// is filled with the right half of the items. The median key is returned in *median.
template <typename T> void BPTreeNode<T>::Split(BPTreeNode<T>* right, T* median) {
  unsigned mid = num_items_ / 2;
  *median = Key(mid);
  right->leaf_ = leaf_;
  right->num_items_ = num_items_ - (mid + 1);
  memmove(Layout::KeyPtr(0, right), Layout::KeyPtr(mid + 1, this),
          right->num_items_ * Layout::kKeySize);
  if (!IsLeaf()) {
    uint32_t right_subtree_count = right->num_items_;
    for (size_t i = 0; i <= right->num_items_; i++) {
      BPTreeNode* child = Child(mid + 1 + i);
      right_subtree_count += child->TreeCount();
      right->SetChild(i, child);
    }
    right->SetTreeCount(right_subtree_count);
    IncreaseTreeCount(-(right_subtree_count + 1));
  }
  num_items_ = mid;
}

template <typename T> void BPTreeNode<T>::MergeFromRight(KeyT key, BPTreeNode<T>* right) {
  assert(NumItems() + 1 + right->NumItems() <= MaxItems());

  unsigned dest_items = NumItems();
  SetKey(dest_items, key);
  for (unsigned i = 0; i < right->NumItems(); ++i) {
    SetKey(dest_items + 1 + i, right->Key(i));
  }

  if (!IsLeaf()) {
    for (unsigned i = 0; i <= right->NumItems(); ++i) {
      SetChild(dest_items + 1 + i, right->Child(i));
    }
    IncreaseTreeCount(right->TreeCount() + 1);
  }
  num_items_ += 1 + right->NumItems();
  right->num_items_ = 0;
}

template <typename T> uint32_t BPTreePath<T>::Rank() const {
  uint32_t rank = 0;
  unsigned bound = Depth();

  for (unsigned i = 0; i < bound; ++i) {
    auto* node = Node(i);
    unsigned pos = Position(i);
    if (!node->IsLeaf()) {
      unsigned delta = (i == bound - 1) ? 1 : 0;
      for (unsigned j = 0; j < pos + delta; ++j) {
        rank += node->Child(j)->TreeCount();
      }
    }
    rank += pos;
  }

  return rank;
}

template <typename T> bool BPTreePath<T>::Next() {
  assert(depth_ > 0);
  BPTreeNode<T>* node = Last().first;

  // The data in BPTree is stored in both the leaf nodes and the inner nodes.
  if (node->IsLeaf()) {
    ++record_[depth_ - 1].pos;
    if (record_[depth_ - 1].pos < node->NumItems()) {
      return true;
    }

    // Advance to the next item, which is Key(i) in some ascendent of the subtree with
    // root Child(i). i in that case must be less than NumItems().
    // Note, that subtree Child(i) in a inner node is located before Key(i).
    do {
      Pop();
    } while (depth_ > 0 && Position(depth_ - 1) == Node(depth_ - 1)->NumItems());

    // we either point now on separator Key(i) in the parent node or we finished the tree.
    return depth_ > 0;
  }

  // We are in the inner node after the ascent from the leaf node. We need to advance to the next
  // Child and dig left.
  assert(!node->IsLeaf());
  assert(record_[depth_ - 1].pos < node->NumItems());

  // we are in the inner node pointing to the separator.
  // now we need to advance to the next child and dig to the leftmost leaf.
  record_[depth_ - 1].pos++;
  do {
    node = node->Child(record_[depth_ - 1].pos);
    Push(node, 0);
  } while (!node->IsLeaf());

  return true;
}

template <typename T> bool BPTreePath<T>::Prev() {
  assert(depth_ > 0);

  auto* node = record_[depth_ - 1].node;
  if (node->IsLeaf()) {
    /*
        node
        / \
       l   r

       We must go left (decrement pos), and if there is no left, we must go up until we can
       go left.
    */
    while (record_[depth_ - 1].pos == 0) {
      Pop();
      if (depth_ == 0) {
        return false;
      }
    }
    assert(depth_ > 0 && record_[depth_ - 1].pos > 0);

    // we finished backtracking from child(i+1) or stayed in the leaf.
    // either way stop at the next key on the left.
    --record_[depth_ - 1].pos;
    return true;
  }

  DigRight();
  return true;
}

template <typename T> void BPTreePath<T>::DigRight() {
  assert(depth_ > 0);
  BPTreeNode<T>* node = Last().first;

  assert(!node->IsLeaf());

  // we are in the inner node pointing to the separator.
  // we now must explore the left subtree which is located under the same index as the separator.
  // we go far-right in the left subtree.
  do {
    node = node->Child(record_[depth_ - 1].pos);
    Push(node, node->NumItems());
  } while (!node->IsLeaf());

  // we reached the leaf node, fix the position to point to the last key.
  assert(record_[depth_ - 1].node->IsLeaf());
  --record_[depth_ - 1].pos;
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/detail/gen_utils.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/random/random.h>
#include <absl/strings/str_cat.h>

#include <string>

namespace dfly {

inline std::string GetRandomHex(absl::InsecureBitGen& gen, size_t len, size_t len_deviation = 0) {
  static_assert(std::is_same<uint64_t, decltype(gen())>::value);
  if (len_deviation) {
    len += (gen() % len_deviation);
  }

  std::string res(len, '\0');
  size_t indx = 0;

  for (size_t i = 0; i < len / 16; ++i) {  // 2 chars per byte
    absl::numbers_internal::FastHexToBufferZeroPad16(gen(), res.data() + indx);
    indx += 16;
  }

  if (indx < res.size()) {
    char buf[32];
    absl::numbers_internal::FastHexToBufferZeroPad16(gen(), buf);

    for (unsigned j = 0; indx < res.size(); indx++, j++) {
      res[indx] = buf[j];
    }
  }

  return res;
}

}  // namespace dfly


================================================
FILE: src/core/detail/listpack.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/detail/listpack.h"

#include "base/logging.h"

namespace dfly {
namespace detail {

using namespace std;

QList::Entry ListPack::GetEntry(uint8_t* pos) {
  unsigned int slen;
  long long lval;
  uint8_t* vstr = lpGetValue(pos, &slen, &lval);
  return vstr ? QList::Entry(reinterpret_cast<char*>(vstr), slen) : QList::Entry(lval);
}

string ListPack::Pop(QList::Where where) {
  uint8_t* pos = GetFirst(where);
  DCHECK(pos);

  string res = GetEntry(pos).to_string();
  lp_ = lpDelete(lp_, pos, nullptr);
  return res;
}

void ListPack::Push(string_view value, QList::Where where) {
  if (where == QList::HEAD) {
    lp_ = lpPrepend(lp_, (unsigned char*)value.data(), value.size());
  } else {
    lp_ = lpAppend(lp_, (unsigned char*)value.data(), value.size());
  }
}

string ListPack::First(QList::Where where) const {
  uint8_t* pos = GetFirst(where);
  DCHECK(pos);

  return GetEntry(pos).to_string();
}

std::optional<string> ListPack::At(long index) const {
  uint8_t* pos = lpSeek(lp_, index);
  if (!pos)
    return nullopt;

  return GetEntry(pos).to_string();
}

vector<uint32_t> ListPack::Pos(string_view element, uint32_t rank, uint32_t count, uint32_t max_len,
                               QList::Where where) const {
  DCHECK_GT(rank, 0u);
  vector<uint32_t> matches;

  uint8_t* p = GetFirst(where);
  unsigned index = 0;
  while (p && (max_len == 0 || index < max_len)) {
    if (GetEntry(p) == element) {
      if (rank == 1) {
        size_t sz = lpLength(lp_);
        auto k = (where == QList::HEAD) ? index : sz - index - 1;
        matches.push_back(k);
        if (count && matches.size() >= count)
          break;
      } else {
        rank--;
      }
    }
    index++;
    p = (where == QList::HEAD) ? lpNext(lp_, p) : lpPrev(lp_, p);
  }
  return matches;
}

uint8_t* ListPack::Find(std::string_view elem) const {
  uint8_t* p = lpFirst(lp_);
  while (p) {
    if (GetEntry(p) == elem) {
      return p;
    }
    p = lpNext(lp_, p);
  }
  return nullptr;
}

unsigned ListPack::Remove(const CollectionEntry& elem, unsigned count, QList::Where where) {
  unsigned removed = 0;

  auto is_match = [&](const QList::Entry& entry) {
    return elem.is_int() ? entry.is_int() && entry.ival() == elem.ival() : entry == elem.view();
  };

  uint8_t* p = GetFirst(where);

  while (p) {
    if (is_match(GetEntry(p))) {
      // lpDelete returns pointer to the element AFTER the deleted one (toward tail)
      lp_ = lpDelete(lp_, p, &p);

      if (where == QList::TAIL) {
        // Iterating backward (from TAIL): need to get the previous element
        if (p) {
          p = lpPrev(lp_, p);
        } else {
          // Deleted the tail element, lpDelete returned nullptr (no element after tail).
          // We need to continue from the new tail to keep moving towards HEAD.
          p = lpLast(lp_);
        }
      }
      // For HEAD direction, 'p' already points to the next element to check

      removed++;
      if (count && removed == count)
        break;
      continue;
    }

    p = (where == QList::HEAD) ? lpNext(lp_, p) : lpPrev(lp_, p);
  }

  return removed;
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/detail/listpack.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>
#include <string>
#include <string_view>

#include "core/qlist.h"

extern "C" {
#include "redis/listpack.h"
}

namespace dfly {
namespace detail {

// A listpack wrapper that provides basic list operations.
// Unfortunately, we already have a listpack wrapper in core/detail/listpack_wrap.h but
// it's more map oriented and doesn't provide the basic list operations we need here.
// TODO: to unify both wrappers into one.
class ListPack {
 public:
  explicit ListPack(uint8_t* lp = nullptr) : lp_(lp) {
  }

  size_t Size() const {
    return lpLength(lp_);
  }

  // Removes and returns an element from the specified end (HEAD or TAIL).
  std::string Pop(QList::Where where);

  // Adds an element to the specified end (HEAD or TAIL).
  void Push(std::string_view value, QList::Where where);

  // Returns the first element from the specified end without removing it.
  std::string First(QList::Where where) const;

  // Returns the element at the specified index, or std::nullopt if out of bounds.
  std::optional<std::string> At(long index) const;

  // Finds positions of an element matching the given criteria.
  std::vector<uint32_t> Pos(std::string_view element, uint32_t rank, uint32_t count,
                            uint32_t max_len, QList::Where where) const;

  uint8_t* Find(std::string_view elem) const;

  uint8_t* Seek(long index) const {
    return lpSeek(lp_, index);
  }

  // Inserts an element before or after the specified pivot element.
  void Insert(uint8_t* pivot, std::string_view elem, QList::InsertOpt insert_opt) {
    int where = (insert_opt == QList::BEFORE) ? LP_BEFORE : LP_AFTER;
    lp_ = lpInsertString(lp_, (unsigned char*)elem.data(), elem.size(), pivot, where, nullptr);
  }

  // Removes up to count occurrences of elem from the specified direction.
  unsigned Remove(const CollectionEntry& elem, unsigned count, QList::Where where);

  // Replaces the element at the specified index with a new value.
  void Replace(uint8_t* pos, std::string_view elem) {
    lp_ = lpReplace(lp_, &pos, (unsigned char*)elem.data(), elem.size());
  }

  // Removes count elements starting from the specified index.
  void Erase(long start, long count) {
    lp_ = lpDeleteRange(lp_, start, count);
  }

  // Returns the raw listpack pointer.
  uint8_t* GetPointer() const {
    return lp_;
  }

  size_t BytesSize() const {
    return lpBytes(lp_);
  }

 private:
  static CollectionEntry GetEntry(uint8_t* pos);

  uint8_t* GetFirst(QList::Where where) const {
    return (where == QList::HEAD) ? lpFirst(lp_) : lpLast(lp_);
  }

  uint8_t* lp_;
};

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/detail/listpack_wrap.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/detail/listpack_wrap.h"

#include "base/logging.h"

extern "C" {
#include "redis/listpack.h"
}

namespace dfly::detail {

ListpackWrap::Iterator::Iterator(uint8_t* lp, uint8_t* ptr, IntBuf& intbuf)
    : lp_{lp}, ptr_{ptr}, next_ptr_{nullptr}, intbuf_(intbuf) {
  static_assert(sizeof(intbuf_[0]) >= LP_INTBUF_SIZE);  // to avoid header dependency
  Read();
}

ListpackWrap::Iterator& ListpackWrap::Iterator::operator++() {
  ptr_ = next_ptr_;
  Read();
  return *this;
}

void ListpackWrap::Iterator::Read() {
  if (!ptr_)
    return;

  key_v_ = GetView(ptr_, intbuf_[0]);
  next_ptr_ = lpNext(lp_, ptr_);
  value_v_ = GetView(next_ptr_, intbuf_[1]);
  next_ptr_ = lpNext(lp_, next_ptr_);
}

ListpackWrap::~ListpackWrap() {
  DCHECK(!dirty_);
}

ListpackWrap ListpackWrap::WithCapacity(size_t capacity) {
  return ListpackWrap{lpNew(capacity)};
}

uint8_t* ListpackWrap::GetPointer() {
  dirty_ = false;
  return lp_;
}

ListpackWrap::Iterator ListpackWrap::Find(std::string_view key) const {
  if (size() == 0)
    return end();

  uint8_t* ptr = lpFind(lp_, lpFirst(lp_), (unsigned char*)key.data(), key.size(), 1);
  return Iterator{lp_, ptr, intbuf_};
}

bool ListpackWrap::Delete(std::string_view key) {
  if (size() == 0)
    return false;

  uint8_t* ptr = lpFind(lp_, lpFirst(lp_), (unsigned char*)key.data(), key.size(), 1);
  if (ptr == nullptr)
    return false;

  lp_ = lpDeleteRangeWithEntry(lp_, &ptr, 2);
  dirty_ = true;
  return true;
}

bool ListpackWrap::Insert(std::string_view key, std::string_view value, bool skip_exists) {
  uint8_t* vptr;
  uint8_t* fptr = lpFirst(lp_);
  uint8_t* fsrc = key.empty() ? lp_ : (uint8_t*)key.data();
  // if we vsrc is NULL then lpReplace will delete the element, which is not what we want.
  // therefore, for an empty val we set it to some other valid address so that lpReplace
  // will do the right thing and encode empty string instead of deleting the element.
  uint8_t* vsrc = value.empty() ? lp_ : (uint8_t*)value.data();

  bool updated = false;
  if (fptr) {
    fptr = lpFind(lp_, fptr, fsrc, key.size(), 1);
    if (fptr) {
      if (skip_exists)
        return false;

      // Grab pointer to the value (fptr points to the field)
      vptr = lpNext(lp_, fptr);

      // Replace value
      lp_ = lpReplace(lp_, &vptr, vsrc, value.size());
      DCHECK_EQ(0u, lpLength(lp_) % 2);

      dirty_ = true;
      updated = true;
    }
  }

  if (!updated) {
    // Push new field/value pair onto the tail of the listpack.
    // TODO: we should at least allocate once for both elements
    lp_ = lpAppend(lp_, fsrc, key.size());
    lp_ = lpAppend(lp_, vsrc, value.size());
    dirty_ = true;
  }

  return !updated;
}

size_t ListpackWrap::size() const {
  return lpLength(lp_) / 2;
}

ListpackWrap::Iterator ListpackWrap::begin() const {
  return Iterator{lp_, lpFirst(lp_), intbuf_};
}

ListpackWrap::Iterator ListpackWrap::end() const {
  return Iterator{lp_, nullptr, intbuf_};
}

size_t ListpackWrap::UsedBytes() const {
  return lpBytes(lp_);
}

std::string_view ListpackWrap::GetView(uint8_t* lp_it, uint8_t int_buf[]) {
  int64_t ele_len = 0;
  uint8_t* elem = lpGet(lp_it, &ele_len, int_buf);
  DCHECK(elem);
  return std::string_view{reinterpret_cast<char*>(elem), size_t(ele_len)};
}

bool ListpackWrap::Iterator::operator==(const Iterator& other) const {
  return lp_ == other.lp_ && ptr_ == other.ptr_;
}
}  // namespace dfly::detail


================================================
FILE: src/core/detail/listpack_wrap.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <cstdint>
#include <cstdio>
#include <string_view>

namespace dfly::detail {

// Wrapper around map data structure based on listpack
struct ListpackWrap {
 private:
  using IntBuf = uint8_t[2][24];

 public:
  ~ListpackWrap();

  struct Iterator {
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = std::pair<std::string_view, std::string_view>;
    using reference = value_type;
    using pointer = value_type*;

    Iterator(uint8_t* lp, uint8_t* ptr, IntBuf& intbuf);
    Iterator& operator++();

    value_type operator*() const {
      return {key_v_, value_v_};
    }

    bool operator==(const Iterator& other) const;

    bool operator!=(const Iterator& other) const {
      return !(operator==(other));
    }

   private:
    void Read();  // Read next entry at ptr and determine next_ptr

    uint8_t *lp_ = nullptr, *ptr_ = nullptr, *next_ptr_ = nullptr;
    std::string_view key_v_, value_v_;
    IntBuf& intbuf_;
  };

  explicit ListpackWrap(uint8_t* lp) : lp_{lp} {
  }

  // Create listpack with capacity
  static ListpackWrap WithCapacity(size_t capacity);

  uint8_t* GetPointer();                      // Get new updated pointer
  Iterator Find(std::string_view key) const;  // Linear search
  bool Delete(std::string_view key);
  bool Insert(std::string_view key, std::string_view value, bool skip_exists);

  Iterator begin() const;
  Iterator end() const;
  size_t size() const;  // number of entries
  size_t UsedBytes() const;

  // Get view from raw listpack iterator
  static std::string_view GetView(uint8_t* lp_it, uint8_t int_buf[]);

 private:
  uint8_t* lp_;            // the listpack itself
  mutable IntBuf intbuf_;  // buffer for integers decoded to strings
  bool dirty_ = false;     // whether lp_ was updated, but never retrieved with GetPointer
};

}  // namespace dfly::detail


================================================
FILE: src/core/detail/stateless_allocator.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#pragma once

#include <cassert>

#include "base/pmr/memory_resource.h"

namespace dfly {

namespace detail {
inline thread_local PMR_NS::memory_resource* tl_mr = nullptr;
}  // namespace detail

template <typename T, typename Impl> class StatelessAllocatorBase {
 public:
  using value_type = T;
  using size_type = std::size_t;
  using difference_type = std::ptrdiff_t;
  using is_always_equal = std::true_type;

  template <typename U, typename... _Args> void construct(U* __p, _Args&&... __args) {
    ::new (static_cast<void*>(__p)) U(std::forward<_Args>(__args)...);
  }

  static value_type* allocate(size_type n) {
    static_assert(
        std::is_empty_v<Impl>,
        "StatelessAllocator must not contain state, so it can use empty base optimization");

    void* ptr = Impl::resource()->allocate(n * sizeof(value_type), alignof(value_type));
    return static_cast<value_type*>(ptr);
  }

  static void deallocate(value_type* ptr, size_type n) noexcept {
    Impl::resource()->deallocate(ptr, n * sizeof(value_type), alignof(value_type));
  }
};

template <typename T>
class StatelessAllocator : public StatelessAllocatorBase<T, StatelessAllocator<T>> {
 public:
  StatelessAllocator() noexcept {
    assert(detail::tl_mr != nullptr);
  }

  template <typename U> StatelessAllocator(const StatelessAllocator<U>&) noexcept {  // NOLINT
  }

  static PMR_NS::memory_resource* resource() {
    return detail::tl_mr;
  }
};

template <typename T, typename U>
bool operator==(const StatelessAllocator<T>&, const StatelessAllocator<U>&) noexcept {
  return true;
}

template <typename T, typename U>
bool operator!=(const StatelessAllocator<T>&, const StatelessAllocator<U>&) noexcept {
  return false;
}

inline void InitTLStatelessAllocMR(PMR_NS::memory_resource* mr) {
  detail::tl_mr = mr;
}

inline void CleanupStatelessAllocMR() {
  detail::tl_mr = nullptr;
}

}  // namespace dfly


================================================
FILE: src/core/dfly_core_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/charconv.h>
#include <absl/strings/numbers.h>
#include <fast_float/fast_float.h>

#ifdef USE_PCRE2
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#endif

#ifdef USE_RE2
#include <re2/re2.h>
#endif

#include <reflex/matcher.h>

#include <random>
#include <regex>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/glob_matcher.h"
#include "core/huff_coder.h"
#include "core/intent_lock.h"
#include "core/tx_queue.h"

namespace dfly {

using namespace std;

std::random_device rd;

static string GetRandomHex(size_t len) {
  std::string res(len, '\0');
  size_t indx = 0;

  for (; indx < len; indx += 16) {  // 2 chars per byte
    absl::numbers_internal::FastHexToBufferZeroPad16(rd(), res.data() + indx);
  }

  if (indx < len) {
    char buf[24];
    absl::numbers_internal::FastHexToBufferZeroPad16(rd(), buf);

    for (unsigned j = 0; indx < len; indx++, j++) {
      res[indx] = buf[j];
    }
  }

  return res;
}

extern int stringmatchlen(const char* pattern, int patternLen, const char* string, int stringLen,
                          int nocase);

class TxQueueTest : public ::testing::Test {
 protected:
  TxQueueTest() {
  }

  uint64_t Pop() {
    if (pq_.Empty())
      return uint64_t(-1);
    TxQueue::ValueType val = pq_.Front();
    pq_.PopFront();

    return std::get<uint64_t>(val);
  }

  TxQueue pq_;
};

TEST_F(TxQueueTest, Basic) {
  pq_.Insert(4);
  pq_.Insert(3);
  pq_.Insert(2);

  unsigned cnt = 0;
  auto head = pq_.Head();
  auto it = head;
  do {
    ++cnt;
    it = pq_.Next(it);
  } while (it != head);
  EXPECT_EQ(3, cnt);

  ASSERT_EQ(2, Pop());
  ASSERT_EQ(3, Pop());
  ASSERT_EQ(4, Pop());
  ASSERT_TRUE(pq_.Empty());

  EXPECT_EQ(TxQueue::kEnd, pq_.Head());

  pq_.Insert(10);
  ASSERT_EQ(10, Pop());
}

class IntentLockTest : public ::testing::Test {
 protected:
  IntentLock lk_;
};

TEST_F(IntentLockTest, Basic) {
  ASSERT_TRUE(lk_.Acquire(IntentLock::SHARED));
  ASSERT_FALSE(lk_.Acquire(IntentLock::EXCLUSIVE));
  lk_.Release(IntentLock::EXCLUSIVE);

  ASSERT_FALSE(lk_.Check(IntentLock::EXCLUSIVE));
  lk_.Release(IntentLock::SHARED);
  ASSERT_TRUE(lk_.Check(IntentLock::EXCLUSIVE));
}

class StringMatchTest : public ::testing::Test {
 protected:
  // wrapper around stringmatchlen with stringview arguments
  bool MatchLen(string_view pattern, string_view str, bool nocase) {
    GlobMatcher matcher(pattern, !nocase);
    return matcher.Matches(str);
  }
};

TEST_F(StringMatchTest, Glob2Regex) {
  EXPECT_EQ(GlobMatcher::Glob2Regex(""), "");
  EXPECT_EQ(GlobMatcher::Glob2Regex("*"), ".*");
  EXPECT_EQ(GlobMatcher::Glob2Regex("\\*"), "\\*");
  EXPECT_EQ(GlobMatcher::Glob2Regex("\\?"), "\\?");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[abc]"), "[abc]");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[^abc]"), "[^abc]");
  EXPECT_EQ(GlobMatcher::Glob2Regex("h\\[^|"), "h\\[\\^\\|");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[$?^]a"), "[$?^]a");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[^]a"), ".a");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[]a"), "[]a");
  EXPECT_EQ(GlobMatcher::Glob2Regex("\\d"), "d");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[\\d]"), "[\\\\d]");
  EXPECT_EQ(GlobMatcher::Glob2Regex("abc\\"), "abc\\\\");
  EXPECT_EQ(GlobMatcher::Glob2Regex("[\\]]"), "[\\]]");
  reflex::Matcher matcher("abc[\\\\d]e");
  matcher.input("abcde");
  ASSERT_TRUE(matcher.find());
}

TEST_F(StringMatchTest, Basic) {
  EXPECT_EQ(MatchLen("", "", 0), 1);

  EXPECT_EQ(MatchLen("*", "", 0), 0);
  EXPECT_EQ(MatchLen("*", "", 1), 0);
  EXPECT_EQ(MatchLen("\\\\", "\\", 0), 1);
  EXPECT_EQ(MatchLen("h\\\\llo", "h\\llo", 0), 1);
  EXPECT_EQ(MatchLen("a\\bc", "ABC", 1), 1);

  // ExactMatch
  EXPECT_EQ(MatchLen("hello", "hello", 0), 1);
  EXPECT_EQ(MatchLen("hello", "world", 0), 0);

  // Wildcards
  EXPECT_EQ(MatchLen("*", "hello", 0), 1);
  EXPECT_EQ(MatchLen("*", "1234567890123456", 0), 1);
  EXPECT_EQ(MatchLen("h*", "hello", 0), 1);
  EXPECT_EQ(MatchLen("h*", "abc", 0), 0);
  EXPECT_EQ(MatchLen("h*o", "hello", 0), 1);
  EXPECT_EQ(MatchLen("hel*o*", "hello*", 0), 1);
  EXPECT_EQ(MatchLen("h\\*llo", "h*llo", 0), 1);

  // Single character wildcard
  EXPECT_EQ(MatchLen("h[aeiou]llo", "hello", 0), 1);
  EXPECT_EQ(MatchLen("h[aeiou]llo", "hallo", 0), 1);
  EXPECT_EQ(MatchLen("h[^aeiou]llo", "hallo", 0), 0);
  EXPECT_EQ(MatchLen("h[a-z]llo", "hello", 0), 1);
  EXPECT_EQ(MatchLen("h[A-Z]llo", "HeLLO", 1), 1);
  EXPECT_EQ(MatchLen("[[]", "[", 0), 1);
  EXPECT_EQ(MatchLen("[^]a", "xa", 0), 1);

  // ?
  EXPECT_EQ(MatchLen("h?llo", "hello", 0), 1);
  EXPECT_EQ(MatchLen("h??llo", "ha llo", 0), 1);
  EXPECT_EQ(MatchLen("h??llo", "hallo", 0), 0);
  EXPECT_EQ(MatchLen("h\\?llo", "hallo", 0), 0);
  EXPECT_EQ(MatchLen("h\\?llo", "h?llo", 0), 1);
  EXPECT_EQ(MatchLen("abc?", "abc\n", 0), 1);
}

#define TEST_STRINGMATCH(pattern, str, case_res, nocase_res) \
  {                                                          \
    EXPECT_EQ(int(MatchLen(pattern, str, 0)), case_res);     \
    EXPECT_EQ(int(MatchLen(pattern, str, 1)), nocase_res);   \
  }

TEST_F(StringMatchTest, Special) {
  EXPECT_TRUE(MatchLen("h\\[^|", "h[^|", 0));
  EXPECT_FALSE(MatchLen("[^", "[^", 0));
  EXPECT_TRUE(MatchLen("[$?^]a", "?a", 0));
  EXPECT_TRUE(MatchLen("abc[\\d]e", "abcde", 0));
  EXPECT_TRUE(MatchLen("foo\\", "foo\\", 0));

  /* Case sensitivity: */
  TEST_STRINGMATCH("a", "a", 1, 1);
  TEST_STRINGMATCH("a", "A", 0, 1);
  TEST_STRINGMATCH("A", "A", 1, 1);
  TEST_STRINGMATCH("A", "a", 0, 1);
  TEST_STRINGMATCH("\\a", "a", 1, 1);
  TEST_STRINGMATCH("\\a", "A", 0, 1);
  TEST_STRINGMATCH("\\A", "A", 1, 1);
  TEST_STRINGMATCH("\\A", "a", 0, 1);
  TEST_STRINGMATCH("[\\a]", "a", 1, 1);

  // TODO: to fix this: TEST_STRINGMATCH("[\\a]", "A", 0, 1);
  TEST_STRINGMATCH("[\\A]", "A", 1, 1);
  // TODO: to fix this: TEST_STRINGMATCH("[\\A]", "a", 0, 1);

  /* Escaped metacharacters: */
  TEST_STRINGMATCH("\\*", "*", 1, 1);
  TEST_STRINGMATCH("\\?", "?", 1, 1);
  TEST_STRINGMATCH("\\\\", "\\", 1, 1);
  TEST_STRINGMATCH("\\[", "[", 1, 1);
  TEST_STRINGMATCH("\\]", "]", 1, 1);
  TEST_STRINGMATCH("\\^", "^", 1, 1);
  TEST_STRINGMATCH("\\-", "-", 1, 1);
  TEST_STRINGMATCH("[\\*]", "*", 1, 1);
  TEST_STRINGMATCH("[\\?]", "?", 1, 1);
  TEST_STRINGMATCH("[\\\\]", "\\", 1, 1);
  TEST_STRINGMATCH("[\\[]", "[", 1, 1);
  TEST_STRINGMATCH("[\\]]", "]", 1, 1);
  TEST_STRINGMATCH("[\\^]", "^", 1, 1);
  TEST_STRINGMATCH("[\\-]", "-", 1, 1);

  /* Not special outside character classes: */
  TEST_STRINGMATCH("]", "]", 1, 1);
  TEST_STRINGMATCH("^", "^", 1, 1);
  TEST_STRINGMATCH("-", "-", 1, 1);
  /* Not special inside character classes: */
  TEST_STRINGMATCH("[*]", "*", 1, 1);
  TEST_STRINGMATCH("[?]", "?", 1, 1);
  TEST_STRINGMATCH("[[]", "[", 1, 1);
  /* Not special as the first character in a character class: */
  TEST_STRINGMATCH("[-]", "-", 1, 1);

  /* Not special as range end (undocumented): */
  TEST_STRINGMATCH("[+-]]", "*", 0, 0); /*   but not * (below) */
  TEST_STRINGMATCH("[+-]]", "^", 0, 0); /*   or ^ (above) */
  TEST_STRINGMATCH("[+--]", ",", 1, 1); /* ASCII range + to - includes , */
  TEST_STRINGMATCH("[+--]", "*", 0, 0); /*   but not * (below) */
  TEST_STRINGMATCH("[+--]", ".", 0, 0); /*   or . (above) */

  /* And the same, but unclosed: */
  TEST_STRINGMATCH("[+-]", "*", 0, 0);
  TEST_STRINGMATCH("[+-]", "^", 0, 0);
  TEST_STRINGMATCH("[+--", ",", 1, 1);
  TEST_STRINGMATCH("[+--", "*", 0, 0);
  TEST_STRINGMATCH("[+--", ".", 0, 0);

  /* Escaped ] alone is literal: */
  TEST_STRINGMATCH("[\\]a]", "]", 1, 1);
  TEST_STRINGMATCH("[\\]a]", "a", 1, 1);

  /* Escapes at range end: */
  TEST_STRINGMATCH("[+-\\\\]", ",", 1, 1); /* ASCII range + to \ includes , */
  TEST_STRINGMATCH("[+-\\\\]", "*", 0, 0); /*   but not * (below) */
  TEST_STRINGMATCH("[+-\\]]", "*", 0, 0);  /*   but not * (below) */
  TEST_STRINGMATCH("[+-\\]]", "^", 0, 0);  /*   or ^ (above) */

  /* Unclosed is the same: */
  TEST_STRINGMATCH("[+-\\\\", ",", 1, 1);
  TEST_STRINGMATCH("[+-\\\\", "*", 0, 0);
  TEST_STRINGMATCH("[+-\\\\", "]", 0, 0);
  TEST_STRINGMATCH("[+-\\]", ",", 1, 1);
  TEST_STRINGMATCH("[+-\\]", "*", 0, 0);
  TEST_STRINGMATCH("[+-\\]", "^", 0, 0);
  /* An incomplete escape is treated as literal backslash: */
  TEST_STRINGMATCH("[+-\\", ",", 1, 1);
  TEST_STRINGMATCH("[+-\\", "*", 0, 0);
  TEST_STRINGMATCH("[+-\\", "]", 0, 0);

  /* Empty character class matches nothing: */
  TEST_STRINGMATCH("[]", "", 0, 0);
  TEST_STRINGMATCH("[]", "a", 0, 0);
  TEST_STRINGMATCH("[", "", 0, 0); /* Unclosed is the same */
  TEST_STRINGMATCH("[", "a", 0, 0);

  /* Empty negated character class is equivalent to pattern "?": */
  TEST_STRINGMATCH("[^]", "", 0, 0);
  TEST_STRINGMATCH("[^]", "a", 1, 1);
  TEST_STRINGMATCH("[^]", "ab", 0, 0);
  TEST_STRINGMATCH("[^", "", 0, 0); /* Unclosed is the same */
  TEST_STRINGMATCH("[^", "a", 1, 1);
  TEST_STRINGMATCH("[^", "ab", 0, 0);

  /* Unclosed character classes are not an error (undocumented): */
  TEST_STRINGMATCH("[A-", "B", 0, 0);
}

class HuffCoderTest : public ::testing::Test {
 protected:
  HuffmanEncoder encoder_;
  HuffmanDecoder decoder_;
  string error_msg_;
  const string_view good_table_{
      "\x1b\x10\xd8\n\n\x19\xc6\x0c\xc3\x30\x0c\x43\x1e\x93\xe4\x11roB\xf6\xde\xbb\x18V\xc2Zk\x03"sv};
};

TEST_F(HuffCoderTest, Load) {
  string data("bad");

  ASSERT_FALSE(encoder_.Load(data, &error_msg_));

  data = good_table_;
  ASSERT_TRUE(encoder_.Load(data, &error_msg_)) << error_msg_;

  data.append("foo");
  encoder_.Reset();
  ASSERT_FALSE(encoder_.Load(data, &error_msg_));
}

TEST_F(HuffCoderTest, Encode) {
  ASSERT_TRUE(encoder_.Load(good_table_, &error_msg_)) << error_msg_;

  EXPECT_EQ(1, encoder_.GetNBits('x'));
  EXPECT_EQ(3, encoder_.GetNBits(':'));
  EXPECT_EQ(5, encoder_.GetNBits('2'));
  EXPECT_EQ(5, encoder_.GetNBits('3'));

  string data("x:23xx");

  array<uint8_t, 100> dest;
  uint32_t dest_size = dest.size();
  ASSERT_TRUE(encoder_.Encode(data, dest.data(), &dest_size, &error_msg_));
  ASSERT_EQ(3, dest_size);

  // testing small destination buffer.
  data = "3333333333333333333";
  dest_size = 16;
  EXPECT_TRUE(encoder_.Encode(data, dest.data(), &dest_size, &error_msg_));

  // destination too small
  ASSERT_EQ(0, dest_size);
  ASSERT_EQ("", error_msg_);
}

TEST_F(HuffCoderTest, Decode) {
  array<unsigned, 256> hist;
  hist.fill(1);
  hist['a'] = 100;
  hist['b'] = 50;

  ASSERT_TRUE(encoder_.Build(hist.data(), hist.size() - 1, &error_msg_));
  string data("aab");

  array<uint8_t, 100> encoded{0};
  uint32_t encoded_size = encoded.size();
  ASSERT_TRUE(encoder_.Encode(data, encoded.data(), &encoded_size, &error_msg_));
  ASSERT_EQ(1, encoded_size);

  EXPECT_EQ(2, encoder_.GetNBits('a'));
  EXPECT_EQ(3, encoder_.GetNBits('b'));

  string bindata = encoder_.Export();
  ASSERT_TRUE(decoder_.Load(bindata, &error_msg_)) << error_msg_;

  const char* src_ptr = reinterpret_cast<const char*>(encoded.data());
  array<char, 100> decode_dest{0};
  size_t decoded_size = data.size();
  ASSERT_TRUE(decoder_.Decode({src_ptr, encoded_size}, decoded_size, decode_dest.data()));
  ASSERT_EQ("aab", string_view(decode_dest.data(), decoded_size));
}

TEST_F(HuffCoderTest, HugeHistogram) {
  array<unsigned, 256> hist{
      1,         1,         1,         1,         1,         1,         1,         1,
      5,         26,        543,       1,         1,         1,         1,         1,
      4,         1,         1,         1,         1,         1,         1,         1,
      1,         1,         1,         1,         1,         1,         1,         1,
      114012534, 12081,     13038,     1596,      1334,      83320,     706165,    475568,
      2779,      2548,      998,       29249967,  53961,     13175485,  99000,     69726435,
      69422967,  182172009, 123544533, 76493373,  96341977,  64601914,  48105392,  60215630,
      69253599,  48811529,  818580990, 1226,      69,        922,       140,       720,
      230,       333714212, 95995178,  65692203,  50995122,  52156728,  44187793,  32988519,
      46978428,  49648957,  43769567,  68958857,  56765240,  80721594,  51577447,  70298692,
      56957407,  93372706,  47400672,  70912347,  78241282,  49291723,  69807896,  48372387,
      39312015,  58020704,  60084247,  1378,      2471,      1584,      14,        37880886,
      117,       184273430, 80952783,  135676228, 101229664, 230479318, 70652028,  137836653,
      70943805,  154072333, 29316298,  58302725,  109445030, 117306062, 129270567, 166048852,
      103000639, 54174517,  174819705, 166323524, 124543976, 80215452,  49650895,  101281709,
      49817574,  56668585,  50459552,  273352049, 166,       273352009, 16,        1,
      57668,     1724,      1886,      3668,      3960,      1963,      1124,      945,
      1836,      1882,      1709,      2389,      921,       2154,      1020,      1792,
      3747,      6750,      1318,      3100,      4506,      1175,      1514,      1430,
      3474,      44548,     3179,      1149,      2410,      9689,      727,       2348,
      2148,      1785,      5025,      1040,      3246,      1699,      505,       1034,
      9995,      24776,     3345,      1897,      1019,      1614,      35349,     988,
      2469,      5759,      2043,      7976,      1229,      896,       2692,      962,
      3341,      2490,      2648,      1162,      4812,      8404,      949,       3132,
      1,         1,         34754,     58694,     3400,      561,       6,         5,
      3,         47,        41,        19,        292,       24,        17,        12,
      626,       382,       6,         1,         1,         9,         1,         433,
      879,       743,       7,         9,         1,         1,         1,         60,
      746,       224,       54115,     4566,      5463,      10917,     5446,      7960,
      5382,      2204,      281,       649,       761,       188,       1,         2630,
      6680,      1,         1,         1,         1,         1,         1,         1,
      1,         1,         1,         1,         1,         1,         1,         1};

  // for huge values we need to scale down the histogram because the Huffman algorithm
  // implementation crashes otherwise.
  // The bug is in the following code in huf_compress.c:
  // huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
  // where it uses the count as a sentinel assuming that no other counts can be larger than 2^31.
  // this may not be true for histograms with huge counts, so we need to make sure that sum of all
  // counts is smaller than 2^31.
  uint64_t sum = 0;
  for (unsigned i = 0; i < hist.size(); ++i) {
    sum += hist[i];
    hist[i] /= 4;  // Without this the algorithm causes a data race and crash.
  }
  LOG(INFO) << "Total sum: " << sum << " reduced sum: " << sum / 4;
  ASSERT_TRUE(encoder_.Build(hist.data(), hist.size() - 1, &error_msg_)) << error_msg_;

  string bindata = encoder_.Export();
  encoder_.Reset();
  ASSERT_TRUE(encoder_.Load(bindata, &error_msg_)) << error_msg_;
}

using benchmark::DoNotOptimize;

// Parse Double benchmarks
static void BM_ParseFastFloat(benchmark::State& state) {
  std::vector<std::string> args(100);
  std::random_device rd;

  for (auto& arg : args) {
    arg = std::to_string(std::uniform_real_distribution<double>(0, 1e5)(rd));
  }
  double res;
  while (state.KeepRunning()) {
    for (const auto& arg : args) {
      fast_float::from_chars(arg.data(), arg.data() + arg.size(), res);
    }
  }
}
BENCHMARK(BM_ParseFastFloat);

static void BM_ParseDoubleAbsl(benchmark::State& state) {
  std::vector<std::string> args(100);

  for (auto& arg : args) {
    arg = std::to_string(std::uniform_real_distribution<double>(0, 1e5)(rd));
  }

  double res;
  while (state.KeepRunning()) {
    for (const auto& arg : args) {
      absl::from_chars(arg.data(), arg.data() + arg.size(), res);
    }
  }
}
BENCHMARK(BM_ParseDoubleAbsl);

template <clockid_t cid> void BM_ClockType(benchmark::State& state) {
  timespec ts;
  while (state.KeepRunning()) {
    DoNotOptimize(clock_gettime(cid, &ts));
  }
}

BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_REALTIME);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_MONOTONIC);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_PROCESS_CPUTIME_ID);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_THREAD_CPUTIME_ID);

// These clocks are not available on apple platform
#if !defined(__APPLE__)
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_REALTIME_COARSE);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_MONOTONIC_COARSE);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_BOOTTIME);
BENCHMARK_TEMPLATE(BM_ClockType, CLOCK_BOOTTIME_ALARM);
#endif

static void BM_MatchGlob(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  GlobMatcher matcher("*foobar*", true);
  while (state.KeepRunning()) {
    DoNotOptimize(matcher.Matches(random_val));
  }
}
BENCHMARK(BM_MatchGlob)->Arg(32)->Arg(1000)->Arg(10000);

static void BM_MatchGlob2(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  GlobMatcher matcher("bull:*:meta", true);
  while (state.KeepRunning()) {
    DoNotOptimize(matcher.Matches(random_val));
  }
}
BENCHMARK(BM_MatchGlob2)->Arg(32)->Arg(1000)->Arg(10000);

// See https://nvd.nist.gov/vuln/detail/cve-2022-36021
static void BM_MatchGlobExp(benchmark::State& state) {
  GlobMatcher matcher("a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*b", true);
  while (state.KeepRunning()) {
    DoNotOptimize(matcher.Matches("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
  }
}
BENCHMARK(BM_MatchGlobExp);

static void BM_MatchFindSubstr(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));

  while (state.KeepRunning()) {
    DoNotOptimize(random_val.find("foobar"));
  }
}
BENCHMARK(BM_MatchFindSubstr)->Arg(1000)->Arg(10000);

static void BM_MatchReflexFind(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  reflex::Matcher matcher("foobar");
  while (state.KeepRunning()) {
    matcher.input(random_val);
    DoNotOptimize(matcher.find());
  }
}
BENCHMARK(BM_MatchReflexFind)->Arg(1000)->Arg(10000);

static void BM_MatchReflexFindStar(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  reflex::Matcher matcher(".*foobar");

  while (state.KeepRunning()) {
    matcher.input(random_val);
    DoNotOptimize(matcher.find());
  }
}
BENCHMARK(BM_MatchReflexFindStar)->Arg(1000)->Arg(10000);

static void BM_MatchStd(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  std::regex regex(".*foobar");
  std::match_results<std::string::const_iterator> results;
  while (state.KeepRunning()) {
    std::regex_match(random_val, results, regex);
  }
}
BENCHMARK(BM_MatchStd)->Arg(1000)->Arg(10000);

static void BM_MatchRedisGlob(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  const char* pattern = "*foobar*";
  while (state.KeepRunning()) {
    DoNotOptimize(
        stringmatchlen(pattern, strlen(pattern), random_val.c_str(), random_val.size(), 0));
  }
}
BENCHMARK(BM_MatchRedisGlob)->Arg(1000)->Arg(10000);

static void BM_MatchRedisGlob2(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  const char* pattern = "bull:*:meta";
  while (state.KeepRunning()) {
    DoNotOptimize(
        stringmatchlen(pattern, strlen(pattern), random_val.c_str(), random_val.size(), 0));
  }
}
BENCHMARK(BM_MatchRedisGlob2)->Arg(32)->Arg(1000)->Arg(10000);

static void BM_MatchData(benchmark::State& state) {
  vector<string> keys(5000);
  for (unsigned i = 0; i < keys.size(); ++i) {
    keys[i] = GetRandomHex(80);
  }
  string_view pattern =
      "*2addb1c3-eae5-5265-ac8e-9fc9106dda8d*77de68daecd823babbb58edb1c8e14d7106e83bb"sv;
  if (state.range(0) == 1) {
    GlobMatcher matcher(pattern, true);
    while (state.KeepRunning()) {
      for (const auto& key : keys) {
        DoNotOptimize(matcher.Matches(key));
      }
    }
  } else {
    while (state.KeepRunning()) {
      for (const auto& key : keys) {
        DoNotOptimize(stringmatchlen(pattern.data(), pattern.size(), key.c_str(), key.size(), 0));
      }
    }
  }
}
BENCHMARK(BM_MatchData)->ArgName("algo")->Arg(0)->Arg(1);

#ifdef USE_RE2
static void BM_MatchRe2(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  re2::RE2 re(".*foobar.*", re2::RE2::Latin1);
  CHECK(re.ok());

  while (state.KeepRunning()) {
    DoNotOptimize(re2::RE2::FullMatch(random_val, re));
  }
}
BENCHMARK(BM_MatchRe2)->Arg(1000)->Arg(10000);
#endif

#ifdef USE_PCRE2

pair<pcre2_code*, pcre2_match_data*> create_pcre2(const char* pattern) {
  int errnum;
  PCRE2_SIZE erroffset;
  pcre2_code* re =
      pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, &errnum, &erroffset, nullptr);
  CHECK(re);
  CHECK_EQ(0, pcre2_jit_compile(re, PCRE2_JIT_COMPLETE));

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
  return {re, match_data};
}

int pcre2_do_match(string_view str, pcre2_code* re, pcre2_match_data* match_data) {
  int rc = pcre2_jit_match(re, (PCRE2_SPTR)str.data(), str.size(), 0,
                           PCRE2_ANCHORED | PCRE2_ENDANCHORED, match_data, NULL);
  return rc;
}

static void BM_MatchPcre2Jit(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  auto [re, match_data] = create_pcre2(".*foobar.*");
  const char sample[] = "aaaaaaaaaaaaafoobar";
  int rc = pcre2_do_match(sample, re, match_data);
  CHECK_EQ(1, rc);

  while (state.KeepRunning()) {
    rc = pcre2_do_match(random_val, re, match_data);
    CHECK_EQ(PCRE2_ERROR_NOMATCH, rc);
  }
  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
}
BENCHMARK(BM_MatchPcre2Jit)->Arg(32)->Arg(1000)->Arg(10000);

static void BM_MatchPcre2Jit2(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  auto [re, match_data] = create_pcre2("foo.*bar");

  while (state.KeepRunning()) {
    int rc = pcre2_do_match(random_val, re, match_data);
    CHECK_EQ(PCRE2_ERROR_NOMATCH, rc);
  }
  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
}
BENCHMARK(BM_MatchPcre2Jit2)->Arg(32)->Arg(1000)->Arg(10000);

static void BM_MatchPcre2JitExp(benchmark::State& state) {
  string exponent_pattern = "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*b";
  string str = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  auto [re, match_data] = create_pcre2(exponent_pattern.c_str());
  while (state.KeepRunning()) {
    int rc = pcre2_do_match(str, re, match_data);
    CHECK_EQ(PCRE2_ERROR_NOMATCH, rc);
  }
  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
}
BENCHMARK(BM_MatchPcre2JitExp);

#endif

static void BM_MatchGlobSlow(benchmark::State& state) {
  GlobMatcher matcher("a*a*a*a*a*.pt", false);
  while (state.KeepRunning()) {
    DoNotOptimize(GlobMatcher("a*a*a*a*a*.pt", false));
  }
}
BENCHMARK(BM_MatchGlobSlow);
}  // namespace dfly


================================================
FILE: src/core/dict_builder.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/dict_builder.h"

#include <algorithm>
#include <bit>
#include <cmath>
#include <cstring>
#include <memory>
#include <vector>

#include "base/logging.h"

namespace dfly {

using namespace std;
namespace {

constexpr unsigned kDmerLength = 6;

// Fast hash for 6-byte d-mers. Uses a simple multiplicative hash.
inline uint32_t HashDmer(const uint8_t* data) {
  uint64_t val = 0;
  memcpy(&val, data, 6);

  // ZSTD_hash6 algorithm
  constexpr uint64_t kPrime6Bytes = 227718039650203ULL;
  uint64_t hash64 = ((val << 16) * kPrime6Bytes) >> 32;
  return static_cast<uint32_t>(hash64);
}

constexpr unsigned kRegisterLen = 1024;
constexpr uint32_t kRegisterMask = kRegisterLen - 1;
constexpr unsigned kRegisterBits = 10;
constexpr unsigned kRankBits = 32 - kRegisterBits;

inline void UpdateHllRegister(uint32_t h, uint8_t* registers) {
  uint32_t index = h & kRegisterMask;
  // Use upper bits for rank calculation, ensuring it's never zero
  uint32_t w = (h >> kRegisterBits) | (1u << kRankBits);
  uint8_t rank = countr_zero(w) + 1;
  registers[index] = std::max(registers[index], rank);
}

double EstimateHllCardinality(const uint8_t* registers) {
  double sum = 0.0;
  int zero_registers = 0;
  for (unsigned i = 0; i < kRegisterLen; ++i) {
    if (registers[i] == 0) {
      zero_registers++;
    }
    sum += 1.0 / (1 << registers[i]);
  }

  // alpha_m * m^2 where m = kRegisterLen
  // Constants from original HyperLogLog paper (Flajolet et al.)
  constexpr double kAlphaInf = 0.7213;
  constexpr double kAlphaCorrection = 1.079;
  constexpr double kM = static_cast<double>(kRegisterLen);
  constexpr double kAlphaM2 = (kAlphaInf / (1.0 + kAlphaCorrection / kM)) * (kM * kM);
  double estimate = kAlphaM2 / sum;

  // Small range correction
  constexpr double kSmallRangeThreshold = 2.5 * kM;
  if (estimate <= kSmallRangeThreshold && zero_registers > 0) {
    estimate = kM * std::log(kM / zero_registers);
  }
  return estimate;
}

uint32_t CalculateFreqTableSize(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces) {
  size_t total_input_size = 0;
  for (const auto& [data, sz] : data_pieces) {
    total_input_size += sz;
  }
  size_t target_size = std::max<size_t>(1024, total_input_size);
  return std::bit_ceil(static_cast<uint32_t>(std::min<size_t>(target_size, 1u << 24)));
}

// Scans all provided data pieces to compute a histogram of 6-byte sequence (d-mer) hashes.
void PopulateFrequencyTable(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces,
                            uint16_t* freq, uint32_t freq_table_mask) {
  for (const auto& [data, sz] : data_pieces) {
    if (sz < kDmerLength)
      continue;

    size_t limit = sz - kDmerLength + 1;
    for (size_t i = 0; i < limit; ++i) {
      uint32_t idx = HashDmer(data + i) & freq_table_mask;
      if (freq[idx] < UINT16_MAX) {
        ++freq[idx];
      }
    }
  }
}

struct BestSegmentResult {
  std::pair<const uint8_t*, size_t> data_piece{nullptr, 0};
  uint64_t score = 0;
};

// Iterates across all data pieces to find a contiguous byte window of `segment_size`
// that maximizes the sum of previously computed sequence frequencies.
BestSegmentResult FindBestSegment(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces,
                                  size_t segment_size, const uint16_t* freq,
                                  uint32_t freq_table_mask) {
  BestSegmentResult best;

  for (const auto& [data, sz] : data_pieces) {
    if (sz < segment_size)
      continue;

    size_t window_dmers = segment_size - kDmerLength + 1;
    uint64_t score = 0;

    // Compute initial window score
    for (size_t j = 0; j < window_dmers; ++j) {
      score += freq[HashDmer(data + j) & freq_table_mask];
    }

    if (score > best.score) {
      best.score = score;
      best.data_piece = {data, segment_size};
    }

    // Slide the window
    size_t limit = sz - segment_size;
    for (size_t i = 1; i <= limit; ++i) {
      score -= freq[HashDmer(data + i - 1) & freq_table_mask];
      score += freq[HashDmer(data + i + window_dmers - 1) & freq_table_mask];

      if (score > best.score) {
        best.score = score;
        best.data_piece = {data + i, segment_size};
      }
    }
  }

  return best;
}

void ZeroOutFrequencies(std::pair<const uint8_t*, size_t> data_piece, uint16_t* freq,
                        uint32_t freq_table_mask) {
  if (data_piece.second < kDmerLength)
    return;
  size_t seg_dmers = data_piece.second - kDmerLength + 1;
  for (size_t j = 0; j < seg_dmers; ++j) {
    freq[HashDmer(data_piece.first + j) & freq_table_mask] = 0;
  }
}

}  // namespace

// Estimates dictionary compressibility by observing the cardinality
// of unique 6-byte substrings via a simplified internal HyperLogLog.
double EstimateCompressibility(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces,
                               unsigned step) {
  DCHECK_GT(step, 0u);

  unique_ptr<uint8_t[]> registers(new uint8_t[kRegisterLen]());
  uint64_t total_dmers = 0;

  for (const auto& [data, sz] : data_pieces) {
    if (sz < kDmerLength)
      continue;
    size_t limit = sz - kDmerLength + 1;
    for (size_t i = 0; i < limit; i += step) {
      UpdateHllRegister(HashDmer(data + i), registers.get());
      ++total_dmers;
    }
  }

  if (total_dmers == 0) {
    return 1.0;  // No d-mers - we consider it incompressible
  }

  double estimate = EstimateHllCardinality(registers.get());
  double ratio = estimate / static_cast<double>(total_dmers);
  return std::min(ratio, 1.0);
}

// Trains a dictionary using FastCover-style iterative segment selection.
// 1. Builds a frequency table of 6-byte d-mer hashes.
// 2. For each data piece (epoch), selects the segment of segment_size bytes
//    that maximizes the sum of d-mer frequencies.
// 3. Appends selected segment to dictionary, zeros out its d-mer frequencies.
// Returns raw dictionary bytes of approximately dict_size.
string TrainDictionary(absl::Span<const pair<const uint8_t*, size_t>> data_pieces, size_t dict_size,
                       size_t segment_size) {
  DCHECK_GT(dict_size, 0u);
  DCHECK_GT(segment_size, kDmerLength);

  uint32_t freq_table_size = CalculateFreqTableSize(data_pieces);
  uint32_t freq_table_mask = freq_table_size - 1;

  unique_ptr<uint16_t[]> freq(new uint16_t[freq_table_size]());
  PopulateFrequencyTable(data_pieces, freq.get(), freq_table_mask);

  std::string dictionary;
  dictionary.reserve(dict_size);

  while (dictionary.size() < dict_size) {
    auto best = FindBestSegment(data_pieces, segment_size, freq.get(), freq_table_mask);

    if (!best.data_piece.first || best.score == 0) {
      break;  // No useful segments left.
    }

    size_t append_size = std::min(best.data_piece.second, dict_size - dictionary.size());
    dictionary.append(reinterpret_cast<const char*>(best.data_piece.first), append_size);

    ZeroOutFrequencies(best.data_piece, freq.get(), freq_table_mask);
  }

  return dictionary;
}

}  // namespace dfly


================================================
FILE: src/core/dict_builder.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>

namespace dfly {

// Estimates compressibility by counting unique 6-byte d-mers using HyperLogLog.
// data_pieces: spans of raw data (e.g., one per QList node).
// step: sampling stride (1 = every offset, higher = faster but less accurate).
// Returns a value in [0, 1] where 0 means very compressible, and 1 means incompressible.
double EstimateCompressibility(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces,
                               unsigned step);

// Trains a compression dictionary from a collection of sample data.
//
// Arguments:
//   data_pieces:  Input data sources (spans of bytes) to extract dictionary segments from.
//   dict_size:    The maximum target size of the resulting dictionary in bytes.
//   segment_size: The size of continuous byte segments chosen and appended per iteration.
//
// Returns a raw string containing the trained dictionary up to `dict_size` bytes.
std::string TrainDictionary(absl::Span<const std::pair<const uint8_t*, size_t>> data_pieces,
                            size_t dict_size = 4096, size_t segment_size = 256);

}  // namespace dfly


================================================
FILE: src/core/dict_builder_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/dict_builder.h"

#include <gmock/gmock.h>
#include <zstd.h>

#include <random>
#include <string>
#include <vector>

#include "base/logging.h"

namespace dfly {

using namespace std;

class DictBuilderTest : public ::testing::Test {
 protected:
  using DataPiece = pair<const uint8_t*, size_t>;

  // Generate Celery-like JSON entries with small variations.
  vector<string> GenerateCeleryEntries(unsigned count) {
    vector<string> entries;
    entries.reserve(count);
    for (unsigned i = 0; i < count; ++i) {
      string id = to_string(100000 + i);
      string entry =
          "{\"body\": \"W10=\", \"content-encoding\": \"utf-8\", "
          "\"content-type\": \"application/json\", "
          "\"headers\": {\"lang\": \"py\", \"task\": \"process_job\", "
          "\"id\": \"b3e4b923-8a77-4053-aff0-" +
          id +
          "\", \"shadow\": null, \"eta\": null, "
          "\"expires\": null, \"group\": null, \"retries\": 0, "
          "\"timelimit\": [null, null], "
          "\"root_id\": \"b3e4b923-8a77-4053-aff0-" +
          id +
          "\", \"parent_id\": null, "
          "\"argsrepr\": \"('job" +
          to_string(i) +
          "',)\", \"kwargsrepr\": \"{}\", "
          "\"origin\": \"gen917779@hut\"}, "
          "\"properties\": {\"correlation_id\": \"b3e4b923\", "
          "\"reply_to\": \"9933040c\", \"delivery_mode\": 2, "
          "\"delivery_info\": {\"exchange\": \"\", \"routing_key\": \"my_queue\"}, "
          "\"priority\": 0}}";
      entries.push_back(std::move(entry));
    }
    return entries;
  }

  vector<DataPiece> ToPieces(const vector<string>& entries) {
    vector<DataPiece> pieces;
    pieces.reserve(entries.size());
    for (const auto& e : entries) {
      pieces.emplace_back(reinterpret_cast<const uint8_t*>(e.data()), e.size());
    }
    return pieces;
  }

  // Generate random binary data.
  vector<string> GenerateRandomEntries(unsigned count, size_t entry_size) {
    vector<string> entries;
    entries.reserve(count);
    mt19937 rng(42);
    for (unsigned i = 0; i < count; ++i) {
      string entry(entry_size, '\0');
      for (auto& c : entry) {
        c = static_cast<char>(rng() & 0xFF);
      }
      entries.push_back(std::move(entry));
    }
    return entries;
  }
};

TEST_F(DictBuilderTest, RepetitiveDataIsCompressible) {
  auto entries = GenerateCeleryEntries(200);
  auto pieces = ToPieces(entries);

  double ratio = EstimateCompressibility(pieces, 1);
  LOG(INFO) << "Celery data uniqueness ratio: " << ratio;
  EXPECT_LT(ratio, 0.5f);
}

TEST_F(DictBuilderTest, RandomDataIsIncompressible) {
  auto entries = GenerateRandomEntries(200, 400);
  auto pieces = ToPieces(entries);

  double ratio = EstimateCompressibility(pieces, 1);
  LOG(INFO) << "Random data uniqueness ratio: " << ratio;
  EXPECT_FALSE(ratio < 0.85);
}

TEST_F(DictBuilderTest, TrainDictionaryProducesOutput) {
  auto entries = GenerateCeleryEntries(200);
  auto pieces = ToPieces(entries);

  string dict = TrainDictionary(pieces, 4096, 256);
  LOG(INFO) << "Trained dictionary size: " << dict.size() << " bytes";
  EXPECT_GT(dict.size(), 0u);
  EXPECT_LE(dict.size(), 4096u);
}

TEST_F(DictBuilderTest, TrainDictionaryEmptyForTinyData) {
  // Single small entry - not enough for segment selection.
  string tiny = "hello";
  vector<DataPiece> pieces = {{reinterpret_cast<const uint8_t*>(tiny.data()), tiny.size()}};

  string dict = TrainDictionary(pieces, 4096, 256);
  EXPECT_TRUE(dict.empty());
}

TEST_F(DictBuilderTest, ZstdCompressionWithTrainedDict) {
  auto entries = GenerateCeleryEntries(200);
  auto pieces = ToPieces(entries);

  string dict = TrainDictionary(pieces, 4096, 256);
  ASSERT_GT(dict.size(), 0u);

  // Create ZSTD CDict/DDict from trained dictionary.
  ZSTD_CDict* cdict = ZSTD_createCDict(dict.data(), dict.size(), 1);
  ASSERT_TRUE(cdict);
  ZSTD_DDict* ddict = ZSTD_createDDict(dict.data(), dict.size());
  ASSERT_TRUE(ddict);

  ZSTD_CCtx* cctx = ZSTD_createCCtx();
  ZSTD_DCtx* dctx = ZSTD_createDCtx();

  size_t total_raw = 0;
  size_t total_compressed_dict = 0;
  size_t total_compressed_nodict = 0;

  for (const auto& entry : entries) {
    total_raw += entry.size();

    // Compress with dictionary.
    size_t bound = ZSTD_compressBound(entry.size());
    string compressed(bound, '\0');
    size_t csz =
        ZSTD_compress_usingCDict(cctx, compressed.data(), bound, entry.data(), entry.size(), cdict);
    ASSERT_FALSE(ZSTD_isError(csz)) << ZSTD_getErrorName(csz);
    compressed.resize(csz);
    total_compressed_dict += csz;

    // Compress without dictionary for comparison.
    string compressed_nodict(bound, '\0');
    size_t csz_nodict =
        ZSTD_compressCCtx(cctx, compressed_nodict.data(), bound, entry.data(), entry.size(), 1);
    ASSERT_FALSE(ZSTD_isError(csz_nodict));
    total_compressed_nodict += csz_nodict;

    // Verify roundtrip.
    string decompressed(entry.size(), '\0');
    size_t dsz = ZSTD_decompress_usingDDict(dctx, decompressed.data(), entry.size(),
                                            compressed.data(), csz, ddict);
    ASSERT_FALSE(ZSTD_isError(dsz)) << ZSTD_getErrorName(dsz);
    ASSERT_EQ(dsz, entry.size());
    EXPECT_EQ(decompressed, entry);
  }

  double ratio_dict = double(total_raw) / double(total_compressed_dict);
  double ratio_nodict = double(total_raw) / double(total_compressed_nodict);
  LOG(INFO) << "Total raw: " << total_raw << " bytes";
  LOG(INFO) << "With dict: " << total_compressed_dict << " bytes (ratio " << ratio_dict << "x)";
  LOG(INFO) << "No dict:   " << total_compressed_nodict << " bytes (ratio " << ratio_nodict << "x)";
  LOG(INFO) << "Dict advantage: " << ratio_dict / ratio_nodict << "x better";

  // Dictionary compression should be significantly better for repetitive data.
  EXPECT_GT(ratio_dict, ratio_nodict);
  EXPECT_GT(ratio_dict, 3.0f);  // Expect at least 3x compression with dict.

  ZSTD_freeCCtx(cctx);
  ZSTD_freeDCtx(dctx);
  ZSTD_freeCDict(cdict);
  ZSTD_freeDDict(ddict);
}

TEST_F(DictBuilderTest, StepParameterWorks) {
  auto entries = GenerateCeleryEntries(200);
  auto pieces = ToPieces(entries);

  double step1_ratio = EstimateCompressibility(pieces, 1);
  double step4_ratio = EstimateCompressibility(pieces, 4);

  // Both should detect compressibility, though with slightly different ratios.
  EXPECT_TRUE(step1_ratio < 0.85);
  EXPECT_TRUE(step4_ratio < 0.85);
  LOG(INFO) << "Step=1 ratio: " << step1_ratio << ", Step=4 ratio: " << step4_ratio;
}

}  // namespace dfly


================================================
FILE: src/core/dragonfly_core.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/base/macros.h>

#include "base/logging.h"
#include "core/intent_lock.h"

namespace dfly {

const char* IntentLock::ModeName(Mode m) {
  switch (m) {
    case IntentLock::SHARED:
      return "SHARED";
    case IntentLock::EXCLUSIVE:
      return "EXCLUSIVE";
  }

  ABSL_UNREACHABLE();
}

void IntentLock::VerifyDebug() {
  constexpr uint32_t kMsb = 1ULL << (sizeof(cnt_[0]) * 8 - 1);
  DCHECK_EQ(0u, cnt_[0] & kMsb);
  DCHECK_EQ(0u, cnt_[1] & kMsb);
}

}  // namespace dfly


================================================
FILE: src/core/expire_period.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>

namespace dfly {

class ExpirePeriod {
 public:
  static constexpr size_t kMaxGenId = 15;

  ExpirePeriod() : val_(0), gen_(0), precision_(0) {
    static_assert(sizeof(ExpirePeriod) == 8);  // TODO
  }

  explicit ExpirePeriod(uint64_t ms, unsigned gen = 0) : ExpirePeriod() {
    Set(ms);
  }

  // always returns milliseconds value.
  uint64_t duration_ms() const {
    return precision_ ? uint64_t(val_) * 1000 : val_;
  }

  // generation id for the base of this duration.
  // when we update the generation, we need to update the value as well according to this
  // logic:
  // new_val = (old_val + old_base) - new_base.
  unsigned generation_id() const {
    return gen_;
  }

  void Set(uint64_t ms);

  bool is_second_precision() { return precision_ == 1;}

 private:
  uint64_t val_ : 59;
  uint64_t gen_ : 4;
  uint64_t precision_ : 1;  // 0 - ms, 1 - sec.
};

inline void ExpirePeriod::Set(uint64_t ms) {
  constexpr uint64_t kBarrier = (1ULL << 48);

  if (ms < kBarrier) {
    val_ = ms;
    precision_ = 0;   // ms
    return;
  }

  precision_ = 1;
  if (ms < kBarrier << 10) {
    ms = (ms + 500) / 1000;   // seconds
  }
  val_ = ms >= kBarrier ? kBarrier - 1 : ms;
}

}  // namespace dfly


================================================
FILE: src/core/extent_tree.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/extent_tree.h"

#include "base/logging.h"

namespace dfly {

using namespace std;

// offset, len must be multiplies of 256MB.
void ExtentTree::Add(size_t start, size_t len) {
  DCHECK_GT(len, 0u);
  DCHECK_EQ(len_extents_.size(), extents_.size());

  auto it = extents_.lower_bound(start);
  optional<size_t> prev_extent_key;

  if (it != extents_.begin()) {
    auto prev = it;
    --prev;

    DCHECK_LE(prev->second, start);
    if (prev->second == start) {  // combine with the previous extent
      size_t prev_len = prev->second - prev->first;
      CHECK_EQ(1u, len_extents_.erase(pair{prev_len, prev->first}));
      prev->second += len;
      start = prev->first;
      len += prev_len;
      prev_extent_key = prev->first;
    }
  }

  if (it != extents_.end()) {
    DCHECK_GE(it->first, start + len);
    if (start + len == it->first) {  // merge with the next extent
      size_t it_len = it->second - it->first;
      CHECK_EQ(1u, len_extents_.erase(pair{it_len, it->first}));
      extents_.erase(it);
      len += it_len;
    }
  }

  len_extents_.emplace(len, start);
  if (prev_extent_key) {
    DCHECK(extents_.find(*prev_extent_key) != extents_.end());
    extents_[*prev_extent_key] = start + len;
  } else {
    extents_.emplace(start, start + len);
  }
}

optional<pair<size_t, size_t>> ExtentTree::GetRange(size_t len, size_t align) {
  DCHECK_GT(align, 0u);
  DCHECK_EQ(0u, align & (align - 1));
  DCHECK_EQ(0u, len & (align - 1));

  auto it = len_extents_.lower_bound(pair{len, 0});
  if (it == len_extents_.end())
    return nullopt;

  size_t amask = align - 1;
  size_t aligned_start = it->second;
  size_t extent_end = it->first + it->second;

  while (true) {
    if ((aligned_start & amask) == 0)  // aligned
      break;

    // round up to the next aligned address
    aligned_start = (aligned_start + amask) & (~amask);

    if (aligned_start + len <= extent_end)  // check if we still inside the extent
      break;
    ++it;

    if (it == len_extents_.end())
      return nullopt;

    aligned_start = it->second;
    extent_end = it->first + it->second;
  }

  DCHECK_GE(aligned_start, it->second);

  // if we are here - we found the range starting at aligned_start.
  // now we need to possibly break the existing extent to several parts or completely
  // delete it.
  auto eit = extents_.find(it->second);
  DCHECK(eit != extents_.end());
  size_t range_end = aligned_start + len;

  len_extents_.erase(it);

  // we break the extent [eit->first, eit->second] to either 0, 1 or 2 intervals.
  if (aligned_start > eit->first) {  // do we have prefix?
    eit->second = aligned_start;
    len_extents_.emplace(eit->second - eit->first, eit->first);
  } else {
    extents_.erase(eit);
  }

  if (range_end < extent_end) {  // do we have suffix?
    extents_.emplace(range_end, extent_end);
    len_extents_.emplace(extent_end - range_end, range_end);
  }

  DCHECK_EQ(range_end - aligned_start, len);

  return pair{aligned_start, range_end};
}

}  // namespace dfly


================================================
FILE: src/core/extent_tree.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/btree_map.h>
#include <absl/container/btree_set.h>

#include <optional>

namespace dfly {

// represents a tree of disjoint extents.
// check-fails if overlapping ranges are added.
// automatically handles union of the consequent ranges that are added to the tree.
class ExtentTree {
 public:
  void Add(size_t start, size_t len);

  // in case of success, returns (start, end) pair, where (end-start) >= len and
  // start is aligned by align.
  std::optional<std::pair<size_t, size_t>> GetRange(size_t len, size_t align);

 private:
  absl::btree_map<size_t, size_t> extents_;                 // start -> end).
  absl::btree_set<std::pair<size_t, size_t>> len_extents_;  // (length, start)
};

}  // namespace dfly


================================================
FILE: src/core/extent_tree_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/extent_tree.h"

#include <gmock/gmock.h>

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly {

using namespace std;

class ExtentTreeTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
  }

  static void TearDownTestSuite() {
  }

  ExtentTree tree_;
};

TEST_F(ExtentTreeTest, Basic) {
  tree_.Add(0, 256);
  auto op = tree_.GetRange(64, 16);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(0, 64));  // [64, 256)

  tree_.Add(56, 8);
  op = tree_.GetRange(64, 16);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(64, 128));  // {[56, 64), [128, 256)}

  op = tree_.GetRange(18, 2);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(128, 146));  // {[56, 64), [146, 256)}

  op = tree_.GetRange(80, 16);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(160, 240));  // {[56, 64), [146, 160), [240, 256)}

  op = tree_.GetRange(4, 1);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(56, 60));  // {[60, 64), [146, 160), [240, 256)}

  op = tree_.GetRange(32, 1);
  EXPECT_FALSE(op);
  tree_.Add(64, 146 - 64);
  op = tree_.GetRange(32, 4);
  EXPECT_TRUE(op);
  EXPECT_THAT(*op, testing::Pair(60, 92));
}

TEST_F(ExtentTreeTest, Union) {
  tree_.Add(0, 16);
  tree_.Add(16, 16);
  auto range = tree_.GetRange(32, 1);
  ASSERT_TRUE(range);
  EXPECT_THAT(*range, testing::Pair(0, 32));
}

}  // namespace dfly


================================================
FILE: src/core/flatbuffers.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#ifndef __USE_GNU  // needed to flatbuffers to compile with musl libc.
#define FLATBUFFERS_LOCALE_INDEPENDENT 0
#endif

#include <flatbuffers/flatbuffers.h>
#include <flatbuffers/flexbuffers.h>
#include <flatbuffers/idl.h>

namespace dfly {
using FlatJson = flexbuffers::Reference;
}  // namespace dfly


================================================
FILE: src/core/flatbuffers_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/flatbuffers.h"

#include <absl/strings/escaping.h>

#include "base/gtest.h"
#include "base/logging.h"

using namespace std;

namespace dfly {
class FlatBuffersTest : public ::testing::Test {
 protected:
};

TEST_F(FlatBuffersTest, Basic) {
  flexbuffers::Builder fbb;
  fbb.Map([&] {
    fbb.String("foo", "bar");
    fbb.Double("bar", 1.5);
    fbb.Vector("strs", [&] {
      fbb.String("hello");
      fbb.String("world");
    });
  });

  fbb.Finish();
  auto buffer = fbb.GetBuffer();
  flexbuffers::Reference ref = flexbuffers::GetRoot(buffer);
  auto map = ref.AsMap();
  EXPECT_EQ("bar", map["foo"].AsString().str());
}

TEST_F(FlatBuffersTest, FlexiParser) {
  flatbuffers::Parser parser;
  const char* json = R"(
    {
      "foo": "bar",
      "bar": 1.5,
      "strs": ["hello", "world"]
    }
  )";
  flexbuffers::Builder fbb;
  ASSERT_TRUE(parser.ParseFlexBuffer(json, nullptr, &fbb));
  fbb.Finish();
  const auto& buffer = fbb.GetBuffer();
  string_view buf_view{reinterpret_cast<const char*>(buffer.data()), buffer.size()};
  LOG(INFO) << "Binary buffer: " << absl::CHexEscape(buf_view);
  flexbuffers::Reference root = flexbuffers::GetRoot(buffer);
  auto map = root.AsMap();
  EXPECT_EQ("bar", map["foo"].AsString().str());
}

TEST_F(FlatBuffersTest, ParseJson) {
  const char* schema = R"(
    namespace dfly;
    table Foo {
      foo: string;
      bar: double;
      strs: [string];
    }
    root_type Foo;
  )";

  flatbuffers::Parser parser;
  ASSERT_TRUE(parser.Parse(schema));
  parser.Serialize();
  flatbuffers::DetachedBuffer bsb = parser.builder_.Release();

  // This schema will always reference bsb.
  auto* fbs_schema = reflection::GetSchema(bsb.data());

  flatbuffers::Verifier verifier(bsb.data(), bsb.size());
  ASSERT_TRUE(fbs_schema->Verify(verifier));

  auto* root_table = fbs_schema->root_table();
  auto* fields = root_table->fields();
  auto* field_foo = fields->LookupByKey("foo");
  ASSERT_EQ(field_foo->type()->base_type(), reflection::String);

  const char* json = R"(
    {
      "foo": "value",
      "bar": 1.5,
      "strs": ["hello", "world"]
    }
  )";

  ASSERT_TRUE(parser.Parse(json));
  size_t buf_size = parser.builder_.GetSize();

  ASSERT_TRUE(
      flatbuffers::Verify(*fbs_schema, *root_table, parser.builder_.GetBufferPointer(), buf_size));
  auto* root_obj = flatbuffers::GetAnyRoot(parser.builder_.GetBufferPointer());

  const flatbuffers::String* value = flatbuffers::GetFieldS(*root_obj, *field_foo);
  EXPECT_EQ("value", value->str());

  // wrong type.
  ASSERT_FALSE(parser.Parse(R"({"foo": 1})"));
}

}  // namespace dfly


================================================
FILE: src/core/generate_bin_sizes.py
================================================
#!/usr/bin/env python3

import argparse
import random
from array import array

# We print in 64 bit words.
ALIGN = 1 << 10  # 1KB alignment


def print_small_bins():
    prev_val = 0
    for i in range(56, 1, -1):
        len = (4096 - i*8)  # reduce by size of hashes
        len = (len // 8)*8  # make it 8 bytes aligned
        if len != prev_val:
            print(i, len)
            prev_val = len
    print()


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-n', type=int, dest='num',
                        help='number of quadruplets', default=9)
    parser.add_argument('-small', action='store_true')

    args = parser.parse_args()
    if args.small:
        print("small")
        print_small_bins()
        return

    size = 512*4
    print ('{512, 512*2, 512*3, ', end=' ')
    # print ('{', end=' ')
    for i in range(args.num):
        incr = size // 4
        for j in range(4):
            assert size % 512 == 0, size
            print (f'{size}, ', end=' ')
            size += incr
        if i % 2 == 1:
            print('')
    print('};')

if __name__ == "__main__":
    main()


================================================
FILE: src/core/glob_matcher.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/glob_matcher.h"

#include <absl/strings/ascii.h>

#include "base/logging.h"

namespace dfly {
using namespace std;

/* Glob-style pattern matching taken from Valkey. */
static int stringmatchlen_impl(const char* pattern, int patternLen, const char* string,
                               int stringLen, int nocase, int* skipLongerMatches, int nesting) {
  /* Protection against abusive patterns. */
  if (nesting > 1000)
    return 0;

  while (patternLen && stringLen) {
    switch (pattern[0]) {
      case '*':
        while (patternLen && pattern[1] == '*') {
          pattern++;
          patternLen--;
        }
        if (patternLen == 1)
          return 1; /* match */
        while (stringLen) {
          if (stringmatchlen_impl(pattern + 1, patternLen - 1, string, stringLen, nocase,
                                  skipLongerMatches, nesting + 1))
            return 1; /* match */
          if (*skipLongerMatches)
            return 0; /* no match */
          string++;
          stringLen--;
        }
        /* There was no match for the rest of the pattern starting
         * from anywhere in the rest of the string. If there were
         * any '*' earlier in the pattern, we can terminate the
         * search early without trying to match them to longer
         * substrings. This is because a longer match for the
         * earlier part of the pattern would require the rest of the
         * pattern to match starting later in the string, and we
         * have just determined that there is no match for the rest
         * of the pattern starting from anywhere in the current
         * string. */
        *skipLongerMatches = 1;
        return 0; /* no match */
        break;
      case '?':
        string++;
        stringLen--;
        break;
      case '[': {
        int not_op, match;

        pattern++;
        patternLen--;
        not_op = patternLen && pattern[0] == '^';
        if (not_op) {
          pattern++;
          patternLen--;
        }
        match = 0;
        while (1) {
          if (patternLen >= 2 && pattern[0] == '\\') {
            pattern++;
            patternLen--;
            if (pattern[0] == string[0])
              match = 1;
          } else if (patternLen == 0) {
            pattern--;
            patternLen++;
            break;
          } else if (pattern[0] == ']') {
            break;
          } else if (patternLen >= 3 && pattern[1] == '-') {
            int start = pattern[0];
            int end = pattern[2];
            int c = string[0];
            if (start > end) {
              int t = start;
              start = end;
              end = t;
            }
            if (nocase) {
              start = tolower(start);
              end = tolower(end);
              c = tolower(c);
            }
            pattern += 2;
            patternLen -= 2;
            if (c >= start && c <= end)
              match = 1;
          } else {
            if (!nocase) {
              if (pattern[0] == string[0])
                match = 1;
            } else {
              if (tolower((int)pattern[0]) == tolower((int)string[0]))
                match = 1;
            }
          }
          pattern++;
          patternLen--;
        }
        if (not_op)
          match = !match;
        if (!match)
          return 0; /* no match */
        string++;
        stringLen--;
        break;
      }
      case '\\':
        if (patternLen >= 2) {
          pattern++;
          patternLen--;
        }
        /* fall through */
      default:
        if (!nocase) {
          if (pattern[0] != string[0])
            return 0; /* no match */
        } else {
          if (tolower((int)pattern[0]) != tolower((int)string[0]))
            return 0; /* no match */
        }
        string++;
        stringLen--;
        break;
    }
    pattern++;
    patternLen--;
    if (stringLen == 0) {
      while (patternLen && *pattern == '*') {
        pattern++;
        patternLen--;
      }
      break;
    }
  }
  if (patternLen == 0 && stringLen == 0)
    return 1;
  return 0;
}

int stringmatchlen(const char* pattern, int patternLen, const char* string, int stringLen,
                   int nocase) {
  int skipLongerMatches = 0;
  return stringmatchlen_impl(pattern, patternLen, string, stringLen, nocase, &skipLongerMatches, 0);
}

string GlobMatcher::Glob2Regex(string_view glob) {
  string regex;
  regex.reserve(glob.size());
  size_t in_group = 0;

  for (size_t i = 0; i < glob.size(); i++) {
    char c = glob[i];
    if (in_group > 0) {
      if (c == ']') {
        if (i == in_group + 1) {
          if (glob[in_group] == '^') {  // [^
            regex.pop_back();
            regex.back() = '.';
            in_group = 0;
            continue;
          }
        }
        in_group = 0;
      }
      regex.push_back(c);
      if (c == '\\') {
        if (i + 1 < glob.size() && glob[i + 1] == ']') {
          ++i;
          regex.push_back(']');
        } else {
          regex.push_back('\\');  // escape the backslash
        }
      }
      continue;
    }

    switch (c) {
      case '*':
        regex.append(".*");
        break;
      case '?':
        regex.append(".");
        break;
      case '.':
      case '(':
      case ')':
      case '{':
      case '}':
      case '^':
      case '$':
      case '+':
      case '|':
        regex.push_back('\\');
        regex.push_back(c);
        break;
      case '\\':
        if (i + 1 < glob.size()) {
          ++i;
        }
        if (absl::ascii_ispunct(glob[i])) {
          regex.push_back('\\');
        }
        regex.push_back(glob[i]);
        break;
      case '[':
        regex.push_back('[');
        if (i + 1 < glob.size()) {
          in_group = i + 1;
        }
        break;
      default:
        regex.push_back(c);
        break;
    }
  }
  return regex;
}

GlobMatcher::GlobMatcher(string_view pattern, bool case_sensitive)
    : glob_(pattern), case_sensitive_(case_sensitive) {
#ifdef REFLEX_PERFORMANCE
  if (!pattern.empty()) {
    starts_with_star_ = pattern.front() == '*';
    pattern.remove_prefix(starts_with_star_);

    if (!pattern.empty()) {
      ends_with_star_ =
          (pattern.back() == '*') && (pattern.size() == 1 || pattern[pattern.size() - 2] != '\\');
      pattern.remove_suffix(ends_with_star_);
    }
  }

  string regex("(?s");  // dotall mode
  if (!case_sensitive) {
    regex.push_back('i');
  }
  regex.push_back(')');
  if (pattern.empty()) {
    regex.append(Glob2Regex("*"));
  } else {
    regex.append(Glob2Regex(pattern));
  }
  matcher_.pattern(regex);
#elif defined(USE_PCRE2)
  string regex("(?s");  // dotall mode
  if (!case_sensitive) {
    regex.push_back('i');
  }
  regex.push_back(')');
  regex.append(Glob2Regex(pattern));

  int errnum;
  PCRE2_SIZE erroffset;
  re_ = pcre2_compile((PCRE2_SPTR)regex.c_str(), regex.size(), 0, &errnum, &erroffset, nullptr);
  if (re_) {
    CHECK_EQ(0, pcre2_jit_compile(re_, PCRE2_JIT_COMPLETE));
    match_data_ = pcre2_match_data_create_from_pattern(re_, NULL);
  }
#endif
}

bool GlobMatcher::Matches(std::string_view str) const {
#ifdef REFLEX_PERFORMANCE
  if (str.size() < 16) {
    return stringmatchlen(glob_.data(), glob_.size(), str.data(), str.size(), !case_sensitive_);
  }
  if (glob_.empty()) {
    return true;
  }

  DCHECK(!matcher_.pattern().empty());

  matcher_.input(reflex::Input(str.data(), str.size()));

  bool use_find = starts_with_star_ || ends_with_star_;
  if (!use_find) {
    return matcher_.matches() > 0;
  }

  bool found = matcher_.find() > 0;
  if (!found) {
    return false;
  }

  if (!ends_with_star_ && matcher_.last() != str.size()) {
    return false;
  }
  if (!starts_with_star_ && matcher_.first() != 0) {
    return false;
  }

  return true;
#elif defined(USE_PCRE2)
  if (!re_ || str.size() < 16) {
    return stringmatchlen(glob_.data(), glob_.size(), str.data(), str.size(), !case_sensitive_);
  }

  if (glob_.empty()) {
    return true;
  }

  int rc = pcre2_jit_match(re_, (PCRE2_SPTR)str.data(), str.size(), 0, 0, match_data_, NULL);
  return rc > 0;

#else
  return stringmatchlen(glob_.data(), glob_.size(), str.data(), str.size(), !case_sensitive_);
#endif
}

GlobMatcher::~GlobMatcher() {
#ifdef REFLEX_PERFORMANCE
#elif defined(USE_PCRE2)
  if (re_) {
    pcre2_code_free(re_);
    pcre2_match_data_free(match_data_);
  }
#endif
}

}  // namespace dfly


================================================
FILE: src/core/glob_matcher.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <reflex/matcher.h>

#include <string>
#include <string_view>

// We opt for using Reflex library for glob matching.
// While I find PCRE2 faster, it's not substantially faster to justify the shared lib dependency.

// For some regex, Reflex (and pcre2) have extremely slow compile times(70+ms).
// This latency is significant for the hot path and therefore both are disabled
// and we fall back to the plain old stringmatchlen. For more info, refer to #5547 on gh.
//#define REFLEX_PERFORMANCE

#ifndef REFLEX_PERFORMANCE
#ifdef USE_PCRE2
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#endif
#endif

namespace dfly {

class GlobMatcher {
  GlobMatcher(const GlobMatcher&) = delete;
  GlobMatcher& operator=(const GlobMatcher&) = delete;

 public:
  explicit GlobMatcher(std::string_view pattern, bool case_sensitive);
  ~GlobMatcher();

  bool Matches(std::string_view str) const;

  // Exposed for testing purposes.
  static std::string Glob2Regex(std::string_view glob);

 private:
  // TODO: we fix the problem of stringmatchlen being much
  // faster when the result is immediately known to be false, for example: "a*" vs "bxxxxx".
  // The goal is to demonstrate on-par performance for the following case:
  // > debug populate 5000000 keys 32 RAND
  // > while true; do time valkey-cli scan 0 match 'foo*bar'; done
  // Also demonstrate that the "improved" performance via SCAN command and not only via
  // micro-benchmark.
  // The performance of naive algorithm becomes worse in cases where string is long enough,
  // and the pattern has a star at the start (or it matches at first).
#ifdef REFLEX_PERFORMANCE
  mutable reflex::Matcher matcher_;

  bool starts_with_star_ = false;
  bool ends_with_star_ = false;
#elif defined(USE_PCRE2)
  pcre2_code_8* re_ = nullptr;
  pcre2_match_data_8* match_data_ = nullptr;
#endif
  std::string_view glob_;
  bool case_sensitive_;
};

}  // namespace dfly


================================================
FILE: src/core/huff_coder.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/huff_coder.h"

#include "base/logging.h"

extern "C" {
#include "huff/huf.h"
}

using namespace std;

namespace dfly {

constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;

bool HuffmanEncoder::Load(std::string_view binary_data, std::string* error_msg) {
  CHECK(!huf_ctable_);

  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(255)]);
  table_max_symbol_ = 255;

  unsigned has_zero_weights = 0;
  size_t read_size = HUF_readCTable(huf_ctable_.get(), &table_max_symbol_, binary_data.data(),
                                    binary_data.size(), &has_zero_weights);

  if (HUF_isError(read_size)) {
    huf_ctable_.reset();
    *error_msg = HUF_getErrorName(read_size);
    return false;
  }
  if (read_size != binary_data.size()) {
    *error_msg = "Corrupted data";
    huf_ctable_.reset();
    return false;
  }
  HUF_CTableHeader header = HUF_readCTableHeader(huf_ctable_.get());
  num_bits_ = header.tableLog;
  table_max_symbol_ = header.maxSymbolValue;

  return true;
}

bool HuffmanEncoder::Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg) {
  CHECK(!huf_ctable_);
  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(max_symbol)]);

  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);

  size_t num_bits =
      HUF_buildCTable_wksp(huf_ctable_.get(), hist, max_symbol, 0, wrkspace.get(), kWspSize);
  if (HUF_isError(num_bits)) {
    *error_msg = HUF_getErrorName(num_bits);
    huf_ctable_.reset();
    return false;
  }
  num_bits_ = static_cast<uint8_t>(num_bits);
  table_max_symbol_ = max_symbol;
  return true;
}

void HuffmanEncoder::Reset() {
  huf_ctable_.reset();
  table_max_symbol_ = 0;
}

bool HuffmanEncoder::Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
                            std::string* error_msg) const {
  DCHECK(huf_ctable_);

  size_t res =
      HUF_compress1X_usingCTable(dest, *dest_size, data.data(), data.size(), huf_ctable_.get(), 0);

  if (HUF_isError(res)) {
    *error_msg = HUF_getErrorName(res);
    return false;
  }
  *dest_size = static_cast<uint32_t>(res);
  return true;
}

unsigned HuffmanEncoder::GetNBits(uint8_t symbol) const {
  DCHECK(huf_ctable_);
  return HUF_getNbBitsFromCTable(huf_ctable_.get(), symbol);
}

size_t HuffmanEncoder::EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const {
  DCHECK(huf_ctable_);
  size_t res = HUF_estimateCompressedSize(huf_ctable_.get(), hist, max_symbol);
  return res;
}

string HuffmanEncoder::Export() const {
  DCHECK(huf_ctable_);

  // Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
  constexpr unsigned kMaxTableSize = 130;
  string res;
  res.resize(kMaxTableSize);

  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);

  // Seems we can reuse the same workspace, its capacity is enough.
  size_t size = HUF_writeCTable_wksp(res.data(), res.size(), huf_ctable_.get(), table_max_symbol_,
                                     num_bits_, wrkspace.get(), kWspSize);
  CHECK(!HUF_isError(size));
  res.resize(size);
  return res;
}

// Copied from HUF_tightCompressBound.
size_t HuffmanEncoder::CompressedBound(size_t src_size) const {
  return ((src_size * num_bits_) >> 3) + 8;
}

bool HuffmanDecoder::Load(std::string_view binary_data, std::string* error_msg) {
  DCHECK(!huf_dtable_);
  huf_dtable_.reset(new HUF_DTable[HUF_DTABLE_SIZE(HUF_TABLELOG_MAX)]);
  huf_dtable_[0] = (HUF_TABLELOG_MAX - 1) * 0x01000001;  // some sort of magic number

  constexpr size_t kWspSize = HUF_DECOMPRESS_WORKSPACE_SIZE;
  unique_ptr<uint8_t[]> wrksp(new uint8_t[kWspSize]);

  size_t res = HUF_readDTableX1_wksp(huf_dtable_.get(), binary_data.data(), binary_data.size(),
                                     wrksp.get(), kWspSize, 0);
  if (HUF_isError(res)) {
    *error_msg = HUF_getErrorName(res);
    huf_dtable_.reset();
    return false;
  }
  if (res != binary_data.size()) {
    *error_msg = "Corrupted data";
    huf_dtable_.reset();
    return false;
  }
  return true;
}

bool HuffmanDecoder::Decode(std::string_view src, size_t dest_size, char* dest) const {
  DCHECK(huf_dtable_);
  size_t res =
      HUF_decompress1X_usingDTable(dest, dest_size, src.data(), src.size(), huf_dtable_.get(), 1);

  if (HUF_isError(res)) {
    LOG(DFATAL) << "Failed to decompress: " << HUF_getErrorName(res);
    return false;
  }
  return true;
}

}  // namespace dfly


================================================
FILE: src/core/huff_coder.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <memory>
#include <string_view>

namespace dfly {

class HuffmanEncoder {
 public:
  bool Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg);

  bool Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
              std::string* error_msg) const;

  size_t EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const;

  void Reset();

  // Load using the serialized data produced by Export().
  bool Load(std::string_view binary_data, std::string* error_msg);

  // Exports a binary representation of the table, that can be loaded using Load().
  std::string Export() const;

  uint8_t num_bits() const {
    return num_bits_;
  }

  bool valid() const {
    return bool(huf_ctable_);
  }

  unsigned max_symbol() const {
    return table_max_symbol_;
  }

  unsigned GetNBits(uint8_t symbol) const;

  // Estimation of the size of the destination buffer needed to store the compressed data.
  // destination of this size must be passed to Encode().
  size_t CompressedBound(size_t src_size) const;

 private:
  using HUF_CElt = size_t;
  std::unique_ptr<HUF_CElt[]> huf_ctable_;
  unsigned table_max_symbol_ = 0;
  uint8_t num_bits_ = 0;
};

class HuffmanDecoder {
 public:
  bool Load(std::string_view binary_data, std::string* error_msg);
  bool valid() const {
    return bool(huf_dtable_);
  }

  // decoded_size should be the *precise* size of the decoded data, otherwise the function will
  // fail. dest should point to a buffer of at least decoded_size bytes.
  // Returns true if decompression was successful, false if the data is corrupted.
  bool Decode(std::string_view src, size_t decoded_size, char* dest) const;

 private:
  using HUF_DTable = uint32_t;
  std::unique_ptr<HUF_DTable[]> huf_dtable_;
};

}  // namespace dfly


================================================
FILE: src/core/intent_lock.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include <assert.h>

#include <ostream>

#pragma once

namespace dfly {

// SHARED - can be acquired multiple times as long as other intents are absent.
// EXCLUSIVE - is acquired only if it's the only lock recorded.
// Transactions at the head of tx-queue are considered to be the ones that acquired the lock
class IntentLock {
 public:
  enum Mode { SHARED = 0, EXCLUSIVE = 1 };

  // Returns true if lock was acquired. In any case, the intent is recorded.
  bool Acquire(Mode m) {
    ++cnt_[m];

    if (cnt_[1 ^ int(m)])
      return false;
    return m == SHARED || cnt_[EXCLUSIVE] == 1;
  }

  // Returns true if lock can be acquired using `m` mode.
  bool Check(Mode m) const {
    unsigned s = cnt_[EXCLUSIVE];
    if (s)
      return false;

    return (m == SHARED) ? true : cnt_[SHARED] == 0;
  }

  // Returns true if this lock would block transactions from running unless they are at the head
  // of the transaction queue (first ones)
  bool IsContended() const {
    return (cnt_[EXCLUSIVE] > 1) || (cnt_[EXCLUSIVE] == 1 && cnt_[SHARED] > 0);
  }

  // A heuristic function to estimate the contention amount with a single score.
  unsigned ContentionScore() const {
    return cnt_[EXCLUSIVE] * 256 + cnt_[SHARED];
  }

  void Release(Mode m, unsigned val = 1) {
    assert(cnt_[m] >= val);

    cnt_[m] -= val;
    // return cnt_[m] == 0 ? cnt_[1 ^ int(m)] : 0;
  }

  bool IsFree() const {
    return (cnt_[0] | cnt_[1]) == 0;
  }

  static const char* ModeName(Mode m);

  void VerifyDebug();

  friend std::ostream& operator<<(std::ostream& o, const IntentLock& lock) {
    return o << "{SHARED: " << lock.cnt_[0] << ", EXCLUSIVE: " << lock.cnt_[1] << "}";
  }

 private:
  unsigned cnt_[2] = {0, 0};
};

}  // namespace dfly


================================================
FILE: src/core/interpreter.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/interpreter.h"

#include <absl/base/casts.h>
#include <absl/container/fixed_array.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <absl/time/clock.h>
#include <mimalloc.h>
#include <openssl/evp.h>
#include <xxhash.h>

#include <cstring>
#include <optional>
#include <regex>
#include <set>
#include <variant>

#include "base/flags.h"
#include "core/interpreter_polyfill.h"
#include "overloaded.h"

extern "C" {
#include <lauxlib.h>
#include <lua.h>
#include <lualib.h>

#include "redis/sds.h"
#include "redis/util.h"

LUALIB_API int(luaopen_cjson)(lua_State* L);
LUALIB_API int(luaopen_struct)(lua_State* L);
LUALIB_API int(luaopen_cmsgpack)(lua_State* L);
LUALIB_API int(luaopen_bit)(lua_State* L);
}

#include <absl/strings/str_format.h>

#include "base/logging.h"

struct LuaGcGen {
  int minormul = 20;
  int majormul = 100;
};
struct LuaGcInc {
  int pause = 200;
  int stepmul = 100;
  int stepsize = 13;
};

using LuaGcFlag = std::variant<std::monostate, LuaGcGen, LuaGcInc>;

ABSL_FLAG(LuaGcFlag, luagc, {},
          "Specifies Lua garabage collector preferences. By default used default lua GC parameters."
          "Format should be 'inc/200/100/13' or 'gen/20/100' where 'inc' and 'gen' are types of "
          "GC, numbers are parameters."
          "For more information check https://www.lua.org/manual/5.4/manual.html#2.5");

ABSL_FLAG(uint64_t, lua_mem_gc_threshold, 10000000,
          "Specifies Lua interpreter's per thread memory limit in bytes after which the GC will be "
          "called forcefully. 0 value remove forced GC calls");

ABSL_FLAG(bool, lua_enable_redis_log, false, "Enable redis.log to write logs from lua script.");

static bool AbslParseFlag(std::string_view in, LuaGcFlag* flag, std::string* err) {
  if (in.empty()) {
    *flag = LuaGcFlag{};
    return true;
  }
  std::vector<std::string_view> parts = absl::StrSplit(in, '/');
  if (parts.size() == 3) {
    if (parts[0] == "gen") {
      LuaGcGen args;
      if (absl::SimpleAtoi(parts[1], &args.minormul) &&
          absl::SimpleAtoi(parts[2], &args.majormul)) {
        *flag = args;
        return true;
      }
    }
  } else if (parts.size() == 4) {
    if (parts[0] == "inc") {
      LuaGcInc args;
      if (absl::SimpleAtoi(parts[1], &args.pause) && absl::SimpleAtoi(parts[2], &args.stepmul) &&
          absl::SimpleAtoi(parts[3], &args.stepsize)) {
        *flag = LuaGcFlag{args};
        return true;
      }
    }
  }
  *err = absl::StrCat("Invalid luagc flag parameters");
  return false;
}

static std::string AbslUnparseFlag(const LuaGcFlag& flag) {
  return std::visit(dfly::Overloaded{
                        [](std::monostate) { return std::string(); },
                        [](const LuaGcGen& gen) {
                          return absl::StrCat("gen", "/", gen.minormul, "/", gen.majormul);
                        },
                        [](const LuaGcInc& inc) {
                          return absl::StrCat("inc", "/", inc.pause, "/", inc.stepmul, "/",
                                              inc.stepsize);
                        },
                    },
                    flag);
}

namespace dfly {
using namespace std;

namespace {

// EVP_Q_digest is not present in the older versions of OpenSSL.
int EVPDigest(const void* data, size_t datalen, unsigned char* md, size_t* mdlen) {
  unsigned int temp = 0;
  int ret = EVP_Digest(data, datalen, md, &temp, EVP_sha1(), NULL);

  if (mdlen != NULL)
    *mdlen = temp;
  return ret;
}

/* This function is used in order to push an error on the Lua stack in the
 * format used by redis.pcall to return errors, which is a lua table
 * with a single "err" field set to the error string. Note that this
 * table is never a valid reply by proper commands, since the returned
 * tables are otherwise always indexed by integers, never by strings. */
void PushError(lua_State* lua, string_view error, bool trace = true) {
  lua_Debug dbg;

  lua_newtable(lua);
  lua_pushstring(lua, "err");

  /* Attempt to figure out where this function was called, if possible */
  if (trace && lua_getstack(lua, 1, &dbg) && lua_getinfo(lua, "nSl", &dbg)) {
    string msg = absl::StrCat(dbg.source, ": ", dbg.currentline, ": ", error);
    lua_pushlstring(lua, msg.c_str(), msg.size());
  } else {
    lua_pushlstring(lua, error.data(), error.size());
  }
  lua_settable(lua, -3);
}

// Custom object explorer that collects all values into string array
struct StringCollectorTranslator : public ObjectExplorer {
  void OnString(std::string_view str) final {
    values.emplace_back(str);
  }
  void OnArrayStart(unsigned len) final {
    // if values is n't empty it means we can not predict the needed size so reserve can
    // significantly decrease performance
    if (values.empty()) {
      values.reserve(len);
    }
  }
  void OnArrayEnd() final {
  }
  void OnBool(bool b) final {
    OnString(absl::AlphaNum(b).Piece());
  }
  void OnDouble(double d) final {
    OnString(absl::AlphaNum(d).Piece());
  }
  void OnInt(int64_t val) final {
    OnString(absl::AlphaNum(val).Piece());
  }
  void OnNil() final {
    OnString("");
  }
  void OnStatus(std::string_view str) final {
    OnString(str);
  }
  void OnError(std::string_view str) final {
    LOG(ERROR) << str;
  }

  vector<string> values;
};

class RedisTranslator : public ObjectExplorer {
 public:
  RedisTranslator(lua_State* lua) : lua_(lua) {
  }
  void OnBool(bool b) final;
  void OnString(std::string_view str) final;
  void OnDouble(double d) final;
  void OnInt(int64_t val) final;
  void OnArrayStart(unsigned len) final;
  void OnArrayEnd() final;
  void OnNil() final;
  void OnStatus(std::string_view str) final;
  void OnError(std::string_view str) final;

  bool HasError();

 private:
  void ArrayPre() {
  }

  void ArrayPost() {
    if (!array_index_.empty()) {
      lua_rawseti(lua_, -2, array_index_.back()++); /* set table at key `i' */
    }
  }

  lua_State* lua_;
  bool has_error_{false};
  vector<unsigned> array_index_{};
};

void RedisTranslator::OnBool(bool b) {
  CHECK(!b) << "Only false (nil) supported";
  ArrayPre();
  lua_pushboolean(lua_, 0);
  ArrayPost();
}

void RedisTranslator::OnString(std::string_view str) {
  ArrayPre();
  lua_pushlstring(lua_, str.data(), str.size());
  ArrayPost();
}

void RedisTranslator::OnDouble(double d) {
  const double kConvertEps = std::numeric_limits<double>::epsilon();

  double fractpart, intpart;
  fractpart = modf(d, &intpart);

  ArrayPre();

  // Convert to integer when possible to allow converting to string without trailing zeros.
  if (abs(fractpart) < kConvertEps && intpart < double(std::numeric_limits<lua_Integer>::max()) &&
      intpart > std::numeric_limits<lua_Integer>::min())
    lua_pushinteger(lua_, static_cast<lua_Integer>(d));
  else
    lua_pushnumber(lua_, d);
  ArrayPost();
}

void RedisTranslator::OnInt(int64_t val) {
  ArrayPre();
  lua_pushinteger(lua_, val);
  ArrayPost();
}

void RedisTranslator::OnNil() {
  ArrayPre();
  lua_pushboolean(lua_, 0);
  ArrayPost();
}

void RedisTranslator::OnStatus(std::string_view str) {
  CHECK(array_index_.empty()) << "unexpected status";
  lua_createtable(lua_, 0, 1);
  lua_pushstring(lua_, "ok");
  lua_pushlstring(lua_, str.data(), str.size());
  lua_settable(lua_, -3);
}

void RedisTranslator::OnError(std::string_view str) {
  has_error_ = true;
  PushError(lua_, str, false);
}

void RedisTranslator::OnArrayStart(unsigned len) {
  ArrayPre();
  lua_createtable(lua_, len, 0);
  array_index_.push_back(1);
}

void RedisTranslator::OnArrayEnd() {
  CHECK(!array_index_.empty());
  DCHECK(lua_istable(lua_, -1));

  array_index_.pop_back();
  ArrayPost();
}

bool RedisTranslator::HasError() {
  return has_error_;
}

void RunSafe(lua_State* lua, string_view buf, const char* name) {
  CHECK_EQ(0, luaL_loadbuffer(lua, buf.data(), buf.size(), name));
  int err = lua_pcall(lua, 0, 0, 0);
  if (err) {
    const char* errstr = lua_tostring(lua, -1);
    LOG(FATAL) << "Error running " << name << " " << errstr;
  }
}

void Require(lua_State* lua, const char* name, lua_CFunction openf) {
  luaL_requiref(lua, name, openf, 1);
  lua_pop(lua, 1); /* remove lib */
}

string_view TopSv(lua_State* lua) {
  return string_view{lua_tostring(lua, -1), lua_rawlen(lua, -1)};
}

optional<int> FetchKey(lua_State* lua, const char* key) {
  lua_pushcfunction(lua, [](lua_State* lua) -> int {
    lua_gettable(lua, -3);
    return 1;
  });
  lua_pushstring(lua, key);
  int status = lua_pcall(lua, 1, 1, 0);
  if (status != LUA_OK) {
    lua_pop(lua, 1);
    return nullopt;
  }
  int type = lua_type(lua, -1);
  if (type == LUA_TNIL) {
    lua_pop(lua, 1);
    return nullopt;
  }
  return type;
}

void SetGlobalArrayInternal(lua_State* lua, const char* name, Interpreter::SliceSpan args) {
  lua_createtable(lua, args.size(), 0);
  for (size_t j = 0; j < args.size(); j++) {
    lua_pushlstring(lua, args[j].data(), args[j].size());
    lua_rawseti(lua, -2, j + 1);
  }
  lua_setglobal(lua, name);
}

/* In case the error set into the Lua stack by PushError() was generated
 * by the non-error-trapping version of redis.pcall(), which is redis.call(),
 * this function will raise the Lua error so that the execution of the
 * script will be halted.
 * This function never returns, it unwinds the Lua call stack until an error handler is found or the
 * script exits */
int RaiseErrorAndAbort(lua_State* lua) {
  lua_pushstring(lua, "err");
  lua_gettable(lua, -2);
  return lua_error(lua);
}

void LoadLibrary(lua_State* lua, const char* libname, lua_CFunction luafunc) {
  lua_pushcfunction(lua, luafunc);
  lua_pushstring(lua, libname);
  lua_call(lua, 1, 0);
}

void InitLua(lua_State* lua) {
  Require(lua, "", luaopen_base);
  Require(lua, LUA_TABLIBNAME, luaopen_table);
  Require(lua, LUA_STRLIBNAME, luaopen_string);
  Require(lua, LUA_MATHLIBNAME, luaopen_math);
  Require(lua, LUA_DBLIBNAME, luaopen_debug);

  LoadLibrary(lua, "cjson", luaopen_cjson);
  LoadLibrary(lua, "struct", luaopen_struct);
  LoadLibrary(lua, "cmsgpack", luaopen_cmsgpack);
  LoadLibrary(lua, "bit", luaopen_bit);

  /* Add a helper function we use for pcall error reporting.
   * Note that when the error is in the C function we want to report the
   * information about the caller, that's what makes sense from the point
   * of view of the user debugging a script. */
  {
    const char errh_func[] =
        "local dbg = debug\n"
        "function __redis__err__handler(err)\n"
        "  local i = dbg.getinfo(2,'nSl')\n"
        "  if i and i.what == 'C' then\n"
        "    i = dbg.getinfo(3,'nSl')\n"
        "  end\n"
        "  if i then\n"
        "    return i.source .. ':' .. i.currentline .. ': ' .. err\n"
        "  else\n"
        "    return err\n"
        "  end\n"
        "end\n";
    RunSafe(lua, errh_func, "@err_handler_def");
  }

  {
    const char code[] = R"(
local dbg=debug
local mt = {}

setmetatable(_G, mt)
mt.__newindex = function (t, n, v)
  if dbg.getinfo(2) then
    local w = dbg.getinfo(2, "S").what
    if w ~= "main" and w ~= "C" then
      error("Script attempted to create global variable '"..tostring(n).."'", 2)
    end
  end
  rawset(t, n, v)
end
mt.__index = function (t, n)
  if dbg.getinfo(2) and dbg.getinfo(2, "S").what ~= "C" then
    error("Script attempted to access nonexistent global variable '"..tostring(n).."'", 2)
  end
  return rawget(t, n)
end
debug = nil
)";
    RunSafe(lua, code, "@enable_strict_lua");
  }

  lua_pushnil(lua);
  lua_setglobal(lua, "loadfile");
  lua_pushnil(lua);
  lua_setglobal(lua, "dofile");

  // Register deprecated or removed functions to maintain compatibility with 5.1
  register_polyfills(lua);
}

// dest must have at least 41 chars.
void ToHex(const uint8_t* src, char* dest) {
  const char cset[] = "0123456789abcdef";
  for (size_t j = 0; j < 20; j++) {
    dest[j * 2] = cset[((src[j] & 0xF0) >> 4)];
    dest[j * 2 + 1] = cset[(src[j] & 0xF)];
  }
  dest[40] = '\0';
}

int DragonflyHashCommand(lua_State* lua) {
  XXH64_hash_t hash = absl::bit_cast<XXH64_hash_t>(lua_tointeger(lua, 1));
  bool requires_sort = lua_toboolean(lua, 2);

  // Pop first two arguments to call RedisGenericCommand from this function with tail
  lua_remove(lua, 1);
  lua_remove(lua, 1);

  // Compute key hash; for MGET hash all key arguments, otherwise just the first
  {
    size_t cmd_len;
    const char* cmd = lua_tolstring(lua, 1, &cmd_len);
    int top = lua_gettop(lua);
    int key_end = absl::EqualsIgnoreCase(absl::string_view(cmd, cmd_len), "mget") ? top : 2;
    for (int i = 2; i <= key_end; ++i) {
      size_t len;
      const char* key = lua_tolstring(lua, i, &len);
      hash = XXH64(key, len, hash);
    }
  }

  // Collect output into custom string collector
  StringCollectorTranslator translator;
  void** ptr = static_cast<void**>(lua_getextraspace(lua));
  reinterpret_cast<Interpreter*>(*ptr)->RedisGenericCommand(false, false, &translator);

  if (requires_sort)
    sort(translator.values.begin(), translator.values.end());

  // Compute new hash and return it
  for (string_view str : translator.values)
    hash = XXH64(str.data(), str.size(), hash);

  lua_pushinteger(lua, absl::bit_cast<lua_Integer>(hash));
  return 1;
}

int DragonflyRandstrCommand(lua_State* state) {
  int argc = lua_gettop(state);
  lua_Integer dsize = lua_tonumber(state, 1);
  lua_remove(state, 1);

  std::string buf(dsize, ' ');

  auto push_str = [dsize, state, &buf]() {
    static const char alphanum[] =
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz";

    static const char pattern[] = "DRAGONFLY";
    constexpr int pattern_len = sizeof(pattern) - 1;
    constexpr int pattern_interval = 53;
    for (int i = 0; i < dsize; ++i) {
      if (i % pattern_interval == 0 && i + pattern_len <= dsize) {
        // Insert the repeating pattern for better compression of random string.
        buf.replace(i, pattern_len, pattern, pattern_len);
        i += pattern_len - 1;  // Adjust index to skip the pattern
      } else {
        // Fill the rest with semi-random characters for variation
        buf[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
      }
    }
    lua_pushlstring(state, buf.c_str(), buf.length());
  };

  if (argc == 1) {
    push_str();
  } else {
    lua_Integer num = lua_tonumber(state, 1);
    lua_createtable(state, num, 0);
    for (int i = 1; i <= num; i++) {
      push_str();
      lua_rawseti(state, -2, i);
    }
  }

  return 1;
}

int RedisSha1Command(lua_State* lua) {
  int argc = lua_gettop(lua);
  if (argc != 1) {
    lua_pushstring(lua, "wrong number of arguments");
    return lua_error(lua);
  }

  size_t len;
  const char* s = lua_tolstring(lua, 1, &len);

  uint8_t digest[EVP_MAX_MD_SIZE];
  EVPDigest(s, len, digest, NULL);

  char hex[41];
  ToHex(digest, hex);

  lua_pushstring(lua, hex);
  return 1;
}

/* Returns a table with a single field 'field' set to the string value
 * passed as argument. This helper function is handy when returning
 * a Redis Protocol error or status reply from Lua:
 *
 * return redis.error_reply("ERR Some Error")
 * return redis.status_reply("ERR Some Error")
 */
int SingleFieldTable(lua_State* lua, const char* field) {
  if (lua_gettop(lua) != 1 || lua_type(lua, -1) != LUA_TSTRING) {
    PushError(lua, "wrong number or type of arguments");
    return 1;
  }

  lua_newtable(lua);
  lua_pushstring(lua, field);
  lua_pushvalue(lua, -3);
  lua_settable(lua, -3);
  return 1;
}

int RedisErrorReplyCommand(lua_State* lua) {
  return SingleFieldTable(lua, "err");
}

int RedisStatusReplyCommand(lua_State* lua) {
  return SingleFieldTable(lua, "ok");
}

// no-op
int RedisReplicateCommands(lua_State* lua) {
  lua_pushinteger(lua, 1);
  // number of results (the number of elements pushed to the lua stack
  return 1;
}

int RedisLogCommand(lua_State* lua) {
  int j, argc = lua_gettop(lua);
  sds log;

  if (argc < 2) {
    PushError(lua, "redis.log() requires two arguments or more.");
    return RaiseErrorAndAbort(lua);
  } else if (!lua_isnumber(lua, -argc)) {
    PushError(lua, "First argument must be a number (log level).");
    return RaiseErrorAndAbort(lua);
  }

  if (absl::GetFlag(FLAGS_lua_enable_redis_log)) {
    int level = lua_tonumber(lua, -argc);
    if (level < LL_DEBUG || level > LL_WARNING) {
      PushError(lua, "Invalid log level.");
      return RaiseErrorAndAbort(lua);
    }

    /* Glue together all the arguments */
    log = sdsempty();
    for (j = 1; j < argc; j++) {
      size_t len;
      char* s;

      s = (char*)lua_tolstring(lua, (-argc) + j, &len);
      if (s) {
        if (j != 1)
          log = sdscatlen(log, " ", 1);
        log = sdscatlen(log, s, len);
      }
    }

    switch (level) {
      case LL_DEBUG:
      case LL_VERBOSE:
        VLOG(1) << log;
        break;
      case LL_NOTICE:
        LOG(INFO) << log;
        break;
      case LL_WARNING:
        LOG(WARNING) << log;
      default:
        break;
    }
    sdsfree(log);
  }

  return 0;
}

// See https://www.lua.org/manual/5.3/manual.html#lua_Alloc
void* mimalloc_glue(void* ud, void* ptr, size_t osize, size_t nsize) {
  int64_t& used_bytes = *static_cast<int64_t*>(ud);

  if (nsize == 0) {
    used_bytes -= mi_usable_size(ptr);
    mi_free_size(ptr, osize);
    return nullptr;
  } else if (ptr == nullptr) {
    ptr = mi_malloc(nsize);
    used_bytes += mi_usable_size(ptr);
    return ptr;
  } else {
    const auto old_size = mi_usable_size(ptr);
    ptr = mi_realloc(ptr, nsize);
    if (ptr) {
      used_bytes -= old_size;
      used_bytes += mi_usable_size(ptr);
    }

    return ptr;
  }
}

}  // namespace

Interpreter::Interpreter() {
  InterpreterManager::tl_stats().interpreter_cnt++;

  // interpreter can be runnned in different threads so we need to calculate
  // used memory via &used_bytes_ additional parameter
  lua_ = lua_newstate(mimalloc_glue, &used_bytes_);
  InitLua(lua_);
  void** ptr = static_cast<void**>(lua_getextraspace(lua_));
  *ptr = this;
  // SaveOnRegistry(lua_, kInstanceKey, this);

  /* Register the dragonfly commands table and fields */
  lua_newtable(lua_);

  /* dragonfly.ihash - compute quick integer hash of command result */
  lua_pushstring(lua_, "ihash");
  lua_pushcfunction(lua_, DragonflyHashCommand);
  lua_settable(lua_, -3);

  /* dragonfly.randstr - generate random string or table of random strings */
  lua_pushstring(lua_, "randstr");
  lua_pushcfunction(lua_, DragonflyRandstrCommand);
  lua_settable(lua_, -3);

  /* Finally set the table as 'dragonfly' global var. */
  lua_setglobal(lua_, "dragonfly");
  CHECK(lua_checkstack(lua_, 64));

  /* Register the redis commands table and fields */
  lua_newtable(lua_);

  /* redis.call */
  lua_pushstring(lua_, "call");
  lua_pushcfunction(lua_, RedisCallCommand);
  lua_settable(lua_, -3);

  /* redis.pcall */
  lua_pushstring(lua_, "pcall");
  lua_pushcfunction(lua_, RedisPCallCommand);
  lua_settable(lua_, -3);

  /* redis.acall */
  lua_pushstring(lua_, "acall");
  lua_pushcfunction(lua_, RedisACallCommand);
  lua_settable(lua_, -3);

  /* redis.apcall */
  lua_pushstring(lua_, "apcall");
  lua_pushcfunction(lua_, RedisAPCallCommand);
  lua_settable(lua_, -3);

  lua_pushstring(lua_, "sha1hex");
  lua_pushcfunction(lua_, RedisSha1Command);
  lua_settable(lua_, -3);

  /* redis.error_reply and redis.status_reply */
  lua_pushstring(lua_, "error_reply");
  lua_pushcfunction(lua_, RedisErrorReplyCommand);
  lua_settable(lua_, -3);
  lua_pushstring(lua_, "status_reply");
  lua_pushcfunction(lua_, RedisStatusReplyCommand);
  lua_settable(lua_, -3);

  /* no-op functions */

  /* redis.replicate_commands*/
  lua_pushstring(lua_, "replicate_commands");
  lua_pushcfunction(lua_, RedisReplicateCommands);
  lua_settable(lua_, -3);

  /* redis.log*/
  lua_pushstring(lua_, "log");
  lua_pushcfunction(lua_, RedisLogCommand);
  lua_settable(lua_, -3);

  lua_pushinteger(lua_, LL_DEBUG);
  lua_setfield(lua_, -2, "LOG_DEBUG");

  lua_pushinteger(lua_, LL_VERBOSE);
  lua_setfield(lua_, -2, "LOG_VERBOSE");

  lua_pushinteger(lua_, LL_NOTICE);
  lua_setfield(lua_, -2, "LOG_NOTICE");

  lua_pushinteger(lua_, LL_WARNING);
  lua_setfield(lua_, -2, "LOG_WARNING");

  /* Finally set the table as 'redis' global var. */
  lua_setglobal(lua_, "redis");
  CHECK(lua_checkstack(lua_, 64));

  UpdateGCParameters();
}

Interpreter::~Interpreter() {
  InterpreterManager::tl_stats().interpreter_cnt--;

  lua_close(lua_);
}

void Interpreter::FuncSha1(string_view body, char* fp) {
  uint8_t digest[EVP_MAX_MD_SIZE];
  EVPDigest(body.data(), body.size(), digest, NULL);

  ToHex(digest, fp);
}

auto Interpreter::AddFunction(string_view sha, string_view body, string* result) -> AddResult {
  char funcname[43];
  funcname[0] = 'f';
  funcname[1] = '_';
  DCHECK(sha.size() == 40);
  memcpy(funcname + 2, sha.data(), sha.size());
  funcname[42] = '\0';

  int type = lua_getglobal(lua_, funcname);
  lua_pop(lua_, 1);

  if (type == LUA_TNIL && !AddInternal(funcname, body, result))
    return COMPILE_ERR;

  return type == LUA_TNIL ? ADD_OK : ALREADY_EXISTS;
}

bool Interpreter::Exists(string_view sha) const {
  DCHECK(lua_);

  if (sha.size() != 40)
    return false;

  char fname[43];
  fname[0] = 'f';
  fname[1] = '_';
  fname[42] = '\0';
  memcpy(fname + 2, sha.data(), 40);

  int type = lua_getglobal(lua_, fname);
  lua_pop(lua_, 1);

  return type == LUA_TFUNCTION;
}

auto Interpreter::RunFunction(string_view sha, std::string* error) -> RunResult {
  DVLOG(2) << "RunFunction " << sha << " " << lua_gettop(lua_);

  DCHECK_EQ(40u, sha.size());

  lua_getglobal(lua_, "__redis__err__handler");
  char fname[43];
  fname[0] = 'f';
  fname[1] = '_';
  memcpy(fname + 2, sha.data(), 40);
  fname[42] = '\0';

  int type = lua_getglobal(lua_, fname);
  if (type != LUA_TFUNCTION) {
    lua_pop(lua_, 2);

    return NOT_EXISTS;
  }

  // At this point lua stack has 2 globals.

  /* We have zero arguments and expect
   * a single return value. */
  int err = lua_pcall(lua_, 0, 1, -2);

  if (err) {
    *error = lua_tostring(lua_, -1);
  }

  return err == 0 ? RUN_OK : RUN_ERR;
}

void Interpreter::SetGlobalArray(const char* name, SliceSpan args) {
  SetGlobalArrayInternal(lua_, name, args);
}

optional<string> Interpreter::DetectPossibleAsyncCalls(string_view body_sv) {
  // We want to detect `redis.call` expressions with unused return values, i.e. they are a
  // standalone statement, not part of a expression, condition, function call or assignment.
  //
  // We search for all `redis.(p)call` statements, that are preceeded on the same line by
  // - `do` or `then` -> first statement in a new block, certainly unused value
  // - no tokens      -> we need to check the previous line, if its part of a multi-line expression.
  //
  // If we need to check the previous line, we search for the last word (before comments, if it has
  // one).
  static const regex kRegex{"(?:(\\S+)(\\s*--.*?)*\\s*\n|(then)|(do)|(^))\\s*redis\\.(p*call)"};

  // Taken from https://www.lua.org/manual/5.4/manual.html - 3.1 - Lexical conventions

  // If a line ends with it, then most likely the next line belongs to it as well
  static const set<string_view> kContOperators = {
      "+",  "-",  "*",  "/", "%", "^", "#", "&", "~", "|",  "<<", ">>", "//", "==",
      "~=", "<=", ">=", "<", ">", "=", "(", "{", "[", "::", ":",  ",",  ".",  ".."};

  // If a line ends with it, then most likely the next line belongs to it as well
  static const set<string_view> kContTokens = {"and",    "else",   "elseif", "for",  "goto",
                                               "if",     "in",     "local",  "not",  "or",
                                               "repeat", "return", "until",  "while"};

  auto last_n = [](const string& s, size_t n) {
    return s.size() < n ? s : s.substr(s.size() - n, n);
  };

  smatch sm;
  string body{body_sv};
  vector<size_t> targets;

  // We don't handle comment blocks yet.
  if (body.find("--[[") != string::npos)
    return {};

  sregex_iterator it{body.begin(), body.end(), kRegex};
  sregex_iterator end{};

  for (; it != end; it++) {
    auto last_word = it->str(1);

    if (kContOperators.count(last_n(last_word, 2)) > 0 ||
        kContOperators.count(last_n(last_word, 1)) > 0)
      continue;

    if (kContTokens.count(last_word) > 0)
      continue;

    targets.push_back(it->position(it->size() - 1));
  }

  if (targets.empty())
    return nullopt;

  // Insert 'a' before 'call' and 'pcall'. Reverse order to preserve positions
  reverse(targets.begin(), targets.end());
  body.reserve(body.size() + targets.size());
  for (auto pos : targets)
    body.insert(pos, "a");

  VLOG(1) << "Detected " << targets.size() << " aync calls in script";

  return body;
}

bool Interpreter::IsResultSafe() const {
  int top = lua_gettop(lua_);
  if (top >= 128)
    return false;

  int t = lua_type(lua_, -1);
  if (t != LUA_TTABLE)
    return true;

  bool res = IsTableSafe();

  // Stack can contain intermediate unwindings that were not clean up.
  DCHECK_GE(lua_gettop(lua_), top);
  lua_settop(lua_, top);  // restore to the original setting.

  return res;
}

bool Interpreter::AddInternal(const char* f_id, string_view body, string* error) {
  string script = absl::StrCat("function ", f_id, "() \n");
  absl::StrAppend(&script, body, "\nend");

  int res = luaL_loadbuffer(lua_, script.data(), script.size(), "@user_script");
  if (res == 0) {
    res = lua_pcall(lua_, 0, 0, 0);  // run func definition code
  }

  if (res) {
    error->assign(lua_tostring(lua_, -1));
    lua_pop(lua_, 1);  // Remove the error.

    return false;
  }

  return true;
}

// Stack is cleaned for us, we can leave it dirty
bool Interpreter::IsTableSafe() const {
  auto fres = FetchKey(lua_, "err");
  if (fres && *fres == LUA_TSTRING) {
    return true;
  }

  fres = FetchKey(lua_, "ok");
  if (fres && *fres == LUA_TSTRING) {
    return true;
  }

  // Copy root table because we remove it upon finishing traversal
  lua_pushnil(lua_);
  lua_copy(lua_, -2, -1);

  int depth = 1;
  lua_pushnil(lua_);

  // DFS based on lua stack: [parent-table] [parent-key] [parent-value = table] [key]
  while (depth > 0) {
    if (lua_checkstack(lua_, 3) == 0 || depth > 128)
      return false;

    bool descending = false;
    for (; lua_next(lua_, -2) != 0; lua_pop(lua_, 1)) {
      if (lua_type(lua_, -1) != LUA_TTABLE)
        continue;

      // If we descend, keep value as new table and push nil for start key
      depth++;
      lua_pushnil(lua_);
      descending = true;
      break;
    }

    if (!descending) {
      lua_pop(lua_, 1);
      depth--;
    }
  }

  return true;
}

void Interpreter::SerializeResult(ObjectExplorer* serializer) {
  int t = lua_type(lua_, -1);

  switch (t) {
    case LUA_TSTRING:
      serializer->OnString(TopSv(lua_));
      break;
    case LUA_TBOOLEAN:
      serializer->OnBool(lua_toboolean(lua_, -1));
      break;
    case LUA_TNUMBER:
      if (lua_isinteger(lua_, -1)) {
        serializer->OnInt(lua_tointeger(lua_, -1));
      } else {
        serializer->OnDouble(lua_tonumber(lua_, -1));
      }
      break;
    case LUA_TTABLE: {
      auto fres = FetchKey(lua_, "err");
      if (fres && *fres == LUA_TSTRING) {
        serializer->OnError(TopSv(lua_));
        lua_pop(lua_, 1);
        break;
      }

      fres = FetchKey(lua_, "ok");
      if (fres && *fres == LUA_TSTRING) {
        serializer->OnStatus(TopSv(lua_));
        lua_pop(lua_, 1);
        break;
      }

      fres = FetchKey(lua_, "map");
      if (fres && *fres == LUA_TTABLE) {
        // Calculate length of map part, there is sadly no other way
        unsigned len = 0;
        for (lua_pushnil(lua_); lua_next(lua_, -2) != 0; lua_pop(lua_, 1))
          len++;

        serializer->OnMapStart(len);
        for (lua_pushnil(lua_); lua_next(lua_, -2) != 0;) {
          // Push key to stack top: key value key
          lua_pushnil(lua_);
          lua_copy(lua_, -3, -1);
          SerializeResult(serializer);  // pops key
          SerializeResult(serializer);  // pop value
        }
        serializer->OnMapEnd();

        lua_pop(lua_, 2);
        break;
      }

      unsigned len = lua_rawlen(lua_, -1);

      serializer->OnArrayStart(len);
      for (unsigned i = 0; i < len; ++i) {
        t = lua_rawgeti(lua_, -1, i + 1);  // push table element

        // TODO: we should make sure that we have enough stack space
        // to traverse each object. This can be done as a dry-run before doing real serialization.
        // Once we are sure we are safe we can simplify the serialization flow and
        // remove the error factor.
        SerializeResult(serializer);  // pops the element
      }
      serializer->OnArrayEnd();
      break;
    }
    case LUA_TNIL:
      serializer->OnNil();
      break;
    default:
      LOG(ERROR) << "Unsupported type " << lua_typename(lua_, t);
      serializer->OnNil();
  }

  lua_pop(lua_, 1);
}

void Interpreter::ResetStack() {
  lua_settop(lua_, 0);
}

int64_t Interpreter::RunGC() {
  int64_t before_kb = lua_gc(lua_, LUA_GCCOUNT);
  lua_gc(lua_, LUA_GCCOLLECT);
  int64_t after_kb = lua_gc(lua_, LUA_GCCOUNT);
  LOG_IF(DFATAL, after_kb > before_kb) << "LUA_GCCOLLECT increase memory consumption from "
                                       << before_kb << "kB to " << after_kb << "kB";
  int64_t res = (before_kb - after_kb) * 1024;
  return std::max(int64_t(0), res);
}

void Interpreter::UpdateGCParameters() {
  auto gc = absl::GetFlag(FLAGS_luagc);

  std::visit(dfly::Overloaded{
                 [](std::monostate) {},
                 [&](const LuaGcGen& gen) { lua_gc(lua_, LUA_GCGEN, gen.minormul, gen.majormul); },
                 [&](const LuaGcInc& inc) {
                   lua_gc(lua_, LUA_GCINC, inc.pause, inc.stepmul, inc.stepsize);
                 },
             },
             gc);
}

std::optional<absl::FixedArray<std::string_view, 4>> Interpreter::PrepareArgs() {
  int argc = lua_gettop(lua_);
  /* Require at least one argument */
  if (argc == 0) {
    PushError(lua_, "Please specify at least one argument for redis.call()");
    return std::nullopt;
  }

  size_t blob_len = 0;
  char tmpbuf[64];

  // Determine size required for backing storage for all args.
  // Skip command name (idx=1), as its stored in a separate buffer.
  for (int idx = 2; idx <= argc; idx++) {
    switch (lua_type(lua_, idx)) {
      case LUA_TNUMBER:
        if (lua_isinteger(lua_, idx)) {
          blob_len += absl::AlphaNum(lua_tointeger(lua_, idx)).size();
        } else {
          int fmt_len = absl::SNPrintF(tmpbuf, sizeof(tmpbuf), "%.17g", lua_tonumber(lua_, idx));
          CHECK_GT(fmt_len, 0);
          blob_len += fmt_len;
        }
        continue;
      case LUA_TSTRING:
        blob_len += lua_rawlen(lua_, idx) + 1;
        continue;
      default:
        PushError(lua_, "Lua redis() command arguments must be strings or integers");
        return std::nullopt;
    }
  }

  absl::FixedArray<string_view, 4> args(argc);

  // Copy command name to name_buffer and set it as first arg.
  unsigned name_len = lua_rawlen(lua_, 1);
  if (name_len >= sizeof(name_buffer_)) {
    PushError(lua_, "Lua redis() command name too long");
    return std::nullopt;
  }

  memcpy(name_buffer_, lua_tostring(lua_, 1), name_len);
  args[0] = {name_buffer_, name_len};
  buffer_.resize(blob_len + 4, '\0');  // backing storage for args

  char* cur = buffer_.data();
  char* end = cur + blob_len;
  for (int idx = 2; idx <= argc; idx++) {
    size_t len = 0;
    switch (lua_type(lua_, idx)) {
      case LUA_TNUMBER:
        if (lua_isinteger(lua_, idx)) {
          char* next = absl::numbers_internal::FastIntToBuffer(lua_tointeger(lua_, idx), cur);
          len = next - cur;
        } else if (lua_isnumber(lua_, idx)) {
          // we pass `end - cur + 1` because we do not want to skip the last character
          // if it's the last argument.
          int fmt_len = absl::SNPrintF(cur, end - cur + 1, "%.17g", lua_tonumber(lua_, idx));
          CHECK_GT(fmt_len, 0);
          len = fmt_len;
        }
        break;
      case LUA_TSTRING:
        len = lua_rawlen(lua_, idx);
        memcpy(cur, lua_tostring(lua_, idx), len + 1);  // + 1 for null terminator
    };

    args[idx - 1] = {cur, len};
    cur += len;
  }

  /* Pop all arguments from the stack, we do not need them anymore
   * and this way we guaranty we will have room on the stack for the result. */
  lua_pop(lua_, argc);
  return args;
}

// Calls redis function
// Returns false if error needs to be raised.
bool Interpreter::CallRedisFunction(bool raise_error, bool async, ObjectExplorer* explorer,
                                    SliceSpan args) {
  // Calling with custom explorer is not supported with errors or async
  DCHECK(explorer == nullptr || (!raise_error && !async));

  // If no custom explorer is set, use default translator
  optional<RedisTranslator> translator;
  if (explorer == nullptr) {
    translator.emplace(lua_);
    explorer = &*translator;
  }
  cmd_depth_++;
  redis_func_(CallArgs{args, &buffer_, explorer, async, raise_error, &raise_error});
  cmd_depth_--;

  // Shrink reusable buffer if it's too big.
  if (buffer_.capacity() > 128) {
    buffer_.clear();
    buffer_.shrink_to_fit();
  }

  if (!translator)
    return true;

  // Raise error for regular 'call' command if needed.
  if (raise_error && translator->HasError()) {
    // error is already on top of stack
    return false;
  }

  if (!async)
    DCHECK_EQ(1, lua_gettop(lua_));

  return true;
}

// Returns number of results, which is always 1 in this case.
// Please note that lua resets the stack once the function returns so no need
// to unwind the stack manually in the function (though lua allows doing this).
int Interpreter::RedisGenericCommand(bool raise_error, bool async, ObjectExplorer* explorer) {
  /* By using Lua debug hooks it is possible to trigger a recursive call
   * to luaRedisGenericCommand(), which normally should never happen.
   * To make this function reentrant is futile and makes it slower, but
   * we should at least detect such a misuse, and abort. */
  if (cmd_depth_) {
    const char* recursion_warning =
        "luaRedisGenericCommand() recursive call detected. "
        "Are you doing funny stuff with Lua debug hooks?";
    PushError(lua_, recursion_warning);
    return 1;
  }

  if (!redis_func_) {
    PushError(lua_, "internal error - redis function not defined");
    if (raise_error) {
      return RaiseErrorAndAbort(lua_);
    }
    return 1;
  }

  // IMPORTANT! all allocations within this funciton must be freed
  // BEFORE calling RaiseErrorAndAbort in case of script error. RaiseErrorAndAbort
  // uses longjmp which bypasses stack unwinding and skips the destruction of objects.
  {
    std::optional<absl::FixedArray<std::string_view, 4>> args = PrepareArgs();
    if (args.has_value()) {
      raise_error = !CallRedisFunction(raise_error, async, explorer, SliceSpan{*args});
    }
  }
  if (!raise_error) {
    return 1;
  }
  return RaiseErrorAndAbort(lua_);  // this function never returns, it unwinds the Lua call stack
}

int Interpreter::RedisCallCommand(lua_State* lua) {
  void** ptr = static_cast<void**>(lua_getextraspace(lua));
  return reinterpret_cast<Interpreter*>(*ptr)->RedisGenericCommand(true, false);
}

int Interpreter::RedisPCallCommand(lua_State* lua) {
  void** ptr = static_cast<void**>(lua_getextraspace(lua));
  return reinterpret_cast<Interpreter*>(*ptr)->RedisGenericCommand(false, false);
}

int Interpreter::RedisACallCommand(lua_State* lua) {
  void** ptr = static_cast<void**>(lua_getextraspace(lua));
  return reinterpret_cast<Interpreter*>(*ptr)->RedisGenericCommand(true, true);
}

int Interpreter::RedisAPCallCommand(lua_State* lua) {
  void** ptr = static_cast<void**>(lua_getextraspace(lua));
  return reinterpret_cast<Interpreter*>(*ptr)->RedisGenericCommand(false, true);
}

InterpreterManager::Stats& InterpreterManager::Stats::operator+=(const Stats& other) {
  this->used_bytes += other.used_bytes;
  this->interpreter_cnt += other.interpreter_cnt;
  this->blocked_cnt += other.blocked_cnt;

  this->force_gc_calls += other.force_gc_calls;
  this->gc_duration_ns += other.gc_duration_ns;
  this->interpreter_return += other.interpreter_return;
  this->gc_freed_memory += other.gc_freed_memory;

  return *this;
}

InterpreterManager::Stats& InterpreterManager::tl_stats() {
  static thread_local Stats stats;
  return stats;
}

Interpreter* InterpreterManager::Get() {
  // Grow if none is available and we have unused capacity left.
  if (available_.empty() && storage_.size() < storage_.capacity()) {
    storage_.emplace_back();
    return &storage_.back();
  }

  bool blocked = waker_.await([this]() { return !available_.empty(); });
  tl_stats().blocked_cnt += (uint64_t)blocked;

  Interpreter* ir = available_.back();
  available_.pop_back();
  return ir;
}

void InterpreterManager::Return(Interpreter* ir) {
  const uint64_t max_memory_usage = absl::GetFlag(FLAGS_lua_mem_gc_threshold);
  using namespace chrono;
  ++tl_stats().interpreter_return;
  tl_stats().used_bytes += ir->TakeUsedBytes();
  if (max_memory_usage != 0 && tl_stats().used_bytes > max_memory_usage) {
    ++tl_stats().force_gc_calls;
    auto before = steady_clock::now();
    tl_stats().gc_freed_memory += ir->RunGC();

    VLOG(2) << "stats_used_bytes: " << tl_stats().used_bytes
            << " lua_mem_gc_threshold: " << max_memory_usage
            << " force_gc_calls: " << tl_stats().force_gc_calls
            << " freed_mem: " << tl_stats().gc_freed_memory;

    auto after = steady_clock::now();
    tl_stats().gc_duration_ns += duration_cast<nanoseconds>(after - before).count();
  }
  if (ir >= storage_.data() && ir < storage_.data() + storage_.size()) {
    available_.push_back(ir);
    waker_.notify();
  } else if (return_untracked_ > 0) {
    return_untracked_--;
    if (return_untracked_ == 0) {
      reset_ec_.notify();
    }
  } else {
    LOG(DFATAL) << "Returning untracked interpreter";
  }
}

void InterpreterManager::Reset() {
  lock_guard guard{reset_mu_};

  // we perform double buffer swapping with storage and wait for the old interepreters to be
  // returned.
  return_untracked_ = storage_.size() - available_.size();

  std::vector<Interpreter> next_storage;
  next_storage.reserve(storage_.capacity());
  next_storage.resize(storage_.size());
  next_storage.swap(storage_);

  available_.clear();
  for (auto& ir : storage_) {
    available_.push_back(&ir);
  }

  reset_ec_.await([this]() { return return_untracked_ == 0; });
  VLOG(1) << "InterpreterManager::Reset ended";
}

void InterpreterManager::Alter(std::function<void(Interpreter*)> modf) {
  vector<Interpreter*> taken;
  swap(taken, available_);  // swap data because modf can preempt

  for (Interpreter* ir : taken) {
    modf(ir);
    Return(ir);
  }
}

}  // namespace dfly


================================================
FILE: src/core/interpreter.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/fixed_array.h>
#include <absl/types/span.h>

#include <functional>
#include <optional>
#include <string_view>

#include "util/fibers/synchronization.h"

typedef struct lua_State lua_State;

namespace dfly {

class ObjectExplorer {
 public:
  virtual ~ObjectExplorer() = default;

  virtual void OnBool(bool b) = 0;
  virtual void OnString(std::string_view str) = 0;
  virtual void OnDouble(double d) = 0;
  virtual void OnInt(int64_t val) = 0;
  virtual void OnArrayStart(unsigned len) = 0;
  virtual void OnArrayEnd() = 0;
  virtual void OnNil() = 0;
  virtual void OnStatus(std::string_view str) = 0;
  virtual void OnError(std::string_view str) = 0;

  virtual void OnMapStart(unsigned len) {
    OnArrayStart(len * 2);
  }

  virtual void OnMapEnd() {
    OnArrayEnd();
  }
};

class Interpreter {
 public:
  using SliceSpan = absl::Span<const std::string_view>;

  // Arguments received from redis.call
  struct CallArgs {
    // Full arguments, including cmd name.
    SliceSpan args;

    // Pointer to backing storage for args (excluding cmd name).
    // Moving can invalidate arg slice pointers. Moved by async to re-use buffer.
    std::string* buffer;

    ObjectExplorer* translator;

    bool async;        // async by acall
    bool error_abort;  // abort on errors (not pcall)

    // The function can request an abort due to an error, even if error_abort is false.
    // It happens when async cmds are flushed and result in an uncatched error.
    bool* requested_abort;
  };

  using RedisFunc = std::function<void(CallArgs)>;

  Interpreter();
  ~Interpreter();

  Interpreter(const Interpreter&) = delete;
  void operator=(const Interpreter&) = delete;

  Interpreter(Interpreter&&) = default;
  Interpreter& operator=(Interpreter&&) = default;

  // Note: We leak the state for now.
  // Production code should not access this method.
  lua_State* lua() {
    return lua_;
  }

  enum AddResult {
    ADD_OK = 0,
    ALREADY_EXISTS = 1,
    COMPILE_ERR = 2,
  };

  // Add function with sha and body to interpreter.
  AddResult AddFunction(std::string_view sha, std::string_view body, std::string* error);

  int64_t TakeUsedBytes() {
    return std::exchange(used_bytes_, 0);
  }

  bool Exists(std::string_view sha) const;

  enum RunResult {
    RUN_OK = 0,
    NOT_EXISTS = 1,
    RUN_ERR = 2,
  };

  void SetGlobalArray(const char* name, SliceSpan args);

  // Runs already added function sha returned by a successful call to AddFunction().
  // Returns: true if the call succeeded, otherwise fills error and returns false.
  // sha must be 40 char length.
  RunResult RunFunction(std::string_view sha, std::string* err);

  // Checks whether the result is safe to serialize.
  // Should fit 2 conditions:
  // 1. Be the only value on the stack.
  // 2. Should have depth of no more than 128.
  bool IsResultSafe() const;

  void SerializeResult(ObjectExplorer* serializer);

  void ResetStack();

  // run gc and returns size of freed memory in bytes
  int64_t RunGC();

  void UpdateGCParameters();

  // fp must point to buffer with at least 41 chars.
  // fp[40] will be set to '\0'.
  static void FuncSha1(std::string_view body, char* fp);

  static std::optional<std::string> DetectPossibleAsyncCalls(std::string_view body);

  template <typename U> void SetRedisFunc(U&& u) {
    redis_func_ = std::forward<U>(u);
  }

  // Invoke command with arguments from lua stack, given options and possibly custom explorer
  int RedisGenericCommand(bool raise_error, bool async, ObjectExplorer* explorer = nullptr);

 private:
  // Returns true if function was successfully added,
  // otherwise returns false and sets the error.
  bool AddInternal(const char* f_id, std::string_view body, std::string* error);
  bool IsTableSafe() const;

  static int RedisCallCommand(lua_State* lua);
  static int RedisPCallCommand(lua_State* lua);
  static int RedisACallCommand(lua_State* lua);
  static int RedisAPCallCommand(lua_State* lua);

  std::optional<absl::FixedArray<std::string_view, 4>> PrepareArgs();
  bool CallRedisFunction(bool raise_error, bool async, ObjectExplorer* explorer, SliceSpan args);

  lua_State* lua_;
  unsigned cmd_depth_ = 0;
  RedisFunc redis_func_;
  std::string buffer_;
  int64_t used_bytes_ = 0;
  char name_buffer_[32];  // backing storage for cmd name
};

// Manages an internal interpreter pool. This allows multiple connections residing on the same
// thread to run multiple lua scripts in parallel.
class InterpreterManager {
 public:
  struct Stats {
    Stats& operator+=(const Stats& other);

    uint64_t used_bytes = 0;
    uint64_t interpreter_cnt = 0;
    uint64_t blocked_cnt = 0;
    uint64_t force_gc_calls = 0;
    uint64_t gc_duration_ns = 0;
    uint64_t interpreter_return = 0;
    int64_t gc_freed_memory = 0;
  };

 public:
  InterpreterManager(unsigned num) : waker_{}, available_{}, storage_{} {
    // We pre-allocate the backing storage during initialization and
    // start storing pointers to slots in the available vector.
    storage_.reserve(num);
  }

  // Borrow interpreter. Always return it after usage.
  Interpreter* Get();
  void Return(Interpreter*);

  // Clear all interpreters, keeps capacity. Waits until all are returned.
  void Reset();

  // Run on all unused interpreters. Those are marked as used at once, so the callback can preempt
  void Alter(std::function<void(Interpreter*)> modf);

  static Stats& tl_stats();

 private:
  util::fb2::EventCount waker_, reset_ec_;
  std::vector<Interpreter*> available_;
  std::vector<Interpreter> storage_;

  util::fb2::Mutex reset_mu_;  // Acts as a singleton.

  unsigned return_untracked_ = 0;  // Number of returned interpreters during reset.
};

}  // namespace dfly


================================================
FILE: src/core/interpreter_polyfill.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
// This header contains implementations of deprecated, removed or renamed lua functions.

#pragma once

extern "C" {
#include <lauxlib.h>
#include <lua.h>
#include <lualib.h>

// TODO: Fix checktab
#define aux_getn(L, n, w) (luaL_len(L, n))

LUA_API void lua_len(lua_State* L, int idx);

static int polyfill_table_getn(lua_State* L) {
  lua_len(L, 1);
  return 1;
}

static int polyfill_table_setn(lua_State* L) {
  // From Lua 5.1, ltablib.c
  luaL_checktype(L, 1, LUA_TTABLE);
  luaL_error(L, "setn is obsolete");
  lua_pushvalue(L, 1);
  return 1;
}

static int polyfill_table_foreach(lua_State* L) {
  // From Lua 5.1, ltablib.c
  luaL_checktype(L, 1, LUA_TTABLE);
  luaL_checktype(L, 2, LUA_TFUNCTION);
  lua_pushnil(L); /* first key */
  while (lua_next(L, 1)) {
    lua_pushvalue(L, 2);  /* function */
    lua_pushvalue(L, -3); /* key */
    lua_pushvalue(L, -3); /* value */
    lua_call(L, 2, 1);
    if (!lua_isnil(L, -1))
      return 1;
    lua_pop(L, 2); /* remove value and result */
  }
  return 0;
}

static int polyfill_table_foreachi(lua_State* L) {
  luaL_checktype(L, 1, LUA_TTABLE);  // Check type here because aux_getn is stripped
  // From Lua 5.1, ltablib.c
  int i;
  int n = aux_getn(L, 1, 0b11);
  luaL_checktype(L, 2, LUA_TFUNCTION);
  for (i = 1; i <= n; i++) {
    lua_pushvalue(L, 2);   /* function */
    lua_pushinteger(L, i); /* 1st argument */
    lua_rawgeti(L, 1, i);  /* 2nd argument */
    lua_call(L, 2, 1);
    if (!lua_isnil(L, -1))
      return 1;
    lua_pop(L, 1); /* remove nil result */
  }
  return 0;
}

static void register_polyfills(lua_State* lua) {
  lua_getglobal(lua, "table");

  // unpack was a global function until Lua 5.2
  lua_getfield(lua, -1, "unpack");
  lua_setglobal(lua, "unpack");

  // table.getn - removed, length operator # should be used instead
  lua_pushcfunction(lua, polyfill_table_getn);
  lua_setfield(lua, -2, "getn");

  // table.setn - removed, freely resizing a table is no longer possible
  lua_pushcfunction(lua, polyfill_table_setn);
  lua_setfield(lua, -2, "setn");

  // table.getn - removed, instead the length operator # should be used
  lua_pushcfunction(lua, polyfill_table_foreach);
  lua_setfield(lua, -2, "foreach");

  // table.forachi - removed, use for loops should be used instead
  lua_pushcfunction(lua, polyfill_table_foreachi);
  lua_setfield(lua, -2, "foreachi");

  lua_remove(lua, -1);
}
}


================================================
FILE: src/core/interpreter_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/interpreter.h"

extern "C" {
#include <lauxlib.h>
#include <lua.h>
}

#include <absl/strings/str_cat.h>
#include <absl/strings/str_replace.h>
#include <gmock/gmock.h>
#include <mimalloc.h>

#include <thread>

#include "base/gtest.h"
#include "base/logging.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {
using namespace std;

class TestSerializer : public ObjectExplorer {
 public:
  string res;

  void OnBool(bool b) final {
    absl::StrAppend(&res, "bool(", b, ") ");
  }

  void OnString(std::string_view str) final {
    absl::StrAppend(&res, "str(", str, ") ");
  }

  void OnDouble(double d) final {
    absl::StrAppend(&res, "d(", d, ") ");
  }

  void OnInt(int64_t val) final {
    absl::StrAppend(&res, "i(", val, ") ");
  }

  void OnArrayStart(unsigned len) final {
    absl::StrAppend(&res, "[");
  }

  void OnArrayEnd() final {
    if (res.back() == ' ')
      res.pop_back();

    absl::StrAppend(&res, "] ");
  }

  void OnNil() final {
    absl::StrAppend(&res, "nil ");
  }

  void OnMapStart(unsigned len) final {
    absl::StrAppend(&res, "{");
  }

  void OnMapEnd() final {
    if (res.back() == ' ')
      res.pop_back();
    absl::StrAppend(&res, "} ");
  }

  void OnStatus(std::string_view str) {
    absl::StrAppend(&res, "status(", str, ") ");
  }

  void OnError(std::string_view str) {
    absl::StrAppend(&res, "err(", str, ") ");
  }
};

using SliceSpan = Interpreter::SliceSpan;
class InterpreterTest : public ::testing::Test {
 protected:
  InterpreterTest() {
    // configure redis lib zmalloc which requires mimalloc heap to work.
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
  }

  lua_State* lua() {
    return intptr_.lua();
  }

  void RunInline(string_view buf, const char* name, unsigned num_results = 0) {
    CHECK_EQ(0, luaL_loadbuffer(lua(), buf.data(), buf.size(), name));
    CHECK_EQ(0, lua_pcall(lua(), 0, num_results, 0));
  }

  void SetGlobalArray(const char* name, const vector<string_view>& vec);

  // returns true if script run successfully.
  bool Execute(string_view script);

  Interpreter intptr_;
  TestSerializer ser_;
  string error_;
  vector<unique_ptr<string>> strings_;
};

void InterpreterTest::SetGlobalArray(const char* name, const vector<string_view>& vec) {
  vector<string_view> slices(vec.size());
  for (size_t i = 0; i < vec.size(); ++i) {
    strings_.emplace_back(new string(vec[i]));
    slices[i] = string_view{*strings_.back()};
  }
  intptr_.SetGlobalArray(name, SliceSpan{slices});
}

bool InterpreterTest::Execute(string_view script) {
  char sha_buf[64];
  Interpreter::FuncSha1(script, sha_buf);
  string_view sha{sha_buf, std::strlen(sha_buf)};

  string result;
  Interpreter::AddResult add_res = intptr_.AddFunction(sha, script, &result);
  if (add_res == Interpreter::COMPILE_ERR) {
    error_ = result;
    return false;
  }

  Interpreter::RunResult run_res = intptr_.RunFunction(sha, &error_);
  if (run_res != Interpreter::RUN_OK) {
    return false;
  }

  ser_.res.clear();
  intptr_.SerializeResult(&ser_);
  ser_.res.pop_back();

  return true;
}

TEST_F(InterpreterTest, Basic) {
  RunInline(R"(
    function foo(n)
      return n,n+1
    end)",
            "code1");

  int type = lua_getglobal(lua(), "foo");
  ASSERT_EQ(LUA_TFUNCTION, type);
  lua_pushnumber(lua(), 42);
  lua_pcall(lua(), 1, 2, 0);
  int val1 = lua_tointeger(lua(), -1);
  int val2 = lua_tointeger(lua(), -2);
  lua_pop(lua(), 2);

  EXPECT_EQ(43, val1);
  EXPECT_EQ(42, val2);
  EXPECT_EQ(0, lua_gettop(lua()));

  lua_pushstring(lua(), "foo");
  EXPECT_EQ(3, lua_rawlen(lua(), 1));
  lua_pop(lua(), 1);

  RunInline("return {nil, 'b'}", "code2", 1);
  ASSERT_EQ(1, lua_gettop(lua()));
  LOG(INFO) << lua_typename(lua(), lua_type(lua(), -1));

  ASSERT_TRUE(lua_istable(lua(), -1));
  ASSERT_EQ(2, lua_rawlen(lua(), -1));
  lua_len(lua(), -1);
  ASSERT_EQ(2, lua_tointeger(lua(), -1));
  lua_pop(lua(), 1);

  lua_pushnil(lua());
  while (lua_next(lua(), -2)) {
    /* uses 'key' (at index -2) and 'value' (at index -1) */
    int kt = lua_type(lua(), -2);
    int vt = lua_type(lua(), -1);
    LOG(INFO) << "k/v : " << lua_typename(lua(), kt) << "/" << lua_tonumber(lua(), -2) << " "
              << lua_typename(lua(), vt);
    lua_pop(lua(), 1);
  }
}

TEST_F(InterpreterTest, UnknownFunc) {
  string_view code(R"(
    function foo(n)
      return myunknownfunc(1, n)
    end)");

  CHECK_EQ(0, luaL_loadbuffer(lua(), code.data(), code.size(), "code1"));
  CHECK_EQ(0, lua_pcall(lua(), 0, 0, 0));
  int type = lua_getglobal(lua(), "myunknownfunc");
  ASSERT_EQ(LUA_TNIL, type);
}

TEST_F(InterpreterTest, Stack) {
  RunInline(R"(
local x = {}
for i=1,127 do
   x = {x}
end
return x
)",
            "code1", 1);

  ASSERT_EQ(1, lua_gettop(lua()));
  ASSERT_TRUE(intptr_.IsResultSafe());
  lua_pop(lua(), 1);

  RunInline(R"(
local x = {}
for i=1,128 do
   x = {x}
end
return x
)",
            "code1", 1);

  ASSERT_EQ(1, lua_gettop(lua()));
  ASSERT_FALSE(intptr_.IsResultSafe());
}

TEST_F(InterpreterTest, Add) {
  const char* s1 = "return 0";
  const char* s2 = "foobar";

  char sha_buf1[64], sha_buf2[64];
  Interpreter::FuncSha1(s1, sha_buf1);
  Interpreter::FuncSha1(s2, sha_buf2);
  string_view sha1{sha_buf1, std::strlen(sha_buf1)};
  string_view sha2{sha_buf2, std::strlen(sha_buf2)};

  string err;

  EXPECT_EQ(Interpreter::ADD_OK, intptr_.AddFunction(sha1, "return 0", &err));
  EXPECT_EQ(0, lua_gettop(lua()));

  EXPECT_EQ(Interpreter::COMPILE_ERR, intptr_.AddFunction(sha2, "foobar", &err));
  EXPECT_THAT(err, testing::HasSubstr("syntax error"));
  EXPECT_EQ(0, lua_gettop(lua()));

  EXPECT_TRUE(intptr_.Exists(sha1));
}

// Test cases taken from scripting.tcl
TEST_F(InterpreterTest, Execute) {
  ASSERT_TRUE(Execute("return 42"));
  EXPECT_EQ("i(42)", ser_.res);

  EXPECT_TRUE(Execute("return 'hello'"));
  EXPECT_EQ("str(hello)", ser_.res);

  // Breaks compatibility.
  EXPECT_TRUE(Execute("return 100.5"));
  EXPECT_EQ("d(100.5)", ser_.res);

  EXPECT_TRUE(Execute("return true"));
  EXPECT_EQ("bool(1)", ser_.res);

  EXPECT_TRUE(Execute("return false"));
  EXPECT_EQ("bool(0)", ser_.res);

  EXPECT_TRUE(Execute("return {ok='fine'}"));
  EXPECT_EQ("status(fine)", ser_.res);

  EXPECT_TRUE(Execute("return {err= 'bla'}"));
  EXPECT_EQ("err(bla)", ser_.res);

  EXPECT_TRUE(Execute("return {1, 2, nil, 3}"));
  EXPECT_EQ("[i(1) i(2) nil i(3)]", ser_.res);

  EXPECT_TRUE(Execute("return {1,2,3,'ciao', {1,2}}"));
  EXPECT_EQ("[i(1) i(2) i(3) str(ciao) [i(1) i(2)]]", ser_.res);

  EXPECT_TRUE(Execute("return {map={a=1,b=2}}"));
  EXPECT_THAT(ser_.res, testing::AnyOf("{str(a) i(1) str(b) i(2)}", "{str(b) i(2) str(a) i(1)}"));
}

TEST_F(InterpreterTest, Call) {
  auto cb = [](auto ca) {
    auto* reply = ca.translator;
    auto span = ca.args;
    CHECK_GE(span.size(), 1u);
    string_view cmd{span[0].data(), span[0].size()};
    if (cmd == "string") {
      reply->OnString("foo");
    } else if (cmd == "double") {
      reply->OnDouble(3.1415);
    } else if (cmd == "int") {
      reply->OnInt(42);
    } else if (cmd == "err") {
      reply->OnError("myerr");
    } else if (cmd == "status") {
      reply->OnStatus("mystatus");
    } else {
      LOG(FATAL) << "Invalid param";
    }
  };

  intptr_.SetRedisFunc(cb);
  ASSERT_TRUE(Execute("local var = redis.pcall('string'); return {type(var), var}"));
  EXPECT_EQ("[str(string) str(foo)]", ser_.res);

  EXPECT_TRUE(Execute("local var = redis.pcall('double'); return {type(var), var}"));
  EXPECT_EQ("[str(number) d(3.1415)]", ser_.res);

  EXPECT_TRUE(Execute("local var = redis.pcall('int'); return {type(var), var}"));
  EXPECT_EQ("[str(number) i(42)]", ser_.res);

  EXPECT_TRUE(Execute("local var = redis.pcall('err'); return {type(var), var}"));
  EXPECT_EQ("[str(table) err(myerr)]", ser_.res);

  EXPECT_TRUE(Execute("local var = redis.pcall('status'); return {type(var), var}"));
  EXPECT_EQ("[str(table) status(mystatus)]", ser_.res);
}

TEST_F(InterpreterTest, CallArray) {
  auto cb = [](auto ca) {
    auto* reply = ca.translator;
    reply->OnArrayStart(2);
    reply->OnArrayStart(1);
    reply->OnArrayStart(2);
    reply->OnNil();
    reply->OnString("s2");
    reply->OnArrayEnd();
    reply->OnArrayEnd();
    reply->OnInt(42);
    reply->OnArrayEnd();
  };

  intptr_.SetRedisFunc(cb);
  EXPECT_TRUE(Execute("local var = redis.call(''); return {type(var), var}"));
  EXPECT_EQ("[str(table) [[[bool(0) str(s2)]] i(42)]]", ser_.res);
}

TEST_F(InterpreterTest, ArgKeys) {
  vector<string> vec_arr{};
  vector<string_view> slices;
  SetGlobalArray("ARGV", {"foo", "bar"});
  SetGlobalArray("KEYS", {"key1", "key2"});
  EXPECT_TRUE(Execute("return {ARGV[1], KEYS[1], KEYS[2]}"));
  EXPECT_EQ("[str(foo) str(key1) str(key2)]", ser_.res);

  SetGlobalArray("INTKEYS", {"123456", "1"});
  EXPECT_TRUE(Execute("return INTKEYS[1] + 0")) << error_;
  EXPECT_EQ("i(123456)", ser_.res);
}

TEST_F(InterpreterTest, Modules) {
  // cjson module
  EXPECT_TRUE(Execute("return cjson.encode({1, 2, 3})"));
  EXPECT_EQ("str([1,2,3])", ser_.res);
  EXPECT_TRUE(Execute("return cjson.decode('{\"a\": 1}')['a']"));
  EXPECT_EQ("i(1)", ser_.res);

  // cmsgpack module
  EXPECT_TRUE(Execute("return cmsgpack.pack('ok', true)"));
  EXPECT_EQ("str(\xA2ok\xC3)", ser_.res);

  // bit module
  EXPECT_TRUE(Execute("return bit.bor(8, 4, 5)"));
  EXPECT_EQ("i(13)", ser_.res);

  // struct module
  EXPECT_TRUE(Execute("return struct.pack('bbc4', 1, 2, 'test')"));
  EXPECT_EQ("str(\x1\x2test)", ser_.res);
}

// Check compatibility with Lua 5.1
TEST_F(InterpreterTest, Compatibility) {
  // unpack is no longer global
  EXPECT_TRUE(Execute("return unpack{1,2,3}"));
  EXPECT_EQ("i(1)", ser_.res);

  string_view test_foreach_template =
      "local t = {1,'two',3;four='yes'}; local out = {};"
      "table.{TESTF} (t, function(k, v) table.insert(out, {k, v}) end); "
      "return out; ";

  // table.foreach was removed
  string test_foreach = absl::StrReplaceAll(test_foreach_template, {{"{TESTF}", "foreach"}});
  EXPECT_TRUE(Execute(test_foreach));
  EXPECT_EQ("[[i(1) i(1)] [i(2) str(two)] [i(3) i(3)] [str(four) str(yes)]]", ser_.res);

  // table.foreachi was removed
  string test_foreachi = absl::StrReplaceAll(test_foreach_template, {{"{TESTF}", "foreachi"}});
  EXPECT_TRUE(Execute(test_foreachi));
  EXPECT_EQ("[[i(1) i(1)] [i(2) str(two)] [i(3) i(3)]]", ser_.res);

  EXPECT_FALSE(Execute("table.foreachi('not-a-table', print);"));  // check invalid args

  // table.getn was replaced with length operator
  EXPECT_TRUE(Execute("return table.getn{1, 2, 3};"));
  EXPECT_EQ("i(3)", ser_.res);

  // table.setn was removed, resizing is no longer needed, it thows an error
  EXPECT_FALSE(Execute("local t = {}; local a = 1; table.setn(t, 100); return a+123;"));
}

TEST_F(InterpreterTest, AsyncReplacement) {
  const string_view kCases[] = {
      R"(
      redis.[A]call('INCR', 'A')
      redis.[A]call('INCR', 'A')
    )",
      R"(
      function test()
        redis.[A]call('INCR', 'A')
      end
    )",
      R"(
      local b = redis.call('GET', 'A') + redis.call('GET', 'B')
    )",
      R"(
      if redis.call('EXISTS', 'A') then redis.[A]call('SET', 'B', 1) end
    )",
      R"(
      while redis.call('EXISTS', 'A') do redis.[A]call('SET', 'B', 1) end
    )",
      R"(
      while
      redis.call('EXISTS', 'A') do
        print("OK")
      end
    )",
      R"(
      print(redis.call('GET', 'A'))
    )",
      R"(
      local table = {
        redis.call('GET', 'A')
      }
    )",
      R"(
      while true do
        redis.[A]call('INCR', 'A')
      end
    )",
      R"(
      if 1 + -- now this is a tricky comment
        redis.call('GET', 'A')
        > 0
      then end
    )",
      R"(
      print('Output'
      ..
      redis.call('GET', 'A')
      )
    )",
      R"(
      while
      0 < -- we have a comment here unfortunately
      redis.call('GET', 'A')
      then end
    )",
      R"(
    while
    -- we have
    -- a tricky
    -- multiline comment
    redis.call('EXISTS')
    do end
    )",
      R"(
    --[[ WE SKIP COMMENT BLOCKS FOR NOW ]]
    redis.call('ECHO', 'TEST')
    )"};

  for (auto test : kCases) {
    auto expected = absl::StrReplaceAll(test, {{"[A]", "a"}});
    auto input = absl::StrReplaceAll(test, {{"[A]", ""}});

    auto result = Interpreter::DetectPossibleAsyncCalls(input);
    string_view output = result ? *result : input;

    EXPECT_EQ(expected, output);
  }
}

TEST_F(InterpreterTest, ReplicateCommands) {
  EXPECT_TRUE(Execute("return redis.replicate_commands()"));
  EXPECT_EQ("i(1)", ser_.res);
  EXPECT_TRUE(Execute("redis.replicate_commands()"));
  EXPECT_EQ("nil", ser_.res);
}

TEST_F(InterpreterTest, Log) {
  EXPECT_FALSE(Execute(R"(redis.log('nonsense', 'nonsense'))"));
  EXPECT_THAT(error_, testing::HasSubstr("First argument must be a number (log level)."));
  EXPECT_TRUE(Execute(R"(redis.log(redis.LOG_WARNING, 'warn'))"));
  EXPECT_EQ("nil", ser_.res);
  EXPECT_FALSE(Execute(R"(redis.log(4))"));
  EXPECT_THAT(error_, testing::HasSubstr("requires two arguments or more"));
}

TEST_F(InterpreterTest, Robust) {
  EXPECT_FALSE(Execute(R"(eval "local a = {}
      setmetatable(a,{__index=function() foo() end})
      return a")"));
  EXPECT_EQ("", ser_.res);
}

TEST_F(InterpreterTest, Unpack) {
  auto cb = [](Interpreter::CallArgs ca) {
    auto* reply = ca.translator;
    reply->OnInt(1);
  };
  intptr_.SetRedisFunc(cb);
  ASSERT_TRUE(lua_checkstack(lua(), 7000));
  bool res = Execute(R"(
local N = 7000

local stringTable = {}
for i = 1, N do
    stringTable[i] = "String " .. i
end
  return redis.pcall('func', unpack(stringTable))
)");

  ASSERT_TRUE(res) << error_;
  EXPECT_EQ("i(1)", ser_.res);
}

TEST_F(InterpreterTest, AvoidIntOverflow) {
  EXPECT_TRUE(Execute("return bit.tohex(65535, -2147483648)"));
  EXPECT_EQ("str(0000FFFF)", ser_.res);
}

TEST_F(InterpreterTest, LuaIntOverflow) {
  EXPECT_FALSE(Execute("EVAL \"struct.pack('>I2147483648', '10')\" 0"));
}

TEST_F(InterpreterTest, LuaGcStatistic) {
  InterpreterManager im(1);
  auto* interpreter = im.Get();

  std::string_view keys[] = {"key1", "key2", "key3", "key4", "key5", "key6", "key7"};
  interpreter->SetGlobalArray("KEYS", SliceSpan{keys});

  auto cb = [](Interpreter::CallArgs ca) {
    auto* reply = ca.translator;
    reply->OnInt(1);
  };
  interpreter->SetRedisFunc(cb);
  // next script generate several big values and set them to the keys
  // after the script is finished, GM isn't called for all values and
  // in the most cases we have more than 300k allocated memory
  // that will be cleaned later in the separate thread
  std::string script = R"(
        for i = 1, 7 do
          local str = string.rep(i, 1024 * 100)
          redis.call('SET', KEYS[1], str .. str)
        end
       )";

  char sha_buf[64];
  Interpreter::FuncSha1(script, sha_buf);
  string_view sha{sha_buf, std::strlen(sha_buf)};

  string result;
  Interpreter::AddResult add_res = interpreter->AddFunction(sha, script, &result);
  EXPECT_EQ(Interpreter::ADD_OK, add_res);

  // When script is executed in the most cases we see that not all memory was deallocated
  // immediately and can be deallocated later
  Interpreter::RunResult run_res = interpreter->RunFunction(sha, &error_);
  EXPECT_EQ(Interpreter::RUN_OK, run_res);

  // check that after script is finished not the all memory was deallocated
  uint64_t used_bytes = InterpreterManager::tl_stats().used_bytes;
  EXPECT_GE(used_bytes, 0);

  auto force_gc_calls = InterpreterManager::tl_stats().force_gc_calls;
  // we need return interpreter to update statistic
  // force_gc_calls shouldn't be called
  im.Return(interpreter);
  EXPECT_EQ(force_gc_calls, InterpreterManager::tl_stats().force_gc_calls);
  EXPECT_LE(used_bytes, InterpreterManager::tl_stats().used_bytes);

  used_bytes = InterpreterManager::tl_stats().used_bytes;

  // we get the same interpeter again to call GC in separate thread
  auto* new_interpreter = im.Get();
  EXPECT_EQ(interpreter, new_interpreter);

  // check that even if memory is deallocated in separate thread our statistic is correct
  std::thread t([&] {
    interpreter->RunGC();
    EXPECT_EQ(InterpreterManager::tl_stats().used_bytes, 0);
  });
  t.join();

  im.Return(interpreter);
  EXPECT_GE(used_bytes, InterpreterManager::tl_stats().used_bytes);
}

}  // namespace dfly


================================================
FILE: src/core/json/CMakeLists.txt
================================================
gen_flex(jsonpath_lexer)
gen_bison(jsonpath_grammar)

cur_gen_dir(gen_dir)

add_library(jsonpath lexer_impl.cc driver.cc path.cc
            ${gen_dir}/jsonpath_lexer.cc ${gen_dir}/jsonpath_grammar.cc json_object.cc
            detail/jsoncons_dfs.cc detail/flat_dfs.cc
            detail/interned_blob.cc
            detail/interned_string.cc)
target_link_libraries(jsonpath base absl::strings TRDP::reflex TRDP::jsoncons TRDP::flatbuffers dfly_page_usage)

helio_cxx_test(jsonpath_test jsonpath dfly_core LABELS DFLY)
helio_cxx_test(json_test jsonpath TRDP::jsoncons LABELS DFLY)
helio_cxx_test(interned_blob_test dfly_core TRDP::mimalloc2 LABELS DFLY)


================================================
FILE: src/core/json/detail/common.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

namespace dfly::json::detail {
enum MatchStatus {
  OUT_OF_BOUNDS,
  MISMATCH,
};

}


================================================
FILE: src/core/json/detail/flat_dfs.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/json/detail/flat_dfs.h"

#include "base/logging.h"

namespace dfly::json::detail {

using namespace std;
using nonstd::make_unexpected;

inline bool IsRecursive(flexbuffers::Type type) {
  return type == flexbuffers::FBT_MAP || type == flexbuffers::FBT_VECTOR;
}

// Binary search of a key, returns UINT_MAX if not found.
unsigned FindByKey(const flexbuffers::TypedVector& keys, const char* elem) {
  unsigned s = 0, end = keys.size();
  while (s < end) {
    unsigned mid = (s + end) / 2;
    flexbuffers::String mid_elem = keys[mid].AsString();
    int res = strcmp(elem, mid_elem.c_str());
    if (res < 0) {
      end = mid;
    } else if (res > 0) {
      s = mid + 1;
    } else {
      return mid;
    }
  }
  return UINT_MAX;
}

auto FlatDfsItem::Init(const PathSegment& segment) -> AdvanceResult {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (obj().IsMap()) {
        auto map = obj().AsMap();
        flexbuffers::TypedVector keys = map.Keys();
        unsigned index = FindByKey(keys, segment.identifier().c_str());
        if (index == UINT_MAX) {
          return Exhausted();
        }
        state_.emplace(index, index);
        return DepthState{obj().AsVector()[index], depth_state_.second + 1};
      }
      break;
    }
    case SegmentType::INDEX: {
      auto vec = obj().AsVector();
      IndexExpr index = segment.index().Normalize(vec.size());
      if (index.Empty()) {
        return make_unexpected(OUT_OF_BOUNDS);
      }

      state_ = index;
      return Next(vec[index.first]);
      break;
    }

    case SegmentType::DESCENT:
      if (segment_step_ == 1) {
        // first time, branching to return the same object but with the next segment,
        // exploring the path of ignoring the DESCENT operator.
        // Also, shift the state (segment_step) to bypass this branch next time.
        segment_step_ = 0;
        return DepthState{depth_state_.first, depth_state_.second + 1};
      }

      // Now traverse all the children but do not progress with segment path.
      // This is why segment_step_ is set to 0.
      [[fallthrough]];
    case SegmentType::WILDCARD: {
      auto vec = obj().AsVector();
      if (vec.size() == 0) {
        return Exhausted();
      }
      state_ = IndexExpr::All();
      return Next(vec[0]);
    } break;

    default:
      LOG(DFATAL) << "Unknown segment " << SegmentName(segment.type());
  }  // end switch

  return nonstd::make_unexpected(MISMATCH);
}

auto FlatDfsItem::Advance(const PathSegment& segment) -> AdvanceResult {
  if (!state_) {
    return Init(segment);
  }

  ++state_->first;
  if (state_->Empty())
    return Exhausted();
  auto vec = obj().AsVector();

  return Next(vec[state_->first]);
}

FlatDfs FlatDfs::Traverse(absl::Span<const PathSegment> path, const flexbuffers::Reference root,
                          const PathFlatCallback& callback) {
  DCHECK(!path.empty());
  FlatDfs dfs;

  if (path.size() == 1) {
    dfs.PerformStep(path[0], root, callback);
    return dfs;
  }

  using ConstItem = FlatDfsItem;
  vector<ConstItem> stack;
  stack.emplace_back(root);

  do {
    unsigned segment_index = stack.back().segment_idx();
    const auto& path_segment = path[segment_index];

    // init or advance the current object
    ConstItem::AdvanceResult res = stack.back().Advance(path_segment);
    if (res && !res->first.IsNull()) {
      const flexbuffers::Reference next = res->first;
      DVLOG(2) << "Handling now " << next.GetType() << " " << next.ToString();

      // We descent only if next is object or an array.
      if (IsRecursive(next.GetType())) {
        unsigned next_seg_id = res->second;

        if (next_seg_id + 1 < path.size()) {
          stack.emplace_back(next, next_seg_id);
        } else {
          // terminal step
          // TODO: to take into account MatchStatus
          // for `json.set foo $.a[10]` or for `json.set foo $.*.b`
          dfs.PerformStep(path[next_seg_id], next, callback);
        }
      }
    } else {
      stack.pop_back();
    }
  } while (!stack.empty());

  return dfs;
}

auto FlatDfs::PerformStep(const PathSegment& segment, const flexbuffers::Reference node,
                          const PathFlatCallback& callback) -> nonstd::expected<void, MatchStatus> {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (!node.IsMap())
        return make_unexpected(MISMATCH);
      auto map = node.AsMap();
      flexbuffers::Reference value = map[segment.identifier().c_str()];
      if (!value.IsNull()) {
        DoCall(callback, string_view{segment.identifier()}, value);
      }
    } break;
    case SegmentType::INDEX: {
      if (!node.IsUntypedVector())
        return make_unexpected(MISMATCH);
      auto vec = node.AsVector();
      IndexExpr index = segment.index().Normalize(vec.size());
      if (index.Empty()) {
        return make_unexpected(OUT_OF_BOUNDS);
      }
      for (; index.first <= index.second; ++index.first)
        DoCall(callback, nullopt, vec[index.first]);
    } break;

    case SegmentType::DESCENT:
    case SegmentType::WILDCARD: {
      auto vec = node.AsVector();       // always succeeds
      auto keys = node.AsMap().Keys();  // always succeeds
      string str;
      for (size_t i = 0; i < vec.size(); ++i) {
        flexbuffers::Reference key = keys[i];
        optional<string_view> opt_key;
        if (key.IsString()) {
          str = key.ToString();
          opt_key = str;
        }
        DoCall(callback, opt_key, vec[i]);
      }
    } break;
    default:
      LOG(DFATAL) << "Unknown segment " << SegmentName(segment.type());
  }
  return {};
}

}  // namespace dfly::json::detail


================================================
FILE: src/core/json/detail/flat_dfs.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <nonstd/expected.hpp>
#include <variant>

#include "core/flatbuffers.h"
#include "core/json/detail/common.h"
#include "core/json/path.h"

namespace dfly::json::detail {

class FlatDfsItem {
 public:
  using ValueType = flexbuffers::Reference;
  using DepthState = std::pair<ValueType, unsigned>;  // object, segment_idx pair
  using AdvanceResult = nonstd::expected<DepthState, MatchStatus>;

  FlatDfsItem(ValueType val, unsigned idx = 0) : depth_state_(val, idx) {
  }

  // Returns the next object to traverse
  // or null if traverse was exhausted or the segment does not match.
  AdvanceResult Advance(const PathSegment& segment);

  unsigned segment_idx() const {
    return depth_state_.second;
  }

 private:
  ValueType obj() const {
    return depth_state_.first;
  }

  DepthState Next(ValueType obj) const {
    return {obj, depth_state_.second + segment_step_};
  }

  DepthState Exhausted() const {
    return {ValueType(), 0};
  }

  AdvanceResult Init(const PathSegment& segment);

  // For most operations we advance the path segment by 1 when we descent into the children.
  unsigned segment_step_ = 1;

  DepthState depth_state_;
  std::optional<IndexExpr> state_;
};

// Traverses a json object according to the given path and calls the callback for each matching
// field. With DESCENT segments it will match 0 or more fields in depth.
// MATCH(node, DESCENT|SUFFIX) = MATCH(node, SUFFIX) ||
// { MATCH(node->child, DESCENT/SUFFIX) for each child of node }

class FlatDfs {
 public:
  // TODO: for some operations we need to know the type of mismatches.
  static FlatDfs Traverse(absl::Span<const PathSegment> path, const flexbuffers::Reference root,
                          const PathFlatCallback& callback);
  unsigned matches() const {
    return matches_;
  }

 private:
  bool TraverseImpl(absl::Span<const PathSegment> path, const PathFlatCallback& callback);

  nonstd::expected<void, MatchStatus> PerformStep(const PathSegment& segment,
                                                  const flexbuffers::Reference node,
                                                  const PathFlatCallback& callback);

  void DoCall(const PathFlatCallback& callback, std::optional<std::string_view> key,
              const flexbuffers::Reference node) {
    ++matches_;
    callback(key, node);
  }

  unsigned matches_ = 0;
};

}  // namespace dfly::json::detail


================================================
FILE: src/core/json/detail/interned_blob.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#include "core/json/detail/interned_blob.h"

#include <glog/logging.h>
#include <mimalloc.h>

#include "core/detail/stateless_allocator.h"

namespace {
constexpr size_t kUint32Size = sizeof(uint32_t);
constexpr size_t kHeaderSize = sizeof(uint32_t) * 2;
}  // namespace

namespace dfly::detail {

InternedBlobHandle InternedBlobHandle::Create(std::string_view sv) {
  if (sv.empty()) {
    return InternedBlobHandle{nullptr};
  }

  constexpr uint32_t ref_count = 1;
  DCHECK_LE(sv.size(), std::numeric_limits<uint32_t>::max());

  const uint32_t str_len = sv.size();

  // We need +1 byte for \0 because jsoncons expects c_str() and data() style accessors on keys
  BlobPtr blob = StatelessAllocator<char>{}.allocate(kHeaderSize + str_len + 1);

  std::memcpy(blob, &str_len, kUint32Size);
  std::memcpy(blob + kUint32Size, &ref_count, kUint32Size);

  std::memcpy(blob + kHeaderSize, sv.data(), str_len);

  // null terminate so jsoncons can directly access the char* as string
  blob[kHeaderSize + str_len] = '\0';
  return InternedBlobHandle{blob + kHeaderSize};
}

uint32_t InternedBlobHandle::Size() const {
  if (!blob_)
    return 0;
  uint32_t size;
  std::memcpy(&size, blob_ - kHeaderSize, kUint32Size);
  return size;
}

uint32_t InternedBlobHandle::RefCount() const {
  DCHECK(blob_) << "Called RefCount() on empty blob";
  uint32_t ref_count;
  std::memcpy(&ref_count, blob_ - kUint32Size, kUint32Size);
  return ref_count;
}

void InternedBlobHandle::IncrRefCount() {  // NOLINT - non-const, mutates via ptr
  const uint32_t ref_count = RefCount();
  DCHECK_LT(ref_count, std::numeric_limits<uint32_t>::max()) << "Attempt to increase max refcount";
  const uint32_t updated_count = ref_count + 1;
  std::memcpy(blob_ - kUint32Size, &updated_count, kUint32Size);
}

void InternedBlobHandle::DecrRefCount() {  // NOLINT - non-const, mutates via ptr
  const uint32_t ref_count = RefCount();
  DCHECK_GE(ref_count, 1ul) << "Attempt to decrease zero refcount";
  const uint32_t updated_count = ref_count - 1;
  std::memcpy(blob_ - kUint32Size, &updated_count, kUint32Size);
}

size_t InternedBlobHandle::MemUsed() const {
  return blob_ ? mi_usable_size(blob_ - kHeaderSize) : 0;
}

void InternedBlobHandle::Destroy(InternedBlobHandle& handle) {
  if (handle.blob_) {
    const size_t to_destroy = kHeaderSize + handle.Size() + 1;
    StatelessAllocator<char>{}.deallocate(handle.blob_ - kHeaderSize, to_destroy);
    handle.blob_ = nullptr;
  }
}

}  // namespace dfly::detail


================================================
FILE: src/core/json/detail/interned_blob.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#pragma once

#include <absl/container/flat_hash_set.h>

#include <string_view>

namespace dfly::detail {

// Layout is: 4 bytes size, 4 bytes refcount, char data, followed by nul-char.
// The trailing nul-char is required because jsoncons needs to access c_str/data without a
// size. The blob_ itself points directly to the data, so that callers do not have to perform
// pointer arithmetic for c_str() and data() calls:
//     [size:4] [refcount:4] [string] [\0]
//     ^-8      ^- 4         ^blob_
using BlobPtr = char*;

// A lightweight handle around a blob pointer, used to wrap the blob data when storing it in hashset
// and also within interned strings. Does not handle lifetime of the data. Only provides convenience
// methods to change state inside the blob and "view" style methods to access the string inside the
// blob. Multiple handles can point to the same blob.
class InternedBlobHandle {
 public:
  InternedBlobHandle() = default;

  [[nodiscard]] static InternedBlobHandle Create(std::string_view sv);

  uint32_t Size() const;

  uint32_t RefCount() const;

  const char* Data() const {
    return blob_;
  }

  // The refcount methods are explicitly part of the public API and not tied to the handle lifetime
  // to keep control over exactly when we modify data in the blob ptr. We do not want to increase
  // ref count on each handle creation and conversely decrease it when a handle is destroyed, eg on
  // every hash table lookup etc. The ref count is only increased or decreased at the InternedString
  // API level, when a new string is created, and when a string is destroyed. This allows us to
  // avoid writing to memory unless absolutely necessary, making the handle cheap.

  // Increment ref count, asserts if count grows over type max limit
  void IncrRefCount();

  // Decrement ref count, asserts if count falls below 0
  void DecrRefCount();

  // Returns bytes used, including string, header and trailing byte
  size_t MemUsed() const;

  // Convenience method to deallocate storage. Not for use in destructor.
  static void Destroy(InternedBlobHandle& handle);

  operator std::string_view() const {  // NOLINT (non-explicit operator for easier comparisons)
    return blob_ ? std::string_view{blob_, Size()} : "";
  }
  auto operator<=>(const InternedBlobHandle& other) const = default;
  bool operator==(const InternedBlobHandle& other) const = default;

  explicit operator bool() const {
    return blob_;
  }

 private:
  explicit InternedBlobHandle(BlobPtr blob) : blob_{blob} {
  }

  BlobPtr blob_{nullptr};
};

struct BlobHash {
  using is_transparent = void;
  size_t operator()(std::string_view sv) const {
    return std::hash<std::string_view>{}(sv);
  }
};

struct BlobEq {
  using is_transparent = void;
  bool operator()(const InternedBlobHandle& a, const InternedBlobHandle& b) const {
    return a.Data() == b.Data();
  }

  bool operator()(std::string_view a, std::string_view b) const {
    return a == b;
  }
};

// This pool holds blob handles and is used by InternedString to manage string access. It would be
// nice to keep this on the mimalloc heap by using StatelessAllocator. However, JSON memory usage is
// estimated by comparing mimalloc usage before and after creating an object. If we keep this pool
// on mimalloc, it can introduce variations such as resizing of its internal store when adding a new
// object. This results in non-deterministic memory usage, which introduces incorrectness in tests
// and the memory usage command. To keep memory estimation per object accurate, the pool is
// allocated on the default heap.
using InternedBlobPool = absl::flat_hash_set<InternedBlobHandle, BlobHash, BlobEq>;
static_assert(sizeof(InternedBlobHandle) == sizeof(char*));

}  // namespace dfly::detail


================================================
FILE: src/core/json/detail/interned_string.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#include "core/json/detail/interned_string.h"

namespace {
constexpr auto kLoadFactorToShrinkPool = 0.2;

thread_local dfly::InternedStringStats tl_stats;

}  // namespace

namespace dfly::detail {

InternedString& InternedString::operator=(InternedString other) {
  swap(other);
  return *this;
}

void InternedString::ResetPool() {
  InternedBlobPool& pool = GetPoolRef();
  for (InternedBlobHandle handle : pool) {
    InternedBlobHandle::Destroy(handle);
  }
  pool.clear();

  // Pool hits and misses are not reset, they are monotonically increasing counters
  // TODO reset these two fields in config resetstats
  tl_stats.pool_bytes = 0;
  tl_stats.pool_entries = 0;
  tl_stats.pool_table_bytes = 0;
  tl_stats.live_references = 0;
}

InternedBlobHandle InternedString::Intern(const std::string_view sv) {
  if (sv.empty())
    return {};

  tl_stats.live_references += 1;
  InternedBlobPool& pool_ref = GetPoolRef();
  if (const auto it = pool_ref.find(sv); it != pool_ref.end()) {
    tl_stats.hits++;
    InternedBlobHandle blob = *it;
    blob.IncrRefCount();
    return blob;
  }

  InternedBlobHandle handle = InternedBlobHandle::Create(sv);
  pool_ref.emplace(handle);
  tl_stats.pool_entries++;
  tl_stats.pool_bytes += handle.MemUsed();
  tl_stats.misses++;
  return handle;
}

void InternedString::Acquire() {  // NOLINT
  if (!entry_)
    return;

  tl_stats.live_references += 1;
  entry_.IncrRefCount();
}

void InternedString::Release() {
  if (!entry_)
    return;

  entry_.DecrRefCount();
  tl_stats.live_references -= 1;

  if (entry_.RefCount() == 0) {
    InternedBlobPool& pool_ref = GetPoolRef();
    pool_ref.erase(entry_);
    tl_stats.pool_entries--;
    tl_stats.pool_bytes -= entry_.MemUsed();
    InternedBlobHandle::Destroy(entry_);

    // When pool is underutilized, shrink it by swapping.
    if (const auto load_factor = pool_ref.load_factor();
        ABSL_PREDICT_FALSE(load_factor > 0 && load_factor < kLoadFactorToShrinkPool)) {
      // The LHS of swap is a new pool constructed from the original pool reference. The RHS is the
      // original pool. After the swap, the temporary is destroyed. Note that this is not a strict
      // shrink. The new pool internally allocates enough capacity so that the load factor is around
      // 0.8. So the capacity after swap is still larger than size, but the load factor is improved.
      InternedBlobPool(pool_ref).swap(pool_ref);
    }
  }
}

InternedBlobPool& InternedString::GetPoolRef() {
  // Note on lifetimes: this pool is thread local and depends on the thread local memory resource
  // defined in the stateless allocator in src/core/detail/stateless_allocator.h. Since there is no
  // well-defined order of destruction, this pool must be manually reset before the memory resource
  // destruction.
  thread_local InternedBlobPool pool;
  return pool;
}

}  // namespace dfly::detail

namespace dfly {

InternedStringStats& InternedStringStats::operator+=(const InternedStringStats& other) {
  pool_entries += other.pool_entries;
  pool_bytes += other.pool_bytes;
  hits += other.hits;
  misses += other.misses;
  pool_table_bytes += other.pool_table_bytes;
  live_references += other.live_references;
  return *this;
}

InternedStringStats GetInternedStringStats() {
  tl_stats.pool_table_bytes =
      detail::InternedString::GetPoolRef().capacity() * (sizeof(detail::InternedBlobHandle) + 1);
  return tl_stats;
}

}  // namespace dfly


================================================
FILE: src/core/json/detail/interned_string.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#pragma once

#include "core/detail/stateless_allocator.h"
#include "core/json/detail/interned_blob.h"

namespace dfly::detail {

// InternedString handles incrementing and decrementing reference counts of the blobs tied to its
// own lifecycle. It deletes the blob from a shard local pool when refcount is 0.
// TODO examine cross shard json object interactions. Can a pool end up access from another shard?
class InternedString {
 public:
  using allocator_type = StatelessAllocator<char>;

  InternedString() = default;

  explicit InternedString(const std::string_view sv) : entry_(Intern(sv)) {
  }

  // The following constructors and members are added because they are required by jsoncons for
  // keys. Each of these is added in response to compiler errors and should not be removed, even if
  // they are seemingly a no-op or duplicated.

  // jsoncons sometimes creates empty obj with custom allocator. If it creates an object with any
  // other allocator, we should fail during compilation.
  template <typename T> explicit InternedString(StatelessAllocator<T> /*unused*/) {
  }

  template <typename Alloc> InternedString(const char* data, size_t size, Alloc alloc);

  template <std::contiguous_iterator It> InternedString(It begin, It end);

  InternedString(const InternedString& other) : entry_{other.entry_} {
    Acquire();
  }

  InternedString(InternedString&& other) noexcept : entry_{other.entry_} {
    other.entry_ = {};
  }

  InternedString& operator=(InternedString other);

  ~InternedString() {
    Release();
  }

  operator std::string_view() const {
    return entry_;
  }

  const char* data() const {
    return entry_ ? entry_.Data() : "";
  }

  const char* c_str() const {
    return data();
  }

  void swap(InternedString& other) noexcept {
    std::swap(entry_, other.entry_);
  }

  size_t length() const {
    return size();
  }

  size_t size() const {
    return entry_.Size();
  }

  int compare(const InternedString& other) const {
    return std::string_view{*this}.compare(other);
  }

  int compare(std::string_view other) const {
    return std::string_view{*this}.compare(other);
  }

  // lex. comparison
  auto operator<=>(const InternedString& other) const {
    return std::string_view{*this} <=> std::string_view{other};
  }

  bool operator==(const InternedString& other) const = default;

  void shrink_to_fit() {  // NOLINT (must be non-const to align with jsoncons usage)
  }

  // Destroys all strings in the pool. Must be called on process shutdown before the backing memory
  // resource is destroyed.
  static void ResetPool();
  static InternedBlobPool& GetPoolRef();

  size_t MemUsed() const {
    return entry_.MemUsed();
  }

 private:
  // If a string exists in the pool, increments its refcount. If not, adds the string to the pool.
  // Returns a handle wrapping the string.
  static InternedBlobHandle Intern(std::string_view sv);

  // Increments the refcount if the entry is not null
  void Acquire();

  // Decrements the refcount, removes entry from the pool if necessary, destroying the interned
  // blob. A side effect may be shrinking the pool if the load factor is suboptimal (see
  // kLoadFactorToShrinkPool in the implementation)
  void Release();

  // Wraps a null pointer by default
  InternedBlobHandle entry_;
};

template <typename Alloc>
InternedString::InternedString(const char* data, size_t size, Alloc /*unused*/)
    : InternedString(std::string_view{data, size}) {
}

template <std::contiguous_iterator It> InternedString::InternedString(It begin, It end) {
  if (begin == end) {
    return;
  }

  const auto size = std::distance(begin, end);
  const auto data_ptr = &*begin;
  entry_ = Intern(std::string_view(data_ptr, size));
}

}  // namespace dfly::detail

namespace dfly {

struct InternedStringStats {
  size_t pool_entries = 0;
  size_t pool_bytes = 0;
  size_t hits = 0;
  size_t misses = 0;
  size_t pool_table_bytes = 0;
  size_t live_references = 0;

  InternedStringStats& operator+=(const InternedStringStats& other);
};

InternedStringStats GetInternedStringStats();

}  // namespace dfly


================================================
FILE: src/core/json/detail/jsoncons_dfs.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

// clang-format off
#include <glog/logging.h>
// clang-format on

#include "core/json/detail/jsoncons_dfs.h"

namespace dfly::json::detail {

using namespace std;
using nonstd::make_unexpected;

ostream& operator<<(ostream& os, const PathSegment& ps) {
  os << SegmentName(ps.type());
  return os;
}

inline bool IsRecursive(jsoncons::json_type type) {
  return type == jsoncons::json_type::object_value || type == jsoncons::json_type::array_value;
}

Dfs Dfs::Traverse(absl::Span<const PathSegment> path, const JsonType& root, const Cb& callback) {
  DCHECK(!path.empty());

  Dfs dfs;

  if (path.size() == 1) {
    dfs.PerformStep(path[0], root, callback);
    return dfs;
  }

  using ConstItem = JsonconsDfsItem<true>;
  vector<ConstItem> stack;
  stack.emplace_back(&root);

  do {
    unsigned segment_index = stack.back().segment_idx();
    const auto& path_segment = path[segment_index];

    // init or advance the current object
    DVLOG(2) << "Advance segment [" << segment_index << "] " << path_segment;
    ConstItem::AdvanceResult res = stack.back().Advance(path_segment);
    if (res && res->first != nullptr) {
      const JsonType* next = res->first;

      // We descent only if next is object or an array.
      if (IsRecursive(next->type())) {
        unsigned next_seg_id = res->second;

        if (next_seg_id + 1 < path.size()) {
          DVLOG(2) << "Exploring node[" << stack.size() << "] " << next->type() << " "
                   << next->to_string();
          stack.emplace_back(next, next_seg_id);
        } else {
          DVLOG(2) << "Terminal node[" << stack.size() << "] " << next->type() << " "
                   << next->to_string() << ", segment:" << path[next_seg_id];
          // terminal step
          // TODO: to take into account MatchStatus
          // for `json.set foo $.a[10]` or for `json.set foo $.*.b`
          dfs.PerformStep(path[next_seg_id], *next, callback);
        }
      }
    } else {
      stack.pop_back();
    }
  } while (!stack.empty());

  return dfs;
}

Dfs Dfs::Mutate(absl::Span<const PathSegment> path, const MutateCallback& callback,
                JsonType* json) {
  DCHECK(!path.empty());

  Dfs dfs;

  if (path.size() == 1) {
    dfs.MutateStep(path[0], callback, json);
    return dfs;
  }

  // Use vector to maintain order
  std::vector<JsonType*> nodes_to_mutate;

  using Item = detail::JsonconsDfsItem<false>;
  vector<Item> stack;
  stack.emplace_back(json);

  do {
    unsigned segment_index = stack.back().segment_idx();
    const auto& path_segment = path[segment_index];

    // init or advance the current object
    Item::AdvanceResult res = stack.back().Advance(path_segment);
    if (res && res->first != nullptr) {
      JsonType* next = res->first;
      DVLOG(2) << "Handling now " << next->type() << " " << next->to_string();

      // We descent only if next is object or an array.
      if (IsRecursive(next->type())) {
        unsigned next_seg_id = res->second;

        if (next_seg_id + 1 < path.size()) {
          stack.emplace_back(next, next_seg_id);
        } else {
          // Terminal step: collect node for mutation
          nodes_to_mutate.push_back(next);
        }
      }
    } else {
      // If Advance failed (e.g., MISMATCH or OUT_OF_BOUNDS), the current node itself
      // might still be a terminal match because of the previous DESCENT segment.
      // Instead of mutating immediately (which could break ordering guarantees),
      // collect the node and defer mutation until after traversal.
      if (!res && segment_index > 0 && path[segment_index - 1].type() == SegmentType::DESCENT &&
          stack.back().get_segment_step() == 0) {
        if (segment_index + 1 == path.size()) {
          // Terminal node discovered via DESCENT – store for later processing.
          nodes_to_mutate.push_back(stack.back().obj_ptr());
        }
      }
      stack.pop_back();
    }
  } while (!stack.empty());

  // Apply mutations after DFS traversal is complete
  const PathSegment& terminal_segment = path.back();

  for (auto it = nodes_to_mutate.begin(); it != nodes_to_mutate.end(); ++it) {
    dfs.MutateStep(terminal_segment, callback, *it);
  }

  return dfs;
}

Dfs Dfs::Delete(absl::Span<const PathSegment> path, JsonType* json) {
  DCHECK(!path.empty());

  Dfs dfs;

  if (path.size() == 1) {
    dfs.DeleteStep(path[0], json);
    return dfs;
  }

  using Item = detail::JsonconsDfsItem<false>;
  vector<Item> stack;
  stack.emplace_back(json);

  do {
    unsigned segment_index = stack.back().segment_idx();
    const auto& path_segment = path[segment_index];

    Item::AdvanceResult res = stack.back().Advance(path_segment);
    if (res && res->first != nullptr) {
      JsonType* next = res->first;

      if (IsRecursive(next->type())) {
        unsigned next_seg_id = res->second;

        if (next_seg_id + 1 < path.size()) {
          stack.emplace_back(next, next_seg_id);
        } else {
          // Terminal step: perform deletion immediately
          // At this point we're in the deepest level, so safe to delete
          dfs.DeleteStep(path[next_seg_id], next);
        }
      }
    } else {
      if (!res && segment_index > 0 && path[segment_index - 1].type() == SegmentType::DESCENT &&
          stack.back().get_segment_step() == 0) {
        if (segment_index + 1 == path.size()) {
          // Terminal node discovered via DESCENT - safe to delete immediately
          // as we're backtracking
          dfs.DeleteStep(path[segment_index], stack.back().obj_ptr());
        }
      }
      stack.pop_back();
    }
  } while (!stack.empty());

  return dfs;
}

auto Dfs::PerformStep(const PathSegment& segment, const JsonType& node, const Cb& callback)
    -> nonstd::expected<void, MatchStatus> {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (!node.is_object())
        return make_unexpected(MISMATCH);

      auto it = node.find(segment.identifier());
      if (it != node.object_range().end()) {
        DoCall(callback, it->key(), it->value());
      }
    } break;
    case SegmentType::INDEX: {
      if (!node.is_array())
        return make_unexpected(MISMATCH);
      IndexExpr index = segment.index().Normalize(node.size());
      if (index.Empty()) {
        return make_unexpected(OUT_OF_BOUNDS);
      }
      for (; index.first <= index.second; ++index.first) {
        DoCall(callback, nullopt, node[index.first]);
      }
    } break;

    case SegmentType::DESCENT:
    case SegmentType::WILDCARD: {
      if (node.is_object()) {
        for (const auto& k_v : node.object_range()) {
          DoCall(callback, k_v.key(), k_v.value());
        }
      } else if (node.is_array()) {
        for (const auto& item : node.array_range()) {
          DoCall(callback, nullopt, item);
        }
      }
    } break;
    default:
      LOG(DFATAL) << "Unknown segment " << SegmentName(segment.type());
  }
  return {};
}

auto Dfs::MutateStep(const PathSegment& segment, const MutateCallback& cb, JsonType* node)
    -> nonstd::expected<void, MatchStatus> {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (!node->is_object())
        return make_unexpected(MISMATCH);

      auto it = node->find(segment.identifier());
      if (it != node->object_range().end()) {
        cb(it->key(), &it->value());
      }
    } break;
    case SegmentType::INDEX: {
      if (!node->is_array())
        return make_unexpected(MISMATCH);
      IndexExpr index = segment.index().Normalize(node->size());
      if (index.Empty()) {
        return make_unexpected(OUT_OF_BOUNDS);
      }

      while (index.first <= index.second) {
        auto it = node->array_range().begin() + index.first;
        cb(nullopt, &*it);
        ++index.first;
      }
    } break;

    case SegmentType::DESCENT:
    case SegmentType::WILDCARD: {
      if (node->is_object()) {
        auto it = node->object_range().begin();
        while (it != node->object_range().end()) {
          cb(it->key(), &it->value());
          ++it;
        }
      } else if (node->is_array()) {
        auto it = node->array_range().begin();
        while (it != node->array_range().end()) {
          cb(nullopt, &*it);
          ++it;
        }
      }
    } break;
    case SegmentType::FUNCTION:
      LOG(DFATAL) << "Function segment is not supported for mutation";
      break;
  }
  return {};
}

auto Dfs::DeleteStep(const PathSegment& segment, JsonType* node)
    -> nonstd::expected<void, MatchStatus> {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (!node->is_object())
        return make_unexpected(MISMATCH);

      auto it = node->find(segment.identifier());
      if (it != node->object_range().end()) {
        node->erase(it);
        ++matches_;
      }
    } break;
    case SegmentType::INDEX: {
      if (!node->is_array())
        return make_unexpected(MISMATCH);
      IndexExpr index = segment.index().Normalize(node->size());
      if (index.Empty()) {
        return make_unexpected(OUT_OF_BOUNDS);
      }

      // Delete from end to beginning to maintain indices
      for (int i = index.second; i >= index.first; --i) {
        auto it = node->array_range().begin() + i;
        node->erase(it);
        ++matches_;
      }
    } break;

    case SegmentType::DESCENT:
    case SegmentType::WILDCARD: {
      size_t initial_size = node->size();
      node->clear();
      matches_ += initial_size;
    } break;
    case SegmentType::FUNCTION:
      LOG(DFATAL) << "Function segment is not supported for deletion";
      break;
  }
  return {};
}

}  // namespace dfly::json::detail


================================================
FILE: src/core/json/detail/jsoncons_dfs.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <nonstd/expected.hpp>
#include <variant>

#include "core/json/detail/common.h"
#include "core/json/json_object.h"
#include "core/json/path.h"
#include "core/overloaded.h"

namespace dfly::json::detail {

// Describes the current state of the DFS traversal for a single node inside json hierarchy.
// Specifically it holds the parent object (can be a either a real object or an array),
// and the iterator to one of its children that is currently being traversed.
template <bool IsConst> class JsonconsDfsItem {
 public:
  using ValueType = std::conditional_t<IsConst, const JsonType, JsonType>;
  using Ptr = ValueType*;
  using Ref = ValueType&;
  using ObjIterator =
      std::conditional_t<IsConst, JsonType::const_object_iterator, JsonType::object_iterator>;
  using ArrayIterator =
      std::conditional_t<IsConst, JsonType::const_array_iterator, JsonType::array_iterator>;

  using DepthState = std::pair<Ptr, unsigned>;  // object, segment_idx pair
  using AdvanceResult = nonstd::expected<DepthState, MatchStatus>;

  JsonconsDfsItem(Ptr o, unsigned idx = 0) : depth_state_(o, idx) {
  }

  // Returns the next object to traverse
  // or null if traverse was exhausted or the segment does not match.
  AdvanceResult Advance(const PathSegment& segment);

  unsigned segment_idx() const {
    return depth_state_.second;
  }

  Ptr obj_ptr() const {
    return depth_state_.first;
  }

  unsigned get_segment_step() const {
    return segment_step_;
  }

 private:
  static bool ShouldIterateAll(SegmentType type) {
    return type == SegmentType::WILDCARD || type == SegmentType::DESCENT;
  }

  ObjIterator Begin() const {
    if constexpr (IsConst) {
      return obj().object_range().cbegin();
    } else {
      return obj().object_range().begin();
    }
  }

  ArrayIterator ArrBegin() const {
    if constexpr (IsConst) {
      return obj().array_range().cbegin();
    } else {
      return obj().array_range().begin();
    }
  }

  ArrayIterator ArrEnd() const {
    if constexpr (IsConst) {
      return obj().array_range().cend();
    } else {
      return obj().array_range().end();
    }
  }

  Ref obj() const {
    return *depth_state_.first;
  }

  DepthState Next(Ref obj) const {
    return {&obj, depth_state_.second + segment_step_};
  }

  DepthState Exhausted() const {
    return {nullptr, 0};
  }

  AdvanceResult Init(const PathSegment& segment);

  // For most operations we advance the path segment by 1 when we descent into the children.
  unsigned segment_step_ = 1;

  DepthState depth_state_;
  std::variant<std::monostate, ObjIterator, std::pair<ArrayIterator, ArrayIterator>> state_;
};

// Traverses a json object according to the given path and calls the callback for each matching
// field. With DESCENT segments it will match 0 or more fields in depth.
// MATCH(node, DESCENT|SUFFIX) = MATCH(node, SUFFIX) ||
// { MATCH(node->child, DESCENT/SUFFIX) for each child of node }

class Dfs {
 public:
  using Cb = PathCallback;

  // TODO: for some operations we need to know the type of mismatches.
  static Dfs Traverse(absl::Span<const PathSegment> path, const JsonType& json, const Cb& callback);
  static Dfs Mutate(absl::Span<const PathSegment> path, const MutateCallback& callback,
                    JsonType* json);

  // Simplified deletion without callback - more efficient for deletion operations
  static Dfs Delete(absl::Span<const PathSegment> path, JsonType* json);

  unsigned matches() const {
    return matches_;
  }

 private:
  bool TraverseImpl(absl::Span<const PathSegment> path, const Cb& callback);

  nonstd::expected<void, MatchStatus> PerformStep(const PathSegment& segment, const JsonType& node,
                                                  const Cb& callback);

  nonstd::expected<void, MatchStatus> MutateStep(const PathSegment& segment,
                                                 const MutateCallback& cb, JsonType* node);

  nonstd::expected<void, MatchStatus> DeleteStep(const PathSegment& segment, JsonType* node);

  void DoCall(const Cb& callback, std::optional<std::string_view> key, const JsonType& node) {
    ++matches_;
    callback(key, node);
  }

  unsigned matches_ = 0;
};

template <bool IsConst>
auto JsonconsDfsItem<IsConst>::Advance(const PathSegment& segment) -> AdvanceResult {
  AdvanceResult result = std::visit(  // line break
      Overloaded{
          [&](std::monostate) { return Init(segment); },  // Init state
          [&](ObjIterator& it) -> AdvanceResult {
            if (!ShouldIterateAll(segment.type()))
              return Exhausted();

            ++it;
            return it == obj().object_range().end() ? Exhausted() : Next(it->value());
          },
          [&](std::pair<ArrayIterator, ArrayIterator>& pair) -> AdvanceResult {
            if (pair.first == pair.second)
              return Exhausted();
            ++pair.first;
            return Next(*pair.first);
          },
      },
      state_);
  return result;
}

template <bool IsConst>
auto JsonconsDfsItem<IsConst>::Init(const PathSegment& segment) -> AdvanceResult {
  switch (segment.type()) {
    case SegmentType::IDENTIFIER: {
      if (obj().is_object()) {
        auto it = obj().find(segment.identifier());
        if (it != obj().object_range().end()) {
          state_ = it;
          return DepthState{&it->value(), depth_state_.second + 1};
        } else {
          return Exhausted();
        }
      }
      break;
    }
    case SegmentType::INDEX:
      if (obj().is_array()) {
        IndexExpr index = segment.index().Normalize(obj().size());
        if (index.Empty()) {
          return nonstd::make_unexpected(OUT_OF_BOUNDS);
        }

        auto start = ArrBegin() + index.first, end = ArrBegin() + index.second;
        state_ = std::make_pair(start, end);
        return Next(*start);
      }
      break;
    case SegmentType::DESCENT:
      if (segment_step_ == 1) {
        // first time, branching to return the same object but with the next segment,
        // exploring the path of ignoring the DESCENT operator.
        // Also, shift the state (segment_step) to bypass this branch next time.
        segment_step_ = 0;
        return DepthState{depth_state_.first, depth_state_.second + 1};
      }

      // Now traverse all the children but do not progress with segment path.
      // This is why segment_step_ is set to 0.
      [[fallthrough]];
    case SegmentType::WILDCARD: {
      if (obj().is_object()) {
        jsoncons::range rng = obj().object_range();
        if (rng.cbegin() == rng.cend()) {
          return Exhausted();
        }
        state_ = Begin();
        return Next(Begin()->value());
      }

      if (obj().is_array()) {
        auto start = ArrBegin(), end = ArrEnd();
        if (start == end) {
          return Exhausted();
        }
        state_ = std::make_pair(start, end - 1);  // end is inclusive
        return Next(*start);
      }
      break;
    }
    default:
      LOG(DFATAL) << "Unknown segment " << SegmentName(segment.type());
  }  // end switch

  return nonstd::make_unexpected(MISMATCH);
}

}  // namespace dfly::json::detail


================================================
FILE: src/core/json/driver.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "src/core/json/driver.h"

#include <absl/strings/str_cat.h>

#include "base/logging.h"
#include "src/core/json/lexer_impl.h"
#include "src/core/overloaded.h"

using namespace std;

namespace dfly::json {

namespace {

class SingleValueImpl : public AggFunction {
  Result GetResultImpl() const final {
    return val_;
  }

 protected:
  void Init(const JsonType& src) {
    if (src.is_double()) {
      val_.emplace<double>(src.as_double());
    } else {
      val_.emplace<int64_t>(src.as<int64_t>());
    }
  }

  void Init(const flexbuffers::Reference src) {
    if (src.IsFloat()) {
      val_.emplace<double>(src.AsDouble());
    } else {
      val_.emplace<int64_t>(src.AsInt64());
    }
  }

  Result val_;
};

class MaxImpl : public SingleValueImpl {
  bool ApplyImpl(const JsonType& src) final {
    if (!src.is_number()) {
      return false;
    }

    visit(Overloaded{
              [&](monostate) { Init(src); },
              [&](double d) { val_ = max(d, src.as_double()); },
              [&](int64_t i) {
                if (src.is_double())
                  val_ = max(double(i), src.as_double());
                else
                  val_ = max(i, src.as<int64_t>());
              },
          },
          val_);

    return true;
  }

  bool ApplyImpl(flexbuffers::Reference src) final {
    if (!src.IsNumeric()) {
      return false;
    }

    visit(Overloaded{
              [&](monostate) { Init(src); },
              [&](double d) { val_ = max(d, src.AsDouble()); },
              [&](int64_t i) {
                if (src.IsFloat())
                  val_ = max(double(i), src.AsDouble());
                else
                  val_ = max(i, src.AsInt64());
              },
          },
          val_);
    return true;
  }
};

class MinImpl : public SingleValueImpl {
 private:
  bool ApplyImpl(const JsonType& src) final {
    if (!src.is_number()) {
      return false;
    }

    visit(Overloaded{
              [&](monostate) { Init(src); },
              [&](double d) { val_ = min(d, src.as_double()); },
              [&](int64_t i) {
                if (src.is_double())
                  val_ = min(double(i), src.as_double());
                else
                  val_ = min(i, src.as<int64_t>());
              },
          },
          val_);

    return true;
  }

  bool ApplyImpl(flexbuffers::Reference src) final {
    if (!src.IsNumeric()) {
      return false;
    }

    visit(Overloaded{
              [&](monostate) { Init(src); },
              [&](double d) { val_ = min(d, src.AsDouble()); },
              [&](int64_t i) {
                if (src.IsFloat())
                  val_ = min(double(i), src.AsDouble());
                else
                  val_ = min(i, src.AsInt64());
              },
          },
          val_);
    return true;
  }
};

class AvgImpl : public AggFunction {
 private:
  bool ApplyImpl(const JsonType& src) final {
    if (!src.is_number()) {
      return false;
    }
    sum_ += src.as_double();
    count_++;

    return true;
  }

  bool ApplyImpl(flexbuffers::Reference src) final {
    if (!src.IsNumeric()) {
      return false;
    }
    sum_ += src.AsDouble();
    count_++;

    return true;
  }

  Result GetResultImpl() const final {
    DCHECK_GT(count_, 0u);  // AggFunction guarantees that
    return Result(double(sum_ / count_));
  }

  double sum_ = 0;
  uint64_t count_ = 0;
};

}  // namespace

Driver::Driver() : lexer_(make_unique<Lexer>()) {
}

Driver::~Driver() {
}

void Driver::SetInput(string str) {
  cur_str_ = std::move(str);
  lexer_->in(cur_str_);
  path_.clear();
}

void Driver::ResetScanner() {
  lexer_ = make_unique<Lexer>();
}

void Driver::AddFunction(string_view fname) {
  if (!path_.empty()) {
    throw Parser::syntax_error(lexer_->location(),
                               "function can be only at the beginning of the path");
  }

  shared_ptr<AggFunction> func;
  if (fname == "max") {
    func = make_shared<MaxImpl>();
  } else if (fname == "min") {
    func = make_shared<MinImpl>();
  } else if (fname == "avg") {
    func = make_shared<AvgImpl>();
  } else {
    throw Parser::syntax_error(lexer_->location(), absl::StrCat("Unknown function: ", fname));
  }
  path_.emplace_back(std::move(func));
}

}  // namespace dfly::json


================================================
FILE: src/core/json/driver.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <string>

#include "src/core/json/path.h"

namespace dfly {
namespace json {

class Lexer;
class location;  // from jsonpath_grammar.hh

class Driver {
 public:
  Driver();
  virtual ~Driver();

  Lexer* lexer() {
    return lexer_.get();
  }

  void SetInput(std::string str);
  void ResetScanner();
  virtual void Error(const location& l, const std::string& msg) = 0;

  void AddIdentifier(const std::string& identifier) {
    AddSegment(PathSegment(SegmentType::IDENTIFIER, identifier));
  }

  void AddFunction(std::string_view fname);

  void AddWildcard() {
    AddSegment(PathSegment(SegmentType::WILDCARD));
  }

  void AddSegment(PathSegment segment) {
    path_.push_back(std::move(segment));
  }

  Path TakePath() {
    return std::move(path_);
  }

 private:
  Path path_;
  std::string cur_str_;
  std::unique_ptr<Lexer> lexer_;
};

}  // namespace json
}  // namespace dfly


================================================
FILE: src/core/json/interned_blob_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#include "base/gtest.h"
#include "core/detail/stateless_allocator.h"
#include "core/json/detail/interned_string.h"
#include "core/mi_memory_resource.h"

using namespace std::literals;
using namespace dfly;

namespace {

MiMemoryResource* MemoryResource() {
  thread_local mi_heap_t* heap = mi_heap_new();
  thread_local MiMemoryResource memory_resource{heap};
  return &memory_resource;
}

}  // namespace

class InternedBlobTest : public testing::Test {
 protected:
  void SetUp() override {
    InitTLStatelessAllocMR(MemoryResource());
  }

  void TearDown() override {
    CleanupStatelessAllocMR();
  }
};

using detail::BlobPtr;
using detail::InternedBlobHandle;

TEST_F(InternedBlobTest, MemoryUsage) {
  const auto* mr = MemoryResource();
  const auto usage_before = mr->used();
  InternedBlobHandle blob = InternedBlobHandle::Create("1234567");
  const auto usage_after = mr->used();
  const auto expected_delta = blob.MemUsed();
  EXPECT_EQ(usage_before + expected_delta, usage_after);
  InternedBlobHandle::Destroy(blob);
  EXPECT_EQ(usage_before, mr->used());
}

void CheckBlob(InternedBlobHandle& blob, std::string_view expected, uint32_t ref_cnt = 1) {
  EXPECT_EQ(blob, expected);
  EXPECT_EQ(blob.Size(), expected.size());
  EXPECT_EQ(blob.RefCount(), ref_cnt);
}

TEST_F(InternedBlobTest, Ctors) {
  auto blob = InternedBlobHandle::Create("");
  EXPECT_EQ(blob.Size(), 0);
  EXPECT_FALSE(blob);
  InternedBlobHandle::Destroy(blob);

  InternedBlobHandle src = InternedBlobHandle::Create("foobar");
  InternedBlobHandle dest{src};
  CheckBlob(dest, "foobar");
  CheckBlob(src, "foobar");
  InternedBlobHandle::Destroy(dest);
}

TEST_F(InternedBlobTest, Comparison) {
  auto blob = InternedBlobHandle::Create("foobar");
  constexpr detail::BlobEq blob_eq;

  EXPECT_TRUE(blob_eq(blob, "foobar"));
  EXPECT_TRUE(blob_eq("foobar", blob));

  InternedBlobHandle second = blob;
  second.IncrRefCount();

  EXPECT_TRUE(blob_eq(blob, second));
  InternedBlobHandle::Destroy(blob);
}

TEST_F(InternedBlobTest, RefCounts) {
  auto blob = InternedBlobHandle::Create("1234567");
  EXPECT_EQ(blob.RefCount(), 1);
  blob.DecrRefCount();
  EXPECT_DEBUG_DEATH(blob.DecrRefCount(), "Attempt to decrease zero refcount");
  InternedBlobHandle::Destroy(blob);
}

TEST_F(InternedBlobTest, Pool) {
  detail::InternedBlobPool pool{};
  InternedBlobHandle b1 = InternedBlobHandle::Create("foo");
  pool.emplace(b1);

  // search by string view
  EXPECT_TRUE(pool.contains("foo"));

  // increment the refcount. The blob is still found because the hasher only looks at the string
  b1.IncrRefCount();
  EXPECT_TRUE(pool.contains("foo"));
  InternedBlobHandle::Destroy(b1);
}

using detail::InternedString;

namespace {

void StringCheck(const InternedString& s, const char* ptr) {
  std::string_view sv{ptr};

  EXPECT_STREQ(s.data(), ptr);
  EXPECT_STREQ(s.c_str(), ptr);

  EXPECT_EQ(s.size(), sv.size());
  EXPECT_EQ(s.length(), sv.size());

  EXPECT_EQ(std::string_view(s), sv);
  EXPECT_EQ(std::string_view(s.data(), s.size()), sv);
  EXPECT_EQ(std::string_view(s.c_str(), s.size()), sv);
}

}  // namespace

TEST_F(InternedBlobTest, StringPool) {
  size_t hits = GetInternedStringStats().hits;
  size_t misses = GetInternedStringStats().misses;
  const auto& pool = InternedString::GetPoolRef();
  EXPECT_TRUE(pool.empty());
  {
    const InternedString s1{"foobar"};
    StringCheck(s1, "foobar");
    EXPECT_EQ(pool.size(), 1);
    misses += 1;
    EXPECT_EQ(GetInternedStringStats().misses, misses);
    EXPECT_EQ(GetInternedStringStats().pool_entries, 1);
    {
      const InternedString s2{"foobar"};
      StringCheck(s2, "foobar");
      EXPECT_EQ(pool.size(), 1);
      EXPECT_EQ(GetInternedStringStats().misses, misses);
      EXPECT_EQ(GetInternedStringStats().pool_entries, 1);
      hits += 1;
      EXPECT_EQ(GetInternedStringStats().hits, hits);
    }
    EXPECT_EQ(pool.size(), 1);
  }
  EXPECT_TRUE(pool.empty());
  EXPECT_EQ(GetInternedStringStats().misses, misses);
  EXPECT_EQ(GetInternedStringStats().pool_entries, 0);
  EXPECT_EQ(GetInternedStringStats().pool_bytes, 0);
  EXPECT_EQ(GetInternedStringStats().hits, hits);

  std::vector<InternedString> strings;
  for (auto i = 0; i < 1000; ++i) {
    strings.emplace_back(std::to_string(i));
  }

  EXPECT_EQ(pool.size(), 1000);
  EXPECT_EQ(GetInternedStringStats().pool_entries, 1000);
  misses += 1000;
  EXPECT_EQ(GetInternedStringStats().misses, misses);
  strings.clear();
  EXPECT_TRUE(pool.empty());
  EXPECT_EQ(GetInternedStringStats().pool_entries, 0);
  EXPECT_EQ(GetInternedStringStats().pool_bytes, 0);

  for (auto i = 0; i < 1000; ++i) {
    strings.emplace_back("zyx");
  }
  EXPECT_EQ(pool.size(), 1);
  EXPECT_EQ(GetInternedStringStats().pool_entries, 1);
  hits += 999;
  EXPECT_EQ(GetInternedStringStats().hits, hits);
  strings.clear();
  EXPECT_TRUE(pool.empty());

  InternedString empty;
  EXPECT_TRUE(pool.empty());
}

TEST_F(InternedBlobTest, StringApi) {
  InternedString s1{"foobar"};
  EXPECT_EQ(std::string_view{s1}, "foobar"sv);
  StringCheck(s1, "foobar");

  const auto& pool = InternedString::GetPoolRef();
  InternedString s2{"psi"};
  StringCheck(s2, "psi");

  EXPECT_EQ(pool.size(), 2);

  // swap pointers into the pool
  s1.swap(s2);

  EXPECT_EQ(pool.size(), 2);

  StringCheck(s1, "psi");
  StringCheck(s2, "foobar");

  EXPECT_NE(s1, s2);
  EXPECT_EQ(s1, s1);
  // foobar < psi lexicographically
  EXPECT_LT(s2, s1);
}

TEST_F(InternedBlobTest, StringCtors) {
  const auto& pool = InternedString::GetPoolRef();
  InternedString s1{"foobar"};
  EXPECT_EQ(pool.size(), 1);

  // move ctor
  auto to = std::move(s1);
  EXPECT_EQ(pool.size(), 1);

  StringCheck(to, "foobar");
  StringCheck(s1, "");

  // These tests exercise self-move and self-copy behavior. This causes errors on newer GCC when
  // warnings are treated as errors (on CI). We need to version gate this because on older GCC this
  // check is not present.
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 13
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wself-move"
#endif
  to = std::move(to);
  StringCheck(to, "foobar");

  auto copied = to;
  EXPECT_EQ(pool.size(), 1);

  StringCheck(to, "foobar");
  StringCheck(copied, "foobar");

  copied = copied;
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 13
#pragma GCC diagnostic pop
#endif
  StringCheck(copied, "foobar");
  EXPECT_EQ(pool.size(), 1);

  const auto* mr = MemoryResource();
  const auto before = mr->used();

  std::string_view sv{"......."};
  // ptr and size with some allocator, allocator will be ignored
  InternedString x{sv.data(), sv.size(), std::allocator<char>{}};
  StringCheck(x, ".......");
  EXPECT_EQ(pool.size(), 2);

  EXPECT_GE(mr->used(), before + x.MemUsed());

  InternedString k{sv.begin(), sv.end()};
  StringCheck(k, ".......");
  EXPECT_EQ(pool.size(), 2);
}

TEST_F(InternedBlobTest, PoolShrink) {
  InternedString::ResetPool();
  std::vector<InternedString> v;
  const auto& ref = InternedString::GetPoolRef();
  for (const auto i : std::views::iota(0, 1000))
    v.emplace_back(std::to_string(i));

  std::vector<size_t> caps;

  constexpr auto jitter = std::views::iota(0, 6);

  while (!v.empty()) {
    constexpr auto step = 20;
    const auto from = v.end() - std::min<size_t>(step, v.size());
    v.erase(from, v.end());
    // Interleaving inserts right after a possible resize, to ensure we don't have to increase
    // capacity right after a shrink. The caps vector should remain monotonically decreasing.
    for (const auto j : jitter)
      v.emplace_back(std::to_string(10000 + j));
    caps.push_back(ref.capacity());
    for (size_t i = 0; i < jitter.size(); ++i)
      v.pop_back();
  }

  EXPECT_EQ(ref.load_factor(), 0);
  EXPECT_TRUE(std::ranges::is_sorted(caps, std::ranges::greater{}));

  // Check that capacity changes very infrequently
  size_t cap_trans = 0;
  for (size_t i = 1; i < caps.size(); ++i) {
    if (caps[i] != caps[i - 1])
      ++cap_trans;
  }
  EXPECT_LT(cap_trans, caps.size() / 2);
}


================================================
FILE: src/core/json/json_object.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/json/json_object.h"

#include <stack>

#include "base/logging.h"
#include "core/page_usage/page_usage_stats.h"

using namespace jsoncons;

namespace {

template <typename T>
std::optional<T> ParseWithDecoder(std::string_view input, json_decoder<T>&& decoder) {
  std::error_code ec;
  auto JsonErrorHandler = [](json_errc ec, const ser_context&) {
    VLOG(1) << "Error while decode JSON: " << make_error_code(ec).message();
    return false;
  };

  // The maximum allowed JSON nesting depth is 64.
  // The limit was reduced from 256 to 64. This change is reasonable, as most documents contain
  // no more than 20-30 levels of nesting. In the test case, over 128 levels were used, causing
  // the parser to enter a long stall due to excessive resource consumption. Even a limit of 128
  // does not mitigate the issue. A limit of 64 is a sensible compromise.
  // See https://github.com/dragonflydb/dragonfly/issues/5028
  const uint32_t json_nesting_depth_limit = 64;

  /* The maximum possible JSON nesting depth is either the specified json_nesting_depth_limit or
     half of the input size. Since nesting a JSON object requires at least 2 characters. */
  auto parser_options = json_options{}.max_nesting_depth(
      std::min(json_nesting_depth_limit, uint32_t(input.size() / 2)));

  json_parser parser(parser_options, JsonErrorHandler);

  parser.update(input);
  parser.finish_parse(decoder, ec);

  if (!ec && decoder.is_valid()) {
    return decoder.get_result();
  }
  return std::nullopt;
}

using namespace dfly;

// The following two functions allocate a string-based object by copying data to a fresh memory
// page. Then the move-assignment operator swaps it with the input node (swap_l_r in jsoncons), and
// the temporary is destroyed at the end of the scope.
bool DefragmentByteString(JsonType& j, PageUsage* page_usage) {
  const auto& byte_storage = j.cast<JsonType::byte_string_storage>();
  if (byte_storage.length() == 0 ||
      !page_usage->IsPageForObjectUnderUtilized(const_cast<uint8_t*>(byte_storage.data())))
    return false;

  const byte_string_view bsv{byte_storage.data(), byte_storage.length()};
  if (j.tag() == semantic_tag::ext) {
    j = JsonType(byte_string_arg, bsv, j.ext_tag(), byte_storage.get_allocator());
    return true;
  }

  j = JsonType(byte_string_arg, bsv, j.tag(), byte_storage.get_allocator());
  return true;
}

bool DefragmentLongString(JsonType& j, PageUsage* page_usage) {
  const auto& str_storage = j.cast<JsonType::long_string_storage>();
  if (str_storage.length() == 0 ||
      !page_usage->IsPageForObjectUnderUtilized(const_cast<char*>(str_storage.data())))
    return false;

  JsonType::string_view_type svt{str_storage.data(), str_storage.length()};
  j = JsonType(svt, j.tag(), str_storage.get_allocator());
  return true;
}

// Allocates a new json object of type json_object_arg, with fresh memory allocation for its
// contained vector of key value pairs. Then moves members from j to this new object. Finally j is
// swapped with the new object.
bool DefragmentJsonObject(JsonType& j, PageUsage* page_usage) {
  auto& object = j.cast<JsonType::object_storage>().value();
  if (object.empty() || !page_usage->IsPageForObjectUnderUtilized(&*object.begin()))
    return false;

  // Creates a fresh object and reserves space for the underlying vector.
  JsonType new_node{json_object_arg, j.tag(), object.get_allocator()};
  new_node.reserve(object.size());

  for (auto& member : object) {
    // The member values are JsonType themselves, they just wrap pointers to actual storage.
    // Their move invokes the move ctor in jsoncons, which will move the value wrappers to new_node,
    // and leave the original in `j` holding references to `null_storage` type, see
    // `uninitialized_move_a` in jsoncons. The member key (a string) is not moved but copied into
    // new_node members.
    new_node.try_emplace(member.key(), std::move(member.value()));
  }

  // Invokes move assignment. A swap is performed, and new_node now holds null_storage
  // references instead of `j`. It will be destroyed on leaving scope, cleaning up its memory.
  j = std::move(new_node);
  return true;
}

// Same as DefragmentJsonObject except uses an array object. The contained members are moved
// similarly, and on exit the old node is destroyed.
bool DefragmentJsonArray(JsonType& j, PageUsage* page_usage) {
  auto& array = j.cast<JsonType::array_storage>().value();
  if (array.empty() || !page_usage->IsPageForObjectUnderUtilized(&*array.begin()))
    return false;

  JsonType new_node{json_array_arg, j.tag(), array.get_allocator()};
  new_node.reserve(array.size());

  for (JsonType& member : array) {
    new_node.push_back(std::move(member));
  }

  j = std::move(new_node);
  return true;
}

}  // namespace

namespace dfly {

std::optional<TmpJson> JsonFromString(std::string_view input) {
  return ParseWithDecoder(input, json_decoder<TmpJson>{});
}

optional<JsonType> ParseJsonUsingShardHeap(string_view input) {
  return ParseWithDecoder(input, json_decoder<JsonType>{StatelessAllocator<char>{}});
}

bool Defragment(JsonType& j, PageUsage* page_usage) {
  bool did_defragment = false;
  // stack-based traversal inspired from jsoncons::basic_json::compute_memory_size
  std::stack<JsonType*> stack;
  stack.push(&j);

  while (!stack.empty()) {
    JsonType* current = stack.top();
    stack.pop();

    const json_storage_kind storage_kind = current->storage_kind();
    switch (storage_kind) {
      case json_storage_kind::byte_str:
        did_defragment |= DefragmentByteString(*current, page_usage);
        break;
      case json_storage_kind::long_str:
        did_defragment |= DefragmentLongString(*current, page_usage);
        break;
      case json_storage_kind::object: {
        did_defragment |= DefragmentJsonObject(*current, page_usage);
        auto& object = current->cast<JsonType::object_storage>().value();
        for (auto& member : object) {
          stack.push(&member.value());
        }
        break;
      }
      case json_storage_kind::array: {
        did_defragment |= DefragmentJsonArray(*current, page_usage);
        auto& array = current->cast<JsonType::array_storage>().value();
        for (auto& member : array) {
          stack.push(&member);
        }
        break;
      }
      default:
        DCHECK(is_trivial_storage(storage_kind))
            << "unexpected non trivial storage type:" << storage_kind;
        break;
    }
  }
  return did_defragment;
}

size_t ComputeMemorySize(const JsonType& j) {
  std::stack<const JsonType*> stack;
  stack.push(&j);

  size_t total = 0;
  auto add_used_memory = [&total](const auto* data) {
    if (data)
      total += mi_usable_size(data);
  };

  using enum json_storage_kind;
  while (!stack.empty()) {
    const auto* current = stack.top();
    stack.pop();

    const auto storage = current->storage_kind();
    if (is_trivial_storage(storage))
      continue;

    switch (storage) {
      case object: {
        const auto& object_storage = current->cast<JsonType::object_storage>().value();
        if (!object_storage.empty())
          add_used_memory(&*object_storage.begin());
        for (const auto& member : object_storage) {
          total += member.key().MemUsed();
          const auto& value = member.value();
          if (!is_trivial_storage(value.storage_kind()))
            stack.push(&value);
        }
      } break;
      case array: {
        const auto& arr = current->cast<JsonType::array_storage>().value();
        if (!arr.empty())
          add_used_memory(&arr[0]);
        for (const auto& elem : arr)
          if (!is_trivial_storage(elem.storage_kind()))
            stack.push(&elem);
      } break;
      case long_str:
        add_used_memory(current->cast<JsonType::long_string_storage>().data());
        break;
      case byte_str:
        add_used_memory(current->cast<JsonType::byte_string_storage>().data());
        break;
      default:
        DCHECK(false) << "unexpected non trivial storage type:" << storage;
    }
  }
  return total;
}

}  // namespace dfly


================================================
FILE: src/core/json/json_object.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <version>  // for __cpp_lib_to_chars macro.

#include "core/detail/stateless_allocator.h"
#include "core/json/detail/interned_string.h"

// std::from_chars is available in C++17 if __cpp_lib_to_chars is defined.
#if __cpp_lib_to_chars >= 201611L
#define JSONCONS_HAS_STD_FROM_CHARS 1
#endif

#include <jsoncons/json.hpp>
#include <jsoncons_ext/jsonpath/jsonpath.hpp>
#include <memory>
#include <optional>
#include <string_view>

namespace dfly {
class PageUsage;

using TmpJson = jsoncons::json;

struct InternedStringPolicy : jsoncons::sorted_policy {
  template <typename, typename, typename> using member_key = detail::InternedString;
};

using JsonType = jsoncons::basic_json<char, InternedStringPolicy, StatelessAllocator<char>>;

// A helper type to use in template functions which are expected to work with both TmpJson
// and JsonType
template <typename Allocator>
using JsonWithAllocator = jsoncons::basic_json<char, jsoncons::sorted_policy, Allocator>;

// Parses string into JSON. Any allocatons are done using the std allocator. This method should be
// used for generic JSON parsing, in particular, it should not be used to parse objects which will
// be stored in the db, as the backing storage is not managed by mimalloc.
std::optional<TmpJson> JsonFromString(std::string_view input);

// Parses string into JSON, using mimalloc heap for allocations. This method should only be used on
// shards where mimalloc heap is initialized.
std::optional<JsonType> ParseJsonUsingShardHeap(std::string_view input);

// Defragments the given json object by traversing its tree structure non-recursively, examining
// nodes and defragmenting as needed. Returns true if any object within the node was reallocated
bool Defragment(JsonType& j, PageUsage* page_usage);

template <typename Json = JsonType>
auto MakeJsonPathExpr(std::string_view path, std::error_code& ec)
    -> jsoncons::jsonpath::jsonpath_expression<Json> {
  using ResultAllocT = typename Json::allocator_type;
  using TmpAllocT = std::allocator<char>;
  using AllocSetT = jsoncons::allocator_set<ResultAllocT, TmpAllocT>;
  return jsoncons::jsonpath::make_expression<Json, TmpAllocT>(AllocSetT(), path, ec);
}

size_t ComputeMemorySize(const JsonType& j);

}  // namespace dfly


================================================
FILE: src/core/json/json_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <gmock/gmock.h>

#include <jsoncons/json.hpp>
#include <jsoncons_ext/jsonpath/jsonpath.hpp>
#include <memory_resource>

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly {
using namespace jsoncons;
using namespace jsoncons::literals;
using namespace testing;

class JsonTest : public ::testing::Test {
 protected:
  JsonTest() {
  }
};

TEST_F(JsonTest, Basic) {
  std::string data = R"(
    {
       "application": "hiking",
       "reputons": [
       {
           "rater": "HikingAsylum",
           "assertion": "advanced",
           "rated": "Marilyn C",
           "rating": 0.90,
           "confidence": 0.99
         }
       ]
    }
)";

  pmr::json j = pmr::json::parse(data);
  EXPECT_TRUE(j.contains("reputons"));
  jsonpath::json_replace(j, "$.reputons[*].rating", 1.1);
  EXPECT_EQ(1.1, j["reputons"][0]["rating"].as_double());
}

TEST_F(JsonTest, SetEmpty) {
  pmr::json dest{json_object_arg};  // crashes on UB without the tag.
  dest["bar"] = "foo";
}

TEST_F(JsonTest, Query) {
  json j = R"(
{"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}}
)"_json;

  json out = jsonpath::json_query(j, "$..*");
  EXPECT_EQ(R"([{},{"a":1},{"a":1,"b":2},1,1,2])"_json, out);

  json j2 = R"(
    {"firstName":"John","lastName":"Smith","age":27,"weight":135.25,"isAlive":true,"address":{"street":"21 2nd Street","city":"New York","state":"NY","zipcode":"10021-3100"},"phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":"646 555-4567"}],"children":[],"spouse":null}
  )"_json;

  // json_query always returns arrays.
  // See here: https://github.com/danielaparker/jsoncons/issues/82
  // Therefore we are going to only support the "extended" semantics
  // of json API (as they are called in AWS documentation).
  out = jsonpath::json_query(j2, "$.address");
  EXPECT_EQ(R"([{"street":"21 2nd Street","city":"New York",
      "state":"NY","zipcode":"10021-3100"}])"_json,
            out);
}

TEST_F(JsonTest, Errors) {
  auto cb = [](json_errc, const ser_context&) { return false; };

  json_decoder<json> decoder;
  basic_json_parser<char> parser(basic_json_decode_options<char>{}, cb);

  std::string_view input{"\000bla"};
  parser.update(input.data(), input.size());

  std::error_code ec;
  parser.parse_some(decoder, ec);

  EXPECT_TRUE(ec);

  EXPECT_EQ(ec, json_errc::unexpected_eof);
  EXPECT_FALSE(decoder.is_valid());
}

TEST_F(JsonTest, Path) {
  std::error_code ec;
  json j1 = R"({"field" : 1, "field-dash": 2})"_json;

  auto expr = jsonpath::make_expression<json>("$.field", ec);
  EXPECT_FALSE(ec);

  expr.evaluate(j1, [](const std::string& path, const json& val) {
    ASSERT_EQ("$['field']", path);
    ASSERT_EQ(1, val.as<int>());
  });

  expr = jsonpath::make_expression<json>("$.field-dash", ec);
  ASSERT_FALSE(ec);  // parses '-'

  expr.evaluate(j1, [](const std::string& path, const json& val) {
    ASSERT_EQ("$['field-dash']", path);
    ASSERT_EQ(2, val.as<int>());
  });

  int called = 0;
  jsonpath::json_query(j1, "max($.*)", [&](const std::string& path, const json& val) {
    EXPECT_EQ("$", path);
    ASSERT_EQ(2, val.as<int>());
    ++called;
  });
  EXPECT_EQ(1, called);

  auto res = jsonpath::json_query(j1, "max($.*)");
  ASSERT_TRUE(res.is_array() && res.size() == 1);
  EXPECT_EQ(2, res[0].as<int>());

  called = 0;
  json j2 = R"({"field" : [1, 2, 3, 4, 5]})"_json;
  jsonpath::json_query(j2, "$.field[1:2]", [&](const std::string& path, const json& val) {
    EXPECT_EQ("$['field'][1]", path);
    ASSERT_EQ(2, val.as<int>());
    ++called;
  });
  EXPECT_EQ(1, called);

  std::vector<int> vals;
  jsonpath::json_query(j2, "$.field[1:]", [&](const std::string& path, const json& val) {
    vals.push_back(val.as<int>());
  });
  EXPECT_THAT(vals, ElementsAre(2, 3, 4, 5));

  jsonpath::json_query(j2, "$.field[-1]", [&](const std::string& path, const json& val) {
    EXPECT_EQ(5, val.as<int>());
  });

  jsonpath::json_query(j2, "$.field[-6:1]", [&](const std::string& path, const json& val) {
    EXPECT_EQ(1, val.as<int>());
  });
}

TEST_F(JsonTest, Delete) {
  json j1 = R"({"c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}, "e": [1,2]})"_json;

  auto deleter = [](const json::string_view_type& path, json& val) {
    LOG(INFO) << "path: " << path;
    // val.evaluate();
    // if (val.is_object())
    //   val.erase(val.object_range().begin(), val.object_range().end());
  };
  jsonpath::json_replace(j1, "$.d.*", deleter);

  auto expr = jsonpath::make_expression<json>("$.d.*");

  auto callback = [](const std::string& path, const json& val) {
    LOG(INFO) << path << ": " << val << "\n";
  };
  expr.evaluate(j1, callback, jsonpath::result_options::path);
  auto it = j1.find("d");
  ASSERT_TRUE(it != j1.object_range().end());

  it->value().erase("a");
  EXPECT_EQ(R"({"c":{"a":1, "b":2}, "d":{"b":2, "c":3}, "e": [1,2]})"_json, j1);
}

TEST_F(JsonTest, JsonWithPolymorhicAllocator) {
  char buffer[1024] = {};
  std::pmr::monotonic_buffer_resource pool{std::data(buffer), std::size(buffer)};
  std::pmr::polymorphic_allocator<char> alloc(&pool);

  std::string input = R"(
{ "store": {
    "book": [
      { "category": "Roman",
        "author": "Felix Lobrecht",
        "title": "Sonne und Beton",
        "price": 12.99
      },
      { "category": "Roman",
        "author": "Thomas F. Schneider",
        "title": "Im Westen nichts Neues",
        "price": 10.00
      }
    ]
  }
}
)";

  auto j1 = pmr::json::parse(combine_allocators(alloc), input, json_options{});
  EXPECT_EQ("Roman", j1["store"]["book"][0]["category"].as_string());
  EXPECT_EQ("Felix Lobrecht", j1["store"]["book"][0]["author"].as_string());
  EXPECT_EQ(12.99, j1["store"]["book"][0]["price"].as_double());

  EXPECT_EQ("Roman", j1["store"]["book"][1]["category"].as_string());
  EXPECT_EQ("Im Westen nichts Neues", j1["store"]["book"][1]["title"].as_string());
  EXPECT_EQ(10.00, j1["store"]["book"][1]["price"].as_double());
}
}  // namespace dfly


================================================
FILE: src/core/json/jsonpath_grammar.y
================================================
%skeleton "lalr1.cc" // -*- C++ -*-
%require "3.5"  // fedora 32 has this one.

%defines  // %header starts from 3.8.1

%define api.namespace {dfly::json}
%define api.token.raw
%define api.token.constructor
%define api.value.type variant
%define api.parser.class {Parser}
%define parse.assert

// Added to header file before parser declaration.
%code requires {
  #include "src/core/json/path.h"
  namespace dfly {
  namespace json {
    class Driver;
  }
  }
}

// Added to cc file
%code {

#include "src/core/json/lexer_impl.h"
#include "src/core/json/driver.h"
#include <absl/strings/numbers.h>
#include "base/logging.h"

// GCC 13+ yields spurious warnings about uninitialized variant members in bison-generated code
#if !defined(__clang__) && __GNUC__ >= 13
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

#define yylex driver->lexer()->Lex

using namespace std;

static int unsafe_stoi(std::string_view s) {
  int value;
  bool success = absl::SimpleAtoi(s, &value);
  DCHECK(success);
  return value;
}
}

%parse-param { Driver *driver  }

%locations

%define parse.trace
%define parse.error verbose  // detailed
%define parse.lac full
%define api.token.prefix {TOK_}

%token
  LBRACKET "["
  RBRACKET "]"
  COLON    ":"
  LPARENT  "("
  RPARENT  ")"
  ROOT "$"
  DOT  "."
  WILDCARD "*"
  DESCENT ".."
  SINGLE_QUOTE "'"
  DOUBLE_QUOTE "\""

// Needed 0 at the end to satisfy bison 3.5.1
%token YYEOF 0
%token <std::string> UNQ_STR "unquoted string"
%token <std::string> INT "integer"

%nterm <std::string> identifier
%nterm <PathSegment> bracket_index
%nterm <std::string> single_quoted_string
%nterm <std::string> double_quoted_string
%nterm <std::string> quoted_content


%%
// Based on the following specification:
// https://danielaparker.github.io/JsonCons.Net/articles/JsonPath/Specification.html

jsonpath: ROOT { /* skip adding root */ } opt_relative_location
         | function_expr opt_relative_location

opt_relative_location:
        | relative_location

relative_location: DOT relative_path
        | DESCENT { driver->AddSegment(PathSegment{SegmentType::DESCENT}); } relative_path
        | bracket_expr

relative_path: identifier { driver->AddIdentifier($1); } opt_relative_location
        | WILDCARD { driver->AddWildcard(); } opt_relative_location
        | bracket_expr

identifier: UNQ_STR
        | INT

bracket_expr: LBRACKET bracket_index RBRACKET { driver->AddSegment($2); } opt_relative_location

bracket_index: single_quoted_string { $$ = PathSegment(SegmentType::IDENTIFIER, $1); }
              | double_quoted_string { $$ = PathSegment(SegmentType::IDENTIFIER, $1); }
              | WILDCARD { $$ = PathSegment{SegmentType::INDEX, IndexExpr::All()}; }
              | INT { int tmp_idx = unsafe_stoi($1);
                      $$ = PathSegment(SegmentType::INDEX, IndexExpr(tmp_idx, tmp_idx)); }
              | INT COLON INT { $$ = PathSegment(SegmentType::INDEX, IndexExpr::HalfOpen(
                unsafe_stoi($1), unsafe_stoi($3))); }
              | INT COLON { $$ = PathSegment(SegmentType::INDEX, IndexExpr(unsafe_stoi($1), INT_MAX)); }
              | COLON INT { $$ = PathSegment(SegmentType::INDEX, IndexExpr::HalfOpen(0, unsafe_stoi($2))); }

single_quoted_string: SINGLE_QUOTE quoted_content SINGLE_QUOTE { $$ = $2; }

double_quoted_string: DOUBLE_QUOTE quoted_content DOUBLE_QUOTE { $$ = $2; }

quoted_content: UNQ_STR { $$ = $1; }
              | INT { $$ = $1; }
              | quoted_content DOT UNQ_STR { $$ = $1 + "." + $3; }
              | quoted_content DOT INT { $$ = $1 + "." + $3; }

function_expr: UNQ_STR { driver->AddFunction($1); } LPARENT ROOT relative_location RPARENT
%%


void dfly::json::Parser::error(const location_type& l, const string& m)
{
  driver->Error(l, m);
}


================================================
FILE: src/core/json/jsonpath_lexer.lex
================================================
%top{
  // generated in the header file.
  #include "core/json/jsonpath_grammar.hh"
}


%o bison-cc-namespace="dfly.json" bison-cc-parser="Parser"
%o namespace="dfly.json"

// Generated class and main function
%o lexer="AbstractLexer" lex="Lex"

// our derived class from AbstractLexer
%o class="Lexer"

/* nodefault removes default echo rule */
%o nodefault batch
%option unicode

/* Declarations before lexer implementation.  */
%{
    #define DFLY_LEXER_CC 1
    #include "src/core/json/lexer_impl.h"
    #undef DFLY_LEXER_CC
%}


%{
  // Code run each time a pattern is matched.
%}

%%

%{
  // Code run each time lex() is called.
%}

[[:space:]]+     ; // skip white space

"$"         return Parser::make_ROOT(loc());
".."        return Parser::make_DESCENT(loc());
"."         return Parser::make_DOT(loc());
":"         return Parser::make_COLON(loc());
"["         return Parser::make_LBRACKET(loc());
"]"         return Parser::make_RBRACKET(loc());
"*"         return Parser::make_WILDCARD(loc());
"("         return Parser::make_LPARENT(loc());
")"         return Parser::make_RPARENT(loc());
"'"         return Parser::make_SINGLE_QUOTE(loc());
"\""        return Parser::make_DOUBLE_QUOTE(loc());
-?[0-9]{1,9} return Parser::make_INT(str(), loc());

[\w_\-]+    return Parser::make_UNQ_STR(str(), loc());
<<EOF>>     return Parser::make_YYEOF(loc());
.           throw Parser::syntax_error(loc(), UnknownTokenMsg());
%%

// Function definitions


================================================
FILE: src/core/json/jsonpath_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <gmock/gmock.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/json/driver.h"
#include "core/json/lexer_impl.h"
#include "core/mi_memory_resource.h"

namespace flexbuffers {
bool operator==(const Reference left, const Reference right) {
  return left.ToString() == right.ToString();
}
}  // namespace flexbuffers

namespace dfly::json {

using namespace std;

using testing::ElementsAre;

MATCHER_P(SegType, value, "") {
  return ExplainMatchResult(testing::Property(&PathSegment::type, value), arg, result_listener);
}

void PrintTo(SegmentType st, std::ostream* os) {
  *os << " segment(" << SegmentName(st) << ")";
}

class TestDriver : public Driver {
 public:
  void Error(const location& l, const std::string& msg) final {
    LOG(INFO) << "Error at " << l << ": " << msg;
  }
};

template <typename JSON> JSON ValidJson(string_view str);

template <> JsonType ValidJson<JsonType>(string_view str) {
  auto res = ParseJsonUsingShardHeap(str);
  CHECK(res) << "Failed to parse json: " << str;
  return *res;
}

template <> FlatJson ValidJson<FlatJson>(string_view str) {
  static flexbuffers::Builder fbb;
  flatbuffers::Parser parser;

  fbb.Clear();
  CHECK(parser.ParseFlexBuffer(str.data(), nullptr, &fbb));
  fbb.Finish();
  const auto& buffer = fbb.GetBuffer();
  return flexbuffers::GetRoot(buffer);
}

bool is_int(const JsonType& val) {
  return val.is<int>();
}

int to_int(const JsonType& val) {
  return val.as<int>();
}

bool is_object(const JsonType& val) {
  return val.is_object();
}

bool is_array(const JsonType& val) {
  return val.is_array();
}

int is_int(FlatJson ref) {
  return ref.IsInt();
}

int to_int(FlatJson ref) {
  return ref.AsInt32();
}

bool is_object(FlatJson ref) {
  return ref.IsMap();
}

bool is_array(FlatJson ref) {
  return ref.IsUntypedVector();
}

class ScannerTest : public ::testing::Test {
 protected:
  void SetUp() override {
    Test::SetUp();
    InitTLStatelessAllocMR(&m_);
  }

  ScannerTest() : m_(mi_heap_get_backing()) {
    driver_.lexer()->set_debug(1);
  }

  void SetInput(const std::string& str) {
    driver_.SetInput(str);
  }

  Parser::symbol_type Lex() {
    try {
      return driver_.lexer()->Lex();
    } catch (const Parser::syntax_error& e) {
      LOG(INFO) << "Caught exception: " << e.what();

      // with later bison versions we can return make_YYerror
      return Parser::make_YYEOF(e.location);
    }
  }

  MiMemoryResource m_;
  TestDriver driver_;
};

template <typename JSON> class JsonPathTest : public ScannerTest {
 protected:
  int Parse(const std::string& str) {
    driver_.ResetScanner();
    driver_.SetInput(str);

    return Parser(&driver_)();
  }
};
using MyTypes = ::testing::Types<JsonType, FlatJson>;
TYPED_TEST_SUITE(JsonPathTest, MyTypes);

#define NEXT_TOK(tok_enum)                                    \
  {                                                           \
    auto tok = Lex();                                         \
    ASSERT_EQ(Parser::token::TOK_##tok_enum, tok.type_get()); \
  }

#define NEXT_EQ(tok_enum, type, val)                          \
  {                                                           \
    auto tok = Lex();                                         \
    ASSERT_EQ(Parser::token::TOK_##tok_enum, tok.type_get()); \
    EXPECT_EQ(val, tok.value.as<type>());                     \
  }

TEST_F(ScannerTest, Basic) {
  SetInput("$.мага-зин2.book[0].*");
  NEXT_TOK(ROOT);
  NEXT_TOK(DOT);
  NEXT_EQ(UNQ_STR, string, "мага-зин2");
  NEXT_TOK(DOT);
  NEXT_EQ(UNQ_STR, string, "book");
  NEXT_TOK(LBRACKET);
  NEXT_EQ(INT, string, "0");
  NEXT_TOK(RBRACKET);
  NEXT_TOK(DOT);
  NEXT_TOK(WILDCARD);

  SetInput("|");
  NEXT_TOK(YYEOF);

  SetInput("$..*");
  NEXT_TOK(ROOT);
  NEXT_TOK(DESCENT);
  NEXT_TOK(WILDCARD);
}

TEST_F(ScannerTest, FlatToJson) {
  flatbuffers::Parser parser;
  const char* json = R"(
    {
      "foo": "bar",
      "bar": 1.5,
      "strs": ["hello", "world"]
    }
  )";
  flexbuffers::Builder fbb;
  ASSERT_TRUE(parser.ParseFlexBuffer(json, nullptr, &fbb));
  fbb.Finish();

  flexbuffers::Reference root = flexbuffers::GetRoot(fbb.GetBuffer());
  JsonType res = FromFlat(root);
  EXPECT_EQ(res, JsonType::parse(json));
  fbb.Clear();
  FromJsonType(res, &fbb);
  fbb.Finish();
  string actual;
  flexbuffers::GetRoot(fbb.GetBuffer()).ToString(false, true, actual);
  EXPECT_EQ(res, JsonType::parse(actual));
}

TYPED_TEST(JsonPathTest, Parser) {
  EXPECT_NE(0, this->Parse("foo"));
  EXPECT_NE(0, this->Parse("$foo"));
  EXPECT_NE(0, this->Parse("$|foo"));

  EXPECT_EQ(0, this->Parse("$.foo.bar"));
  Path path = this->driver_.TakePath();

  // TODO: to improve the UX with gmock/c++ magic.
  ASSERT_EQ(2, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::IDENTIFIER));
  EXPECT_THAT(path[1], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[0].identifier());
  EXPECT_EQ("bar", path[1].identifier());

  EXPECT_EQ(0, this->Parse("$.*.bar[1]"));
  path = this->driver_.TakePath();
  ASSERT_EQ(3, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::WILDCARD));
  EXPECT_THAT(path[1], SegType(SegmentType::IDENTIFIER));
  EXPECT_THAT(path[2], SegType(SegmentType::INDEX));
  EXPECT_EQ("bar", path[1].identifier());
  EXPECT_EQ(IndexExpr(1, 1), path[2].index());

  EXPECT_EQ(0, this->Parse("$.plays[*].game"));
  EXPECT_EQ(0, this->Parse("$.bar[ -1]"));
  path = this->driver_.TakePath();
  EXPECT_THAT(path[1], SegType(SegmentType::INDEX));
  EXPECT_EQ(IndexExpr(-1, -1), path[1].index());
}

TYPED_TEST(JsonPathTest, Root) {
  TypeParam json = ValidJson<TypeParam>(R"({"foo" : 1, "bar": "str" })");
  ASSERT_EQ(0, this->Parse("$"));
  Path path = this->driver_.TakePath();
  int called = 0;
  EvaluatePath(path, json, [&](optional<string_view>, const TypeParam& val) {
    ++called;
    ASSERT_TRUE(is_object(val));
    ASSERT_EQ(json, val);
  });
  ASSERT_EQ(1, called);
}

TYPED_TEST(JsonPathTest, Functions) {
  ASSERT_EQ(0, this->Parse("max($.plays[*].score)"));
  Path path = this->driver_.TakePath();
  ASSERT_EQ(4, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::FUNCTION));
  EXPECT_THAT(path[1], SegType(SegmentType::IDENTIFIER));
  EXPECT_THAT(path[2], SegType(SegmentType::INDEX));
  EXPECT_THAT(path[3], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ(IndexExpr::All(), path[2].index());

  TypeParam json = ValidJson<TypeParam>(R"({"plays": [{"score": 1}, {"score": 2}]})");
  int called = 0;
  EvaluatePath(path, json, [&](auto, const TypeParam& val) {
    ++called;
    ASSERT_TRUE(is_int(val));
    ASSERT_EQ(2, to_int(val));
  });
  ASSERT_EQ(1, called);
}

TYPED_TEST(JsonPathTest, Descent) {
  EXPECT_EQ(0, this->Parse("$..foo"));
  Path path = this->driver_.TakePath();
  ASSERT_EQ(2, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::DESCENT));
  EXPECT_THAT(path[1], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[1].identifier());

  EXPECT_EQ(0, this->Parse("$..*"));
  ASSERT_EQ(2, path.size());
  path = this->driver_.TakePath();
  EXPECT_THAT(path[0], SegType(SegmentType::DESCENT));
  EXPECT_THAT(path[1], SegType(SegmentType::WILDCARD));

  EXPECT_NE(0, this->Parse("$.."));
  EXPECT_NE(0, this->Parse("$...foo"));
}

TYPED_TEST(JsonPathTest, QuotedStrings) {
  EXPECT_EQ(0, this->Parse("$[\"foo\"]"));
  Path path = this->driver_.TakePath();

  ASSERT_EQ(1, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[0].identifier());

  EXPECT_EQ(0, this->Parse("$['foo']"));  // single quoted string
  path = this->driver_.TakePath();

  ASSERT_EQ(1, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[0].identifier());

  EXPECT_EQ(0, this->Parse("$.[\"foo\"]"));
  path = this->driver_.TakePath();

  ASSERT_EQ(1, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[0].identifier());

  EXPECT_EQ(0, this->Parse("$..[\"foo\"]"));
  path = this->driver_.TakePath();

  ASSERT_EQ(2, path.size());
  EXPECT_THAT(path[0], SegType(SegmentType::DESCENT));
  EXPECT_THAT(path[1], SegType(SegmentType::IDENTIFIER));
  EXPECT_EQ("foo", path[1].identifier());

  EXPECT_NE(0, this->Parse("\"a\""));
  EXPECT_NE(0, this->Parse("$\"a\""));
  EXPECT_NE(0, this->Parse("$.\"a\""));
  EXPECT_NE(0, this->Parse("$..\"a\""));

  // Single quoted string
  EXPECT_NE(0, this->Parse("'a'"));
  EXPECT_NE(0, this->Parse("$'a'"));
  EXPECT_NE(0, this->Parse("$.'a'"));
  EXPECT_NE(0, this->Parse("$..'a'"));
}

TYPED_TEST(JsonPathTest, Path) {
  Path path;
  TypeParam json = ValidJson<TypeParam>(R"({"v11":{ "f" : 1, "a2": [0]}, "v12": {"f": 2, "a2": [1]},
      "v13": 3
      })");
  int called = 0;

  // Empty path
  EvaluatePath(path, json, [&](optional<string_view>, const TypeParam& val) { ++called; });
  ASSERT_EQ(1, called);
  called = 0;

  path.emplace_back(SegmentType::IDENTIFIER, "v13");
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    ++called;
    ASSERT_EQ(3, to_int(val));
    EXPECT_EQ("v13", key);
  });
  ASSERT_EQ(1, called);

  path.clear();
  path.emplace_back(SegmentType::IDENTIFIER, "v11");
  path.emplace_back(SegmentType::IDENTIFIER, "f");
  called = 0;
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    ++called;
    ASSERT_EQ(1, to_int(val));
    EXPECT_EQ("f", key);
  });
  ASSERT_EQ(1, called);

  path.clear();
  path.emplace_back(SegmentType::WILDCARD);
  path.emplace_back(SegmentType::IDENTIFIER, "f");
  called = 0;
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    ++called;
    ASSERT_TRUE(is_int(val));
    EXPECT_EQ("f", key);
  });
  ASSERT_EQ(2, called);
}

TYPED_TEST(JsonPathTest, EvalDescent) {
  TypeParam json = ValidJson<TypeParam>(R"(
    {"v11":{ "f" : 1, "a2": [0]}, "v12": {"f": 2, "v21": {"f": 3, "a2": [1]}},
      "v13": { "a2" : { "b" : {"f" : 4}}}
      })");

  Path path;

  int called_arr = 0, called_obj = 0;

  path.emplace_back(SegmentType::DESCENT);
  path.emplace_back(SegmentType::IDENTIFIER, "a2");
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    EXPECT_EQ("a2", key);
    if (is_array(val)) {
      ++called_arr;
    } else if (is_object(val)) {
      ++called_obj;
    } else {
      FAIL() << "Unexpected type";
    }
  });
  ASSERT_EQ(2, called_arr);
  ASSERT_EQ(1, called_obj);

  path.pop_back();
  path.emplace_back(SegmentType::IDENTIFIER, "f");
  int called = 0;
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    ASSERT_TRUE(is_int(val));
    ASSERT_EQ("f", key);
    ++called;
  });
  ASSERT_EQ(4, called);

  json = ValidJson<TypeParam>(R"(
    {"a":[7], "inner": {"a": {"b": 2, "c": 1337}}}
  )");
  path.pop_back();
  path.emplace_back(SegmentType::IDENTIFIER, "a");

  vector<char> arr;
  auto gettype = [](const TypeParam& p) {
    if (is_array(p))
      return 'a';
    return is_object(p) ? 'o' : 'u';
  };

  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    arr.push_back(gettype(val));
    ASSERT_EQ("a", key);
  });
  ASSERT_THAT(arr, ElementsAre('a', 'o'));
}

TYPED_TEST(JsonPathTest, EvalDescent2) {
  TypeParam json = ValidJson<TypeParam>(R"(
    {"a":[{"val": 1}, {"val": 2}, {"val": 3}]}
  )");

  ASSERT_EQ(0, this->Parse("$..val"));
  Path path = this->driver_.TakePath();
  vector<int> arr;
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    arr.push_back(to_int(val));
  });
  ASSERT_THAT(arr, ElementsAre(1, 2, 3));

  int called = 0;
  ASSERT_EQ(0, this->Parse("$..*"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) { ++called; });
  EXPECT_EQ(7, called);

  called = 0;
  json = ValidJson<TypeParam>(R"(
    {
       "store": {
        "nums": [
         5
       ]
      }
    }
    )");
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) { ++called; });
  EXPECT_EQ(3, called);
}

TYPED_TEST(JsonPathTest, Wildcard) {
  ASSERT_EQ(0, this->Parse("$.arr[*]"));
  Path path = this->driver_.TakePath();
  ASSERT_EQ(2, path.size());
  EXPECT_THAT(path[1], SegType(SegmentType::INDEX));

  TypeParam json = ValidJson<TypeParam>(R"({"arr": [1, 2, 3], "i":1})");
  vector<int> arr;
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    ASSERT_FALSE(key);
    arr.push_back(to_int(val));
  });
  ASSERT_THAT(arr, ElementsAre(1, 2, 3));

  ASSERT_EQ(0, this->Parse("$.i[*]"));
  path = this->driver_.TakePath();
  arr.clear();
  EvaluatePath(path, json, [&](optional<string_view> key, const TypeParam& val) {
    arr.push_back(to_int(val));
  });
  ASSERT_THAT(arr, ElementsAre());
}

TYPED_TEST(JsonPathTest, Mutate) {
  ASSERT_EQ(0, this->Parse("$[*]"));
  Path path = this->driver_.TakePath();

  TypeParam json = ValidJson<TypeParam>(R"([1, 2, 3, 5, 6])");
  auto cb = [](optional<string_view>, JsonType* val) {
    int intval = val->as<int>();
    *val = intval + 1;
  };

  vector<int> arr;

  if constexpr (std::is_same_v<TypeParam, JsonType>) {
    MutatePath(path, cb, &json);

    for (JsonType& el : json.array_range()) {
      arr.push_back(to_int(el));
    }
  } else {
    flexbuffers::Builder fbb;
    MutatePath(path, cb, json, &fbb);
    FlatJson fj = flexbuffers::GetRoot(fbb.GetBuffer());
    auto vec = fj.AsVector();
    for (unsigned i = 0; i < vec.size(); ++i) {
      arr.push_back(to_int(vec[i]));
    }
  }
  ASSERT_THAT(arr, ElementsAre(2, 3, 4, 6, 7));

  json = ValidJson<TypeParam>(R"(
    {"a":[7], "inner": {"a": {"bool": true, "c": 42}}}
  )");
  ASSERT_EQ(0, this->Parse("$..a.*"));
  path = this->driver_.TakePath();

  auto cb2 = [](optional<string_view> key, JsonType* val) {
    if (val->is_int64() && !key) {  // array element
      *val = 42;
    }
    if (val->is_bool()) {
      *val = false;
    }
  };

  auto expected = ValidJson<JsonType>(R"({"a":[42],"inner":{"a":{"bool":false,"c":42}}})");
  if constexpr (std::is_same_v<TypeParam, JsonType>) {
    MutatePath(path, cb2, &json);

    ASSERT_EQ(expected, json);
  } else {
    flexbuffers::Builder fbb;
    MutatePath(path, cb2, json, &fbb);
    FlatJson fj = flexbuffers::GetRoot(fbb.GetBuffer());
    ASSERT_EQ(expected, FromFlat(fj));
  }
}

TYPED_TEST(JsonPathTest, MutateRecursiveDescentKey) {
  ASSERT_EQ(0, this->Parse("$..value"));
  Path path = this->driver_.TakePath();

  JsonType json = ValidJson<JsonType>(R"({"data":{"value":10,"subdata":{"value":20}}})");
  JsonType replacement = ValidJson<JsonType>(R"({"value": 30})");

  auto cb = [&](optional<string_view> key, JsonType* val) {
    if (key && key.value() == "value" && (val->is_int64() || val->is_double())) {
      *val = replacement;
    }
  };

  unsigned reported_matches = MutatePath(path, cb, &json);

  JsonType expected =
      ValidJson<JsonType>(R"({"data":{"subdata":{"value":{"value":30}},"value":{"value":30}}})");

  EXPECT_EQ(expected, json);
  EXPECT_EQ(0, reported_matches);
}

TYPED_TEST(JsonPathTest, SubRange) {
  TypeParam json = ValidJson<TypeParam>(R"({"arr": [1, 2, 3, 4, 5]})");
  ASSERT_EQ(0, this->Parse("$.arr[1:2]"));
  Path path = this->driver_.TakePath();
  ASSERT_EQ(2, path.size());
  EXPECT_THAT(path[1], SegType(SegmentType::INDEX));

  vector<int> arr;
  auto cb = [&arr](optional<string_view> key, const TypeParam& val) {
    ASSERT_FALSE(key);
    arr.push_back(to_int(val));
  };

  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(2));
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[0:2]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(1, 2));
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[2:-1]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(3, 4));
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[-2:-1]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(4));
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[-2:-2]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre());
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[:2]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(1, 2));
  arr.clear();

  ASSERT_EQ(0, this->Parse("$.arr[2:]"));
  path = this->driver_.TakePath();
  EvaluatePath(path, json, cb);
  ASSERT_THAT(arr, ElementsAre(3, 4, 5));
  arr.clear();
}

TYPED_TEST(JsonPathTest, DeleteNestedWithSameKey) {
  // Test for deleting nested elements with the same key using "$..a"
  // Corresponds to command: JSON.DEL doc1 "$..a"
  ASSERT_EQ(0, this->Parse("$..a"));
  Path path = this->driver_.TakePath();

  TypeParam json = ValidJson<TypeParam>(R"({"a": 1, "nested": {"a": 2, "b": 3}})");

  if constexpr (std::is_same_v<TypeParam, JsonType>) {
    unsigned reported_matches = DeletePath(path, &json);
    EXPECT_EQ(2, reported_matches);

    auto expected = ValidJson<JsonType>(R"({"nested": {"b": 3}})");
    EXPECT_EQ(expected, json);
  } else {
    flexbuffers::Builder fbb;
    unsigned reported_matches = DeletePath(path, json, &fbb);

    EXPECT_EQ(2, reported_matches);

    FlatJson result = flexbuffers::GetRoot(fbb.GetBuffer());
    auto expected = ValidJson<JsonType>(R"({"nested": {"b": 3}})");
    EXPECT_EQ(expected, FromFlat(result));
  }
}

TYPED_TEST(JsonPathTest, DeleteRecursiveWithKeysAndArrayValues) {
  ASSERT_EQ(0, this->Parse("$..a"));
  Path path = this->driver_.TakePath();

  TypeParam json = ValidJson<TypeParam>(
      R"({"a": {"a": 2, "b": 3}, "b": ["a", "b"], "nested": {"b": [true, "a", "b"]}})");

  if constexpr (std::is_same_v<TypeParam, JsonType>) {
    unsigned reported_matches = DeletePath(path, &json);
    EXPECT_EQ(1, reported_matches);

    auto expected = ValidJson<JsonType>(R"({"b": ["a", "b"], "nested": {"b": [true, "a", "b"]}})");
    EXPECT_EQ(expected, json);
  } else {
    flexbuffers::Builder fbb;
    unsigned reported_matches = DeletePath(path, json, &fbb);
    EXPECT_EQ(1, reported_matches);

    FlatJson result = flexbuffers::GetRoot(fbb.GetBuffer());
    auto expected = ValidJson<JsonType>(R"({"b": ["a", "b"], "nested": {"b": [true, "a", "b"]}})");
    EXPECT_EQ(expected, FromFlat(result));
  }
}

}  // namespace dfly::json


================================================
FILE: src/core/json/lexer_impl.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "src/core/json/lexer_impl.h"

#include <absl/strings/str_cat.h>

namespace dfly::json {

Lexer::Lexer() {
}

Lexer::~Lexer() {
}

std::string Lexer::UnknownTokenMsg() const {
  std::string res = absl::StrCat("Unknown token '", text(), "'");
  return res;
}

}  // namespace dfly::json


================================================
FILE: src/core/json/lexer_impl.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

// We should not include lexer.h when compiling from lexer.cc file because it already
// includes lexer.h
#ifndef DFLY_LEXER_CC
#include "src/core/json/jsonpath_lexer.h"
#endif

#include "src/core/json/jsonpath_grammar.hh"

namespace dfly {
namespace json {

class Lexer : public AbstractLexer {
 public:
  Lexer();
  ~Lexer();

  Parser::symbol_type Lex() final;

 private:
  dfly::json::location loc() {
    return location();
  }

  std::string UnknownTokenMsg() const;
};

}  // namespace json
}  // namespace dfly


================================================
FILE: src/core/json/path.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "src/core/json/path.h"

#include <absl/strings/str_cat.h>
#include <absl/types/span.h>

#include "base/logging.h"
#include "core/json/detail/flat_dfs.h"
#include "core/json/detail/jsoncons_dfs.h"
#include "core/json/jsonpath_grammar.hh"
#include "src/core/json/driver.h"
#include "src/core/overloaded.h"

using namespace std;
using nonstd::make_unexpected;

namespace dfly::json {

using detail::Dfs;
using detail::FlatDfs;

namespace {

class JsonPathDriver : public json::Driver {
 public:
  string msg;
  void Error(const json::location& l, const std::string& msg) final {
    this->msg = absl::StrCat("Error: ", msg);
  }
};

}  // namespace

const char* SegmentName(SegmentType type) {
  switch (type) {
    case SegmentType::IDENTIFIER:
      return "IDENTIFIER";
    case SegmentType::INDEX:
      return "INDEX";
    case SegmentType::WILDCARD:
      return "WILDCARD";
    case SegmentType::DESCENT:
      return "DESCENT";
    case SegmentType::FUNCTION:
      return "FUNCTION";
  }
  return nullptr;
}

IndexExpr IndexExpr::Normalize(size_t array_len) const {
  if (array_len == 0)
    return IndexExpr(1, 0);  // empty range.

  IndexExpr res = *this;
  auto wrap = [array_len](int negative) {
    unsigned positive = -negative;
    return positive > array_len ? 0 : array_len - positive;
  };

  if (res.second >= int(array_len)) {
    res.second = array_len - 1;
  } else if (res.second < 0) {
    res.second = wrap(res.second);
    DCHECK_GE(res.second, 0);
  }
  if (res.first < 0) {
    res.first = wrap(res.first);
    DCHECK_GE(res.first, 0);
  }
  return res;
}

void PathSegment::Evaluate(const JsonType& json) const {
  CHECK(type() == SegmentType::FUNCTION);
  AggFunction* func = std::get<shared_ptr<AggFunction>>(value_).get();
  CHECK(func);
  func->Apply(json);
}

void PathSegment::Evaluate(FlatJson json) const {
  CHECK(type() == SegmentType::FUNCTION);
  AggFunction* func = std::get<shared_ptr<AggFunction>>(value_).get();
  CHECK(func);
  func->Apply(json);
}

AggFunction::Result PathSegment::GetResult() const {
  CHECK(type() == SegmentType::FUNCTION);
  const auto& func = std::get<shared_ptr<AggFunction>>(value_).get();
  CHECK(func);
  return func->GetResult();
}

void EvaluatePath(const Path& path, const JsonType& json, PathCallback callback) {
  if (path.empty()) {  // root node
    callback(nullopt, json);
    return;
  }

  if (path.front().type() != SegmentType::FUNCTION) {
    Dfs::Traverse(path, json, std::move(callback));
    return;
  }

  // Handling the case of `func($.somepath)`
  // We pass our own callback to gather all the results and then call the function.
  JsonType result(JsonType::null());
  absl::Span<const PathSegment> path_tail(path.data() + 1, path.size() - 1);

  const PathSegment& func_segment = path.front();

  if (path_tail.empty()) {
    LOG(DFATAL) << "Invalid path";  // parser should not allow this.
  } else {
    Dfs::Traverse(path_tail, json, [&](auto, const JsonType& val) { func_segment.Evaluate(val); });
  }

  AggFunction::Result res = func_segment.GetResult();
  JsonType val = visit(  // Transform the result to JsonType.
      Overloaded{
          [](monostate) { return JsonType::null(); },
          [&](double d) { return JsonType(d); },

          [&](int64_t i) { return JsonType(i); },
      },
      res);
  callback(nullopt, val);
}

nonstd::expected<json::Path, string> ParsePath(string_view path) {
  if (path.size() > 8192)
    return nonstd::make_unexpected("Path too long");

  VLOG(2) << "Parsing path: " << path;

  JsonPathDriver driver;
  Parser parser(&driver);

  driver.SetInput(string(path));
  int res = parser();
  if (res != 0) {
    return nonstd::make_unexpected(driver.msg);
  }

  return driver.TakePath();
}

unsigned MutatePath(const Path& path, MutateCallback callback, JsonType* json) {
  if (path.empty()) {
    callback(nullopt, json);
    return 1;
  }

  Dfs dfs = Dfs::Mutate(path, callback, json);
  return dfs.matches();
}

unsigned DeletePath(const Path& path, JsonType* json) {
  if (path.empty()) {
    // For empty path, we cannot delete the root JSON itself within this function
    // as it would require modifying the pointer itself. Return 0 for no deletion.
    return 0;
  }

  Dfs dfs = Dfs::Delete(path, json);
  return dfs.matches();
}

// Flat json path evaluation
void EvaluatePath(const Path& path, FlatJson json, PathFlatCallback callback) {
  if (path.empty()) {  // root node
    callback(nullopt, json);
    return;
  }

  if (path.front().type() != SegmentType::FUNCTION) {
    FlatDfs::Traverse(path, json, std::move(callback));
    return;
  }

  // Handling the case of `func($.somepath)`
  // We pass our own callback to gather all the results and then call the function.
  FlatJson result;
  absl::Span<const PathSegment> path_tail(path.data() + 1, path.size() - 1);

  const PathSegment& func_segment = path.front();

  if (path_tail.empty()) {
    LOG(DFATAL) << "Invalid path";  // parser should not allow this.
  } else {
    FlatDfs::Traverse(path_tail, json, [&](auto, FlatJson val) { func_segment.Evaluate(val); });
  }
  AggFunction::Result res = func_segment.GetResult();
  flexbuffers::Builder fbb;
  FlatJson val = visit(  // Transform the result to a flexbuffer reference.
      Overloaded{
          [](monostate) { return FlatJson{}; },
          [&](double d) {
            fbb.Double(d);
            fbb.Finish();
            return flexbuffers::GetRoot(fbb.GetBuffer());
          },

          [&](int64_t i) {
            fbb.Int(i);
            fbb.Finish();
            return flexbuffers::GetRoot(fbb.GetBuffer());
          },
      },
      res);

  callback(nullopt, val);
}

JsonType FromFlat(FlatJson src) {
  if (src.IsNull()) {
    return JsonType::null();
  }

  if (src.IsBool()) {
    return JsonType(src.AsBool());
  }

  if (src.IsInt()) {
    return JsonType(src.AsInt64());
  }

  if (src.IsFloat()) {
    return JsonType(src.AsDouble());
  }
  if (src.IsString()) {
    flexbuffers::String str = src.AsString();
    return JsonType(string_view{str.c_str(), str.size()});
  }

  CHECK(src.IsVector());
  auto vec = src.AsVector();
  JsonType js =
      src.IsMap() ? JsonType{jsoncons::json_object_arg} : JsonType{jsoncons::json_array_arg};
  auto keys = src.AsMap().Keys();
  for (unsigned i = 0; i < vec.size(); ++i) {
    JsonType value = FromFlat(vec[i]);
    if (src.IsMap()) {
      js[keys[i].AsKey()] = std::move(value);
    } else {
      js.push_back(std::move(value));
    }
  }
  return js;
}

void FromJsonType(const JsonType& src, flexbuffers::Builder* fbb) {
  if (src.is_null()) {
    return fbb->Null();
  }

  if (src.is_bool()) {
    return fbb->Bool(src.as_bool());
  }

  if (src.is_int64()) {
    return fbb->Int(src.as<int64_t>());
  }

  if (src.is_double()) {
    return fbb->Double(src.as_double());
  }

  if (src.is_string()) {
    string_view sv = src.as_string_view();
    fbb->String(sv.data(), sv.size());
    return;
  }

  if (src.is_object()) {
    auto range = src.object_range();
    size_t start = fbb->StartMap();
    for (auto it = range.cbegin(); it != range.cend(); ++it) {
      fbb->Key(it->key().c_str(), it->key().size());
      FromJsonType(it->value(), fbb);
    }
    fbb->EndMap(start);
    return;
  }

  CHECK(src.is_array());
  auto range = src.array_range();
  size_t start = fbb->StartVector();
  for (auto it = range.cbegin(); it != range.cend(); ++it) {
    FromJsonType(*it, fbb);
  }
  fbb->EndVector(start, false, false);
}

unsigned MutatePath(const Path& path, MutateCallback callback, FlatJson json,
                    flexbuffers::Builder* fbb) {
  JsonType mut_json = FromFlat(json);
  unsigned res = MutatePath(path, std::move(callback), &mut_json);

  // Populate the output builder 'fbb' with the resulting JSON state
  // (mutated or original if res == 0) and finalize it.
  // The builder MUST be finished before returning so that the caller
  // can safely access the resulting flatbuffer data (e.g., via GetBuffer()).
  // Skipping Finish() would leave the builder in an invalid, unusable state.
  FromJsonType(mut_json, fbb);  // Always convert (changed or not) JSON
  fbb->Finish();                // Always finish the builder

  // Return the number of actual mutations that occurred.
  return res;
}

unsigned DeletePath(const Path& path, FlatJson json, flexbuffers::Builder* fbb) {
  JsonType mut_json = FromFlat(json);
  unsigned res = DeletePath(path, &mut_json);

  FromJsonType(mut_json, fbb);
  fbb->Finish();
  return res;
}

}  // namespace dfly::json


================================================
FILE: src/core/json/path.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/functional/function_ref.h>

#include <nonstd/expected.hpp>
#include <string>
#include <variant>
#include <vector>

#include "core/flatbuffers.h"
#include "core/json/json_object.h"

namespace dfly::json {

enum class SegmentType {
  IDENTIFIER = 1,  // $.identifier
  INDEX = 2,       // $.array[index_expr]
  WILDCARD = 3,    // $.*
  DESCENT = 4,     // $..identifier
  FUNCTION = 5,    // max($.prices[*])
};

const char* SegmentName(SegmentType type);

class AggFunction {
 public:
  using Result = std::variant<std::monostate, double, int64_t>;
  virtual ~AggFunction() {
  }

  void Apply(const JsonType& src) {
    if (valid_ != 0)
      valid_ = ApplyImpl(src);
  }

  void Apply(FlatJson src) {
    if (valid_ != 0)
      valid_ = ApplyImpl(src);
  }

  // returns null if Apply was not called or ApplyImpl failed.
  Result GetResult() const {
    return valid_ == 1 ? GetResultImpl() : Result{};
  }

 protected:
  virtual bool ApplyImpl(const JsonType& src) = 0;
  virtual bool ApplyImpl(FlatJson src) = 0;
  virtual Result GetResultImpl() const = 0;

  int valid_ = -1;
};

// Bracket index representation, IndexExpr is a closed range, i.e. both ends are inclusive.
// Single index is: <I, I>, wildcard: <0, INT_MAX>,
// [begin:end): <begin, end - 1>
// IndexExpr is 0-based, with negative indices referring to the array size of the applied object.
struct IndexExpr : public std::pair<int, int> {
  bool Empty() const {
    return first > second;
  }

  static IndexExpr All() {
    return IndexExpr{0, INT_MAX};
  }

  using pair::pair;

  // Returns subrange with length `array_len`.
  IndexExpr Normalize(size_t array_len) const;

  // Returns IndexExpr representing [left_closed, right_open) range.
  static IndexExpr HalfOpen(int left_closed, int right_open) {
    return IndexExpr(left_closed, right_open - 1);
  }
};

class PathSegment {
 public:
  PathSegment() : PathSegment(SegmentType::IDENTIFIER) {
  }

  PathSegment(SegmentType type, std::string identifier = std::string())
      : type_(type), value_(std::move(identifier)) {
  }

  PathSegment(SegmentType type, IndexExpr index) : type_(type), value_(index) {
  }

  explicit PathSegment(std::shared_ptr<AggFunction> func)
      : type_(SegmentType::FUNCTION), value_(std::move(func)) {
  }

  SegmentType type() const {
    return type_;
  }

  const std::string& identifier() const {
    return std::get<std::string>(value_);
  }

  IndexExpr index() const {
    return std::get<IndexExpr>(value_);
  }

  void Evaluate(const JsonType& json) const;
  void Evaluate(FlatJson json) const;
  AggFunction::Result GetResult() const;

 private:
  SegmentType type_;

  // shared_ptr to preserve copy semantics.
  std::variant<std::string, IndexExpr, std::shared_ptr<AggFunction>> value_;
};

using Path = std::vector<PathSegment>;

// Passes the key name for object fields or nullopt for array elements.
// The second argument is a json value of either object fields or array elements.
using PathCallback = absl::FunctionRef<void(std::optional<std::string_view>, const JsonType&)>;
using PathFlatCallback = absl::FunctionRef<void(std::optional<std::string_view>, FlatJson)>;

// Returns true if the entry should be deleted, false otherwise.
using MutateCallback = absl::FunctionRef<void(std::optional<std::string_view>, JsonType*)>;

void EvaluatePath(const Path& path, const JsonType& json, PathCallback callback);

// Same as above but for flatbuffers.
void EvaluatePath(const Path& path, FlatJson json, PathFlatCallback callback);

// returns number of matches found with the given path.
unsigned MutatePath(const Path& path, MutateCallback callback, JsonType* json);
unsigned MutatePath(const Path& path, MutateCallback callback, FlatJson json,
                    flexbuffers::Builder* fbb);

// Simplified deletion operation without callback - more efficient for JSON.DEL operations
unsigned DeletePath(const Path& path, JsonType* json);
unsigned DeletePath(const Path& path, FlatJson json, flexbuffers::Builder* fbb);

// utility function to parse a jsonpath. Returns an error message if a parse error was
// encountered.
nonstd::expected<Path, std::string> ParsePath(std::string_view path);

// Transforms FlatJson to JsonType.
JsonType FromFlat(FlatJson src);

// Transforms JsonType to a buffer using flexbuffers::Builder.
// Does not call flexbuffers::Builder::Finish.
void FromJsonType(const JsonType& src, flexbuffers::Builder* fbb);

}  // namespace dfly::json


================================================
FILE: src/core/linear_search_map.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/inlined_vector.h>

#include "base/logging.h"

namespace dfly {

/* LinearSearchMap is a small key-value map implemented using an inlined vector of (key, value)
   pairs. It performs key lookup using linear search (O(n)) and is optimized for small maps
   (typically <32 keys).

   Compared to a hash map, it avoids hashing overhead and has better memory locality and cache
   performance. Use it when:
    - The number of keys is small
    - You care about minimal memory usage
    - Fast iteration is more important than fast lookup

   NOTE:
     - Insert() and Emplace() do NOT check for duplicate keys at runtime.
       Inserting a duplicate key results in undefined behavior.
     - You must ensure keys are unique when inserting.
     - This syntax is used to maintain compatibility with absl::InlinedVector. */
template <typename Key, typename Value, size_t N = 8>
class LinearSearchMap : public absl::InlinedVector<std::pair<Key, Value>, N> {
 private:
  using Base = absl::InlinedVector<std::pair<Key, Value>, N>;

 public:
  using Base::operator[];
  using Base::erase;

  using iterator = typename Base::iterator;
  using const_iterator = typename Base::const_iterator;

  // Does not check if key already exists.
  // If key already exists - undefined behavior.
  void insert(Key key, Value value);
  template <typename... Args> void emplace(Key key, Args&&... args);

  void erase(const Key& key);

  bool contains(const Key& key) const;

  iterator find(const Key& key);
  const_iterator find(const Key& key) const;
  size_t find_index(const Key& key) const;

  Value& operator[](const Key& key);
  const Value& operator[](const Key& key) const;
};

// Implementation
/******************************************************************/
template <typename Key, typename Value, size_t N>
void LinearSearchMap<Key, Value, N>::insert(Key key, Value value) {
  DCHECK(!contains(key)) << "Key already exists: " << key;
  this->emplace_back(std::move(key), std::move(value));
}

template <typename Key, typename Value, size_t N>
template <typename... Args>
void LinearSearchMap<Key, Value, N>::emplace(Key key, Args&&... args) {
  DCHECK(!contains(key)) << "Key already exists: " << key;
  this->emplace_back(std::piecewise_construct, std::forward_as_tuple(std::move(key)),
                     std::forward_as_tuple(std::forward<Args>(args)...));
}

template <typename Key, typename Value, size_t N>
void LinearSearchMap<Key, Value, N>::erase(const Key& key) {
  erase(find(key));
}

template <typename Key, typename Value, size_t N>
bool LinearSearchMap<Key, Value, N>::contains(const Key& key) const {
  return find(key) != this->end();
}

template <typename Key, typename Value, size_t N>
typename LinearSearchMap<Key, Value, N>::iterator LinearSearchMap<Key, Value, N>::find(
    const Key& key) {
  return std::find_if(this->begin(), this->end(),
                      [&key](const auto& pair) { return pair.first == key; });
}

template <typename Key, typename Value, size_t N>
typename LinearSearchMap<Key, Value, N>::const_iterator LinearSearchMap<Key, Value, N>::find(
    const Key& key) const {
  return std::find_if(this->begin(), this->end(),
                      [&key](const auto& pair) { return pair.first == key; });
}

template <typename Key, typename Value, size_t N>
size_t LinearSearchMap<Key, Value, N>::find_index(const Key& key) const {
  return std::distance(this->begin(), find(key));
}

template <typename Key, typename Value, size_t N>
Value& LinearSearchMap<Key, Value, N>::operator[](const Key& key) {
  return find(key)->second;
}

template <typename Key, typename Value, size_t N>
const Value& LinearSearchMap<Key, Value, N>::operator[](const Key& key) const {
  return find(key)->second;
}

}  // namespace dfly


================================================
FILE: src/core/linear_search_map_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/linear_search_map.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <utility>

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly {

class LinearSearchMapTest : public testing::Test {
 protected:
};

TEST_F(LinearSearchMapTest, Insert) {
  LinearSearchMap<int, double> map;

  for (int i = 0; i < 100; ++i) {
    map.insert(i, i * 1.1);
  }

  for (int i = 199; i >= 100; --i) {
    map.insert(i, i * 12.1);
  }

  for (int i = 0; i < 200; ++i) {
    auto it = map.find(i);
    EXPECT_NE(it, map.end());
    EXPECT_TRUE(map.contains(i));

    EXPECT_EQ(it->second, (i < 100) ? i * 1.1 : i * 12.1);
  }
}

TEST_F(LinearSearchMapTest, Emplace) {
  struct Value {
    Value(double value_, std::string str_) : value(value_), str(std::move(str_)) {
    }

    double value;
    std::string str;
  };

  LinearSearchMap<int, Value> map;

  for (int i = 0; i < 100; ++i) {
    map.emplace(i, i * 1.1, "value_" + std::to_string(i));
  }

  for (int i = 199; i >= 100; --i) {
    map.emplace(i, i * 12.1, "value_" + std::to_string(i));
  }

  for (int i = 0; i < 200; ++i) {
    auto it = map.find(i);
    EXPECT_NE(it, map.end());
    EXPECT_TRUE(map.contains(i));

    EXPECT_EQ(it->second.value, (i < 100) ? i * 1.1 : i * 12.1);
    EXPECT_EQ(it->second.str, "value_" + std::to_string(i));
  }
}

TEST_F(LinearSearchMapTest, EraseSimple) {
  LinearSearchMap<int, double> map;

  for (int i = 0; i < 200; ++i) {
    map.insert(i, i * 1.1);
  }

  // Erase by iterator
  for (int i = 0; i < 100; ++i) {
    auto it = map.find(i);
    EXPECT_NE(it, map.end());
    EXPECT_TRUE(map.contains(i));

    map.erase(it);
    EXPECT_FALSE(map.contains(i));
  }

  // Erase by key
  for (int i = 100; i < 200; ++i) {
    EXPECT_TRUE(map.contains(i));
    map.erase(i);
    EXPECT_FALSE(map.contains(i));
  }

  EXPECT_TRUE(map.empty());
}

TEST_F(LinearSearchMapTest, Erase) {
  std::unordered_map<int, double> expected_map;
  LinearSearchMap<int, double> map;

  // First wave insert / erase
  for (int i = 0; i < 300; i++) {
    double value = i * 1.1;
    map.insert(i, value);
    expected_map[i] = value;
  }

  for (int i = 0; i < 300; i += 3) {
    EXPECT_TRUE(map.contains(i));
    map.erase(i);
    expected_map.erase(i);
    EXPECT_FALSE(map.contains(i));
  }

  // Second wave insert / erase
  for (int i = 300; i < 600; i++) {
    double value = i * 2.2;
    map.insert(i, value);
    expected_map[i] = value;
  }

  for (int i = 300; i < 600; i += 5) {
    EXPECT_TRUE(map.contains(i));
    map.erase(i);
    expected_map.erase(i);
    EXPECT_FALSE(map.contains(i));
  }

  // Erase all remaining elements
  while (!expected_map.empty()) {
    size_t index = 0;
    const size_t step = 7;

    for (auto it = expected_map.begin(); it != expected_map.end(); ++index) {
      auto [i, value] = *it;
      EXPECT_TRUE(map.contains(i));
      EXPECT_EQ(map.find(i)->second, value);

      if (index % step == 0) {
        map.erase(i);
        it = expected_map.erase(it);
      } else {
        ++it;
      }
    }
  }

  EXPECT_TRUE(map.empty());
}

TEST_F(LinearSearchMapTest, BasicFunctionality) {
  LinearSearchMap<double, double> map;

  for (double i = 0; i < 100; ++i) {
    map.insert(i, i * 1.1);
  }

  EXPECT_EQ(map.size(), 100);

  // Using indexes
  for (size_t i = 0; i < map.size(); ++i) {
    auto [key, value] = map[i];
    EXPECT_EQ(value, key * 1.1);
  }

  // Get index by key
  for (double i = 0; i < 100; ++i) {
    size_t index = map.find_index(i);
    auto [key, value] = map[index];
    EXPECT_EQ(value, key * 1.1);
  }

  // Get value by key
  for (double i = 0; i < 100; ++i) {
    EXPECT_EQ(map[i], i * 1.1);
  }

  // Iterate through the map
  for (const auto& [key, value] : map) {
    EXPECT_EQ(value, key * 1.1);
  }
}

}  // namespace dfly


================================================
FILE: src/core/listpack_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/detail/listpack.h"

#include <gmock/gmock.h>
#include <mimalloc.h>

#include "base/gtest.h"
#include "base/logging.h"

extern "C" {
#include "redis/listpack.h"
#include "redis/zmalloc.h"
}

namespace dfly {
namespace detail {

using namespace std;
using namespace testing;

class ListPackTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    init_zmalloc_threadlocal(mi_heap_get_backing());
  }

  void SetUp() override {
    ptr_ = lpNew(0);
    lp_ = ListPack(ptr_);
  }

  void TearDown() override {
    ptr_ = lp_.GetPointer();
    lpFree(ptr_);
    // Ensure there are no memory leaks after every test
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
  }

  unsigned Remove(string_view elem, unsigned count, QList::Where where) {
    return lp_.Remove(CollectionEntry{elem.data(), elem.size()}, count, where);
  }

  ListPack lp_;
  uint8_t* ptr_ = nullptr;
};

TEST_F(ListPackTest, FindNotFound) {
  lp_.Push("first", QList::TAIL);
  lp_.Push("third", QList::TAIL);

  EXPECT_EQ(lp_.Find("second"), nullptr);
}

TEST_F(ListPackTest, RemoveIntegerFromHead) {
  lp_.Push("1", QList::TAIL);
  lp_.Push("2", QList::TAIL);
  lp_.Push("1", QList::TAIL);
  lp_.Push("3", QList::TAIL);

  // Remove integer value "1" from head
  unsigned removed = Remove("1", 0, QList::HEAD);
  EXPECT_EQ(2, removed);
  EXPECT_EQ(2, lp_.Size());

  EXPECT_EQ("2", lp_.At(0));
  EXPECT_EQ("3", lp_.At(1));
}

TEST_F(ListPackTest, RemoveFromTailAll) {
  // List: a, b, a, c, a
  lp_.Push("a", QList::TAIL);
  lp_.Push("b", QList::TAIL);
  lp_.Push("a", QList::TAIL);
  lp_.Push("c", QList::TAIL);
  lp_.Push("a", QList::TAIL);

  // Remove all "a" from tail direction
  unsigned removed = Remove("a", 0, QList::TAIL);
  EXPECT_EQ(3, removed);
  EXPECT_EQ(2, lp_.Size());

  // Remaining elements: b, c
  EXPECT_EQ("b", lp_.At(0));
  EXPECT_EQ("c", lp_.At(1));
}

TEST_F(ListPackTest, RemoveFromTailWithCount) {
  // List: a, b, a, c, a
  lp_.Push("a", QList::TAIL);
  lp_.Push("b", QList::TAIL);
  lp_.Push("a", QList::TAIL);
  lp_.Push("c", QList::TAIL);
  lp_.Push("a", QList::TAIL);

  // Remove only 2 occurrences of "a" from tail (removes indices 4 and 2)
  unsigned removed = Remove("a", 2, QList::TAIL);
  EXPECT_EQ(2, removed);
  EXPECT_EQ(3, lp_.Size());

  // Remaining elements: a, b, c
  EXPECT_EQ("a", lp_.At(0));
  EXPECT_EQ("b", lp_.At(1));
  EXPECT_EQ("c", lp_.At(2));
}

// Test removing consecutive tail elements - verifies lpLast is called correctly
// after deleting the tail element to continue finding remaining matches.
TEST_F(ListPackTest, RemoveFromTailConsecutive) {
  // List: x, target, target, target - three consecutive at tail
  lp_.Push("x", QList::TAIL);
  lp_.Push("target", QList::TAIL);
  lp_.Push("target", QList::TAIL);
  lp_.Push("target", QList::TAIL);

  unsigned removed = Remove("target", 0, QList::TAIL);
  EXPECT_EQ(3, removed);
  EXPECT_EQ(1, lp_.Size());
  EXPECT_EQ("x", lp_.At(0));
}

// Test removing the head element while iterating from TAIL direction.
// After checking all elements from tail to head and deleting the head,
// lpDelete returns pointer to element after head, and lpPrev on that returns nullptr,
// correctly ending iteration.
TEST_F(ListPackTest, RemoveFromTailDeletesHead) {
  // List: a, b, c - removing "a" (at head) while iterating from tail
  lp_.Push("a", QList::TAIL);
  lp_.Push("b", QList::TAIL);
  lp_.Push("c", QList::TAIL);

  unsigned removed = Remove("a", 0, QList::TAIL);
  EXPECT_EQ(1, removed);
  EXPECT_EQ(2, lp_.Size());

  EXPECT_EQ("b", lp_.At(0));
  EXPECT_EQ("c", lp_.At(1));
}

TEST_F(ListPackTest, ReplaceAtIndex) {
  lp_.Push("first", QList::TAIL);
  lp_.Push("second", QList::TAIL);
  lp_.Push("third", QList::TAIL);

  // Replace element at index 1
  uint8_t* pos = lp_.Seek(1);
  EXPECT_NE(pos, nullptr);
  lp_.Replace(pos, "replaced");
  EXPECT_EQ(3, lp_.Size());

  EXPECT_EQ("first", lp_.At(0));
  EXPECT_EQ("replaced", lp_.At(1));
  EXPECT_EQ("third", lp_.At(2));
}

TEST_F(ListPackTest, ReplaceAtNegativeIndex) {
  lp_.Push("first", QList::TAIL);
  lp_.Push("second", QList::TAIL);
  lp_.Push("third", QList::TAIL);

  // Replace element at index -1 (last element)
  uint8_t* pos = lp_.Seek(-1);
  EXPECT_NE(pos, nullptr);
  lp_.Replace(pos, "new_last");
  EXPECT_EQ(3, lp_.Size());

  EXPECT_EQ("first", lp_.At(0));
  EXPECT_EQ("second", lp_.At(1));
  EXPECT_EQ("new_last", lp_.At(2));
}

TEST_F(ListPackTest, ReplaceOutOfBounds) {
  lp_.Push("first", QList::TAIL);
  lp_.Push("second", QList::TAIL);

  // Replace at out-of-bounds index should return false
  uint8_t* pos = lp_.Seek(5);
  EXPECT_EQ(pos, nullptr);
  pos = lp_.Seek(-5);
  EXPECT_EQ(pos, nullptr);
}

TEST_F(ListPackTest, ReplaceWithLargerString) {
  lp_.Push("a", QList::TAIL);
  lp_.Push("b", QList::TAIL);

  // Replace with a much larger string
  string large(500, 'x');
  uint8_t* pos = lp_.Seek(0);
  EXPECT_NE(pos, nullptr);
  lp_.Replace(pos, large);
  EXPECT_EQ(2, lp_.Size());

  EXPECT_EQ(large, lp_.At(0));
  EXPECT_EQ("b", lp_.At(1));
}

TEST_F(ListPackTest, ReplaceWithEmptyString) {
  lp_.Push("first", QList::TAIL);
  lp_.Push("second", QList::TAIL);

  // Replace with empty string
  uint8_t* pos = lp_.Seek(0);
  EXPECT_NE(pos, nullptr);
  lp_.Replace(pos, "");
  EXPECT_EQ(2, lp_.Size());

  EXPECT_EQ("", lp_.At(0));
  EXPECT_EQ("second", lp_.At(1));
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/memory_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

// Disable mimalloc internal debug assertions for accessing internal structures
#define MI_DEBUG 0

#include <mimalloc.h>
#include <mimalloc/internal.h>
#include <mimalloc/types.h>

#include <thread>
#include <vector>

#include "base/gtest.h"
#include "base/logging.h"

// Stub out internal mimalloc assertions that aren't exported
// These are used by inline functions in internal.h
[[noreturn]] void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line,
                                  const char* func) noexcept {
  fprintf(stderr, "mimalloc assertion failed: %s at %s:%u in %s\n", assertion, fname, line, func);
  abort();
}

namespace dfly {

class MiHeapTest : public ::testing::Test {
 protected:
  MiHeapTest() {
  }
};

TEST_F(MiHeapTest, Basic) {
  mi_heap_t* heap = mi_heap_get_default();
  void* ptr = mi_heap_malloc_aligned(heap, 1024 /* size*/, 64 /* alignment*/);
  ASSERT_TRUE(ptr != nullptr);

  EXPECT_EQ(heap->tld->stats.malloc_normal.current, 1024);
  EXPECT_EQ(heap->tld->stats.malloc_huge.current, 0);

  void* ptr2 = mi_heap_malloc_aligned(heap, 1024 * 1024 /* size*/, 64 /* alignment*/);

  EXPECT_EQ(heap->tld->stats.malloc_normal.current, 1024);
  EXPECT_GE(heap->tld->stats.malloc_huge.current, 1024 * 1024);

  mi_free(ptr);

  EXPECT_EQ(heap->tld->stats.malloc_normal.current, 0);
  EXPECT_GE(heap->tld->stats.malloc_huge.current, 1024 * 1024);

  mi_free(ptr2);
  EXPECT_EQ(heap->tld->stats.malloc_huge.current, 0);
}

TEST_F(MiHeapTest, Threaded) {
  mi_heap_t* heap = mi_heap_get_default();

  void* ptr = mi_heap_malloc_aligned(heap, 1024 /* size*/, 64 /* alignment*/);
  ASSERT_TRUE(ptr != nullptr);

  // adding ptr to heap->thread_delayed_free
  std::thread t2([ptr]() {
    mi_free(ptr);
    // thread local stats are updated.
    EXPECT_EQ(mi_heap_get_default()->tld->stats.malloc_normal.current, -1024);
  });

  t2.join();
  EXPECT_EQ(heap->tld->stats.malloc_normal.current, 1024);
  EXPECT_EQ(heap->generic_collect_count, 0);

  // Force many mallocs to trigger delayed blocks collection.
  for (unsigned i = 0; i < 200; ++i) {
    ptr = mi_malloc(16 * i);
    mi_free(ptr);
  }

  // delayed collections was triggered
  EXPECT_GE(heap->generic_collect_count, 1);

  // mi_malloc does not track malloc back sizes back to the original heap threads.
  EXPECT_EQ(heap->tld->stats.malloc_normal.current, 1024);
}

// Verify that xthread_free lists are processed correctly during force collection
// on full pages.
TEST_F(MiHeapTest, FullPageThreadFreeInternal) {
  mi_heap_t* heap = mi_heap_get_default();
  constexpr size_t block_size = 64;
  std::vector<void*> allocations;

  // Allocate blocks until page is full
  void* first_ptr = mi_heap_malloc(heap, block_size);
  ASSERT_TRUE(first_ptr != nullptr);
  allocations.push_back(first_ptr);

  mi_page_t* page = _mi_ptr_page(first_ptr);
  ASSERT_TRUE(page != nullptr);

  while (page->used < page->capacity) {
    void* ptr = mi_heap_malloc(heap, block_size);
    ASSERT_TRUE(ptr != nullptr);
    if (_mi_ptr_page(ptr) == page) {
      allocations.push_back(ptr);
    } else {
      mi_free(ptr);
      break;
    }
  }

  EXPECT_EQ(page->used, page->capacity);

  // Free one block from another thread
  void* cross_thread_ptr = allocations.back();
  allocations.pop_back();

  std::thread t([cross_thread_ptr]() { mi_free(cross_thread_ptr); });
  t.join();

  EXPECT_EQ(page->used, page->capacity);
  EXPECT_NE(mi_atomic_load_relaxed(&page->xthread_free), 0);

  // Force collection should process xthread_free
  mi_heap_collect(heap, true);

  EXPECT_LT(page->used, page->capacity);
  EXPECT_EQ(mi_atomic_load_relaxed(&page->xthread_free), 0);

  // New allocation should reuse the freed block
  void* new_ptr = mi_heap_malloc(heap, block_size);
  EXPECT_EQ(_mi_ptr_page(new_ptr), page);

  // Clean up
  mi_free(new_ptr);
  for (void* ptr : allocations) {
    mi_free(ptr);
  }
}

// Verify that MI_BIN_FULL pages are cleared during collection.
TEST_F(MiHeapTest, FullBinQueueCollection) {
  mi_heap_t* heap = mi_heap_get_default();
  constexpr size_t block_size = 64;

  auto count_xthread_free = [&heap]() {
    size_t count = 0;
    for (size_t i = 0; i <= MI_BIN_FULL; ++i) {
      for (mi_page_t* page = heap->pages[i].first; page != nullptr; page = page->next) {
        if (mi_atomic_load_relaxed(&page->xthread_free) != 0) {
          count++;
        }
      }
    }
    return count;
  };

  // Allocate and cross-thread free to populate xthread_free lists
  std::vector<void*> allocations(2000);
  for (size_t i = 0; i < allocations.size(); ++i) {
    allocations[i] = mi_heap_malloc(heap, block_size);
    ASSERT_TRUE(allocations[i] != nullptr);
  }

  std::thread t([&allocations]() {
    for (size_t i = 0; i < allocations.size() / 2; ++i) {
      mi_free(allocations[i]);
    }
  });
  t.join();

  size_t xthread_before = count_xthread_free();
  EXPECT_GT(xthread_before, 0);

  mi_heap_collect(heap, true);

  EXPECT_EQ(count_xthread_free(), 0) << "All xthread_free lists should be cleared";

  // Clean up
  for (size_t i = allocations.size() / 2; i < allocations.size(); ++i) {
    mi_free(allocations[i]);
  }
}

// Test that verifies memory accounting and reclamation behavior when allocations are made in
// one thread and freed in another after the allocating thread exits. This exercises the
// MI_ABANDON / cross-thread free handling where mimalloc should properly reclaim pages from
// the abandoned thread heap once collection runs.
//
// This test uses the default heap and verifies reclamation by checking its statistics.
TEST_F(MiHeapTest, AbandonedHeapReclamation) {
  constexpr size_t block_size = 128;
  constexpr size_t num_blocks = 2000;
  std::vector<void*> allocations(num_blocks);

  mi_heap_t* main_heap = mi_heap_get_default();

  // Allocate memory in a separate thread, then exit the thread
  std::thread allocator_thread([&]() {
    for (size_t i = 0; i < num_blocks; ++i) {
      allocations[i] = mi_malloc(block_size);
      ASSERT_TRUE(allocations[i] != nullptr);
    }
  });

  allocator_thread.join();

  // Free all allocations from the main thread (cross-thread free to abandoned heap)
  for (void* ptr : allocations) {
    mi_free(ptr);
  }

  // Force collection to reclaim abandoned segments
  mi_collect(true);

  // Verify memory and abandoned pages are reclaimed
  EXPECT_EQ(main_heap->tld->stats.malloc_normal.current, 0);
}

}  // namespace dfly


================================================
FILE: src/core/mi_memory_resource.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/mi_memory_resource.h"

#include <sys/mman.h>

#include "base/logging.h"

namespace dfly {

using namespace std;

void* MiMemoryResource::do_allocate(size_t size, size_t align) {
  DCHECK(align);

  void* res = mi_heap_malloc_aligned(heap_, size, align);

  if (!res)
    throw bad_alloc{};

  // It seems that mimalloc has a bug with larger allocations that causes
  // mi_heap_contains_block to lie. See https://github.com/microsoft/mimalloc/issues/587
  // For now I avoid the check by checking the size. mi_usable_size works though.
  DCHECK(size > 33554400 || mi_heap_contains_block(heap_, res));
  size_t delta = mi_usable_size(res);

  used_ += delta;
  DVLOG(1) << "do_allocate: " << heap_ << " " << delta;

  return res;
}

void MiMemoryResource::do_deallocate(void* ptr, size_t size, size_t align) {
  DCHECK(size > 33554400 || mi_heap_contains_block(heap_, ptr));

  size_t usable = mi_usable_size(ptr);

  DVLOG(1) << "do_deallocate: " << heap_ << " " << usable;

  DCHECK_GE(used_, size);
  used_ -= usable;
  mi_free_size_aligned(ptr, size, align);
}

}  // namespace dfly


================================================
FILE: src/core/mi_memory_resource.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <mimalloc.h>

#include "base/pmr/memory_resource.h"

namespace dfly {

// Per thread memory resource that uses mimalloc.
class MiMemoryResource : public PMR_NS::memory_resource {
 public:
  explicit MiMemoryResource(mi_heap_t* heap) : heap_(heap) {
  }

  mi_heap_t* heap() {
    return heap_;
  }

  size_t used() const {
    return used_;
  }

 private:
  void* do_allocate(std::size_t size, std::size_t align) final;

  void do_deallocate(void* ptr, std::size_t size, std::size_t align) final;

  bool do_is_equal(const PMR_NS::memory_resource& o) const noexcept {
    return this == &o;
  }

  mi_heap_t* heap_;
  size_t used_ = 0;
};

}  // namespace dfly


================================================
FILE: src/core/oah_entry.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/oah_entry.h"

#include "base/hash.h"
#include "base/logging.h"

namespace dfly {

OAHEntry::OAHEntry(std::string_view key, uint32_t expiry) {
  uint32_t key_size = key.size();

  uint32_t expiry_size = (expiry != UINT32_MAX) * sizeof(expiry);

  uint32_t key_len_field_size = key_size <= std::numeric_limits<uint8_t>::max() ? 1 : 4;

  auto size = key_len_field_size + key_size + expiry_size;

  auto* expiry_pos = (char*)zmalloc(size);
  data_ = reinterpret_cast<uint64_t>(expiry_pos);
  if (expiry_size) {
    SetExpiryBit(true);
    std::memcpy(expiry_pos, &expiry, sizeof(expiry));
  }

  auto* key_size_pos = expiry_pos + expiry_size;
  if (key_len_field_size == 1) {
    SetSsoBit();
    uint8_t sso_key_size = key_size;
    std::memcpy(key_size_pos, &sso_key_size, key_len_field_size);
  } else {
    std::memcpy(key_size_pos, &key_size, key_len_field_size);
  }

  auto* key_pos = key_size_pos + key_len_field_size;
  std::memcpy(key_pos, key.data(), key_size);
}

// returns the expiry time of the current entry or UINT32_MAX if no expiry is set.
uint32_t OAHEntry::GetExpiry() const {
  std::uint32_t res = UINT32_MAX;
  if (HasExpiry()) {
    assert(!IsVector());
    std::memcpy(&res, Raw(), sizeof(res));
  }
  return res;
}

bool OAHEntry::CheckNoCollisions(const uint64_t ext_hash) {
  auto stored_hash = GetHash();
  return ((stored_hash != ext_hash) & (stored_hash != 0)) | (Empty());
}

void OAHEntry::SetExtHash(uint64_t ext_hash) {
  assert(data_);
  assert(!IsVector());
  data_ = (data_ & ~kExtHashShiftedMask) | (ext_hash << kExtHashShift);
}

void OAHEntry::SetExpiry(uint32_t at_sec) {
  assert(!IsVector());
  if (HasExpiry()) {
    auto* expiry_pos = Raw();
    std::memcpy(expiry_pos, &at_sec, sizeof(at_sec));
  } else {
    *this = OAHEntry(Key(), at_sec);
  }
}

void OAHEntry::ExpireIfNeeded(uint32_t time_now, uint32_t* set_size, size_t* alloc_used) {
  assert(!IsVector());
  if (GetExpiry() <= time_now) {
    *alloc_used -= AllocSize();
    Clear();
    --*set_size;
  }
}

// TODO refactor, because it's inefficient
size_t OAHEntry::Insert(OAHEntry&& e) {
  if (Empty()) {
    *this = std::move(e);
    return 0;
  } else if (!IsVector()) {
    OAHEntry tmp(PtrVector<OAHEntry>::FromLogSize(1));
    auto& arr = tmp.AsVector();
    arr[0] = std::move(*this);
    arr[1] = std::move(e);
    auto res = arr.AllocSize();
    *this = std::move(tmp);
    return res;
  } else {
    auto& arr = AsVector();
    size_t i = 0;
    for (; i < arr.Size(); ++i) {
      if (!arr[i]) {
        arr[i] = std::move(e);
        return 0;
      }
    }
    size_t prev_alloc_size = arr.AllocSize();
    auto new_pos = arr.Size();
    arr.ResizeLog(arr.LogSize() + 1);
    arr[new_pos] = (std::move(e));
    return arr.AllocSize() - prev_alloc_size;
  }
}

uint32_t OAHEntry::ElementsNum() {
  if (Empty()) {
    return 0;
  } else if (!IsVector()) {
    return 1;
  }
  return AsVector().Size();
}

// TODO remove, it is inefficient
OAHEntry& OAHEntry::operator[](uint32_t pos) {
  assert(!Empty());
  if (!IsVector()) {
    assert(pos == 0);
    return *this;
  } else {
    auto& arr = AsVector();
    assert(pos < arr.Size());
    return arr[pos];
  }
}

OAHEntry OAHEntry::Remove(uint32_t pos) {
  if (Empty()) {
    // I'm not sure that this scenario should be check at all
    assert(pos == 0);
    return OAHEntry();
  } else if (!IsVector()) {
    assert(pos == 0);
    return std::move(*this);
  } else {
    auto& arr = AsVector();
    assert(pos < arr.Size());
    return std::move(arr[pos]);
  }
}

OAHEntry OAHEntry::Pop() {
  if (IsVector()) {
    auto& arr = AsVector();
    for (auto& e : arr) {
      if (e)
        return std::move(e);
    }
    return {};
  }
  return std::move(*this);
}

void OAHEntry::Clear() {
  // TODO add optimization to avoid destructor calls during vector allocator
  if (!data_)
    return;

  if (IsVector()) {
    AsVector().~PtrVector<OAHEntry>();
  } else {
    zfree(Raw());
  }
  data_ = 0;
}

uint32_t OAHEntry::GetKeySize() const {
  if (HasSso()) {
    uint8_t size = 0;
    std::memcpy(&size, Raw() + GetExpirySize(), sizeof(size));
    return size;
  }
  uint32_t size = 0;
  std::memcpy(&size, Raw() + GetExpirySize(), sizeof(size));
  return size;
}

void OAHEntry::SetExpiryBit(bool b) {
  if (b)
    data_ |= kExpiryBit;
  else
    data_ &= ~kExpiryBit;
}

size_t OAHEntry::Size() {
  size_t key_field_size = HasSso() ? 1 : 4;
  size_t expiry_field_size = HasExpiry() ? 4 : 0;
  return expiry_field_size + key_field_size + GetKeySize();
}

}  // namespace dfly


================================================
FILE: src/core/oah_entry.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cassert>
#include <cstring>
#include <string_view>

#include "base/hash.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {

#define PREFETCH_READ(x) __builtin_prefetch(x, 0, 1)
#define FORCE_INLINE __attribute__((always_inline))

// TODO add allocator support
template <class T> class PtrVector {
  static constexpr size_t kVectorBit = 1ULL << 0;          // first 3 bits aren't used by pointer
  static constexpr size_t kTagMask = (4095ULL << 52) | 7;  // we reserve 12 high bits and 3 low bits

  static constexpr size_t kLogSizeShift = 56;
  static constexpr size_t kLogSizeMask = 0xFFULL;
  static constexpr size_t kLogSizeShiftedMask = kLogSizeMask << kLogSizeShift;

 public:
  static PtrVector FromLogSize(uint64_t log_size) {
    return PtrVector(log_size);
  }

  T* begin() const {
    return &Raw()[0];
  }

  T* end() const {
    return &Raw()[Size()];
  }

  PtrVector(PtrVector&& other) {
    uptr_ = other.uptr_;
    other.uptr_ = 0;
  }

  ~PtrVector() {
    Clear();
  }

  size_t LogSize() const {
    return (uptr_ >> kLogSizeShift) & kLogSizeMask;
  }

  size_t Size() const {
    return 1 << LogSize();
  }

  uint64_t Release() {
    uint64_t res = uptr_;
    uptr_ = 0;
    return res;
  }

  bool Empty() const {
    if (uptr_ == 0)
      return true;

    for (auto& el : *this) {
      if (el)
        return false;
    }
    return true;
  }

  void ResizeLog(uint64_t new_log_size) {
    auto new_ptr = reinterpret_cast<T*>(zmalloc(sizeof(T) << new_log_size));
    size_t new_size = 1 << new_log_size;
    const size_t size = std::min(Size(), new_size);
    for (size_t i = 0; i < size; ++i) {
      new (new_ptr + i) T(std::move(Raw()[i]));
    }
    for (size_t i = size; i < new_size; ++i) {
      new (new_ptr + i) T();
    }
    Clear();
    uptr_ = reinterpret_cast<uint64_t>(new_ptr);
    SetLogSize(new_log_size);
  }

  T& operator[](size_t idx) {
    return Raw()[idx];
  }

  const T& operator[](size_t idx) const {
    return Raw()[idx];
  }

  T* Raw() const {
    return (T*)(uptr_ & ~kTagMask);
  }

  size_t AllocSize() const {
    return Size() * sizeof(T);
  }

 private:
  void Clear() {
    const size_t size = Size();
    T* raw = Raw();
    if (!raw)
      return;
    for (size_t i = 0; i < size; ++i) {
      if (raw[i])
        raw[i].~T();
    }

    zfree(Raw());
    uptr_ = 0;
  }
  // because of log_size I prefer to hide it
  PtrVector(uint64_t log_size) {
    assert(log_size <= 32);
    uptr_ = reinterpret_cast<uint64_t>(zmalloc(sizeof(T) << log_size));
    const uint64_t size = 1 << log_size;
    for (uint64_t i = 0; i < size; ++i) {
      new (reinterpret_cast<T*>(uptr_) + i) T();
    }
    SetLogSize(log_size);
  }

  void SetLogSize(uint64_t log_size) {
    uptr_ = (uptr_ & ~kLogSizeShiftedMask) | kVectorBit | (uint64_t(log_size) << kLogSizeShift);
  }

  uint64_t uptr_ = 0;
};

// doesn't possess memory, it should be created and release manually
class OAHEntry {
 public:
  // we can assume that high 12 bits of user address space
  // can be used for tagging. At most 52 bits of address are reserved for
  // some configurations, and usually it's 48 bits.
  // https://docs.kernel.org/arch/arm64/memory.html
  // first 3 bits aren't used by pointer
  static constexpr size_t kVectorBit = 1ULL << 0;
  static constexpr size_t kExpiryBit = 1ULL << 1;
  // if bit is set the string length field is 1 byte instead of 4
  static constexpr size_t kSsoBit = 1ULL << 2;

  // extended hash allows us to reduce keys comparisons
  static constexpr size_t kExtHashShift = 52;
  static constexpr uint32_t kExtHashSize = 12;
  static constexpr size_t kExtHashMask = 0xFFFULL;
  static constexpr size_t kExtHashShiftedMask = kExtHashMask << kExtHashShift;

  static constexpr size_t kTagMask = (4095ULL << 52) | 7;  // we reserve 12 high bits and 3 low.

  OAHEntry() = default;

  OAHEntry(std::string_view key, uint32_t expiry = UINT32_MAX);

  // TODO add initializer list constructor
  OAHEntry(PtrVector<OAHEntry>&& vec) {
    data_ = vec.Release() | kVectorBit;
  }

  OAHEntry(const OAHEntry& e) = delete;
  OAHEntry(OAHEntry&& e) {
    data_ = e.data_;
    e.data_ = 0;
  }

  // consider manual removing, we waste a lot of time to check nullptr
  ~OAHEntry() {
    Clear();
  }

  OAHEntry& operator=(const OAHEntry& e) = delete;
  OAHEntry& operator=(OAHEntry&& e) {
    std::swap(data_, e.data_);
    return *this;
  }

  bool Empty() const {
    return data_ == 0;
  }

  operator bool() const {
    return !Empty();
  }

  bool IsVector() const {
    return (data_ & kVectorBit) != 0;
  }

  bool IsEntry() const {
    return (data_ != 0) & !(data_ & kVectorBit);
  }

  size_t AllocSize() const {
    return zmalloc_usable_size(Raw());
  }

  PtrVector<OAHEntry>& AsVector() {
    static_assert(sizeof(PtrVector<OAHEntry>) == sizeof(uint64_t));
    return *reinterpret_cast<PtrVector<OAHEntry>*>(&data_);
  }

  std::string_view Key() const {
    assert(!IsVector());
    return {GetKeyData(), GetKeySize()};
  }

  bool HasExpiry() const {
    return (data_ & kExpiryBit) != 0;
  }

  // returns the expiry time of the current entry or UINT32_MAX if no expiry is set.
  uint32_t GetExpiry() const;

  // TODO consider another option to implement iterator
  OAHEntry* operator->() {
    return this;
  }

  uint64_t GetHash() const {
    return (data_ & kExtHashShiftedMask) >> kExtHashShift;
  }

  bool CheckNoCollisions(const uint64_t ext_hash);

  void SetExtHash(uint64_t ext_hash);

  void ClearHash() {
    data_ &= ~kExtHashShiftedMask;
  }

  void SetExpiry(uint32_t at_sec);

  void ExpireIfNeeded(uint32_t time_now, uint32_t* set_size, size_t* alloc_used);

  // TODO refactor, because it's inefficient
  // Returns additional allocation size of ptrVector
  [[nodiscard]] size_t Insert(OAHEntry&& e);

  uint32_t ElementsNum();

  // TODO remove, it is inefficient
  OAHEntry& operator[](uint32_t pos);

  OAHEntry Remove(uint32_t pos);

  OAHEntry Pop();

  char* Raw() const {
    return (char*)(data_ & ~kTagMask);
  }

 protected:
  void Clear();

  const char* GetKeyData() const {
    uint32_t key_field_size = HasSso() ? 1 : 4;
    return Raw() + GetExpirySize() + key_field_size;
  }

  uint32_t GetKeySize() const;

  void SetExpiryBit(bool b);

  void SetVectorBit() {
    data_ |= kVectorBit;
  }

  void SetSsoBit() {
    data_ |= kSsoBit;
  }

  bool HasSso() const {
    return (data_ & kSsoBit) != 0;
  }

  size_t Size();

  std::uint32_t GetExpirySize() const {
    return HasExpiry() ? sizeof(std::uint32_t) : 0;
  }

  // memory daya layout [Expiry, key_size, key]
  uint64_t data_ = 0;
};

}  // namespace dfly


================================================
FILE: src/core/oah_set.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/numeric/bits.h>
#include <absl/types/span.h>

#include <vector>

#include "core/detail/stateless_allocator.h"
#include "oah_entry.h"

namespace dfly {

// TODO add template parameter instead of OAHEntry
class OAHSet {  // Open Addressing Hash Set
  using OAHEntryAllocator = StatelessAllocator<OAHEntry>;
  using Buckets = std::vector<OAHEntry, OAHEntryAllocator>;

 public:
  class iterator {
   public:
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = OAHEntry;
    using pointer = OAHEntry*;
    using reference = OAHEntry&;

    iterator(OAHSet* owner, uint32_t bucket_id, uint32_t pos_in_bucket)
        : owner_(owner), bucket_(bucket_id), pos_(pos_in_bucket) {
    }

    void SetExpiryTime(uint32_t ttl_sec) {
      auto& entry = owner_->entries_[bucket_][pos_];
      owner_->obj_alloc_used_ -= entry.AllocSize();
      owner_->entries_[bucket_][pos_].SetExpiry(owner_->EntryTTL(ttl_sec));
      owner_->obj_alloc_used_ += entry.AllocSize();
    }

    iterator& operator++() {
      ++pos_;
      SetEntryIt();
      return *this;
    }

    bool operator==(const iterator& r) const {
      if (owner_ == nullptr || r.owner_ == nullptr) {
        return owner_ == r.owner_;
      }
      assert(owner_ == r.owner_);
      return bucket_ == r.bucket_ && pos_ == r.pos_;
    }

    bool operator!=(const iterator& r) const {
      return !operator==(r);
    }

    reference operator*() {
      return owner_->entries_[bucket_][pos_];
    }

    reference operator->() {
      return owner_->entries_[bucket_][pos_];
    }

    bool HasExpiry() {
      return owner_->entries_[bucket_][pos_].HasExpiry();
    }

    uint32_t ExpiryTime() {
      return owner_->entries_[bucket_][pos_].GetExpiry();
    }

    uint32_t bucket_id() const {
      return bucket_;
    }

    operator bool() const {
      return owner_;
    }

    // find valid entry_ iterator starting from buckets_it_ and set it
    void SetEntryIt() {
      if (!owner_)
        return;
      for (auto num_entries = owner_->entries_.size(); bucket_ < num_entries; ++bucket_) {
        auto& bucket = owner_->entries_[bucket_];
        for (uint32_t bucket_size = bucket.ElementsNum(); pos_ < bucket_size; ++pos_) {
          if (bucket[pos_])
            return;
        }
        pos_ = 0;
      }
      owner_ = nullptr;
    }

   private:
    OAHSet* owner_ = nullptr;
    uint32_t bucket_ = 0;
    uint32_t pos_ = 0;
  };

  iterator begin() {
    iterator res(this, 0, 0);
    res.SetEntryIt();
    return res;
  }

  iterator end() {
    return iterator(nullptr, 0, 0);
  }

  explicit OAHSet() = default;

  bool Add(std::string_view str, uint32_t ttl_sec = UINT32_MAX) {
    uint64_t hash = Hash(str);
    auto bucket_id = BucketId(hash, capacity_log_);
    PREFETCH_READ(entries_.data() + bucket_id);
    PREFETCH_READ(entries_.data() + bucket_id + 8);

    if (size_ >= entries_.size()) {
      Reserve(BucketCount() * 2);
      bucket_id = BucketId(hash, capacity_log_);
    }

    uint32_t at = EntryTTL(ttl_sec);
    // TODO maybe we should split memory allocation and copying for the case when we can't add it
    // into set
    OAHEntry entry(str, at);
    SetEntryHash(entry, hash);

    if (FastCheck(bucket_id, str, hash)) {
      return false;
    }

    obj_alloc_used_ += entry.AllocSize();
    AddUnique(std::move(entry), bucket_id, ttl_sec);
    return true;
  }

  void Reserve(size_t sz) {
    sz = absl::bit_ceil(sz);
    if (sz > entries_.size()) {
      auto prev_capacity_log = capacity_log_;
      capacity_log_ = std::max(kMinCapacityLog, uint32_t(absl::bit_width(sz) - 1));
      size_t prev_size = entries_.size();
      entries_.resize(Capacity());
      Rehash(prev_capacity_log, prev_size);
    }
    assert(entries_.size() >= kDisplacementSize);
  }

  // Shrinks the table to the specified size. The new_size must be a power of 2,
  // >= kMinCapacity (which is 1 << kMinCapacityLog), and >= current number of elements.
  // This method should be called explicitly when memory reclamation is needed.
  void Shrink(size_t new_size) {
    assert(absl::has_single_bit(new_size));
    assert(new_size >= (1u << kMinCapacityLog));
    assert(new_size < entries_.size());

    size_t prev_size = entries_.size();
    capacity_log_ = absl::bit_width(new_size) - 1;

    // Process from low to high (opposite of Grow/Rehash).
    for (size_t i = 0; i < prev_size; ++i) {
      ShrinkBucket(i);
    }

    entries_.resize(Capacity());
    entries_.shrink_to_fit();
  }

  void Clear() {
    capacity_log_ = 0;
    entries_.resize(0);
    size_ = 0;
    obj_alloc_used_ = 0;
    ptr_vectors_alloc_used_ = 0;
  }

  // TODO should be removed, inefficient
  void AddUnique(OAHEntry&& e, uint32_t bid, uint32_t ttl_sec = UINT32_MAX) {
    ++size_;
    assert(Capacity() >= kDisplacementSize);
    for (uint32_t i = 0; i < kDisplacementSize; i++) {
      const uint32_t bucket_id = bid + i;
      if (entries_[bucket_id].Empty()) {
        entries_[bucket_id] = std::move(e);
        return;
      }

      // TODO add expiration logic
    }

    bid = GetExtensionPoint(bid);
    assert(bid < entries_.size());

    ptr_vectors_alloc_used_ += entries_[bid].Insert(std::move(e));
  }

  unsigned AddMany(absl::Span<std::string_view> span, uint32_t ttl_sec = UINT32_MAX) {
    Reserve(span.size());
    unsigned res = 0;
    for (auto& s : span) {
      if (Add(s, ttl_sec) != end()) {
        res++;
      }
    }
    return res;
  }

  // TODO: Consider using chunks for this as in StringSet
  void Fill(OAHSet* other) {
    assert(other->entries_.empty());
    other->Reserve(UpperBoundSize());
    other->set_time(time_now());
    for (auto it = begin(), it_end = end(); it != it_end; ++it) {
      other->Add(it->Key(), it.HasExpiry() ? it.ExpiryTime() - time_now() : UINT32_MAX);
    }
  }

  /**
   * stable scanning api. has the same guarantees as redis scan command.
   * we avoid doing bit-reverse by using a different function to derive a bucket id
   * from hash values. By using msb part of hash we make it "stable" with respect to
   * rehashes. For example, with table log size 4 (size 16), entries in bucket id
   * 1110 come from hashes 1110XXXXX.... When a table grows to log size 5,
   * these entries can move either to 11100 or 11101. So if we traversed with our cursor
   * range [0000-1110], it's guaranteed that in grown table we do not need to cover again
   * [00000-11100]. Similarly with shrinkage, if a table is shrunk to log size 3,
   * keys from 1110 and 1111 will move to bucket 111. Again, it's guaranteed that we
   * covered the range [000-111] (all keys in that case).
   * Returns: next cursor or 0 if reached the end of scan.
   * cursor = 0 - initiates a new scan.
   */

  using ItemCb = std::function<void(std::string_view)>;

  uint32_t Scan(uint32_t cursor, const ItemCb& cb) {
    if (entries_.empty())
      return 0;

    uint32_t bucket_id = cursor >> (32 - capacity_log_);

    // First find the bucket to scan, skip empty buckets.
    for (; bucket_id < BucketCount(); ++bucket_id) {
      bool res = false;
      for (uint32_t i = 0; i < kDisplacementSize; i++) {
        const uint32_t shifted_bid = bucket_id + i;
        res |= ScanBucket(entries_[shifted_bid], cb, bucket_id);
      }
      if (res)
        break;
    }

    if (++bucket_id >= BucketCount()) {
      return 0;
    }

    return bucket_id << (32 - capacity_log_);
  }

  OAHEntry Pop() {
    for (auto& bucket : entries_) {
      if (auto res = bucket.Pop(); !res.Empty()) {
        assert(!res.IsVector());
        --size_;
        obj_alloc_used_ -= res.AllocSize();
        if (bucket.IsVector()) {
          if (bucket.AsVector().Empty()) {
            ptr_vectors_alloc_used_ -= bucket.AsVector().AllocSize();
            bucket = OAHEntry();
          }
        }
        return res;
      }
    }
    return {};
  }

  bool Erase(std::string_view str) {
    if (entries_.empty())
      return false;

    uint64_t hash = Hash(str);
    auto bucket_id = BucketId(hash, capacity_log_);
    auto item = FindInternal(bucket_id, str, hash);
    if (item != end()) {
      --size_;
      obj_alloc_used_ -= item->AllocSize();
      *item = OAHEntry();
      uint32_t erase_bucket = item.bucket_id();
      if (entries_[erase_bucket].IsVector()) {
        if (entries_[erase_bucket].AsVector().Empty()) {
          ptr_vectors_alloc_used_ -= entries_[erase_bucket].AsVector().AllocSize();
          entries_[erase_bucket] = OAHEntry();
        }
      }
      return true;
    }
    return false;
  }

  iterator Find(std::string_view member) {
    if (entries_.empty())
      return end();

    uint64_t hash = Hash(member);
    auto bucket_id = BucketId(hash, capacity_log_);

    const auto ext_hash = CalcExtHash(hash, capacity_log_);

    // fast check
    for (uint32_t i = 0; i < kDisplacementSize; i++) {
      const uint32_t bid = bucket_id + i;
      if ((entries_[bid].GetHash() == ext_hash) && entries_[bid].IsEntry()) {
        if (entries_[bid].Key() == member) {
          entries_[bid].ExpireIfNeeded(time_now_, &size_, &obj_alloc_used_);
          return !entries_[bid].Empty() ? iterator{this, bid, 0} : end();
        }
      }
    }

    auto res = FindInternal(bucket_id, member, hash);
    return res;
  }

  bool Contains(std::string_view member) {
    return Find(member) != end();
  }

  // Returns the number of elements in the map. Note that it might be that some of these elements
  // have expired and can't be accessed.
  size_t UpperBoundSize() const {
    return size_;
  }

  bool Empty() const {
    return size_ == 0;
  }

  std::uint32_t BucketCount() const {
    return entries_.empty() ? 0 : (1 << capacity_log_);
  }

  std::uint32_t Capacity() const {
    return (1 << capacity_log_) + kDisplacementSize - 1;
  }

  // set an abstract time that allows expiry.
  void set_time(uint32_t val) {
    time_now_ = val;
  }

  uint32_t time_now() const {
    return time_now_;
  }

  size_t ObjAllocUsed() const {
    return obj_alloc_used_;
  }

  size_t SetAllocUsed() const {
    return entries_.capacity() * sizeof(OAHEntry) + ptr_vectors_alloc_used_;
  }

  bool ExpirationUsed() const {
    // TODO
    assert(false);
    return true;
  }

  size_t SizeSlow() {
    // TODO
    assert(false);
    // CollectExpired();
    return size_;
  }

 private:
  static uint64_t Hash(std::string_view str) {
    constexpr XXH64_hash_t kHashSeed = 24061983;
    return XXH3_64bits_withSeed(str.data(), str.size(), kHashSeed);
  }

  static uint32_t BucketId(uint64_t hash, uint32_t capacity_log) {
    return hash >> (64 - capacity_log);
  }
  // was Grow in StringSet
  void Rehash(uint32_t prev_capacity_log, uint32_t prev_size) {
    if (prev_size == 0) {
      return;
    }
    // we should prevent moving elements before current possition to avoid double processing
    constexpr size_t mix_size = (2 << kShiftLog) - 1;
    std::array<OAHEntry, mix_size> old_buckets{};
    for (size_t i = 0; i < mix_size; ++i) {
      old_buckets[i] = std::move(entries_[i]);
    }

    for (size_t bucket_id = prev_size - 1; bucket_id >= mix_size; --bucket_id) {
      auto bucket = std::move(entries_[bucket_id]);
      for (uint32_t pos = 0, size = bucket.ElementsNum(); pos < size; ++pos) {
        if (bucket[pos]) {
          auto new_bucket_id = RehashEntry(bucket[pos], bucket_id, prev_capacity_log);
          new_bucket_id = FindEmptyAround(new_bucket_id);
          ptr_vectors_alloc_used_ += entries_[new_bucket_id].Insert(std::move(bucket[pos]));
        }
      }
      if (bucket.IsVector())
        ptr_vectors_alloc_used_ -= bucket.AsVector().AllocSize();
    }

    for (size_t bucket_id = 0; bucket_id < mix_size; ++bucket_id) {
      auto& bucket = old_buckets[bucket_id];
      for (uint32_t pos = 0, size = bucket.ElementsNum(); pos < size; ++pos) {
        if (bucket[pos]) {
          auto new_bucket_id = RehashEntry(bucket[pos], bucket_id, prev_capacity_log);
          new_bucket_id = FindEmptyAround(new_bucket_id);
          ptr_vectors_alloc_used_ += entries_[new_bucket_id].Insert(std::move(bucket[pos]));
        }
      }
      if (bucket.IsVector())
        ptr_vectors_alloc_used_ -= bucket.AsVector().AllocSize();
    }
  }

  // it is inefficient for now,
  // TODO predict new position by current position and extended hash
  void ShrinkBucket(uint32_t bucket_id) {
    auto bucket = std::move(entries_[bucket_id]);
    if (bucket.Empty())
      return;

    for (uint32_t pos = 0, size = bucket.ElementsNum(); pos < size; ++pos) {
      if (bucket[pos]) {
        // Check for TTL expiration during shrink - skip expired elements
        if (bucket[pos].HasExpiry() && bucket[pos].GetExpiry() <= time_now_) {
          obj_alloc_used_ -= bucket[pos].AllocSize();
          --size_;
          continue;
        }

        auto hash = Hash(bucket[pos].Key());
        auto new_bucket_id = BucketId(hash, capacity_log_);
        SetEntryHash(bucket[pos], hash);
        new_bucket_id = FindEmptyAround(new_bucket_id);
        ptr_vectors_alloc_used_ += entries_[new_bucket_id].Insert(std::move(bucket[pos]));
      }
    }

    if (bucket.IsVector()) {
      ptr_vectors_alloc_used_ -= bucket.AsVector().AllocSize();
    }
  }

  uint32_t GetExtensionPoint(const uint32_t bid) const {
    constexpr uint32_t extension_point_shift = kDisplacementSize - 1;
    return bid | extension_point_shift;
  }

  bool FastCheck(const uint32_t bid, std::string_view str, uint64_t hash) {
    const auto ext_hash = CalcExtHash(hash, capacity_log_);
    const auto ext_bid = GetExtensionPoint(bid);

    bool res = true;
    for (uint32_t i = 0; i < kDisplacementSize; i++) {
      const uint32_t bucket_id = bid + i;
      res &= entries_[bucket_id].CheckNoCollisions(ext_hash);
    }

    if (res) {
      if (entries_[ext_bid].IsVector()) {
        auto& vec = entries_[ext_bid].AsVector();
        auto raw_arr = vec.Raw();
        for (size_t i = 0, size = vec.Size(); i < size; ++i) {
          res &= raw_arr[i].CheckNoCollisions(ext_hash);
        }
      }
      if (!res) {
        auto pos = FindInBucket(entries_[ext_bid], str, ext_hash);
        if (pos) {
          return true;
        }
      }
    } else {
      return FindInternal(bid, str, hash);
    }
    return false;
  }

  template <class T, std::enable_if_t<std::is_invocable_v<T, std::string_view>>* = nullptr>
  bool ScanBucket(OAHEntry& entry, const T& cb, uint32_t bucket_id) {
    if (!entry.IsVector()) {
      entry.ExpireIfNeeded(time_now_, &size_, &obj_alloc_used_);
      if (CheckBucketAffiliation(entry, bucket_id)) {
        cb(entry.Key());
        return true;
      }
    } else {
      auto& arr = entry.AsVector();
      bool result = false;
      for (auto& el : arr) {
        el.ExpireIfNeeded(time_now_, &size_, &obj_alloc_used_);
        if (CheckBucketAffiliation(el, bucket_id)) {
          cb(el.Key());
          result = true;
        }
      }
      return result;
    }
    return false;
  }

  uint32_t EntryTTL(uint32_t ttl_sec) const {
    return ttl_sec == UINT32_MAX ? ttl_sec : time_now_ + ttl_sec;
  }

  uint32_t FindEmptyAround(uint32_t bid) {
    for (uint32_t i = 0; i < kDisplacementSize; i++) {
      const uint32_t bucket_id = bid + i;
      if (entries_[bucket_id].Empty())
        return bucket_id;
      // TODO add expiration logic
    }

    bid = GetExtensionPoint(bid);
    assert(bid < entries_.size());
    return bid;
  }

  // Searches for a string within a bucket entry (which may be a single entry or a vector).
  // Returns the position within the bucket if found, or std::nullopt if not found.
  std::optional<uint32_t> FindInBucket(OAHEntry& bucket, std::string_view str, uint64_t ext_hash) {
    if (bucket.IsEntry()) {
      bucket.ExpireIfNeeded(time_now_, &size_, &obj_alloc_used_);
      return CheckExtendedHash(bucket, ext_hash) && bucket.Key() == str ? 0
                                                                        : std::optional<uint32_t>();
    }
    if (bucket.IsVector()) {
      auto& vec = bucket.AsVector();
      auto raw_arr = vec.Raw();
      for (size_t i = 0, size = vec.Size(); i < size; ++i) {
        raw_arr[i].ExpireIfNeeded(time_now_, &size_, &obj_alloc_used_);
        if (CheckExtendedHash(raw_arr[i], ext_hash) && raw_arr[i].Key() == str) {
          return i;
        }
      }
    }
    return std::nullopt;
  }

  // return bucket_id and position otherwise max
  iterator FindInternal(uint32_t bid, std::string_view str, uint64_t hash) {
    const auto ext_hash = CalcExtHash(hash, capacity_log_);
    for (uint32_t i = 0; i < kDisplacementSize; i++) {
      const uint32_t bucket_id = bid + i;
      auto pos = FindInBucket(entries_[bucket_id], str, ext_hash);
      if (pos) {
        return iterator{this, bucket_id, *pos};
      }
    }
    return end();
  }

 private:
  static constexpr std::uint32_t kShiftLog = 2;                         // TODO make template
  static constexpr std::uint32_t kMinCapacityLog = kShiftLog;           // should be >= ShiftLog
  static constexpr std::uint32_t kDisplacementSize = (1 << kShiftLog);  // TODO check

  static uint64_t CalcExtHash(uint64_t hash, uint32_t capacity_log) {
    const uint32_t start_hash_bit = capacity_log > kShiftLog ? capacity_log - kShiftLog : 0;
    const uint32_t ext_hash_shift = 64 - start_hash_bit - OAHEntry::kExtHashSize;
    return (hash >> ext_hash_shift) & OAHEntry::kExtHashMask;
  }

  uint64_t SetEntryHash(OAHEntry& entry, uint64_t hash) {
    uint64_t ext_hash = CalcExtHash(hash, capacity_log_);
    entry.SetExtHash(ext_hash);
    return ext_hash;
  }

  bool CheckBucketAffiliation(OAHEntry& entry, uint32_t bucket_id) {
    assert(!entry.IsVector());
    if (entry.Empty())
      return false;
    uint32_t bucket_id_hash_part = capacity_log_ > kShiftLog ? kShiftLog : capacity_log_;
    uint32_t bucket_mask = (1 << bucket_id_hash_part) - 1;
    bucket_id &= bucket_mask;
    auto stored_hash = entry.GetHash();
    if (!stored_hash) {
      stored_hash = SetEntryHash(entry, Hash(entry.Key()));
    }
    uint32_t stored_bucket_id = stored_hash >> (OAHEntry::kExtHashSize - bucket_id_hash_part);
    return bucket_id == stored_bucket_id;
  }

  bool CheckExtendedHash(OAHEntry& entry, uint64_t ext_hash) {
    auto stored_hash = entry.GetHash();
    if (!stored_hash) {
      if (entry.IsEntry()) {
        stored_hash = SetEntryHash(entry, Hash(entry.Key()));
      } else {
        return false;
      }
    }
    return stored_hash == ext_hash;
  }

  // return new bucket_id
  uint32_t RehashEntry(OAHEntry& entry, uint32_t current_bucket_id, uint32_t prev_capacity_log) {
    assert(!entry.IsVector());
    auto stored_hash = entry.GetHash();

    const uint32_t logs_diff = capacity_log_ - prev_capacity_log;
    const uint32_t prev_significant_bits =
        prev_capacity_log > kShiftLog ? kShiftLog : prev_capacity_log;
    const uint32_t needed_hash_bits = prev_significant_bits + logs_diff;

    if (!stored_hash || needed_hash_bits > OAHEntry::kExtHashSize) {
      auto hash = Hash(entry.Key());
      SetEntryHash(entry, hash);
      return BucketId(hash, capacity_log_);
    }

    const uint32_t real_bucket_end =
        stored_hash >> (OAHEntry::kExtHashSize - prev_significant_bits);
    const uint32_t prev_shift_mask = (1 << prev_significant_bits) - 1;
    const uint32_t curr_shift = (current_bucket_id - real_bucket_end) & prev_shift_mask;
    const uint32_t prev_bucket_mask = (1 << prev_capacity_log) - 1;
    const uint32_t base_bucket_id = (current_bucket_id - curr_shift) & prev_bucket_mask;

    const uint32_t last_bits_mask = (1 << logs_diff) - 1;
    const uint32_t stored_hash_shift = OAHEntry::kExtHashSize - needed_hash_bits;
    const uint32_t last_bits = (stored_hash >> stored_hash_shift) & last_bits_mask;
    const uint32_t new_bucket_id = (base_bucket_id << logs_diff) | last_bits;

    entry.ClearHash();  // the cache is invalid after rehash operation

    assert(BucketId(Hash(entry.Key()), capacity_log_) == new_bucket_id);

    return new_bucket_id;
  }

  mutable size_t obj_alloc_used_ = 0;
  mutable size_t ptr_vectors_alloc_used_ = 0;

  std::uint32_t capacity_log_ = 0;
  std::uint32_t size_ = 0;  // number of elements in the set.
  std::uint32_t time_now_ = 0;
  Buckets entries_;
};

}  // namespace dfly


================================================
FILE: src/core/oah_set_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/oah_set.h"

#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <mimalloc.h>

#include <random>
#include <set>
#include <unordered_set>

#include "base/gtest.h"
#include "core/mi_memory_resource.h"
#include "glog/logging.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {

using namespace std;

class OAHSetTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitTLStatelessAllocMR(PMR_NS::get_default_resource());
  }

  static void TearDownTestSuite() {
  }

  void SetUp() override {
    ss_ = new OAHSet;
    generator_.seed(0);
  }

  void TearDown() override {
    delete ss_;

    // ensure there are no memory leaks after every test
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
  }

  OAHSet* ss_;
  mt19937 generator_;
};

static string random_string(mt19937& rand, unsigned len) {
  const string_view alpanum = "1234567890abcdefghijklmnopqrstuvwxyz";
  string ret;
  ret.reserve(len);

  for (size_t i = 0; i < len; ++i) {
    ret += alpanum[rand() % alpanum.size()];
  }

  return ret;
}

TEST_F(OAHSetTest, PtrVectorTest) {
  PtrVector<int> vp(PtrVector<int>::FromLogSize(3));
  EXPECT_EQ(vp.Size(), 8);
  EXPECT_EQ(vp.LogSize(), 3);
  size_t i = 0;
  for (; i < vp.Size(); ++i) {
    EXPECT_EQ(vp[i], 0);
    vp[i] = i + 1;
  }
  vp.ResizeLog(4);

  for (; i < vp.Size(); ++i) {
    EXPECT_EQ(vp[i], 0);
    vp[i] = i + 1;
  }
  EXPECT_EQ(vp.Size(), 16);
  EXPECT_EQ(vp.LogSize(), 4);
  for (size_t i = 0; i < vp.Size(); ++i) {
    EXPECT_EQ(vp[i], i + 1);
  }
}

TEST_F(OAHSetTest, OAHEntryTest) {
  OAHEntry test("0123456789", 2);

  EXPECT_EQ(test.Key(), "0123456789"sv);
  EXPECT_EQ(test.GetExpiry(), 2);

  OAHEntry first("123456789");

  EXPECT_EQ(test.Insert(std::move(first)), 16);

  EXPECT_EQ(test.Insert(OAHEntry("23456789")), 16);

  EXPECT_TRUE(test.Remove(0));
  EXPECT_FALSE(test.Remove(0));

  EXPECT_EQ(test.Remove(2).Key(), "23456789");
  EXPECT_EQ(test.Pop().Key(), "123456789");
}

TEST_F(OAHSetTest, OAHSetAddFindTest) {
  OAHSet ss;
  std::set<std::string> test_set;

  for (int i = 0; i < 10000; ++i) {
    test_set.insert(base::RandStr(20));
  }

  for (const auto& s : test_set) {
    EXPECT_TRUE(ss.Add(s));
  }

  for (const auto& s : test_set) {
    auto e = ss.Find(s);
    EXPECT_EQ(e->Key(), s);
  }

  EXPECT_EQ(ss.BucketCount(), 16384);
}

TEST_F(OAHSetTest, Basic) {
  EXPECT_TRUE(ss_->Add("foo"sv));
  EXPECT_TRUE(ss_->Add("bar"sv));
  uint32_t size = ss_->UpperBoundSize();
  EXPECT_FALSE(ss_->Add("foo"sv));
  EXPECT_FALSE(ss_->Add("bar"sv));
  EXPECT_EQ(ss_->UpperBoundSize(), size);
  EXPECT_TRUE(ss_->Contains("foo"sv));
  EXPECT_TRUE(ss_->Contains("bar"sv));
  EXPECT_EQ(2, ss_->UpperBoundSize());
}

TEST_F(OAHSetTest, StandardAddErase) {
  EXPECT_TRUE(ss_->Add("@@@@@@@@@@@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("A@@@@@@@@@@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AA@@@@@@@@@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAA@@@@@@@@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAAAAAAAA@@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAAAAAAAAA@@@@@@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAA@") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAAA") != ss_->end());
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAAD") != ss_->end());
  EXPECT_TRUE(ss_->Add("BBBBBAAAAAAAAAAA") != ss_->end());
  EXPECT_TRUE(ss_->Add("BBBBBBBBAAAAAAAA") != ss_->end());
  EXPECT_TRUE(ss_->Add("CCCCCBBBBBBBBBBB") != ss_->end());

  // Remove link in the middle of chain
  EXPECT_TRUE(ss_->Erase("BBBBBBBBAAAAAAAA"));
  // Remove start of a chain
  EXPECT_TRUE(ss_->Erase("CCCCCBBBBBBBBBBB"));
  // Remove end of link
  EXPECT_TRUE(ss_->Erase("AAA@@@@@@@@@@@@@"));
  // Remove only item in chain
  EXPECT_TRUE(ss_->Erase("AA@@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAA@@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAAA@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAAAAAAAA@"));
}

TEST_F(OAHSetTest, DisplacedBug) {
  string_view vals[] = {"imY", "OVl", "NhH", "BCe", "YDL", "lpb",
                        "nhF", "xod", "zYR", "PSa", "hce", "cTR"};
  ss_->AddMany(absl::MakeSpan(vals), UINT32_MAX);

  ss_->Add("fIc");
  ss_->Erase("YDL");
  ss_->Add("fYs");
  ss_->Erase("hce");
  ss_->Erase("nhF");
  ss_->Add("dye");
  ss_->Add("xZT");
  ss_->Add("LVK");
  ss_->Erase("zYR");
  ss_->Erase("fYs");
  ss_->Add("ueB");
  ss_->Erase("PSa");
  ss_->Erase("OVl");
  ss_->Add("cga");
  ss_->Add("too");
  ss_->Erase("ueB");
  ss_->Add("HZe");
  ss_->Add("oQn");
  ss_->Erase("too");
  ss_->Erase("HZe");
  ss_->Erase("xZT");
  ss_->Erase("cga");
  ss_->Erase("cTR");
  ss_->Erase("BCe");
  ss_->Add("eua");
  ss_->Erase("lpb");
  ss_->Add("OXK");
  ss_->Add("QmO");
  ss_->Add("SzV");
  ss_->Erase("QmO");
  ss_->Add("jbe");
  ss_->Add("BPN");
  ss_->Add("OfH");
  ss_->Add("Muf");
  ss_->Add("CwP");
  ss_->Erase("Muf");
  ss_->Erase("xod");
  ss_->Add("Cis");
  ss_->Add("Xvd");
  ss_->Erase("SzV");
  ss_->Erase("eua");
  ss_->Add("DGb");
  ss_->Add("leD");
  ss_->Add("MVX");
  ss_->Add("HPq");
}

TEST_F(OAHSetTest, Resizing) {
  constexpr size_t num_strs = 4096;
  unordered_set<string> strs;
  while (strs.size() != num_strs) {
    auto str = random_string(generator_, 10);
    strs.insert(str);
  }

  unsigned size = 0;
  for (auto it = strs.begin(); it != strs.end(); ++it) {
    const auto& str = *it;
    EXPECT_TRUE(ss_->Add(str, 1));
    EXPECT_EQ(ss_->UpperBoundSize(), size + 1);

    // make sure we haven't lost any items after a grow
    // which happens every power of 2
    if ((size & (size - 1)) == 0) {
      for (auto j = strs.begin(); j != it; ++j) {
        const auto& str = *j;
        auto it = ss_->Find(str);
        ASSERT_NE(it, ss_->end());
        EXPECT_TRUE(it.HasExpiry());
        EXPECT_EQ(it.ExpiryTime(), ss_->time_now() + 1);
      }
    }
    ++size;
  }
}

TEST_F(OAHSetTest, SimpleScan) {
  unordered_set<string_view> info = {"foo", "bar"};
  unordered_set<string_view> seen;

  for (auto str : info) {
    EXPECT_TRUE(ss_->Add(str));
  }

  uint32_t cursor = 0;
  do {
    cursor = ss_->Scan(cursor, [&](std::string_view str) {
      EXPECT_TRUE(info.count(str));
      seen.insert(str);
    });
  } while (cursor != 0);

  EXPECT_EQ(seen.size(), info.size());
  EXPECT_TRUE(equal(seen.begin(), seen.end(), info.begin()));
}

// // Ensure REDIS scan guarantees are met
TEST_F(OAHSetTest, ScanGuarantees) {
  unordered_set<string_view> to_be_seen = {"foo", "bar"};
  unordered_set<string_view> not_be_seen = {"AAA", "BBB"};
  unordered_set<string_view> maybe_seen = {"AA@@@@@@@@@@@@@@", "AAA@@@@@@@@@@@@@",
                                           "AAAAAAAAA@@@@@@@", "AAAAAAAAAA@@@@@@"};
  unordered_set<string_view> seen;

  auto scan_callback = [&](std::string_view str) {
    EXPECT_TRUE(to_be_seen.count(str) || maybe_seen.count(str));
    EXPECT_FALSE(not_be_seen.count(str));
    if (to_be_seen.count(str)) {
      seen.insert(str);
    }
  };

  EXPECT_EQ(ss_->Scan(0, scan_callback), 0);

  for (auto str : not_be_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  for (auto str : not_be_seen) {
    EXPECT_TRUE(ss_->Erase(str));
  }

  for (auto str : to_be_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  // should reach at least the first item in the set
  uint32_t cursor = ss_->Scan(0, scan_callback);

  for (auto str : maybe_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  while (cursor != 0) {
    cursor = ss_->Scan(cursor, scan_callback);
  }

  EXPECT_TRUE(seen.size() == to_be_seen.size());
}

TEST_F(OAHSetTest, IntOnly) {
  constexpr size_t num_ints = 8192;
  unordered_set<unsigned int> numbers;
  for (size_t i = 0; i < num_ints; ++i) {
    numbers.insert(i);
    EXPECT_TRUE(ss_->Add(to_string(i)));
  }
  EXPECT_EQ(ss_->UpperBoundSize(), num_ints);

  for (size_t i = 0; i < num_ints; ++i) {
    ASSERT_FALSE(ss_->Add(to_string(i)));
  }
  EXPECT_EQ(ss_->UpperBoundSize(), num_ints);

  size_t num_remove = generator_() % 4096;
  unordered_set<string> removed;

  for (size_t i = 0; i < num_remove; ++i) {
    auto remove_int = generator_() % num_ints;
    auto remove = to_string(remove_int);
    if (numbers.count(remove_int)) {
      ASSERT_TRUE(ss_->Contains(remove)) << remove_int;
      EXPECT_TRUE(ss_->Erase(remove));
      numbers.erase(remove_int);
    } else {
      EXPECT_FALSE(ss_->Erase(remove));
    }

    EXPECT_FALSE(ss_->Contains(remove));
    removed.insert(remove);
  }

  size_t expected_seen = 0;
  auto scan_callback = [&](std::string_view str_v) {
    std::string str(str_v);
    EXPECT_FALSE(removed.count(str));

    if (numbers.count(std::atoi(str.data()))) {
      ++expected_seen;
    }
  };

  uint32_t cursor = 0;
  do {
    cursor = ss_->Scan(cursor, scan_callback);
    // randomly throw in some new numbers
    uint32_t val = generator_();
    ss_->Add(to_string(val));
  } while (cursor != 0);

  EXPECT_GE(expected_seen + removed.size(), num_ints);
}

TEST_F(OAHSetTest, XtremeScanGrow) {
  unordered_set<string> to_see, force_grow, seen;

  while (to_see.size() != 8) {
    to_see.insert(random_string(generator_, 10));
  }

  while (force_grow.size() != 8192) {
    string str = random_string(generator_, 10);

    if (to_see.count(str)) {
      continue;
    }

    force_grow.insert(random_string(generator_, 10));
  }

  for (auto& str : to_see) {
    EXPECT_TRUE(ss_->Add(str));
  }

  auto scan_callback = [&](string_view strv) {
    std::string str(strv);
    if (to_see.count(str)) {
      seen.insert(str);
    }
  };

  uint32_t cursor = ss_->Scan(0, scan_callback);

  // force approx 10 grows
  for (auto& s : force_grow) {
    EXPECT_TRUE(ss_->Add(s));
  }

  while (cursor != 0) {
    cursor = ss_->Scan(cursor, scan_callback);
  }

  EXPECT_EQ(seen.size(), to_see.size());
}

TEST_F(OAHSetTest, Pop) {
  constexpr size_t num_items = 8;
  unordered_set<string> to_insert;

  while (to_insert.size() != num_items) {
    auto str = random_string(generator_, 10);
    if (to_insert.count(str)) {
      continue;
    }

    to_insert.insert(str);
    EXPECT_TRUE(ss_->Add(str));
  }

  while (!ss_->Empty()) {
    size_t size = ss_->UpperBoundSize();
    auto str = ss_->Pop();
    DCHECK(ss_->UpperBoundSize() == to_insert.size() - 1);
    DCHECK(str);
    DCHECK(to_insert.count(std::string(str.Key())));
    DCHECK_EQ(ss_->UpperBoundSize(), size - 1);
    to_insert.erase(std::string(str.Key()));
  }

  DCHECK(ss_->Empty());
  DCHECK(to_insert.empty());
}

TEST_F(OAHSetTest, Iteration) {
  ss_->Add("foo");
  for (const auto& ptr : *ss_) {
    LOG(INFO) << ptr;
  }
  ss_->Clear();
  constexpr size_t num_items = 8192;
  unordered_set<string> to_insert;

  while (to_insert.size() != num_items) {
    auto str = random_string(generator_, 10);
    if (to_insert.count(str)) {
      continue;
    }

    to_insert.insert(str);
    EXPECT_TRUE(ss_->Add(str));
  }

  for (const auto& ptr : *ss_) {
    std::string str(ptr.Key());
    EXPECT_TRUE(to_insert.count(str));
    to_insert.erase(str);
  }

  EXPECT_EQ(to_insert.size(), 0);
}

TEST_F(OAHSetTest, SetFieldExpireHasExpiry) {
  EXPECT_TRUE(ss_->Add("k1", 100));
  auto k = ss_->Find("k1");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 100);
  k.SetExpiryTime(1);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 1);
}

TEST_F(OAHSetTest, SetFieldExpireNoHasExpiry) {
  EXPECT_TRUE(ss_->Add("k1"));
  auto k = ss_->Find("k1");
  EXPECT_FALSE(k.HasExpiry());
  k.SetExpiryTime(10);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 10);
}

TEST_F(OAHSetTest, Ttl) {
  EXPECT_TRUE(ss_->Add("bla"sv, 1));
  EXPECT_FALSE(ss_->Add("bla"sv, 1));
  auto it = ss_->Find("bla"sv);
  EXPECT_EQ(1u, it.ExpiryTime());

  ss_->set_time(1);
  EXPECT_TRUE(ss_->Add("bla"sv, 1));
  EXPECT_EQ(1u, ss_->UpperBoundSize());

  for (unsigned i = 0; i < 100; ++i) {
    EXPECT_TRUE(ss_->Add(absl::StrCat("foo", i), 1));
  }
  EXPECT_EQ(101u, ss_->UpperBoundSize());
  it = ss_->Find("foo50");
  EXPECT_EQ("foo50"sv, it->Key());
  EXPECT_EQ(2u, it.ExpiryTime());

  ss_->set_time(2);
  // Cleanup all `foo` entries
  uint32_t cursor = 0;
  do {
    cursor = ss_->Scan(cursor, [&](std::string_view) {});
  } while (cursor != 0);

  for (unsigned i = 0; i < 100; ++i) {
    EXPECT_TRUE(ss_->Add(absl::StrCat("bar", i)));
  }
  EXPECT_EQ(100u, ss_->UpperBoundSize());
  it = ss_->Find("bar50");
  EXPECT_FALSE(it.HasExpiry());

  for (auto it = ss_->begin(); it != ss_->end(); ++it) {
    ASSERT_TRUE(absl::StartsWith(it->Key(), "bar")) << it->Key();
    string str(it->Key());
    VLOG(1) << *it;
  }
}

TEST_F(OAHSetTest, Grow) {
  for (size_t j = 0; j < 10; ++j) {
    for (size_t i = 0; i < 4098; ++i) {
      ss_->Reserve(generator_() % 256);
      auto str = random_string(generator_, 3);
      ss_->Add(str);
    }
    ss_->Clear();
  }
}

TEST_F(OAHSetTest, Reserve) {
  vector<string> strs;

  for (size_t i = 0; i < 10; ++i) {
    strs.push_back(random_string(generator_, 10));
    ss_->Add(strs.back());
  }

  for (size_t j = 2; j < 20; j += 3) {
    ss_->Reserve(j * 20);
    for (size_t i = 0; i < 10; ++i) {
      ASSERT_TRUE(ss_->Contains(strs[i]));
    }
  }
}

TEST_F(OAHSetTest, Fill) {
  for (size_t i = 0; i < 100; ++i) {
    ss_->Add(random_string(generator_, 10));
  }
  OAHSet s2;
  ss_->Fill(&s2);
  EXPECT_EQ(s2.UpperBoundSize(), ss_->UpperBoundSize());
  for (const auto& s : *ss_) {
    EXPECT_TRUE(s2.Contains(s.Key()));
  }
}

TEST_F(OAHSetTest, IterateEmpty) {
  for (const auto& s : *ss_) {
    // We're iterating to make sure there is no crash. However, if we got here, it's a bug
    CHECK(false) << "Found entry " << s << " in empty set";
  }
}

static size_t MemUsed(OAHSet& obj) {
  return obj.ObjAllocUsed() + obj.SetAllocUsed();
}

void BM_Clone(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  OAHSet ss1, ss2;
  unsigned elems = state.range(0);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, 10);
    ss1.Add(str);
  }
  ss2.Reserve(ss1.UpperBoundSize());
  while (state.KeepRunning()) {
    for (auto& src : ss1) {
      ss2.Add(src.Key());
    }
    state.PauseTiming();
    ss2.Clear();
    ss2.Reserve(ss1.UpperBoundSize());
    state.ResumeTiming();
  }
}
BENCHMARK(BM_Clone)->ArgName("elements")->Arg(32000);

void BM_Fill(benchmark::State& state) {
  unsigned elems = state.range(0);
  vector<string> strs;
  mt19937 generator(0);
  OAHSet ss1, ss2;
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, 10);
    ss1.Add(str);
  }

  while (state.KeepRunning()) {
    ss1.Fill(&ss2);
    state.PauseTiming();
    ss2.Clear();
    state.ResumeTiming();
  }
}
BENCHMARK(BM_Fill)->ArgName("elements")->Arg(32000);

void BM_Clear(benchmark::State& state) {
  unsigned elems = state.range(0);
  mt19937 generator(0);
  OAHSet ss;
  while (state.KeepRunning()) {
    state.PauseTiming();
    for (size_t i = 0; i < elems; ++i) {
      string str = random_string(generator, 16);
      ss.Add(str);
    }
    state.ResumeTiming();
    ss.Clear();
  }
}
BENCHMARK(BM_Clear)->ArgName("elements")->Arg(32000);

void BM_Add(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  OAHSet ss;
  unsigned elems = state.range(0);
  unsigned keySize = state.range(1);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, keySize);
    strs.push_back(str);
  }
  ss.Reserve(elems);
  size_t mem_used = 0;
  while (state.KeepRunning()) {
    for (auto& str : strs)
      ss.Add(str);
    state.PauseTiming();
    mem_used += MemUsed(ss);
    ss.Clear();
    ss.Reserve(elems);
    state.ResumeTiming();
  }
  state.counters["Memory_Used"] = mem_used / state.iterations();
}
BENCHMARK(BM_Add)
    ->ArgNames({"elements", "KeySize"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_AddMany(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  OAHSet ss;
  unsigned elems = state.range(0);
  unsigned keySize = state.range(1);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, keySize);
    strs.push_back(str);
  }
  ss.Reserve(elems);
  vector<string_view> svs;
  size_t mem_used = 0;
  for (const auto& str : strs) {
    svs.push_back(str);
  }
  while (state.KeepRunning()) {
    ss.AddMany(absl::MakeSpan(svs));
    state.PauseTiming();
    CHECK_EQ(ss.UpperBoundSize(), elems);
    mem_used += MemUsed(ss);
    ss.Clear();
    ss.Reserve(elems);
    state.ResumeTiming();
  }
  state.counters["Memory_Used"] = mem_used / state.iterations();
}
BENCHMARK(BM_AddMany)
    ->ArgNames({"elements", "KeySize"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Erase(benchmark::State& state) {
  std::vector<std::string> strs;
  mt19937 generator(0);
  OAHSet ss;
  auto elems = state.range(0);
  auto keySize = state.range(1);
  for (long int i = 0; i < elems; ++i) {
    std::string str = random_string(generator, keySize);
    strs.push_back(str);
    ss.Add(str);
  }
  state.counters["Memory_Before_Erase"] = MemUsed(ss);
  size_t mem_used = 0;
  while (state.KeepRunning()) {
    for (auto& str : strs) {
      ss.Erase(str);
    }
    state.PauseTiming();
    mem_used += MemUsed(ss);
    for (auto& str : strs) {
      ss.Add(str);
    }
    state.ResumeTiming();
  }
  state.counters["Memory_After_Erase"] = mem_used / state.iterations();
}
BENCHMARK(BM_Erase)
    ->ArgNames({"elements", "KeySize"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Get(benchmark::State& state) {
  std::vector<std::string> strs;
  mt19937 generator(0);
  OAHSet ss;
  auto elems = state.range(0);
  auto keySize = state.range(1);
  for (long int i = 0; i < elems; ++i) {
    std::string str = random_string(generator, keySize);
    strs.push_back(str);
    ss.Add(str);
  }
  while (state.KeepRunning()) {
    for (auto& str : strs) {
      ss.Find(str);
    }
  }
}
BENCHMARK(BM_Get)
    ->ArgNames({"elements", "KeySize"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Grow(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  OAHSet src;
  unsigned elems = 1 << 18;
  for (size_t i = 0; i < elems; ++i) {
    src.Add(random_string(generator, 16), UINT32_MAX);
    strs.push_back(random_string(generator, 16));
  }

  while (state.KeepRunning()) {
    state.PauseTiming();
    OAHSet tmp;
    src.Fill(&tmp);
    CHECK_EQ(tmp.BucketCount(), elems);
    state.ResumeTiming();
    for (const auto& str : strs) {
      tmp.Add(str);
      if (tmp.BucketCount() > elems) {
        break;  // we grew
      }
    }

    CHECK_GT(tmp.BucketCount(), elems);
  }
}
BENCHMARK(BM_Grow);

// unsigned total_wasted_memory = 0;

// TEST_F(OAHSetTest, ReallocIfNeeded) {
//   auto build_str = [](size_t i) { return to_string(i) + string(131, 'a'); };

//   auto count_waste = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
//                         size_t block_size, void* arg) {
//     size_t used = block_size * area->used;
//     total_wasted_memory += area->committed - used;
//     return true;
//   };

//   for (size_t i = 0; i < 10'000; i++)
//     ss_->Add(build_str(i));

//   for (size_t i = 0; i < 10'000; i++) {
//     if (i % 10 == 0)
//       continue;
//     ss_->Erase(build_str(i));
//   }

//   mi_heap_collect(mi_heap_get_backing(), true);
//   mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
//   size_t wasted_before = total_wasted_memory;

//   size_t underutilized = 0;
//   for (auto it = ss_->begin(); it != ss_->end(); ++it) {
//     underutilized += zmalloc_page_is_underutilized(*it, 0.9);
//     it.ReallocIfNeeded(0.9);
//   }
//   // Check there are underutilized pages
//   CHECK_GT(underutilized, 0u);

//   total_wasted_memory = 0;
//   mi_heap_collect(mi_heap_get_backing(), true);
//   mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
//   size_t wasted_after = total_wasted_memory;

//   // Check we waste significanlty less now
//   EXPECT_GT(wasted_before, wasted_after * 2);

//   EXPECT_EQ(ss_->UpperBoundSize(), 1000);
//   for (size_t i = 0; i < 1000; i++)
//     EXPECT_EQ(*ss_->Find(build_str(i * 10)), build_str(i * 10));
// }

class ShrinkTest : public OAHSetTest, public ::testing::WithParamInterface<size_t> {};

TEST_P(ShrinkTest, BasicShrink) {
  constexpr size_t num_strs = 1000000;
  size_t shrink_to = GetParam();

  vector<string> strs;
  for (size_t i = 0; i < num_strs; ++i) {
    strs.push_back(random_string(generator_, 10));
    EXPECT_TRUE(ss_->Add(strs.back()));
  }

  // Grow to a larger size
  ss_->Reserve(1 << 22);
  size_t original_bucket_count = ss_->BucketCount();
  EXPECT_EQ(original_bucket_count, 1u << 22);

  // Shrink to the parameterized size
  ss_->Shrink(shrink_to);

  EXPECT_EQ(ss_->BucketCount(), shrink_to);
  EXPECT_EQ(ss_->UpperBoundSize(), num_strs);

  // Verify all elements are still accessible
  for (const auto& str : strs) {
    EXPECT_TRUE(ss_->Contains(str)) << "Missing: " << str;
  }
}

INSTANTIATE_TEST_SUITE_P(ShrinkSizes, ShrinkTest,
                         ::testing::Values(1u << 21,   // 2M buckets (sparse)
                                           1u << 20,   // 1M buckets (~1 per bucket)
                                           1u << 19),  // 512K buckets (~2 per bucket)
                         [](const auto& info) { return absl::StrCat("buckets_", info.param); });

TEST_F(OAHSetTest, ShrinkWithTTL) {
  constexpr size_t num_strs = 1000000;

  // Track elements by their TTL category
  vector<string> expired_strs;    // TTL 1-50, will expire
  vector<string> surviving_strs;  // TTL 51-100, will survive
  vector<string> no_ttl_strs;     // No TTL, will survive

  for (size_t i = 0; i < num_strs; ++i) {
    string str = random_string(generator_, 10);
    if (i % 3 == 0) {
      // No TTL
      EXPECT_TRUE(ss_->Add(str));
      no_ttl_strs.push_back(str);
    } else if (i % 3 == 1) {
      // TTL 1-50 (will expire when time=50)
      uint32_t ttl = (i % 50) + 1;
      EXPECT_TRUE(ss_->Add(str, ttl));
      expired_strs.push_back(str);
    } else {
      // TTL 51-100 (will survive when time=50)
      uint32_t ttl = (i % 50) + 51;
      EXPECT_TRUE(ss_->Add(str, ttl));
      surviving_strs.push_back(str);
    }
  }

  // Grow to larger size
  ss_->Reserve(1 << 22);

  // Set time to 50 - this will expire elements with TTL <= 50
  ss_->set_time(50);

  // Shrink
  ss_->Shrink(1 << 21);
  EXPECT_EQ(ss_->BucketCount(), 1u << 21);

  // Verify expired elements are gone
  for (const auto& str : expired_strs) {
    EXPECT_EQ(ss_->Find(str), ss_->end()) << "Should be expired: " << str;
  }

  // Verify surviving TTL elements are still accessible with correct TTL
  for (const auto& str : surviving_strs) {
    auto it = ss_->Find(str);
    ASSERT_NE(it, ss_->end()) << "Missing surviving TTL element: " << str;
    EXPECT_TRUE(it.HasExpiry());
    EXPECT_GT(it.ExpiryTime(), 50u);
  }

  // Verify no-TTL elements are still accessible
  for (const auto& str : no_ttl_strs) {
    auto it = ss_->Find(str);
    ASSERT_NE(it, ss_->end()) << "Missing no-TTL element: " << str;
    EXPECT_FALSE(it.HasExpiry());
  }
}

TEST_F(OAHSetTest, ScanWithShrinkBetweenCalls) {
  // Test that cursor-based scanning works correctly when Grow and Shrink happen between Scan calls
  // This verifies SCAN guarantees: elements present at start and end of scan must be seen
  constexpr size_t num_strs = 1000000;
  vector<string> strs;
  unordered_set<string> must_see;

  // Add elements and track them
  for (size_t i = 0; i < num_strs; ++i) {
    strs.push_back(random_string(generator_, 10));
    EXPECT_TRUE(ss_->Add(strs.back()));
    must_see.insert(strs.back());
  }

  // Note initial bucket count (will be ~1M after adding 1M elements)
  size_t initial_bucket_count = ss_->BucketCount();

  unordered_set<string> seen;
  auto scan_callback = [&](const string_view str) { seen.emplace(str); };

  // Start scanning BEFORE Grow
  uint32_t cursor = ss_->Scan(0, scan_callback);
  EXPECT_NE(cursor, 0u) << "Should not finish in one iteration";

  // Grow to large size in the middle of scanning
  ss_->Reserve(1 << 22);
  EXPECT_EQ(ss_->BucketCount(), 1u << 22);
  EXPECT_GT(ss_->BucketCount(), initial_bucket_count);

  // Continue scanning a bit after Grow
  cursor = ss_->Scan(cursor, scan_callback);

  // Now Shrink in the middle of scanning - this is the key test
  // Elements that existed at scan start must still be visible
  ss_->Shrink(1 << 21);
  EXPECT_EQ(ss_->BucketCount(), 1u << 21);

  // Continue scanning with the same cursor
  constexpr int max_iterations = 1 << 22;
  int iterations = 0;
  while (cursor != 0 && iterations < max_iterations) {
    cursor = ss_->Scan(cursor, scan_callback);
    iterations++;
  }
  EXPECT_LT(iterations, max_iterations) << "Hit iteration limit";
  EXPECT_EQ(cursor, 0u) << "Scan should complete";

  // Verify all original elements were seen
  for (const auto& str : must_see) {
    ASSERT_TRUE(seen.count(str)) << "Missing element after shrink: " << str;
  }
  EXPECT_EQ(seen.size(), must_see.size()) << "Should see exactly all original elements";
}

}  // namespace dfly


================================================
FILE: src/core/overloaded.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
//

#pragma once

namespace dfly {
template <class... Ts> struct Overloaded : Ts... { using Ts::operator()...; };

template <class... Ts> Overloaded(Ts...) -> Overloaded<Ts...>;

}  // namespace dfly


================================================
FILE: src/core/page_usage/CMakeLists.txt
================================================
add_library(dfly_page_usage page_usage_stats.cc)
target_link_libraries(dfly_page_usage base TRDP::hdr_histogram redis_lib absl::strings)


================================================
FILE: src/core/page_usage/page_usage_stats.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/page_usage/page_usage_stats.h"

#include <absl/container/flat_hash_set.h>
#include <absl/strings/ascii.h>
#include <absl/strings/str_join.h>
#include <glog/logging.h>
#include <hdr/hdr_histogram.h>

#include <string>

#include "base/cycle_clock.h"

extern "C" {
#include <unistd.h>

#include "redis/zmalloc.h"
mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
                                                    bool collect_stats);
}

namespace dfly {

using absl::StrAppend;
using absl::StrFormat;
using absl::StripTrailingAsciiWhitespace;

namespace {
constexpr auto kUsageHistPoints = std::array{50, 90, 99};
constexpr auto kHistSignificantFigures = 3;

HllBufferPtr InitHllPtr() {
  HllBufferPtr p;
  p.size = getDenseHllSize();
  p.hll = new uint8_t[p.size];
  CHECK_EQ(0, createDenseHll(p));
  return p;
}

}  // namespace

CycleQuota::CycleQuota(const uint64_t quota_usec)
    : CycleQuota(base::CycleClock::FromUsec(quota_usec), true) {
}

void CycleQuota::Arm() {
  start_cycles_ = base::CycleClock::Now();
}

bool CycleQuota::Depleted() const {
  if (quota_cycles_ == kMaxQuota)
    return false;
  return UsedCycles() >= quota_cycles_;
}

uint64_t CycleQuota::UsedCycles() const {
  return base::CycleClock::Now() - start_cycles_;
}

CycleQuota CycleQuota::Unlimited() {
  return CycleQuota(kMaxQuota, true);
}

void CycleQuota::Extend(const uint64_t quota_usec) {
  if (quota_cycles_ == kMaxQuota)
    return;

  quota_cycles_ += base::CycleClock::FromUsec(quota_usec);
}

CycleQuota::CycleQuota(const uint64_t quota_cycles, bool /*tag*/) : quota_cycles_{quota_cycles} {
  Arm();
}

void CollectedPageStats::Merge(CollectedPageStats&& other, uint16_t shard_id) {
  this->pages_scanned += other.pages_scanned;
  this->pages_marked_for_realloc += other.pages_marked_for_realloc;
  this->pages_full += other.pages_full;
  this->pages_reserved_for_malloc += other.pages_reserved_for_malloc;
  this->pages_with_heap_mismatch += other.pages_with_heap_mismatch;
  this->pages_above_threshold += other.pages_above_threshold;
  this->objects_skipped_not_required += other.objects_skipped_not_required;
  this->objects_skipped_not_supported += other.objects_skipped_not_supported;
  shard_wide_summary.emplace(std::make_pair(shard_id, std::move(other.page_usage_hist)));
}

CollectedPageStats CollectedPageStats::Merge(std::vector<CollectedPageStats>&& stats,
                                             const float threshold) {
  CollectedPageStats result;
  result.threshold = threshold;

  size_t shard_index = 0;
  for (CollectedPageStats& stat : stats) {
    result.Merge(std::move(stat), shard_index++);
  }
  return result;
}

std::string CollectedPageStats::ToString() const {
  std::string response;
  StrAppend(&response, "Page usage threshold: ", threshold * 100, "\n");
  StrAppend(&response, "Pages scanned: ", pages_scanned, "\n");
  StrAppend(&response, "Pages marked for reallocation: ", pages_marked_for_realloc, "\n");
  StrAppend(&response, "Pages full: ", pages_full, "\n");
  StrAppend(&response, "Pages reserved for malloc: ", pages_reserved_for_malloc, "\n");
  StrAppend(&response, "Pages skipped due to heap mismatch: ", pages_with_heap_mismatch, "\n");
  StrAppend(&response, "Pages with usage above threshold: ", pages_above_threshold, "\n");
  StrAppend(&response,
            "Objects skipped (do not require defragmentation): ", objects_skipped_not_required,
            "\n");
  StrAppend(&response,
            "Objects skipped (do not support defragmentation): ", objects_skipped_not_supported,
            "\n");
  for (const auto& [shard_id, usage] : shard_wide_summary) {
    StrAppend(&response, "[Shard ", shard_id, "]\n");
    for (const auto& [percentage, count] : usage) {
      StrAppend(&response,
                StrFormat(" %d%% pages are below %d%% block usage\n", percentage, count));
    }
  }
  StripTrailingAsciiWhitespace(&response);
  return response;
}

PageUsage::UniquePages::UniquePages()
    : pages_scanned{InitHllPtr()},
      pages_marked_for_realloc{InitHllPtr()},
      pages_full{InitHllPtr()},
      pages_reserved_for_malloc{InitHllPtr()},
      pages_with_heap_mismatch{InitHllPtr()},
      pages_above_threshold{InitHllPtr()} {
  hdr_histogram* h = nullptr;
  const auto init_result = hdr_init(1, 100, kHistSignificantFigures, &h);
  CHECK_EQ(0, init_result) << "failed to initialize histogram";
  page_usage_hist = h;
}

PageUsage::UniquePages::~UniquePages() {
  delete[] pages_scanned.hll;
  delete[] pages_marked_for_realloc.hll;
  delete[] pages_full.hll;
  delete[] pages_reserved_for_malloc.hll;
  delete[] pages_with_heap_mismatch.hll;
  delete[] pages_above_threshold.hll;
  hdr_close(page_usage_hist);
}

void PageUsage::UniquePages::AddStat(mi_page_usage_stats_t stat) {  // NOLINT should not be const
  const auto data = reinterpret_cast<const unsigned char*>(&stat.page_address);

  auto record = [&data](HllBufferPtr ctr) { pfadd_dense(ctr, data, sizeof(stat.page_address)); };

  record(pages_scanned);

  if (stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) {
    record(pages_marked_for_realloc);
  }
  if (stat.flags & MI_DFLY_PAGE_FULL) {
    record(pages_full);
  }
  if (stat.flags & MI_DFLY_HEAP_MISMATCH) {
    record(pages_with_heap_mismatch);
  }
  if (stat.flags & MI_DFLY_PAGE_USED_FOR_MALLOC) {
    record(pages_reserved_for_malloc);
  }
  if (stat.flags == 0) {
    // No special flags means the page is above the threshold but not full - record usage for
    // histogram. This allows tuning the threshold for future commands.
    record(pages_above_threshold);
    hdr_record_value(page_usage_hist, 100.0 * stat.used / stat.capacity);
  }
}

CollectedPageStats PageUsage::UniquePages::CollectedStats() const {
  CollectedPageStats::ShardUsageSummary usage;
  for (const auto p : kUsageHistPoints) {
    usage[p] = hdr_value_at_percentile(page_usage_hist, p);
  }

  return CollectedPageStats{
      .pages_scanned = static_cast<uint64_t>(pfcountSingle(pages_scanned)),
      .pages_marked_for_realloc = static_cast<uint64_t>(pfcountSingle(pages_marked_for_realloc)),
      .pages_full = static_cast<uint64_t>(pfcountSingle(pages_full)),
      .pages_reserved_for_malloc = static_cast<uint64_t>(pfcountSingle(pages_reserved_for_malloc)),
      .pages_with_heap_mismatch = static_cast<uint64_t>(pfcountSingle(pages_with_heap_mismatch)),
      .pages_above_threshold = static_cast<uint64_t>(pfcountSingle(pages_above_threshold)),
      .objects_skipped_not_required = objects_skipped_not_required,
      .objects_skipped_not_supported = objects_skipped_not_supported,
      .page_usage_hist = std::move(usage),
      .shard_wide_summary = {}};
}

PageUsage::PageUsage(CollectPageStats collect_stats, float threshold, CycleQuota quota)
    : collect_stats_{collect_stats}, threshold_{threshold}, quota_{quota} {
}

void PageUsage::ArmQuotaTimer() {
  quota_.Arm();
}

uint64_t PageUsage::UsedQuotaCycles() const {
  return quota_.UsedCycles();
}

bool PageUsage::IsPageForObjectUnderUtilized(void* object) {
  mi_page_usage_stats_t stat;
  zmalloc_page_is_underutilized(object, threshold_, collect_stats_ == CollectPageStats::YES, &stat);
  return ConsumePageStats(stat);
}

bool PageUsage::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
  return ConsumePageStats(mi_heap_page_is_underutilized(heap, object, threshold_,
                                                        collect_stats_ == CollectPageStats::YES));
}

bool PageUsage::ConsumePageStats(mi_page_usage_stats_t stat) {
  const bool should_reallocate = stat.flags == MI_DFLY_PAGE_BELOW_THRESHOLD;
  if (collect_stats_ == CollectPageStats::YES) {
    unique_pages_.AddStat(stat);
  }
  return force_reallocate_ || should_reallocate;
}

bool PageUsage::QuotaDepleted() const {
  return quota_.Depleted();
}

void PageUsage::ExtendQuota(uint64_t quota_usec) {
  quota_.Extend(quota_usec);
}

}  // namespace dfly


================================================
FILE: src/core/page_usage/page_usage_stats.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/btree_map.h>

#define MI_BUILD_RELEASE 1
#include <mimalloc/types.h>

extern "C" {
#include "redis/hyperloglog.h"
}

struct hdr_histogram;

namespace dfly {

class CycleQuota {
 public:
  static constexpr uint64_t kMaxQuota = std::numeric_limits<uint64_t>::max();
  static constexpr uint64_t kDefaultDefragQuota = 150;

  explicit CycleQuota(uint64_t quota_usec);

  // Sets the starting point for the quota to be counted from. Can be called multiple times to reset
  // the quota counter.
  void Arm();

  bool Depleted() const;

  uint64_t UsedCycles() const;

  static CycleQuota Unlimited();

  // Extends the quota by the given amount. If any quota was already left over, it is also retained
  // on top of the newly added quota. For example, if 80 usec was left, and we extend by 50 usec,
  // the task now has 130 usec before the quota will be depleted.
  void Extend(uint64_t quota_usec);

 private:
  explicit CycleQuota(uint64_t quota_cycles, bool /*tag*/);

  uint64_t quota_cycles_;
  uint64_t start_cycles_{0};
};

enum class CollectPageStats : uint8_t { YES, NO };

struct CollectedPageStats {
  double threshold{0.0};
  uint64_t pages_scanned{0};
  uint64_t pages_marked_for_realloc{0};
  uint64_t pages_full{0};
  uint64_t pages_reserved_for_malloc{0};
  uint64_t pages_with_heap_mismatch{0};
  uint64_t pages_above_threshold{0};
  uint64_t objects_skipped_not_required{0};
  uint64_t objects_skipped_not_supported{0};

  using ShardUsageSummary = absl::btree_map<uint8_t, uint64_t>;
  ShardUsageSummary page_usage_hist;
  absl::btree_map<uint16_t, ShardUsageSummary> shard_wide_summary;

  void Merge(CollectedPageStats&& other, uint16_t shard_id);
  static CollectedPageStats Merge(std::vector<CollectedPageStats>&& stats, float threshold);

  std::string ToString() const;
};

class PageUsage {
 public:
  PageUsage(CollectPageStats collect_stats, float threshold,
            CycleQuota quota = CycleQuota::Unlimited());

  virtual ~PageUsage() = default;

  // Resets the quota timer to split defragmentation into different groups with separate quotas.
  // For example, first defragment objects with a quota and then defragment search indices with the
  // same quota independently.
  void ArmQuotaTimer();

  uint64_t UsedQuotaCycles() const;

  virtual bool IsPageForObjectUnderUtilized(void* object);

  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);

  CollectedPageStats CollectedStats() const {
    return unique_pages_.CollectedStats();
  }

  bool ConsumePageStats(mi_page_usage_stats_t stats);

  void RecordNotRequired() {
    unique_pages_.objects_skipped_not_required += 1;
  }

  void RecordNotSupported() {
    unique_pages_.objects_skipped_not_supported += 1;
  }

  void SetForceReallocate(bool force_reallocate) {
    force_reallocate_ = force_reallocate;
  }

  bool QuotaDepleted() const;

  void ExtendQuota(uint64_t quota_usec);

 private:
  CollectPageStats collect_stats_{CollectPageStats::NO};
  float threshold_;

  struct UniquePages {
    HllBufferPtr pages_scanned;
    HllBufferPtr pages_marked_for_realloc;
    HllBufferPtr pages_full;
    HllBufferPtr pages_reserved_for_malloc;
    HllBufferPtr pages_with_heap_mismatch;
    HllBufferPtr pages_above_threshold;
    hdr_histogram* page_usage_hist{};

    uint64_t objects_skipped_not_required{0};
    uint64_t objects_skipped_not_supported{0};

    explicit UniquePages();
    ~UniquePages();

    void AddStat(mi_page_usage_stats_t stat);
    CollectedPageStats CollectedStats() const;
  };

  UniquePages unique_pages_;

  CycleQuota quota_;

  // For use in testing, forces reallocate check to always return true
  bool force_reallocate_{false};
};

}  // namespace dfly


================================================
FILE: src/core/page_usage_stats_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/page_usage/page_usage_stats.h"

#include <absl/flags/reflection.h>
#include <gmock/gmock-matchers.h>

#include <random>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/qlist.h"
#include "core/score_map.h"
#include "core/search/block_list.h"
#include "core/search/search.h"
#include "core/small_string.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "redis/redis_aux.h"
#include "util/fibers/fibers.h"

extern "C" {
#include "redis/zmalloc.h"
}

ABSL_DECLARE_FLAG(bool, experimental_flat_json);

using namespace dfly;
using namespace std::chrono_literals;

namespace {

std::string GenerateTestJSON(size_t num_objects) {
  std::string data = R"({"contents":[)";
  for (size_t i = 0; i < num_objects; ++i) {
    const auto si = std::to_string(i);
    data += R"({"id":)" + si + R"(,"class":"v___)" + si + R"(","value":)" + si + R"(})";
    if (i < num_objects - 1) {
      data += ",";
    }
  }
  data += R"(], "data": "some", "count": 1, "checked": false})";
  return data;
}

// Helper to defragment only if a randomly generated value is less than preset probability. For
// benchmarking realistic situations, where some nodes are fragmented and others are not
class SelectiveDefragment : public PageUsage {
 public:
  explicit SelectiveDefragment(const double fragmentation_probability)
      : PageUsage(CollectPageStats::NO, 0), frag_prob_{fragmentation_probability} {
  }

  bool IsPageForObjectUnderUtilized(void*) override {
    return dist_(rng_) < frag_prob_;
  }

 private:
  double frag_prob_;
  std::mt19937 rng_{99};
  std::uniform_real_distribution<double> dist_{0.0, 1.0};
};

struct MemStats {
  size_t total_reserved{0};
  size_t total_committed{0};
  size_t total_used{0};
  size_t total_wasted{0};
  size_t num_pages{0};
};

MemStats LogMemStats(const mi_heap_t* heap) {
  MemStats stats;
  mi_heap_visit_blocks(
      heap, false,
      [](const mi_heap_t* /*h*/, const mi_heap_area_t* area, void* /*block*/, size_t block_size,
         void* arg) {
        const size_t committed = area->committed;
        const size_t used = area->used * block_size;

        const auto s = static_cast<MemStats*>(arg);
        s->num_pages++;
        s->total_committed += committed;
        s->total_reserved += area->reserved;
        s->total_used += used;
        s->total_wasted += committed - used;

        return true;
      },
      &stats);

  LOG(INFO) << "Pages: " << stats.num_pages;
  LOG(INFO) << "Reserved : " << stats.total_reserved << " bytes";
  LOG(INFO) << "Committed: " << stats.total_committed << " bytes";
  LOG(INFO) << "Used: " << stats.total_used << " bytes";
  LOG(INFO) << "Wasted: " << stats.total_wasted << " bytes";
  if (stats.total_committed) {
    LOG(INFO) << "Wasted%: "
              << static_cast<double>(stats.total_wasted) / stats.total_committed * 100.0;
    LOG(INFO) << "Utilization%: "
              << static_cast<double>(stats.total_used) / stats.total_committed * 100.0;
  }

  return stats;
}

}  // namespace

class PageUsageStatsTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    init_zmalloc_threadlocal(mi_heap_get_backing());
  }

  static void TearDownTestSuite() {
    mi_heap_collect(mi_heap_get_backing(), true);
    mi_heap_visit_blocks(
        mi_heap_get_backing(), false,
        [](auto*, auto* a, void*, size_t block_sz, void*) {
          LOG(ERROR) << "Unfreed allocations: block_size " << block_sz
                     << ", allocated: " << a->used * block_sz;
          return true;
        },
        nullptr);
  }

  PageUsageStatsTest() : m_(mi_heap_get_backing()) {
    InitTLStatelessAllocMR(&m_);
  }

  void SetUp() override {
    CompactObj::InitThreadLocal(&m_);

    score_map_ = std::make_unique<ScoreMap>();
    sorted_map_ = std::make_unique<detail::SortedMap>();
    string_set_ = std::make_unique<StringSet>();
    string_map_ = std::make_unique<StringMap>();
    SmallString::InitThreadLocal(m_.heap());
    qlist_ = std::make_unique<QList>(2, 2);
  }

  void TearDown() override {
    score_map_.reset();
    sorted_map_.reset();
    string_set_.reset();
    string_map_.reset();
    small_string_.Free();
    qlist_->Clear();
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
    c_obj_.Reset();
    CleanupStatelessAllocMR();
  }

  MiMemoryResource m_;
  std::unique_ptr<ScoreMap> score_map_;
  std::unique_ptr<detail::SortedMap> sorted_map_;
  std::unique_ptr<StringSet> string_set_;
  std::unique_ptr<StringMap> string_map_;
  SmallString small_string_{};
  std::unique_ptr<QList> qlist_;
  CompactValue c_obj_{};
};

TEST_F(PageUsageStatsTest, Defrag) {
  score_map_->AddOrUpdate("test", 0.1);
  sorted_map_->InsertNew(0.1, "x");
  string_set_->Add("a");
  string_map_->AddOrUpdate("key", "value");
  small_string_.Assign("small-string");

  // INT_TAG, defrag will be skipped
  c_obj_.SetString("1");

  qlist_->Push("xxxx", QList::HEAD);

  {
    PageUsage p{CollectPageStats::YES, 0.1};
    score_map_->begin().ReallocIfNeeded(&p);
    sorted_map_->DefragIfNeeded(&p);
    string_set_->begin().ReallocIfNeeded(&p);
    string_map_->begin().ReallocIfNeeded(&p);
    small_string_.DefragIfNeeded(&p);
    c_obj_.DefragIfNeeded(&p);
    qlist_->DefragIfNeeded(&p);

    const auto stats = p.CollectedStats();
    EXPECT_GT(stats.pages_scanned, 0);
    EXPECT_EQ(stats.objects_skipped_not_required, 1);
  }

  {
    PageUsage p{CollectPageStats::NO, 0.1};
    score_map_->begin().ReallocIfNeeded(&p);
    sorted_map_->DefragIfNeeded(&p);
    string_set_->begin().ReallocIfNeeded(&p);
    string_map_->begin().ReallocIfNeeded(&p);
    small_string_.DefragIfNeeded(&p);
    qlist_->DefragIfNeeded(&p);
    EXPECT_EQ(p.CollectedStats().pages_scanned, 0);
  }
}

TEST_F(PageUsageStatsTest, StatCollection) {
  constexpr auto threshold = 0.5;
  PageUsage p{CollectPageStats::YES, threshold};
  for (size_t i = 0; i < 10000; ++i) {
    p.ConsumePageStats({.page_address = uintptr_t{100000 + i},
                        .block_size = 1,
                        .capacity = 100,
                        .reserved = 100,
                        .used = 65,
                        .flags = 0});
  }

  for (size_t i = 0; i < 2000; ++i) {
    p.ConsumePageStats({.page_address = uintptr_t{200000 + i},
                        .block_size = 1,
                        .capacity = 100,
                        .reserved = 100,
                        .used = 85,
                        .flags = 0});
  }

  for (size_t i = 0; i < 1000; ++i) {
    p.ConsumePageStats({.page_address = uintptr_t{300000 + i},
                        .block_size = 1,
                        .capacity = 100,
                        .reserved = 100,
                        .used = 89,
                        .flags = 0});
  }

  constexpr auto page_count_per_flag = 150;

  auto start = 0;
  for (const uint8_t flag : {MI_DFLY_PAGE_FULL, MI_DFLY_PAGE_USED_FOR_MALLOC, MI_DFLY_HEAP_MISMATCH,
                             MI_DFLY_PAGE_BELOW_THRESHOLD}) {
    for (size_t i = 0; i < page_count_per_flag; ++i) {
      p.ConsumePageStats({.page_address = uintptr_t{start + i},
                          .block_size = 1,
                          .capacity = 100,
                          .reserved = 100,
                          .used = 100,
                          .flags = flag});
    }
    start += page_count_per_flag;
  }

  CollectedPageStats st;
  st.Merge(p.CollectedStats(), 1);

  EXPECT_GT(st.pages_scanned, 12000);

  // Expect a small error margin due to HLL
  EXPECT_NEAR(st.pages_full, page_count_per_flag, 5);
  EXPECT_NEAR(st.pages_reserved_for_malloc, page_count_per_flag, 5);
  EXPECT_NEAR(st.pages_marked_for_realloc, page_count_per_flag, 5);

  const auto usage = st.shard_wide_summary;

  EXPECT_EQ(usage.size(), 1);
  EXPECT_TRUE(usage.contains(1));

  const CollectedPageStats::ShardUsageSummary expected{{50, 65}, {90, 85}, {99, 89}};
  EXPECT_EQ(usage.at(1), expected);
}

TEST_F(PageUsageStatsTest, JSONCons) {
  // Because of the static encoding it is not possible to easily test the flat encoding. Once the
  // encoding flag is set, it is not re-read. If friend class is used to access the compact object
  // inner fields and call `DefragIfNeeded` directly on the flat variant of the union, the test will
  // still fail. This is because freeing the compact object code path takes the wrong branch based
  // on encoding. The flat encoding was tested manually adjusting this same test with changed
  // encoding.
  std::string data = GenerateTestJSON(1000);

  auto* mr = static_cast<MiMemoryResource*>(CompactObj::memory_resource());
  size_t before = mr->used();

  auto parsed = ParseJsonUsingShardHeap(data);
  EXPECT_TRUE(parsed.has_value());

  c_obj_.SetJson(std::move(parsed.value()));
  c_obj_.SetJsonSize(mr->used() - before);
  EXPECT_GT(c_obj_.MallocUsed(), 0);

  PageUsage p{CollectPageStats::YES, 0.1};
  p.SetForceReallocate(true);

  c_obj_.DefragIfNeeded(&p);
  EXPECT_GT(c_obj_.MallocUsed(), 0);

  const auto stats = p.CollectedStats();
  EXPECT_GT(stats.pages_scanned, 0);
  EXPECT_EQ(stats.objects_skipped_not_required, 0);

  EXPECT_EQ(c_obj_.ObjType(), OBJ_JSON);

  auto json_obj = c_obj_.GetJson();
  EXPECT_EQ(json_obj->at("data").as_string_view(), "some");
  EXPECT_EQ(json_obj->at("count").as_integer<uint8_t>(), 1);
  EXPECT_EQ(json_obj->at("checked").as_bool(), false);
}

TEST_F(PageUsageStatsTest, JsonDefragEmpty) {
  auto parsed = ParseJsonUsingShardHeap(R"({})");
  EXPECT_TRUE(parsed.has_value());

  PageUsage p{CollectPageStats::NO, 0};
  p.SetForceReallocate(true);

  Defragment(parsed.value(), &p);
  EXPECT_TRUE(parsed->empty());
}

TEST_F(PageUsageStatsTest, JsonDefragNested) {
  constexpr auto data = R"({"a":{"b":{"c":{"d":"value"}}}})";
  auto parsed = ParseJsonUsingShardHeap(data);
  EXPECT_TRUE(parsed.has_value());

  PageUsage p{CollectPageStats::NO, 0};
  p.SetForceReallocate(true);

  Defragment(parsed.value(), &p);
  EXPECT_EQ(parsed->at("a").at("b").at("c").at("d").as_string_view(), "value");
}

TEST_F(PageUsageStatsTest, JsonDefragRemainsInSameHeap) {
  // This is a brute force test that defragmentation does not erroneously move data to the default
  // heap. Comparing allocators before/after defragmentation is not useful as stateless allocators
  // are all equal. It might be possible to compare the allocator type, but this approach checks
  // that the pointers in a JSON object belong to the same heap as they did before defragmentation.

  const std::string data = R"({
    "data": {"sub-data": "attr1"},
    "values": [true, false, 1.11, 2],
    "secretkey": ")" + std::string(1024, '.') +
                           "\"}";

  auto json = ParseJsonUsingShardHeap(data);
  EXPECT_TRUE(json.has_value());

  auto key_before = json->at("secretkey").as_string_view();
  auto sub_before = json->at("data").at("sub-data").as_string_view();
  auto values_before = &*json->at("values").array_range().begin();

  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), key_before.data()));
  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), sub_before.data()));
  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), values_before));

  PageUsage p{CollectPageStats::NO, 0};
  p.SetForceReallocate(true);

  Defragment(json.value(), &p);

  auto key_after = json->at("secretkey").as_string_view();
  auto sub_after = json->at("data").at("sub-data").as_string_view();
  auto values_after = &*json->at("values").array_range().begin();

  // Data still managed by the same heap.
  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), key_after.data()));
  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), sub_after.data()));
  EXPECT_TRUE(mi_heap_contains_block(m_.heap(), values_after));

  // Defragment actually changed addresses
  EXPECT_NE(key_after.data(), key_before.data());
  EXPECT_NE(sub_after.data(), sub_before.data());
  EXPECT_NE(values_after, values_before);
}

TEST_F(PageUsageStatsTest, QuotaChecks) {
  {
    PageUsage p{CollectPageStats::NO, 0};
    EXPECT_FALSE(p.QuotaDepleted());
  }
  {
    PageUsage p{CollectPageStats::NO, 0, CycleQuota{4}};
    util::ThisFiber::SleepFor(5us);
    EXPECT_TRUE(p.QuotaDepleted());
  }
}

TEST_F(PageUsageStatsTest, BlockList) {
  search::BlockList<search::SortedVector<search::DocId>> bl{&m_, 20};
  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  // empty list
  auto result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);

  // single item will move twice, once for the blocklist and once for the sorted vector
  bl.Insert(1);
  result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 2);

  // quota depleted without defragmentation
  PageUsage p_zero{CollectPageStats::NO, 0.1, CycleQuota{0}};
  p_zero.SetForceReallocate(true);
  result = bl.Defragment(&p_zero);
  EXPECT_TRUE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, BlockListDefragmentResumes) {
  search::BlockList<search::SortedVector<search::DocId>> bl{&m_, 20};
  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  for (size_t i = 0; i < 1000; ++i) {
    bl.Insert(i);
  }

  PageUsage p_small_quota{CollectPageStats::NO, 0.1, CycleQuota{10}};
  p_small_quota.SetForceReallocate(true);
  util::ThisFiber::SleepFor(10us);
  auto result = bl.Defragment(&p_small_quota);
  EXPECT_TRUE(result.quota_depleted);
  EXPECT_GE(result.objects_moved, 0);

  result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_GT(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, BlockListWithPairs) {
  search::BlockList<search::SortedVector<std::pair<search::DocId, double>>> bl{&m_, 20};
  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  for (size_t i = 0; i < 100; ++i) {
    bl.Insert({i, i * 1.1});
  }

  const auto result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_GT(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, BlockListWithNonDefragmentableContainer) {
  search::BlockList<search::CompressedSortedSet> bl{&m_, 20};
  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  // empty list
  auto result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);

  // will reallocate once for the blocklist, the inner sorted set will be skipped
  bl.Insert(1);
  result = bl.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 1);
}

class MockDocument final : public search::DocumentAccessor {
 public:
  MockDocument() {
    words.reserve(1000);
    for (size_t i = 0; i < 1000; ++i) {
      words.push_back(absl::StrFormat("word-%d", i));
    }
  }

  std::optional<StringList> GetStrings(std::string_view active_field) const override {
    return {{words[absl::GetCurrentTimeNanos() % words.size()]}};
  }
  std::optional<VectorInfo> GetVector(std::string_view active_field, size_t dim) const override {
    return std::nullopt;
  }
  std::optional<NumsList> GetNumbers(std::string_view active_field) const override {
    return {{1, 2, 3, 4}};
  }
  std::optional<StringList> GetTags(std::string_view active_field) const override {
    return {{words[absl::GetCurrentTimeNanos() % words.size()]}};
  }

  std::vector<std::string> words;
};

TEST_F(PageUsageStatsTest, DefragmentTagIndex) {
  search::Schema schema;
  schema.fields["field_name"] =
      search::SchemaField{search::SchemaField::TAG, 0, "fn", search::SchemaField::TagParams{}};
  search::FieldIndices index{schema, {}, &m_, nullptr};

  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  // Empty index
  search::DefragmentResult result = index.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);

  const MockDocument md;
  index.Add(1, md);

  result = index.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  // single doc with single term returned by `GetTags` should result in two reallocations.
  EXPECT_EQ(result.objects_moved, 2);

  PageUsage p_zero{CollectPageStats::NO, 0.1, CycleQuota{0}};
  p_zero.SetForceReallocate(true);
  result = index.Defragment(&p_zero);
  EXPECT_TRUE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, TagIndexDefragResumeWithChanges) {
  search::Schema schema;
  schema.fields["field_name"] =
      search::SchemaField{search::SchemaField::TAG, 0, "fn", search::SchemaField::TagParams{}};
  search::FieldIndices index{schema, {}, &m_, nullptr};

  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  const MockDocument md;
  for (size_t i = 0; i < 100; ++i) {
    index.Add(i, md);
  }

  PageUsage p_small_quota{CollectPageStats::NO, 0.1, CycleQuota{10}};
  p_small_quota.SetForceReallocate(true);
  util::ThisFiber::SleepFor(10us);
  search::DefragmentResult result = index.Defragment(&p_small_quota);
  EXPECT_TRUE(result.quota_depleted);
  EXPECT_GE(result.objects_moved, 0);

  index.Remove(99, md);

  for (size_t i = 200; i < 300; ++i) {
    index.Add(i, md);
  }

  result = index.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_GT(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, DefragmentIndexWithNonDefragmentableFields) {
  search::Schema schema;
  schema.fields["text"] =
      search::SchemaField{search::SchemaField::TEXT, 0, "fn", search::SchemaField::TextParams{}};
  schema.fields["num"] = search::SchemaField{search::SchemaField::NUMERIC, 0, "fn",
                                             search::SchemaField::NumericParams{}};
  search::IndicesOptions options{{}};
  search::FieldIndices index{schema, options, &m_, nullptr};

  PageUsage p{CollectPageStats::NO, 0.1};
  p.SetForceReallocate(true);

  const MockDocument md;
  index.Add(1, md);

  // Unsupported index types will skip defragmenting themselves
  const search::DefragmentResult result = index.Defragment(&p);
  EXPECT_FALSE(result.quota_depleted);
  EXPECT_EQ(result.objects_moved, 0);
}

TEST_F(PageUsageStatsTest, DefragReducesWaste) {
  // This test works with actual defragmentation, by deleting every other json object which creates
  // holes in pages which cannot be directly freed. The test asserts that wasted memory goes down as
  // well as committed memory after defragmentation.

  std::vector<std::optional<JsonType>> all_objects;

  constexpr auto total_json = 100;
  all_objects.reserve(total_json);

  for (auto i = 0; i < total_json; ++i) {
    auto parsed = ParseJsonUsingShardHeap(GenerateTestJSON(500));
    EXPECT_TRUE(parsed.has_value());
    all_objects.emplace_back(std::move(parsed.value()));
  }

  // Delete every other object to create gaps, so that the pages are partially used.
  for (size_t i = 0; i < all_objects.size(); i += 2) {
    all_objects[i].reset();
  }

  // Allow mimalloc to free any completely empty pages, if any
  mi_heap_collect(m_.heap(), true);

  // Collects stats using mi_visit.. also logs, to see logs run the test with:
  // --vmodule=page_usage_stats_test=1 --logtostderr
  const auto before = LogMemStats(m_.heap());

  PageUsage p{CollectPageStats::NO, 0.8};
  for (auto& j : all_objects) {
    if (j.has_value()) {
      Defragment(j.value(), &p);
    }
  }

  mi_heap_collect(m_.heap(), true);
  const auto after = LogMemStats(m_.heap());

  EXPECT_LT(after.total_wasted, before.total_wasted);
  EXPECT_LT(after.total_committed, before.total_committed);
}

TEST_F(PageUsageStatsTest, MixedFlagHandling) {
  PageUsage p{CollectPageStats::YES, 0.0};
  auto add_pages = [&](size_t count, uintptr_t start_address, uint8_t flags) {
    for (const size_t i : std::views::iota(0UL, count)) {
      p.ConsumePageStats({.page_address = uintptr_t{start_address + i},
                          .block_size = 100,
                          .capacity = 1000,
                          .reserved = 100,
                          .used = 99,
                          .flags = flags});
    }
  };

  add_pages(2000, 10, MI_DFLY_PAGE_FULL | MI_DFLY_PAGE_USED_FOR_MALLOC | MI_DFLY_HEAP_MISMATCH);
  add_pages(500, 50000, MI_DFLY_PAGE_BELOW_THRESHOLD);

  const auto stats = p.CollectedStats();

  constexpr auto tolerance = 60;
  EXPECT_NEAR(stats.pages_full, 2000, tolerance);
  EXPECT_NEAR(stats.pages_reserved_for_malloc, 2000, tolerance);
  EXPECT_NEAR(stats.pages_with_heap_mismatch, 2000, tolerance);
  EXPECT_EQ(stats.pages_full, stats.pages_reserved_for_malloc);
  EXPECT_EQ(stats.pages_full, stats.pages_with_heap_mismatch);

  EXPECT_NEAR(stats.pages_marked_for_realloc, 500, 15);
}

namespace {

void InitBenchMemRes() {
  static bool initialized = false;
  if (!initialized) {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    static MiMemoryResource m{tlh};
    InitTLStatelessAllocMR(&m);
    CompactObj::InitThreadLocal(&m);
    initialized = true;
  }
}

}  // namespace

void BM_JSONDefragSelective(benchmark::State& state) {
  InitBenchMemRes();

  std::string json_data = GenerateTestJSON(state.range(0));

  for (auto _ : state) {
    state.PauseTiming();
    auto parsed = ParseJsonUsingShardHeap(json_data);
    DCHECK(parsed.has_value());
    SelectiveDefragment p{state.range(1) / 100.0};
    state.ResumeTiming();

    Defragment(parsed.value(), &p);

    benchmark::DoNotOptimize(parsed);
  }
}

BENCHMARK(BM_JSONDefragSelective)
    ->ArgNames({"objects_per_json", "fragmentation_probability"})
    ->Args({250, 0})
    ->Args({250, 30})
    ->Args({250, 70})
    ->Args({250, 100})
    ->Args({1000, 0})
    ->Args({1000, 30})
    ->Args({1000, 70})
    ->Args({1000, 100})
    ->Args({4000, 0})
    ->Args({4000, 30})
    ->Args({4000, 70})
    ->Args({4000, 100});


================================================
FILE: src/core/qlist.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/qlist.h"

extern "C" {
#include "redis/listpack.h"
#include "redis/lzfP.h"
#include "redis/zmalloc.h"
}

#include <absl/base/macros.h>
#include <absl/base/optimization.h>
#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>
#include <lz4frame.h>

#include "base/logging.h"
#include "core/page_usage/page_usage_stats.h"

using namespace std;

/* Maximum size in bytes of any multi-element listpack.
 * Larger values will live in their own isolated listpacks.
 * This is used only if we're limited by record count. when we're limited by
 * size, the maximum limit is bigger, but still safe.
 * 8k is a recommended / default size limit */
#define SIZE_SAFETY_LIMIT 8192

/* Maximum estimate of the listpack entry overhead.
 * Although in the worst case(sz < 64), we will waste 6 bytes in one
 * quicklistNode, but can avoid memory waste due to internal fragmentation
 * when the listpack exceeds the size limit by a few bytes (e.g. being 16388). */
#define SIZE_ESTIMATE_OVERHEAD 8

/* Minimum listpack size in bytes for attempting compression. */
#define MIN_COMPRESS_BYTES 256

/* Minimum size reduction in bytes to store compressed quicklistNode data.
 * This also prevents us from storing compression if the compression
 * resulted in a larger size than the original data. */
#define MIN_COMPRESS_IMPROVE 32

#define QL_NODE_IS_PLAIN(node) ((node)->container == QUICKLIST_NODE_CONTAINER_PLAIN)

namespace dfly {

namespace {

static_assert(sizeof(QList) == 48);
static_assert(sizeof(QList::Node) == 40);

enum IterDir : uint8_t { FWD = 1, REV = 0 };

/* This is for test suite development purposes only, 0 means disabled. */
size_t packed_threshold = 0;

/* Optimization levels for size-based filling.
 * Note that the largest possible limit is 64k, so even if each record takes
 * just one byte, it still won't overflow the 16 bit count field. */
const size_t kOptLevel[] = {4096, 8192, 16384, 32768, 65536};

/* Calculate the size limit of the quicklist node based on negative 'fill'. */
size_t NodeNegFillLimit(int fill) {
  DCHECK_LT(fill, 0);

  size_t offset = (-fill) - 1;
  constexpr size_t max_level = ABSL_ARRAYSIZE(kOptLevel);
  if (offset >= max_level)
    offset = max_level - 1;
  return kOptLevel[offset];
}

const uint8_t* uint_ptr(string_view sv) {
  static uint8_t empty = 0;
  return sv.empty() ? &empty : reinterpret_cast<const uint8_t*>(sv.data());
}

bool IsLargeElement(size_t sz, int fill) {
  if (ABSL_PREDICT_FALSE(packed_threshold != 0))
    return sz >= packed_threshold;
  if (fill >= 0)
    return sz > SIZE_SAFETY_LIMIT;
  else
    return sz > NodeNegFillLimit(fill);
}

/* Calculate the size limit or length limit of the quicklist node
 * based on 'fill', and is also used to limit list listpack. */
void quicklistNodeLimit(int fill, size_t* size, unsigned int* count) {
  *size = SIZE_MAX;
  *count = UINT_MAX;

  if (fill >= 0) {
    /* Ensure that one node have at least one entry */
    *count = (fill == 0) ? 1 : fill;
  } else {
    *size = NodeNegFillLimit(fill);
  }
}

#define sizeMeetsSafetyLimit(sz) ((sz) <= SIZE_SAFETY_LIMIT)

/* Check if the limit of the quicklist node has been reached to determine if
 * insertions, merges or other operations that would increase the size of
 * the node can be performed.
 * Return 1 if exceeds the limit, otherwise 0. */
int quicklistNodeExceedsLimit(int fill, size_t new_sz, unsigned int new_count) {
  size_t sz_limit;
  unsigned int count_limit;
  quicklistNodeLimit(fill, &sz_limit, &count_limit);

  if (ABSL_PREDICT_TRUE(sz_limit != SIZE_MAX)) {
    return new_sz > sz_limit;
  } else if (count_limit != UINT_MAX) {
    /* when we reach here we know that the limit is a size limit (which is
     * safe, see comments next to optimization_level and SIZE_SAFETY_LIMIT) */
    if (!sizeMeetsSafetyLimit(new_sz))
      return 1;
    return new_count > count_limit;
  }

  ABSL_UNREACHABLE();
}

bool NodeAllowInsert(const QList::Node* node, const int fill, const size_t sz) {
  if (ABSL_PREDICT_FALSE(!node))
    return false;

  if (ABSL_PREDICT_FALSE(QL_NODE_IS_PLAIN(node) || IsLargeElement(sz, fill)))
    return false;

  /* Estimate how many bytes will be added to the listpack by this one entry.
   * We prefer an overestimation, which would at worse lead to a few bytes
   * below the lowest limit of 4k (see optimization_level).
   * Note: No need to check for overflow below since both `node->sz` and
   * `sz` are to be less than 1GB after the plain/large element check above. */
  size_t new_sz = node->sz + sz + SIZE_ESTIMATE_OVERHEAD;
  return !quicklistNodeExceedsLimit(fill, new_sz, node->count + 1);
}

bool NodeAllowMerge(const QList::Node* a, const QList::Node* b, const int fill) {
  if (!a || !b)
    return false;

  if (ABSL_PREDICT_FALSE(QL_NODE_IS_PLAIN(a) || QL_NODE_IS_PLAIN(b)))
    return false;

  /* approximate merged listpack size (- 7 to remove one listpack
   * header/trailer, see LP_HDR_SIZE and LP_EOF) */
  unsigned int merge_sz = a->sz + b->sz - 7;

  // Allow merge if new node will not exceed the limit.
  return !quicklistNodeExceedsLimit(fill, merge_sz, a->count + b->count);
}

// the owner over entry is passed to the node.
QList::Node* CreateRAW(int container, uint8_t* entry, size_t sz) {
  QList::Node* node = (QList::Node*)zmalloc(sizeof(*node));
  node->entry = entry;
  node->count = 1;
  node->sz = sz;
  node->next = node->prev = NULL;
  node->encoding = QUICKLIST_NODE_ENCODING_RAW;
  node->container = container;
  node->recompress = 0;
  node->dont_compress = 0;
  node->offloaded = 0;

  return node;
}

uint8_t* LP_Insert(uint8_t* lp, string_view elem, uint8_t* pos, int lp_where) {
  DCHECK(pos);
  return lpInsertString(lp, uint_ptr(elem), elem.size(), pos, lp_where, NULL);
}

uint8_t* LP_Append(uint8_t* lp, string_view elem) {
  return lpAppend(lp, uint_ptr(elem), elem.size());
}

uint8_t* LP_Prepend(uint8_t* lp, string_view elem) {
  return lpPrepend(lp, uint_ptr(elem), elem.size());
}

QList::Node* CreateFromSV(int container, string_view value) {
  uint8_t* entry = nullptr;
  size_t sz = 0;
  if (container == QUICKLIST_NODE_CONTAINER_PLAIN) {
    DCHECK(!value.empty());
    sz = value.size();
    entry = (uint8_t*)zmalloc(sz);
    memcpy(entry, value.data(), sz);
  } else {
    entry = LP_Append(lpNew(0), value);
    sz = lpBytes(entry);
  }

  return CreateRAW(container, entry, sz);
}

// Returns the relative increase in size.
inline ssize_t NodeSetEntry(QList::Node* node, uint8_t* entry) {
  node->entry = entry;
  size_t new_sz = lpBytes(node->entry);
  ssize_t diff = new_sz - node->sz;
  node->sz = new_sz;
  return diff;
}

/* quicklistLZF is a 8+N byte struct holding 'sz' followed by 'compressed'.
 * 'sz' is byte length of 'compressed' field.
 * 'compressed' is LZF data with total (compressed) length 'sz'
 * NOTE: uncompressed length is stored in quicklistNode->sz.
 * When quicklistNode->entry is compressed, node->entry points to a quicklistLZF */
using quicklistLZF = struct quicklistLZF {
  size_t sz; /* LZF size in bytes*/
  char compressed[];
};

inline quicklistLZF* GetLzf(QList::Node* node) {
  DCHECK(node->encoding == QUICKLIST_NODE_ENCODING_LZF ||
         node->encoding == QLIST_NODE_ENCODING_LZ4);
  return (quicklistLZF*)node->entry;
}

bool CompressLZF(QList::Node* node) {
  // We allocate LZF_STATE on heap, piggy-backing on the existing allocation.
  char* uptr = (char*)zmalloc(sizeof(quicklistLZF) + node->sz + sizeof(LZF_STATE));
  quicklistLZF* lzf = (quicklistLZF*)uptr;
  LZF_HSLOT* sdata = (LZF_HSLOT*)(uptr + sizeof(quicklistLZF) + node->sz);

  /* Cancel if compression fails or doesn't compress small enough */
  if (((lzf->sz = lzf_compress(node->entry, node->sz, lzf->compressed, node->sz, sdata)) == 0) ||
      lzf->sz + MIN_COMPRESS_IMPROVE >= node->sz) {
    /* lzf_compress aborts/rejects compression if value not compressible. */
    DVLOG(2) << "Uncompressable " << node->sz << " vs " << lzf->sz;
    zfree(lzf);
    QList::stats.bad_compression_attempts++;
    return false;
  }
  DVLOG(2) << "Compressed " << node->sz << " to " << lzf->sz;
  QList::stats.compressed_bytes += lzf->sz;
  QList::stats.raw_compressed_bytes += node->sz;

  lzf = (quicklistLZF*)zrealloc(lzf, sizeof(*lzf) + lzf->sz);
  zfree(node->entry);
  node->entry = (unsigned char*)lzf;
  node->encoding = QUICKLIST_NODE_ENCODING_LZF;
  return true;
}

bool CompressLZ4(QList::Node* node) {
  LZ4F_cctx* cntx;
  LZ4F_errorCode_t code = LZ4F_createCompressionContext(&cntx, LZ4F_VERSION);
  CHECK(!LZ4F_isError(code));

  LZ4F_preferences_t lz4_pref = LZ4F_INIT_PREFERENCES;
  lz4_pref.compressionLevel = -1;
  lz4_pref.frameInfo.contentSize = node->sz;
  size_t buf_size = LZ4F_compressFrameBound(node->sz, &lz4_pref);

  // We reuse quicklistLZF struct for LZ4 metadata.
  quicklistLZF* dest = (quicklistLZF*)zmalloc(sizeof(quicklistLZF) + buf_size);
  size_t compr_sz = LZ4F_compressFrame_usingCDict(cntx, dest->compressed, buf_size, node->entry,
                                                  node->sz, nullptr /* dict */, &lz4_pref);
  CHECK(!LZ4F_isError(compr_sz));

  code = LZ4F_freeCompressionContext(cntx);
  CHECK(!LZ4F_isError(code));

  if (compr_sz + MIN_COMPRESS_IMPROVE >= node->sz) {
    QList::stats.bad_compression_attempts++;
    zfree(dest);
    return false;
  }

  dest->sz = compr_sz;
  dest = (quicklistLZF*)zrealloc(dest, sizeof(quicklistLZF) + compr_sz);
  QList::stats.compressed_bytes += compr_sz;
  QList::stats.raw_compressed_bytes += node->sz;

  zfree(node->entry);
  node->entry = (unsigned char*)dest;
  node->encoding = QLIST_NODE_ENCODING_LZ4;
  return true;
}

/* Compress the listpack in 'node' and update encoding details.
 * Returns true if listpack compressed successfully.
 * Returns false if compression failed or if listpack too small to compress. */
bool CompressRaw(QList::Node* node, unsigned method) {
  DCHECK(node->encoding == QUICKLIST_NODE_ENCODING_RAW);
  DCHECK(!node->dont_compress);

  /* validate that the node is neither
   * tail nor head (it has prev and next)*/
  DCHECK(node->prev && node->next);

  node->recompress = 0;
  /* Don't bother compressing small values */
  if (node->sz < MIN_COMPRESS_BYTES)
    return false;

  QList::stats.compression_attempts++;
  if (method == static_cast<unsigned>(QList::LZF)) {
    return CompressLZF(node);
  }

  return CompressLZ4(node);
}

ssize_t TryCompress(QList::Node* node, unsigned method) {
  DCHECK(node);
  if (node->encoding == QUICKLIST_NODE_ENCODING_RAW) {
    node->attempted_compress = 1;
    if (!node->dont_compress) {
      if (CompressRaw(node, method))
        return ssize_t(GetLzf(node)->sz) - node->sz;
    }
  }
  return 0;
}

/* Uncompress the listpack in 'node' and update encoding details.
 * Returns 1 on successful decode, 0 on failure to decode. */
bool DecompressRaw(bool recompress, QList::Node* node) {
  DCHECK(node->encoding == QUICKLIST_NODE_ENCODING_LZF ||
         node->encoding == QLIST_NODE_ENCODING_LZ4);

  node->recompress = int(recompress);

  void* decompressed = zmalloc(node->sz);
  quicklistLZF* lzf = GetLzf(node);
  QList::stats.decompression_calls++;
  QList::stats.compressed_bytes -= lzf->sz;
  QList::stats.raw_compressed_bytes -= node->sz;

  if (node->encoding == QLIST_NODE_ENCODING_LZ4) {
    LZ4F_dctx* dctx = nullptr;
    LZ4F_errorCode_t code = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
    CHECK(!LZ4F_isError(code));
    size_t decompressed_sz = node->sz;
    size_t left =
        LZ4F_decompress(dctx, decompressed, &decompressed_sz, lzf->compressed, &lzf->sz, nullptr);
    CHECK_EQ(left, 0u);
    CHECK_EQ(decompressed_sz, node->sz);
    LZ4F_freeDecompressionContext(dctx);
  } else {
    if (lzf_decompress(lzf->compressed, lzf->sz, decompressed, node->sz) == 0) {
      LOG(DFATAL) << "Invalid LZF compressed data";
      /* Someone requested decompress, but we can't decompress.  Not good. */
      zfree(decompressed);
      return false;
    }
  }
  zfree(lzf);
  node->entry = (uint8_t*)decompressed;
  node->encoding = QUICKLIST_NODE_ENCODING_RAW;
  return true;
}

/* Decompress only compressed nodes.
   recompress: if true, the node will be marked for recompression after decompression.
   returns by how much the size of the node has increased.
*/
ssize_t TryDecompressInternal(bool recompress, QList::Node* node) {
  if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) {
    size_t compressed_sz = GetLzf(node)->sz;
    if (DecompressRaw(recompress, node)) {
      return node->sz - compressed_sz;
    }
  }
  return 0;
}

ssize_t RecompressOnly(QList::Node* node, unsigned method) {
  if (node->recompress && !node->dont_compress) {
    if (CompressRaw(node, method))
      return (GetLzf(node))->sz - node->sz;
  }
  return 0;
}

// If after is true, returns a new node with elements in [offset, inf), otherwise
// returns [0, offset-1].
QList::Node* SplitNode(QList::Node* node, int offset, bool after, ssize_t* diff) {
  DCHECK(node->container == QUICKLIST_NODE_CONTAINER_PACKED);
  size_t zl_sz = node->sz;
  uint8_t* entry = (uint8_t*)zmalloc(zl_sz);

  memcpy(entry, node->entry, zl_sz);

  /* Need positive offset for calculating extent below. */
  if (offset < 0)
    offset = node->count + offset;

  /* Ranges to be trimmed: -1 here means "continue deleting until the list ends" */
  int orig_start = after ? offset + 1 : 0;
  int orig_extent = after ? -1 : offset;
  int new_start = after ? 0 : offset;
  int new_extent = after ? offset + 1 : -1;

  ssize_t diff_existing = NodeSetEntry(node, lpDeleteRange(node->entry, orig_start, orig_extent));
  node->count = lpLength(node->entry);

  entry = lpDeleteRange(entry, new_start, new_extent);
  QList::Node* new_node = CreateRAW(QUICKLIST_NODE_CONTAINER_PACKED, entry, lpBytes(entry));
  new_node->count = lpLength(new_node->entry);
  *diff = diff_existing;

  return new_node;
}

}  // namespace

__thread QList::Stats QList::stats;

QList::Stats& QList::Stats::operator+=(const Stats& other) {
#define ADD_FIELD(field) this->field += other.field;

  ADD_FIELD(compression_attempts);
  ADD_FIELD(bad_compression_attempts);
  ADD_FIELD(decompression_calls);
  ADD_FIELD(compressed_bytes);
  ADD_FIELD(raw_compressed_bytes);
  ADD_FIELD(interior_node_reads);
  ADD_FIELD(total_node_reads);
  ADD_FIELD(offload_requests);
  ADD_FIELD(onload_requests);

#undef ADD_FIELD

  return *this;
}

size_t QList::Node::GetLZF(void** data) const {
  DCHECK(encoding == QUICKLIST_NODE_ENCODING_LZF || encoding == QLIST_NODE_ENCODING_LZ4);
  quicklistLZF* lzf = (quicklistLZF*)entry;
  *data = lzf->compressed;
  return lzf->sz;
}

void QList::SetPackedThreshold(unsigned threshold) {
  packed_threshold = threshold;
}

size_t QList::DefragIfNeeded(PageUsage* page_usage) {
  size_t reallocated = 0;

  for (Node* curr = head_; curr; curr = curr->next) {
    if (!page_usage->IsPageForObjectUnderUtilized(curr->entry)) {
      continue;
    }

    // Data pointed to by the nodes is reallocated. The nodes themselves are not reallocated because
    // of their constant (and relatively small, ~40 bytes per object) size. Defragmentation fixes
    // fragmented memory allocation, which usually happens when variable-sized blocks of data are
    // allocated and deallocated, which is not expected with nodes.
    uint8_t* new_entry = static_cast<uint8_t*>(zmalloc(curr->sz));
    memcpy(new_entry, curr->entry, curr->sz);

    uint8_t* old_entry = curr->entry;
    curr->entry = new_entry;

    zfree(old_entry);
    ++reallocated;
  }
  return reallocated;
}

void QList::SetTieringParams(const TieringParams& params) {
  tiering_params_ = make_unique<TieringParams>(params);
}

QList::QList(int fill, int compress) : fill_(fill), compress_(compress), bookmark_count_(0) {
  compr_method_ = 0;
}

QList::QList(QList&& other) noexcept
    : head_(other.head_),
      count_(other.count_),
      len_(other.len_),
      fill_(other.fill_),
      compress_(other.compress_),
      bookmark_count_(other.bookmark_count_) {
  other.head_ = nullptr;
  other.len_ = other.count_ = 0;
}

QList::~QList() {
  Clear();
}

QList& QList::operator=(QList&& other) noexcept {
  if (this != &other) {
    Clear();
    head_ = other.head_;
    len_ = other.len_;
    count_ = other.count_;
    fill_ = other.fill_;
    compress_ = other.compress_;
    bookmark_count_ = other.bookmark_count_;
    tiering_params_ = std::move(other.tiering_params_);
    num_offloaded_nodes_ = other.num_offloaded_nodes_;
    other.head_ = nullptr;
    other.len_ = other.count_ = other.num_offloaded_nodes_ = 0;
  }
  return *this;
}

void QList::Clear() noexcept {
  Node* current = head_;

  while (len_) {
    Node* next = current->next;
    if (current->encoding != QUICKLIST_NODE_ENCODING_RAW) {
      quicklistLZF* lzf = (quicklistLZF*)current->entry;
      stats.compressed_bytes -= lzf->sz;
      stats.raw_compressed_bytes -= current->sz;
    }
    zfree(current->entry);
    zfree(current);

    len_--;
    current = next;
  }
  head_ = nullptr;
  count_ = 0;
  malloc_size_ = 0;
  num_offloaded_nodes_ = 0;
}

void QList::Push(string_view value, Where where) {
  DVLOG(3) << "Push " << absl::CHexEscape(value) << " " << (where == HEAD ? "HEAD" : "TAIL");

  /* The head and tail should never be compressed (we don't attempt to decompress them) */
  if (head_) {
    DCHECK(head_->encoding != QUICKLIST_NODE_ENCODING_LZF);
    DCHECK(head_->prev->encoding != QUICKLIST_NODE_ENCODING_LZF);
  }

  Node* orig = head_;
  uint32_t orig_id = 0;
  if (where == TAIL && orig) {
    orig = orig->prev;
    orig_id = len_ - 1;
  }

  InsertOpt opt = where == HEAD ? BEFORE : AFTER;

  size_t sz = value.size();
  if (ABSL_PREDICT_FALSE(IsLargeElement(sz, fill_))) {
    InsertPlainNode(orig, value, orig_id, opt);
    return;
  }

  count_++;

  if (ABSL_PREDICT_TRUE(NodeAllowInsert(orig, fill_, sz))) {
    auto func = (where == HEAD) ? LP_Prepend : LP_Append;
    malloc_size_ += NodeSetEntry(orig, func(orig->entry, value));
    orig->count++;
    if (len_ == 1) {  // sanity check
      DCHECK_EQ(malloc_size_, orig->sz);
    }
    DCHECK(head_->prev->next == nullptr);
    return;
  }

  Node* node = CreateFromSV(QUICKLIST_NODE_CONTAINER_PACKED, value);
  InsertNode(orig, node, orig_id, opt);
  DCHECK(head_->prev->next == nullptr);
}

string QList::Pop(Where where) {
  DCHECK_GT(count_, 0u);
  Node* node = head_;
  if (where == TAIL) {
    node = head_->prev;
  }

  /* The head and tail should never be compressed */
  DCHECK(node->encoding != QUICKLIST_NODE_ENCODING_LZF);
  DCHECK(head_->prev->next == nullptr);

  string res;
  if (ABSL_PREDICT_FALSE(QL_NODE_IS_PLAIN(node))) {
    // TODO: We could avoid this copy by returning the pointer of the plain node.
    // But the higher level APIs should support this.
    res.assign(reinterpret_cast<char*>(node->entry), node->sz);
    DelNode(node);
  } else {
    uint8_t* pos = where == HEAD ? lpFirst(node->entry) : lpLast(node->entry);
    unsigned int vlen;
    long long vlong;
    uint8_t* vstr = lpGetValue(pos, &vlen, &vlong);
    if (vstr) {
      res.assign(reinterpret_cast<char*>(vstr), vlen);
    } else {
      res = absl::StrCat(vlong);
    }
    DelPackedIndex(node, pos);
  }
  DCHECK(head_ == nullptr || head_->prev->next == nullptr);
  return res;
}

void QList::AppendListpack(unsigned char* zl) {
  Node* node = CreateRAW(QUICKLIST_NODE_CONTAINER_PACKED, zl, lpBytes(zl));
  node->count = lpLength(node->entry);

  InsertNode(_Tail(), node, len_ ? len_ - 1 : 0, AFTER);
  count_ += node->count;
}

void QList::AppendPlain(unsigned char* data, size_t sz) {
  Node* node = CreateRAW(QUICKLIST_NODE_CONTAINER_PLAIN, data, sz);
  InsertNode(_Tail(), node, len_ ? len_ - 1 : 0, AFTER);
  ++count_;
}

bool QList::Insert(std::string_view pivot, std::string_view elem, InsertOpt opt) {
  Iterator it = GetIterator(HEAD);

  if (it.Valid()) {
    do {
      if (it.Get() == pivot) {
        Insert(it, elem, opt);
        return true;
      }
    } while (it.Next());
  }

  return false;
}

bool QList::Replace(long index, std::string_view elem) {
  Iterator it = GetIterator(index);
  if (it.Valid()) {
    Replace(it, elem);
    return true;
  }
  return false;
}

size_t QList::MallocUsed(bool slow) const {
  size_t node_size = len_ * sizeof(Node) + znallocx(sizeof(QList));
  if (slow) {
    for (Node* node = head_; node; node = node->next) {
      node_size += zmalloc_usable_size(node->entry);
    }
    return node_size;
  }

  return node_size + malloc_size_;
}

void QList::Iterate(IterateFunc cb, long start, long end) const {
  long llen = Size();
  if (llen == 0)
    return;

  if (end < 0 || end >= long(Size()))
    end = Size() - 1;
  Iterator it = GetIterator(start);
  if (it.Valid()) {
    do {
      if (start > end || !cb(it.Get()))
        break;
      start++;
    } while (it.Next());
  }
}

auto QList::InsertPlainNode(Node* old_node, string_view value, uint32_t old_node_id,
                            InsertOpt insert_opt) -> Node* {
  Node* new_node = CreateFromSV(QUICKLIST_NODE_CONTAINER_PLAIN, value);
  InsertNode(old_node, new_node, old_node_id, insert_opt);
  count_++;
  return new_node;
}

void QList::InsertNode(Node* old_node, Node* new_node, uint32_t old_node_id, InsertOpt insert_opt) {
  if (insert_opt == AFTER) {
    new_node->prev = old_node;
    if (old_node) {
      new_node->next = old_node->next;
      if (old_node->next)
        old_node->next->prev = new_node;
      old_node->next = new_node;
      if (head_->prev == old_node)  // if old_node is tail, update the tail to the new node.
        head_->prev = new_node;
    }
  } else {  // BEFORE
    new_node->next = old_node;
    if (old_node) {
      new_node->prev = old_node->prev;
      // if old_node is not head, link its prev to the new node.
      // head->prev is tail, so we don't need to update it.
      if (old_node != head_)
        old_node->prev->next = new_node;
      old_node->prev = new_node;
    }
    if (head_ == old_node)
      head_ = new_node;
  }

  /* If this insert creates the only element so far, initialize head/tail. */
  if (len_ == 0) {
    head_ = new_node;
    head_->prev = new_node;
  }

  /* Update len first, so in Compress we know exactly len */
  len_++;
  malloc_size_ += new_node->sz;

  // Calculate final positions AFTER all linkage and len_ updates are complete.
  uint32_t new_node_id;
  if (insert_opt == AFTER && old_node) {
    new_node_id = old_node_id + 1;  // new_node inserted after, old_node position unchanged
  } else {
    new_node_id = old_node_id;  // new_node takes old_node's position
    old_node_id++;              // old_node shifts one position forward
  }

  if (old_node)
    CoolOff(old_node, old_node_id);

  CoolOff(new_node, new_node_id);
}

void QList::Insert(Iterator it, std::string_view elem, InsertOpt insert_opt) {
  DCHECK(it.current_);
  DCHECK(it.zi_);

  int full = 0, at_tail = 0, at_head = 0, avail_next = 0, avail_prev = 0;
  Node* node = it.current_;
  size_t sz = elem.size();
  bool after = insert_opt == AFTER;

  /* Populate accounting flags for easier boolean checks later */
  if (!NodeAllowInsert(node, fill_, sz)) {
    full = 1;
  }

  if (after && (it.offset_ == node->count - 1 || it.offset_ == -1)) {
    at_tail = 1;
    if (NodeAllowInsert(node->next, fill_, sz)) {
      avail_next = 1;
    }
  }

  if (!after && (it.offset_ == 0 || it.offset_ == -(node->count))) {
    at_head = 1;
    if (NodeAllowInsert(node->prev, fill_, sz)) {
      avail_prev = 1;
    }
  }
  uint32_t node_id = it.node_id_;
  if (ABSL_PREDICT_FALSE(IsLargeElement(sz, fill_))) {
    if (QL_NODE_IS_PLAIN(node) || (at_tail && after) || (at_head && !after)) {
      InsertPlainNode(node, elem, node_id, insert_opt);
    } else {
      AccessForReads(true, node);
      ssize_t diff_existing = 0;
      // if after == true, the order will be node, entry_node, new_node
      // otherwise: new_node, entry_node, node.
      Node* new_node = SplitNode(node, it.offset_, after, &diff_existing);
      Node* entry_node = InsertPlainNode(node, elem, node_id, insert_opt);
      uint32_t entry_node_id = after ? node_id + 1 : node_id;
      InsertNode(entry_node, new_node, entry_node_id, insert_opt);
      malloc_size_ += diff_existing;
    }
    return;
  }

  /* Now determine where and how to insert the new element */
  if (!full) {
    AccessForReads(true, node);
    uint8_t* new_entry = LP_Insert(node->entry, elem, it.zi_, after ? LP_AFTER : LP_BEFORE);
    malloc_size_ += NodeSetEntry(node, new_entry);
    node->count++;
    malloc_size_ += RecompressOnly(node, compr_method_);
  } else {
    bool insert_tail = at_tail && after;
    bool insert_head = at_head && !after;
    if (insert_tail && avail_next) {
      /* If we are: at tail, next has free space, and inserting after:
       *   - insert entry at head of next node. */
      auto* new_node = node->next;
      AccessForReads(true, new_node);
      malloc_size_ += NodeSetEntry(new_node, LP_Prepend(new_node->entry, elem));
      new_node->count++;
      malloc_size_ += RecompressOnly(new_node, compr_method_);
      malloc_size_ += RecompressOnly(node, compr_method_);
    } else if (insert_head && avail_prev) {
      /* If we are: at head, previous has free space, and inserting before:
       *   - insert entry at tail of previous node. */
      auto* new_node = node->prev;
      AccessForReads(true, new_node);
      malloc_size_ += NodeSetEntry(new_node, LP_Append(new_node->entry, elem));
      new_node->count++;
      malloc_size_ += RecompressOnly(new_node, compr_method_);
      malloc_size_ += RecompressOnly(node, compr_method_);
    } else if (insert_tail || insert_head) {
      /* If we are: full, and our prev/next has no available space, then:
       *   - create new node and attach to qlist */
      auto* new_node = CreateFromSV(QUICKLIST_NODE_CONTAINER_PACKED, elem);
      InsertNode(node, new_node, node_id, insert_opt);
    } else {
      /* else, node is full we need to split it. */
      /* covers both after and !after cases */
      AccessForReads(true, node);
      ssize_t diff_existing = 0;
      auto* new_node = SplitNode(node, it.offset_, after, &diff_existing);
      auto func = after ? LP_Prepend : LP_Append;
      malloc_size_ += NodeSetEntry(new_node, func(new_node->entry, elem));
      new_node->count++;
      InsertNode(node, new_node, node_id, insert_opt);
      MergeNodes(node);
      malloc_size_ += diff_existing;
    }
  }
  count_++;
}

void QList::Replace(Iterator it, std::string_view elem) {
  Node* node = it.current_;
  uint8_t* newentry = nullptr;
  size_t sz = elem.size();
  uint32_t node_id = it.node_id_;
  if (ABSL_PREDICT_TRUE(!QL_NODE_IS_PLAIN(node) && !IsLargeElement(sz, fill_) &&
                        (newentry = lpReplace(node->entry, &it.zi_, uint_ptr(elem), sz)) != NULL)) {
    malloc_size_ += NodeSetEntry(node, newentry);
    CoolOff(node, node_id);
  } else if (QL_NODE_IS_PLAIN(node)) {
    if (IsLargeElement(sz, fill_)) {
      zfree(node->entry);
      uint8_t* new_entry = (uint8_t*)zmalloc(sz);
      memcpy(new_entry, elem.data(), sz);
      malloc_size_ += NodeSetEntry(node, new_entry);
      CoolOff(node, node_id);
    } else {
      Insert(it, elem, AFTER);
      DelNode(node);
    }
  } else { /* The node is full or data is a large element */
    Node *split_node = NULL, *new_node;
    node->dont_compress = 1; /* Prevent compression in InsertNode() */

    /* If the entry is not at the tail, split the node at the entry's offset. */
    if (it.offset_ != node->count - 1 && it.offset_ != -1) {
      ssize_t diff_existing = 0;
      split_node = SplitNode(node, it.offset_, 1, &diff_existing);
      malloc_size_ += diff_existing;
    }

    /* Create a new node and insert it after the original node.
     * If the original node was split, insert the split node after the new node. */
    new_node = CreateFromSV(IsLargeElement(sz, fill_) ? QUICKLIST_NODE_CONTAINER_PLAIN
                                                      : QUICKLIST_NODE_CONTAINER_PACKED,
                            elem);
    // The order is: node, new_node, split_node.
    InsertNode(node, new_node, node_id, AFTER);
    if (split_node)
      InsertNode(new_node, split_node, node_id + 1, AFTER);
    count_++;

    /* Delete the replaced element. */
    if (node->count == 1) {
      DelNode(node);
    } else {
      unsigned char* p = lpSeek(node->entry, -1);
      DelPackedIndex(node, p);
      node->dont_compress = 0; /* Re-enable compression */
      new_node = MergeNodes(new_node);

      /* We can't know if the current node and its sibling nodes are correctly compressed,
       * and we don't know if they are within the range of compress depth, so we need to
       * use UpdateCompression() for compression, which checks if node is within compress
       * depth before compressing. */
      // TODO: node_id might be off after merges.
      CoolOff(new_node, node_id + 1);
      CoolOff(new_node->prev, node_id);
      if (new_node->next)
        CoolOff(new_node->next, node_id + 2);
    }
  }
}

void QList::CoolOff(Node* node, uint32_t node_id) {
  if (tiering_params_) {
    // Dry run for offloading decision.
    // a. Node id is withing the offloadable depth - offload it if not already offloaded.
    // b. Node id is outside the offloadable depth - but we have too many nodes that are not
    //    offloaded - take the O(n) route to traverse and offload them. The reason for having such
    //    nodes is because (a) handles node that we touch during operations.
    //    if for example we just perform lpush, then we won't touch any interior nodes, and they
    //    will never get offloaded. The good news is that once interior nodes are offloaded,
    //    we won't need to traverse them again for "trivial" access patterns unless they
    //    get accessed again. Another reason for missing offloaded nodes is that node_id can be
    //    off due to merges (can be improved in future).
    if (node_id >= tiering_params_->node_depth_threshold &&
        node_id + tiering_params_->node_depth_threshold < len_) {
      if (!node->offloaded) {
        OffloadNode(node);
      }
    } else if (num_offloaded_nodes_ * 2 + tiering_params_->node_depth_threshold * 2 < len_) {
      // We check `num_offloaded_nodes_ * 2` above to avoid frequent traversals.
      // So only when the gap between offloaded and non-offloaded nodes is large enough,
      // we do a traversal to offload more nodes.
      auto* fw = head_;
      auto* rev = head_->prev;
      uint32_t traverse_node_id = 0;

      // Traverse from both ends towards the middle as we expect more offloads towards the ends
      // due to usual access patterns of adding items via lpush/rpush.
      while (traverse_node_id <= len_ / 2 &&
             (num_offloaded_nodes_ + 2 * tiering_params_->node_depth_threshold) < len_) {
        if (traverse_node_id >= tiering_params_->node_depth_threshold) {
          if (fw->offloaded == 0) {
            OffloadNode(fw);
          }

          // Avoid offloading the same node twice when fw and rev meet in the middle.
          if (rev != fw && rev->offloaded == 0) {
            OffloadNode(rev);
          }
        }
        fw = fw->next;
        rev = rev->prev;
        traverse_node_id++;
      }
    }
  }

  /* Force 'quicklist' to meet compression guidelines set by compress depth.
   * The only way to guarantee interior nodes get compressed is to iterate
   * to our "interior" compress depth then compress the next node we find.
   * If compress depth is larger than the entire list, we return immediately. */

  if (node->recompress)
    CompressRaw(node, this->compr_method_);
  else
    this->CompressByDepth(node);
}

void QList::CompressByDepth(Node* node) {
  if (len_ == 0)
    return;

  /* The head and tail should never be compressed (we should not attempt to recompress them) */
  DCHECK(head_->recompress == 0 && head_->prev->recompress == 0);

  /* If length is less than our compress depth (from both sides),
   * we can't compress anything. */
  if (!AllowCompression() || len_ < (unsigned int)(compress_ * 2))
    return;

  /* Iterate until we reach compress depth for both sides of the list.a
   * Note: because we do length checks at the *top* of this function,
   *       we can skip explicit null checks below. Everything exists. */
  Node* forward = head_;
  Node* reverse = head_->prev;
  int depth = 0;
  int in_depth = 0;
  while (depth++ < compress_) {
    malloc_size_ += TryDecompressInternal(false, forward);
    malloc_size_ += TryDecompressInternal(false, reverse);

    if (forward == node || reverse == node)
      in_depth = 1;

    /* We passed into compress depth of opposite side of the quicklist
     * so there's no need to compress anything and we can exit. */
    if (forward == reverse || forward->next == reverse)
      return;

    forward = forward->next;
    reverse = reverse->prev;
  }

  if (!in_depth && node) {
    malloc_size_ += TryCompress(node, this->compr_method_);
  }
  /* At this point, forward and reverse are one node beyond depth */
  malloc_size_ += TryCompress(forward, this->compr_method_);
  malloc_size_ += TryCompress(reverse, this->compr_method_);
}

void QList::AccessForReads(bool recompress, Node* node) {
  DCHECK(node);
  stats.total_node_reads++;
  if (node->offloaded) {
    DCHECK(tiering_params_);
    stats.onload_requests++;
    num_offloaded_nodes_--;
    node->offloaded = 0;
  }
  if (len_ > 2 && node != head_ && node->next != nullptr) {
    stats.interior_node_reads++;
  }
  ssize_t res = TryDecompressInternal(recompress, node);
  malloc_size_ += res;
}

/* Attempt to merge listpacks within two nodes on either side of 'center'.
 *
 * We attempt to merge:
 *   - (center->prev->prev, center->prev)
 *   - (center->next, center->next->next)
 *   - (center->prev, center)
 *   - (center, center->next)
 *
 * Returns the new 'center' after merging.
 */
auto QList::MergeNodes(Node* center) -> Node* {
  Node *prev = NULL, *prev_prev = NULL, *next = NULL;
  Node *next_next = NULL, *target = NULL;

  if (center->prev) {
    prev = center->prev;
    if (center->prev->prev)
      prev_prev = center->prev->prev;
  }

  if (center->next) {
    next = center->next;
    if (center->next->next)
      next_next = center->next->next;
  }

  /* Try to merge prev_prev and prev */
  if (NodeAllowMerge(prev, prev_prev, fill_)) {
    ListpackMerge(prev_prev, prev);
    prev_prev = prev = NULL; /* they could have moved, invalidate them. */
  }

  /* Try to merge next and next_next */
  if (NodeAllowMerge(next, next_next, fill_)) {
    ListpackMerge(next, next_next);
    next = next_next = NULL; /* they could have moved, invalidate them. */
  }

  /* Try to merge center node and previous node */
  if (NodeAllowMerge(center, center->prev, fill_)) {
    target = ListpackMerge(center->prev, center);
    center = NULL; /* center could have been deleted, invalidate it. */
  } else {
    /* else, we didn't merge here, but target needs to be valid below. */
    target = center;
  }

  /* Use result of center merge (or original) to merge with next node. */
  if (NodeAllowMerge(target, target->next, fill_)) {
    target = ListpackMerge(target, target->next);
  }
  return target;
}

/* Given two nodes, try to merge their listpacks.
 *
 * This helps us not have a quicklist with 3 element listpacks if
 * our fill factor can handle much higher levels.
 *
 * Note: 'a' must be to the LEFT of 'b'.
 *
 * After calling this function, both 'a' and 'b' should be considered
 * unusable.  The return value from this function must be used
 * instead of re-using any of the quicklistNode input arguments.
 *
 * Returns the input node picked to merge against or NULL if
 * merging was not possible. */
auto QList::ListpackMerge(Node* a, Node* b) -> Node* {
  AccessForReads(false, a);
  AccessForReads(false, b);
  if ((lpMerge(&a->entry, &b->entry))) {
    /* We merged listpacks! Now remove the unused Node. */
    Node *keep = NULL, *nokeep = NULL;
    if (!a->entry) {
      nokeep = a;
      keep = b;
    } else if (!b->entry) {
      nokeep = b;
      keep = a;
    }
    keep->count = lpLength(keep->entry);
    malloc_size_ += NodeSetEntry(keep, keep->entry);

    keep->recompress = 0; /* Prevent 'keep' from being recompressed if
                           * it becomes head or tail after merging. */

    nokeep->count = 0;
    DelNode(nokeep);
    CoolOff(keep, 0);  // TODO: node_id is unknown here, so just pass 0.
    return keep;
  }

  /* else, the merge returned NULL and nothing changed. */
  return NULL;
}

void QList::DelNode(Node* node) {
  if (node->next)
    node->next->prev = node->prev;

  if (node == head_) {
    head_ = node->next;
  } else {
    // for non-head nodes, update prev->next to point to node->next
    // (If node==head, prev is tail and should always point to NULL).
    node->prev->next = node->next;
    if (node == head_->prev)  // tail
      head_->prev = node->prev;
  }

  /* Update len first, so in CompressByDepth we know exactly len */
  len_--;
  count_ -= node->count;
  malloc_size_ -= node->sz;
  if (node->offloaded) {
    num_offloaded_nodes_--;
  }

  /* If we deleted a node within our compress depth, we
   * now have compressed nodes needing to be decompressed. */
  CompressByDepth(NULL);

  zfree(node->entry);
  zfree(node);
}

/* Delete one entry from list given the node for the entry and a pointer
 * to the entry in the node.
 *
 * Note: DelPackedIndex() *requires* uncompressed nodes because you
 *       already had to get *p from an uncompressed node somewhere.
 *
 * Returns true if the entire node was deleted, false if node still exists.
 * Also updates in/out param 'p' with the next offset in the listpack. */
bool QList::DelPackedIndex(Node* node, uint8_t* p) {
  DCHECK(!QL_NODE_IS_PLAIN(node));

  if (node->count == 1) {
    DelNode(node);
    return true;
  }

  malloc_size_ += NodeSetEntry(node, lpDelete(node->entry, p, NULL));
  node->count--;
  count_--;

  return false;
}

void QList::OffloadNode(Node* node) {
  DCHECK(tiering_params_ && node->offloaded == 0);
  num_offloaded_nodes_++;
  stats.offload_requests++;
  node->offloaded = 1;
}

void QList::InitIteratorEntry(Iterator* it) const {
  DCHECK(it->current_);
  const_cast<QList*>(this)->AccessForReads(true, it->current_);
  if (QL_NODE_IS_PLAIN(it->current_)) {
    it->zi_ = it->current_->entry;
  } else {
    it->zi_ = lpSeek(it->current_->entry, it->offset_);
  }
}

auto QList::GetIterator(Where where) const -> Iterator {
  Iterator it;
  it.owner_ = this;
  it.zi_ = NULL;
  if (where == HEAD) {
    it.current_ = head_;
    it.offset_ = 0;
    it.direction_ = FWD;
    it.node_id_ = 0;
  } else {
    it.current_ = _Tail();
    it.offset_ = -1;
    it.direction_ = REV;
    it.node_id_ = len_ - 1;
  }

  if (it.current_) {
    InitIteratorEntry(&it);
  }

  return it;
}

auto QList::GetIterator(long idx) const -> Iterator {
  unsigned long long accum = 0;
  int forward = idx < 0 ? 0 : 1; /* < 0 -> reverse, 0+ -> forward */
  uint64_t index = forward ? idx : (-idx) - 1;
  if (index >= count_)
    return {};

  DCHECK(head_);

  /* Seek in the other direction if that way is shorter. */
  int seek_forward = forward;
  unsigned long long seek_index = index;
  if (index > (count_ - 1) / 2) {
    seek_forward = !forward;
    seek_index = count_ - 1 - index;
  }

  Node* n = seek_forward ? head_ : head_->prev;
  unsigned node_cnt = 0;
  while (ABSL_PREDICT_TRUE(n)) {
    if ((accum + n->count) > seek_index) {
      break;
    } else {
      accum += n->count;
      n = seek_forward ? n->next : n->prev;
      node_cnt++;
    }
  }
  DCHECK(n);
  if (!n)
    return {};

  /* Fix accum so it looks like we seeked in the other direction. */
  if (seek_forward != forward)
    accum = count_ - n->count - accum;

  Iterator iter;
  iter.owner_ = this;
  iter.direction_ = forward ? FWD : REV;
  iter.current_ = n;
  iter.node_id_ = seek_forward ? node_cnt : (len_ - 1 - node_cnt);
  if (forward) {
    /* forward = normal head-to-tail offset. */
    iter.offset_ = index - accum;
  } else {
    /* reverse = need negative offset for tail-to-head, so undo
     * the result of the original index = (-idx) - 1 above. */
    iter.offset_ = (-index) - 1 + accum;
  }

  InitIteratorEntry(&iter);

  return iter;
}

auto QList::Erase(Iterator it) -> Iterator {
  DCHECK(it.current_);

  Node* node = it.current_;
  Node* prev = node->prev;
  Node* next = node->next;

  bool deleted_node = false;
  if (QL_NODE_IS_PLAIN(node)) {
    DelNode(node);
    deleted_node = true;
  } else {
    deleted_node = DelPackedIndex(node, it.zi_);
  }

  it.zi_ = NULL;  // Reset current entry pointer

  // If current node is deleted, we must update iterator node and offset.
  if (deleted_node) {
    if (it.direction_ == FWD) {
      it.current_ = next;
      it.offset_ = 0;
      it.node_id_++;
    } else if (it.direction_ == REV) {
      it.current_ = len_ ? prev : nullptr;
      it.offset_ = -1;
      it.node_id_ = it.node_id_ ? it.node_id_ - 1 : len_ - 1;
    }
  }

  if (it.current_) {
    InitIteratorEntry(&it);
  }

  // Sanity, should be noop in release mode.
  if (len_ == 1) {
    DCHECK_EQ(count_, head_->count);
    DCHECK_EQ(malloc_size_, head_->sz);
  }

  /* else if (!deleted_node), no changes needed.
   * we already reset iter->zi above, and the existing iter->offset
   * doesn't move again because:
   *   - [1, 2, 3] => delete offset 1 => [1, 3]: next element still offset 1
   *   - [1, 2, 3] => delete offset 0 => [2, 3]: next element still offset 0
   *  if we deleted the last element at offset N and now
   *  length of this listpack is N-1, the next call into
   *  quicklistNext() will jump to the next node. */
  return it;
}

bool QList::Erase(const long start, unsigned count) {
  if (count == 0)
    return false;

  unsigned extent = count; /* range is inclusive of start position */

  if (start >= 0 && extent > (count_ - start)) {
    /* if requesting delete more elements than exist, limit to list size. */
    extent = count_ - start;
  } else if (start < 0 && extent > (unsigned long)(-start)) {
    /* else, if at negative offset, limit max size to rest of list. */
    extent = -start; /* c.f. LREM -29 29; just delete until end. */
  }

  Iterator it = GetIterator(start);
  Node* node = it.current_;
  long offset = it.offset_;

  /* iterate over next nodes until everything is deleted. */
  while (extent) {
    Node* next = node->next;

    unsigned long del;
    int delete_entire_node = 0;
    if (offset == 0 && extent >= node->count) {
      /* If we are deleting more than the count of this node, we
       * can just delete the entire node without listpack math. */
      delete_entire_node = 1;
      del = node->count;
    } else if (offset >= 0 && extent + offset >= node->count) {
      /* If deleting more nodes after this one, calculate delete based
       * on size of current node. */
      del = node->count - offset;
    } else if (offset < 0) {
      /* If offset is negative, we are in the first run of this loop
       * and we are deleting the entire range
       * from this start offset to end of list.  Since the Negative
       * offset is the number of elements until the tail of the list,
       * just use it directly as the deletion count. */
      del = -offset;

      /* If the positive offset is greater than the remaining extent,
       * we only delete the remaining extent, not the entire offset.
       */
      if (del > extent)
        del = extent;
    } else {
      /* else, we are deleting less than the extent of this node, so
       * use extent directly. */
      del = extent;
    }

    if (delete_entire_node || QL_NODE_IS_PLAIN(node)) {
      DelNode(node);
    } else {
      AccessForReads(true, node);
      malloc_size_ += NodeSetEntry(node, lpDeleteRange(node->entry, offset, del));
      node->count -= del;
      count_ -= del;
      if (node->count == 0) {
        DelNode(node);
      } else {
        malloc_size_ += RecompressOnly(node, compr_method_);
      }
    }

    extent -= del;
    node = next;
    offset = 0;
  }
  return true;
}

uint8_t* QList::TryExtractListpack() {
  if (len_ != 1 || QL_NODE_IS_PLAIN(head_) || !ShouldStoreAsListPack(head_->sz) ||
      head_->IsCompressed()) {
    return nullptr;
  }

  uint8_t* res = std::exchange(head_->entry, nullptr);
  DelNode(head_);

  return res;
}

bool QList::Iterator::Next() {
  if (!current_)
    return false;

  int plain = QL_NODE_IS_PLAIN(current_);

  // Advance to the next element in the current node.
  if (ABSL_PREDICT_FALSE(plain)) {
    zi_ = NULL;
  } else {
    unsigned char* (*nextFn)(unsigned char*, unsigned char*) = lpNext;
    int offset_update = 1;

    if (direction_ == REV) {
      DCHECK_EQ(REV, direction_);
      nextFn = lpPrev;
      offset_update = -1;
    }
    zi_ = nextFn(current_->entry, zi_);
    offset_ += offset_update;
  }

  if (zi_)
    return true;

  // Move to the next node.
  const_cast<QList*>(owner_)->CompressByDepth(current_);

  if (direction_ == FWD) {
    /* Forward traversal, Jumping to start of next node */
    current_ = current_->next;
    offset_ = 0;
    node_id_++;
  } else {
    /* Reverse traversal, Jumping to end of previous node */
    DCHECK_EQ(REV, direction_);
    offset_ = -1;
    current_ = (current_ == owner_->head_) ? nullptr : current_->prev;
    node_id_--;
  }

  if (!current_)
    return false;

  owner_->InitIteratorEntry(this);
  return zi_ != nullptr;
}

auto QList::Iterator::Get() const -> Entry {
  int plain = QL_NODE_IS_PLAIN(current_);
  if (ABSL_PREDICT_FALSE(plain)) {
    char* str = reinterpret_cast<char*>(current_->entry);
    return Entry(str, current_->sz);
  }

  DCHECK(zi_);

  /* Populate value from existing listpack position */
  unsigned int sz = 0;
  long long val;
  uint8_t* ptr = lpGetValue(zi_, &sz, &val);

  return ptr ? Entry(reinterpret_cast<char*>(ptr), sz) : Entry(val);
}

}  // namespace dfly


================================================
FILE: src/core/qlist.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/functional/function_ref.h>

#include <cstdint>
#include <memory>
#include <string>

#include "core/collection_entry.h"

#define QL_COMP_BITS 16
#define QL_BM_BITS 4

/* quicklist node encodings */
#define QUICKLIST_NODE_ENCODING_RAW 1
#define QUICKLIST_NODE_ENCODING_LZF 2
#define QLIST_NODE_ENCODING_LZ4 3

/* quicklist node container formats */
#define QUICKLIST_NODE_CONTAINER_PLAIN 1
#define QUICKLIST_NODE_CONTAINER_PACKED 2

namespace dfly {

class PageUsage;

// Heuristic: for values smaller than 2 KiB we prefer the compact listpack
// representation. 2048 was chosen as a conservative threshold that matches
// common quicklist usage patterns and avoids creating very large listpacks
// that are costly to reallocate or compress.
inline bool ShouldStoreAsListPack(size_t size) {
  return size < 2048;
}

class QList {
 public:
  enum Where : uint8_t { TAIL, HEAD };
  enum COMPR_METHOD : uint8_t { LZF = 0, LZ4 = 1 };

  /* Node is a 40 byte struct describing a listpack for a quicklist.
   * We use bit fields keep the Node at 40 bytes.
   * count: 16 bits, max 65536 (max lp bytes is 65k, so max count actually < 32k).
   * encoding: 2 bits, RAW=1, LZF=2.
   * container: 2 bits, PLAIN=1 (a single item as char array), PACKED=2 (listpack with multiple
   * items). recompress: 1 bit, bool, true if node is temporary decompressed for usage.
   * attempted_compress: 1 bit, boolean, used for verifying during testing.
   * dont_compress: 1 bit, boolean, used for preventing compression of entry.
   * */

  struct Node {
    Node* prev;
    Node* next;
    unsigned char* entry;
    size_t sz : 48;    /* entry size in bytes */
    size_t count : 16; /* count of items in listpack */

    uint16_t encoding : 2;           /* RAW==1 or LZF==2 */
    uint16_t container : 2;          /* PLAIN==1 or PACKED==2 */
    uint16_t recompress : 1;         /* was this node previous compressed? */
    uint16_t attempted_compress : 1; /* node can't compress; too small */
    uint16_t dont_compress : 1;      /* prevent compression of entry that will be used later */
    uint16_t offloaded : 1;          /* node is offloaded to colder storage */
    uint16_t reserved1 : 8;          /* reserved for future use */

    uint16_t reserved2; /* more bits to steal for future usage */
    uint32_t reserved3; /* more bits to steal for future usage */

    bool IsCompressed() const {
      return encoding != QUICKLIST_NODE_ENCODING_RAW;
    }

    size_t GetLZF(void** data) const;
  };

  using Entry = CollectionEntry;
  class Iterator {
   public:
    // Returns true if the iterator is valid (points to an element).
    bool Valid() const {
      return zi_ != nullptr;
    }

    Entry Get() const;

    // Advances to the next/prev element. Returns false if no more entries.
    bool Next();

   private:
    const QList* owner_ = nullptr;
    Node* current_ = nullptr;
    unsigned char* zi_ = nullptr; /* points to the current element */
    int32_t offset_ = 0;          /* offset in current listpack */
    int32_t node_id_ = 0;         /* node index in the list, 0 is head */
    uint8_t direction_ = 1;

    friend class QList;
  };

  using IterateFunc = absl::FunctionRef<bool(Entry)>;
  enum InsertOpt : uint8_t { BEFORE, AFTER };

  struct TieringParams {
    // TODO: hook functions and params that allow qlist offloading nodes to colder storage.
    uint32_t node_depth_threshold = 2;
  };

  /**
   * fill: The number of entries allowed per internal list node can be specified
   * as a fixed maximum size or a maximum number of elements.
   * For a fixed maximum size, use -5 through -1, meaning:
   * -5: max size: 64 Kb  <-- not recommended for normal workloads
   * -4: max size: 32 Kb  <-- not recommended
   * -3: max size: 16 Kb  <-- probably not recommended
   * -2: max size: 8 Kb   <-- good
   * -1: max size: 4 Kb   <-- good
   * Positive numbers mean store up to _exactly_ that number of elements
   * per list node.
   * The highest performing option is usually -2 (8 Kb size) or -1 (4 Kb size),
   * but if your use case is unique, adjust the settings as necessary.
   *
   *
   * Lists may also be compressed.
   * "compress" is the number of quicklist listpack nodes from *each* side of
   * the list to *exclude* from compression.  The head and tail of the list
   * are always uncompressed for fast push/pop operations.  Settings are:
   * 0: disable all list compression
   * 1: depth 1 means "don't start compressing until after 1 node into the list,
   *    going from either the head or tail"
   *    So: [head]->node->node->...->node->[tail]
   *    [head], [tail] will always be uncompressed; inner nodes will compress.
   * 2: [head]->[next]->node->node->...->node->[prev]->[tail]
   *    2 here means: don't compress head or head->next or tail->prev or tail,
   *    but compress all nodes between them.
   * 3: [head]->[next]->[next]->node->node->...->node->[prev]->[prev]->[tail]
   * etc.
   *
   */
  explicit QList(int fill = -2, int compress = 0);

  QList(QList&&) noexcept;
  QList(const QList&) = delete;
  ~QList();

  QList& operator=(const QList&) = delete;
  QList& operator=(QList&&) noexcept;

  size_t Size() const {
    return count_;
  }

  void Clear() noexcept;

  void Push(std::string_view value, Where where);

  // Returns the popped value. Precondition: list is not empty.
  std::string Pop(Where where);

  void AppendListpack(uint8_t* zl);
  void AppendPlain(uint8_t* zl, size_t sz);

  // Returns true if pivot found and elem inserted, false otherwise.
  bool Insert(std::string_view pivot, std::string_view elem, InsertOpt opt);

  void Insert(Iterator it, std::string_view elem, InsertOpt opt);

  // Returns true if item was replaced, false if index is out of range.
  bool Replace(long index, std::string_view elem);

  size_t MallocUsed(bool slow) const;

  // Iterates over entries from start to end (inclusive).
  void Iterate(IterateFunc cb, long start, long end) const;

  // Returns an iterator to tail or the head of the list.
  // result.Valid() is true if the list is not empty.
  Iterator GetIterator(Where where) const;

  // Returns an iterator at a specific index 'idx',
  // or Invalid iterator if index is out of range.
  // negative index - means counting from the tail.
  // result.Valid() is true if the index is within range.
  Iterator GetIterator(long idx) const;

  uint32_t node_count() const {
    return len_;
  }

  unsigned compress_param() const {
    return compress_;
  }

  Iterator Erase(Iterator it);

  // Returns true if elements were deleted, false if list has not changed.
  // Negative start index is allowed.
  bool Erase(long start, unsigned count);

  // Needed by tests and the rdb code.
  const Node* Head() const {
    return head_;
  }

  const Node* Tail() const {
    return _Tail();
  }

  // Returns nullptr if quicklist does not fit the necessary requirements
  // to be converted to listpack, and listpack otherwise. The ownership over the listpack
  // blob is moved to the caller.
  uint8_t* TryExtractListpack();

  void set_fill(int fill) {
    fill_ = fill;
  }

  void set_compr_method(COMPR_METHOD cm) {
    compr_method_ = static_cast<unsigned>(cm);
  }

  static void SetPackedThreshold(unsigned threshold);

  // Moves nodes away from underused pages by reallocating if the underlying page usage is low.
  // Returns count of nodes reallocated to help in testing.
  size_t DefragIfNeeded(PageUsage* page_usage);

  void SetTieringParams(const TieringParams& params);

  struct Stats {
    uint64_t compression_attempts = 0;

    // compression attempts with compression ratio that was not good enough to keep.
    // Subset of compression_attempts.
    uint64_t bad_compression_attempts = 0;

    uint64_t decompression_calls = 0;

    // How many bytes we currently keep compressed.
    size_t compressed_bytes = 0;

    // how many bytes we compressed from.
    // Compressed savings are calculated as raw_compressed_bytes - compressed_bytes.
    size_t raw_compressed_bytes = 0;
    uint64_t interior_node_reads = 0;
    uint64_t total_node_reads = 0;
    uint64_t offload_requests = 0;
    uint64_t onload_requests = 0;

    Stats& operator+=(const Stats& other);
  };
  static __thread Stats stats;

 private:
  bool AllowCompression() const {
    return compress_ != 0;
  }

  Node* _Tail() const {
    return head_ ? head_->prev : nullptr;
  }

  // Returns newly created plain node.
  Node* InsertPlainNode(Node* old_node, std::string_view elem, uint32_t old_node_id,
                        InsertOpt insert_opt);
  void InsertNode(Node* old_node, Node* new_node, uint32_t old_node_id, InsertOpt insert_opt);

  // Reduces the "warmth" of the node. Current implementation can decide on
  // compressing the node based on its position in the list.
  void CoolOff(Node* node, uint32_t node_id);

  void Replace(Iterator it, std::string_view elem);
  void CompressByDepth(Node* node);

  // Prepares the node for read access.
  void AccessForReads(bool recompress, Node* node);

  Node* MergeNodes(Node* node);

  // Deletes one of the nodes and returns the other.
  Node* ListpackMerge(Node* a, Node* b);

  void DelNode(Node* node);
  bool DelPackedIndex(Node* node, uint8_t* p);
  void OffloadNode(Node* node);

  // Initializes iterator's zi_ to point to the element at offset_.
  // Decompresses the node if needed. Assumes current_ is not null.
  void InitIteratorEntry(Iterator* it) const;

  Node* head_ = nullptr;
  size_t malloc_size_ = 0;    // size of the quicklist struct
  uint32_t count_ = 0;        /* total count of all entries in all listpacks */
  uint32_t len_ = 0;          /* number of quicklistNodes */
  int16_t fill_;              /* fill factor for individual nodes */
  int16_t compr_method_ : 2;  // 0 - lzf, 1 - lz4
  int16_t reserved1_ : 14;
  unsigned compress_ : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
  unsigned bookmark_count_ : QL_BM_BITS;
  unsigned reserved2_ : 12;
  uint32_t num_offloaded_nodes_ = 0;
  std::unique_ptr<TieringParams> tiering_params_;
};

}  // namespace dfly


================================================
FILE: src/core/qlist_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/qlist.h"

#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_format.h>
#include <gmock/gmock.h>
#include <mimalloc.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/mi_memory_resource.h"
#include "core/page_usage/page_usage_stats.h"
#include "io/file.h"
#include "io/line_reader.h"

extern "C" {
#include "redis/listpack.h"
#include "redis/zmalloc.h"
}

/* quicklist compression disable */
#define QUICKLIST_NOCOMPRESS 0

namespace dfly {

using namespace std;
using namespace testing;
using absl::StrCat;

static int ql_verify_compress(const QList& ql) {
  int errors = 0;
  unsigned compress_param = ql.compress_param();
  if (compress_param > 0) {
    const auto* node = ql.Head();
    unsigned int low_raw = compress_param;
    unsigned int high_raw = ql.node_count() - compress_param;

    for (unsigned int at = 0; at < ql.node_count(); at++, node = node->next) {
      if (node && (at < low_raw || at >= high_raw)) {
        if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) {
          LOG(ERROR) << "Incorrect compression: node " << at << " is compressed at depth "
                     << compress_param << " ((" << low_raw << "," << high_raw
                     << " total nodes: " << ql.node_count() << "; size: " << node->sz
                     << "; recompress: " << node->recompress;
          errors++;
        }
      } else {
        if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && !node->attempted_compress) {
          LOG(ERROR) << absl::StrFormat(
              "Incorrect non-compression: node %d is NOT "
              "compressed at depth %d ((%u, %u); total "
              "nodes: %lu; size: %zu; recompress: %d; attempted: %d)",
              at, compress_param, low_raw, high_raw, ql.node_count(), node->sz, node->recompress,
              node->attempted_compress);
          errors++;
        }
      }
    }
  }
  return errors;
}

/* Verify list metadata matches physical list contents. */
static int ql_verify(const QList& ql, uint32_t nc, uint32_t count, uint32_t head_count,
                     uint32_t tail_count) {
  int errors = 0;

  if (nc != ql.node_count()) {
    LOG(ERROR) << "quicklist length wrong: expected " << nc << " got " << ql.node_count();
    errors++;
  }

  if (count != ql.Size()) {
    LOG(ERROR) << "quicklist count wrong: expected " << count << " got " << ql.Size();
    errors++;
  }

  auto* node = ql.Head();
  size_t node_size = 0;
  while (node) {
    node_size += node->count;
    node = node->next;
    CHECK(node != ql.Head());
  }

  if (node_size != ql.Size()) {
    LOG(ERROR) << "quicklist cached count not match actual count: expected " << ql.Size() << " got "
               << node_size;
    errors++;
  }

  node = ql.Tail();
  node_size = 0;
  while (node) {
    node_size += node->count;
    node = (node == ql.Head()) ? nullptr : node->prev;
  }
  if (node_size != ql.Size()) {
    LOG(ERROR) << "has different forward count than reverse count!  "
                  "Forward count is "
               << ql.Size() << ", reverse count is " << node_size;
    errors++;
  }

  if (ql.node_count() == 0 && errors == 0) {
    return 0;
  }

  if (head_count != ql.Head()->count && head_count != lpLength(ql.Head()->entry)) {
    LOG(ERROR) << absl::StrFormat("head count wrong: expected %u got cached %u vs. actual %lu",
                                  head_count, ql.Head()->count, lpLength(ql.Head()->entry));
    errors++;
  }

  if (tail_count != ql.Tail()->count && tail_count != lpLength(ql.Tail()->entry)) {
    LOG(ERROR) << "tail count wrong: expected " << tail_count << "got cached " << ql.Tail()->count
               << " vs. actual " << lpLength(ql.Tail()->entry);
    errors++;
  }

  errors += ql_verify_compress(ql);
  return errors;
}

static void SetupMalloc() {
  // configure redis lib zmalloc which requires mimalloc heap to work.
  auto* tlh = mi_heap_get_backing();
  init_zmalloc_threadlocal(tlh);
  mi_option_set(mi_option_purge_delay, -1);  // disable purging of segments (affects benchmarks)
}

class QListTest : public ::testing::Test {
 protected:
  QListTest() : mr_(mi_heap_get_backing()) {
  }

  static void SetUpTestSuite() {
    SetupMalloc();
  }

  static void TearDownTestSuite() {
    mi_heap_collect(mi_heap_get_backing(), true);

    auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                       size_t block_size, void* arg) {
      LOG(ERROR) << "Unfreed allocations: block_size " << block_size
                 << ", allocated: " << area->used * block_size;
      return true;
    };

    mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit,
                         nullptr);
  }

  vector<string> ToItems() const;

  MiMemoryResource mr_;
  QList ql_;
};

vector<string> QListTest::ToItems() const {
  vector<string> res;
  auto cb = [&](const QList::Entry& e) {
    res.push_back(e.to_string());
    return true;
  };

  ql_.Iterate(cb, 0, ql_.Size());
  return res;
}

TEST_F(QListTest, Basic) {
  EXPECT_EQ(0, ql_.Size());
  ql_.Push("abc", QList::HEAD);
  EXPECT_EQ(1, ql_.Size());
  EXPECT_TRUE(ql_.Tail() == ql_.Head());
  EXPECT_LE(ql_.MallocUsed(false), ql_.MallocUsed(true));

  auto it = ql_.GetIterator(QList::HEAD);
  ASSERT_TRUE(it.Valid());  // Iterator is valid immediately.

  EXPECT_EQ("abc", it.Get().view());

  ASSERT_FALSE(it.Next());

  ql_.Push("def", QList::TAIL);
  EXPECT_EQ(2, ql_.Size());
  EXPECT_LE(ql_.MallocUsed(false), ql_.MallocUsed(true));

  it = ql_.GetIterator(QList::TAIL);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("def", it.Get().view());

  ASSERT_TRUE(it.Next());
  EXPECT_EQ("abc", it.Get().view());
  ASSERT_FALSE(it.Next());

  it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("abc", it.Get().view());
  it = ql_.GetIterator(-1);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("def", it.Get().view());

  vector<string> items = ToItems();

  EXPECT_THAT(items, ElementsAre("abc", "def"));
  EXPECT_GT(ql_.MallocUsed(false), ql_.MallocUsed(true) * 0.8);
}

TEST_F(QListTest, ListPack) {
  string_view sv = "abcded"sv;
  uint8_t* lp1 = lpPrepend(lpNew(0), (uint8_t*)sv.data(), sv.size());
  uint8_t* lp2 = lpAppend(lpNew(0), (uint8_t*)sv.data(), sv.size());
  ASSERT_EQ(lpBytes(lp1), lpBytes(lp2));
  ASSERT_EQ(0, memcmp(lp1, lp2, lpBytes(lp1)));
  lpFree(lp1);
  lpFree(lp2);
}

TEST_F(QListTest, InsertDelete) {
  EXPECT_FALSE(ql_.Insert("abc", "def", QList::BEFORE));
  ql_.Push("abc", QList::HEAD);
  EXPECT_TRUE(ql_.Insert("abc", "def", QList::BEFORE));
  auto items = ToItems();
  EXPECT_THAT(items, ElementsAre("def", "abc"));
  EXPECT_TRUE(ql_.Insert("abc", "123456", QList::AFTER));
  items = ToItems();
  EXPECT_THAT(items, ElementsAre("def", "abc", "123456"));

  auto it = ql_.GetIterator(QList::HEAD);
  ASSERT_TRUE(it.Valid());

  // Erase the items one by one.
  it = ql_.Erase(it);
  items = ToItems();
  EXPECT_THAT(items, ElementsAre("abc", "123456"));
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("abc", it.Get().view());

  it = ql_.Erase(it);
  items = ToItems();
  EXPECT_THAT(items, ElementsAre("123456"));
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ(123456, it.Get().ival());

  it = ql_.Erase(it);
  items = ToItems();
  EXPECT_THAT(items, ElementsAre());
  ASSERT_FALSE(it.Valid());
  EXPECT_EQ(0, ql_.Size());
}

TEST_F(QListTest, EraseLastElementInNodeAdvancesToNextNode) {
  // Regression test for iterator semantics: when erasing the last element
  // within a multi-entry node and another node follows, the iterator should
  // correctly advance to the first element of the next node.

  // Create a QList with fill=2 to ensure max 2 elements per node
  ql_ = QList(2, QUICKLIST_NOCOMPRESS);

  // Push 3 elements: this creates 2 nodes (first with 2 elements, second with 1)
  ql_.Push("first", QList::HEAD);   // Will be at index 2 after all pushes
  ql_.Push("second", QList::HEAD);  // Will be at index 1 after all pushes
  ql_.Push("third", QList::HEAD);   // Will be at index 0 after all pushes

  // Verify we have 2 nodes as expected
  ASSERT_EQ(2, ql_.node_count());
  ASSERT_EQ(3, ql_.Size());

  // Node structure should be:
  // Node 1: ["third", "second"]
  // Node 2: ["first"]

  auto items = ToItems();
  EXPECT_THAT(items, ElementsAre("third", "second", "first"));

  // Get iterator to "second" (last element in first node)
  auto it = ql_.GetIterator(1);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("second", it.Get().view());

  // Erase "second" - this is the last element in the first node
  it = ql_.Erase(it);

  // Iterator should now point to "first" (first element of the second node)
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("first", it.Get().view());

  // Verify the list is correct
  items = ToItems();
  EXPECT_THAT(items, ElementsAre("third", "first"));
  EXPECT_EQ(2, ql_.Size());
}

TEST_F(QListTest, PushPlain) {
  // push a value large enough to trigger plain node insertion.
  string val(9000, 'a');
  ql_.Push(val, QList::HEAD);
  auto items = ToItems();
  EXPECT_THAT(items, ElementsAre(val));
}

TEST_F(QListTest, GetNum) {
  ql_.Push("1251977", QList::HEAD);
  QList::Iterator it = ql_.GetIterator(QList::HEAD);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ(1251977, it.Get().ival());
}

TEST_F(QListTest, CompressionPlain) {
  char buf[256];
  QList::SetPackedThreshold(1);
  ql_ = QList(-2, 1);

  for (int i = 0; i < 500; i++) {
    /* Set to 256 to allow the node to be triggered to compress,
     * if it is less than 48(nocompress), the test will be successful. */
    snprintf(buf, sizeof(buf), "hello%d", i);
    ql_.Push(string_view{buf, sizeof(buf)}, QList::HEAD);
  }
  QList::SetPackedThreshold(0);

  QList::Iterator it = ql_.GetIterator(QList::TAIL);
  int i = 0;
  ASSERT_TRUE(it.Valid());
  do {
    string_view sv = it.Get().view();
    ASSERT_EQ(sizeof(buf), sv.size());
    ASSERT_TRUE(absl::StartsWith(sv, StrCat("hello", i)));
    i++;
  } while (it.Next());
  EXPECT_EQ(500, i);
}

TEST_F(QListTest, LargeValues) {
  string val(100000, 'a');
  ql_.Push(val, QList::HEAD);
  ql_.Push(val, QList::HEAD);
  ql_.Pop(QList::HEAD);
  auto items = ToItems();
  EXPECT_THAT(items, ElementsAre(val));
}

TEST_F(QListTest, RemoveListpack) {
  ql_.Push("ABC", QList::TAIL);
  ql_.Push("DEF", QList::TAIL);
  auto it = ql_.GetIterator(QList::TAIL);
  ASSERT_TRUE(it.Valid());  // Iterator is valid immediately.
  ql_.Erase(it);
  it = ql_.GetIterator(QList::TAIL);
  ASSERT_TRUE(it.Valid());
  it = ql_.Erase(it);
  ASSERT_FALSE(it.Valid());
}

TEST_F(QListTest, DefragListpackRaw) {
  PageUsage page_usage{CollectPageStats::YES, 100.0};
  page_usage.SetForceReallocate(true);

  ql_.Push("first", QList::TAIL);
  ql_.Push("second", QList::TAIL);

  ASSERT_EQ(ql_.DefragIfNeeded(&page_usage), 1);
  EXPECT_THAT(ToItems(), ElementsAre("first", "second"));
  ql_.Clear();
}

TEST_F(QListTest, DefragPlainTextRaw) {
  PageUsage page_usage{CollectPageStats::YES, 100.0};
  page_usage.SetForceReallocate(true);
  string big(100000, 'x');
  ql_.Push(big, QList::HEAD);
  ASSERT_EQ(ql_.DefragIfNeeded(&page_usage), 1);
  EXPECT_THAT(ToItems(), ElementsAre(big));
  ql_.Clear();
}

TEST_F(QListTest, DefragmentListpackCompressed) {
  PageUsage page_usage{CollectPageStats::YES, 100.0};
  page_usage.SetForceReallocate(true);

  // MIN_COMPRESS_BYTES = 256
  char buf[256];
  constexpr auto items_per_list = 4;
  constexpr auto total_items = 20;
  ql_ = QList{items_per_list, 1};

  for (auto i = 0; i < total_items; ++i) {
    absl::SNPrintF(buf, 256, "test__%d", i);
    ql_.Push(string_view{buf, 256}, QList::TAIL);
  }

  ASSERT_EQ(total_items / items_per_list, ql_.DefragIfNeeded(&page_usage));

  auto i = 0;
  auto it = ql_.GetIterator(QList::HEAD);
  ASSERT_TRUE(it.Valid());
  do {
    auto v = it.Get().view();
    ASSERT_EQ(v.size(), 256);
    ASSERT_TRUE(absl::StartsWith(v, StrCat("test__", i)));
    ++i;
  } while (it.Next());
  ASSERT_EQ(i, total_items);
}

TEST_F(QListTest, Tiering) {
  QList::stats.offload_requests = 0;
  ql_.SetTieringParams(QList::TieringParams{.node_depth_threshold = 1});
  for (int i = 0; i < 8000; i++) {
    ql_.Push(absl::StrCat("value", i), QList::TAIL);
  }
  EXPECT_EQ(QList::stats.offload_requests, 9);
}

using FillCompress = tuple<int, unsigned, QList::COMPR_METHOD>;

class PrintToFillCompress {
 public:
  std::string operator()(const TestParamInfo<FillCompress>& info) const {
    int fill = get<0>(info.param);
    int compress = get<1>(info.param);
    QList::COMPR_METHOD method = get<2>(info.param);
    string fill_str = fill >= 0 ? absl::StrCat("f", fill) : absl::StrCat("fminus", -fill);
    string method_str = method == QList::LZF ? "lzf" : "lz4";
    return absl::StrCat(fill_str, "compr", compress, method_str);
  }
};

class OptionsTest : public QListTest, public WithParamInterface<FillCompress> {};

INSTANTIATE_TEST_SUITE_P(Matrix, OptionsTest,
                         Combine(Values(-5, -4, -3, -2, -1, 0, 1, 2, 32, 66, 128, 999),
                                 Values(0, 1, 2, 3, 4, 5, 6, 10), Values(QList::LZF, QList::LZ4)),
                         PrintToFillCompress());

TEST_P(OptionsTest, Numbers) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  ql_.set_compr_method(method);
  array<int64_t, 5000> nums;

  for (unsigned i = 0; i < nums.size(); i++) {
    nums[i] = -5157318210846258176 + i;
    string val = absl::StrCat(nums[i]);
    ql_.Push(val, QList::TAIL);
  }
  ql_.Push("xxxxxxxxxxxxxxxxxxxx", QList::TAIL);

  for (unsigned i = 0; i < nums.size(); i++) {
    auto it = ql_.GetIterator(i);
    ASSERT_TRUE(it.Valid());
    ASSERT_EQ(nums[i], it.Get().ival()) << i;
  }

  auto it = ql_.GetIterator(nums.size());
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("xxxxxxxxxxxxxxxxxxxx", it.Get().view());
}

TEST_P(OptionsTest, NumbersIndex) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  ql_.set_compr_method(method);

  long long nums[5000];
  for (int i = 0; i < 760; i++) {
    nums[i] = -5157318210846258176 + i;
    ql_.Push(absl::StrCat(nums[i]), QList::TAIL);
  }

  unsigned i = 437;
  QList::Iterator it = ql_.GetIterator(i);
  ASSERT_TRUE(it.Valid());
  do {
    ASSERT_EQ(nums[i], it.Get().ival());
    i++;
  } while (it.Next());
  ASSERT_EQ(760, i);
}

TEST_P(OptionsTest, DelRangeA) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  ql_.set_compr_method(method);
  long long nums[5000];
  for (int i = 0; i < 33; i++) {
    nums[i] = -5157318210846258176 + i;
    ql_.Push(absl::StrCat(nums[i]), QList::TAIL);
  }

  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 2, 33, 32, 1));
  }

  /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */
  ql_.Erase(0, 3);
  ql_.Erase(-29, 4000); /* make sure not loop forever */
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 1, 1, 1, 1));
  }
  auto it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ(-5157318210846258173, it.Get().ival());
}

TEST_P(OptionsTest, DelRangeB) {
  auto [fill, _, method] = GetParam();
  ql_ = QList(fill, QUICKLIST_NOCOMPRESS);  // ignore compress parameter
  ql_.set_compr_method(method);

  long long nums[5000];
  for (int i = 0; i < 33; i++) {
    nums[i] = i;
    ql_.Push(absl::StrCat(nums[i]), QList::TAIL);
  }
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 2, 33, 32, 1));
  }
  /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */
  ql_.Erase(0, 5);
  ql_.Erase(-16, 16);
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 1, 12, 12, 12));
  }

  auto it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ(5, it.Get().ival());

  it = ql_.GetIterator(-1);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ(16, it.Get().ival());

  ql_.Push("bobobob", QList::TAIL);
  it = ql_.GetIterator(-1);
  ASSERT_TRUE(it.Valid());
  EXPECT_EQ("bobobob", it.Get().view());

  for (int i = 0; i < 12; i++) {
    it = ql_.GetIterator(i);
    ASSERT_TRUE(it.Valid());
    EXPECT_EQ(i + 5, it.Get().ival());
  }
}

TEST_P(OptionsTest, DelRangeC) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  ql_.set_compr_method(method);

  long long nums[5000];
  for (int i = 0; i < 33; i++) {
    nums[i] = -5157318210846258176 + i;
    ql_.Push(absl::StrCat(nums[i]), QList::TAIL);
  }
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 2, 33, 32, 1));
  }

  /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */
  ql_.Erase(0, 3);
  ql_.Erase(-29, 4000); /* make sure not loop forever */
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 1, 1, 1, 1));
  }
  auto it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ(-5157318210846258173, it.Get().ival());
}

TEST_P(OptionsTest, DelRangeD) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  ql_.set_compr_method(method);

  long long nums[5000];
  for (int i = 0; i < 33; i++) {
    nums[i] = -5157318210846258176 + i;
    ql_.Push(absl::StrCat(nums[i]), QList::TAIL);
  }
  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 2, 33, 32, 1));
  }
  ql_.Erase(-12, 3);

  ASSERT_EQ(30, ql_.Size());
}

TEST_P(OptionsTest, DelRangeNode) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(-2, compress);
  ql_.set_compr_method(method);

  for (int i = 0; i < 32; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);

  ASSERT_EQ(0, ql_verify(ql_, 1, 32, 32, 32));
  ql_.Erase(0, 32);
  ASSERT_EQ(0, ql_verify(ql_, 0, 0, 0, 0));
}

TEST_P(OptionsTest, DelRangeNodeOverflow) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(-2, compress);
  ql_.set_compr_method(method);

  for (int i = 0; i < 32; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);
  ASSERT_EQ(0, ql_verify(ql_, 1, 32, 32, 32));
  ql_.Erase(0, 128);
  ASSERT_EQ(0, ql_verify(ql_, 0, 0, 0, 0));
}

TEST_P(OptionsTest, DelRangeMiddle100of500) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);

  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);

  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 32, 20));
  ql_.Erase(200, 100);
  ASSERT_EQ(0, ql_verify(ql_, 14, 400, 32, 20));
}

TEST_P(OptionsTest, DelLessFillAcrossNodes) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);

  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);
  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 32, 20));
  ql_.Erase(60, 10);
  ASSERT_EQ(0, ql_verify(ql_, 16, 490, 32, 20));
}

TEST_P(OptionsTest, DelNegOne) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);
  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 32, 20));
  ql_.Erase(-1, 1);
  ASSERT_EQ(0, ql_verify(ql_, 16, 499, 32, 19));
}

TEST_P(OptionsTest, DelNegOneOverflow) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);

  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 32, 20));
  ql_.Erase(-1, 128);

  ASSERT_EQ(0, ql_verify(ql_, 16, 499, 32, 19));
}

TEST_P(OptionsTest, DelNeg100From500) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);
  ql_.Erase(-100, 100);

  QList::Iterator it = ql_.GetIterator(QList::TAIL);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello400", it.Get());
  ASSERT_EQ(0, ql_verify(ql_, 13, 400, 32, 16));
}

TEST_P(OptionsTest, DelMin10_5_from50) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);

  for (int i = 0; i < 50; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);
  ASSERT_EQ(0, ql_verify(ql_, 2, 50, 32, 18));
  ql_.Erase(-10, 5);
  ASSERT_EQ(0, ql_verify(ql_, 2, 45, 32, 13));
}

TEST_P(OptionsTest, DelElems) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);

  const char* words[] = {"abc", "foo", "bar", "foobar", "foobared", "zap", "bar", "test", "foo"};
  const char* result[] = {"abc", "foo", "foobar", "foobared", "zap", "test", "foo"};
  const char* resultB[] = {"abc", "foo", "foobar", "foobared", "zap", "test"};

  for (int i = 0; i < 9; i++)
    ql_.Push(words[i], QList::TAIL);

  /* lrem 0 bar */
  auto iter = ql_.GetIterator(QList::HEAD);
  while (iter.Valid()) {
    if (iter.Get() == "bar") {
      iter = ql_.Erase(iter);
      // iter now points to next element, don't call Next()
    } else {
      if (!iter.Next())
        break;
    }
  }
  EXPECT_THAT(ToItems(), ElementsAreArray(result));

  ql_.Push("foo", QList::TAIL);

  /* lrem -2 foo */
  iter = ql_.GetIterator(QList::TAIL);
  int del = 2;
  while (iter.Valid()) {
    if (iter.Get() == "foo") {
      iter = ql_.Erase(iter);
      del--;
      if (del == 0)
        break;
      // iter now points to next element, don't call Next()
    } else {
      if (!iter.Next())
        break;
    }
  }

  /* check result of lrem -2 foo */
  /* (we're ignoring the '2' part and still deleting all foo
   * because we only have two foo) */
  EXPECT_THAT(ToItems(), ElementsAreArray(resultB));
}

TEST_P(OptionsTest, IterateReverse) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);

  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);
  QList::Iterator it = ql_.GetIterator(QList::TAIL);
  int i = 0;
  ASSERT_TRUE(it.Valid());
  do {
    ASSERT_EQ(StrCat("hello", i), it.Get());
    i++;
  } while (it.Next());
  ASSERT_EQ(500, i);
  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 20, 32));
}

TEST_P(OptionsTest, Iterate500) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(32, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);

  QList::Iterator it = ql_.GetIterator(QList::HEAD);
  int i = 499, count = 0;
  ASSERT_TRUE(it.Valid());
  do {
    QList::Entry entry = it.Get();
    ASSERT_EQ(StrCat("hello", i), entry);
    i--;
    count++;
  } while (it.Next());
  EXPECT_EQ(500, count);
  ASSERT_EQ(0, ql_verify(ql_, 16, 500, 20, 32));

  it = ql_.GetIterator(QList::TAIL);
  i = 0;
  ASSERT_TRUE(it.Valid());
  do {
    ASSERT_EQ(StrCat("hello", i), it.Get());
    i++;
  } while (it.Next());
  EXPECT_EQ(500, i);
}

TEST_P(OptionsTest, IterateAfterOne) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(-2, compress);
  ql_.Push("hello", QList::HEAD);

  QList::Iterator it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  ql_.Insert(it, "abc", QList::AFTER);

  ASSERT_EQ(0, ql_verify(ql_, 1, 2, 2, 2));

  /* verify results */
  it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello", it.Get());

  it = ql_.GetIterator(1);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("abc", it.Get());
}

TEST_P(OptionsTest, IterateDelete) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);

  ql_.Push("abc", QList::TAIL);
  ql_.Push("def", QList::TAIL);
  ql_.Push("hij", QList::TAIL);
  ql_.Push("jkl", QList::TAIL);
  ql_.Push("oop", QList::TAIL);

  QList::Iterator it = ql_.GetIterator(QList::HEAD);
  while (it.Valid()) {
    if (it.Get() == "hij") {
      it = ql_.Erase(it);
    } else {
      it.Next();
    }
  }

  ASSERT_THAT(ToItems(), ElementsAre("abc", "def", "jkl", "oop"));
}

TEST_P(OptionsTest, InsertBeforeOne) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(-2, compress);

  ql_.Push("hello", QList::HEAD);
  QList::Iterator it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  ql_.Insert(it, "abc", QList::BEFORE);
  ql_verify(ql_, 1, 2, 2, 2);

  /* verify results */
  it = ql_.GetIterator(0);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("abc", it.Get());

  it = ql_.GetIterator(1);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello", it.Get());
}

TEST_P(OptionsTest, InsertWithHeadFull) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(4, compress);

  for (int i = 0; i < 10; i++)
    ql_.Push(StrCat("hello", i), QList::TAIL);

  ql_.set_fill(-1);
  QList::Iterator it = ql_.GetIterator(-10);
  ASSERT_TRUE(it.Valid());

  char buf[4096] = {0};
  ql_.Insert(it, string_view{buf, sizeof(buf)}, QList::BEFORE);
  ql_verify(ql_, 4, 11, 1, 2);
}

TEST_P(OptionsTest, InsertWithTailFull) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(4, compress);
  for (int i = 0; i < 10; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);

  ql_.set_fill(-1);
  QList::Iterator it = ql_.GetIterator(-1);
  ASSERT_TRUE(it.Valid());

  char buf[4096] = {0};
  ql_.Insert(it, string_view{buf, sizeof(buf)}, QList::AFTER);
  ql_verify(ql_, 4, 11, 2, 1);
}

TEST_P(OptionsTest, InsertOnceWhileIterating) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);

  ql_.Push("abc", QList::TAIL);
  ql_.set_fill(1);

  ql_.Push("def", QList::TAIL);
  ql_.set_fill(fill);
  ql_.Push("bob", QList::TAIL);
  ql_.Push("foo", QList::TAIL);
  ql_.Push("zoo", QList::TAIL);

  /* insert "bar" before "bob" while iterating over list. */
  QList::Iterator it = ql_.GetIterator(QList::HEAD);
  if (it.Valid()) {
    do {
      if (it.Get() == "bob") {
        ql_.Insert(it, "bar", QList::BEFORE);
        break; /* didn't we fix insert-while-iterating? */
      }
    } while (it.Next());
  }
  EXPECT_THAT(ToItems(), ElementsAre("abc", "def", "bar", "bob", "foo", "zoo"));
}

TEST_P(OptionsTest, InsertBefore250NewInMiddleOf500Elements) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  for (int i = 0; i < 500; i++) {
    string val = StrCat("hello", i);
    val.resize(32);
    ql_.Push(val, QList::TAIL);
  }

  for (int i = 0; i < 250; i++) {
    QList::Iterator it = ql_.GetIterator(250);
    ASSERT_TRUE(it.Valid());
    ql_.Insert(it, StrCat("abc", i), QList::BEFORE);
  }

  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 25, 750, 32, 20));
  }
}

TEST_P(OptionsTest, InsertAfter250NewInMiddleOf500Elements) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i), QList::HEAD);

  for (int i = 0; i < 250; i++) {
    QList::Iterator it = ql_.GetIterator(250);
    ASSERT_TRUE(it.Valid());
    ql_.Insert(it, StrCat("abc", i), QList::AFTER);
  }

  ASSERT_EQ(750, ql_.Size());

  if (fill == 32) {
    ASSERT_EQ(0, ql_verify(ql_, 26, 750, 20, 32));
  }
}

TEST_P(OptionsTest, NextPlain) {
  auto [_, compress, method] = GetParam();
  ql_ = QList(-2, compress);

  QList::SetPackedThreshold(3);

  const char* strings[] = {"hello1", "hello2", "h3", "h4", "hello5"};

  for (int i = 0; i < 5; ++i)
    ql_.Push(strings[i], QList::HEAD);

  QList::Iterator it = ql_.GetIterator(QList::TAIL);
  int j = 0;

  ASSERT_TRUE(it.Valid());
  do {
    ASSERT_EQ(strings[j], it.Get());
    j++;
  } while (it.Next());
}

TEST_P(OptionsTest, IndexFrom500) {
  auto [fill, compress, method] = GetParam();
  ql_ = QList(fill, compress);
  for (int i = 0; i < 500; i++)
    ql_.Push(StrCat("hello", i + 1), QList::TAIL);

  QList::Iterator it = ql_.GetIterator(1);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello2", it.Get());
  it = ql_.GetIterator(200);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello201", it.Get());

  it = ql_.GetIterator(-1);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello500", it.Get());

  it = ql_.GetIterator(-2);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello499", it.Get());

  it = ql_.GetIterator(-100);
  ASSERT_TRUE(it.Valid());
  ASSERT_EQ("hello401", it.Get());

  it = ql_.GetIterator(500);
  ASSERT_FALSE(it.Valid());
}

static void BM_QListCompress(benchmark::State& state) {
  SetupMalloc();

  string path = base::ProgramRunfile("testdata/list.txt.zst");
  io::Result<io::Source*> src = io::OpenUncompressed(path);
  CHECK(src) << src.error();
  io::LineReader lr(*src, TAKE_OWNERSHIP);
  string_view line;
  vector<string> lines;
  while (lr.Next(&line)) {
    lines.push_back(string(line));
  }

  VLOG(1) << "Read " << lines.size() << " lines " << state.range(0);
  while (state.KeepRunning()) {
    QList ql(-2, state.range(0));  // uses differrent compression modes, see below.
    ql.set_compr_method(state.range(1) == 0 ? QList::LZF : QList::LZ4);

    for (const string& l : lines) {
      ql.Push(l, QList::TAIL);
    }
    DVLOG(1) << ql.node_count() << ", " << ql.MallocUsed(true);
  }
  CHECK_EQ(0, zmalloc_used_memory_tl);
}
BENCHMARK(BM_QListCompress)
    ->ArgsProduct({{1, 4, 0}, {0, 1}});  // x - compression depth, y compression method.
                                         // x = 0 no compression, 1 - compress all nodes but edges,
                                         // 4 - compress all but 4 nodes from edges.

static void BM_QListUncompress(benchmark::State& state) {
  SetupMalloc();

  string path = base::ProgramRunfile("testdata/list.txt.zst");
  io::Result<io::Source*> src = io::OpenUncompressed(path);
  CHECK(src) << src.error();
  io::LineReader lr(*src, TAKE_OWNERSHIP);
  string_view line;
  QList ql(-2, state.range(0));
  ql.set_compr_method(state.range(1) == 0 ? QList::LZF : QList::LZ4);
  QList::stats.compression_attempts = 0;

  CHECK_EQ(QList::stats.compressed_bytes, 0u);
  CHECK_EQ(QList::stats.raw_compressed_bytes, 0u);

  size_t line_len = 0;
  while (lr.Next(&line)) {
    ql.Push(line, QList::TAIL);
    line_len += line.size();
  }

  if (ql.compress_param() > 0) {
    CHECK_GT(QList::stats.compression_attempts, 0u);
    CHECK_GT(QList::stats.compressed_bytes, 0u);
    CHECK_GT(QList::stats.raw_compressed_bytes, QList::stats.compressed_bytes);
  }

  LOG(INFO) << "MallocUsed " << ql.compress_param() << ": " << ql.MallocUsed(true) << ", "
            << ql.MallocUsed(false);
  size_t exp_count = ql.Size();

  while (state.KeepRunning()) {
    unsigned actual_count = 0, actual_len = 0;
    ql.Iterate(
        [&](const QList::Entry& e) {
          actual_len += e.view().size();
          ++actual_count;
          return true;
        },
        0, -1);
    CHECK_EQ(exp_count, actual_count);
    CHECK_EQ(line_len, actual_len);
  }
}
BENCHMARK(BM_QListUncompress)->ArgsProduct({{1, 4, 0}, {0, 1}});

}  // namespace dfly


================================================
FILE: src/core/score_map.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/score_map.h"

#include "base/endian.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/sds_utils.h"

extern "C" {
#include "redis/zmalloc.h"
}

using namespace std;

namespace dfly {

namespace {

inline double GetValue(sds key) {
  char* valptr = key + sdslen(key) + 1;
  return absl::bit_cast<double>(absl::little_endian::Load64(valptr));
}

void* AllocateScored(string_view field, double value) {
  size_t meta_offset = field.size() + 1;

  // The layout is:
  // key, '\0', 8-byte double value
  sds newkey = AllocSdsWithSpace(field.size(), 8);

  if (!field.empty()) {
    memcpy(newkey, field.data(), field.size());
  }

  absl::little_endian::Store64(newkey + meta_offset, absl::bit_cast<uint64_t>(value));

  return newkey;
}

}  // namespace

ScoreMap::~ScoreMap() {
  Clear();
}

pair<void*, bool> ScoreMap::AddOrUpdate(string_view field, double value) {
  void* newkey = AllocateScored(field, value);

  // Replace the whole entry.
  sds prev_entry = (sds)AddOrReplaceObj(newkey, false);
  if (prev_entry) {
    ObjDelete(prev_entry, false);
    return {newkey, false};
  }

  return {newkey, true};
}

std::pair<void*, bool> ScoreMap::AddOrSkip(std::string_view field, double value) {
  uint64_t hashcode = Hash(&field, 1);
  void* obj = FindInternal(&field, hashcode, 1);  // 1 - string_view

  if (obj)
    return {obj, false};

  void* newkey = AllocateScored(field, value);
  DenseSet::AddUnique(newkey, false, hashcode);
  return {newkey, true};
}

void* ScoreMap::AddUnique(std::string_view field, double value) {
  void* newkey = AllocateScored(field, value);
  DenseSet::AddUnique(newkey, false, Hash(&field, 1));
  return newkey;
}

std::optional<double> ScoreMap::Find(std::string_view field) {
  uint64_t hashcode = Hash(&field, 1);
  sds str = (sds)FindInternal(&field, hashcode, 1);
  if (!str)
    return nullopt;

  return GetValue(str);
}

uint64_t ScoreMap::Hash(const void* obj, uint32_t cookie) const {
  DCHECK_LT(cookie, 2u);

  if (cookie == 0) {
    sds s = (sds)obj;
    return CompactObj::HashCode(string_view{s, sdslen(s)});
  }

  const string_view* sv = (const string_view*)obj;
  return CompactObj::HashCode(*sv);
}

bool ScoreMap::ObjEqual(const void* left, const void* right, uint32_t right_cookie) const {
  DCHECK_LT(right_cookie, 2u);

  sds s1 = (sds)left;
  if (right_cookie == 0) {
    sds s2 = (sds)right;

    if (sdslen(s1) != sdslen(s2)) {
      return false;
    }

    return sdslen(s1) == 0 || memcmp(s1, s2, sdslen(s1)) == 0;
  }

  const string_view* right_sv = (const string_view*)right;
  string_view left_sv{s1, sdslen(s1)};
  return left_sv == (*right_sv);
}

size_t ScoreMap::ObjectAllocSize(const void* obj) const {
  sds s1 = (sds)obj;
  size_t res = zmalloc_usable_size(sdsAllocPtr(s1));
  return res;
}

uint32_t ScoreMap::ObjExpireTime(const void* obj) const {
  // Should not reach.
  return UINT32_MAX;
}

void ScoreMap::ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) {
  // Should not reach.
}

void ScoreMap::ObjDelete(void* obj, bool has_ttl) const {
  sds s1 = (sds)obj;
  sdsfree(s1);
}

void* ScoreMap::ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const {
  return nullptr;
}

detail::SdsScorePair ScoreMap::iterator::BreakToPair(void* obj) {
  sds f = (sds)obj;
  return detail::SdsScorePair(f, GetValue(f));
}

namespace {
// Does not Release obj. Callers must do so explicitly if a `Reallocation` happened
pair<sds, bool> DuplicateEntryIfFragmented(void* obj, PageUsage* page_usage) {
  sds key = (sds)obj;
  size_t key_len = sdslen(key);

  if (!page_usage->IsPageForObjectUnderUtilized(key))
    return {key, false};

  sds newkey = AllocSdsWithSpace(key_len, 8);
  memcpy(newkey, key, key_len + 8 + 1);

  return {newkey, true};
}

}  // namespace

bool ScoreMap::iterator::ReallocIfNeeded(PageUsage* page_usage, std::function<void(sds, sds)> cb) {
  auto* ptr = curr_entry_;

  if (ptr->IsLink()) {
    ptr = ptr->AsLink();
  }

  DCHECK(!ptr->IsEmpty());
  DCHECK(ptr->IsObject());

  auto* obj = ptr->GetObject();
  auto [new_obj, realloced] = DuplicateEntryIfFragmented(obj, page_usage);
  if (realloced) {
    if (cb) {
      cb((sds)obj, (sds)new_obj);
    }
    sdsfree((sds)obj);
    ptr->SetObject(new_obj);
  }
  return realloced;
}

}  // namespace dfly


================================================
FILE: src/core/score_map.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>
#include <string_view>

#include "core/dense_set.h"

extern "C" {
#include "redis/sds.h"
}

namespace dfly {

class PageUsage;

namespace detail {

class SdsScorePair {
 public:
  SdsScorePair(sds k, double v) : first(k), second(v) {
  }

  SdsScorePair* operator->() {
    return this;
  }

  const SdsScorePair* operator->() const {
    return this;
  }

  const sds first;
  const double second;
};

};  // namespace detail

class ScoreMap : public DenseSet {
 public:
  ScoreMap() {
  }

  ~ScoreMap();

  class iterator : private DenseSet::IteratorBase {
    static detail::SdsScorePair BreakToPair(void* obj);

   public:
    iterator() : IteratorBase() {
    }

    iterator(DenseSet* owner, bool is_end) : IteratorBase(owner, is_end) {
    }

    detail::SdsScorePair operator->() const {
      void* ptr = curr_entry_->GetObject();
      return BreakToPair(ptr);
    }

    detail::SdsScorePair operator*() const {
      void* ptr = curr_entry_->GetObject();
      return BreakToPair(ptr);
    }

    // Try reducing memory fragmentation of the value by re-allocating. Returns true if
    // re-allocation happened.
    // If function is set, we call it with the old and the new sds. This is used for data
    // structures that hold multiple storages that need to be update simultaneously. For example,
    // SortedMap contains both a B+ tree and a ScoreMap with the former, containing pointers
    // to the later. Therefore, we need to update those. This is handled by the cb below.
    bool ReallocIfNeeded(PageUsage* page_usage, std::function<void(sds, sds)> = {});

    iterator& operator++() {
      Advance();
      return *this;
    }

    bool operator==(const iterator& b) const {
      return curr_list_ == b.curr_list_;
    }

    bool operator!=(const iterator& b) const {
      return !(*this == b);
    }
  };

  // Returns pointer to the internal objest and the insertion result.
  // i.e. true if field was added, otherwise updates its value and returns false.
  std::pair<void*, bool> AddOrUpdate(std::string_view field, double value);

  // Returns true if field was added
  // false, if already exists. In that case no update is done.
  std::pair<void*, bool> AddOrSkip(std::string_view field, double value);

  void* AddUnique(std::string_view field, double value);

  bool Erase(std::string_view field) {
    return EraseInternal(&field, 1);
  }

  bool Erase(sds field) {
    return EraseInternal(field, 0);
  }

  /// @brief  Returns value of the key or nullptr if key not found.
  /// @param key
  /// @return sds
  std::optional<double> Find(std::string_view key);

  void* FindObj(std::string_view sv) {
    return FindInternal(&sv, Hash(&sv, 1), 1);
  }

  iterator begin() {
    return iterator{this, false};
  }

  iterator end() {
    return iterator{this, true};
  }

 private:
  uint64_t Hash(const void* obj, uint32_t cookie) const final;
  bool ObjEqual(const void* left, const void* right, uint32_t right_cookie) const final;
  size_t ObjectAllocSize(const void* obj) const final;
  uint32_t ObjExpireTime(const void* obj) const final;
  void ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) override;
  void ObjDelete(void* obj, bool has_ttl) const override;
  void* ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const final;
};

}  // namespace dfly


================================================
FILE: src/core/score_map_test.cc
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/score_map.h"

#include <mimalloc.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/mi_memory_resource.h"
#include "core/page_usage/page_usage_stats.h"

extern "C" {
#include "redis/zmalloc.h"
}

using namespace std;

namespace dfly {

class ScoreMapTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitTLStatelessAllocMR(PMR_NS::get_default_resource());
  }

  static void TearDownTestSuite() {
    mi_heap_collect(mi_heap_get_backing(), true);

    auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                       size_t block_size, void* arg) {
      LOG(ERROR) << "Unfreed allocations: block_size " << block_size
                 << ", allocated: " << area->used * block_size;
      return true;
    };

    mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit,
                         nullptr);
  }

  ScoreMapTest() : mi_alloc_(mi_heap_get_backing()) {
  }

  void SetUp() override {
    sm_.reset(new ScoreMap());
  }

  void TearDown() override {
    sm_.reset();
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
  }

  MiMemoryResource mi_alloc_;
  std::unique_ptr<ScoreMap> sm_;
};

TEST_F(ScoreMapTest, Basic) {
  EXPECT_TRUE(sm_->AddOrUpdate("foo", 5).second);
  EXPECT_EQ(5, sm_->Find("foo"));

  auto it = sm_->begin();
  EXPECT_STREQ("foo", it->first);
  EXPECT_EQ(5, it->second);
  ++it;

  EXPECT_TRUE(it == sm_->end());

  for (const auto& k_v : *sm_) {
    EXPECT_STREQ("foo", k_v.first);
    EXPECT_EQ(5, k_v.second);
  }

  size_t sz = sm_->ObjMallocUsed();
  EXPECT_FALSE(sm_->AddOrUpdate("foo", 17).second);
  EXPECT_EQ(sm_->ObjMallocUsed(), sz);

  it = sm_->begin();
  EXPECT_EQ(17, it->second);

  EXPECT_FALSE(sm_->AddOrSkip("foo", 31).second);
  EXPECT_EQ(17, it->second);
}

TEST_F(ScoreMapTest, EmptyFind) {
  EXPECT_EQ(nullopt, sm_->Find("bar"));
}

uint64_t total_wasted_memory = 0;

TEST_F(ScoreMapTest, ReallocIfNeeded) {
  auto build_str = [](size_t i) { return to_string(i) + string(131, 'a'); };

  auto count_waste = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                        size_t block_size, void* arg) {
    size_t used = block_size * area->used;
    total_wasted_memory += area->committed - used;
    return true;
  };

  for (size_t i = 0; i < 10'000; i++) {
    sm_->AddOrUpdate(build_str(i), i);
  }

  for (size_t i = 0; i < 10'000; i++) {
    if (i % 10 == 0)
      continue;
    sm_->Erase(build_str(i));
  }

  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_before = total_wasted_memory;

  size_t underutilized = 0;
  PageUsage page_usage{CollectPageStats::NO, 0.9};
  for (auto it = sm_->begin(); it != sm_->end(); ++it) {
    underutilized += page_usage.IsPageForObjectUnderUtilized(it->first);
    it.ReallocIfNeeded(&page_usage);
  }
  // Check there are underutilized pages
  CHECK_GT(underutilized, 0u);

  total_wasted_memory = 0;
  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_after = total_wasted_memory;

  // Check we waste significanlty less now
  EXPECT_GT(wasted_before, wasted_after * 2);

  ASSERT_EQ(sm_->UpperBoundSize(), 1000);
  for (size_t i = 0; i < 1000; i++) {
    auto res = sm_->Find(build_str(i * 10));
    ASSERT_EQ(res.has_value(), true);
    ASSERT_EQ((size_t)*res, i * 10);
  }
}

}  // namespace dfly


================================================
FILE: src/core/sds_utils.cc
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/sds_utils.h"

#include "base/endian.h"

extern "C" {
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

namespace dfly {

namespace {

inline char SdsReqType(size_t string_size) {
  if (string_size < 1 << 5)
    return SDS_TYPE_5;
  if (string_size < 1 << 8)
    return SDS_TYPE_8;
  if (string_size < 1 << 16)
    return SDS_TYPE_16;
  if (string_size < 1ll << 32)
    return SDS_TYPE_32;
  return SDS_TYPE_64;
}

inline int SdsHdrSize(char type) {
  switch (type & SDS_TYPE_MASK) {
    case SDS_TYPE_5:
      return sizeof(struct sdshdr5);
    case SDS_TYPE_8:
      return sizeof(struct sdshdr8);
    case SDS_TYPE_16:
      return sizeof(struct sdshdr16);
    case SDS_TYPE_32:
      return sizeof(struct sdshdr32);
    case SDS_TYPE_64:
      return sizeof(struct sdshdr64);
  }
  return 0;
}

}  // namespace

void SdsUpdateExpireTime(const void* obj, uint32_t time_at, uint32_t offset) {
  sds str = (sds)obj;
  char* valptr = str + sdslen(str) + 1;
  absl::little_endian::Store32(valptr + offset, time_at);
}

char* AllocSdsWithSpace(uint32_t strlen, uint32_t space) {
  size_t usable;
  char type = SdsReqType(strlen);
  int hdrlen = SdsHdrSize(type);

  char* ptr = (char*)zmalloc_usable(hdrlen + strlen + 1 + space, &usable);
  char* s = ptr + hdrlen;
  char* fp = s - 1;

  switch (type) {
    case SDS_TYPE_5: {
      *fp = type | (strlen << SDS_TYPE_BITS);
      break;
    }

    case SDS_TYPE_8: {
      SDS_HDR_VAR(8, s);
      sh->len = strlen;
      sh->alloc = strlen;
      *fp = type;
      break;
    }

    case SDS_TYPE_16: {
      SDS_HDR_VAR(16, s);
      sh->len = strlen;
      sh->alloc = strlen;
      *fp = type;
      break;
    }

    case SDS_TYPE_32: {
      SDS_HDR_VAR(32, s);
      sh->len = strlen;
      sh->alloc = strlen;
      *fp = type;
      break;
    }
    case SDS_TYPE_64: {
      SDS_HDR_VAR(64, s);
      sh->len = strlen;
      sh->alloc = strlen;
      *fp = type;
      break;
    }
  }

  s[strlen] = '\0';
  return s;
}

}  // namespace dfly


================================================
FILE: src/core/sds_utils.h
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>

namespace dfly {

// Allocates an sds string that has an additional space at the end that
// sds does is not aware of. Useful when you need to allocate immutable
// sds string (keys) with metadata attached to them.
char* AllocSdsWithSpace(uint32_t strlen, uint32_t space);

// Updates the expire time of the sds object. The offset is the number of bytes
void SdsUpdateExpireTime(const void* obj, uint32_t time_at, uint32_t offset);

}  // namespace dfly


================================================
FILE: src/core/search/CMakeLists.txt
================================================
gen_flex(lexer)
gen_bison(parser)

cur_gen_dir(gen_dir)

set_source_files_properties(${gen_dir}/parser.cc PROPERTIES
                            COMPILE_FLAGS "-Wno-maybe-uninitialized")
add_library(dfly_search_core ast_expr.cc base.cc hnsw_index.cc query_driver.cc search.cc
            indices.cc sort_indices.cc vector_utils.cc compressed_sorted_set.cc block_list.cc
            renewable_quota.cc range_tree.cc synonyms.cc
            ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)

target_link_libraries(dfly_search_core dfly_page_usage base fibers2 redis_lib absl::strings
  TRDP::reflex TRDP::uni-algo TRDP::hnswlib Boost::headers)

if(WITH_SIMSIMD)
  target_link_libraries(dfly_search_core TRDP::simsimd)
  target_compile_definitions(dfly_search_core PRIVATE
    WITH_SIMSIMD=1
    SIMSIMD_DYNAMIC_DISPATCH=1
    SIMSIMD_NATIVE_F16=$<IF:$<BOOL:${SIMSIMD_NATIVE_F16}>,1,0>
    SIMSIMD_NATIVE_BF16=$<IF:$<BOOL:${SIMSIMD_NATIVE_F16}>,1,0>)
endif()

helio_cxx_test(compressed_sorted_set_test dfly_search_core LABELS DFLY)
helio_cxx_test(block_list_test dfly_search_core LABELS DFLY)
helio_cxx_test(range_tree_test dfly_search_core absl::random_random LABELS DFLY)
helio_cxx_test(rax_tree_test redis_test_lib LABELS DFLY)
helio_cxx_test(search_parser_test dfly_search_core LABELS DFLY)
helio_cxx_test(search_test redis_test_lib dfly_search_core LABELS DFLY)
helio_cxx_test(mrmw_mutex_test redis_test_lib dfly_search_core fibers2 LABELS DFLY)

if(WITH_SIMSIMD)
  target_link_libraries(search_test TRDP::simsimd)
  target_compile_definitions(search_test PRIVATE
    WITH_SIMSIMD=1
    SIMSIMD_DYNAMIC_DISPATCH=1
    SIMSIMD_NATIVE_F16=$<IF:$<BOOL:${SIMSIMD_NATIVE_F16}>,1,0>
    SIMSIMD_NATIVE_BF16=$<IF:$<BOOL:${SIMSIMD_NATIVE_F16}>,1,0>)
endif()


================================================
FILE: src/core/search/ast_expr.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/ast_expr.h"

#include <absl/strings/numbers.h>

#include <algorithm>
#include <cmath>
#include <regex>

#include "base/logging.h"

using namespace std;

namespace dfly::search {

AstRangeNode::AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl)
    : lo{lo_excl ? nextafter(lo, hi) : lo}, hi{hi_excl ? nextafter(hi, lo) : hi} {
}

AstGeoNode::AstGeoNode(double lon, double lat, double radius, std::string unit)
    : lon(lon), lat(lat), radius(radius), unit(std::move(unit)) {
}

AstNegateNode::AstNegateNode(AstNode&& node) : node{make_unique<AstNode>(std::move(node))} {
}

AstLogicalNode::AstLogicalNode(AstNode&& l, AstNode&& r, LogicOp op) : op{op}, nodes{} {
  // If either node is already a logical node with the same op,
  // we can re-use it, as logical ops are associative.
  for (auto* node : {&l, &r}) {
    if (auto* ln = get_if<AstLogicalNode>(node); ln && ln->op == op) {
      *this = std::move(*ln);
      nodes.emplace_back(std::move(*(node == &l ? &r : &l)));
      return;
    }
  }

  nodes.emplace_back(std::move(l));
  nodes.emplace_back(std::move(r));
}

AstFieldNode::AstFieldNode(string field, AstNode&& node)
    : field{field.substr(1)}, node{make_unique<AstNode>(std::move(node))} {
}

AstTagsNode::AstTagsNode(TagValue tag) {
  tags = {std::move(tag)};
}

AstTagsNode::AstTagsNode(AstExpr&& l, TagValue tag) {
  DCHECK(holds_alternative<AstTagsNode>(l));
  auto& tags_node = get<AstTagsNode>(l);

  tags = std::move(tags_node.tags);
  tags.push_back(std::move(tag));
}

AstKnnNode::AstKnnNode(uint32_t limit, std::string_view field, OwnedFtVector vec,
                       std::string_view score_alias, std::optional<size_t> ef_runtime)
    : filter{nullptr},
      limit{limit},
      field{field.substr(1)},
      vec{std::move(vec)},
      score_alias{score_alias},
      ef_runtime{ef_runtime} {
}

AstKnnNode::AstKnnNode(AstNode&& filter, AstKnnNode&& self) {
  *this = std::move(self);
  this->filter = make_unique<AstNode>(std::move(filter));
}

AstVectorRangeNode::AstVectorRangeNode(std::string field, double radius, OwnedFtVector vec,
                                       std::string score_alias)
    : field{field.substr(1)},
      radius{radius},
      vec{std::move(vec)},
      score_alias{std::move(score_alias)} {
}

bool AstKnnNode::HasPreFilter() const {
  // If we have pre filter knn query should not hold filter variable. It will be
  // moved to SearchAlgorithm::query_ variable.
  return filter == nullptr;
}

}  // namespace dfly::search

namespace std {
ostream& operator<<(ostream& os, optional<size_t> o) {
  return os;
}

ostream& operator<<(ostream& os, dfly::search::AstTagsNode::TagValueProxy o) {
  return os;
}
}  // namespace std


================================================
FILE: src/core/search/ast_expr.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <algorithm>
#include <iostream>
#include <memory>
#include <ostream>
#include <variant>
#include <vector>

#include "core/search/base.h"
#include "core/search/tag_types.h"

namespace dfly {

namespace search {

struct AstNode;

// Matches all documents
struct AstStarNode {};

// Matches all documents where this field has a non-null value
struct AstStarFieldNode {};

template <TagType T> struct AstAffixNode {
  explicit AstAffixNode(std::string affix) : affix{std::move(affix)} {
  }

  std::string affix;
};

using AstTermNode = AstAffixNode<TagType::REGULAR>;
using AstPrefixNode = AstAffixNode<TagType::PREFIX>;
using AstSuffixNode = AstAffixNode<TagType::SUFFIX>;
using AstInfixNode = AstAffixNode<TagType::INFIX>;

// Matches numeric range
struct AstRangeNode {
  AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl);

  double lo, hi;
};

struct AstGeoNode {
  AstGeoNode(double lon, double lat, double radius, std::string unit);
  double lon, lat;
  double radius;
  std::string unit;
};

// Negates subtree
struct AstNegateNode {
  AstNegateNode(AstNode&& node);

  AstNegateNode(const AstNegateNode&) = delete;
  AstNegateNode& operator=(const AstNegateNode&) = delete;

  AstNegateNode(AstNegateNode&&) noexcept = default;
  AstNegateNode& operator=(AstNegateNode&&) noexcept = default;

  std::unique_ptr<AstNode> node;
};

// Applies logical operation to results of all sub-nodes
struct AstLogicalNode {
  enum LogicOp { AND, OR };

  // If either node is already a logical node with the same op, it'll be re-used.
  AstLogicalNode(AstNode&& l, AstNode&& r, LogicOp op);

  AstLogicalNode(const AstLogicalNode&) = delete;
  AstLogicalNode& operator=(const AstLogicalNode&) = delete;

  AstLogicalNode(AstLogicalNode&&) noexcept = default;
  AstLogicalNode& operator=(AstLogicalNode&&) noexcept = default;

  LogicOp op;
  std::vector<AstNode> nodes;
};

// Selects specific field for subtree
struct AstFieldNode {
  AstFieldNode(std::string field, AstNode&& node);

  AstFieldNode(const AstFieldNode&) = delete;
  AstFieldNode& operator=(const AstFieldNode&) = delete;

  AstFieldNode(AstFieldNode&&) noexcept = default;
  AstFieldNode& operator=(AstFieldNode&&) noexcept = default;

  std::string field;
  std::unique_ptr<AstNode> node;
};

// Stores a list of tags for a tag query
struct AstTagsNode {
  using TagValue = std::variant<AstTermNode, AstPrefixNode, AstSuffixNode, AstInfixNode>;

  struct TagValueProxy
      : public AstTagsNode::TagValue {  // bison needs it to be default constructible
    TagValueProxy() : AstTagsNode::TagValue(AstTermNode("")) {
    }
    template <TagType T> TagValueProxy(AstAffixNode<T> tv) : AstTagsNode::TagValue(std::move(tv)) {
    }
  };

  AstTagsNode(TagValue);
  AstTagsNode(AstNode&& l, TagValue);

  std::vector<TagValue> tags;
};

// Applies nearest neighbor search to the final result set
struct AstKnnNode {
  AstKnnNode() = default;
  AstKnnNode(uint32_t limit, std::string_view field, OwnedFtVector vec,
             std::string_view score_alias, std::optional<size_t> ef_runtime);

  AstKnnNode(AstNode&& sub, AstKnnNode&& self);

  AstKnnNode(const AstKnnNode&) = delete;
  AstKnnNode& operator=(const AstKnnNode&) = delete;

  AstKnnNode(AstKnnNode&&) noexcept = default;
  AstKnnNode& operator=(AstKnnNode&&) noexcept = default;

  friend std::ostream& operator<<(std::ostream& stream, const AstKnnNode& matrix) {
    return stream;
  }

  std::unique_ptr<AstNode> filter;
  size_t limit;
  std::string field;
  OwnedFtVector vec;
  std::string score_alias;
  std::optional<float> ef_runtime;

  bool HasPreFilter() const;
};

// Applies vector range search: returns all docs with distance(vec, doc_vec) <= radius
struct AstVectorRangeNode {
  AstVectorRangeNode() = default;
  AstVectorRangeNode(std::string field, double radius, OwnedFtVector vec, std::string score_alias);

  AstVectorRangeNode(const AstVectorRangeNode&) = delete;
  AstVectorRangeNode& operator=(const AstVectorRangeNode&) = delete;

  AstVectorRangeNode(AstVectorRangeNode&&) noexcept = default;
  AstVectorRangeNode& operator=(AstVectorRangeNode&&) noexcept = default;

  friend std::ostream& operator<<(std::ostream& stream, const AstVectorRangeNode& /*node*/) {
    return stream;
  }

  std::string field;
  double radius;
  OwnedFtVector vec;
  std::string score_alias;
};

using NodeVariants =
    std::variant<std::monostate, AstStarNode, AstStarFieldNode, AstTermNode, AstPrefixNode,
                 AstSuffixNode, AstInfixNode, AstRangeNode, AstNegateNode, AstLogicalNode,
                 AstFieldNode, AstTagsNode, AstKnnNode, AstGeoNode, AstVectorRangeNode>;

struct AstNode : public NodeVariants {
  using variant::variant;

  AstNode(const AstNode&) = delete;
  AstNode& operator=(const AstNode&) = delete;

  AstNode(AstNode&&) noexcept = default;
  AstNode& operator=(AstNode&&) noexcept = default;

  friend std::ostream& operator<<(std::ostream& stream, const AstNode& matrix) {
    return stream;
  }

  const NodeVariants& Variant() const& {
    return *this;
  }
};

using AstExpr = AstNode;

}  // namespace search
}  // namespace dfly

namespace std {
ostream& operator<<(ostream& os, optional<size_t> o);
ostream& operator<<(ostream& os, dfly::search::AstTagsNode::TagValueProxy o);
}  // namespace std


================================================
FILE: src/core/search/base.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/base.h"

#include <absl/strings/numbers.h>

namespace dfly::search {

std::string_view QueryParams::operator[](std::string_view name) const {
  if (auto it = params.find(name); it != params.end())
    return it->second;
  return "";
}

std::string& QueryParams::operator[](std::string_view k) {
  return params[k];
}

std::optional<double> ParseNumericField(std::string_view value) {
  double value_as_double;
  if (absl::SimpleAtod(value, &value_as_double) && std::isfinite(value_as_double))
    return value_as_double;
  return std::nullopt;
}

DefragmentResult& DefragmentResult::Merge(DefragmentResult&& other) {
  quota_depleted |= other.quota_depleted;
  objects_moved += other.objects_moved;
  return *this;
}

}  // namespace dfly::search


================================================
FILE: src/core/search/base.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/container/inlined_vector.h>

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

namespace dfly {
class PageUsage;
}

namespace dfly::search {

struct DefragmentResult {
  bool quota_depleted{false};
  size_t objects_moved{0};
  DefragmentResult& Merge(DefragmentResult&& other);
};

using DocId = uint32_t;
using GlobalDocId = uint64_t;
using ShardId = uint16_t;

inline GlobalDocId CreateGlobalDocId(ShardId shard_id, DocId local_doc_id) {
  return ((uint64_t)shard_id << 32) | local_doc_id;
}

inline std::pair<ShardId, DocId> DecomposeGlobalDocId(GlobalDocId id) {
  return {(id >> 32), (id)&0xFFFFFFFF};
}

enum class VectorSimilarity { L2, IP, COSINE };

using OwnedFtVector = std::pair<std::unique_ptr<float[]>, size_t /* dimension (size) */>;
using BorrowedFtVector = const char*;

// Query params represent named parameters for queries supplied via PARAMS.
struct QueryParams {
  std::string_view operator[](std::string_view name) const;
  std::string& operator[](std::string_view k);

  size_t Size() const {
    return params.size();
  }

 private:
  absl::flat_hash_map<std::string, std::string> params;
};

// Base class for optional search filters

struct AstNode;

struct OptionalFilterBase {
  virtual bool IsEmpty() const = 0;
  virtual AstNode Node(std::string field) = 0;
  virtual ~OptionalFilterBase() = default;
};

using OptionalFilters =
    absl::flat_hash_map<std::string /*field*/, std::unique_ptr<OptionalFilterBase> /* filter */>;

// Values are either sortable as doubles or strings, or not sortable at all.
using SortableValue = std::variant<std::monostate, double, std::string>;

// Interface for accessing document values with different data structures underneath.
struct DocumentAccessor {
  using VectorInfo = std::variant<search::OwnedFtVector, search::BorrowedFtVector>;
  using StringList = absl::InlinedVector<std::string_view, 1>;
  using NumsList = absl::InlinedVector<double, 1>;

  virtual ~DocumentAccessor() = default;

  /* Returns nullopt if the specified field is not a list of strings */
  virtual std::optional<StringList> GetStrings(std::string_view active_field) const = 0;

  /* Returns nullopt if the specified field is not a vector */
  virtual std::optional<VectorInfo> GetVector(std::string_view active_field, size_t dim) const = 0;

  /* Return nullopt if the specified field is not a list of doubles */
  virtual std::optional<NumsList> GetNumbers(std::string_view active_field) const = 0;

  /* Same as GetStrings, but also supports boolean values */
  virtual std::optional<StringList> GetTags(std::string_view active_field) const = 0;
};

// Base class for type-specific indices.
//
// Queries should be done directly on subclasses with their distinc
// query functions. All results for all index types should be sorted.
struct BaseIndex {
  virtual ~BaseIndex() = default;

  // Returns true if the document was added / indexed
  virtual bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) = 0;
  virtual void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) = 0;

  // Returns documents that have non-null values for this field (used for @field:* queries)
  // Result must be sorted
  virtual std::vector<DocId> GetAllDocsWithNonNullValues() const = 0;

  /* Called at the end of indexes rebuilding after all initial Add calls are done.
     Some indices may need to finalize internal structures. See RangeTree for example. */
  virtual void FinalizeInitialization() {
  }

  // Defragments the index by moving objects in underutilized pages to the current malloc page.
  virtual DefragmentResult Defragment(PageUsage* page_usage) {
    return DefragmentResult{.quota_depleted = false, .objects_moved = 0};
  }
};

// Base class for type-specific sorting indices.
struct BaseSortIndex : BaseIndex {
  virtual SortableValue Lookup(DocId doc) const = 0;
  virtual std::vector<SortableValue> Sort(std::vector<DocId>* ids, size_t limit,
                                          bool desc) const = 0;
};

/* Used in iterators of inverse indices.
   It is used to mark iterators that can be seeked to doc id that is greater than or equal to
   the specified value (method name is SeekGE(DocId min_doc_id)).
   This is used to optimize merging of results from different indices.
   See index_result.h for more details. */
struct SeekableTag {};

template <typename Iterator> void BasicSeekGE(DocId min_doc_id, const Iterator& end, Iterator* it);

/* Used for converting field values to double. Returns std::nullopt if the conversion fails */
std::optional<double> ParseNumericField(std::string_view value);

/* Temporary method to create an empty std::optional<InlinedVector> in DocumentAccessor::GetString
   and DocumentAccessor::GetNumbers methods. The problem is that due to internal implementation
   details of absl::InlineVector, we are getting a -Wmaybe-uninitialized compiler warning. To
   suppress this false warning, we temporarily disable it around this block of code using GCC
   diagnostic directives. */
template <typename InlinedVector> std::optional<InlinedVector> EmptyAccessResult() {
#if !defined(__clang__)
  // GCC 13.1 throws spurious warnings around this code.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

  return InlinedVector{};

#if !defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Implementation
/******************************************************************/
namespace details {
inline size_t GetHighestPowerOfTwo(size_t n) {
  static constexpr size_t kBitsNumber = sizeof(size_t) * 8;
  return size_t(1) << (kBitsNumber - 1 - __builtin_clzl(n));
}
}  // namespace details

template <typename Iterator> void BasicSeekGE(DocId min_doc_id, const Iterator& end, Iterator* it) {
  using Category = typename std::iterator_traits<Iterator>::iterator_category;

  auto extract_doc_id = [](const auto& value) {
    using T = std::decay_t<decltype(value)>;
    if constexpr (std::is_same_v<T, DocId>) {
      return value;
    } else {
      return value.first;
    }
  };

  if constexpr (std::is_base_of_v<std::random_access_iterator_tag, Category>) {
    size_t length = std::distance(*it, end);
    for (size_t step = details::GetHighestPowerOfTwo(length); step > 0; step >>= 1) {
      if (step < length) {
        auto next_it = *it + step;
        if (extract_doc_id(*next_it) < min_doc_id) {
          *it = next_it;
          length -= step;
        }
      }
    }
  }

  while (*it != end && extract_doc_id(**it) < min_doc_id) {
    ++(*it);
  }
}

}  // namespace dfly::search


================================================
FILE: src/core/search/block_list.cc
================================================
#include "core/search/block_list.h"

#include "core/page_usage/page_usage_stats.h"

namespace {

template <typename T> bool DefragmentVector(PMR_NS::vector<T>& vec, dfly::PageUsage* page_usage) {
  if (vec.empty() || !page_usage->IsPageForObjectUnderUtilized(vec.data())) {
    return false;
  }

  PMR_NS::vector<T> new_vec(vec.get_allocator());
  new_vec.reserve(vec.size());
  for (auto&& element : vec) {
    new_vec.push_back(std::move(element));
  }
  vec = std::move(new_vec);
  return true;
}

}  // namespace

namespace dfly::search {

using namespace std;

SplitResult Split(BlockList<SortedVector<std::pair<DocId, double>>>&& block_list) {
  using Entry = std::pair<DocId, double>;
  DCHECK(!block_list.Empty());

  const size_t elements_count = block_list.Size();

  // Extract values to find median
  std::vector<double> entries_values(elements_count);
  size_t index = 0;
  for (const Entry& entry : block_list) {
    entries_values[index++] = entry.second;
  }

  // Find median value
  std::nth_element(entries_values.begin(), entries_values.begin() + elements_count / 2,
                   entries_values.end());
  double median_value = entries_values[elements_count / 2];

  /* Now we need to split entries into two parts, left and right, so that:
   1) left has values < median_value
   2) right has values >= median_value
   3) both parts have approximately the same number of elements

   To achieve this, we first split entries into three parts: < median_value (left blocklist), ==
   median_value (median_entries), > median_value (righ blocklist). Then we add == median_value part
   to the smaller of the two parts (< or >). This guarantees that both parts have approximately the
   same number of elements */
  BlockList<SortedVector<Entry>> left(block_list.blocks_.get_allocator().resource(),
                                      block_list.block_size_);
  BlockList<SortedVector<Entry>> right(block_list.blocks_.get_allocator().resource(),
                                       block_list.block_size_);
  absl::InlinedVector<Entry, 1> median_entries;

  left.ReserveBlocks(block_list.blocks_.size() / 2 + 1);
  right.ReserveBlocks(block_list.blocks_.size() / 2 + 1);

  double lmin = std::numeric_limits<double>::infinity(), rmin = lmin;
  double lmax = -std::numeric_limits<double>::infinity(), rmax = lmax;

  for (const Entry& entry : block_list) {
    if (entry.second < median_value) {
      left.PushBack(entry);
      lmin = std::min(lmin, entry.second);
      lmax = std::max(lmax, entry.second);
    } else if (entry.second > median_value) {
      right.PushBack(entry);
      rmin = std::min(rmin, entry.second);
      rmax = std::max(rmax, entry.second);
    } else {
      median_entries.push_back(entry);
    }
  }
  block_list.Clear();

  if (left.Size() < right.Size()) {
    // If left is smaller, we can add median entries to it
    // We need to change median value to the right part and update lmax
    lmax = median_value;
    lmin = std::min(lmin, median_value);
    median_value = rmin;
    for (const auto& entry : median_entries) {
      left.Insert(entry);
    }
  } else {
    // If right part is smaller, we can add median entries to it
    // Median value is still the same
    rmax = std::max(rmax, median_value);
    for (const auto& entry : median_entries) {
      right.Insert(entry);
    }
  }

  return {std::move(left), std::move(right), median_value, lmin, lmax, rmax};
}

template <typename C> bool BlockList<C>::Insert(ElementType t) {
  auto block = FindBlock(t);
  if (block == blocks_.end())
    block = blocks_.insert(blocks_.end(), C{blocks_.get_allocator().resource()});

  if (!block->Insert(std::move(t)))
    return false;

  size_++;
  TrySplit(block);
  return true;
}

template <typename C> bool BlockList<C>::PushBack(ElementType t) {
  // If the last block is full, after insert we will need to split it
  // So we can prevent split by creating a new block and inserting there
  if (blocks_.empty() || ShouldSplit(blocks_.back().Size() + 1)) {
    blocks_.insert(blocks_.end(), C{blocks_.get_allocator().resource()});
  }

  if (!blocks_.back().Insert(std::move(t)))
    return false;

  size_++;
  return true;
}

template <typename C> bool BlockList<C>::Remove(ElementType t) {
  if (auto block = FindBlock(t); block != blocks_.end() && block->Remove(std::move(t))) {
    size_--;
    TryMerge(block);
    return true;
  }

  return false;
}

template <typename Container>
DefragmentResult BlockList<Container>::Defragment(PageUsage* page_usage) {
  if (page_usage->QuotaDepleted()) {
    return DefragmentResult{.quota_depleted = true, .objects_moved = 0};
  }

  DefragmentResult result;
  if (DefragmentVector(blocks_, page_usage)) {
    result.objects_moved += 1;
  }

  for (Container& block : blocks_) {
    if (result.Merge(block.Defragment(page_usage)).quota_depleted) {
      break;
    }
  }
  return result;
}

template <typename C> typename BlockList<C>::BlockIt BlockList<C>::FindBlock(const ElementType& t) {
  DCHECK(blocks_.empty() || !blocks_.back().Empty());

  if (!blocks_.empty() && t >= *blocks_.back().begin())
    return --blocks_.end();

  // Find first block that can't contain t
  auto it = std::upper_bound(blocks_.begin(), blocks_.end(), t,
                             [](const ElementType& t, const C& l) { return *l.begin() > t; });

  // Move to previous if possible
  if (it != blocks_.begin())
    --it;

  DCHECK(it == blocks_.begin() || it->Size() * 2 >= block_size_);
  DCHECK(it == blocks_.end() || it->Size() <= 2 * block_size_);
  return it;
}

template <typename C> bool BlockList<C>::ShouldSplit(size_t block_size) const {
  return block_size >= block_size_ * 2;
}

template <typename C> void BlockList<C>::TryMerge(BlockIt block) {
  if (block->Size() == 0) {
    blocks_.erase(block);
    return;
  }

  if (block->Size() >= block_size_ / 2 || block == blocks_.begin())
    return;

  // Merge strictly right with left to benefit from tail insert optimizations
  size_t idx = std::distance(blocks_.begin(), block);
  blocks_[idx - 1].Merge(std::move(*block));
  blocks_.erase(block);

  TrySplit(blocks_.begin() + (idx - 1));  // to not overgrow it
}

template <typename C> void BlockList<C>::TrySplit(BlockIt block) {
  if (!ShouldSplit(block->Size() + 1))
    return;

  auto [left, right] = std::move(*block).Split();

  *block = std::move(right);
  blocks_.insert(block, std::move(left));
}

template <typename C> void BlockList<C>::ReserveBlocks(size_t n) {
  blocks_.reserve(n);
}

template <typename C>
typename BlockList<C>::BlockListIterator& BlockList<C>::BlockListIterator::operator++() {
  ++block_it;
  if (block_it == block_end) {
    ++it;
    if (it != it_end) {
      block_it = it->begin();
      block_end = it->end();
    } else {
      block_it = {};
      block_end = {};
    }
  }
  return *this;
}

template <typename C> void BlockList<C>::BlockListIterator::SeekGE(DocId min_doc_id) {
  if (it == it_end) {
    block_it = {};
    block_end = {};
    return;
  }

  auto extract_doc_id = [](const auto& value) {
    using T = std::decay_t<decltype(value)>;
    if constexpr (std::is_same_v<T, DocId>) {
      return value;
    } else {
      return value.first;
    }
  };

  auto needed_block = [&](const auto& it) {
    return it->begin() != it->end() && min_doc_id <= extract_doc_id(it->Back());
  };

  // Choose the first block that has the last element >= min_doc_id
  if (!needed_block(it)) {
    while (++it != it_end) {
      if (needed_block(it)) {
        block_it = it->begin();
        block_end = it->end();
        break;
      }
    }
    if (it == it_end) {
      block_it = {};
      block_end = {};
      return;
    }
  }

  BasicSeekGE(min_doc_id, block_end, &block_it);
  DCHECK(block_it != block_end && min_doc_id <= extract_doc_id(*block_it));
}

template class BlockList<CompressedSortedSet>;
template class BlockList<SortedVector<DocId>>;
template class BlockList<SortedVector<std::pair<DocId, double>>>;

template <typename T> bool SortedVector<T>::Insert(T t) {
  if (entries_.empty() || t > entries_.back()) {
    entries_.push_back(t);
    return true;
  }

  auto it = std::lower_bound(entries_.begin(), entries_.end(), t);
  if (it != entries_.end() && *it == t)
    return false;

  entries_.insert(it, t);
  return true;
}

template <typename T> bool SortedVector<T>::Remove(T t) {
  auto it = std::lower_bound(entries_.begin(), entries_.end(), t);
  if (it != entries_.end() && *it == t) {
    entries_.erase(it);
    return true;
  }
  return false;
}

template <typename T> void SortedVector<T>::Merge(SortedVector&& other) {
  // NLog compexity in theory, but in practice used only to merge with larger values.
  // Tail insert optimization makes it linear
  entries_.reserve(entries_.size() + other.entries_.size());
  for (T& t : other.entries_)
    Insert(std::move(t));
}

template <typename T> std::pair<SortedVector<T>, SortedVector<T>> SortedVector<T>::Split() && {
  PMR_NS::vector<T> tail(entries_.begin() + entries_.size() / 2, entries_.end());
  entries_.resize(entries_.size() / 2);

  return std::make_pair(std::move(*this), SortedVector<T>{std::move(tail)});
}

template <typename T> DefragmentResult SortedVector<T>::Defragment(PageUsage* page_usage) {
  if (DefragmentVector(entries_, page_usage)) {
    return DefragmentResult{.quota_depleted = false, .objects_moved = 1};
  }
  return DefragmentResult{};
}

template class SortedVector<DocId>;
template class SortedVector<std::pair<DocId, double>>;

}  // namespace dfly::search


================================================
FILE: src/core/search/block_list.h
================================================
#pragma once

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

#include "core/search/base.h"
#include "core/search/compressed_sorted_set.h"

namespace dfly::search {

// Forward declarations
struct SplitResult;
template <typename Container> class BlockList;
template <typename T> class SortedVector;

/* Split into two blocks, left and right, so that both blocks have approximately the same number
   of elements. Returns median value of the split. Garantees that median present in the right
   block and not present in the left block. Does not work for empty BlockList. */
// TODO: Move to RangeTree logic
SplitResult Split(BlockList<SortedVector<std::pair<DocId, double>>>&& result);

// BlockList is a container wrapper for CompressedSortedSet / vector<DocId>
// to divide the full sorted id range into separate blocks. This reduces modification
// complexity from O(N) to O(logN + K), where K is the max block size.
//
// It tries to balance block sizes in the range [block_size / 2, block_size * 2]
// by splitting or merging nodes when needed.
// container must have declare ElementType typename
template <typename Container /* underlying container */> class BlockList {
 private:
  using BlockIt = typename PMR_NS::vector<Container>::iterator;
  using ConstBlockIt = typename PMR_NS::vector<Container>::const_iterator;
  using ElementType = typename Container::ElementType;

 public:
  BlockList(PMR_NS::memory_resource* mr, size_t block_size = 1000)
      : block_size_{block_size}, blocks_(mr) {
  }

  BlockList(const BlockList& other) = default;

  BlockList(BlockList&& other) noexcept {
    // Consider not to do move if block_size_ is different
    // DCHECK(block_size_ == other.block_size_);
    // It seams there is bugs in BaseStringIndex
    // because this check fails for it

    size_ = other.size_;
    blocks_ = std::move(other.blocks_);
    other.Clear();
  }

  BlockList& operator=(const BlockList& other) = delete;
  BlockList& operator=(BlockList&& other) = delete;

  ~BlockList() = default;

  // Insert element, returns true if inserted, false if already present.
  bool Insert(ElementType t);
  bool PushBack(ElementType t);

  // Remove element, returns true if removed, false if not found.
  bool Remove(ElementType t);

  size_t Size() const {
    return size_;
  }

  size_t size() const {
    return size_;
  }

  bool Empty() const {
    return size_ == 0;
  }

  void Clear() {
    size_ = 0;
    blocks_.clear();
  }

  struct BlockListIterator : public SeekableTag {
    // To make it work with std container contructors
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = ElementType;
    using pointer = ElementType*;
    using reference = ElementType&;

    ElementType operator*() const {
      return *block_it;
    }

    BlockListIterator& operator++();
    void SeekGE(DocId min_doc_id);

    friend class BlockList;

    bool operator==(const BlockListIterator& other) const {
      return it == other.it && block_it == other.block_it;
    }

    bool operator!=(const BlockListIterator& other) const {
      return !operator==(other);
    }

   private:
    BlockListIterator(ConstBlockIt begin, ConstBlockIt end) : it(begin), it_end(end) {
      if (it != it_end) {
        block_it = it->begin();
        block_end = it->end();
      }
    }

    ConstBlockIt it, it_end;
    typename Container::iterator block_it, block_end;
  };

  BlockListIterator begin() const {
    return BlockListIterator{blocks_.begin(), blocks_.end()};
  }

  BlockListIterator end() const {
    return BlockListIterator{blocks_.end(), blocks_.end()};
  }

  DefragmentResult Defragment(PageUsage* page_usage);

 private:
  // Find block that should contain t. Returns end() only if empty
  BlockIt FindBlock(const ElementType& t);

  bool ShouldSplit(size_t block_size) const;

  void TryMerge(BlockIt block);  // If needed, merge with previous block
  void TrySplit(BlockIt block);  // If needed, split into two blocks

  void ReserveBlocks(size_t n);

  friend SplitResult Split(BlockList<SortedVector<std::pair<DocId, double>>>&& block_list);

 private:
  const size_t block_size_ = 1000;
  size_t size_ = 0;
  PMR_NS::vector<Container> blocks_;
};

// Supports Insert and Remove operations for keeping a sorted vector internally.
// Wrapper to use vectors with BlockList
template <typename T> class SortedVector {
 public:
  using ElementType = T;

  explicit SortedVector(PMR_NS::memory_resource* mr) : entries_(mr) {
  }

  bool Insert(T t);
  bool Remove(T t);
  void Merge(SortedVector<T>&& other);
  std::pair<SortedVector<T>, SortedVector<T>> Split() &&;

  T& operator[](size_t idx) {
    return entries_[idx];
  }

  const T& operator[](size_t idx) const {
    return entries_[idx];
  }

  size_t Size() const {
    return entries_.size();
  }

  bool Empty() const {
    return entries_.empty();
  }

  void Clear() {
    entries_.clear();
  }

  const T& Back() const {
    return entries_.back();
  }

  using iterator = typename PMR_NS::vector<T>::const_iterator;

  iterator begin() const {
    return entries_.cbegin();
  }

  iterator end() const {
    return entries_.cend();
  }

  DefragmentResult Defragment(PageUsage* page_usage);

 private:
  SortedVector(PMR_NS::vector<T>&& v) : entries_{std::move(v)} {
  }

  PMR_NS::vector<T> entries_;
};

extern template class SortedVector<DocId>;
extern template class SortedVector<std::pair<DocId, double>>;

extern template class BlockList<CompressedSortedSet>;
extern template class BlockList<SortedVector<DocId>>;
extern template class BlockList<SortedVector<std::pair<DocId, double>>>;

// Used by Split method
struct SplitResult {
  using Container = BlockList<SortedVector<std::pair<DocId, double>>>;

  Container left;
  Container right;

  // Median value of split, used as minimum value of right block
  double median;

  // Min/max values of left (lmin, lmax) and right (rmin=median, rmax) blocks
  double lmin, lmax, rmax;
};
}  // namespace dfly::search


================================================
FILE: src/core/search/block_list_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/block_list.h"

#include <absl/container/btree_set.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <random>
#include <set>

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly::search {

using namespace std;

template <typename C> class TemplatedBlockListTest : public testing::Test {
 private:
  using NumericType = long long;

 public:
  using ElementType = typename C::ElementType;

  auto Make() {
    // Create list with small block size to test blocking mechanism more extensively
    return BlockList<C>{PMR_NS::get_default_resource(), 10};
  }

  auto AddNewBlockListElement(DocId doc_id) {
    if constexpr (std::is_same_v<ElementType, DocId>) {
      return ElementType{doc_id};
    } else {
      static_assert(std::is_same_v<ElementType, std::pair<DocId, double>>,
                    "Unsupported ElementType for BlockListTest");

      const NumericType number = dist_(rnd_);
      id_to_values_[doc_id].push_back(number);
      return ElementType{doc_id, static_cast<double>(number)};
    }
  }

  auto RemoveBlockListElement(DocId doc_id) {
    if constexpr (std::is_same_v<ElementType, DocId>) {
      return ElementType{doc_id};
    } else {
      static_assert(std::is_same_v<ElementType, std::pair<DocId, double>>,
                    "Unsupported ElementType for BlockListTest");

      const NumericType number = id_to_values_[doc_id].back();
      id_to_values_[doc_id].pop_back();
      return ElementType{doc_id, static_cast<double>(number)};
    }
  }

  DocId GetDocId(const ElementType& element) {
    if constexpr (std::is_same_v<ElementType, DocId>) {
      return element;
    } else {
      static_assert(std::is_same_v<ElementType, std::pair<DocId, double>>,
                    "Unsupported ElementType for GetDocId");
      return element.first;
    }
  }

 private:
  // Used to save doubles for std::pair<DocId, double>
  std::unordered_map<DocId, std::vector<NumericType>> id_to_values_;

  // Used to generate random numbers for std::pair<DocId, double>
  default_random_engine rnd_;
  uniform_int_distribution<NumericType> dist_{std::numeric_limits<NumericType>::min(),
                                              std::numeric_limits<NumericType>::max()};
};

using ContainerTypes = ::testing::Types<CompressedSortedSet, SortedVector<DocId>,
                                        SortedVector<std::pair<DocId, double>>>;
TYPED_TEST_SUITE(TemplatedBlockListTest, ContainerTypes);

TYPED_TEST(TemplatedBlockListTest, LoopMidInsertErase) {
  using ElementType = typename TypeParam::ElementType;

  const size_t kNumElements = 50;
  auto list = this->Make();

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Insert(this->AddNewBlockListElement(i));
    list.Insert(this->AddNewBlockListElement(i + kNumElements / 2));
  }

  vector<ElementType> out(list.begin(), list.end());
  ASSERT_EQ(list.Size(), kNumElements);
  ASSERT_EQ(out.size(), kNumElements);
  for (size_t i = 0; i < kNumElements; i++)
    ASSERT_EQ(this->GetDocId(out[i]), i);

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Remove(this->RemoveBlockListElement(i));
    list.Remove(this->RemoveBlockListElement(i + kNumElements / 2));
  }

  out = {list.begin(), list.end()};
  EXPECT_EQ(out.size(), 0u);
}

TYPED_TEST(TemplatedBlockListTest, InsertReverseRemoveSteps) {
  using ElementType = typename TypeParam::ElementType;

  const size_t kNumElements = 1000;
  auto list = this->Make();

  for (size_t i = 0; i < kNumElements; i++) {
    list.Insert(this->AddNewBlockListElement(kNumElements - i - 1));
  }

  for (size_t deleted_pref = 0; deleted_pref < 10; deleted_pref++) {
    vector<ElementType> out{list.begin(), list.end()};
    reverse(out.begin(), out.end());

    EXPECT_EQ(out.size(), kNumElements / 10 * (10 - deleted_pref));
    for (size_t i = 0; i < kNumElements; i++) {
      if (i % 10 >= deleted_pref) {
        EXPECT_EQ(this->GetDocId(out.back()), DocId(i));
        out.pop_back();
      }
    }

    for (size_t i = 0; i < kNumElements; i++) {
      if (i % 10 == deleted_pref)
        list.Remove(this->RemoveBlockListElement(i));
    }
  }

  EXPECT_EQ(list.Size(), 0u);
}

TYPED_TEST(TemplatedBlockListTest, RandomNumbers) {
  using ElementType = typename TypeParam::ElementType;

  const size_t kNumIterations = 1'000;
  auto list = this->Make();
  std::set<ElementType> list_copy;

  for (size_t i = 0; i < kNumIterations; i++) {
    if (list_copy.size() > 100 && rand() % 5 == 0) {
      auto it = list_copy.begin();
      std::advance(it, rand() % list_copy.size());
      list.Remove(*it);
      list_copy.erase(it);
    } else {
      const ElementType t = this->AddNewBlockListElement(rand() % 1'000'000);
      list.Insert(t);
      list_copy.insert(t);
    }

    ASSERT_TRUE(std::equal(list.begin(), list.end(), list_copy.begin(), list_copy.end()));
  }
}

class BlockListTest : public testing::Test {
 protected:
};

TEST_F(BlockListTest, Split) {
  BlockList<SortedVector<std::pair<DocId, double>>> bl{PMR_NS::get_default_resource(), 20};

  const size_t max_value = 100.0;
  const size_t step = 23.0;
  size_t value = max_value;
  for (size_t i = 0; i < 100; i++) {
    bl.Insert({i, static_cast<double>(value)});
    value = (max_value + value - step) % max_value;
  }

  auto split_result = Split(std::move(bl));
  auto& left = split_result.left;
  auto& right = split_result.right;

  EXPECT_EQ(left.Size(), 50);
  EXPECT_EQ(right.Size(), 50);

  // Test that all values in the left part are less than or equal to max_value
  for (const auto& [_, left_value] : left) {
    for (const auto& [__, right_value] : right) {
      EXPECT_LE(left_value, right_value);
    }
  }

  double median = split_result.median;

  // Test that left part values do not have this median
  for (const auto& [_, left_value] : left) {
    EXPECT_NE(left_value, median);
  }

  // Test that right part values do have this median
  bool is_median_found = false;
  for (const auto& [_, right_value] : right) {
    if (right_value == median) {
      is_median_found = true;
      break;
    }
  }

  EXPECT_TRUE(is_median_found);

  // Test that doc_ids in both parts are sorted
  DocId prev_doc_id = std::numeric_limits<DocId>::min();
  for (const auto& [doc_id, _] : left) {
    EXPECT_GE(doc_id, prev_doc_id);
    prev_doc_id = doc_id;
  }

  prev_doc_id = std::numeric_limits<DocId>::min();
  for (const auto& [doc_id, _] : right) {
    EXPECT_GE(doc_id, prev_doc_id);
    prev_doc_id = doc_id;
  }
}

TEST_F(BlockListTest, SplitHard) {
  // First test 70 values on the left and 30 on the right
  BlockList<SortedVector<std::pair<DocId, double>>> bl1{PMR_NS::get_default_resource(), 20};

  for (size_t i = 0; i < 70; i++) {
    bl1.Insert({i, 1.0});
  }
  for (size_t i = 70; i < 100; i++) {
    bl1.Insert({i, 2.0});
  }

  auto split_result1 = Split(std::move(bl1));

  EXPECT_EQ(split_result1.median, 2.0);
  EXPECT_EQ(split_result1.left.Size(), 70u);
  EXPECT_EQ(split_result1.right.Size(), 30u);

  for (const auto& [_, value] : split_result1.left) {
    EXPECT_EQ(value, 1.0);
  }

  for (const auto& [_, value] : split_result1.right) {
    EXPECT_EQ(value, 2.0);
  }

  // Now test 30 values on the left and 70 on the right
  BlockList<SortedVector<std::pair<DocId, double>>> bl2{PMR_NS::get_default_resource(), 20};
  for (size_t i = 0; i < 30; i++) {
    bl2.Insert({i, 1.0});
  }
  for (size_t i = 30; i < 100; i++) {
    bl2.Insert({i, 2.0});
  }
  auto split_result2 = Split(std::move(bl2));

  EXPECT_EQ(split_result2.median, 2.0);
  EXPECT_EQ(split_result2.left.Size(), 30u);
  EXPECT_EQ(split_result2.right.Size(), 70u);

  for (const auto& [_, value] : split_result2.left) {
    EXPECT_EQ(value, 1.0);
  }

  for (const auto& [_, value] : split_result2.right) {
    EXPECT_EQ(value, 2.0);
  }
}

TEST_F(BlockListTest, SplitSingleDoubleValue) {
  BlockList<SortedVector<std::pair<DocId, double>>> bl{PMR_NS::get_default_resource(), 20};

  for (size_t i = 0; i < 100; i++) {
    bl.Insert({i, 1.0});
  }

  auto split_result = Split(std::move(bl));
  auto& left = split_result.left;
  auto& right = split_result.right;

  EXPECT_EQ(left.Size(), 0u);
  EXPECT_EQ(right.Size(), 100u);
  EXPECT_EQ(split_result.median, 1.0);
}

static void BM_Erase90PctTail(benchmark::State& state) {
  BlockList<CompressedSortedSet> bl{PMR_NS::get_default_resource()};

  unsigned size = state.range(0);
  for (size_t i = 0; i < size; i++)
    bl.Insert(i);

  size_t base = size / 10;
  size_t i = 0;
  while (state.KeepRunning()) {
    benchmark::DoNotOptimize(bl.Remove(base + i));
    i = (i + 1) % (size * 9 / 10);
  }
}

BENCHMARK(BM_Erase90PctTail)->Args({100'000});

}  // namespace dfly::search


================================================
FILE: src/core/search/compressed_sorted_set.cc
================================================
#include "core/search/compressed_sorted_set.h"

#include <array>
#include <bitset>

#include "absl/types/span.h"
#include "base/flit.h"
#include "base/logging.h"

namespace dfly::search {

using namespace std;

namespace {

using VarintBuffer = array<uint8_t, sizeof(CompressedSortedSet::IntType) * 3>;

}  // namespace

CompressedSortedSet::CompressedSortedSet(PMR_NS::memory_resource* mr) : diffs_{mr} {
}

CompressedSortedSet::ConstIterator::ConstIterator(const CompressedSortedSet& list)
    : stash_{}, diffs_{list.diffs_} {
  ReadNext();
}

CompressedSortedSet::IntType CompressedSortedSet::ConstIterator::operator*() const {
  DCHECK(stash_);
  return *stash_;
}

CompressedSortedSet::ConstIterator& CompressedSortedSet::ConstIterator::operator++() {
  ReadNext();
  return *this;
}

bool operator==(const CompressedSortedSet::ConstIterator& l,
                const CompressedSortedSet::ConstIterator& r) {
  return l.diffs_.data() == r.diffs_.data() && l.diffs_.size() == r.diffs_.size();
}

bool operator!=(const CompressedSortedSet::ConstIterator& l,
                const CompressedSortedSet::ConstIterator& r) {
  return !(l == r);
}

void CompressedSortedSet::ConstIterator::ReadNext() {
  if (diffs_.empty()) {
    stash_ = nullopt;
    last_read_ = {nullptr, 0};
    diffs_ = {nullptr, 0};
    return;
  }

  IntType base = stash_.value_or(0);
  auto [diff, read] = CompressedSortedSet::ReadVarLen(diffs_);

  stash_ = base + diff;
  last_read_ = diffs_.subspan(0, read);
  diffs_.remove_prefix(read);
}

CompressedSortedSet::ConstIterator CompressedSortedSet::begin() const {
  return ConstIterator{*this};
}

CompressedSortedSet::ConstIterator CompressedSortedSet::end() const {
  return ConstIterator{};
}

// Simply encode difference and add to end of diffs array
void CompressedSortedSet::PushBackDiff(IntType diff) {
  size_++;

  VarintBuffer buf;
  auto diff_span = WriteVarLen(diff, absl::MakeSpan(buf));
  diffs_.insert(diffs_.end(), diff_span.begin(), diff_span.end());
}

// Do a linear scan by encoding all diffs to find value
CompressedSortedSet::EntryLocation CompressedSortedSet::LowerBound(IntType value) const {
  auto it = begin(), prev_it = end(), next_it = end();
  while (it != end()) {
    next_it = it;
    if (*it >= value || ++next_it == end())
      break;
    prev_it = it;
    it = next_it;
  }

  return EntryLocation{.value = it.stash_.value_or(0),
                       .prev_value = prev_it.stash_.value_or(0),
                       .diff_span = it.last_read_};
}

// Insert has linear complexity. It tries to find between which two entries A and B the new value V
// needs to be inserted. Then it computes the differences dif1 = V - A and diff2 = B - V that need
// to be stored to encode the triple A V B. Those are stored where diff0 = B - A was previously
// stored, possibly extending the vector
bool CompressedSortedSet::Insert(IntType value) {
  if (tail_value_ && *tail_value_ == value)
    return false;

  if (tail_value_ && value > *tail_value_) {
    PushBackDiff(value - *tail_value_);
    tail_value_ = value;
    return true;
  }

  auto bound = LowerBound(value);

  // At least one element was read and it's equal to value: return to avoid duplicate
  if (bound.value == value && !bound.diff_span.empty())
    return false;

  // Value is bigger than any other (or list is empty): append required diff at the end
  if (value > bound.value || bound.diff_span.empty()) {
    PushBackDiff(value - bound.value);
    tail_value_ = value;
    return true;
  }

  size_++;

  // Now the list certainly contains the bound B > V and possibly A < V (or 0 by default),
  // so we need to encode both differences diff1 and diff2
  DCHECK_GT(bound.value, value);
  DCHECK_LE(bound.prev_value, value);

  // Compute and encode new diff1 and diff2 into buf1 and buf2 respectivaly
  VarintBuffer buf1, buf2;
  auto diff1_span = WriteVarLen(value - bound.prev_value, absl::MakeSpan(buf1));
  auto diff2_span = WriteVarLen(bound.value - value, absl::MakeSpan(buf2));

  // Extend the location where diff0 is stored with optional zeros before overwriting it
  ptrdiff_t diff_offset = bound.diff_span.data() - diffs_.data();
  size_t required_len = diff1_span.size() + diff2_span.size();
  DCHECK_LE(bound.diff_span.size(), required_len);  // It can't shrink for sure
  diffs_.insert(diffs_.begin() + diff_offset, required_len - bound.diff_span.size(), 0u);

  // Now overwrite diff0 and 0s with the two new differences
  copy(diff1_span.begin(), diff1_span.end(), diffs_.begin() + diff_offset);
  copy(diff2_span.begin(), diff2_span.end(), diffs_.begin() + diff_offset + diff1_span.size());

  return true;
}

// Remove has linear complexity. It tries to find the element V and its neighbors A and B,
// which are encoded as diff1 = V - A and diff2 = B - V. Adjacently stored diff1 and diff2
// need to be replaced with diff3 = diff1 + diff2s
bool CompressedSortedSet::Remove(IntType value) {
  auto bound = LowerBound(value);

  // Nothing was read or the element was not found
  if (bound.diff_span.empty() || bound.value != value)
    return false;

  // We're removing below unconditionally
  size_--;

  // Calculate offset where values diff is stored and determine diffs tail
  ptrdiff_t diff_offset = bound.diff_span.data() - diffs_.data();
  auto diffs_tail = absl::MakeSpan(diffs_).subspan(diff_offset + bound.diff_span.size());

  // If it's stored at the end, simply truncate it away
  if (diffs_tail.empty()) {
    diffs_.resize(diffs_.size() - bound.diff_span.size());
    tail_value_ = bound.prev_value;
    if (diffs_.empty())
      tail_value_ = nullopt;
    return true;
  }

  // Now the list certainly contains a succeeding element B > V and possibly A < V (or 0)
  // Read diff2 and calculate diff3 = diff1 + diff2
  auto [diff2, diff2_read] = ReadVarLen(diffs_tail);
  IntType diff3 = (bound.value - bound.prev_value) + diff2;

  // Encode diff3
  VarintBuffer buf;
  auto diff3_buf = WriteVarLen(diff3, absl::MakeSpan(buf));

  // Shrink vector before overwriting
  DCHECK_LE(diff3_buf.size(), diff2_read + bound.diff_span.size());
  size_t to_remove = diff2_read + bound.diff_span.size() - diff3_buf.size();
  diffs_.erase(diffs_.begin() + diff_offset, diffs_.begin() + diff_offset + to_remove);

  // Overwrite diff1/diff2 with new diff3
  copy(diff3_buf.begin(), diff3_buf.end(), diffs_.begin() + diff_offset);

  return true;
}

void CompressedSortedSet::Merge(CompressedSortedSet&& other) {
  // Quadratic compexity in theory, but in practice used only to merge with larger values.
  // Tail insert optimization makes it linear
  for (int v : other)
    Insert(v);
}

std::pair<CompressedSortedSet, CompressedSortedSet> CompressedSortedSet::Split() && {
  DCHECK_GT(Size(), 5u);

  CompressedSortedSet second(diffs_.get_allocator().resource());

  // Move iterator to middle position and save size of diffs tail
  auto it = begin();
  std::advance(it, (size_ - 1) / 2);

  // Save last value in the first set
  tail_value_ = *it;
  ++it;

  size_t keep_bytes = it.last_read_.data() - diffs_.data();

  // Copy second half into second set
  for (; it != end(); ++it)
    second.Insert(*it);

  // Erase diffs tail
  diffs_.resize(keep_bytes);
  size_ -= second.Size();

  return std::make_pair(std::move(*this), std::move(second));
}

// The leftmost three bits of the first byte store the number of additional bytes. All following
// bits store the number itself.
absl::Span<uint8_t> CompressedSortedSet::WriteVarLen(IntType value, absl::Span<uint8_t> buf) {
  // TODO: fix flit encoding of large numbers
  size_t written = base::flit::EncodeT(static_cast<uint64_t>(value), buf.data());
  return buf.first(written);
}

std::pair<CompressedSortedSet::IntType, size_t> CompressedSortedSet::ReadVarLen(
    absl::Span<const uint8_t> source) {
  uint64_t out = 0;
  size_t read = 0;

  // We need this because ParseT may read 8 bytes even if source can be less than that
  // due to the encoding and we end up accessing an invalid memory location.
  // (not really a bug because ParseT ignores the extra bytes it reads).
  if (source.size() < 8) {
    VarintBuffer ranged_source{0};
    memcpy(&ranged_source, source.data(), source.size());
    read = base::flit::ParseT(ranged_source.data(), &out);
  } else {
    read = base::flit::ParseT(source.data(), &out);
  }

  CHECK_LE(out, numeric_limits<IntType>::max());
  return {out, read};
}

}  // namespace dfly::search


================================================
FILE: src/core/search/compressed_sorted_set.h
================================================
#pragma once

#include <absl/types/span.h>

#include <cstdint>
#include <iterator>
#include <optional>
#include <vector>

#include "base/logging.h"
#include "base/pmr/memory_resource.h"
#include "core/search/base.h"

namespace dfly::search {

// A list of sorted unique integers with reduced memory usage.
// Only differences between successive elements are stored
// in a variable length encoding.
class CompressedSortedSet {
 public:
  using IntType = DocId;
  using ElementType = IntType;

  // Const access iterator that decodes the compressed list on traversal
  struct ConstIterator {
    friend class CompressedSortedSet;

    // To make it work with std container contructors
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = IntType;
    using pointer = IntType*;
    using reference = IntType&;

    IntType operator*() const;
    ConstIterator& operator++();

    friend class CompressedSortedSet;
    friend bool operator==(const ConstIterator& l, const ConstIterator& r);
    friend bool operator!=(const ConstIterator& l, const ConstIterator& r);

    ConstIterator() = default;

   private:
    explicit ConstIterator(const CompressedSortedSet& list);

    void ReadNext();  // Decode next value to stash

    std::optional<IntType> stash_{};
    absl::Span<const uint8_t> last_read_{};
    absl::Span<const uint8_t> diffs_{};
  };

  using iterator = ConstIterator;

 public:
  explicit CompressedSortedSet(PMR_NS::memory_resource* mr);

  ConstIterator begin() const;
  ConstIterator end() const;

  bool Insert(IntType value);  // Insert arbitrary element, needs to scan whole list
  bool Remove(IntType value);  // Remove arbitrary element, needs to scan whole list

  size_t Size() const {
    return size_;
  }

  size_t ByteSize() const {
    return diffs_.size();
  }

  bool Empty() const {
    return size_ == 0;
  }

  void Clear() {
    size_ = 0;
    tail_value_.reset();
    diffs_.clear();
  }

  // Add all values from other
  void Merge(CompressedSortedSet&& other);

  // Split into two equally sized halves
  std::pair<CompressedSortedSet, CompressedSortedSet> Split() &&;

  IntType Back() const {
    DCHECK(!Empty() && tail_value_.has_value());
    return tail_value_.value();
  }

  static DefragmentResult Defragment([[maybe_unused]] PageUsage* page_usage) {
    return {};
  }

 private:
  struct EntryLocation {
    IntType value;                        // Value or 0
    IntType prev_value;                   // Preceding value or 0
    absl::Span<const uint8_t> diff_span;  // Location of value encoded diff, empty if none read
  };

 private:
  // Find EntryLocation of first entry that is not less than value (std::lower_bound)
  EntryLocation LowerBound(IntType value) const;

  // Push back difference without any decoding. Used only for efficient construction from sorted
  // list
  void PushBackDiff(IntType diff);

  // Encode integer with variable length encoding into buf and return written subspan
  static absl::Span<uint8_t> WriteVarLen(IntType value, absl::Span<uint8_t> buf);

  // Decode integer with variable length encoding from source
  static std::pair<IntType /*value*/, size_t /*read*/> ReadVarLen(absl::Span<const uint8_t> source);

 private:
  uint32_t size_{0};

  std::optional<IntType> tail_value_{};
  std::vector<uint8_t, PMR_NS::polymorphic_allocator<uint8_t>> diffs_;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/compressed_sorted_set_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/compressed_sorted_set.h"

#include <absl/container/btree_set.h>

#include <algorithm>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/bptree_set.h"

namespace dfly::search {

using namespace std;

namespace {

struct SetInserter {
  using iterator_category = std::forward_iterator_tag;
  using difference_type = std::ptrdiff_t;
  using value_type = CompressedSortedSet::IntType;
  using pointer = value_type*;
  using reference = value_type&;

  explicit SetInserter(CompressedSortedSet* set) : set_{set} {};

  SetInserter& operator*() {
    return *this;
  }
  SetInserter& operator++() {
    return *this;
  }

  SetInserter& operator=(value_type value) {
    set_->Insert(value);
    return *this;
  }

 private:
  CompressedSortedSet* set_;
};

}  // namespace

class CompressedSortedSetTest : public ::testing::Test {
 protected:
};

using IdVec = vector<uint32_t>;

TEST_F(CompressedSortedSetTest, BasicInsert) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};
  IdVec list_copy;

  auto current = [&list]() { return IdVec{list.begin(), list.end()}; };
  auto add = [&list, &list_copy](uint32_t value) {
    list.Insert(value);
    set<uint32_t> list_copy_set{list_copy.begin(), list_copy.end()};
    list_copy_set.insert(value);
    list_copy = IdVec{list_copy_set.begin(), list_copy_set.end()};
  };

  // Check empty list is empty
  EXPECT_EQ(current(), list_copy);

  // Insert some numbers in sorted order
  add(10);
  EXPECT_EQ(current(), list_copy);
  add(15);
  EXPECT_EQ(current(), list_copy);
  add(22);
  EXPECT_EQ(current(), list_copy);
  add(25);
  add(31);
  EXPECT_EQ(current(), list_copy);

  // Now insert front
  add(7);
  EXPECT_EQ(current(), list_copy);
  add(2);
  EXPECT_EQ(current(), list_copy);

  // Insert in-between
  add(13);
  EXPECT_EQ(current(), list_copy);
  add(23);
  add(19);
  EXPECT_EQ(current(), list_copy);
  add(30);
  add(27);
  EXPECT_EQ(current(), list_copy);

  // Now add some numbers in reverse order
  add(41);
  add(40);
  add(37);
  add(34);
  EXPECT_EQ(current(), list_copy);

  // Now add a 0
  add(0);
  EXPECT_EQ(current(), list_copy);

  // Make sure all test integers fit into a single byte
  EXPECT_EQ(list.ByteSize(), list.Size());
}

TEST_F(CompressedSortedSetTest, BasicInsertLargeValues) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};
  IdVec list_copy;

  const uint32_t kBase = 1'000'000'000;

  // Add big integers in reverse order
  uint32_t base = kBase;
  while (base > 0) {
    list.Insert(base);
    list_copy.insert(list_copy.begin(), base);
    base /= 10;
  }

  EXPECT_EQ(IdVec(list.begin(), list.end()), list_copy);

  // Now add neighboring  integers with an offset of one
  base = kBase;
  while (base > 0) {
    list.Insert(base + 1);
    list_copy.push_back(base + 1);
    base /= 10;
  }
  sort(list_copy.begin(), list_copy.end());

  EXPECT_EQ(IdVec(list.begin(), list.end()), list_copy);

  // Make sure we use at least twice less memory
  EXPECT_LE(list.ByteSize() * 2, list.Size() * sizeof(uint32_t));
}

TEST_F(CompressedSortedSetTest, SortedBackInserter) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};

  vector<uint32_t> v1 = {1, 3, 5};
  vector<uint32_t> v2 = {2, 4, 6};

  merge(v1.begin(), v1.end(), v2.begin(), v2.end(), SetInserter{&list});

  EXPECT_EQ(IdVec(list.begin(), list.end()), IdVec({1, 2, 3, 4, 5, 6}));
}

TEST_F(CompressedSortedSetTest, BasicRemove) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};

  IdVec values = {1, 3, 4, 7, 8, 11, 15, 17, 20, 22, 27};
  copy(values.begin(), values.end(), SetInserter{&list});
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  auto remove = [&list, &values](uint32_t value) {
    values.erase(find(values.begin(), values.end(), value));
    list.Remove(value);
  };

  // Remove back and front
  remove(27);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
  remove(1);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  // Remove from middle
  remove(11);
  remove(4);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
  remove(17);
  remove(8);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  // Remove non existing
  list.Remove(16);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
}

TEST_F(CompressedSortedSetTest, BasicRemoveLargeValues) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};

  IdVec values = {1, 12, 123, 123'4, 123'45, 123'456, 1'234'567, 12'345'678};
  copy(values.begin(), values.end(), SetInserter{&list});
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  auto remove = [&list, &values](uint32_t value) {
    values.erase(find(values.begin(), values.end(), value));
    list.Remove(value);
  };

  // Remove from middle
  remove(123'45);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
  remove(12);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
  remove(1'234'567);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  // Remove front
  remove(1);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);

  // Remove back
  remove(12'345'678);
  EXPECT_EQ(IdVec(list.begin(), list.end()), values);
}

TEST_F(CompressedSortedSetTest, InsertRemoveLargeValues) {
  CompressedSortedSet list{PMR_NS::get_default_resource()};

  for (int shift = 3; shift < 30; shift++) {
    uint32_t value = 1u << shift;

    IdVec values{value + 3, value, value - 5};
    for (auto v : values)
      list.Insert(v);

    sort(values.begin(), values.end());
    EXPECT_EQ(IdVec(list.begin(), list.end()), values);

    for (auto v : values)
      list.Remove(v);

    EXPECT_EQ(IdVec(list.begin(), list.end()), IdVec({}));
  }
}

}  // namespace dfly::search


================================================
FILE: src/core/search/hnsw_alg.h
================================================
// This file is copied from hnswlib and modified to fit Dragonfly's needs.

#include <hnswlib/hnswalg.h>
#include <hnswlib/visited_list_pool.h>
#include <mimalloc.h>

#pragma once

namespace dfly::search {

enum class HnswErrorStatus : int8_t {
  SUCCESS = 0,
  /* markDelete errors */
  LABEL_NOT_FOUND,
  ELEMENT_ALREADY_DELETED,
};

template <typename dist_t> class HierarchicalNSW : public hnswlib::AlgorithmInterface<dist_t> {
 public:
  using tableint = hnswlib::tableint;
  using labeltype = hnswlib::labeltype;
  using linklistsizeint = hnswlib::linklistsizeint;
  using VisitedListPool = hnswlib::VisitedListPool;
  using vl_type = hnswlib::vl_type;
  using VisitedList = hnswlib::VisitedList;
  using BaseFilterFunctor = hnswlib::BaseFilterFunctor;
  using BaseSearchStopCondition = hnswlib::BaseSearchStopCondition<dist_t>;

  static const tableint MAX_LABEL_OPERATION_LOCKS = 65536;
  static const unsigned char DELETE_MARK = 0x01;

  size_t max_elements_{0};
  mutable std::atomic<size_t> cur_element_count{0};  // current number of elements
  size_t size_data_per_element_{0};
  size_t size_links_per_element_{0};
  mutable std::atomic<size_t> num_deleted_{0};  // number of deleted elements
  size_t M_{0};
  size_t maxM_{0};
  size_t maxM0_{0};
  size_t ef_construction_{0};
  size_t ef_{0};

  double mult_{0.0}, revSize_{0.0};
  int maxlevel_{0};

  std::unique_ptr<VisitedListPool> visited_list_pool_{nullptr};

  // Locks operations with element by label value
  mutable std::vector<std::mutex> label_op_locks_;

  std::mutex global;
  std::vector<std::mutex> link_list_locks_;

  tableint enterpoint_node_{0};

  size_t size_links_level0_{0};
  size_t offsetData_{0}, offsetLevel0_{0}, label_offset_{0};

  char* data_level0_memory_{nullptr};  // Level 0 memory block. Contains links + ptr to data + label
  char* data_vector_memory_{nullptr};  // Memory block for copied vectors
  char** linkLists_{nullptr};
  std::vector<int> element_levels_;  // keeps level of each element

  size_t data_size_{0};

  hnswlib::DISTFUNC<dist_t> fstdistfunc_;
  void* dist_func_param_{nullptr};

  mutable std::mutex label_lookup_lock;  // lock for label_lookup_
  std::unordered_map<labeltype, tableint> label_lookup_;

  std::default_random_engine level_generator_;
  std::default_random_engine update_probability_generator_;

  mutable std::atomic<long> metric_distance_computations{0};
  mutable std::atomic<long> metric_hops{0};

  bool copy_vector_ = true;

  bool allow_replace_deleted_ =
      false;  // flag to replace deleted elements (marked as deleted) during insertions

  std::mutex deleted_elements_lock;               // lock for deleted_elements
  std::unordered_set<tableint> deleted_elements;  // contains internal ids of deleted elements

  HierarchicalNSW(hnswlib::SpaceInterface<dist_t>* s) {
  }

  HierarchicalNSW(hnswlib::SpaceInterface<dist_t>* s, const std::string& location,
                  bool nmslib = false, size_t max_elements = 0, bool allow_replace_deleted = false)
      : allow_replace_deleted_(allow_replace_deleted) {
    loadIndex(location, s, max_elements);
  }

  HierarchicalNSW(hnswlib::SpaceInterface<dist_t>* s, size_t max_elements, size_t M = 16,
                  size_t ef_construction = 200, size_t random_seed = 100, bool copy_vector = true,
                  bool allow_replace_deleted = false)
      : label_op_locks_(MAX_LABEL_OPERATION_LOCKS),
        link_list_locks_(max_elements),
        element_levels_(max_elements),
        copy_vector_(copy_vector),
        allow_replace_deleted_(allow_replace_deleted) {
    max_elements_ = max_elements;
    num_deleted_ = 0;
    data_size_ = s->get_data_size();
    fstdistfunc_ = s->get_dist_func();
    dist_func_param_ = s->get_dist_func_param();
    if (M <= 10000) {
      M_ = M;
    } else {
      HNSWERR << "warning: M parameter exceeds 10000 which may lead to adverse effects."
              << std::endl;
      HNSWERR << "         Cap to 10000 will be applied for the rest of the processing."
              << std::endl;
      M_ = 10000;
    }
    maxM_ = M_;
    maxM0_ = M_ * 2;
    ef_construction_ = std::max(ef_construction, M_);
    ef_ = 10;

    level_generator_.seed(random_seed);
    update_probability_generator_.seed(random_seed + 1);

    // If we copy vector we don't use pointer to data
    size_t vector_ptr_size = copy_vector_ ? 0 : sizeof(char*);
    size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
    size_data_per_element_ = size_links_level0_ + vector_ptr_size + sizeof(labeltype);
    offsetData_ = size_links_level0_;
    label_offset_ = size_links_level0_ + vector_ptr_size;
    offsetLevel0_ = 0;

    data_level0_memory_ = (char*)mi_malloc(max_elements_ * size_data_per_element_);
    if (data_level0_memory_ == nullptr)
      throw std::runtime_error("Not enough memory");

    if (copy_vector) {
      data_vector_memory_ = (char*)mi_malloc(max_elements_ * data_size_);
      if (data_vector_memory_ == nullptr)
        throw std::runtime_error("Not enough memory");
    }

    cur_element_count = 0;

    visited_list_pool_ = std::unique_ptr<VisitedListPool>(new VisitedListPool(1, max_elements));

    // initializations for special treatment of the first node
    enterpoint_node_ = -1;
    maxlevel_ = -1;

    linkLists_ = (char**)mi_malloc(sizeof(void*) * max_elements_);
    if (linkLists_ == nullptr)
      throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists");
    size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
    mult_ = 1 / log(1.0 * M_);
    revSize_ = 1.0 / mult_;
  }

  ~HierarchicalNSW() {
    clear();
  }

  void clear() {
    mi_free(data_level0_memory_);
    data_level0_memory_ = nullptr;
    for (tableint i = 0; i < cur_element_count; i++) {
      if (element_levels_[i] > 0)
        mi_free(linkLists_[i]);
    }
    if (copy_vector_) {
      mi_free(data_vector_memory_);
    }
    mi_free(linkLists_);
    linkLists_ = nullptr;
    cur_element_count = 0;
    visited_list_pool_.reset(nullptr);
  }

  struct CompareByFirst {
    constexpr bool operator()(std::pair<dist_t, tableint> const& a,
                              std::pair<dist_t, tableint> const& b) const noexcept {
      return a.first < b.first;
    }
  };

  void setEf(size_t ef) {
    ef_ = ef;
  }

  inline std::mutex& getLabelOpMutex(labeltype label) const {
    // calculate hash
    size_t lock_id = label & (MAX_LABEL_OPERATION_LOCKS - 1);
    return label_op_locks_[lock_id];
  }

  inline labeltype getExternalLabel(tableint internal_id) const {
    labeltype return_label;
    memcpy(&return_label,
           (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_),
           sizeof(labeltype));
    return return_label;
  }

  inline void setExternalLabel(tableint internal_id, labeltype label) const {
    memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label,
           sizeof(labeltype));
  }

  inline char* getDataPtrByInternalId(tableint internal_id) const {
    return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_);
  }

  // Return pointer to data by internal id
  inline char* getDataByInternalId(tableint internal_id) const {
    if (copy_vector_) {
      return (data_vector_memory_ + internal_id * data_size_);
    } else {
      char* unaligned_data_ptr = (char*)(getDataPtrByInternalId(internal_id));
      char* data_ptr = nullptr;
      memcpy(static_cast<void*>(&data_ptr), unaligned_data_ptr, sizeof(void*));
      return data_ptr;
    }
  }

  int getRandomLevel(double reverse_size) {
    std::uniform_real_distribution<double> distribution(0.0, 1.0);
    double r = -log(distribution(level_generator_)) * reverse_size;
    return (int)r;
  }

  size_t getMaxElements() {
    return max_elements_;
  }

  size_t getCurrentElementCount() {
    return cur_element_count;
  }

  size_t getDeletedCount() {
    return num_deleted_;
  }

  std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                      CompareByFirst>
  searchBaseLayer(tableint ep_id, const void* data_point, int layer) {
    VisitedList* vl = visited_list_pool_->getFreeVisitedList();
    vl_type* visited_array = vl->mass;
    vl_type visited_array_tag = vl->curV;

    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        top_candidates;
    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        candidateSet;

    dist_t lowerBound;
    if (!isMarkedDeleted(ep_id)) {
      dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_);
      top_candidates.emplace(dist, ep_id);
      lowerBound = dist;
      candidateSet.emplace(-dist, ep_id);
    } else {
      lowerBound = std::numeric_limits<dist_t>::max();
      candidateSet.emplace(-lowerBound, ep_id);
    }
    visited_array[ep_id] = visited_array_tag;

    while (!candidateSet.empty()) {
      std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
      if ((-curr_el_pair.first) > lowerBound && top_candidates.size() == ef_construction_) {
        break;
      }
      candidateSet.pop();

      tableint curNodeNum = curr_el_pair.second;

      std::unique_lock<std::mutex> lock(link_list_locks_[curNodeNum]);

      int* data;  // = (int *)(linkList0_ + curNodeNum * size_links_per_element0_);
      if (layer == 0) {
        data = (int*)get_linklist0(curNodeNum);
      } else {
        data = (int*)get_linklist(curNodeNum, layer);
        //                    data = (int *) (linkLists_[curNodeNum] + (layer - 1) *
        //                    size_links_per_element_);
      }
      size_t size = getListCount((linklistsizeint*)data);
      tableint* datal = (tableint*)(data + 1);

      __builtin_prefetch((char*)(visited_array + *(data + 1)), 0, 3);
      __builtin_prefetch((char*)(visited_array + *(data + 1) + 64), 0, 3);
      __builtin_prefetch(getDataByInternalId(*datal), 0, 3);

      for (size_t j = 0; j < size; j++) {
        tableint candidate_id = *(datal + j);
        //                    if (candidate_id == 0) continue;

        // Request prefetching next vector data memory
        if (j + 1 < size) {
          __builtin_prefetch(getDataByInternalId(*(datal + j + 1)), 0, 3);
        }

        if (visited_array[candidate_id] == visited_array_tag)
          continue;
        visited_array[candidate_id] = visited_array_tag;
        char* currObj1 = (getDataByInternalId(candidate_id));

        dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_);
        if (top_candidates.size() < ef_construction_ || lowerBound > dist1) {
          candidateSet.emplace(-dist1, candidate_id);

          __builtin_prefetch(getDataByInternalId(candidateSet.top().second), 0, 3);

          if (!isMarkedDeleted(candidate_id))
            top_candidates.emplace(dist1, candidate_id);

          if (top_candidates.size() > ef_construction_)
            top_candidates.pop();

          if (!top_candidates.empty())
            lowerBound = top_candidates.top().first;
        }
      }
    }
    visited_list_pool_->releaseVisitedList(vl);

    return top_candidates;
  }

  // bare_bone_search means there is no check for deletions and stop condition is ignored in return
  // of extra performance
  template <bool bare_bone_search = true, bool collect_metrics = false>
  std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                      CompareByFirst>
  searchBaseLayerST(tableint ep_id, const void* data_point, size_t ef,
                    BaseFilterFunctor* isIdAllowed = nullptr,
                    BaseSearchStopCondition* stop_condition = nullptr) const {
    VisitedList* vl = visited_list_pool_->getFreeVisitedList();
    vl_type* visited_array = vl->mass;
    vl_type visited_array_tag = vl->curV;

    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        top_candidates;
    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        candidate_set;

    dist_t lowerBound;
    if (bare_bone_search ||
        (!isMarkedDeleted(ep_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id))))) {
      char* ep_data = getDataByInternalId(ep_id);
      dist_t dist = fstdistfunc_(data_point, ep_data, dist_func_param_);
      lowerBound = dist;
      top_candidates.emplace(dist, ep_id);
      if (!bare_bone_search && stop_condition) {
        stop_condition->add_point_to_result(getExternalLabel(ep_id), ep_data, dist);
      }
      candidate_set.emplace(-dist, ep_id);
    } else {
      lowerBound = std::numeric_limits<dist_t>::max();
      candidate_set.emplace(-lowerBound, ep_id);
    }

    visited_array[ep_id] = visited_array_tag;

    while (!candidate_set.empty()) {
      std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
      dist_t candidate_dist = -current_node_pair.first;

      bool flag_stop_search;
      if (bare_bone_search) {
        flag_stop_search = candidate_dist > lowerBound;
      } else {
        if (stop_condition) {
          flag_stop_search = stop_condition->should_stop_search(candidate_dist, lowerBound);
        } else {
          flag_stop_search = candidate_dist > lowerBound && top_candidates.size() == ef;
        }
      }
      if (flag_stop_search) {
        break;
      }
      candidate_set.pop();

      tableint current_node_id = current_node_pair.second;
      int* data = (int*)get_linklist0(current_node_id);
      size_t size = getListCount((linklistsizeint*)data);
      //                bool cur_node_deleted = isMarkedDeleted(current_node_id);
      if (collect_metrics) {
        metric_hops++;
        metric_distance_computations += size;
      }

      __builtin_prefetch((char*)(visited_array + *(data + 1)), 0, 3);
      __builtin_prefetch((char*)(visited_array + *(data + 1) + 64), 0, 3);
      __builtin_prefetch(getDataByInternalId(*(data + 1)), 0, 3);
      __builtin_prefetch((char*)(data + 2), 0, 3);

      for (size_t j = 1; j <= size; j++) {
        int candidate_id = *(data + j);
        //                    if (candidate_id == 0) continue;

        // Request prefetching next vector data memory
        if (j + 1 < size) {
          __builtin_prefetch(getDataByInternalId(*(data + j + 1)), 0, 3);
        }

        if (!(visited_array[candidate_id] == visited_array_tag)) {
          visited_array[candidate_id] = visited_array_tag;

          char* currObj1 = (getDataByInternalId(candidate_id));
          dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_);

          bool flag_consider_candidate;
          if (!bare_bone_search && stop_condition) {
            flag_consider_candidate = stop_condition->should_consider_candidate(dist, lowerBound);
          } else {
            flag_consider_candidate = top_candidates.size() < ef || lowerBound > dist;
          }

          if (flag_consider_candidate) {
            candidate_set.emplace(-dist, candidate_id);

            __builtin_prefetch(data_level0_memory_ +
                                   candidate_set.top().second * size_data_per_element_ +
                                   offsetLevel0_,  ///////////
                               0, 3);              ////////////////////////

            if (bare_bone_search ||
                (!isMarkedDeleted(candidate_id) &&
                 ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) {
              top_candidates.emplace(dist, candidate_id);
              if (!bare_bone_search && stop_condition) {
                stop_condition->add_point_to_result(getExternalLabel(candidate_id), currObj1, dist);
              }
            }

            bool flag_remove_extra = false;
            if (!bare_bone_search && stop_condition) {
              flag_remove_extra = stop_condition->should_remove_extra();
            } else {
              flag_remove_extra = top_candidates.size() > ef;
            }
            while (flag_remove_extra) {
              tableint id = top_candidates.top().second;
              top_candidates.pop();
              if (!bare_bone_search && stop_condition) {
                stop_condition->remove_point_from_result(getExternalLabel(id),
                                                         getDataByInternalId(id), dist);
                flag_remove_extra = stop_condition->should_remove_extra();
              } else {
                flag_remove_extra = top_candidates.size() > ef;
              }
            }

            if (!top_candidates.empty())
              lowerBound = top_candidates.top().first;
          }
        }
      }
    }

    visited_list_pool_->releaseVisitedList(vl);
    return top_candidates;
  }

  void getNeighborsByHeuristic2(
      std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                          CompareByFirst>& top_candidates,
      const size_t M) {
    if (top_candidates.size() < M) {
      return;
    }

    std::priority_queue<std::pair<dist_t, tableint>> queue_closest;
    std::vector<std::pair<dist_t, tableint>> return_list;
    while (top_candidates.size() > 0) {
      queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
      top_candidates.pop();
    }

    while (queue_closest.size()) {
      if (return_list.size() >= M)
        break;
      std::pair<dist_t, tableint> curent_pair = queue_closest.top();
      dist_t dist_to_query = -curent_pair.first;
      queue_closest.pop();
      bool good = true;

      for (std::pair<dist_t, tableint> second_pair : return_list) {
        dist_t curdist = fstdistfunc_(getDataByInternalId(second_pair.second),
                                      getDataByInternalId(curent_pair.second), dist_func_param_);
        if (curdist < dist_to_query) {
          good = false;
          break;
        }
      }
      if (good) {
        return_list.push_back(curent_pair);
      }
    }

    for (std::pair<dist_t, tableint> curent_pair : return_list) {
      top_candidates.emplace(-curent_pair.first, curent_pair.second);
    }
  }

  linklistsizeint* get_linklist0(tableint internal_id) const {
    return (linklistsizeint*)(data_level0_memory_ + internal_id * size_data_per_element_ +
                              offsetLevel0_);
  }

  linklistsizeint* get_linklist0(tableint internal_id, char* data_level0_memory_) const {
    return (linklistsizeint*)(data_level0_memory_ + internal_id * size_data_per_element_ +
                              offsetLevel0_);
  }

  linklistsizeint* get_linklist(tableint internal_id, int level) const {
    return (linklistsizeint*)(linkLists_[internal_id] + (level - 1) * size_links_per_element_);
  }

  linklistsizeint* get_linklist_at_level(tableint internal_id, int level) const {
    return level == 0 ? get_linklist0(internal_id) : get_linklist(internal_id, level);
  }

  tableint mutuallyConnectNewElement(
      const void* data_point, tableint cur_c,
      std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                          CompareByFirst>& top_candidates,
      int level, bool isUpdate) {
    size_t Mcurmax = level ? maxM_ : maxM0_;
    getNeighborsByHeuristic2(top_candidates, M_);
    if (top_candidates.size() > M_)
      throw std::runtime_error(
          "Should be not be more than M_ candidates returned by the heuristic");

    std::vector<tableint> selectedNeighbors;
    selectedNeighbors.reserve(M_);
    while (top_candidates.size() > 0) {
      selectedNeighbors.push_back(top_candidates.top().second);
      top_candidates.pop();
    }

    tableint next_closest_entry_point = selectedNeighbors.back();

    {
      // lock only during the update
      // because during the addition the lock for cur_c is already acquired
      std::unique_lock<std::mutex> lock(link_list_locks_[cur_c], std::defer_lock);
      if (isUpdate) {
        lock.lock();
      }
      linklistsizeint* ll_cur;
      if (level == 0)
        ll_cur = get_linklist0(cur_c);
      else
        ll_cur = get_linklist(cur_c, level);

      if (*ll_cur && !isUpdate) {
        throw std::runtime_error("The newly inserted element should have blank link list");
      }
      setListCount(ll_cur, selectedNeighbors.size());
      tableint* data = (tableint*)(ll_cur + 1);
      for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
        if (data[idx] && !isUpdate)
          throw std::runtime_error("Possible memory corruption");
        if (level > element_levels_[selectedNeighbors[idx]])
          throw std::runtime_error("Trying to make a link on a non-existent level");

        data[idx] = selectedNeighbors[idx];
      }
    }

    for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
      std::unique_lock<std::mutex> lock(link_list_locks_[selectedNeighbors[idx]]);

      linklistsizeint* ll_other;
      if (level == 0)
        ll_other = get_linklist0(selectedNeighbors[idx]);
      else
        ll_other = get_linklist(selectedNeighbors[idx], level);

      size_t sz_link_list_other = getListCount(ll_other);

      if (sz_link_list_other > Mcurmax)
        throw std::runtime_error("Bad value of sz_link_list_other");
      if (selectedNeighbors[idx] == cur_c)
        throw std::runtime_error("Trying to connect an element to itself");
      if (level > element_levels_[selectedNeighbors[idx]])
        throw std::runtime_error("Trying to make a link on a non-existent level");

      tableint* data = (tableint*)(ll_other + 1);

      bool is_cur_c_present = false;
      if (isUpdate) {
        for (size_t j = 0; j < sz_link_list_other; j++) {
          if (data[j] == cur_c) {
            is_cur_c_present = true;
            break;
          }
        }
      }

      // If cur_c is already present in the neighboring connections of `selectedNeighbors[idx]` then
      // no need to modify any connections or run the heuristics.
      if (!is_cur_c_present) {
        if (sz_link_list_other < Mcurmax) {
          data[sz_link_list_other] = cur_c;
          setListCount(ll_other, sz_link_list_other + 1);
        } else {
          // finding the "weakest" element to replace it with the new one
          dist_t d_max =
              fstdistfunc_(getDataByInternalId(cur_c), getDataByInternalId(selectedNeighbors[idx]),
                           dist_func_param_);
          // Heuristic:
          std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                              CompareByFirst>
              candidates;
          candidates.emplace(d_max, cur_c);

          for (size_t j = 0; j < sz_link_list_other; j++) {
            candidates.emplace(
                fstdistfunc_(getDataByInternalId(data[j]),
                             getDataByInternalId(selectedNeighbors[idx]), dist_func_param_),
                data[j]);
          }

          getNeighborsByHeuristic2(candidates, Mcurmax);

          int indx = 0;
          while (candidates.size() > 0) {
            data[indx] = candidates.top().second;
            candidates.pop();
            indx++;
          }

          setListCount(ll_other, indx);
          // Nearest K:
          /*int indx = -1;
          for (int j = 0; j < sz_link_list_other; j++) {
              dist_t d = fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(rez[idx]),
          dist_func_param_); if (d > d_max) { indx = j; d_max = d;
              }
          }
          if (indx >= 0) {
              data[indx] = cur_c;
          } */
        }
      }
    }

    return next_closest_entry_point;
  }

  void resizeIndex(size_t new_max_elements) {
    if (new_max_elements < cur_element_count)
      throw std::runtime_error(
          "Cannot resize, max element is less than the current number of elements");

    visited_list_pool_.reset(new VisitedListPool(1, new_max_elements));

    element_levels_.resize(new_max_elements);

    std::vector<std::mutex>(new_max_elements).swap(link_list_locks_);

    // Reallocate base layer
    char* data_level0_memory_new =
        (char*)mi_realloc(data_level0_memory_, new_max_elements * size_data_per_element_);
    if (data_level0_memory_new == nullptr)
      throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer");
    data_level0_memory_ = data_level0_memory_new;

    // If we copy vectors, reallocate also vector data memory
    if (copy_vector_) {
      char* data_vector_memory_new =
          (char*)mi_realloc(data_vector_memory_, new_max_elements * data_size_);
      if (data_vector_memory_new == nullptr)
        throw std::runtime_error("Not enough memory: resizeIndex failed to allocate vector memory");
      data_vector_memory_ = data_vector_memory_new;
    }

    // Reallocate all other layers
    char** linkLists_new = (char**)mi_realloc(linkLists_, sizeof(void*) * new_max_elements);
    if (linkLists_new == nullptr)
      throw std::runtime_error("Not enough memory: resizeIndex failed to allocate other layers");
    linkLists_ = linkLists_new;

    max_elements_ = new_max_elements;
  }

  size_t indexFileSize() const {
    size_t size = 0;
    size += sizeof(offsetLevel0_);
    size += sizeof(max_elements_);
    size += sizeof(cur_element_count);
    size += sizeof(size_data_per_element_);
    size += sizeof(label_offset_);
    size += sizeof(offsetData_);
    size += sizeof(maxlevel_);
    size += sizeof(enterpoint_node_);
    size += sizeof(maxM_);

    size += sizeof(maxM0_);
    size += sizeof(M_);
    size += sizeof(mult_);
    size += sizeof(ef_construction_);

    size += cur_element_count * size_data_per_element_;

    for (size_t i = 0; i < cur_element_count; i++) {
      unsigned int linkListSize =
          element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0;
      size += sizeof(linkListSize);
      size += linkListSize;
    }
    return size;
  }

  void saveIndex(const std::string& location) {
#if 0
        std::ofstream output(location, std::ios::binary);
        std::streampos position;

        writeBinaryPOD(output, offsetLevel0_);
        writeBinaryPOD(output, max_elements_);
        writeBinaryPOD(output, cur_element_count);
        writeBinaryPOD(output, size_data_per_element_);
        writeBinaryPOD(output, label_offset_);
        writeBinaryPOD(output, offsetData_);
        writeBinaryPOD(output, maxlevel_);
        writeBinaryPOD(output, enterpoint_node_);
        writeBinaryPOD(output, maxM_);

        writeBinaryPOD(output, maxM0_);
        writeBinaryPOD(output, M_);
        writeBinaryPOD(output, mult_);
        writeBinaryPOD(output, ef_construction_);
        writeBinaryPOD(output, copy_vector_);

        output.write(data_level0_memory_, cur_element_count * size_data_per_element_);

        if(copy_vector_) {
          output.write(data_vector_memory_, cur_element_count * data_size_);
        }

        for (size_t i = 0; i < cur_element_count; i++) {
            unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0;
            writeBinaryPOD(output, linkListSize);
            if (linkListSize)
                output.write(linkLists_[i], linkListSize);
        }
        output.close();
#endif
  }

  void loadIndex(const std::string& location, hnswlib::SpaceInterface<dist_t>* s,
                 size_t max_elements_i = 0) {
#if 0
        std::ifstream input(location, std::ios::binary);

        if (!input.is_open())
            throw std::runtime_error("Cannot open file");

        clear();
        // get file size:
        input.seekg(0, input.end);
        std::streampos total_filesize = input.tellg();
        input.seekg(0, input.beg);

        readBinaryPOD(input, offsetLevel0_);
        readBinaryPOD(input, max_elements_);
        readBinaryPOD(input, cur_element_count);

        size_t max_elements = max_elements_i;
        if (max_elements < cur_element_count)
            max_elements = max_elements_;
        max_elements_ = max_elements;
        readBinaryPOD(input, size_data_per_element_);
        readBinaryPOD(input, label_offset_);
        readBinaryPOD(input, offsetData_);
        readBinaryPOD(input, maxlevel_);
        readBinaryPOD(input, enterpoint_node_);

        readBinaryPOD(input, maxM_);
        readBinaryPOD(input, maxM0_);
        readBinaryPOD(input, M_);
        readBinaryPOD(input, mult_);
        readBinaryPOD(input, ef_construction_);

        readBinaryPOD(input, copy_vector_);

        data_size_ = s->get_data_size();
        fstdistfunc_ = s->get_dist_func();
        dist_func_param_ = s->get_dist_func_param();

        auto pos = input.tellg();

        /// Optional - check if index is ok:
        input.seekg(cur_element_count * size_data_per_element_, input.cur);
        for (size_t i = 0; i < cur_element_count; i++) {
            if (input.tellg() < 0 || input.tellg() >= total_filesize) {
                throw std::runtime_error("Index seems to be corrupted or unsupported");
            }

            unsigned int linkListSize;
            readBinaryPOD(input, linkListSize);
            if (linkListSize != 0) {
                input.seekg(linkListSize, input.cur);
            }
        }

        // throw exception if it either corrupted or old index
        if (input.tellg() != total_filesize)
            throw std::runtime_error("Index seems to be corrupted or unsupported");

        input.clear();
        /// Optional check end

        input.seekg(pos, input.beg);

        data_level0_memory_ = (char *) mi_malloc(max_elements * size_data_per_element_);
        if (data_level0_memory_ == nullptr)
            throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0");
        input.read(data_level0_memory_, cur_element_count * size_data_per_element_);

        if(copy_vector_) {
          data_vector_memory_ = (char *) mi_malloc(max_elements * data_size_);
          if (data_vector_memory_ == nullptr)
              throw std::runtime_error("Not enough memory: loadIndex failed to allocate vector memory");
          input.read(data_vector_memory_, cur_element_count * data_size_);
        }

        size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);

        size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
        std::vector<std::mutex>(max_elements).swap(link_list_locks_);
        std::vector<std::mutex>(MAX_LABEL_OPERATION_LOCKS).swap(label_op_locks_);

        visited_list_pool_.reset(new VisitedListPool(1, max_elements));

        linkLists_ = (char **) mi_malloc(sizeof(void *) * max_elements);
        if (linkLists_ == nullptr)
            throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists");
        element_levels_ = std::vector<int>(max_elements);
        revSize_ = 1.0 / mult_;
        ef_ = 10;
        for (size_t i = 0; i < cur_element_count; i++) {
            label_lookup_[getExternalLabel(i)] = i;
            unsigned int linkListSize;
            readBinaryPOD(input, linkListSize);
            if (linkListSize == 0) {
                element_levels_[i] = 0;
                linkLists_[i] = nullptr;
            } else {
                element_levels_[i] = linkListSize / size_links_per_element_;
                linkLists_[i] = (char *) mi_malloc(linkListSize);
                if (linkLists_[i] == nullptr)
                    throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist");
                input.read(linkLists_[i], linkListSize);
            }
        }

        for (size_t i = 0; i < cur_element_count; i++) {
            if (isMarkedDeleted(i)) {
                num_deleted_ += 1;
                if (allow_replace_deleted_) deleted_elements.insert(i);
            }
        }

        input.close();
#endif
  }

  template <typename data_t> std::vector<data_t> getDataByLabel(labeltype label) const {
    // lock all operations with element by label
    std::unique_lock<std::mutex> lock_label(getLabelOpMutex(label));

    std::unique_lock<std::mutex> lock_table(label_lookup_lock);
    auto search = label_lookup_.find(label);
    if (search == label_lookup_.end() || isMarkedDeleted(search->second)) {
      throw std::runtime_error("Label not found");
    }
    tableint internalId = search->second;
    lock_table.unlock();

    char* data_ptrv = getDataByInternalId(internalId);
    size_t dim = *((size_t*)dist_func_param_);
    std::vector<data_t> data;
    data_t* data_ptr = (data_t*)data_ptrv;
    for (size_t i = 0; i < dim; i++) {
      data.push_back(*data_ptr);
      data_ptr += 1;
    }
    return data;
  }

  /*
   * Marks an element with the given label deleted, does NOT really change the current graph.
   */
  HnswErrorStatus markDelete(labeltype label) {
    // lock all operations with element by label
    std::unique_lock<std::mutex> lock_label(getLabelOpMutex(label));

    std::unique_lock<std::mutex> lock_table(label_lookup_lock);
    auto search = label_lookup_.find(label);
    if (search == label_lookup_.end()) {
      return HnswErrorStatus::LABEL_NOT_FOUND;
    }
    tableint internalId = search->second;
    lock_table.unlock();
    if (!markDeletedInternal(internalId)) {
      return HnswErrorStatus::ELEMENT_ALREADY_DELETED;
    }
    return HnswErrorStatus::SUCCESS;
  }

  /*
   * Uses the last 16 bits of the memory for the linked list size to store the mark,
   * whereas maxM0_ has to be limited to the lower 16 bits, however, still large enough in almost
   * all cases.
   */
  bool markDeletedInternal(tableint internalId) {
    assert(internalId < cur_element_count);
    if (!isMarkedDeleted(internalId)) {
      unsigned char* ll_cur = ((unsigned char*)get_linklist0(internalId)) + 2;
      *ll_cur |= DELETE_MARK;
      num_deleted_ += 1;
      if (allow_replace_deleted_) {
        std::unique_lock<std::mutex> lock_deleted_elements(deleted_elements_lock);
        deleted_elements.insert(internalId);
      }
      return true;
    } else {
      return false;
    }
  }

  /*
   * Removes the deleted mark of the node, does NOT really change the current graph.
   *
   * Note: the method is not safe to use when replacement of deleted elements is enabled,
   *  because elements marked as deleted can be completely removed by addPoint
   */
  void unmarkDelete(labeltype label) {
    // lock all operations with element by label
    std::unique_lock<std::mutex> lock_label(getLabelOpMutex(label));

    std::unique_lock<std::mutex> lock_table(label_lookup_lock);
    auto search = label_lookup_.find(label);
    if (search == label_lookup_.end()) {
      throw std::runtime_error("Label not found");
    }
    tableint internalId = search->second;
    lock_table.unlock();

    unmarkDeletedInternal(internalId);
  }

  /*
   * Remove the deleted mark of the node.
   */
  void unmarkDeletedInternal(tableint internalId) {
    assert(internalId < cur_element_count);
    if (isMarkedDeleted(internalId)) {
      unsigned char* ll_cur = ((unsigned char*)get_linklist0(internalId)) + 2;
      *ll_cur &= ~DELETE_MARK;
      num_deleted_ -= 1;
      if (allow_replace_deleted_) {
        std::unique_lock<std::mutex> lock_deleted_elements(deleted_elements_lock);
        deleted_elements.erase(internalId);
      }
    } else {
      throw std::runtime_error("The requested to undelete element is not deleted");
    }
  }

  /*
   * Checks the first 16 bits of the memory to see if the element is marked deleted.
   */
  bool isMarkedDeleted(tableint internalId) const {
    unsigned char* ll_cur = ((unsigned char*)get_linklist0(internalId)) + 2;
    return *ll_cur & DELETE_MARK;
  }

  unsigned short int getListCount(linklistsizeint* ptr) const {
    return *((unsigned short int*)ptr);
  }

  void setListCount(linklistsizeint* ptr, unsigned short int size) const {
    *((unsigned short int*)(ptr)) = *((unsigned short int*)&size);
  }

  /*
   * Adds point. Updates the point if it is already in the index.
   * If replacement of deleted elements is enabled: replaces previously deleted point if any,
   * updating it with new point
   */
  void addPoint(const void* data_point, labeltype label, bool replace_deleted = false) {
    if ((allow_replace_deleted_ == false) && (replace_deleted == true)) {
      throw std::runtime_error("Replacement of deleted elements is disabled in constructor");
    }

    // lock all operations with element by label
    std::unique_lock<std::mutex> lock_label(getLabelOpMutex(label));
    if (!replace_deleted) {
      addPoint(data_point, label, -1);
      return;
    }
    // check if there is vacant place
    tableint internal_id_replaced;
    std::unique_lock<std::mutex> lock_deleted_elements(deleted_elements_lock);
    bool is_vacant_place = !deleted_elements.empty();
    if (is_vacant_place) {
      internal_id_replaced = *deleted_elements.begin();
      deleted_elements.erase(internal_id_replaced);
    }
    lock_deleted_elements.unlock();

    // if there is no vacant place then add or update point
    // else add point to vacant place
    if (!is_vacant_place) {
      addPoint(data_point, label, -1);
    } else {
      // we assume that there are no concurrent operations on deleted element
      labeltype label_replaced = getExternalLabel(internal_id_replaced);
      setExternalLabel(internal_id_replaced, label);

      std::unique_lock<std::mutex> lock_table(label_lookup_lock);
      label_lookup_.erase(label_replaced);
      label_lookup_[label] = internal_id_replaced;
      lock_table.unlock();

      unmarkDeletedInternal(internal_id_replaced);
      updatePoint(data_point, internal_id_replaced, 1.0);
    }
  }

  void updatePoint(const void* dataPointIn, tableint internalId, float updateNeighborProbability) {
    if (copy_vector_) {
      memcpy(getDataByInternalId(internalId), dataPointIn, data_size_);
    } else {
      memcpy(getDataPtrByInternalId(internalId), &dataPointIn, sizeof(void*));
    }

    const void* dataPoint = getDataByInternalId(internalId);
    assert(dataPoint != nullptr);

    int maxLevelCopy = maxlevel_;
    tableint entryPointCopy = enterpoint_node_;
    // If point to be updated is entry point and graph just contains single element then just
    // return.
    if (entryPointCopy == internalId && cur_element_count == 1)
      return;

    int elemLevel = element_levels_[internalId];
    std::uniform_real_distribution<float> distribution(0.0, 1.0);
    for (int layer = 0; layer <= elemLevel; layer++) {
      std::unordered_set<tableint> sCand;
      std::unordered_set<tableint> sNeigh;
      std::vector<tableint> listOneHop = getConnectionsWithLock(internalId, layer);
      if (listOneHop.size() == 0)
        continue;

      sCand.insert(internalId);

      for (auto&& elOneHop : listOneHop) {
        sCand.insert(elOneHop);

        if (distribution(update_probability_generator_) > updateNeighborProbability)
          continue;

        sNeigh.insert(elOneHop);

        std::vector<tableint> listTwoHop = getConnectionsWithLock(elOneHop, layer);
        for (auto&& elTwoHop : listTwoHop) {
          sCand.insert(elTwoHop);
        }
      }

      for (auto&& neigh : sNeigh) {
        // if (neigh == internalId)
        //     continue;

        std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                            CompareByFirst>
            candidates;
        size_t size = sCand.find(neigh) == sCand.end()
                          ? sCand.size()
                          : sCand.size() - 1;  // sCand guaranteed to have size >= 1
        size_t elementsToKeep = std::min(ef_construction_, size);
        for (auto&& cand : sCand) {
          if (cand == neigh)
            continue;

          dist_t distance =
              fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_);
          if (candidates.size() < elementsToKeep) {
            candidates.emplace(distance, cand);
          } else {
            if (distance < candidates.top().first) {
              candidates.pop();
              candidates.emplace(distance, cand);
            }
          }
        }

        // Retrieve neighbours using heuristic and set connections.
        getNeighborsByHeuristic2(candidates, layer == 0 ? maxM0_ : maxM_);

        {
          std::unique_lock<std::mutex> lock(link_list_locks_[neigh]);
          linklistsizeint* ll_cur;
          ll_cur = get_linklist_at_level(neigh, layer);
          size_t candSize = candidates.size();
          setListCount(ll_cur, candSize);
          tableint* data = (tableint*)(ll_cur + 1);
          for (size_t idx = 0; idx < candSize; idx++) {
            data[idx] = candidates.top().second;
            candidates.pop();
          }
        }
      }
    }

    repairConnectionsForUpdate(dataPoint, entryPointCopy, internalId, elemLevel, maxLevelCopy);
  }

  void repairConnectionsForUpdate(const void* dataPoint, tableint entryPointInternalId,
                                  tableint dataPointInternalId, int dataPointLevel, int maxLevel) {
    tableint currObj = entryPointInternalId;
    if (dataPointLevel < maxLevel) {
      dist_t curdist = fstdistfunc_(dataPoint, getDataByInternalId(currObj), dist_func_param_);
      for (int level = maxLevel; level > dataPointLevel; level--) {
        bool changed = true;
        while (changed) {
          changed = false;
          unsigned int* data;
          std::unique_lock<std::mutex> lock(link_list_locks_[currObj]);
          data = get_linklist_at_level(currObj, level);
          int size = getListCount(data);
          tableint* datal = (tableint*)(data + 1);

          __builtin_prefetch(getDataByInternalId(*datal), 0, 3);

          for (int i = 0; i < size; i++) {
            if (i + 1 < size) {
              __builtin_prefetch(getDataByInternalId(*(datal + i + 1)), 1, 3);
            }

            tableint cand = datal[i];
            dist_t d = fstdistfunc_(dataPoint, getDataByInternalId(cand), dist_func_param_);
            if (d < curdist) {
              curdist = d;
              currObj = cand;
              changed = true;
            }
          }
        }
      }
    }

    if (dataPointLevel > maxLevel)
      throw std::runtime_error("Level of item to be updated cannot be bigger than max level");

    for (int level = dataPointLevel; level >= 0; level--) {
      std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                          CompareByFirst>
          topCandidates = searchBaseLayer(currObj, dataPoint, level);

      std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                          CompareByFirst>
          filteredTopCandidates;
      while (topCandidates.size() > 0) {
        if (topCandidates.top().second != dataPointInternalId)
          filteredTopCandidates.push(topCandidates.top());

        topCandidates.pop();
      }

      // Since element_levels_ is being used to get `dataPointLevel`, there could be cases where
      // `topCandidates` could just contains entry point itself. To prevent self loops, the
      // `topCandidates` is filtered and thus can be empty.
      if (filteredTopCandidates.size() > 0) {
        bool epDeleted = isMarkedDeleted(entryPointInternalId);
        if (epDeleted) {
          filteredTopCandidates.emplace(
              fstdistfunc_(dataPoint, getDataByInternalId(entryPointInternalId), dist_func_param_),
              entryPointInternalId);
          if (filteredTopCandidates.size() > ef_construction_)
            filteredTopCandidates.pop();
        }

        currObj = mutuallyConnectNewElement(dataPoint, dataPointInternalId, filteredTopCandidates,
                                            level, true);
      }
    }
  }

  std::vector<tableint> getConnectionsWithLock(tableint internalId, int level) {
    std::unique_lock<std::mutex> lock(link_list_locks_[internalId]);
    unsigned int* data = get_linklist_at_level(internalId, level);
    int size = getListCount(data);
    std::vector<tableint> result(size);
    tableint* ll = (tableint*)(data + 1);
    memcpy(result.data(), ll, size * sizeof(tableint));
    return result;
  }

  tableint addPoint(const void* data_point_in, labeltype label, int level) {
    tableint cur_c = 0;
    {
      // Checking if the element with the same label already exists
      // if so, updating it *instead* of creating a new element.
      std::unique_lock<std::mutex> lock_table(label_lookup_lock);
      auto search = label_lookup_.find(label);
      if (search != label_lookup_.end()) {
        tableint existingInternalId = search->second;
        if (allow_replace_deleted_) {
          if (isMarkedDeleted(existingInternalId)) {
            throw std::runtime_error(
                "Can't use addPoint to update deleted elements if replacement of deleted elements "
                "is enabled.");
          }
        }
        lock_table.unlock();

        if (isMarkedDeleted(existingInternalId)) {
          unmarkDeletedInternal(existingInternalId);
        }
        updatePoint(data_point_in, existingInternalId, 1.0);

        return existingInternalId;
      }

      if (cur_element_count >= max_elements_) {
        throw std::runtime_error("The number of elements exceeds the specified limit");
      }

      cur_c = cur_element_count;
      cur_element_count++;
      label_lookup_[label] = cur_c;
    }

    std::unique_lock<std::mutex> lock_el(link_list_locks_[cur_c]);
    int curlevel = getRandomLevel(mult_);
    if (level > 0)
      curlevel = level;

    element_levels_[cur_c] = curlevel;

    std::unique_lock<std::mutex> templock(global);
    int maxlevelcopy = maxlevel_;
    if (curlevel <= maxlevelcopy)
      templock.unlock();
    tableint currObj = enterpoint_node_;
    tableint enterpoint_copy = enterpoint_node_;

    memset(data_level0_memory_ + cur_c * size_data_per_element_ + offsetLevel0_, 0,
           size_data_per_element_);

    if (copy_vector_) {
      memset(data_vector_memory_ + cur_c * data_size_, 0, data_size_);
    }

    // Initialisation of the data and label
    setExternalLabel(cur_c, label);

    if (copy_vector_) {
      memcpy(getDataByInternalId(cur_c), data_point_in, data_size_);
    } else {
      memcpy(getDataPtrByInternalId(cur_c), &data_point_in, sizeof(void*));
    }

    const void* data_point = getDataByInternalId(cur_c);
    assert(data_point != nullptr);

    if (curlevel) {
      linkLists_[cur_c] = (char*)mi_malloc(size_links_per_element_ * curlevel + 1);
      if (linkLists_[cur_c] == nullptr)
        throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist");
      memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1);
    }

    if ((signed)currObj != -1) {
      if (curlevel < maxlevelcopy) {
        dist_t curdist = fstdistfunc_(data_point, getDataByInternalId(currObj), dist_func_param_);
        for (int level = maxlevelcopy; level > curlevel; level--) {
          bool changed = true;
          while (changed) {
            changed = false;
            unsigned int* data;
            std::unique_lock<std::mutex> lock(link_list_locks_[currObj]);
            data = get_linklist(currObj, level);
            int size = getListCount(data);

            tableint* datal = (tableint*)(data + 1);
            for (int i = 0; i < size; i++) {
              tableint cand = datal[i];
              if (cand > max_elements_)
                throw std::runtime_error("cand error");
              dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
              if (d < curdist) {
                curdist = d;
                currObj = cand;
                changed = true;
              }
            }
          }
        }
      }

      bool epDeleted = isMarkedDeleted(enterpoint_copy);
      for (int level = std::min(curlevel, maxlevelcopy); level >= 0; level--) {
        if (level > maxlevelcopy || level < 0)  // possible?
          throw std::runtime_error("Level error");

        std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                            CompareByFirst>
            top_candidates = searchBaseLayer(currObj, data_point, level);
        if (epDeleted) {
          top_candidates.emplace(
              fstdistfunc_(data_point, getDataByInternalId(enterpoint_copy), dist_func_param_),
              enterpoint_copy);
          if (top_candidates.size() > ef_construction_)
            top_candidates.pop();
        }
        currObj = mutuallyConnectNewElement(data_point, cur_c, top_candidates, level, false);
      }
    } else {
      // Do nothing for the first element
      enterpoint_node_ = 0;
      maxlevel_ = curlevel;
    }

    // Releasing lock for the maximum level
    if (curlevel > maxlevelcopy) {
      enterpoint_node_ = cur_c;
      maxlevel_ = curlevel;
    }
    return cur_c;
  }

  std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(
      const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const {
    std::priority_queue<std::pair<dist_t, labeltype>> result;
    if (cur_element_count == 0)
      return result;

    tableint currObj = enterpoint_node_;
    dist_t curdist =
        fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);

    for (int level = maxlevel_; level > 0; level--) {
      bool changed = true;
      while (changed) {
        changed = false;
        unsigned int* data;

        data = (unsigned int*)get_linklist(currObj, level);
        int size = getListCount(data);
        metric_hops++;
        metric_distance_computations += size;

        tableint* datal = (tableint*)(data + 1);
        for (int i = 0; i < size; i++) {
          tableint cand = datal[i];
          if (cand > max_elements_)
            throw std::runtime_error("cand error");
          dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);

          if (d < curdist) {
            curdist = d;
            currObj = cand;
            changed = true;
          }
        }
      }
    }

    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        top_candidates;
    bool bare_bone_search = !num_deleted_ && !isIdAllowed;
    if (bare_bone_search) {
      top_candidates = searchBaseLayerST<true>(currObj, query_data, std::max(ef_, k), isIdAllowed);
    } else {
      top_candidates = searchBaseLayerST<false>(currObj, query_data, std::max(ef_, k), isIdAllowed);
    }

    while (top_candidates.size() > k) {
      top_candidates.pop();
    }
    while (top_candidates.size() > 0) {
      std::pair<dist_t, tableint> rez = top_candidates.top();
      result.push(std::pair<dist_t, labeltype>(rez.first, getExternalLabel(rez.second)));
      top_candidates.pop();
    }
    return result;
  }

  // Brute-force KNN search over a pre-filtered set of label IDs.
  // Computes distances for all provided IDs and returns the top-k closest, ordered by distance.
  std::priority_queue<std::pair<dist_t, labeltype>> subsetKnnSearch(
      const void* query_data, size_t k, const std::vector<labeltype>& ids) const {
    std::priority_queue<std::pair<dist_t, labeltype>> result;

    if (cur_element_count == 0 || ids.empty() || k == 0)
      return result;

    for (const auto& label : ids) {
      auto it = label_lookup_.find(label);

      if (it == label_lookup_.end()) {
        continue;
      }

      tableint internal_id = it->second;

      if (isMarkedDeleted(internal_id)) {
        continue;
      }

      dist_t dist = fstdistfunc_(query_data, getDataByInternalId(internal_id), dist_func_param_);
      if (result.size() < k) {
        result.emplace(dist, label);
      } else if (dist < result.top().first) {
        result.pop();
        result.emplace(dist, label);
      }
    }

    return result;
  }

  std::vector<std::pair<dist_t, labeltype>> searchStopConditionClosest(
      const void* query_data, BaseSearchStopCondition& stop_condition,
      BaseFilterFunctor* isIdAllowed = nullptr) const {
    std::vector<std::pair<dist_t, labeltype>> result;
    if (cur_element_count == 0)
      return result;

    tableint currObj = enterpoint_node_;
    dist_t curdist =
        fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);

    for (int level = maxlevel_; level > 0; level--) {
      bool changed = true;
      while (changed) {
        changed = false;
        unsigned int* data;

        data = (unsigned int*)get_linklist(currObj, level);
        int size = getListCount(data);
        metric_hops++;
        metric_distance_computations += size;

        tableint* datal = (tableint*)(data + 1);
        for (int i = 0; i < size; i++) {
          tableint cand = datal[i];
          if (cand < 0 || cand > max_elements_)
            throw std::runtime_error("cand error");
          dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);

          if (d < curdist) {
            curdist = d;
            currObj = cand;
            changed = true;
          }
        }
      }
    }

    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        top_candidates;
    top_candidates = searchBaseLayerST<false>(currObj, query_data, 0, isIdAllowed, &stop_condition);

    size_t sz = top_candidates.size();
    result.resize(sz);
    while (!top_candidates.empty()) {
      result[--sz] = top_candidates.top();
      top_candidates.pop();
    }

    stop_condition.filter_results(result);

    return result;
  }

  // Returns all elements within `radius` distance from query_data.
  // Adapts the HNSW beam search from Malkov & Yashunin (2018), https://arxiv.org/abs/1603.09320:
  // Phase 1 is the standard greedy descent to find the level-0 entry point; Phase 2 replaces
  // the top-k heap with a radius threshold, collecting all nodes with dist <= radius.
  // The dynamic search boundary starts at max(entry_point_distance, radius) and shrinks as
  // closer out-of-radius candidates are found; `epsilon` controls the overscan factor
  // (default 0.01) to improve recall near the boundary.
  std::vector<std::pair<dist_t, labeltype>> searchRange(const void* query_data, dist_t radius,
                                                        double epsilon = 0.01) const {
    std::vector<std::pair<dist_t, labeltype>> result;
    if (cur_element_count == 0)
      return result;

    // Phase 1: greedy descent from top level to find the best entry point for level 0.
    tableint currObj = enterpoint_node_;
    dist_t curdist =
        fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
    for (int level = maxlevel_; level > 0; level--) {
      bool changed = true;
      while (changed) {
        changed = false;
        unsigned int* data = (unsigned int*)get_linklist(currObj, level);
        int size = getListCount(data);
        tableint* datal = (tableint*)(data + 1);
        for (int i = 0; i < size; i++) {
          tableint cand = datal[i];
          if (cand >= max_elements_)
            throw std::runtime_error("cand error");
          dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
          if (d < curdist) {
            curdist = d;
            currObj = cand;
            changed = true;
          }
        }
      }
    }

    // Phase 2: range search on bottom layer (level 0) with dynamic search boundary.
    VisitedList* vl = visited_list_pool_->getFreeVisitedList();
    vl_type* visited_array = vl->mass;
    vl_type visited_array_tag = vl->curV;

    std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>,
                        CompareByFirst>
        candidate_set;

    // Dynamic range starts at max(entry_point_dist, radius) so we never stop early just
    // because the entry point is farther than radius.
    dist_t ep_dist = curdist;
    dist_t dynamic_range = std::max(ep_dist, radius);
    dist_t dyn_boundary = static_cast<dist_t>(dynamic_range * (1.0 + epsilon));

    if (!isMarkedDeleted(currObj) && ep_dist <= radius)
      result.emplace_back(ep_dist, getExternalLabel(currObj));

    candidate_set.emplace(-ep_dist, currObj);
    visited_array[currObj] = visited_array_tag;

    while (!candidate_set.empty()) {
      auto curr_pair = candidate_set.top();
      dist_t curr_dist = -curr_pair.first;

      if (curr_dist > dyn_boundary)
        break;

      candidate_set.pop();
      tableint curr_id = curr_pair.second;

      // Shrink dynamic_range: if candidate is between radius and current range, pull the
      // boundary down toward radius. If candidate is within radius and dynamic_range is
      // still above radius (entry point was far), clamp to radius so we stop over-scanning.
      if (curr_dist < dynamic_range) {
        if (curr_dist >= radius) {
          dynamic_range = curr_dist;
        } else if (dynamic_range > radius) {
          dynamic_range = radius;
        }
        dyn_boundary = static_cast<dist_t>(dynamic_range * (1.0 + epsilon));
      }

      int* data = (int*)get_linklist0(curr_id);
      size_t size = getListCount((linklistsizeint*)data);

      for (size_t j = 1; j <= size; j++) {
        tableint candidate_id = *(data + j);
        if (candidate_id >= max_elements_)
          throw std::runtime_error("cand error");

        if (j < size)
          __builtin_prefetch(getDataByInternalId(*(data + j + 1)), 0, 3);

        if (visited_array[candidate_id] == visited_array_tag)
          continue;
        visited_array[candidate_id] = visited_array_tag;

        dist_t d = fstdistfunc_(query_data, getDataByInternalId(candidate_id), dist_func_param_);
        if (d < dyn_boundary) {
          candidate_set.emplace(-d, candidate_id);
          if (!isMarkedDeleted(candidate_id) && d <= radius)
            result.emplace_back(d, getExternalLabel(candidate_id));
        }
      }
    }

    visited_list_pool_->releaseVisitedList(vl);
    return result;
  }

#if 0
    void checkIntegrity() {
        int connections_checked = 0;
        std::vector <int > inbound_connections_num(cur_element_count, 0);
        for (int i = 0; i < cur_element_count; i++) {
            for (int l = 0; l <= element_levels_[i]; l++) {
                linklistsizeint *ll_cur = get_linklist_at_level(i, l);
                int size = getListCount(ll_cur);
                tableint *data = (tableint *) (ll_cur + 1);
                std::unordered_set<tableint> s;
                for (int j = 0; j < size; j++) {
                    assert(data[j] < cur_element_count);
                    assert(data[j] != i);
                    inbound_connections_num[data[j]]++;
                    s.insert(data[j]);
                    connections_checked++;
                }
                assert(s.size() == size);
            }
        }
        if (cur_element_count > 1) {
            int min1 = inbound_connections_num[0], max1 = inbound_connections_num[0];
            for (int i=0; i < cur_element_count; i++) {
                assert(inbound_connections_num[i] > 0);
                min1 = std::min(inbound_connections_num[i], min1);
                max1 = std::max(inbound_connections_num[i], max1);
            }
            std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n";
        }
        std::cout << "integrity ok, checked " << connections_checked << " connections\n";
    }
#endif
};

}  // namespace dfly::search


================================================
FILE: src/core/search/hnsw_index.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/hnsw_index.h"

#include <absl/container/flat_hash_map.h>
#include <absl/strings/match.h>
#include <hnswlib/hnswlib.h>
#include <hnswlib/space_ip.h>
#include <hnswlib/space_l2.h>

#include "base/logging.h"
#include "core/search/hnsw_alg.h"
#include "core/search/mrmw_mutex.h"
#include "core/search/vector_utils.h"

namespace dfly::search {

using namespace std;

namespace {

class HnswSpace : public hnswlib::SpaceInterface<float> {
  unsigned dim_;
  VectorSimilarity sim_;

  static float L2DistanceStatic(const void* pVect1, const void* pVect2, const void* param) {
    return L2Distance(static_cast<const float*>(pVect1), static_cast<const float*>(pVect2),
                      *static_cast<const unsigned*>(param));
  }

  static float IPDistanceStatic(const void* pVect1, const void* pVect2, const void* param) {
    return IPDistance(static_cast<const float*>(pVect1), static_cast<const float*>(pVect2),
                      *static_cast<const unsigned*>(param));
  }

  static float CosineDistanceStatic(const void* pVect1, const void* pVect2, const void* param) {
    return CosineDistance(static_cast<const float*>(pVect1), static_cast<const float*>(pVect2),
                          *static_cast<const unsigned*>(param));
  }

 public:
  explicit HnswSpace(size_t dim, VectorSimilarity sim) : dim_(dim), sim_(sim) {
  }

  size_t get_data_size() {
    return dim_ * sizeof(float);
  }

  hnswlib::DISTFUNC<float> get_dist_func() {
    if (sim_ == VectorSimilarity::L2) {
      return L2DistanceStatic;
    } else if (sim_ == VectorSimilarity::COSINE) {
      return CosineDistanceStatic;
    } else {
      return IPDistanceStatic;
    }
  }

  void* get_dist_func_param() {
    return &dim_;
  }
};
}  // namespace

// TODO: to replace it and use HierarchicalNSW directly.
struct HnswlibAdapter {
  // Default setting of hnswlib/hnswalg
  constexpr static size_t kDefaultEfRuntime = 10;

  explicit HnswlibAdapter(const SchemaField::VectorParams& params, bool copy_vector)
      : space_{params.dim, params.sim},
        world_{&space_,       params.capacity, params.hnsw_m, params.hnsw_ef_construction,
               100 /* seed*/, copy_vector},
        copy_vector_{copy_vector},
        data_size_{params.dim * sizeof(float)} {
  }

  // Adds a point to the index. If the write lock cannot be acquired (e.g.
  // serialization holds a read lock), the operation is deferred and will be
  // replayed by a subsequent write or TryProcessDeferred() call.
  // When copy_vector_ is false the index stores a raw pointer to external data,
  // so we must add the point synchronously before the caller's pointer goes out
  // of scope — use a blocking write lock in that case.
  void Add(const void* data, GlobalDocId id) {
    if (copy_vector_) {
      {
        MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock, std::try_to_lock);
        if (lock.locked()) {
          ProcessDeferred();
          DoAdd(data, id);
          return;
        }
      }
      // Could not acquire write lock — defer the operation.
      AddDeferredOp(id, DeferredOp(true, data, data_size_, /*copy=*/true));
      TryProcessDeferred();
    } else {
      MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock);
      ProcessDeferred();
      DoAdd(data, id);
    }
  }

  // Removes a point from the index. If the write lock cannot be acquired, the
  // operation is deferred.
  void Remove(GlobalDocId id) {
    {
      MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock, std::try_to_lock);
      if (lock.locked()) {
        ProcessDeferred();
        DoRemove(id);
        return;
      }
    }
    AddDeferredOp(id, DeferredOp(false, nullptr, 0, false));
    TryProcessDeferred();
  }

  vector<pair<float, GlobalDocId>> Knn(float* target, size_t k, std::optional<size_t> ef) {
    TryProcessDeferred();
    world_.setEf(ef.value_or(kDefaultEfRuntime));
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    return QueueToVec(world_.searchKnn(target, k));
  }

  vector<pair<float, GlobalDocId>> Knn(float* target, size_t k, std::optional<size_t> ef,
                                       const vector<GlobalDocId>& allowed) {
    struct BinsearchFilter : hnswlib::BaseFilterFunctor {
      virtual bool operator()(hnswlib::labeltype id) {
        return binary_search(allowed->begin(), allowed->end(), id);
      }

      BinsearchFilter(const vector<GlobalDocId>* allowed) : allowed{allowed} {
      }
      const vector<GlobalDocId>* allowed;
    };

    TryProcessDeferred();
    world_.setEf(ef.value_or(kDefaultEfRuntime));
    BinsearchFilter filter{&allowed};
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    return QueueToVec(world_.searchKnn(target, k, &filter));
  }

  // Brute-force KNN search over a specific subset of documents.
  // Computes distances for all provided document IDs and returns the k nearest neighbors.
  vector<pair<float, GlobalDocId>> SubsetKnn(float* target, size_t k,
                                             const vector<GlobalDocId>& docs) {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    return QueueToVec(world_.subsetKnnSearch(target, k, docs));
  }

  // Returns all documents within the given radius, with their distances.
  // Uses dynamic-range exploration (searchRange) to correctly handle cases where
  // the entry point is farther than radius.
  vector<pair<float, GlobalDocId>> RangeSearch(float* target, float radius) {
    TryProcessDeferred();
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    return world_.searchRange(target, radius);
  }

  HnswIndexMetadata GetMetadata() const {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    HnswIndexMetadata metadata;
    metadata.max_elements = world_.max_elements_;
    metadata.cur_element_count = world_.cur_element_count.load();
    metadata.maxlevel = world_.maxlevel_;
    metadata.enterpoint_node = world_.enterpoint_node_;
    return metadata;
  }

  void SetMetadata(const HnswIndexMetadata& metadata) {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock);
    absl::WriterMutexLock resize_lock(&resize_mutex_);

    // SetMetadata is only called during deserialization before the index is used.
    // Assert the index is empty to ensure no concurrent operations are possible.
    DCHECK_EQ(world_.cur_element_count.load(), 0u)
        << "SetMetadata should only be called on an empty index during deserialization";

    // Runtime check for release builds to prevent silent corruption
    if (world_.cur_element_count.load() != 0) {
      LOG(ERROR) << "SetMetadata called on non-empty HNSW index with "
                 << world_.cur_element_count.load() << " elements, ignoring";
      return;
    }

    // Pre-allocate capacity based on expected element count, but don't set cur_element_count.
    // cur_element_count will be set by RestoreFromNodes when the actual nodes are restored.
    if (world_.max_elements_ < metadata.cur_element_count) {
      world_.resizeIndex(metadata.cur_element_count);
    }
    // Note: Don't set cur_element_count here - RestoreFromNodes will set it after restoring nodes.
  }

  size_t GetNodeCount() const {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
    return world_.cur_element_count.load();
  }

  std::vector<HnswNodeData> GetNodesRange(size_t start, size_t end) const {
    DCHECK(mrmw_mutex_.IsReadLocked());
    size_t count = world_.cur_element_count.load();
    end = std::min(end, count);
    start = std::min(start, end);

    std::vector<HnswNodeData> result;
    result.reserve(end - start);

    for (size_t internal_id = start; internal_id < end; ++internal_id) {
      HnswNodeData node_data;
      node_data.internal_id = internal_id;
      node_data.global_id = world_.getExternalLabel(internal_id);
      node_data.level = world_.element_levels_[internal_id];

      node_data.levels_links.resize(node_data.level + 1);

      auto* ll0 = world_.get_linklist0(internal_id);
      unsigned short link_count0 = world_.getListCount(ll0);
      auto* links0 = reinterpret_cast<uint32_t*>(ll0 + 1);
      node_data.levels_links[0].assign(links0, links0 + link_count0);

      for (int lvl = 1; lvl <= node_data.level; ++lvl) {
        auto* ll = world_.get_linklist(internal_id, lvl);
        unsigned short link_count = world_.getListCount(ll);
        auto* links = reinterpret_cast<uint32_t*>(ll + 1);
        node_data.levels_links[lvl].assign(links, links + link_count);
      }

      result.push_back(std::move(node_data));
    }
    return result;
  }

 private:
  // A single deferred Add or Remove operation.
  struct DeferredOp {
    bool is_add;
    bool owns_data;        // If true, data_ptr was allocated by us and must be freed.
    const void* data_ptr;  // Pointer to vector data (owned or borrowed).

    DeferredOp(bool is_add, const void* data, size_t data_size, bool copy)
        : is_add(is_add), owns_data(copy && data != nullptr) {
      if (owns_data) {
        void* buf = mi_malloc(data_size);
        memcpy(buf, data, data_size);
        data_ptr = buf;
      } else {
        data_ptr = data;
      }
    }

    ~DeferredOp() {
      if (owns_data)
        mi_free(const_cast<void*>(data_ptr));
    }

    DeferredOp(DeferredOp&& o) noexcept
        : is_add(o.is_add), owns_data(o.owns_data), data_ptr(o.data_ptr) {
      o.owns_data = false;
      o.data_ptr = nullptr;
    }

    DeferredOp& operator=(DeferredOp&& o) noexcept {
      auto lhs = std::tie(is_add, owns_data, data_ptr);
      auto rhs = std::tie(o.is_add, o.owns_data, o.data_ptr);
      std::swap(lhs, rhs);
      return *this;
    }

    DeferredOp(const DeferredOp&) = delete;
    DeferredOp& operator=(const DeferredOp&) = delete;
  };

  // Actually add the point. Must be called while holding mrmw write lock.
  void DoAdd(const void* data, GlobalDocId id) {
    while (true) {
      try {
        absl::ReaderMutexLock resize_lock(&resize_mutex_);
        world_.addPoint(data, id);
        return;
      } catch (const std::exception& e) {
        std::string error_msg = e.what();
        if (absl::StrContains(error_msg, "The number of elements exceeds the specified limit")) {
          ResizeIfFull();
          continue;
        }
        LOG(ERROR) << "HnswlibAdapter::DoAdd exception: " << e.what();
        return;
      }
    }
  }

  void DoRemove(GlobalDocId id) {
    HnswErrorStatus status = world_.markDelete(id);
    if (status != HnswErrorStatus::SUCCESS) {
      VLOG(1) << "HnswlibAdapter::Remove failed with status: " << static_cast<int>(status)
              << " for global id: " << id;
    }
  }

  // Add a deferred operation, replacing any previous one for the same document.
  void AddDeferredOp(GlobalDocId id, DeferredOp op) {
    std::lock_guard g(deferred_mu_);
    deferred_ops_.insert_or_assign(id, std::move(op));
  }

  // Take all deferred operations out of the queue.
  absl::flat_hash_map<GlobalDocId, DeferredOp> TakeDeferredOps() {
    std::lock_guard g(deferred_mu_);
    absl::flat_hash_map<GlobalDocId, DeferredOp> ops;
    ops.swap(deferred_ops_);
    return ops;
  }

  // Drain the deferred operations queue. Must be called while holding the mrmw
  // write lock.  Only copy_vector_=true adds and removes can be deferred, so
  // ordering within the queue does not matter.
  void ProcessDeferred() {
    auto ops = TakeDeferredOps();
    for (auto& [id, op] : ops) {
      if (op.is_add) {
        DoAdd(op.data_ptr, id);
      } else {
        DoRemove(id);
      }
    }
  }

  // Non-blocking attempt to drain the deferred queue.
  void TryProcessDeferred() {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock, std::try_to_lock);
    if (lock.locked()) {
      ProcessDeferred();
    }
  }

  // Function requires that we hold mutex while resizing index. resizeIndex is not thread safe with
  // insertion (https://github.com/nmslib/hnswlib/issues/267)
  void ResizeIfFull() {
    {
      // First check with reader lock to avoid contention.
      absl::ReaderMutexLock lock(&resize_mutex_);
      if (world_.getCurrentElementCount() < world_.getMaxElements() ||
          (world_.allow_replace_deleted_ && world_.getDeletedCount() > 0)) {
        return;
      }
    }
    try {
      // Upgrade to writer lock.
      absl::WriterMutexLock lock(&resize_mutex_);
      if (world_.getCurrentElementCount() == world_.getMaxElements() &&
          (!world_.allow_replace_deleted_ || world_.getDeletedCount() == 0)) {
        auto max_elements = world_.getMaxElements();
        world_.resizeIndex(max_elements * 2);
        VLOG(1) << "Resizing HNSW Index from " << max_elements << " to " << max_elements * 2;
      }
    } catch (const std::exception& e) {
      LOG(FATAL) << "HnswlibAdapter::ResizeIfFull exception: " << e.what();
    }
  }

  template <typename Q> static vector<pair<float, GlobalDocId>> QueueToVec(Q queue) {
    vector<pair<float, GlobalDocId>> out(queue.size());
    size_t idx = out.size();
    while (!queue.empty()) {
      out[--idx] = queue.top();
      queue.pop();
    }
    return out;
  }

 public:
  // Restore HNSW graph structure from serialized nodes with metadata
  void RestoreFromNodes(const std::vector<HnswNodeData>& nodes, const HnswIndexMetadata& metadata) {
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock);
    absl::WriterMutexLock resize_lock(&resize_mutex_);

    if (nodes.empty()) {
      return;
    }

    // RestoreFromNodes is only called during deserialization on a freshly created index.
    // Assert the index is empty to prevent memory leaks from double-allocation of linkLists_.
    DCHECK_EQ(world_.cur_element_count.load(), 0u)
        << "RestoreFromNodes should only be called on an empty index during deserialization";

    // Ensure we have enough capacity.
    // Metadata may have been captured before the snapshot read-lock, so
    // cur_element_count can be smaller than actual node internal_ids when
    // concurrent writes happen.  Compute the real requirement from nodes.
    size_t max_internal_id = 0;
    for (const auto& node : nodes) {
      max_internal_id = std::max<size_t>(max_internal_id, node.internal_id);
    }
    size_t required_capacity = std::max(metadata.cur_element_count, max_internal_id + 1);
    if (world_.max_elements_ < required_capacity) {
      world_.resizeIndex(required_capacity);
    }

    // Restore each node - directly set up memory and fields
    size_t restored_count = 0;

    for (const auto& node : nodes) {
      size_t internal_id = node.internal_id;

      // Validate internal_id is within bounds - invalid internal_id indicates corrupted data
      CHECK(internal_id < world_.max_elements_);

      // Register label in lookup table
      world_.label_lookup_[node.global_id] = internal_id;

      // Set the level
      world_.element_levels_[internal_id] = node.level;

      // Clear level 0 memory and set label.
      // Memory layout: each element occupies size_data_per_element_ bytes starting at
      // data_level0_memory_ + internal_id * size_data_per_element_.
      // offsetLevel0_ is always 0, so we clear exactly one element's worth of data.
      // This matches the pattern in hnswlib's addPoint().
      memset(world_.data_level0_memory_ + internal_id * world_.size_data_per_element_, 0,
             world_.size_data_per_element_);
      world_.setExternalLabel(internal_id, node.global_id);

      // In copy mode, zero the vector memory so distance computations don't use
      // uninitialized data for nodes that are marked deleted.
      if (world_.copy_vector_) {
        char* data_ptr = world_.data_vector_memory_ + internal_id * world_.data_size_;
        memset(data_ptr, 0, world_.data_size_);
      }

      // Allocate upper layer links if needed
      if (node.level > 0) {
        world_.linkLists_[internal_id] =
            (char*)mi_malloc(world_.size_links_per_element_ * node.level + 1);
        memset(world_.linkLists_[internal_id], 0, world_.size_links_per_element_ * node.level + 1);
      }

      // Restore links for layer 0
      if (!node.levels_links.empty()) {
        auto* ll0 = world_.get_linklist0(internal_id);
        world_.setListCount(ll0, node.levels_links[0].size());
        auto* links0 = reinterpret_cast<uint32_t*>(ll0 + 1);
        std::copy(node.levels_links[0].begin(), node.levels_links[0].end(), links0);
      }

      // Restore links for upper layers
      for (int lvl = 1; lvl <= node.level && lvl < static_cast<int>(node.levels_links.size());
           ++lvl) {
        auto* ll = world_.get_linklist(internal_id, lvl);
        world_.setListCount(ll, node.levels_links[lvl].size());
        auto* links = reinterpret_cast<uint32_t*>(ll + 1);
        std::copy(node.levels_links[lvl].begin(), node.levels_links[lvl].end(), links);
      }

      // Track restored count so markDeletedInternal can validate internal_id bounds.
      world_.cur_element_count.store(++restored_count);

      // Mark node as deleted until UpdateVectorData provides valid vector data.
      // This prevents crashes from dereferencing uninitialised data pointers
      // (especially in borrowed-vector mode).
      world_.markDeletedInternal(internal_id);
    }

    // Set the metadata for the graph
    world_.maxlevel_ = metadata.maxlevel;
    world_.enterpoint_node_ = metadata.enterpoint_node;

    VLOG(1) << "Restored HNSW index with " << restored_count
            << " nodes, maxlevel=" << metadata.maxlevel
            << ", enterpoint=" << metadata.enterpoint_node;
  }

  // Update vector data for an existing node (used after RestoreFromNodes).
  // Returns false if the node doesn't exist in the index.
  bool UpdateVectorData(GlobalDocId id, const void* data) {
    TryProcessDeferred();
    MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock);

    // Find the internal id for this label
    auto it = world_.label_lookup_.find(id);
    if (it == world_.label_lookup_.end()) {
      VLOG(1) << "UpdateVectorData: label " << id << " not found in index";
      return false;
    }

    size_t internal_id = it->second;

    // Copy/store the vector data based on copy_vector_ mode
    if (world_.copy_vector_) {
      // Owned mode: copy data into world's vector memory
      char* data_ptr = world_.data_vector_memory_ + internal_id * world_.data_size_;
      memcpy(data_ptr, data, world_.data_size_);
    } else {
      // Borrowed mode: store pointer to external data
      char* ptr_location = world_.getDataPtrByInternalId(internal_id);
      memcpy(ptr_location, &data, sizeof(void*));
    }

    // Unmark deleted so the node participates in KNN searches now that it
    // has valid vector data. During RestoreFromNodes all nodes are marked
    // deleted by default to prevent dereferencing uninitialised data.
    if (world_.isMarkedDeleted(internal_id)) {
      world_.unmarkDeletedInternal(internal_id);
    }
    return true;
  }

  std::unique_ptr<MRMWMutexLock> GetReadLock() const {
    return std::make_unique<MRMWMutexLock>(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock);
  }

 private:
  HnswSpace space_;
  HierarchicalNSW<float> world_;
  absl::Mutex resize_mutex_;
  mutable MRMWMutex mrmw_mutex_;

  bool copy_vector_;                    // Whether vectors are copied into hnswlib.
  size_t data_size_;                    // Byte size of a single vector.
  mutable base::SpinLock deferred_mu_;  // Protects deferred_ops_.
  absl::flat_hash_map<GlobalDocId, DeferredOp> deferred_ops_;  // GUARDED_BY(deferred_mu_)
};

HnswVectorIndex::HnswVectorIndex(const SchemaField::VectorParams& params, bool copy_vector,
                                 PMR_NS::memory_resource*)
    : copy_vector_(copy_vector),
      dim_{params.dim},
      adapter_{make_unique<HnswlibAdapter>(params, copy_vector)} {
  DCHECK(params.use_hnsw);
  // TODO: Patch hnsw to use MR
}

HnswVectorIndex::~HnswVectorIndex() {
}

bool HnswVectorIndex::Add(GlobalDocId id, const DocumentAccessor& doc, std::string_view field) {
  auto vector_ptr = doc.GetVector(field, dim_);

  if (!vector_ptr) {
    return false;
  }

  const void* data = nullptr;
  if (std::holds_alternative<OwnedFtVector>(*vector_ptr)) {
    data = std::get<OwnedFtVector>(*vector_ptr).first.get();
  } else {
    data = std::get<BorrowedFtVector>(*vector_ptr);
  }

  if (!data) {
    return false;
  }

  adapter_->Add(data, id);
  return true;
}

std::vector<std::pair<float, GlobalDocId>> HnswVectorIndex::Knn(float* target, size_t k,
                                                                std::optional<size_t> ef) const {
  return adapter_->Knn(target, k, ef);
}

std::vector<std::pair<float, GlobalDocId>> HnswVectorIndex::Knn(
    float* target, size_t k, std::optional<size_t> ef,
    const std::vector<GlobalDocId>& allowed) const {
  return adapter_->Knn(target, k, ef, allowed);
}

std::vector<std::pair<float, GlobalDocId>> HnswVectorIndex::SubsetKnn(
    float* target, size_t k, const std::vector<GlobalDocId>& docs) const {
  return adapter_->SubsetKnn(target, k, docs);
}

std::vector<std::pair<float, GlobalDocId>> HnswVectorIndex::RangeQuery(float* target,
                                                                       float radius) const {
  return adapter_->RangeSearch(target, radius);
}

void HnswVectorIndex::Remove(GlobalDocId id, const DocumentAccessor& doc, string_view field) {
  adapter_->Remove(id);
}

void HnswVectorIndex::Remove(GlobalDocId id) {
  adapter_->Remove(id);
}

HnswIndexMetadata HnswVectorIndex::GetMetadata() const {
  return adapter_->GetMetadata();
}

void HnswVectorIndex::SetMetadata(const HnswIndexMetadata& metadata) {
  adapter_->SetMetadata(metadata);
}

size_t HnswVectorIndex::GetNodeCount() const {
  return adapter_->GetNodeCount();
}

std::vector<HnswNodeData> HnswVectorIndex::GetNodesRange(size_t start, size_t end) const {
  return adapter_->GetNodesRange(start, end);
}

void HnswVectorIndex::RestoreFromNodes(const std::vector<HnswNodeData>& nodes,
                                       const HnswIndexMetadata& metadata) {
  adapter_->RestoreFromNodes(nodes, metadata);
}

bool HnswVectorIndex::UpdateVectorData(GlobalDocId id, const DocumentAccessor& doc,
                                       std::string_view field) {
  auto vector_ptr = doc.GetVector(field, dim_);
  if (!vector_ptr ||
      *vector_ptr == search::DocumentAccessor::VectorInfo(search::BorrowedFtVector(nullptr))) {
    // Document doesn't have the vector field - mark node as deleted to prevent
    // "ghost" nodes with invalid vector data from participating in searches
    LOG(WARNING) << "UpdateVectorData: document " << id
                 << " missing vector field, marking node as deleted in HNSW index";
    adapter_->Remove(id);
    return false;
  }

  const void* data = nullptr;
  if (std::holds_alternative<OwnedFtVector>(*vector_ptr)) {
    data = std::get<OwnedFtVector>(*vector_ptr).first.get();
  } else {
    data = std::get<BorrowedFtVector>(*vector_ptr);
  }

  return adapter_->UpdateVectorData(id, data);
}

std::unique_ptr<MRMWMutexLock> HnswVectorIndex::GetReadLock() const {
  return adapter_->GetReadLock();
}

}  // namespace dfly::search


================================================
FILE: src/core/search/hnsw_index.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>

#include "core/search/mrmw_mutex.h"
#include "core/search/search.h"

namespace dfly::search {

// Metadata structure for HNSW index serialization
// Contains the key parameters needed to restore the index state
struct HnswIndexMetadata {
  size_t max_elements = 0;  // Maximum number of elements the index can hold
  // Note: cur_element_count may be smaller than actual node count during concurrent writes,
  // so we compute the real requirement from nodes during restoration.
  // TODO: consider removing it from metadata and rely entirely on node data for restoration.
  size_t cur_element_count = 0;  // Current number of elements in the index
  int maxlevel = -1;             // Maximum level of the graph
  size_t enterpoint_node = 0;    // Entry point node for the graph
};

// Node data structure for HNSW serialization
struct HnswNodeData {
  uint32_t internal_id;
  GlobalDocId global_id;
  int level;
  std::vector<std::vector<uint32_t>> levels_links;  // Links for each level (0 to level)

  // Returns the total serialized size in bytes.
  // Format: internal_id(4) + global_id(8) + level(4)
  //         + for each level: links_num(4) + links(4 each)
  size_t TotalSize() const {
    size_t size = 4 + 8 + 4;  // internal_id + global_id + level
    for (const auto& links : levels_links) {
      size += 4 + links.size() * 4;  // links_num + links
    }
    return size;
  }
};

struct HnswlibAdapter;
class HnswVectorIndex {
 public:
  explicit HnswVectorIndex(const search::SchemaField::VectorParams& params, bool copy_vector,
                           PMR_NS::memory_resource* mr = PMR_NS::get_default_resource());

  ~HnswVectorIndex();

  bool Add(search::GlobalDocId id, const search::DocumentAccessor& doc, std::string_view field);
  void Remove(search::GlobalDocId id, const search::DocumentAccessor& doc, std::string_view field);
  void Remove(search::GlobalDocId id);

  bool IsVectorCopied() const {
    return copy_vector_;
  }

  std::vector<std::pair<float, GlobalDocId>> Knn(float* target, size_t k,
                                                 std::optional<size_t> ef) const;
  std::vector<std::pair<float, GlobalDocId>> Knn(float* target, size_t k, std::optional<size_t> ef,
                                                 const std::vector<GlobalDocId>& allowed) const;
  std::vector<std::pair<float, GlobalDocId>> SubsetKnn(float* target, size_t k,
                                                       const std::vector<GlobalDocId>& docs) const;

  // Returns all documents within radius, with their distances.
  std::vector<std::pair<float, GlobalDocId>> RangeQuery(float* target, float radius) const;

  size_t GetDim() const {
    return dim_;
  }

  // Get metadata for serialization
  HnswIndexMetadata GetMetadata() const;

  // Set metadata (used during restoration)
  void SetMetadata(const HnswIndexMetadata& metadata);

  // Get total number of nodes in the index
  size_t GetNodeCount() const;

  // Get nodes in the specified range [start, end)
  // Returns vector of node data for serialization
  std::vector<HnswNodeData> GetNodesRange(size_t start, size_t end) const;

  // Restore graph structure from serialized nodes with metadata
  // This restores the HNSW graph links but NOT the vector data
  // Vector data must be populated separately via UpdateVectorData
  void RestoreFromNodes(const std::vector<HnswNodeData>& nodes, const HnswIndexMetadata& metadata);

  // Update vector data for an existing node (used after RestoreFromNodes)
  // This populates the vector data for a node that already has graph links
  bool UpdateVectorData(GlobalDocId id, const DocumentAccessor& doc, std::string_view field);

  // Acquire a read lock on the internal MRMW mutex.
  // Use this during serialization to block concurrent Add/Remove (write) operations.
  std::unique_ptr<MRMWMutexLock> GetReadLock() const;

 private:
  bool copy_vector_;
  size_t dim_;
  std::unique_ptr<HnswlibAdapter> adapter_;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/index_result.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <variant>
#include <vector>

#include "core/search/ast_expr.h"
#include "core/search/block_list.h"
#include "core/search/range_tree.h"

namespace dfly::search {

// Represents an either owned or non-owned result set that can be accessed and merged transparently.
class IndexResult {
 private:
  using DocVec = std::vector<DocId>;
  using Variant =
      std::variant<DocVec /*owned*/, const DocVec*, const BlockList<CompressedSortedSet>*,
                   const BlockList<SortedVector<DocId>>*, RangeResult>;

  template <typename... Ts> using VariantOfConstPtrs = std::variant<const Ts*...>;
  using BorrowedView =
      VariantOfConstPtrs<DocVec, BlockList<CompressedSortedSet>, BlockList<SortedVector<DocId>>,
                         SingleBlockRangeResult, TwoBlocksRangeResult>;

 public:
  IndexResult() = default;

  explicit IndexResult(Variant value);

  template <typename Container> explicit IndexResult(const Container* container = nullptr);

  /* It will return approximate size of the result set.
     Actual result can be smaller than the size returned by this method. */
  size_t ApproximateSize() const;

  BorrowedView Borrowed() const;

  // Move out of owned or copy borrowed. Take up to `limit` entries and return original size.
  std::pair<DocVec, size_t /* full size */> Take(size_t limit = std::numeric_limits<size_t>::max());

 private:
  bool IsOwned() const;

  Variant value_;
};

std::vector<DocId> MergeIndexResults(const IndexResult& left, const IndexResult& right,
                                     AstLogicalNode::LogicOp op);

// Implementation
/******************************************************************/
inline IndexResult::IndexResult(Variant value) : value_{std::move(value)} {
}

template <typename Container>
IndexResult::IndexResult(const Container* container) : value_{container} {
  if (container == nullptr) {
    value_ = DocVec{};
  }
}

inline size_t IndexResult::ApproximateSize() const {
  return std::visit([](auto* set) { return set->size(); }, Borrowed());
}

inline IndexResult::BorrowedView IndexResult::Borrowed() const {
  auto cb = [](const auto& v) -> BorrowedView {
    using T = std::decay_t<decltype(v)>;
    if constexpr (std::is_pointer_v<std::remove_reference_t<decltype(v)>>) {
      return v;
    } else if constexpr (std::is_same_v<T, RangeResult>) {
      auto range_cb = [](const auto& set) -> BorrowedView { return &set; };
      return std::visit(range_cb, v.GetResult());
    } else {
      return &v;
    }
  };
  return std::visit(cb, value_);
}

inline std::pair<IndexResult::DocVec, size_t> IndexResult::Take(size_t limit) {
  if (IsOwned()) {
    auto& vec = std::get<DocVec>(value_);
    size_t size = vec.size();
    return {std::move(vec), size};
  }

  // Numeric ranges need to be filtered and don't know their exact size ahead
  if (std::holds_alternative<RangeResult>(value_)) {
    auto cb = [limit](auto* range) -> std::pair<DocVec, size_t> {
      DocVec out;
      size_t total = 0;
      out.reserve(std::min(limit, range->size()));
      for (auto it = range->begin(); it != range->end(); ++it) {
        total++;
        if (out.size() < limit)
          out.push_back(*it);
      }
      return {std::move(out), total};
    };
    return std::visit(cb, Borrowed());
  }

  // Generic borrowed results sets don't need to be filtered, so we can tell the result size ahead
  auto cb = [limit](auto* set) -> std::pair<DocVec, size_t> {
    DocVec out;
    out.reserve(std::min(limit, set->size()));
    for (auto it = set->begin(); it != set->end() && out.size() < limit; ++it)
      out.push_back(*it);
    return {std::move(out), set->size()};
  };
  return std::visit(cb, Borrowed());
}

inline bool IndexResult::IsOwned() const {
  return std::holds_alternative<DocVec>(value_);
}

namespace details {
using BackInserter = std::back_insert_iterator<std::vector<DocId>>;

template <typename T> constexpr bool IsSeekableIterator = std::is_base_of_v<SeekableTag, T>;

template <typename Iterator> void Seek(DocId min_doc_id, const Iterator& end, Iterator* it) {
  static constexpr DocId kFastSeekThreshold = 15;

  auto extract_doc_id = [](const auto& value) {
    using T = std::decay_t<decltype(value)>;
    if constexpr (std::is_same_v<T, DocId>) {
      return value;
    } else {
      return value.first;
    }
  };

  DocId current_value = extract_doc_id(**it);
  DCHECK(current_value < min_doc_id);

  if (min_doc_id - current_value > kFastSeekThreshold) {  // If the gap is large, use a fast seek
    if constexpr (IsSeekableIterator<Iterator>) {
      it->SeekGE(min_doc_id);
    } else {
      BasicSeekGE(min_doc_id, end, it);
    }
  } else {
    // If the gap is small, just iterate
    do {
      ++(*it);
    } while (*it != end && extract_doc_id(**it) < min_doc_id);
  }
}

template <typename FirstIterator, typename SecondIterator>
void SetIntersection(FirstIterator first_begin, FirstIterator first_end,
                     SecondIterator second_begin, SecondIterator second_end, BackInserter out) {
  auto l_it = first_begin;
  auto r_it = second_begin;

  while (l_it != first_end && r_it != second_end) {
    DocId l_value = *l_it;
    DocId r_value = *r_it;

    if (l_value == r_value) {
      *out++ = l_value;
      ++l_it;
      if (l_it != first_end) {
        Seek(*l_it, second_end, &r_it);
      }
    } else if (l_value < r_value) {
      Seek(r_value, first_end, &l_it);
    } else {
      DCHECK(l_value > r_value);
      Seek(l_value, second_end, &r_it);
    }
  }
}

}  // namespace details

inline std::vector<DocId> MergeIndexResults(const IndexResult& left, const IndexResult& right,
                                            AstLogicalNode::LogicOp op) {
  std::vector<DocId> result;

  if (op == AstLogicalNode::LogicOp::AND) {
    result.reserve(std::min(left.ApproximateSize(), right.ApproximateSize()));
    auto cb = [&result](auto* s1, auto* s2) {
      details::SetIntersection(s1->begin(), s1->end(), s2->begin(), s2->end(),
                               std::back_inserter(result));
    };
    std::visit(cb, left.Borrowed(), right.Borrowed());
  } else {
    result.reserve(std::max(left.ApproximateSize(), right.ApproximateSize()));
    auto cb = [&result](auto* s1, auto* s2) {
      std::set_union(s1->begin(), s1->end(), s2->begin(), s2->end(), std::back_inserter(result));
    };
    std::visit(cb, left.Borrowed(), right.Borrowed());
  }

  return result;
}

}  // namespace dfly::search


================================================
FILE: src/core/search/indices.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/indices.h"

#include <absl/container/flat_hash_set.h>
#include <absl/strings/ascii.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include <boost/iterator/function_output_iterator.hpp>
#include <string_view>

#define UNI_ALGO_DISABLE_NFKC_NFKD

#include <absl/container/btree_set.h>
#include <uni_algo/case.h>
#include <uni_algo/ranges_word.h>

#include <algorithm>
#include <cctype>

#include "base/flags.h"

ABSL_FLAG(bool, use_numeric_range_tree, true,
          "Use range tree for numeric index. "
          "If false, use a simple implementation with btree_set. "
          "Range tree is more memory efficient and faster for range queries, "
          "but slower for single value queries.");

namespace dfly::search {

using namespace std;
using cmn::StringOrView;

namespace {

bool IsAllAscii(string_view sv) {
  return all_of(sv.begin(), sv.end(), [](unsigned char c) { return isascii(c); });
}

string ToLower(string_view word) {
  return IsAllAscii(word) ? absl::AsciiStrToLower(word) : una::cases::to_lowercase_utf8(word);
}

// Get all words from text as matched by the ICU library
absl::flat_hash_set<std::string> TokenizeWords(std::string_view text,
                                               const TextIndex::StopWords& stopwords,
                                               const Synonyms* synonyms) {
  absl::flat_hash_set<std::string> words;
  for (std::string_view word : una::views::word_only::utf8(text)) {
    if (std::string word_lc = una::cases::to_lowercase_utf8(word); !stopwords.contains(word_lc)) {
      if (synonyms) {
        if (auto group_id = synonyms->GetGroupToken(word_lc); group_id) {
          words.insert(*group_id);
        }
      }

      words.insert(std::move(word_lc));
    }
  }
  return words;
}

// Split taglist, remove duplicates and convert all to lowercase
absl::flat_hash_set<string> NormalizeTags(string_view taglist, bool case_sensitive,
                                          char separator) {
  // Splitting utf8 by ascii character is safe
  absl::flat_hash_set<string> tags;
  for (string_view tag : absl::StrSplit(taglist, separator, absl::SkipEmpty())) {
    string_view str = absl::StripAsciiWhitespace(tag);
    if (case_sensitive)
      tags.insert(string{str});
    else
      tags.insert(ToLower(str));
  }
  return tags;
}

// Iterate over all suffixes of all words
void IterateAllSuffixes(const absl::flat_hash_set<string>& words,
                        absl::FunctionRef<void(std::string_view)> cb) {
  for (string_view word : words) {
    for (size_t offs = 0; offs < word.length(); offs++) {
      cb(word.substr(offs));
    }
  }
}

// Haversine with earth radius in meters. Used to calculate distance.
boost::geometry::strategy::distance::haversine haversine_(6372797.560856);

double ConvertToRadiusInMeters(size_t radius, std::string_view arg) {
  const std::string unit = absl::AsciiStrToUpper(arg);
  if (unit == "M") {
    return radius * 1;
  } else if (unit == "KM") {
    return radius * 1000;
  } else if (unit == "FT") {
    return radius * 0.3048;
  } else if (unit == "MI") {
    return radius * 1609.34;
  } else {
    return -1;
  }
}

// Verify if geo string is valid and convert to point
std::optional<GeoIndex::point> GetGeoPoint(const string_view& geo_string) {
  // Empty geo string
  if (geo_string.empty())
    return nullopt;

  absl::InlinedVector<string_view, 2> coordinates = absl::StrSplit(geo_string, ",");

  // Invalid coordinate format
  if (coordinates.size() != 2)
    return std::nullopt;

  // Convert coordinates to double
  double lon, lat;
  if (!absl::SimpleAtod(coordinates[0], &lon) || !absl::SimpleAtod(coordinates[1], &lat))
    return nullopt;

  // Verify that coordinates are within valid ranges
  if (lon < -180 || lon > 180 || lat < -90 || lat > 90)
    return nullopt;

  return GeoIndex::point{lon, lat};
}

};  // namespace

class RangeTreeAdapter : public NumericIndex::RangeTreeBase {
 public:
  explicit RangeTreeAdapter(size_t max_range_block_size, PMR_NS::memory_resource* mr)
      : range_tree_{mr, max_range_block_size}, builder_{RangeTree::Builder{}} {
  }

  void Add(DocId id, absl::Span<double> values) override {
    for (double value : values) {
      if (builder_)
        builder_->Add(id, value);
      else
        range_tree_.Add(id, value);
    }
  }

  void Remove(DocId id, absl::Span<double> values) override {
    for (double value : values) {
      if (builder_)
        builder_->Remove(id, value);
      else
        range_tree_.Remove(id, value);
    }
  }

  RangeResult Range(double l, double r) const override {
    return range_tree_.Range(l, r);
  }

  vector<DocId> GetAllDocIds() const override {
    // TODO: remove take
    return range_tree_.GetAllDocIds().Take();
  }

  void FinalizeInitialization() override {
    builder_->Populate(&range_tree_, {500});
    builder_.reset();
  }

 private:
  RangeTree range_tree_;
  std::optional<RangeTree::Builder> builder_;
};

class BtreeSetImpl : public NumericIndex::RangeTreeBase {
 public:
  explicit BtreeSetImpl(PMR_NS::memory_resource* mr) : entries_(mr) {
  }

  void Add(DocId id, absl::Span<double> values) override {
    if (values.size() > 1) {
      unique_ids_ = false;
    }
    for (double value : values) {
      entries_.insert({value, id});
    }
  }

  void Remove(DocId id, absl::Span<double> values) override {
    for (double value : values) {
      entries_.erase({value, id});
    }
  }

  RangeResult Range(double l, double r) const override {
    DCHECK(l <= r);

    auto it_l = entries_.lower_bound({l, 0});
    auto it_r = entries_.lower_bound({r, numeric_limits<DocId>::max()});
    DCHECK_GE(it_r - it_l, 0);

    vector<DocId> out;
    for (auto it = it_l; it != it_r; ++it)
      out.push_back(it->second);

    sort(out.begin(), out.end());

    if (!unique_ids_) {
      out.erase(unique(out.begin(), out.end()), out.end());
    }
    return RangeResult(std::move(out));
  }

  vector<DocId> GetAllDocIds() const override {
    std::vector<DocId> result;

    result.reserve(entries_.size());

    if (unique_ids_) {
      // If unique_ids_ is true, we can just take the second element of each entry
      for (const auto& [_, doc_id] : entries_) {
        result.push_back(doc_id);
      }
    } else {
      absl::flat_hash_set<DocId> unique_docs;
      unique_docs.reserve(entries_.size());
      for (const auto& [_, doc_id] : entries_) {
        const auto [__, is_new] = unique_docs.insert(doc_id);
        if (is_new) {
          result.push_back(doc_id);
        }
      }
    }

    std::sort(result.begin(), result.end());
    return result;
  }

 private:
  bool unique_ids_ = true;  // If true, docs ids are unique in the index, otherwise they can repeat.
  using Entry = std::pair<double, DocId>;
  absl::btree_set<Entry, std::less<Entry>, PMR_NS::polymorphic_allocator<Entry>> entries_;
};

NumericIndex::NumericIndex(size_t max_range_block_size, PMR_NS::memory_resource* mr) {
  if (absl::GetFlag(FLAGS_use_numeric_range_tree)) {
    range_tree_ = make_unique<RangeTreeAdapter>(max_range_block_size, mr);
  } else {
    range_tree_ = make_unique<BtreeSetImpl>(mr);
  }
}

bool NumericIndex::Add(DocId id, const DocumentAccessor& doc, string_view field) {
  auto numbers = doc.GetNumbers(field);
  if (!numbers) {
    return false;
  }

  range_tree_->Add(id, absl::MakeSpan(numbers.value()));
  return true;
}

void NumericIndex::Remove(DocId id, const DocumentAccessor& doc, string_view field) {
  auto numbers = doc.GetNumbers(field).value();
  range_tree_->Remove(id, absl::MakeSpan(numbers));
}

void NumericIndex::FinalizeInitialization() {
  range_tree_->FinalizeInitialization();
}

RangeResult NumericIndex::Range(double l, double r) const {
  if (r < l)
    return {};
  return range_tree_->Range(l, r);
}

vector<DocId> NumericIndex::GetAllDocsWithNonNullValues() const {
  return range_tree_->GetAllDocIds();
}

template <typename C>
BaseStringIndex<C>::BaseStringIndex(PMR_NS::memory_resource* mr, bool case_sensitive,
                                    bool with_suffix)
    : case_sensitive_{case_sensitive}, entries_{mr} {
  if (with_suffix)
    suffix_trie_.emplace(mr);
}

template <typename C>
const typename BaseStringIndex<C>::Container* BaseStringIndex<C>::Matching(
    string_view word, bool strip_whitespace) const {
  if (strip_whitespace)
    word = absl::StripAsciiWhitespace(word);

  auto it = entries_.find(NormalizeQueryWord(word).view());
  return (it != entries_.end()) ? &it->second : nullptr;
}

template <typename C>
void BaseStringIndex<C>::MatchPrefix(std::string_view prefix,
                                     absl::FunctionRef<void(const Container*)> cb) const {
  StringOrView prefix_norm{NormalizeQueryWord(prefix)};
  prefix = prefix_norm.view();

  // TODO(vlad): Use right iterator to avoid string comparison?
  for (auto it = entries_.lower_bound(prefix);
       it != entries_.end() && (*it).first.rfind(prefix, 0) == 0; ++it) {
    cb(&(*it).second);
  }
}

template <typename C>
void BaseStringIndex<C>::MatchSuffix(std::string_view suffix,
                                     absl::FunctionRef<void(const Container*)> cb) const {
  StringOrView suffix_norm{NormalizeQueryWord(suffix)};
  suffix = suffix_norm.view();

  // If we have a suffix trie built, we just need to fetch the relevant suffix
  if (suffix_trie_) {
    auto it = suffix_trie_->find(suffix);
    cb((it != suffix_trie_->end()) ? &it->second : nullptr);
    return;
  }

  // Otherwise, iterate over all entries and look for the suffix
  for (const auto& entry : entries_) {
    int32_t start = entry.first.size() - suffix.size();
    if (start >= 0 && entry.first.substr(start) == suffix)
      cb(&entry.second);
  }
}

template <typename C>
void BaseStringIndex<C>::MatchInfix(std::string_view infix,
                                    absl::FunctionRef<void(const Container*)> cb) const {
  StringOrView infix_norm{NormalizeQueryWord(infix)};
  infix = infix_norm.view();

  // If we have a suffix trie built, we just need to match the prefix
  if (suffix_trie_) {
    for (auto it = suffix_trie_->lower_bound(infix);
         it != suffix_trie_->end() && (*it).first.rfind(infix, 0) == 0; ++it)
      cb(&(*it).second);
    return;
  }

  // Otherwise, iterate over all entries and check if it contains the entry
  for (const auto& entry : entries_) {
    if (entry.first.find(infix) != string::npos)
      cb(&entry.second);
  }
}

template <typename C>
bool BaseStringIndex<C>::Add(DocId id, const DocumentAccessor& doc, string_view field) {
  auto strings_list = GetStrings(doc, field);
  if (!strings_list) {
    return false;
  }

  absl::flat_hash_set<std::string> tokens;
  for (string_view str : strings_list.value())
    tokens.merge(Tokenize(str));

  if (tokens.size() > 1)
    unique_ids_ = false;
  for (string_view token : tokens)
    GetOrCreate(&entries_, token)->Insert(id);

  if (suffix_trie_)
    IterateAllSuffixes(tokens,
                       [&](string_view str) { GetOrCreate(&*suffix_trie_, str)->Insert(id); });

  return true;
}

template <typename C>
void BaseStringIndex<C>::Remove(DocId id, const DocumentAccessor& doc, string_view field) {
  auto strings_list = GetStrings(doc, field).value();

  absl::flat_hash_set<std::string> tokens;
  for (string_view str : strings_list)
    tokens.merge(Tokenize(str));

  for (string_view token : tokens)
    Remove(&entries_, id, token);

  if (suffix_trie_)
    IterateAllSuffixes(tokens, [&](string_view str) { Remove(&*suffix_trie_, id, str); });
}

template <typename C> vector<string> BaseStringIndex<C>::GetTerms() const {
  vector<string> res;
  res.reserve(entries_.size());
  for (const auto& [term, _] : entries_) {
    res.push_back(string{term});
  }
  return res;
}

template <typename C> vector<DocId> BaseStringIndex<C>::GetAllDocsWithNonNullValues() const {
  std::vector<DocId> result;

  result.reserve(entries_.size());

  if (unique_ids_) {
    // If unique_ids_ is true, we can just take the second element of each entry
    for (const auto& [_, container] : entries_) {
      for (const auto& doc_id : container) {
        result.push_back(doc_id);
      }
    }
  } else {
    absl::flat_hash_set<DocId> unique_docs;
    unique_docs.reserve(entries_.size());

    for (const auto& [_, container] : entries_) {
      for (const auto& doc_id : container) {
        auto [_, is_new] = unique_docs.insert(doc_id);
        if (is_new) {
          result.push_back(doc_id);
        }
      }
    }
  }
  std::sort(result.begin(), result.end());
  return result;
}

template <typename C>
StringOrView BaseStringIndex<C>::NormalizeQueryWord(std::string_view query) const {
  if (case_sensitive_)
    return StringOrView::FromView(query);

  return StringOrView::FromString(ToLower(query));
}

template <typename C>
typename BaseStringIndex<C>::Container* BaseStringIndex<C>::GetOrCreate(
    search::RaxTreeMap<Container>* map, string_view word) {
  auto* mr = map->get_allocator().resource();
  return &map->try_emplace(PMR_NS::string{word, mr}, mr, 1000 /* block size */).first->second;
}

template <typename C>
void BaseStringIndex<C>::Remove(search::RaxTreeMap<Container>* map, DocId id, string_view word) {
  auto it = map->find(word);
  if (it == map->end())
    return;

  it->second.Remove(id);
  if (it->second.Size() == 0)
    map->erase(it);
}

template struct BaseStringIndex<CompressedSortedSet>;
template struct BaseStringIndex<SortedVector<DocId>>;

TextIndex::TextIndex(PMR_NS::memory_resource* mr, const StopWords* stopwords,
                     const Synonyms* synonyms, bool with_suffixtrie)
    : BaseStringIndex(mr, false, with_suffixtrie), stopwords_{stopwords}, synonyms_{synonyms} {
}

std::optional<DocumentAccessor::StringList> TextIndex::GetStrings(const DocumentAccessor& doc,
                                                                  std::string_view field) const {
  return doc.GetStrings(field);
}

absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
  return TokenizeWords(value, *stopwords_, synonyms_);
}

DefragmentResult TagIndex::Defragment(PageUsage* page_usage) {
  auto defrag = [&](auto& tree, string* key) {
    DefragmentMap dm{tree, key};
    return dm.Defragment(page_usage);
  };

  DefragmentResult result = defrag(entries_, &next_defrag_entry_);

  if (suffix_trie_) {
    result.Merge(defrag(suffix_trie_.value(), &next_defrag_suffix_entry_));
  }

  return result;
}

std::optional<DocumentAccessor::StringList> TagIndex::GetStrings(const DocumentAccessor& doc,
                                                                 std::string_view field) const {
  return doc.GetTags(field);
}

absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
  return NormalizeTags(value, case_sensitive_, separator_);
}

BaseVectorIndex::BaseVectorIndex(size_t dim, VectorSimilarity sim) : dim_{dim}, sim_{sim} {
}

std::pair<size_t /*dim*/, VectorSimilarity> BaseVectorIndex::Info() const {
  return {dim_, sim_};
}

bool BaseVectorIndex::Add(DocId id, const DocumentAccessor& doc, std::string_view field) {
  auto vector = doc.GetVector(field, dim_);

  if (!vector)
    return false;

  if (std::holds_alternative<OwnedFtVector>(*vector)) {
    const auto& owned_vector = std::get<OwnedFtVector>(*vector);
    AddVector(id, owned_vector.first.get());
  } else {
    const auto& borrowed_vector = std::get<BorrowedFtVector>(*vector);
    AddVector(id, borrowed_vector);
  }

  return true;
}

// Each document occupies (dim_ + 1) floats in entries_: dim_ floats for the vector data,
// followed by one float as a presence marker (1.0 = present, 0.0 = absent/removed).
// This avoids the previous heuristic of treating all-zero vectors as null.
static constexpr float kPresent = 1.0f;
static constexpr float kAbsent = 0.0f;

FlatVectorIndex::FlatVectorIndex(const SchemaField::VectorParams& params,
                                 PMR_NS::memory_resource* mr)
    : BaseVectorIndex{params.dim, params.sim}, entries_{mr} {
  DCHECK(!params.use_hnsw);
  entries_.reserve(params.capacity * (params.dim + 1));
}

void FlatVectorIndex::AddVector(DocId id, const void* vector) {
  const size_t stride = dim_ + 1;
  DCHECK_LE(id * stride, entries_.size());
  if (id * stride == entries_.size())
    entries_.resize((id + 1) * stride, 0.0f);

  if (vector) {
    memcpy(&entries_[id * stride], vector, dim_ * sizeof(float));
    entries_[id * stride + dim_] = kPresent;
  }
}

void FlatVectorIndex::Remove(DocId id, const DocumentAccessor& doc, string_view field) {
  const size_t stride = dim_ + 1;
  if (id * stride + dim_ < entries_.size())
    entries_[id * stride + dim_] = kAbsent;
}

const float* FlatVectorIndex::Get(DocId doc) const {
  const size_t stride = dim_ + 1;
  if (doc * stride + dim_ >= entries_.size() || entries_[doc * stride + dim_] != kPresent)
    return nullptr;
  return &entries_[doc * stride];
}

std::vector<DocId> FlatVectorIndex::GetAllDocsWithNonNullValues() const {
  const size_t stride = dim_ + 1;
  size_t num_slots = entries_.size() / stride;
  std::vector<DocId> result;
  result.reserve(num_slots);
  for (DocId id = 0; id < num_slots; ++id) {
    if (entries_[id * stride + dim_] == kPresent)
      result.push_back(id);
  }
  return result;
}

GeoIndex::GeoIndex(PMR_NS::memory_resource* mr) : rtree_(make_unique<rtree>()) {
}

GeoIndex::~GeoIndex() {
}

bool GeoIndex::Add(DocId id, const DocumentAccessor& doc, std::string_view field) {
  auto geo_string = doc.GetStrings(field);

  if (!geo_string) {
    return false;
  }

  // If field doesn't exists don't add to index.
  if (geo_string->empty()) {
    return true;
  }

  std::vector<GeoIndex::point> points;
  for (string_view str : *geo_string) {
    auto doc_point = GetGeoPoint(str);
    if (!doc_point) {
      return false;
    }
    points.emplace_back(*doc_point);
  }
  for (point p : points) {
    rtree_->insert({p, id});
  }

  return true;
}

void GeoIndex::Remove(DocId id, const DocumentAccessor& doc, string_view field) {
  auto geo_string = doc.GetStrings(field);

  if (!geo_string || geo_string->empty()) {
    return;
  }

  std::vector<GeoIndex::point> points;
  for (string_view str : *geo_string) {
    auto doc_point = GetGeoPoint(str);
    if (!doc_point) {
      return;
    }
    points.emplace_back(*doc_point);
  }
  for (point p : points) {
    rtree_->remove({p, id});
  }
}

std::vector<DocId> GeoIndex::RadiusSearch(double lon, double lat, double radius,
                                          std::string_view unit) {
  std::set<DocId> unique_results;

  // Get radius in meters
  double converted_radius = ConvertToRadiusInMeters(radius, unit);

  // Declare the geographic_point_circle strategy with 4 points
  boost::geometry::strategy::buffer::geographic_point_circle<> point_strategy(4);

  // Declare the distance strategy in meters around the point
  boost::geometry::strategy::buffer::distance_symmetric<double> distance_strategy(converted_radius);

  // Declare other necessary strategies, unused for point
  boost::geometry::strategy::buffer::join_round join_strategy;
  boost::geometry::strategy::buffer::end_round end_strategy;
  boost::geometry::strategy::buffer::side_straight side_strategy;

  point p{lon, lat};

  // Create polygon with 4 point around point
  boost::geometry::model::multi_polygon<boost::geometry::model::polygon<point>> buffer_polygon;

  boost::geometry::buffer(p, buffer_polygon, distance_strategy, side_strategy, join_strategy,
                          end_strategy, point_strategy);

  // Create bouding box around polygon to include all possible points
  boost::geometry::model::box<point> box;
  boost::geometry::envelope(buffer_polygon, box);

  rtree_->query(boost::geometry::index::within(box),
                boost::make_function_output_iterator(
                    [&unique_results, &p, &converted_radius](auto const& val) {
                      if (haversine_.apply(val.first, p) <= converted_radius) {
                        unique_results.insert(val.second);
                      }
                    }));

  // TODO: we should return sorted results by radius distance
  return {unique_results.begin(), unique_results.end()};
}

std::vector<DocId> GeoIndex::GetAllDocsWithNonNullValues() const {
  std::set<DocId> unique_results;
  std::for_each(boost::geometry::index::begin(*rtree_), boost::geometry::index::end(*rtree_),
                [&unique_results](auto const& val) { unique_results.insert(val.second); });
  return {unique_results.begin(), unique_results.end()};
}

}  // namespace dfly::search


================================================
FILE: src/core/search/indices.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

// Wrong warning reported when geometry.hpp is loaded
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#include <boost/geometry.hpp>
#ifndef __clang__
#pragma GCC diagnostic pop
#endif

#include <absl/functional/function_ref.h>

#include <memory>
#include <optional>
#include <vector>

#include "base/pmr/memory_resource.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/search/base.h"
#include "core/search/block_list.h"
#include "core/search/compressed_sorted_set.h"
#include "core/search/range_tree.h"
#include "core/search/rax_tree.h"

// TODO: move core field definitions out of big header
#include "common/string_or_view.h"
#include "core/search/search.h"

namespace dfly::search {

// Index for integer fields.
// Range bounds are queried in logarithmic time, iteration is constant.
struct NumericIndex : public BaseIndex {
  // Temporary base class for range tree.
  // It is used to use two different range trees depending on the flag use_range_tree.
  // If the flag is true, RangeTree is used, otherwise a simple implementation with btree_set.
  struct RangeTreeBase {
    virtual void Add(DocId id, absl::Span<double> values) = 0;
    virtual void Remove(DocId id, absl::Span<double> values) = 0;

    // Returns all DocIds that match the range [l, r].
    virtual RangeResult Range(double l, double r) const = 0;

    // Returns all DocIds that have non-null values in the index.
    virtual std::vector<DocId> GetAllDocIds() const = 0;

    virtual void FinalizeInitialization(){};

    virtual ~RangeTreeBase() = default;
  };

  // max_range_block_size is the maximum number of entries in a single range block.
  // It is used in RangeTree. Check RangeTree for details.
  explicit NumericIndex(size_t max_range_block_size, PMR_NS::memory_resource* mr);

  bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) override;
  void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;

  void FinalizeInitialization() override;

  RangeResult Range(double l, double r) const;

  std::vector<DocId> GetAllDocsWithNonNullValues() const override;

 private:
  std::unique_ptr<RangeTreeBase> range_tree_;
};

// Base index for string based indices.
template <typename C> struct BaseStringIndex : public BaseIndex {
  using Container = BlockList<C>;
  using VecOrPtr = std::variant<std::vector<DocId>, const Container*>;

  BaseStringIndex(PMR_NS::memory_resource* mr, bool case_sensitive, bool with_suffixtrie);

  bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) override;
  void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;

  // Pointer is valid as long as index is not mutated. Nullptr if not found
  const Container* Matching(std::string_view str, bool strip_whitespace = true) const;

  // Iterate over all nodes matching on prefix.
  void MatchPrefix(std::string_view prefix, absl::FunctionRef<void(const Container*)> cb) const;

  // Iterate over all nodes matching suffix query. Faster if suffix trie is built.
  void MatchSuffix(std::string_view suffix, absl::FunctionRef<void(const Container*)> cb) const;

  // Iterate over all nodes matching infix query. Faster if suffix trie is built.
  void MatchInfix(std::string_view prefix, absl::FunctionRef<void(const Container*)> cb) const;

  // Returns all the terms that appear as keys in the reverse index.
  std::vector<std::string> GetTerms() const;

  std::vector<DocId> GetAllDocsWithNonNullValues() const override;

 protected:
  using StringList = DocumentAccessor::StringList;

  // Used by Add & Remove to get strings from document
  virtual std::optional<StringList> GetStrings(const DocumentAccessor& doc,
                                               std::string_view field) const = 0;

  // Used by Add & Remove to tokenize text value
  virtual absl::flat_hash_set<std::string> Tokenize(std::string_view value) const = 0;

  cmn::StringOrView NormalizeQueryWord(std::string_view word) const;
  static Container* GetOrCreate(search::RaxTreeMap<Container>* map, std::string_view word);
  static void Remove(search::RaxTreeMap<Container>* map, DocId id, std::string_view word);

  bool case_sensitive_ = false;
  bool unique_ids_ = true;  // If true, docs ids are unique in the index, otherwise they can repeat.
  search::RaxTreeMap<Container> entries_;
  std::optional<search::RaxTreeMap<Container>> suffix_trie_;
};

// Index for text fields.
// Hashmap based lookup per word.
struct TextIndex : public BaseStringIndex<CompressedSortedSet> {
  using StopWords = absl::flat_hash_set<std::string>;

  TextIndex(PMR_NS::memory_resource* mr, const StopWords* stopwords, const Synonyms* synonyms,
            bool with_suffixtrie);

 protected:
  std::optional<StringList> GetStrings(const DocumentAccessor& doc,
                                       std::string_view field) const override;
  absl::flat_hash_set<std::string> Tokenize(std::string_view value) const override;

 private:
  const StopWords* stopwords_;
  const Synonyms* synonyms_;
};

// Index for text fields.
// Hashmap based lookup per word.
struct TagIndex : public BaseStringIndex<SortedVector<DocId>> {
  TagIndex(PMR_NS::memory_resource* mr, SchemaField::TagParams params)
      : BaseStringIndex(mr, params.case_sensitive, params.with_suffixtrie),
        separator_{params.separator} {
  }

  DefragmentResult Defragment(PageUsage* page_usage) override;

 protected:
  std::optional<StringList> GetStrings(const DocumentAccessor& doc,
                                       std::string_view field) const override;
  absl::flat_hash_set<std::string> Tokenize(std::string_view value) const override;

 private:
  char separator_;
  std::string next_defrag_entry_;
  std::string next_defrag_suffix_entry_;
};

struct BaseVectorIndex : public BaseIndex {
  std::pair<size_t /*dim*/, VectorSimilarity> Info() const;

  bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) override final;

 protected:
  BaseVectorIndex(size_t dim, VectorSimilarity sim);

  virtual void AddVector(DocId id, const void* vector) = 0;

  size_t dim_;
  VectorSimilarity sim_;
};

// Index for vector fields.
// Only supports lookup by id.
struct FlatVectorIndex : public BaseVectorIndex {
  FlatVectorIndex(const SchemaField::VectorParams& params, PMR_NS::memory_resource* mr);

  void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;

  const float* Get(DocId doc) const;

  // Return all documents that have vectors in this index
  std::vector<DocId> GetAllDocsWithNonNullValues() const override;

 protected:
  void AddVector(DocId id, const void* vector) override;

 private:
  PMR_NS::vector<float> entries_;
};

struct GeoIndex : public BaseIndex {
  using point =
      boost::geometry::model::point<double, 2,
                                    boost::geometry::cs::geographic<boost::geometry::degree>>;
  using index_entry = std::pair<point, DocId>;

  explicit GeoIndex(PMR_NS::memory_resource* mr);
  ~GeoIndex();

  bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) override;
  void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;
  std::vector<DocId> RadiusSearch(double lon, double lat, double radius, std::string_view arg);
  std::vector<DocId> GetAllDocsWithNonNullValues() const override;

 private:
  using rtree = boost::geometry::index::rtree<index_entry, boost::geometry::index::linear<16>>;
  std::unique_ptr<rtree> rtree_;
};

// Defragments a map like data structure. The values in the map must have a `Defragment` method.
// Works with rax tree map and hash based maps
template <typename Container> struct DefragmentMap {
  using ValueType = Container::value_type;
  using Iterator = Container::iterator;

  DefragmentMap(Container& container, std::string* key) : key{key} {
    if (key->empty()) {
      it = container.end();
    } else if constexpr (requires { container.lower_bound(*key); }) {
      it = container.lower_bound(*key);
    } else {
      it = container.find(*key);
    }

    if (it == container.end()) {
      it = container.begin();
    }

    end = container.end();
  }

  // The key is set if the defragmentation has to stop mid way due to depleted quota
  DefragmentResult Defragment(PageUsage* page_usage) {
    if (page_usage->QuotaDepleted()) {
      return DefragmentResult{.quota_depleted = true, .objects_moved = 0};
    }

    DefragmentResult result;
    for (; it != end; ++it) {
      const auto& [k, map] = *it;
      if (result.Merge(DefragmentIndex(map, page_usage)).quota_depleted) {
        *key = k;
        break;
      }
    }

    if (it == end) {
      key->clear();
    }

    return result;
  }

 private:
  template <typename T> static auto DefragmentIndex(T& t, PageUsage* page_usage) {
    if constexpr (requires { t->Defragment(page_usage); }) {
      return t->Defragment(page_usage);
    } else {
      return t.Defragment(page_usage);
    }
  }

  std::string* key;
  Iterator it;
  Iterator end;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/lexer.lex
================================================
%top{
  // Our lexer need to know about Parser::symbol_type
  #include "core/search/parser.hh"
  #include "core/search/tag_types.h" // Include TagType enum
}

%{
  #include <absl/strings/escaping.h>
  #include <absl/strings/numbers.h>

  #include "base/logging.h"

  #define DFLY_LEXER_CC 1
     #include "core/search/scanner.h"
  #undef DFLY_LEXER_CC
%}

%o bison-cc-namespace="dfly.search" bison-cc-parser="Parser"
%o namespace="dfly.search"
%o class="Scanner" lex="Lex"
%o nodefault batch case-insensitive
/* %o debug */

/* Declarations before lexer implementation.  */
%{
  // A number symbol corresponding to the value in S.
  using dfly::search::Parser;
  using namespace std;
  using dfly::search::TagType;

  Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc);
  Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc);
%}

dq         \"
sq         \'
esc_chars  ['"\?\\abfnrtv]
esc_seq    \\{esc_chars}
term_ch    \w
tag_val_base_ch [^,.<>{}\[\]\\\"\?':;!@#$%^&*()\-+=~\/| ]|\\.
tag_val_ch {tag_val_base_ch}+(:+{tag_val_base_ch}*)*
astrsk_ch  \*


%{
  // Code run each time a pattern is matched.
%}

%%

%{
  // Code run each time lex() is called.
%}

[[:space:]]+   // skip white space

"("                  return Parser::make_LPAREN (loc());
")"                  return Parser::make_RPAREN (loc());
"*"                  return Parser::make_STAR (loc());
"-"                  return Parser::make_NOT_OP (loc());
":"                  return Parser::make_COLON (loc());
"=>"                 return Parser::make_ARROW (loc());
"["                  return Parser::make_LBRACKET (loc());
"]"                  return Parser::make_RBRACKET (loc());
"{"                  return Parser::make_LCURLBR (loc());
"}"                  return Parser::make_RCURLBR (loc());
"|"                  return Parser::make_OR_OP (loc());
","                  return Parser::make_COMMA (loc());
"KNN"                return Parser::make_KNN (loc());
"AS"                 return Parser::make_AS (loc());
"EF_RUNTIME"         return Parser::make_EF_RUNTIME (loc());
"VECTOR_RANGE"       return Parser::make_VECTOR_RANGE (loc());
"$YIELD_DISTANCE_AS" return Parser::make_YIELD_DISTANCE_AS (loc());

[0-9]{1,9}                          return Parser::make_UINT32(str(), loc());
[+-]?(([0-9]*[.])?[0-9]+|inf)       return Parser::make_DOUBLE(str(), loc());

{dq}([^"]|{esc_seq})*{dq}           return make_StringLit(matched_view(1, 1), loc());
{sq}([^']|{esc_seq})*{sq}           return make_StringLit(matched_view(1, 1), loc());

"$"{term_ch}+                       return ParseParam(str(), loc());
"@"{term_ch}+                       return Parser::make_FIELD(str(), loc());
{astrsk_ch}{term_ch}+{astrsk_ch}    return Parser::make_INFIX(string{matched_view(1, 1)}, loc());
{term_ch}+{astrsk_ch}               return Parser::make_PREFIX(string{matched_view(0, 1)}, loc());
{astrsk_ch}{term_ch}+               return Parser::make_SUFFIX(string{matched_view(1, 0)}, loc());

{term_ch}+                          return Parser::make_TERM(str(), loc());
{tag_val_ch}+{astrsk_ch}            return make_Tag(str(), TagType::PREFIX, loc());
{astrsk_ch}{tag_val_ch}+            return make_Tag(str(), TagType::SUFFIX, loc());
{astrsk_ch}{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::INFIX, loc());
{tag_val_ch}+                       return make_Tag(str(), TagType::REGULAR, loc());

<<EOF>> return Parser::make_YYEOF(loc());
%%

Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
  string res;
  if (!absl::CUnescape(src, &res))
    throw Parser::syntax_error (loc, "bad escaped string: " + string(src));

  return Parser::make_TERM(res, loc);
}

Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc) {
  string res;
  res.reserve(src.size());

  // Determine processing boundaries
  size_t start = (type == TagType::SUFFIX || type == TagType::INFIX) ? 1 : 0;
  size_t end = src.size();
  if (type == TagType::PREFIX || type == TagType::INFIX) {
    end--; // Skip the last '*' character
  }

    // Handle escaping
  bool escaped = false;
  for (size_t i = start; i < end; ++i) {
    if (escaped) {
      escaped = false;
    } else if (src[i] == '\\') {
      escaped = true;
      continue;
    }
    res.push_back(src[i]);
  }

  // Return the appropriate token type
  switch (type) {
    case TagType::PREFIX:
      return Parser::make_PREFIX(res, loc);
    case TagType::SUFFIX:
      return Parser::make_SUFFIX(res, loc);
    case TagType::INFIX:
      return Parser::make_INFIX(res, loc);
    case TagType::REGULAR:
    default:
      return Parser::make_TAG_VAL(res, loc);
  }
}


================================================
FILE: src/core/search/mrmw_mutex.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <condition_variable>
#include <mutex>

#include "base/logging.h"
#include "base/spinlock.h"

namespace dfly::search {

// Simple implementation of multi-Reader multi-Writer Mutex
// MRMWMutex supports concurrent reads or concurrent writes but not a mix of
// concurrent reads and writes at the same time.

class MRMWMutex {
 public:
  enum class LockMode : uint8_t { kReadLock, kWriteLock };

  MRMWMutex() : lock_mode_(LockMode::kReadLock) {
  }

  void Lock(LockMode mode) {
    std::unique_lock lk(mutex_);

    // If we have any active_runners we need to check lock mode
    if (active_runners_) {
      auto& waiters = GetWaiters(mode);
      waiters++;
      GetCondVar(mode).wait(lk, [&] { return lock_mode_ == mode; });
      waiters--;
    } else {
      // No active runners so just update to requested lock mode
      lock_mode_ = mode;
    }
    active_runners_++;
  }

  void Unlock(LockMode mode) {
    std::lock_guard lk(mutex_);
    LockMode inverse_mode = GetInverseMode(mode);
    active_runners_--;
    // If this was last runner and there are waiters on inverse mode
    if (!active_runners_ && GetWaiters(inverse_mode) > 0) {
      lock_mode_ = inverse_mode;
      GetCondVar(inverse_mode).notify_all();
    }
  }

  // Check if the mutex is currently held in read mode with at least one active runner.
  // For use in DCHECKs only - not thread-safe without external synchronization.
  bool IsReadLocked() const {
    return active_runners_ > 0 && lock_mode_ == LockMode::kReadLock;
  }

  // Non-blocking lock attempt. Returns true if the lock was acquired.
  bool TryLock(LockMode mode) {
    if (!mutex_.try_lock()) {
      return false;
    }
    if (active_runners_ && lock_mode_ != mode) {
      mutex_.unlock();
      return false;
    }
    if (!active_runners_) {
      lock_mode_ = mode;
    }
    active_runners_++;
    mutex_.unlock();
    return true;
  }

 private:
  inline size_t& GetWaiters(LockMode target_mode) {
    return target_mode == LockMode::kReadLock ? reader_waiters_ : writer_waiters_;
  };

  inline std::condition_variable_any& GetCondVar(LockMode target_mode) {
    return target_mode == LockMode::kReadLock ? reader_cond_var_ : writer_cond_var_;
  };

  static inline LockMode GetInverseMode(LockMode mode) {
    return mode == LockMode::kReadLock ? LockMode::kWriteLock : LockMode::kReadLock;
  }

  // TODO: use fiber sync primitives in future
  base::SpinLock mutex_;
  std::condition_variable_any reader_cond_var_, writer_cond_var_;

  size_t writer_waiters_ = 0, reader_waiters_ = 0;
  size_t active_runners_ = 0;
  LockMode lock_mode_;
};

class MRMWMutexLock {
 public:
  // Blocking lock.
  explicit MRMWMutexLock(MRMWMutex* mutex, MRMWMutex::LockMode mode)
      : mutex_(mutex), lock_mode_(mode), locked_(true) {
    mutex->Lock(lock_mode_);
  }

  // Non-blocking try-lock. Check locked() to see if the lock was acquired.
  MRMWMutexLock(MRMWMutex* mutex, MRMWMutex::LockMode mode, std::try_to_lock_t)
      : mutex_(mutex), lock_mode_(mode), locked_(mutex->TryLock(mode)) {
  }

  bool locked() const {
    return locked_;
  }

  ~MRMWMutexLock() {
    if (locked_)
      mutex_->Unlock(lock_mode_);
  }

  MRMWMutexLock(const MRMWMutexLock&) = delete;
  MRMWMutexLock(MRMWMutexLock&&) = delete;
  MRMWMutexLock& operator=(const MRMWMutexLock&) = delete;
  MRMWMutexLock& operator=(MRMWMutexLock&&) = delete;

 private:
  MRMWMutex* const mutex_;
  MRMWMutex::LockMode lock_mode_;
  bool locked_;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/mrmw_mutex_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/mrmw_mutex.h"

#include <random>
#include <thread>

#include "absl/flags/flag.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "util/fibers/pool.h"

ABSL_FLAG(bool, force_epoll, false, "If true, uses epoll api instead iouring to run tests");

namespace dfly::search {

namespace {

// Helper function to simulate reading operation
void ReadTask(MRMWMutex* mutex, std::atomic<size_t>& read_count, size_t sleep_time) {
  read_count.fetch_add(1, std::memory_order_relaxed);
  MRMWMutexLock lock(mutex, MRMWMutex::LockMode::kReadLock);
  util::ThisFiber::SleepFor(std::chrono::milliseconds(sleep_time));
  read_count.fetch_sub(1, std::memory_order_relaxed);
}

// Helper function to simulate writing operation
void WriteTask(MRMWMutex* mutex, std::atomic<size_t>& write_count, size_t sleep_time) {
  write_count.fetch_add(1, std::memory_order_relaxed);
  MRMWMutexLock lock(mutex, MRMWMutex::LockMode::kWriteLock);
  util::ThisFiber::SleepFor(std::chrono::milliseconds(sleep_time));
  write_count.fetch_sub(1, std::memory_order_relaxed);
}

constexpr size_t kReadTaskSleepTime = 50;
constexpr size_t kWriteTaskSleepTime = 100;

}  // namespace

class MRMWMutexTest : public ::testing::Test {
 protected:
  MRMWMutex mutex_;
  std::mt19937 generator_;
  void SetUp() override {
#ifdef __linux__
    if (absl::GetFlag(FLAGS_force_epoll)) {
      pp_.reset(util::fb2::Pool::Epoll(2));
    } else {
      pp_.reset(util::fb2::Pool::IOUring(16, 2));
    }
#else
    pp_.reset(util::fb2::Pool::Epoll(2));
#endif
    pp_->Run();
  }
  void TearDown() override {
    pp_->Stop();
    pp_.reset();
  }
  std::unique_ptr<util::ProactorPool> pp_;
};

// Test 1: Multiple readers can lock concurrently
TEST_F(MRMWMutexTest, MultipleReadersConcurrently) {
  std::atomic<size_t> read_count(0);
  const int num_readers = 5;

  std::vector<util::fb2::Fiber> readers;
  readers.reserve(num_readers);

  for (int i = 0; i < num_readers; ++i) {
    readers.emplace_back(pp_->at(0)->LaunchFiber(util::fb2::Launch::post, [&] {
      ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime);
    }));
  }

  // Wait for all reader threads to finish
  for (auto& t : readers) {
    t.Join();
  }

  // All readers should have been able to lock the mutex concurrently
  EXPECT_EQ(read_count.load(), 0);
}

// Test 2: Writer blocks readers and writer should get the lock exclusively
TEST_F(MRMWMutexTest, ReadersBlockWriters) {
  std::atomic<size_t> read_count(0);
  std::atomic<size_t> write_count(0);

  const int num_readers = 10;

  // Start multiple readers
  std::vector<util::fb2::Fiber> readers;
  readers.reserve(num_readers);

  for (int i = 0; i < num_readers; ++i) {
    readers.emplace_back(pp_->at(0)->LaunchFiber(util::fb2::Launch::post, [&] {
      ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime);
    }));
  }

  // Give readers time to acquire the lock
  util::ThisFiber::SleepFor(std::chrono::milliseconds(10));

  pp_->at(1)
      ->LaunchFiber(util::fb2::Launch::post,
                    [&] { WriteTask(&mutex_, std::ref(write_count), kWriteTaskSleepTime); })
      .Join();

  // Wait for all reader threads to finish
  for (auto& t : readers) {
    t.Join();
  }

  EXPECT_EQ(read_count.load(), 0);
  EXPECT_EQ(write_count.load(), 0);
}

// Test 3: Unlock transitions correctly and wakes up waiting threads
TEST_F(MRMWMutexTest, ReaderAfterWriter) {
  std::atomic<size_t> write_count(0);
  std::atomic<size_t> read_count(0);

  // Start a writer thread
  auto writer = pp_->at(1)->LaunchFiber(util::fb2::Launch::post, [&] {
    WriteTask(&mutex_, std::ref(write_count), kWriteTaskSleepTime);
  });

  // Give writer time to acquire the lock
  util::ThisFiber::SleepFor(std::chrono::milliseconds(10));

  // Now start a reader task that will block until the writer is done
  pp_->at(0)
      ->LaunchFiber(util::fb2::Launch::post,
                    [&] { ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime); })
      .Join();

  // Ensure that writer has completed
  writer.Join();

  EXPECT_EQ(read_count.load(), 0);
  EXPECT_EQ(write_count.load(), 0);
}

// Test 4: Ensure writer gets the lock after readers finish
TEST_F(MRMWMutexTest, WriterAfterReaders) {
  std::atomic<size_t> read_count(0);
  std::atomic<size_t> write_count(0);

  // Start multiple readers
  const int num_readers = 10;
  std::vector<util::fb2::Fiber> readers;
  readers.reserve(num_readers);

  for (int i = 0; i < num_readers; ++i) {
    readers.emplace_back(pp_->at(0)->LaunchFiber(util::fb2::Launch::post, [&] {
      ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime);
    }));
  }

  // Wait for all readers to acquire and release the lock
  for (auto& t : readers) {
    t.Join();
  }

  // Start the writer after all readers are done
  pp_->at(1)
      ->LaunchFiber(util::fb2::Launch::post,
                    [&] { WriteTask(&mutex_, std::ref(write_count), kWriteTaskSleepTime); })
      .Join();

  EXPECT_EQ(read_count.load(), 0);
  EXPECT_EQ(write_count.load(), 0);
}

TEST_F(MRMWMutexTest, MixWritersReadersOnDifferentFibers) {
  std::atomic<size_t> read_count(0);
  std::atomic<size_t> write_count(0);

  // Start multiple readers and writers
  const int num_threads = 100;
  std::vector<util::fb2::Fiber> threads;
  threads.reserve(num_threads);

  for (int i = 0; i < num_threads; ++i) {
    if (rand() % 3) {
      threads.emplace_back(pp_->at(0)->LaunchFiber(util::fb2::Launch::post, [&] {
        ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime);
      }));
    } else {
      threads.emplace_back(pp_->at(1)->LaunchFiber(util::fb2::Launch::post, [&] {
        WriteTask(&mutex_, std::ref(write_count), kWriteTaskSleepTime);
      }));
    }
  }

  // Wait for all readers to acquire and release the lock
  for (auto& t : threads) {
    t.Join();
  }
}

// TODO: Once we have fiber locking we can test scenario where we write/read on same fibers
// current implementation block thread so it is not possible to test this for now.

// Test 6: Mix of readers and writes on random fibers
// TEST_F(MRMWMutexTest, MixWritersReadersOnFibers) {
//   std::atomic<size_t> read_count(0);
//   std::atomic<size_t> write_count(0);

//   // Start multiple readers and writers
//   const int num_threads = 100;
//   std::vector<util::fb2::Fiber> threads;
//   threads.reserve(num_threads + 1);

//   // Add long read task that will block all write tasks
//   threads.emplace_back(
//       pp_->at(0)->LaunchFiber([&] { ReadTask(&mutex_, std::ref(read_count), 2000); }));

//   // Give long writer time to acquire the lock
//   util::ThisFiber::SleepFor(std::chrono::milliseconds(100));

//   size_t write_threads = 0;
//   for (int i = 0; i < num_threads; ++i) {
//     size_t fiber_id = rand() % 2;
//     if (rand() % 3) {
//       threads.emplace_back(pp_->at(fiber_id)->LaunchFiber(util::fb2::Launch::post, [&] {
//         ReadTask(&mutex_, std::ref(read_count), kReadTaskSleepTime);
//       }));
//     } else {
//       write_threads++;
//       threads.emplace_back(pp_->at(fiber_id)->LaunchFiber(util::fb2::Launch::post, [&] {
//         WriteTask(&mutex_, std::ref(write_count), kWriteTaskSleepTime);
//       }));
//     }
//   }

//   // All shorter threads should be done and only long one remains
//   util::ThisFiber::SleepFor(std::chrono::milliseconds(500));

//   EXPECT_EQ(read_count.load(), 1);

//   EXPECT_EQ(write_count.load(), write_threads);

//   // Wait for all readers to acquire and release the lock
//   for (auto& t : threads) {
//     t.Join();
//   }
// }

TEST_F(MRMWMutexTest, IsReadLockedReflectsState) {
  // Initially no lock is held.
  EXPECT_FALSE(mutex_.IsReadLocked());

  // Acquire a read lock and verify.
  mutex_.Lock(MRMWMutex::LockMode::kReadLock);
  EXPECT_TRUE(mutex_.IsReadLocked());

  // A second concurrent reader should still report read-locked.
  mutex_.Lock(MRMWMutex::LockMode::kReadLock);
  EXPECT_TRUE(mutex_.IsReadLocked());

  // Release one reader — still locked by the other.
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);
  EXPECT_TRUE(mutex_.IsReadLocked());

  // Release the last reader.
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);
  EXPECT_FALSE(mutex_.IsReadLocked());
}

TEST_F(MRMWMutexTest, IsReadLockedFalseUnderWriteLock) {
  mutex_.Lock(MRMWMutex::LockMode::kWriteLock);
  EXPECT_FALSE(mutex_.IsReadLocked());
  mutex_.Unlock(MRMWMutex::LockMode::kWriteLock);
}

TEST_F(MRMWMutexTest, TryLockSucceedsWhenFree) {
  // TryLock on a free mutex should succeed for both modes.
  EXPECT_TRUE(mutex_.TryLock(MRMWMutex::LockMode::kReadLock));
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);

  EXPECT_TRUE(mutex_.TryLock(MRMWMutex::LockMode::kWriteLock));
  mutex_.Unlock(MRMWMutex::LockMode::kWriteLock);
}

TEST_F(MRMWMutexTest, TryLockFailsOnConflict) {
  // Hold a read lock, then try-lock for write should fail.
  mutex_.Lock(MRMWMutex::LockMode::kReadLock);
  EXPECT_FALSE(mutex_.TryLock(MRMWMutex::LockMode::kWriteLock));
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);

  // Hold a write lock, then try-lock for read should fail.
  mutex_.Lock(MRMWMutex::LockMode::kWriteLock);
  EXPECT_FALSE(mutex_.TryLock(MRMWMutex::LockMode::kReadLock));
  mutex_.Unlock(MRMWMutex::LockMode::kWriteLock);
}

TEST_F(MRMWMutexTest, TryLockSucceedsForSameMode) {
  // Multiple readers via TryLock should all succeed.
  mutex_.Lock(MRMWMutex::LockMode::kReadLock);
  EXPECT_TRUE(mutex_.TryLock(MRMWMutex::LockMode::kReadLock));
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);
  mutex_.Unlock(MRMWMutex::LockMode::kReadLock);

  // Multiple writers via TryLock should all succeed.
  mutex_.Lock(MRMWMutex::LockMode::kWriteLock);
  EXPECT_TRUE(mutex_.TryLock(MRMWMutex::LockMode::kWriteLock));
  mutex_.Unlock(MRMWMutex::LockMode::kWriteLock);
  mutex_.Unlock(MRMWMutex::LockMode::kWriteLock);
}

TEST_F(MRMWMutexTest, MRMWMutexLockTryLockSemantics) {
  // Hold a read lock, then try a MRMWMutexLock for write — should not be locked.
  MRMWMutexLock read_lock(&mutex_, MRMWMutex::LockMode::kReadLock);
  MRMWMutexLock try_write(&mutex_, MRMWMutex::LockMode::kWriteLock, std::try_to_lock);
  EXPECT_FALSE(try_write.locked());

  // Same-mode try-lock via RAII should succeed.
  MRMWMutexLock try_read(&mutex_, MRMWMutex::LockMode::kReadLock, std::try_to_lock);
  EXPECT_TRUE(try_read.locked());
}

}  // namespace dfly::search


================================================
FILE: src/core/search/parser.y
================================================
%skeleton "lalr1.cc" // -*- C++ -*-
%require "3.5"  // fedora 32 has this one.

%defines  // %header starts from 3.8.1

%define api.namespace {dfly::search}

%define api.token.raw
%define api.token.constructor
%define api.value.type variant
%define api.parser.class {Parser}
%define parse.assert
%define api.value.automove true

// Added to header file before parser declaration.
%code requires {
  #include "core/search/ast_expr.h"

  namespace dfly {
  namespace search {
    class QueryDriver;
  }
  }
}

// Added to cc file
%code {
#include <absl/strings/ascii.h>
#include "core/search/query_driver.h"
#include "core/search/vector_utils.h"

#define yylex driver->scanner()->Lex

using namespace std;

uint32_t toUint32(string_view src);
double toDouble(string_view src);

}

%parse-param { QueryDriver *driver  }

%locations

%define parse.trace
%define parse.error verbose  // detailed
%define parse.lac full
%define api.token.prefix {TOK_}

%token
  LPAREN      "("
  RPAREN      ")"
  STAR        "*"
  ARROW       "=>"
  COLON       ":"
  LBRACKET    "["
  RBRACKET    "]"
  LCURLBR     "{"
  RCURLBR     "}"
  OR_OP       "|"
  COMMA       ","
  KNN         "KNN"
  AS          "AS"
  EF_RUNTIME  "EF_RUNTIME"
  VECTOR_RANGE      "VECTOR_RANGE"
  YIELD_DISTANCE_AS "$YIELD_DISTANCE_AS"
;

%token AND_OP

// Needed 0 at the end to satisfy bison 3.5.1
%token YYEOF 0
%token <std::string> TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" SUFFIX "suffix" INFIX "infix"

%precedence TERM TAG_VAL
%left OR_OP
%left AND_OP
%right NOT_OP
%precedence LPAREN RPAREN

%token <std::string> DOUBLE "double"
%token <std::string> UINT32 "uint32"
%nterm <AstExpr> final_query filter star_expr search_expr search_unary_expr search_or_expr search_and_expr bracket_filter_expr
%nterm <AstExpr> field_cond field_cond_expr field_unary_expr field_or_expr field_and_expr tag_list
%nterm <AstTagsNode::TagValueProxy> tag_list_element

%nterm <AstKnnNode> knn_query
%nterm <std::string> opt_knn_alias
%nterm <std::string> geounit
%nterm <std::optional<size_t>> opt_ef_runtime
%nterm <AstVectorRangeNode> vector_range_query
%nterm <double> vec_range_radius

%printer { yyo << $$; } <*>;

%%

final_query:
  filter
      { driver->Set(std::move($1)); }
  | filter ARROW knn_query
      { driver->Set(AstKnnNode(std::move($1), std::move($3))); }
  | vector_range_query
      { driver->Set(std::move($1)); }

knn_query:
  LBRACKET KNN UINT32 FIELD TERM opt_ef_runtime opt_knn_alias RBRACKET
    {
      // Accept any string as vector - validation happens later during search execution
      uint32_t knn_count = toUint32($3);
      auto field = std::move($4);
      auto alias = std::move($7);
      auto ef = $6;

      auto vec_result = BytesToFtVectorSafe($5);
      if (!vec_result) {
        // Create empty vector for invalid data - will return empty results during search
        auto empty_vec = std::make_unique<float[]>(0);
        $$ = AstKnnNode(knn_count, std::move(field), std::make_pair(std::move(empty_vec), size_t{0}), std::move(alias), ef);
      } else {
        $$ = AstKnnNode(knn_count, std::move(field), std::move(*vec_result), std::move(alias), ef);
      }
    }

opt_knn_alias:
  AS TERM { $$ = std::move($2); }
  | { $$ = std::string{}; }

opt_ef_runtime:
  /* empty */ { $$ = std::nullopt; }
  | EF_RUNTIME UINT32 { $$ = toUint32($2); }

vector_range_query:
  FIELD COLON LBRACKET VECTOR_RANGE vec_range_radius TERM RBRACKET ARROW LCURLBR YIELD_DISTANCE_AS COLON TERM RCURLBR
    {
      double radius = $5;
      auto field = std::move($1);
      auto alias = std::move($12);
      auto vec_result = BytesToFtVectorSafe($6);
      if (!vec_result) {
        auto empty_vec = std::make_unique<float[]>(0);
        $$ = AstVectorRangeNode(std::move(field), radius,
                                {std::move(empty_vec), size_t{0}}, std::move(alias));
      } else {
        $$ = AstVectorRangeNode(std::move(field), radius, std::move(*vec_result),
                                std::move(alias));
      }
    }

vec_range_radius:
  DOUBLE  { $$ = toDouble($1); }
  | UINT32 { $$ = static_cast<double>(toUint32($1)); }
  | TERM   { double v = 0; if (!absl::SimpleAtod($1, &v)) YYABORT; $$ = v; }

filter:
  search_expr               { $$ = std::move($1); }
  | star_expr               { $$ = std::move($1); }

star_expr:
  STAR                      { $$ = AstStarNode(); }
  | LPAREN star_expr RPAREN { $$ = std::move($2); }

search_expr:
  search_unary_expr         { $$ = std::move($1); }
  | search_and_expr         { $$ = std::move($1); }
  | search_or_expr          { $$ = std::move($1); }

search_and_expr:
  search_unary_expr search_unary_expr %prec AND_OP { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); }
  | search_and_expr search_unary_expr %prec AND_OP { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); }

search_or_expr:
  search_expr OR_OP search_and_expr                { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }
  | search_expr OR_OP search_unary_expr            { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }

search_unary_expr:
  LPAREN search_expr RPAREN           { $$ = std::move($2);                }
  | NOT_OP search_unary_expr          { $$ = AstNegateNode(std::move($2)); }
  | TERM                              { $$ = AstTermNode(std::move($1));   }
  | PREFIX                            { $$ = AstPrefixNode(std::move($1)); }
  | SUFFIX                            { $$ = AstSuffixNode(std::move($1)); }
  | INFIX                             { $$ = AstInfixNode(std::move($1));  }
  | UINT32                            { $$ = AstTermNode(std::move($1));   }
  | FIELD COLON field_cond            { $$ = AstFieldNode(std::move($1), std::move($3)); }

field_cond:
  TERM                                                  { $$ = AstTermNode(std::move($1));   }
  | UINT32                                              { $$ = AstTermNode(std::move($1));   }
  | STAR                                                { $$ = AstStarFieldNode();           }
  | NOT_OP field_cond                                   { $$ = AstNegateNode(std::move($2)); }
  | LPAREN field_cond_expr RPAREN                       { $$ = std::move($2); }
  | LBRACKET bracket_filter_expr RBRACKET               { $$ = std::move($2); }
  | LCURLBR tag_list RCURLBR                            { $$ = std::move($2); }
  | PREFIX                                              { $$ = AstPrefixNode(std::move($1)); }
  | SUFFIX                                              { $$ = AstSuffixNode(std::move($1)); }
  | INFIX                                               { $$ = AstInfixNode(std::move($1));  }

bracket_filter_expr:
  /* Numeric filter has form [(] UINT32|DOUBLE [COMMA] [(] UINT32|DOUBLE */
  DOUBLE DOUBLE                                { $$ = AstRangeNode(toDouble($1), false, toDouble($2), false); }
  | LPAREN DOUBLE DOUBLE                       { $$ = AstRangeNode(toDouble($2), true, toDouble($3), false); }
  | DOUBLE LPAREN DOUBLE                       { $$ = AstRangeNode(toDouble($1), false, toDouble($3), true); }
  | LPAREN DOUBLE LPAREN DOUBLE                { $$ = AstRangeNode(toDouble($2), true, toDouble($4), true); }
  | DOUBLE UINT32                              { $$ = AstRangeNode(toDouble($1), false, toUint32($2), false); }
  | LPAREN DOUBLE UINT32                       { $$ = AstRangeNode(toDouble($2), true, toUint32($3), false); }
  | DOUBLE LPAREN UINT32                       { $$ = AstRangeNode(toDouble($1), false, toUint32($3), true); }
  | LPAREN DOUBLE LPAREN UINT32                { $$ = AstRangeNode(toDouble($2), true, toUint32($4), true); }
  | UINT32 DOUBLE                              { $$ = AstRangeNode(toUint32($1), false, toDouble($2), false); }
  | LPAREN UINT32 DOUBLE                       { $$ = AstRangeNode(toUint32($2), true, toDouble($3), false); }
  | UINT32 LPAREN DOUBLE                       { $$ = AstRangeNode(toUint32($1), false, toDouble($3), true); }
  | LPAREN UINT32 LPAREN DOUBLE                { $$ = AstRangeNode(toUint32($2), true, toDouble($4), true); }
  | UINT32 UINT32                              { $$ = AstRangeNode(toUint32($1), false, toUint32($2), false); }
  | LPAREN UINT32 UINT32                       { $$ = AstRangeNode(toUint32($2), true, toUint32($3), false); }
  | UINT32 LPAREN UINT32                       { $$ = AstRangeNode(toUint32($1), false, toUint32($3), true); }
  | LPAREN UINT32 LPAREN UINT32                { $$ = AstRangeNode(toUint32($2), true, toUint32($4), true); }
  | DOUBLE COMMA DOUBLE                        { $$ = AstRangeNode(toDouble($1), false, toDouble($3), false); }
  | DOUBLE COMMA UINT32                        { $$ = AstRangeNode(toDouble($1), false, toUint32($3), false); }
  | UINT32 COMMA DOUBLE                        { $$ = AstRangeNode(toUint32($1), false, toDouble($3), false); }
  | UINT32 COMMA UINT32                        { $$ = AstRangeNode(toUint32($1), false, toUint32($3), false); }
  | LPAREN DOUBLE COMMA DOUBLE                 { $$ = AstRangeNode(toDouble($2), true, toDouble($4), false); }
  | DOUBLE COMMA LPAREN DOUBLE                 { $$ = AstRangeNode(toDouble($1), false, toDouble($4), true); }
  | LPAREN DOUBLE COMMA LPAREN DOUBLE          { $$ = AstRangeNode(toDouble($2), true, toDouble($5), true); }
  | LPAREN DOUBLE COMMA UINT32                 { $$ = AstRangeNode(toDouble($2), true, toUint32($4), false); }
  | DOUBLE COMMA LPAREN UINT32                 { $$ = AstRangeNode(toDouble($1), false, toUint32($4), true); }
  | LPAREN DOUBLE COMMA LPAREN UINT32          { $$ = AstRangeNode(toDouble($2), true, toUint32($5), true); }
  | LPAREN UINT32 COMMA DOUBLE                 { $$ = AstRangeNode(toUint32($2), true, toDouble($4), false); }
  | UINT32 COMMA LPAREN DOUBLE                 { $$ = AstRangeNode(toUint32($1), false, toDouble($4), true); }
  | LPAREN UINT32 COMMA LPAREN DOUBLE          { $$ = AstRangeNode(toUint32($2), true, toDouble($5), true); }
  | LPAREN UINT32 COMMA UINT32                 { $$ = AstRangeNode(toUint32($2), true, toUint32($4), false); }
  | UINT32 COMMA LPAREN UINT32                 { $$ = AstRangeNode(toUint32($1), false, toUint32($4), true); }
  | LPAREN UINT32 COMMA LPAREN UINT32          { $$ = AstRangeNode(toUint32($2), true, toUint32($5), true); }
  /* GEO filter */
  | DOUBLE DOUBLE UINT32 geounit               { $$ = AstGeoNode(toDouble($1), toDouble($2), toUint32($3), std::move($4)); }
  | DOUBLE DOUBLE DOUBLE geounit               { $$ = AstGeoNode(toDouble($1), toDouble($2), toDouble($3), std::move($4)); }

geounit:
  TERM
  {
    std::string unit = $1;
    absl::AsciiStrToUpper(&unit);
    if ((unit == "M") || (unit == "KM") || (unit == "MI") || (unit == "FT")) {
        $$ = unit;
    } else {
        YYABORT;
    }
  }

field_cond_expr:
  field_unary_expr { $$ = std::move($1); }
  | field_and_expr { $$ = std::move($1); }
  | field_or_expr  { $$ = std::move($1); }

field_and_expr:
  field_unary_expr field_unary_expr %prec AND_OP  { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); }
  | field_and_expr field_unary_expr %prec AND_OP  { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); }

field_or_expr:
  field_cond_expr OR_OP field_unary_expr          { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }
  | field_cond_expr OR_OP field_and_expr          { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }

field_unary_expr:
  LPAREN field_cond_expr RPAREN { $$ = std::move($2);                }
  | NOT_OP field_unary_expr     { $$ = AstNegateNode(std::move($2)); }
  | TERM                        { $$ = AstTermNode(std::move($1));   }
  | UINT32                      { $$ = AstTermNode(std::move($1));   }

tag_list:
  tag_list_element                       { $$ = AstTagsNode(std::move($1));                }
  | tag_list OR_OP tag_list_element      { $$ = AstTagsNode(std::move($1), std::move($3)); }

tag_list_element:
  TERM        { $$ = AstTermNode(std::move($1));   }
  | PREFIX    { $$ = AstPrefixNode(std::move($1)); }
  | SUFFIX    { $$ = AstSuffixNode(std::move($1)); }
  | INFIX     { $$ = AstInfixNode(std::move($1));  }
  | UINT32    { $$ = AstTermNode(std::move($1));   }
  | DOUBLE    { $$ = AstTermNode(std::move($1));   }
  | TAG_VAL   { $$ = AstTermNode(std::move($1));   }


%%

void
dfly::search::Parser::error(const location_type& l, const string& m)
{
  driver->Error(l, m);
}

std::uint32_t toUint32(string_view str) {
  uint32_t val = 0;
  std::ignore = absl::SimpleAtoi(str, &val); // no need to check the result because str is parsed by regex
  return val;
}

double toDouble(string_view str) {
  double val = 0;
  std::ignore = absl::SimpleAtod(str, &val); // no need to check the result because str is parsed by regex
  return val;
}


================================================
FILE: src/core/search/query_driver.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/query_driver.h"

namespace dfly {
namespace search {

QueryDriver::QueryDriver() : scanner_(std::make_unique<Scanner>()) {
}

QueryDriver::~QueryDriver() {
}

void QueryDriver::ResetScanner() {
  scanner_ = std::make_unique<Scanner>();
  scanner_->SetParams(params_);
}

void QueryDriver::Error(const Parser::location_type& loc, std::string_view msg) {
  VLOG(1) << "Parse error " << loc << ": " << msg;
}

void QueryDriver::SetOptionalFilters(const OptionalFilters* filters) {
  if (filters) {
    for (auto& [field, filter] : *filters) {
      expr_ = AstLogicalNode(std::move(expr_), filter->Node(field), AstLogicalNode::AND);
    }
  }
}

}  // namespace search

}  // namespace dfly


================================================
FILE: src/core/search/query_driver.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>

#include "core/search/ast_expr.h"
#include "core/search/base.h"
#include "core/search/parser.hh"
#include "core/search/scanner.h"

namespace dfly {

namespace search {

class QueryDriver {
 public:
  QueryDriver();
  ~QueryDriver();

  void SetInput(std::string str) {
    cur_str_ = std::move(str);
    scanner()->in(cur_str_);
  }

  void SetParams(const QueryParams* params) {
    params_ = params;
    scanner_->SetParams(params);
  }

  void SetOptionalFilters(const OptionalFilters* filters);

  Parser::symbol_type Lex() {
    return scanner()->Lex();
  }

  void ResetScanner();

  void Set(AstExpr expr) {
    expr_ = std::move(expr);
  }

  AstExpr Take() {
    return std::move(expr_);
  }

  const QueryParams& GetParams() const {
    return *params_;
  }

  Scanner* scanner() {
    return scanner_.get();
  }

  void Error(const Parser::location_type& loc, std::string_view msg);

 public:
  Parser::location_type location;

 private:
  const QueryParams* params_;
  AstExpr expr_;

  std::string cur_str_;
  std::unique_ptr<Scanner> scanner_;
};

}  // namespace search
}  // namespace dfly


================================================
FILE: src/core/search/range_tree.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/range_tree.h"

namespace dfly::search {

namespace {

std::vector<DocId> MergeAllResults(absl::Span<const RangeTree::RangeBlock*> blocks, double l,
                                   double r) {
  DCHECK(blocks.size() != 1 && blocks.size() != 2);

  // After the benchmarking, it is better to use inlined vector
  // than std::priority_queue
  absl::InlinedVector<RangeFilterIterator, 10> heap;
  heap.reserve(blocks.size());

  size_t doc_ids_count = 0;
  for (const auto* block : blocks) {
    auto it = MakeBegin(*block, l, r);
    if (!it.HasReachedEnd()) {
      heap.emplace_back(it);
      doc_ids_count += block->Size();
    }
  }

  std::vector<DocId> result;
  result.reserve(doc_ids_count);

  size_t size = heap.size();
  while (size) {
    DCHECK(!heap[0].HasReachedEnd());

    size_t min_doc_id_index = 0;
    for (size_t i = 1; i < size; ++i) {
      DCHECK(!heap[i].HasReachedEnd());

      if (*heap[i] < *heap[min_doc_id_index]) {
        min_doc_id_index = i;
      }
    }

    auto& it = heap[min_doc_id_index];
    result.push_back(*it);
    ++it;

    if (it.HasReachedEnd()) {
      // If we reached the end of the current block, remove it from the heap
      std::swap(heap[min_doc_id_index], heap[size - 1]);
      --size;
    }
  }

  DCHECK(std::is_sorted(result.begin(), result.end()));
  return result;
}

template <typename MapT> auto FindRangeBlockImpl(MapT& entries, double value) {
  DCHECK(!entries.empty());

  auto it = entries.lower_bound(value);
  if (it != entries.begin() && (it == entries.end() || it->first > value)) {
    // TODO: remove this, we do log N here
    // we can use negative left bouding to find the block
    --it;  // Move to the block that contains the value
  }

  DCHECK(it != entries.end() && it->first <= value);
  return it;
}

}  // namespace

RangeTree::RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size)
    : max_range_block_size_(max_range_block_size), entries_(mr) {
  // The tree has at least always a block with a negative infinity bound, so that any new insertion
  // goes at least somewhere
  CreateEmptyBlock(-std::numeric_limits<double>::infinity());
}

void RangeTree::Add(DocId id, double value) {
  DCHECK(std::isfinite(value));

  auto it = FindRangeBlock(value);
  auto& [lower_bound, block] = *it;

  // Don't disrupt large monovalue blocks, instead create new nextafter block
  if (block.Size() >= max_range_block_size_ && lower_bound == block.max_seen /* monovalue */ &&
      value != lower_bound /* but new value is different*/
  ) {
    // We use nextafter as the lower bound to "catch" all other possible inserts into the block,
    // as a decreasing `value` sequence would otherwise create lots of single-value blocks
    double lb2 = std::nextafter(lower_bound, std::numeric_limits<double>::infinity());
    CreateEmptyBlock(lb2)->second.Insert({id, value});
    return;
  }

  auto insert_result = block.Insert({id, value});
  LOG_IF(ERROR, !insert_result) << "RangeTree: Failed to insert id: " << id << ", value: " << value;

  // Small block or large monovalue block, not reducable by splitting
  if (block.Size() <= max_range_block_size_ || lower_bound == block.max_seen)
    return;

  SplitBlock(it);
}

void RangeTree::Remove(DocId id, double value) {
  DCHECK(std::isfinite(value));

  auto it = FindRangeBlock(value);
  RangeBlock& block = it->second;

  auto remove_result = block.Remove({id, value});
  LOG_IF(ERROR, !remove_result) << "RangeTree: Failed to remove id: " << id << ", value: " << value;

  // Merge with left block if both are relatively small and won't be forced to split soon
  if (block.size() < max_range_block_size_ / 4 && it != entries_.begin()) {
    auto lit = it;
    --lit;

    auto& lblock = lit->second;
    if (block.Size() + lblock.Size() < max_range_block_size_ / 2) {
      for (auto e : block)
        lblock.Insert(e);
      entries_.erase(it);
      stats_.merges++;
    }
  }
}

RangeResult RangeTree::Range(double l, double r) const {
  return {RangeBlocks(l, r), l, r};
}

absl::InlinedVector<const RangeTree::RangeBlock*, 5> RangeTree::RangeBlocks(double l,
                                                                            double r) const {
  DCHECK(l <= r);

  auto it_l = FindRangeBlock(l);
  auto it_r = FindRangeBlock(r);

  absl::InlinedVector<const RangeBlock*, 5> blocks;
  for (auto it = it_l;; ++it) {
    blocks.push_back(&it->second);
    if (it == it_r) {
      break;
    }
  }

  DCHECK(!blocks.empty());
  return blocks;
}

RangeResult RangeTree::GetAllDocIds() const {
  return RangeResult{GetAllBlocks()};
}

absl::InlinedVector<const RangeTree::RangeBlock*, 5> RangeTree::GetAllBlocks() const {
  absl::InlinedVector<const RangeBlock*, 5> blocks;
  blocks.reserve(entries_.size());

  for (const auto& entry : entries_) {
    blocks.push_back(&entry.second);
  }

  return blocks;
}

RangeTree::Map::iterator RangeTree::FindRangeBlock(double value) {
  return FindRangeBlockImpl(entries_, value);
}

RangeTree::Map::const_iterator RangeTree::FindRangeBlock(double value) const {
  return FindRangeBlockImpl(entries_, value);
}

RangeTree::Map::iterator RangeTree::CreateEmptyBlock(double lb) {
  return entries_
      .emplace(std::piecewise_construct, std::forward_as_tuple(lb),
               std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_))
      .first;
}

/*
There is an edge case in the SplitBlock method:
If split_result.left.Size() == 0, it means that all values in the block
were equal to the median value.
Because split works like this:
  - at the beginning it does not insert median values into the left or right block,
  - then it checks if left block is smaller than right block, if so, it adds
    median values to the left block, otherwise it adds it to the right block.
So if left block is empty, it means that left.Size() < right.Size() was false,
what means that right.Size() was also zero.
After that all median entries were added to the right block.

That means that we have equal values in the whole block,
and their count is greater than max_range_block_size_.
So we will do cascade splits of the right block.
TODO: we can optimize this case by splitting to three blocks:
 - empty left block with range [l, m),
 - middle block with range [m, std::nextafter(m, +inf)),
 - empty right block with range [std::nextafter(m, +inf), r)
*/
void RangeTree::SplitBlock(Map::iterator it) {
  double lower_bound = it->first;

  auto split_result = Split(std::move(it->second));

  const double m = split_result.median;
  DCHECK(!split_result.right.Empty());

  entries_.erase(it);
  stats_.splits++;

  // Insert left block if it's not empty or if its the first one (negative inf bound)
  if (!split_result.left.Empty() || std::isinf(lower_bound)) {
    if (!std::isinf(lower_bound))  // keep negative inf bound
      lower_bound = split_result.lmin;

    entries_.emplace(std::piecewise_construct, std::forward_as_tuple(lower_bound),
                     std::forward_as_tuple(std::move(split_result.left), split_result.lmax));
  }

  entries_.emplace(std::piecewise_construct, std::forward_as_tuple(m),
                   std::forward_as_tuple(std::move(split_result.right), split_result.rmax));

  DCHECK(TreeIsInCorrectState());
}

RangeTree::Stats RangeTree::GetStats() const {
  return Stats{.splits = stats_.splits, .merges = stats_.merges, .block_count = entries_.size()};
}

// Used for DCHECKs to check that the tree is in a correct state.
[[maybe_unused]] bool RangeTree::TreeIsInCorrectState() const {
  if (entries_.empty()) {
    return false;
  }

  double prev_range = entries_.begin()->first;
  for (auto it = std::next(entries_.begin()); it != entries_.end(); ++it) {
    const double& current_range = it->first;

    // Check that ranges are non-overlapping and sorted
    // Also there can not be gaps between ranges
    if (prev_range >= current_range) {
      return false;
    }

    prev_range = current_range;
  }

  return true;
}

RangeResult::RangeResult(std::vector<DocId> doc_ids) : result_(std::move(doc_ids)) {
}

RangeResult::RangeResult(absl::InlinedVector<RangeBlockPointer, 5> blocks)
    : RangeResult(std::move(blocks), -std::numeric_limits<double>::infinity(),
                  std::numeric_limits<double>::infinity()) {
}

RangeResult::RangeResult(absl::InlinedVector<RangeBlockPointer, 5> blocks, double l, double r) {
  if (blocks.size() == 1) {
    result_ = SingleBlockRangeResult(blocks[0], l, r);
  } else if (blocks.size() == 2) {
    result_ = TwoBlocksRangeResult(blocks[0], blocks[1], l, r);
  } else {
    result_ = MergeAllResults(absl::MakeSpan(blocks), l, r);
  }
}

std::vector<DocId> RangeResult::Take() {
  if (std::holds_alternative<DocsList>(result_)) {
    DCHECK(std::is_sorted(std::get<DocsList>(result_).begin(), std::get<DocsList>(result_).end()));
    return std::get<DocsList>(std::move(result_));
  }

  auto cb = [](const auto& v) {
    std::vector<DocId> result;
    result.reserve(v.size());
    std::copy(v.begin(), v.end(), std::back_inserter(result));
    DCHECK(std::is_sorted(result.begin(), result.end()));
    return result;
  };

  return std::visit(cb, result_);
}

void RangeTree::Builder::Add(DocId id, double value) {
  bool inserted = updates_.emplace(id, value).second;
  DCHECK(inserted);
}

void RangeTree::Builder::Remove(DocId id, double value) {
  if (!updates_.erase({id, value}))
    delayed_erased_.emplace(id, value);
}

void RangeTree::Builder::Populate(RangeTree* tree, const RenewableQuota& quota) {
  // Sort all elements by value
  std::vector<Entry> sorted_entries(updates_.begin(), updates_.end());
  std::ranges::sort(sorted_entries, {}, &Entry::second);
  updates_.clear();

  quota.Check();  // TODO: sort might take a long time

  // Add sorted elements in batches
  size_t max_size = tree->max_range_block_size_;
  RangeBlock* block = &tree->entries_.begin()->second;
  for (size_t idx = 0; idx < sorted_entries.size();) {
    // Create new block for each insertion batch (first goes into only first block)
    if (idx)
      block = &tree->CreateEmptyBlock(sorted_entries[idx].second)->second;

    // Insert until we filled a block and a new value started (equal value must be in same block)
    while (idx < sorted_entries.size()) {
      if (block->Size() >= max_size && sorted_entries[idx - 1].second != sorted_entries[idx].second)
        break;

      block->Insert(sorted_entries[idx]);
      idx++;

      // If we filled a new multiple of the block size due to equal entries, check quota
      if ((block->Size() - 1) / max_size != block->Size() / max_size)
        quota.Check();
    }

    quota.Check();  // Yield if needed
  }

  // Update entries accumulated during yields in batches while respecting quota.
  // Last loop is atomic (without quota checks) to ensure consistency
  size_t iterations = 3;
  while (iterations--) {
    // Take updates to allow new ones during suspensions
    auto stolen_erased = std::move(delayed_erased_);
    auto stolen_updates = std::move(updates_);
    delayed_erased_.clear();
    updates_.clear();

    auto check_quota = [&, ops = size_t(0)]() mutable {
      ops++;
      if (iterations && ops / max_size != (ops + 1) / max_size)
        quota.Check();
    };

    for (auto [id, v] : stolen_erased) {
      tree->Remove(id, v);
      check_quota();
    }

    for (auto [id, v] : stolen_updates) {
      tree->Add(id, v);
      check_quota();
    }
  }

  // Because last iteration was atomic
  DCHECK(updates_.empty());
  DCHECK(delayed_erased_.empty());
}

}  // namespace dfly::search


================================================
FILE: src/core/search/range_tree.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/btree_map.h>
#include <absl/container/flat_hash_set.h>

#include <memory>
#include <queue>
#include <vector>

#include "base/pmr/memory_resource.h"
#include "core/search/base.h"
#include "core/search/block_list.h"
#include "core/search/renewable_quota.h"

namespace dfly::search {
class RangeResult;

/* RangeTree is an index structure for numeric fields that allows efficient range queries.
   It maps disjoint numeric ranges (e.g., [0, 5), [5, 10), [10, 15), ...) to sorted sets of document
   IDs.

   Internally, it uses absl::btree_map<std::pair<double, double>, RangeBlock>, where each key
   represents a numeric value range, and the corresponding RangeBlock (similar to std::vector)
   stores (DocId, value) pairs, sorted by DocId.

   The parameter `max_range_block_size_` defines the maximum number of entries in a single
   RangeBlock. When a block exceeds this limit, it is split into two to maintain balanced
   performance.
*/
class RangeTree {
 public:
  friend class RangeResult;
  using Entry = std::pair<DocId, double>;

  // More efficient builder for range tree where updates are batched
  // and then applied in an optimized order inside Populate.
  struct Builder {
    void Add(DocId id, double value);
    void Remove(DocId id, double value);

    // Build tree from batched updates. Accepts new updates during suspensions.
    void Populate(RangeTree* tree, const RenewableQuota& quota);

   private:
    absl::flat_hash_set<Entry> updates_, delayed_erased_;
  };

  // Main node of numeric tree
  struct RangeBlock : public BlockList<SortedVector<Entry>> {
    template <typename... Ts>
    explicit RangeBlock(PMR_NS::memory_resource* mr, Ts... ts) : BlockList{mr, ts...} {
    }

    RangeBlock(BlockList<SortedVector<Entry>>&& bs, double maxv)
        : BlockList{std::move(bs)}, max_seen{maxv} {
    }

    bool Insert(Entry e) {
      max_seen = std::max(max_seen, e.second);
      return BlockList::Insert(e);
    }

    // Max value seen, might be not present anymore
    double max_seen = -std::numeric_limits<double>::infinity();
  };

  static constexpr size_t kDefaultMaxRangeBlockSize = 10'000;

  explicit RangeTree(PMR_NS::memory_resource* mr,
                     size_t max_range_block_size = kDefaultMaxRangeBlockSize);

  // Adds a document with a value to the index.
  void Add(DocId id, double value);

  // Removes a document with a value from the index.
  void Remove(DocId id, double value);

  // Returns all documents with values in the range [l, r].
  RangeResult Range(double l, double r) const;
  // Same as Range, but returns the blocks that contain the results.
  absl::InlinedVector<const RangeBlock*, 5> RangeBlocks(double l, double r) const;

  RangeResult GetAllDocIds() const;
  // Returns all blocks in the tree.
  absl::InlinedVector<const RangeBlock*, 5> GetAllBlocks() const;

  struct Stats {
    size_t splits = 0;
    size_t merges = 0;
    size_t block_count = 0;
  };

  Stats GetStats() const;

 private:
  using Map = absl::btree_map<double, RangeBlock, std::less<>,
                              PMR_NS::polymorphic_allocator<std::pair<double, RangeBlock>>>;

  Map::iterator FindRangeBlock(double value);
  Map::const_iterator FindRangeBlock(double value) const;

  Map::iterator CreateEmptyBlock(double lb);
  void SplitBlock(Map::iterator it);

  // Used for DCHECKs
  bool TreeIsInCorrectState() const;

 private:
  // The maximum size of a range block. If a block exceeds this size, it will be split
  size_t max_range_block_size_;
  Map entries_;

  struct {
    size_t splits = 0;
    size_t merges = 0;
  } stats_;
};

/* This iterator filters out entries that are not in the range [l, r].
   It is used to iterate over the RangeBlock and return only the entries
   that are within the specified range.
   The iterator is initialized with a range [l, r] and will skip entries
   that are outside this range. */
class RangeFilterIterator : public SeekableTag {
 private:
  static constexpr DocId kInvalidDocId = std::numeric_limits<DocId>::max();

  using RangeBlock = RangeTree::RangeBlock;
  using BaseIterator = RangeBlock::BlockListIterator;

 public:
  using iterator_category = BaseIterator::iterator_category;
  using difference_type = BaseIterator::difference_type;
  using value_type = DocId;
  using pointer = value_type*;
  using reference = value_type&;

  RangeFilterIterator(BaseIterator begin, BaseIterator end, double l, double r);

  value_type operator*() const;

  RangeFilterIterator& operator++();

  void SeekGE(DocId min_doc_id);

  bool operator==(const RangeFilterIterator& other) const;
  bool operator!=(const RangeFilterIterator& other) const;

  bool HasReachedEnd() const;

 private:
  void SkipInvalidEntries(DocId last_id);

  bool InRange(BaseIterator it) const;

  double l_, r_;
  BaseIterator current_, end_;
};

RangeFilterIterator MakeBegin(const RangeTree::RangeBlock& block, double l, double r);
RangeFilterIterator MakeEnd(const RangeTree::RangeBlock& block, double l, double r);

/* Separate class for merging results from a single RangeBlock.
   It provides an iterator interface to iterate over the entries in the block
   that are within the specified range [l, r].
   This is used when the result of a range query is contained within a single block.

   It is needed to avoid unnecessary complexity in the RangeResult class,
   which can handle both single and multiple blocks.
   It provides better performance and clarity when dealing with single block results. */
class SingleBlockRangeResult {
 public:
  SingleBlockRangeResult(const RangeTree::RangeBlock* block, double l, double r);

  RangeFilterIterator begin() const;
  RangeFilterIterator end() const;

  size_t size() const;

 private:
  double l_;
  double r_;
  const RangeTree::RangeBlock* block_ = nullptr;
};

/* Separate class for merging results from two RangeBlocks.
   It provides an iterator interface to iterate over the entries in both blocks
   that are within the specified range [l, r].
   It automatically merges the results from both blocks and provides a unified view.
   This is used when the result of a range query spans two blocks.

   It provides a more efficient way to handle results that span multiple blocks,
   avoiding unnecessary complexity in the RangeResult class.
   TODO: Implement efficient merging for more than two blocks and remove this class. */
class TwoBlocksRangeResult {
 public:
  TwoBlocksRangeResult(const RangeTree::RangeBlock* left_block,
                       const RangeTree::RangeBlock* right_block, double l, double r);

  size_t size() const;

  class MergingIterator : public SeekableTag {
   private:
    static constexpr DocId kInvalidDocId = std::numeric_limits<DocId>::max();

   public:
    using iterator_category = RangeFilterIterator::iterator_category;
    using difference_type = RangeFilterIterator::difference_type;
    using value_type = RangeFilterIterator::value_type;
    using pointer = RangeFilterIterator::pointer;
    using reference = RangeFilterIterator::reference;

    MergingIterator(RangeFilterIterator l, RangeFilterIterator r);

    value_type operator*() const;

    MergingIterator& operator++();

    void SeekGE(DocId min_doc_id);

    bool operator==(const MergingIterator& other) const;
    bool operator!=(const MergingIterator& other) const;

   private:
    void InitializeMin();

    DocId current_min_ = kInvalidDocId;
    RangeFilterIterator l_;
    RangeFilterIterator r_;
  };

  MergingIterator begin() const;
  MergingIterator end() const;

 private:
  double l_;
  double r_;
  const RangeTree::RangeBlock* left_block_ = nullptr;
  const RangeTree::RangeBlock* right_block_ = nullptr;
};

/* Represent the result of a range query on the RangeTree.
   It can contain results from a single block, two blocks, or several blocks.
   Several blocks are merged into a single result, which is represented by
   vector<DocId>.

   TODO: Implement efficient merging for more than two blocks */
class RangeResult {
 private:
  using RangeBlockPointer = const RangeTree::RangeBlock*;
  using RangeBlockIterator = RangeTree::RangeBlock::BlockListIterator;

  using DocsList = std::vector<DocId>;
  using Variant = std::variant<DocsList, SingleBlockRangeResult, TwoBlocksRangeResult>;

 public:
  RangeResult() = default;

  explicit RangeResult(std::vector<DocId> doc_ids);
  explicit RangeResult(absl::InlinedVector<RangeBlockPointer, 5> blocks);
  RangeResult(absl::InlinedVector<RangeBlockPointer, 5> blocks, double l, double r);

  std::vector<DocId> Take();

  Variant& GetResult();
  const Variant& GetResult() const;

 private:
  Variant result_;
};

// Implementation
/******************************************************************/
inline RangeFilterIterator::RangeFilterIterator(BaseIterator begin, BaseIterator end, double l,
                                                double r)
    : l_(l), r_(r), current_(begin), end_(end) {
  SkipInvalidEntries(kInvalidDocId);
}

inline RangeFilterIterator::value_type RangeFilterIterator::operator*() const {
  return (*current_).first;
}

inline RangeFilterIterator& RangeFilterIterator::operator++() {
  const DocId last_id = (*current_).first;
  ++current_;
  SkipInvalidEntries(last_id);
  return *this;
}

inline void RangeFilterIterator::SeekGE(DocId min_doc_id) {
  current_.SeekGE(min_doc_id);
  while (current_ != end_ && !InRange(current_)) {
    DCHECK((*current_).first >= min_doc_id);
    ++current_;
  }
}

inline bool RangeFilterIterator::operator==(const RangeFilterIterator& other) const {
  return current_ == other.current_;
}

inline bool RangeFilterIterator::operator!=(const RangeFilterIterator& other) const {
  return current_ != other.current_;
}

inline bool RangeFilterIterator::HasReachedEnd() const {
  return current_ == end_;
}

inline void RangeFilterIterator::SkipInvalidEntries(DocId last_id) {
  // Faster than using std::find_if
  while (current_ != end_ && (!InRange(current_) || (*current_).first == last_id)) {
    ++current_;
  }
}

inline bool RangeFilterIterator::InRange(BaseIterator it) const {
  return l_ <= (*it).second && (*it).second <= r_;
}

inline RangeFilterIterator MakeBegin(const RangeTree::RangeBlock& block, double l, double r) {
  return {block.begin(), block.end(), l, r};
}

inline RangeFilterIterator MakeEnd(const RangeTree::RangeBlock& block, double l, double r) {
  return {block.end(), block.end(), l, r};
}

inline SingleBlockRangeResult::SingleBlockRangeResult(const RangeTree::RangeBlock* block, double l,
                                                      double r)
    : l_(l), r_(r), block_(block) {
  DCHECK(block_ != nullptr);
}

inline RangeFilterIterator SingleBlockRangeResult::begin() const {
  return MakeBegin(*block_, l_, r_);
}

inline RangeFilterIterator SingleBlockRangeResult::end() const {
  return MakeEnd(*block_, l_, r_);
}

inline size_t SingleBlockRangeResult::size() const {
  return block_->Size();
}

inline TwoBlocksRangeResult::TwoBlocksRangeResult(const RangeTree::RangeBlock* left_block,
                                                  const RangeTree::RangeBlock* right_block,
                                                  double l, double r)
    : l_(l), r_(r), left_block_(left_block), right_block_(right_block) {
  DCHECK(left_block_ != nullptr);
  DCHECK(right_block_ != nullptr);
}

inline size_t TwoBlocksRangeResult::size() const {
  return left_block_->Size() + right_block_->Size();
}

inline TwoBlocksRangeResult::MergingIterator::MergingIterator(RangeFilterIterator l,
                                                              RangeFilterIterator r)
    : l_(std::move(l)), r_(std::move(r)) {
  InitializeMin();
}

inline TwoBlocksRangeResult::MergingIterator::value_type
TwoBlocksRangeResult::MergingIterator::operator*() const {
  return current_min_;
}

inline TwoBlocksRangeResult::MergingIterator& TwoBlocksRangeResult::MergingIterator::operator++() {
  auto increase_iterator = [&](RangeFilterIterator& it) {
    ++it;
    current_min_ = !it.HasReachedEnd() ? *it : std::numeric_limits<DocId>::max();
  };

  if (l_.HasReachedEnd()) {
    increase_iterator(r_);
  } else if (r_.HasReachedEnd()) {
    increase_iterator(l_);
  } else {
    DCHECK(!l_.HasReachedEnd() && !r_.HasReachedEnd());
    if (*l_ == current_min_) {
      ++l_;
    }
    if (*r_ == current_min_) {
      ++r_;
    }
    InitializeMin();
  }

  return *this;
}

inline void TwoBlocksRangeResult::MergingIterator::SeekGE(DocId min_doc_id) {
  l_.SeekGE(min_doc_id);
  r_.SeekGE(min_doc_id);
  InitializeMin();
}

inline bool TwoBlocksRangeResult::MergingIterator::operator==(
    const TwoBlocksRangeResult::MergingIterator& other) const {
  return l_ == other.l_ && r_ == other.r_;
}

inline bool TwoBlocksRangeResult::MergingIterator::operator!=(
    const TwoBlocksRangeResult::MergingIterator& other) const {
  return !(*this == other);
}

inline void TwoBlocksRangeResult::MergingIterator::InitializeMin() {
  DocId left_value = !l_.HasReachedEnd() ? *l_ : std::numeric_limits<DocId>::max();
  DocId right_value = !r_.HasReachedEnd() ? *r_ : std::numeric_limits<DocId>::max();
  current_min_ = std::min(left_value, right_value);
}

inline TwoBlocksRangeResult::MergingIterator TwoBlocksRangeResult::begin() const {
  return MergingIterator{MakeBegin(*left_block_, l_, r_), MakeBegin(*right_block_, l_, r_)};
}

inline TwoBlocksRangeResult::MergingIterator TwoBlocksRangeResult::end() const {
  return MergingIterator{MakeEnd(*left_block_, l_, r_), MakeEnd(*right_block_, l_, r_)};
}

inline RangeResult::Variant& RangeResult::GetResult() {
  return result_;
}

inline const RangeResult::Variant& RangeResult::GetResult() const {
  return result_;
}

}  // namespace dfly::search


================================================
FILE: src/core/search/range_tree_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/range_tree.h"

#include <absl/random/random.h>
#include <benchmark/benchmark.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <utility>

#include "base/gtest.h"
#include "base/logging.h"
#include "util/fibers/fibers.h"

namespace dfly::search {

class RangeTreeTest : public testing::Test {
 protected:
};

static constexpr double kMinRangeValue = std::numeric_limits<double>::min();
static constexpr double kMaxRangeValue = std::numeric_limits<double>::max();

using Entry = std::pair<DocId, double>;
using BlocksList = absl::InlinedVector<const RangeTree::RangeBlock*, 5>;

std::vector<Entry> ExtractDocPairs(const BlocksList& result) {
  std::vector<Entry> out;
  for (const auto& block : result) {
    for (const auto& entry : *block) {
      out.push_back(entry);
    }
  }
  return out;
}

std::vector<std::vector<Entry>> ExtractAllBlocks(const BlocksList& result) {
  std::vector<std::vector<Entry>> all;
  for (const auto& block : result) {
    std::vector<Entry> block_entries;
    for (const auto& entry : *block) {
      block_entries.push_back(entry);
    }
    all.push_back(std::move(block_entries));
  }
  return all;
}

MATCHER_P(UnorderedElementsAreDocPairsMatcher, expected_matchers, "") {
  return testing::ExplainMatchResult(testing::UnorderedElementsAreArray(expected_matchers),
                                     ExtractDocPairs(arg), result_listener);
}

MATCHER_P(BlocksAreMatcher, expected_blocks, "") {
  std::vector<testing::Matcher<std::vector<Entry>>> matchers;
  for (const auto& expected_entries : expected_blocks) {
    matchers.push_back(testing::UnorderedElementsAreArray(expected_entries));
  }
  return testing::ExplainMatchResult(testing::ElementsAreArray(matchers), ExtractAllBlocks(arg),
                                     result_listener);
}

auto UnorderedElementsAreDocPairs(std::vector<Entry> list) {
  return UnorderedElementsAreDocPairsMatcher(std::move(list));
}

auto BlocksAre(std::initializer_list<std::vector<Entry>> blocks) {
  return BlocksAreMatcher(std::vector<std::vector<Entry>>(blocks));
}

std::vector<DocId> ExtractDocIdsFromRange(const std::vector<Entry>& entries, double l, double r) {
  std::vector<DocId> result;
  for (const auto& entry : entries) {
    if (entry.second >= l && entry.second <= r) {
      result.push_back(entry.first);
    }
  }

  std::sort(result.begin(), result.end());
  result.erase(std::unique(result.begin(), result.end()), result.end());
  return result;
}

std::vector<DocId> MergeTwoBlocksRangeResult(const RangeTree& tree, double l, double r) {
  auto result = tree.Range(l, r).GetResult();
  DCHECK(std::holds_alternative<TwoBlocksRangeResult>(result));
  auto& two_blocks_result = std::get<TwoBlocksRangeResult>(result);
  return {two_blocks_result.begin(), two_blocks_result.end()};
}

TEST_F(RangeTreeTest, AddSimple) {
  RangeTree tree{PMR_NS::get_default_resource()};

  // Add some values
  tree.Add(1, 10.0);
  tree.Add(2, 20.0);
  tree.Add(2, 10.0);
  tree.Add(3, 30.0);
  tree.Add(4, 40.0);
  tree.Add(4, 60.0);

  auto result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs(
                          {{1, 10.0}, {2, 10.0}, {2, 20.0}, {3, 30.0}, {4, 40.0}, {4, 60.0}}));
}

TEST_F(RangeTreeTest, Add) {
  RangeTree tree{PMR_NS::get_default_resource(), 2};

  // Add some values
  tree.Add(1, 10.0);
  tree.Add(1, 20.0);
  tree.Add(2, 20.0);
  tree.Add(3, 20.0);
  tree.Add(4, 30.0);
  tree.Add(5, 30.0);
  tree.Add(6, 30.0);

  auto result = tree.RangeBlocks(10.0, 30.0);
  EXPECT_THAT(result,
              UnorderedElementsAreDocPairs(
                  {{1, 10.0}, {1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 30.0}, {5, 30.0}, {6, 30.0}}));

  // Test that the ranges was split correctly
  result = tree.RangeBlocks(kMinRangeValue, 19.0);
  EXPECT_THAT(result, UnorderedElementsAreDocPairs({{1, 10.0}}));

  result = tree.RangeBlocks(20.0, 29.0);
  EXPECT_THAT(result, UnorderedElementsAreDocPairs({{1, 20.0}, {2, 20.0}, {3, 20.0}}));

  result = tree.RangeBlocks(30.0, kMaxRangeValue);
  EXPECT_THAT(result, UnorderedElementsAreDocPairs({{4, 30.0}, {5, 30.0}, {6, 30.0}}));
}

TEST_F(RangeTreeTest, RemoveSimple) {
  RangeTree tree{PMR_NS::get_default_resource(), 2};

  // Add some values
  tree.Add(1, 10.0);
  tree.Add(2, 20.0);
  tree.Add(3, 30.0);
  tree.Add(4, 40.0);

  // Remove some values
  tree.Remove(1, 10.0);
  tree.Remove(2, 20.0);

  auto result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs({{3, 30.0}, {4, 40.0}}));
}

TEST_F(RangeTreeTest, Remove) {
  using Container = std::vector<Entry>;

  Container expected_values;
  RangeTree tree{PMR_NS::get_default_resource(), 2};

  const long long max_value = 100;
  long long step = 23;
  long long current_value = max_value;

  auto do_add = [&](DocId i) {
    const double value = static_cast<double>(current_value);
    auto it = std::find(expected_values.begin(), expected_values.end(), std::make_pair(i, value));

    if (it != expected_values.end()) {
      // If the value already exists, we do not add it again
      // The problem is that for now RangeTree does not support duplicates
      // TODO: fix this
      return;
    }

    // Otherwise, we add it to the expected values and to the tree
    expected_values.emplace_back(i, value);
    tree.Add(i, value);
    current_value = (max_value + current_value - step) % max_value;
  };

  auto add_entries_with_step = [&](size_t step) {
    for (size_t i = 0; i < 100; i += step) {
      do_add(i);
    }
  };

  auto do_remove = [&](size_t i) {
    auto pair = expected_values[i];
    tree.Remove(pair.first, pair.second);
  };

  auto remove_entries_with_step = [&](size_t step) {
    Container expected_values_copy;
    for (size_t i = 0; i < expected_values.size(); i++) {
      if (i % step == 0) {
        do_remove(i);
      } else {
        expected_values_copy.push_back(expected_values[i]);
      }
    }
    expected_values = std::move(expected_values_copy);
  };

  // First wave of Add and Remove
  add_entries_with_step(1);

  step = 37;
  current_value = max_value;
  add_entries_with_step(3);

  // Remove some values
  remove_entries_with_step(3);

  auto result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs(expected_values));

  // Second wave of Add and Remove
  step = 31;
  current_value = max_value;
  add_entries_with_step(5);

  // Remove a first half of the values
  remove_entries_with_step(2);

  result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs(expected_values));

  // Remove all values
  remove_entries_with_step(1);

  result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs({}));
}

TEST_F(RangeTreeTest, RangeSimple) {
  RangeTree tree{PMR_NS::get_default_resource(), 1};

  // Add some values
  tree.Add(1, 10.0);
  tree.Add(1, 20.0);
  tree.Add(2, 20.0);
  tree.Add(2, 30.0);
  tree.Add(3, 30.0);
  tree.Add(3, 40.0);
  tree.Add(4, 40.0);

  auto result = tree.RangeBlocks(10.0, 10.0);
  EXPECT_THAT(result, BlocksAre({{{1, 10.0}}}));

  result = tree.RangeBlocks(20.0, 20.0);
  EXPECT_THAT(result, BlocksAre({{{1, 20.0}, {2, 20.0}}}));

  result = tree.RangeBlocks(30.0, 30.0);
  EXPECT_THAT(result, BlocksAre({{{2, 30.0}, {3, 30.0}}}));

  result = tree.RangeBlocks(40.0, 40.0);
  EXPECT_THAT(result, BlocksAre({{{3, 40.0}, {4, 40.0}}}));

  result = tree.RangeBlocks(10.0, 30.0);
  EXPECT_THAT(result, BlocksAre({{{1, 10.0}}, {{1, 20.0}, {2, 20.0}}, {{2, 30.0}, {3, 30.0}}}));

  result = tree.RangeBlocks(20.0, 40.0);
  EXPECT_THAT(result,
              BlocksAre({{{1, 20.0}, {2, 20.0}}, {{2, 30.0}, {3, 30.0}}, {{3, 40.0}, {4, 40.0}}}));

  result = tree.RangeBlocks(10.0, 40.0);
  EXPECT_THAT(
      result,
      BlocksAre(
          {{{1, 10.0}}, {{1, 20.0}, {2, 20.0}}, {{2, 30.0}, {3, 30.0}}, {{3, 40.0}, {4, 40.0}}}));
}

TEST_F(RangeTreeTest, Range) {
  {
    RangeTree tree{PMR_NS::get_default_resource(), 4};

    tree.Add(1, 10.0);
    tree.Add(1, 20.0);
    tree.Add(2, 20.0);
    tree.Add(3, 30.0);
    tree.Add(4, 20.0);
    tree.Add(4, 30.0);

    auto result = tree.RangeBlocks(10.0, 30.0);
    EXPECT_THAT(
        result,
        BlocksAre({{{1, 10.0}}, {{1, 20.0}, {2, 20.0}, {4, 20.0}}, {{3, 30.0}, {4, 30.0}}}));
  }

  {
    RangeTree tree{PMR_NS::get_default_resource(), 4};

    tree.Add(1, 10.0);
    tree.Add(1, 20.0);
    tree.Add(2, 20.0);
    tree.Add(3, 20.0);
    tree.Add(4, 20.0);

    auto result = tree.RangeBlocks(10.0, 20.0);
    EXPECT_THAT(result, BlocksAre({{{1, 10.0}}, {{1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 20.0}}}));
  }

  {
    RangeTree tree{PMR_NS::get_default_resource(), 4};

    tree.Add(1, 10.0);
    tree.Add(2, 10.0);
    tree.Add(3, 10.0);
    tree.Add(4, 20.0);
    tree.Add(4, 10.0);

    auto result = tree.RangeBlocks(10.0, 20.0);
    EXPECT_THAT(result, BlocksAre({{{1, 10.0}, {2, 10.0}, {3, 10.0}, {4, 10.0}}, {{4, 20.0}}}));
  }
}

// Don't split single block with same value
TEST_F(RangeTreeTest, SingleBlockSplit) {
  RangeTree tree{PMR_NS::get_default_resource(), 4};

  for (DocId id = 1; id <= 16; id++)
    tree.Add(id, 5.0);

  // One split was made to create an empty leftmost block
  auto stats = tree.GetStats();
  EXPECT_EQ(stats.splits, 1u);
  EXPECT_EQ(stats.block_count, 2u);

  // Add value that causes a new block to be started
  tree.Add(20, 6.0);

  stats = tree.GetStats();
  EXPECT_EQ(stats.splits, 1u);       // detected ahead, so no split
  EXPECT_EQ(stats.block_count, 3u);  // but new block

  // No more splits with same 5.0
  tree.Add(17, 5.0);
  stats = tree.GetStats();
  EXPECT_EQ(stats.splits, 1u);

  // Verify block sizes
  auto blocks = tree.GetAllBlocks();
  EXPECT_EQ(blocks[0]->Size(), 0u);
  EXPECT_EQ(blocks[1]->Size(), 17u);
  EXPECT_EQ(blocks[2]->Size(), 1u);
}

// Make tree split and then delete every nth value to see if blocks merge properly
TEST_F(RangeTreeTest, BlockMerge) {
  RangeTree tree{PMR_NS::get_default_resource(), 8};
  for (DocId id = 1; id <= 64; id++)
    tree.Add(id, id);

  auto stats = tree.GetStats();
  uint64_t splits = stats.splits;
  EXPECT_GT(splits, 8u);

  // Blocks have at least half occupancy
  EXPECT_GT(stats.block_count, 64 / 8);
  EXPECT_LT(stats.block_count, 2 * 64 / 8);

  // Delete all except  %8 = 0, should trigger merge
  std::vector<Entry> expected;
  for (DocId id = 1; id <= 64; id++) {
    if (id % 8)
      tree.Remove(id, id);
    else
      expected.emplace_back(id, id);
  }

  // Only one block left now
  stats = tree.GetStats();
  size_t blocks = stats.block_count;
  EXPECT_LT(blocks, 4u);
  EXPECT_EQ(stats.merges + blocks - 1, splits);

  // Check the two entries remained
  auto result = tree.GetAllBlocks();
  EXPECT_THAT(result, UnorderedElementsAreDocPairs(expected));
}

TEST_F(RangeTreeTest, BugNotUniqueDoubleValues) {
  // TODO: fix the bug
  GTEST_SKIP() << "Bug not fixed yet";

  RangeTree tree{PMR_NS::get_default_resource()};

  tree.Add(1, 10.0);
  tree.Add(1, 10.0);
  tree.Remove(1, 10.0);

  auto result = tree.GetAllBlocks();
  EXPECT_THAT(result, BlocksAre({{{1, 10.0}}}));
}

TEST_F(RangeTreeTest, RangeResultTwoBlocksSimple) {
  RangeTree tree{PMR_NS::get_default_resource(), 4};

  // First block: [[1, 10.0], [16, 12.0], [12, 15.0], [5, 17.0]]
  // Second block: [[8, 20.0], [5, 30.0], [12, 50.0], [20, 55.0]]
  // [10.0, 12.0, 15.0, 17.0] | [20.0, 30.0, 50.0, 55.0]
  tree.Add(1, 10.0);   // 1
  tree.Add(5, 30.0);   // 2
  tree.Add(20, 55.0);  // 2
  tree.Add(5, 17.0);   // 1
  tree.Add(8, 20.0);   // 2
  tree.Add(16, 12.0);  // 1
  tree.Add(12, 15.0);  // 1
  tree.Add(12, 50.0);  // 2

  EXPECT_THAT(tree.RangeBlocks(10.0, 55.0),
              BlocksAre({{{1, 10.0}, {16, 12.0}, {12, 15.0}, {5, 17.0}},
                         {{8, 20.0}, {5, 30.0}, {12, 50.0}, {20, 55.0}}}));

  std::vector<Entry> entries = {{1, 10.0}, {16, 12.0}, {12, 15.0}, {5, 17.0},
                                {8, 20.0}, {5, 30.0},  {12, 50.0}, {20, 55.0}};

  for (size_t i = 0; i < entries.size() / 2; i++) {
    const double l = entries[i].second;
    for (size_t j = entries.size() / 2; j < entries.size(); j++) {
      const double r = entries[j].second;
      auto range_result = MergeTwoBlocksRangeResult(tree, l, r);
      EXPECT_THAT(range_result, testing::ElementsAreArray(ExtractDocIdsFromRange(entries, l, r)));
    }
  }
}

TEST_F(RangeTreeTest, RangeResultTwoBlocks) {
  RangeTree tree{PMR_NS::get_default_resource(), 50};

  const long long max_value = 100;
  long long step = 23;
  long long current_value = max_value;

  std::vector<Entry> entries;
  for (size_t i = 0; i < 20; i++) {
    const double value = static_cast<double>(current_value);
    entries.emplace_back(i, value);
    entries.emplace_back(i, value + 100.0);
    current_value = (max_value + current_value - step) % max_value;
  }
  for (size_t i = 20; i < 80; i++) {
    const double value = static_cast<double>(current_value);
    entries.emplace_back(i, value);
    current_value = (max_value + current_value - step) % max_value;
  }

  DCHECK(entries.size() == 100);

  std::sort(entries.begin(), entries.end(),
            [](const Entry& a, const Entry& b) { return a.second < b.second; });

  auto add_entries = [&tree, &entries](size_t start, size_t end) {
    for (size_t i = start; i < end; i++) {
      tree.Add(entries[i].first, entries[i].second);
    }
  };

  add_entries(0, 25);
  add_entries(50, 76);
  add_entries(25, 50);
  add_entries(76, entries.size());

  for (size_t i = 0; i < 50; i++) {
    const double l = entries[i].second;
    for (size_t j = 50; j < entries.size(); j++) {
      const double r = entries[j].second;
      auto range_result = MergeTwoBlocksRangeResult(tree, l, r);
      EXPECT_THAT(range_result, testing::ElementsAreArray(ExtractDocIdsFromRange(entries, l, r)));
    }
  }
}

struct BuilderTest : public RangeTreeTest {
  static void Shuffle(std::vector<RangeTree::Entry>* entries) {
    std::random_device rd;
    std::shuffle(entries->begin(), entries->end(), std::mt19937(rd()));
  }
};

// Test if the builder builds the tree correctly
TEST_F(BuilderTest, Builder) {
  RangeTree tree{PMR_NS::get_default_resource(), 4};
  RangeTree::Builder builder;

  // Prepare entries shuffled
  std::vector<RangeTree::Entry> entries;
  entries.reserve(100);
  for (size_t i = 0; i < 120; i++)
    entries.emplace_back(i, double(i) / 2);
  Shuffle(&entries);

  // Add fake entries
  for (auto [id, v] : entries) {
    builder.Add(id, v * 2);
  }

  // Add all entries for real
  for (auto [id, v] : entries) {
    builder.Remove(id, v * 2);
    builder.Add(id, v);
  }

  // Shuffle again
  Shuffle(&entries);

  // Remove last
  while (entries.size() > 100) {
    builder.Remove(entries.back().first, entries.back().second);
    entries.pop_back();
  }

  // Build tree
  builder.Populate(&tree, RenewableQuota::Unlimited());

  // Sort for comparisons
  std::ranges::sort(entries, {}, &RangeTree::Entry::first);
  auto entry_ids = entries | std::views::keys;

  // Check correctness of all ids
  {
    auto all_values = tree.Range(-1000, +1000);
    auto got_ids = all_values.Take();
    EXPECT_TRUE(std::ranges::equal(got_ids, entry_ids));
  }

  // Check correctness of all values including ids
  {
    auto all_pairs = ExtractDocPairs(tree.GetAllBlocks());
    std::sort(all_pairs.begin(), all_pairs.end());
    EXPECT_EQ(all_pairs, entries);
  }
}

TEST_F(BuilderTest, BuilderUpdates) {
  RangeTree tree{PMR_NS::get_default_resource(), 5};
  RangeTree::Builder builder;

  // Prepare entries shuffled
  std::vector<RangeTree::Entry> entries;
  entries.reserve(1000);
  for (size_t i = 0; i < 1000; i++) {
    entries.emplace_back(i, double(i) / 2);
    entries.emplace_back(i, double(i) / 2 + 0.25);
  }
  Shuffle(&entries);

  // Insert entries
  for (auto entry : entries)
    builder.Add(entry.first, entry.second);

  // Construct while suspending at every node
  bool done = false;
  util::fb2::Fiber populate_fb{[&] {
    builder.Populate(&tree, {0});  // suspend each time
    done = true;
  }};

  // In the meantime insert new entries
  DocId current = entries.size();
  bool add = false;
  size_t added = 0;
  absl::InsecureBitGen gen;
  while (!done) {
    if (add) {
      entries.emplace_back(current, double(current) / 2);
      builder.Add(entries.back().first, entries.back().second);
      current++;
    } else {
      size_t idx = absl::Uniform(gen, size_t{0}, entries.size());
      auto it = entries.begin() + idx;
      builder.Remove(it->first, it->second);

      // Change our mind with 50% prob and just update
      if (current % 2 == 0) {
        it->second += 1;
        builder.Add(it->first, it->second);
      } else {
        entries.erase(it);
      }
    }
    add = !add;
    added++;
    util::ThisFiber::Yield();
  }

  EXPECT_GT(added, 5u);  // At least some updates were performed

  populate_fb.Join();

  // Sort for comparisons
  std::sort(entries.begin(), entries.end());
  // auto entry_ids_view = entries | std::views::keys;

  // Check correctness of all ids
  // TODO: Range tree doesn't filter duplicate ids
  //{
  //  auto all_values = tree.Range(-100000, +100000);
  //  auto got_ids = all_values.Take();
  //
  //  std::set entry_ids_set(entry_ids_view.begin(), entry_ids_view.end());
  //  std::vector entry_ids_vec(entry_ids_set.begin(), entry_ids_set.end());
  //
  //  EXPECT_EQ(got_ids, entry_ids_vec);
  //}

  // Check correctness of all values including ids
  {
    auto all_pairs = ExtractDocPairs(tree.GetAllBlocks());
    std::sort(all_pairs.begin(), all_pairs.end());
    EXPECT_EQ(all_pairs, entries);
  }
}

// Test tree doesn't create unnecessary nodes after initialization
TEST_F(RangeTreeTest, DiscreteIntialization) {
  RangeTree tree{PMR_NS::get_default_resource(), 4};
  RangeTree::Builder builder;

  for (size_t i = 0; i < 32; i++) {
    builder.Add(i, i % 4);
  }
  builder.Populate(&tree, RenewableQuota::Unlimited());

  auto result = tree.GetAllBlocks();
  EXPECT_EQ(result.size(), 4u);
}

// Benchmark tree insertion performance with set of discrete values
static void BM_DiscreteInsertion(benchmark::State& state) {
  RangeTree tree{PMR_NS::get_default_resource()};

  absl::InsecureBitGen gen{};
  size_t variety = state.range(0);

  DocId id = 0;
  for (auto _ : state) {
    double v = absl::Uniform(gen, 0u, variety);
    tree.Add(id++, v);
  }
}

BENCHMARK(BM_DiscreteInsertion)->Arg(2)->Arg(12)->Arg(128)->Arg(1024);

}  // namespace dfly::search


================================================
FILE: src/core/search/rax_tree.h
================================================
#pragma once

#include <cassert>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>

#include "base/pmr/memory_resource.h"

extern "C" {
#include "redis/rax.h"
}

namespace detail {

// Copies an iterators state into another by performing a fresh seek on the source's key. While this
// is a little more expensive, it is done to avoid deep copying pointers from raxIterator and
// raxStart while taking care of self-reference links in both structs. The return value is used to
// decide whether to advance iterator after a successful seek.
inline bool CopyIteratorState(raxIterator& destination, raxIterator& source) {
  raxStart(&destination, source.rt);
  if (!destination.rt)
    return false;

  if (!raxSeek(&destination, "=", source.key, source.key_len)) {
    // called from constructor, so no error can be returned. but set up the same state as
    // the SeekIterator constructor, so that it will return true on comparison to RaxTreeMap::end()
    raxStop(&destination);
    destination.rt = nullptr;
    return false;
  }

  return true;
}

}  // namespace detail

namespace dfly::search {

// absl::flat_hash_map/std::unordered_map compatible tree map based on rax tree.
// Allocates all objects on heap (with custom memory resource) as rax tree operates fully on
// pointers.
// TODO: Add full support for polymorphic allocators, including rax trie node allocations
template <typename V> struct RaxTreeMap {
  using value_type = V;

  struct FindIterator;

  // Simple seeking iterator
  struct SeekIterator {
    SeekIterator() {
      it_.rt = nullptr;
    }

    SeekIterator(rax* tree, const char* op, std::string_view key) {
      raxStart(&it_, tree);
      if (raxSeek(&it_, op, to_key_ptr(key), key.size())) {  // Successfuly seeked
        operator++();
      } else {
        InvalidateIterator();
      }
    }

    explicit SeekIterator(rax* tree) : SeekIterator(tree, "^", std::string_view{nullptr, 0}) {
    }

    SeekIterator(SeekIterator&& other) noexcept : it_{} {
      *this = std::move(other);
    }

    SeekIterator& operator=(SeekIterator&& other) noexcept {
      if (this != &other) {
        if (IsValid()) {
          InvalidateIterator();
        }
        if (::detail::CopyIteratorState(it_, other.it_))
          operator++();
        if (other.IsValid())
          other.InvalidateIterator();
      }
      return *this;
    }

    /* Copy constructor deleted to avoid double iterator invalidation */
    SeekIterator(const SeekIterator&) = delete;
    SeekIterator& operator=(const SeekIterator&) = delete;

    ~SeekIterator() {
      if (IsValid()) {
        InvalidateIterator();
      }
    }

    bool operator==(const SeekIterator& rhs) const {
      if (!IsValid() || !rhs.IsValid())
        return !IsValid() && !rhs.IsValid();
      return it_.node == rhs.it_.node;
    }

    bool operator!=(const SeekIterator& rhs) const {
      return !operator==(rhs);
    }

    SeekIterator& operator++() {
      int next_result = raxNext(&it_);
      if (!next_result) {  // OOM or we reached the end of the tree
        InvalidateIterator();
      }
      return *this;
    }

    /* After operator++() the first value (string_view) is invalid. So make sure your copied it to
     * string */
    std::pair<std::string_view, V&> operator*() const {
      assert(IsValid() && it_.node && it_.node->iskey && it_.data);
      return {std::string_view{reinterpret_cast<const char*>(it_.key), it_.key_len},
              *reinterpret_cast<V*>(it_.data)};
    }

    bool IsValid() const {
      return it_.rt;
    }

   private:
    void InvalidateIterator() {
      raxStop(&it_);
      it_.rt = nullptr;
    }

    raxIterator it_;
  };
  using iterator = SeekIterator;

  // Result of find() call. Inherits from pair to mimic iterator interface, not incrementable.
  struct FindIterator : public std::optional<std::pair<std::string, V&>> {
    bool operator==(const SeekIterator& rhs) const {
      if (!this->has_value() || !rhs.IsValid())
        return !this->has_value() && !rhs.IsValid();
      return (*this)->first == (*rhs).first;
    }

    bool operator!=(const SeekIterator& rhs) const {
      return !operator==(rhs);
    }
  };

 public:
  explicit RaxTreeMap(PMR_NS::memory_resource* mr) : tree_(raxNew()), alloc_(mr) {
  }

  ~RaxTreeMap() {
    using Allocator = decltype(alloc_);

    auto free_callback = [](void* data, void* context) {
      Allocator* allocator = static_cast<Allocator*>(context);
      V* ptr = static_cast<V*>(data);
      std::allocator_traits<Allocator>::destroy(*allocator, ptr);
      allocator->deallocate(ptr, 1);
    };

    raxFreeWithCallbackAndArgument(tree_, free_callback, &alloc_);
  }

  size_t size() const {
    return raxSize(tree_);
  }

  auto begin() const {
    return SeekIterator{tree_};
  }

  auto end() const {
    return SeekIterator{};
  }

  auto lower_bound(std::string_view key) const {
    return SeekIterator{tree_, ">=", key};
  }

  FindIterator find(std::string_view key) const {
    if (void* ptr = nullptr; raxFind(tree_, to_key_ptr(key), key.size(), &ptr))
      return FindIterator{std::pair<std::string, V&>(std::string(key), *reinterpret_cast<V*>(ptr))};

    return FindIterator{std::nullopt};
  }

  template <typename... Args>
  std::pair<FindIterator, bool> try_emplace(std::string_view key, Args&&... args);

  void erase(FindIterator it) {
    V* old = nullptr;
    raxRemove(tree_, to_key_ptr(it->first.data()), it->first.size(),
              reinterpret_cast<void**>(&old));
    std::allocator_traits<decltype(alloc_)>::destroy(alloc_, old);
    alloc_.deallocate(old, 1);
  }

  auto& get_allocator() const {
    return alloc_;
  }

 private:
  static unsigned char* to_key_ptr(std::string_view key) {
    return reinterpret_cast<unsigned char*>(const_cast<char*>(key.data()));
  }

  rax* tree_;
  PMR_NS::polymorphic_allocator<V> alloc_;
};

template <typename V>
template <typename... Args>
std::pair<typename RaxTreeMap<V>::FindIterator, bool> RaxTreeMap<V>::try_emplace(
    std::string_view key, Args&&... args) {
  if (auto it = find(key); it)
    return {it, false};

  V* ptr = alloc_.allocate(1);
  std::allocator_traits<decltype(alloc_)>::construct(alloc_, ptr, std::forward<Args>(args)...);

  V* old = nullptr;
  raxInsert(tree_, to_key_ptr(key), key.size(), ptr, reinterpret_cast<void**>(&old));
  assert(!old);

  auto it = std::make_optional(std::pair<std::string, V&>(std::string(key), *ptr));
  return std::make_pair(std::move(FindIterator{it}), true);
}

}  // namespace dfly::search


================================================
FILE: src/core/search/rax_tree_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/rax_tree.h"

#include <absl/container/btree_set.h>
#include <absl/strings/str_cat.h>
#include <gtest/gtest.h>
#include <mimalloc.h>

#include <algorithm>
#include <memory_resource>

#include "base/gtest.h"
#include "base/iterator.h"
#include "base/logging.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly::search {

using namespace std;

struct RaxTreeTest : public ::testing::Test {
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
  }
};

TEST_F(RaxTreeTest, EmplaceAndIterate) {
  RaxTreeMap<std::string> map(pmr::get_default_resource());

  vector<pair<string, string>> elements(90);
  for (int i = 10; i < 100; i++)
    elements[i - 10] = make_pair(absl::StrCat("key-", i), absl::StrCat("value-", i));

  for (auto& [key, value] : elements) {
    auto [it, inserted] = map.try_emplace(key, value);
    EXPECT_TRUE(inserted);
    EXPECT_EQ(it->first, key);
    EXPECT_EQ(it->second, value);
  }

  size_t i = 0;
  for (auto [key, value] : map) {
    EXPECT_EQ(elements[i].first, key);
    EXPECT_EQ(elements[i].second, value);
    i++;
  }
}

TEST_F(RaxTreeTest, LowerBound) {
  RaxTreeMap<int> map(pmr::get_default_resource());
  vector<string> keys;

  for (unsigned i = 0; i < 5; i++) {
    for (unsigned j = 0; j < 5; j++) {
      keys.emplace_back(absl::StrCat("key-", string(1, 'a' + i), "-", j));
      map.try_emplace(keys.back(), 0);
    }
  }

  auto it1 = map.lower_bound("key-c-3");
  auto it2 = lower_bound(keys.begin(), keys.end(), "key-c-3");

  while (it1 != map.end()) {
    EXPECT_EQ((*it1).first, *it2);
    ++it1;
    ++it2;
  }

  EXPECT_TRUE(it1 == map.end());
  EXPECT_TRUE(it2 == keys.end());

  // Test lower bound empty string
  vector<string> keys2;
  for (auto it = map.lower_bound(string_view{}); it != map.end(); ++it)
    keys2.emplace_back((*it).first);
  EXPECT_EQ(keys, keys2);
}

TEST_F(RaxTreeTest, Find) {
  RaxTreeMap<int> map(pmr::get_default_resource());
  for (unsigned i = 100; i < 999; i += 2)
    map.try_emplace(absl::StrCat("value-", i), i);

  auto it = map.begin();
  for (unsigned i = 100; i < 999; i++) {
    auto fit = map.find(absl::StrCat("value-", i));
    if (i % 2 == 0) {
      EXPECT_TRUE(fit == it);
      EXPECT_EQ(fit->second, i);
      ++it;
    } else {
      EXPECT_TRUE(fit == map.end());
    }
  }

  // Test find with empty string
  EXPECT_TRUE(map.find(string_view{}) == map.end());
}

/* Run with mimalloc to make sure there is no double free */
TEST_F(RaxTreeTest, Iterate) {
  const char* kKeys[] = {
      "aaaaaaaaaaaaaaaaaaaa",
      "bbbbbbbbbbbbbbbbbbbbbb"
      "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc",
      "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
      "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee",
  };

  RaxTreeMap<int> map(pmr::get_default_resource());
  for (const char* key : kKeys) {
    map.try_emplace(key, 2);
  }

  for (auto it = map.begin(); it != map.end(); ++it) {
    EXPECT_EQ((*it).second, 2);
  }

  for (auto it = map.begin(); it != map.end(); ++it) {
    EXPECT_EQ((*it).second, 2);
  }
}

TEST_F(RaxTreeTest, MoveIterator) {
  RaxTreeMap<bool> m{pmr::get_default_resource()};
  RaxTreeMap<bool>::SeekIterator tmp;
  {
    // empty map, iterator invalidated on construction
    tmp = m.begin();
    const auto it = std::move(tmp);
    EXPECT_FALSE(tmp.IsValid());
    EXPECT_FALSE(it.IsValid());
  }

  {
    tmp = m.end();
    const auto it = std::move(tmp);
    EXPECT_FALSE(tmp.IsValid());
    EXPECT_FALSE(it.IsValid());
    EXPECT_EQ(it, m.end());
  }

  m.try_emplace("first", true);
  m.try_emplace("second", false);

  {
    tmp = m.begin();
    RaxTreeMap<bool>::SeekIterator it{std::move(tmp)};
    EXPECT_FALSE(tmp.IsValid());
    EXPECT_TRUE(it.IsValid());
    EXPECT_EQ((*it).first, "first");
    EXPECT_TRUE((*it).second);

    ++it;

    EXPECT_EQ((*it).first, "second");
    EXPECT_FALSE((*it).second);

    ++it;
    EXPECT_EQ(it, m.end());
  }

  {
    // advance before moving, the moved-to iterator should pick where the moved-from left off
    tmp = m.lower_bound("fig");
    EXPECT_TRUE(tmp.IsValid());

    ++tmp;
    EXPECT_EQ((*tmp).first, "second");

    auto it = std::move(tmp);
    EXPECT_FALSE(tmp.IsValid());
    EXPECT_TRUE(it.IsValid());
    EXPECT_EQ((*it).first, "second");

    ++it;
    EXPECT_FALSE(it.IsValid());
    EXPECT_EQ(it, m.end());
  }

  {
    // move into valid iterator
    auto it = m.begin();
    EXPECT_EQ((*it).first, "first");

    tmp = m.lower_bound("sea");
    EXPECT_EQ((*tmp).first, "second");

    it = std::move(tmp);
    EXPECT_FALSE(tmp.IsValid());
    EXPECT_TRUE(it.IsValid());

    EXPECT_EQ((*it).first, "second");
    ++it;
    EXPECT_FALSE(it.IsValid());
    EXPECT_EQ(it, m.end());
  }

  {
    auto it = m.lower_bound("sea");
    EXPECT_EQ((*it).first, "second");

    tmp = m.end();
    it = std::move(tmp);

    EXPECT_FALSE(it.IsValid());
  }
}

}  // namespace dfly::search


================================================
FILE: src/core/search/renewable_quota.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/renewable_quota.h"

#include "base/cycle_clock.h"
#include "base/logging.h"
#include "util/fibers/fibers.h"

namespace dfly::search {

RenewableQuota RenewableQuota::Unlimited() {
  return RenewableQuota{std::numeric_limits<size_t>::max()};
}

// Quota that yields if the fiber is running for too long
void RenewableQuota::Check(std::source_location location) const {
  size_t cycles = util::ThisFiber::GetRunningTimeCycles();
  size_t usec = base::CycleClock::ToUsec(cycles);
  if (usec >= max_usec) {
    size_t ms = usec / 1'000;
    VLOG_IF(1, ms >= 50) << "Grabbed " << ms << "ms for " << location.file_name() << ":"
                         << location.line();

    util::ThisFiber::Yield();
  }
}
}  // namespace dfly::search


================================================
FILE: src/core/search/renewable_quota.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstddef>
#include <source_location>

namespace dfly::search {

// Running time quota that can be reset by suspending the fiber
struct RenewableQuota {
  // Create unlimited quota
  static RenewableQuota Unlimited();

  // Check if quota is remaining and suspend the fiber if it ran out
  void Check(std::source_location location = std::source_location::current()) const;

  const size_t max_usec;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/scanner.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

// We should not include lexer.h when compiling from lexer.cc file because it already
// includes lexer.h
#ifndef DFLY_LEXER_CC
#include "core/search/lexer.h"
#endif

#include <absl/strings/str_cat.h>

#include "base/logging.h"

namespace dfly {
namespace search {

class Scanner : public Lexer {
 public:
  Scanner() : params_{nullptr} {
  }

  Parser::symbol_type Lex();

  void SetParams(const QueryParams* params) {
    params_ = params;
  }

 private:
  std::string_view matched_view(size_t skip_left = 0, size_t skip_right = 0) const {
    std::string_view res(matcher().begin() + skip_left, matcher().size() - skip_left - skip_right);
    return res;
  }

  dfly::search::location loc() {
    return location();
  }

  Parser::symbol_type ParseParam(std::string_view name, const Parser::location_type& loc) {
    name.remove_prefix(1);  // drop $ symbol

    std::string_view str = (*params_)[name];
    if (str.empty())
      throw std::runtime_error(absl::StrCat("Query parameter ", name, " not found"));

    uint32_t val = 0;
    if (!absl::SimpleAtoi(str, &val))
      return Parser::make_TERM(std::string{str}, loc);

    return Parser::make_UINT32(std::string{str}, loc);
  }

 private:
  const QueryParams* params_;
};

}  // namespace search
}  // namespace dfly


================================================
FILE: src/core/search/search.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/search.h"

#include <absl/cleanup/cleanup.h>
#include <absl/container/flat_hash_set.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>

#include <chrono>
#include <type_traits>
#include <variant>

#include "base/logging.h"
#include "core/overloaded.h"
#include "core/search/ast_expr.h"
#include "core/search/index_result.h"
#include "core/search/indices.h"
#include "core/search/query_driver.h"
#include "core/search/sort_indices.h"
#include "core/search/tag_types.h"
#include "core/search/vector_utils.h"

using namespace std;

namespace dfly::search {

namespace {

AstExpr ParseQuery(std::string_view query, const QueryParams* params,
                   const OptionalFilters* filters) {
  QueryDriver driver{};
  driver.ResetScanner();
  driver.SetParams(params);
  driver.SetInput(std::string{query});
  (void)Parser (&driver)();  // can throw
  driver.SetOptionalFilters(filters);
  return driver.Take();
}

// GCC 12 yields a wrong warning in a deeply inlined call in UnifyResults, only ignoring the whole
// scope solves it
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

struct ProfileBuilder {
  struct NodeFormatter {
    template <TagType T> void operator()(std::string* out, const AstAffixNode<T>& node) const {
      out->append(node.affix);
    }
    void operator()(std::string* out, const AstTagsNode::TagValue& value) const {
      visit([this, out](const auto& n) { this->operator()(out, n); }, value);
    }
  };

  string GetNodeInfo(const AstNode& node) {
    Overloaded node_info{
        [](monostate) -> string { return ""s; },
        [](const AstTermNode& n) { return absl::StrCat("Term{", n.affix, "}"); },
        [](const AstPrefixNode& n) { return absl::StrCat("Prefix{", n.affix, "}"); },
        [](const AstSuffixNode& n) { return absl::StrCat("Suffix{", n.affix, "}"); },
        [](const AstInfixNode& n) { return absl::StrCat("Infix{", n.affix, "}"); },
        [](const AstRangeNode& n) { return absl::StrCat("Range{", n.lo, "<>", n.hi, "}"); },
        [](const AstLogicalNode& n) {
          auto op = n.op == AstLogicalNode::AND ? "and" : "or";
          return absl::StrCat("Logical{n=", n.nodes.size(), ",o=", op, "}");
        },
        [](const AstTagsNode& n) {
          return absl::StrCat("Tags{", absl::StrJoin(n.tags, ",", NodeFormatter()), "}");
        },
        [](const AstFieldNode& n) { return absl::StrCat("Field{", n.field, "}"); },
        [](const AstKnnNode& n) { return absl::StrCat("KNN{l=", n.limit, "}"); },
        [](const AstNegateNode& n) { return absl::StrCat("Negate{}"); },
        [](const AstStarNode& n) { return absl::StrCat("Star{}"); },
        [](const AstStarFieldNode& n) { return absl::StrCat("StarField{}"); },
        [](const AstGeoNode& n) {
          return absl::StrCat("Geo{", n.lat, " ", n.lon, " ", n.radius, " ", n.unit, "}");
        },
        [](const AstVectorRangeNode& n) { return absl::StrCat("VectorRange{r=", n.radius, "}"); },
    };
    return visit(node_info, node.Variant());
  }

  using Tp = std::chrono::steady_clock::time_point;

  Tp Start() {
    depth_++;
    return chrono::steady_clock::now();
  }

  void Finish(Tp start, const AstNode& node, const IndexResult& result) {
    DCHECK_GE(depth_, 1u);
    auto took = chrono::steady_clock::now() - start;
    size_t micros = chrono::duration_cast<chrono::microseconds>(took).count();
    auto descr = GetNodeInfo(node);
    profile_.events.push_back({std::move(descr), micros, depth_ - 1, result.ApproximateSize()});
    depth_--;
  }

  AlgorithmProfile Take() {
    reverse(profile_.events.begin(), profile_.events.end());
    return std::move(profile_);
  }

 private:
  size_t depth_;
  AlgorithmProfile profile_;
};

struct BasicSearch {
  using LogicOp = AstLogicalNode::LogicOp;

  BasicSearch(const FieldIndices* indices) : indices_{indices} {
  }

  void EnableProfiling() {
    profile_builder_ = ProfileBuilder{};
  }

  BaseIndex* GetBaseIndex(string_view field) {
    auto index = indices_->GetIndex(field);
    if (!index) {
      error_ = absl::StrCat("Invalid field: ", field);
      return nullptr;
    }
    return index;
  }

  // Get casted sub index by field
  template <typename T> T* GetIndex(string_view field) {
    static_assert(is_base_of_v<BaseIndex, T>);

    auto base_index = GetBaseIndex(field);
    if (!base_index) {
      return nullptr;
    }

    auto* casted_ptr = dynamic_cast<T*>(base_index);
    if (!casted_ptr) {
      error_ = absl::StrCat("Wrong access type for field: ", field);
      return nullptr;
    }

    return casted_ptr;
  }

  BaseSortIndex* GetSortIndex(string_view field) {
    auto index = indices_->GetSortIndex(field);
    if (!index) {
      error_ = absl::StrCat("Invalid sort field: ", field);
      return nullptr;
    }

    return index;
  }

  // Collect all index results from F(C[i])
  template <typename C, typename F>
  vector<IndexResult> GetSubResults(const C& container, const F& f) {
    vector<IndexResult> sub_results(container.size());
    for (size_t i = 0; i < container.size(); i++)
      sub_results[i] = IndexResult{f(container[i])};
    return sub_results;
  }

  void Merge(IndexResult matched, IndexResult* current_ptr, LogicOp op) {
    IndexResult& current = *current_ptr;
    auto vec = MergeIndexResults(matched, current, op);
    current = IndexResult{std::move(vec)};
  }

  // Efficiently unify multiple sub results with specified logical op
  IndexResult UnifyResults(vector<IndexResult>&& sub_results, LogicOp op) {
    if (sub_results.empty())
      return IndexResult{};

    // Unifying from smallest to largest is more efficient.
    // AND: the result only shrinks, so starting with the smallest is most optimal.
    // OR: unifying smaller sets first reduces the number of element traversals on average.
    sort(sub_results.begin(), sub_results.end(),
         [](const auto& l, const auto& r) { return l.ApproximateSize() < r.ApproximateSize(); });

    IndexResult out{std::move(sub_results[0])};
    for (auto& matched : absl::MakeSpan(sub_results).subspan(1))
      Merge(std::move(matched), &out, op);
    return out;
  }

  template <typename C, typename F>
  IndexResult CollectMatches(BaseStringIndex<C>* index, std::string_view word, F&& f) {
    IndexResult result{};
    invoke(f, *index, word,
           [&result, this](const auto* c) { Merge(IndexResult{c}, &result, LogicOp::OR); });
    return result;
  }

  IndexResult Search(monostate, string_view) {
    return IndexResult{};
  }

  IndexResult Search(const AstStarNode& node, string_view active_field) {
    DCHECK(active_field.empty());
    return IndexResult{&indices_->GetAllDocs()};
  }

  IndexResult Search(const AstStarFieldNode& node, string_view active_field) {
    // Try to get a sort index first, as `@field:*` might imply wanting sortable behavior
    BaseSortIndex* sort_index = indices_->GetSortIndex(active_field);
    if (sort_index) {
      return IndexResult{sort_index->GetAllDocsWithNonNullValues()};
    }

    // If sort index doesn't exist try regular index
    BaseIndex* base_index = GetBaseIndex(active_field);
    return base_index ? IndexResult{base_index->GetAllDocsWithNonNullValues()} : IndexResult{};
  }

  template <TagType T> IndexResult Search(const AstAffixNode<T>& node, string_view active_field) {
    vector<TextIndex*> indices;
    if (!active_field.empty()) {
      if (auto* index = GetIndex<TextIndex>(active_field); index)
        indices = {index};
      else
        return IndexResult{};
    } else {
      indices = indices_->GetAllTextIndices();
    }

    auto mapping = [&node, this](TextIndex* index) {
      if constexpr (T == TagType::PREFIX)
        return CollectMatches(index, node.affix, &TextIndex::MatchPrefix);
      else if constexpr (T == TagType::SUFFIX)
        return CollectMatches(index, node.affix, &TextIndex::MatchSuffix);
      else if constexpr (T == TagType::INFIX)
        return CollectMatches(index, node.affix, &TextIndex::MatchInfix);
      else
        return vector<DocId>{};
    };
    return UnifyResults(GetSubResults(indices, mapping), LogicOp::OR);
  }

  // "term": access field's text index or unify results from all text indices if no field is set
  IndexResult Search(const AstAffixNode<TagType::REGULAR> node, string_view active_field) {
    std::string term = node.affix;
    bool strip_whitespace = true;

    if (auto synonyms = indices_->GetSynonyms(); synonyms) {
      if (auto group_id = synonyms->GetGroupToken(term); group_id) {
        term = *group_id;
        strip_whitespace = false;
      }
    }

    if (!active_field.empty()) {
      if (auto* index = GetIndex<TextIndex>(active_field); index)
        return IndexResult{index->Matching(term, strip_whitespace)};
      return IndexResult{};
    }

    vector<TextIndex*> selected_indices = indices_->GetAllTextIndices();
    auto mapping = [&term, strip_whitespace](TextIndex* index) {
      return index->Matching(term, strip_whitespace);
    };

    return UnifyResults(GetSubResults(selected_indices, mapping), LogicOp::OR);
  }

  // [range]: access field's numeric index
  IndexResult Search(const AstRangeNode& node, string_view active_field) {
    DCHECK(!active_field.empty());
    if (auto* index = GetIndex<NumericIndex>(active_field); index) {
      return IndexResult{index->Range(node.lo, node.hi)};
    }
    return IndexResult{};
  }

  IndexResult Search(const AstGeoNode& node, string_view active_field) {
    DCHECK(!active_field.empty());
    if (auto* index = GetIndex<GeoIndex>(active_field); index) {
      return IndexResult{index->RadiusSearch(node.lon, node.lat, node.radius, node.unit)};
    }
    return IndexResult{};
  }

  // negate -(*subquery*): explicitly compute result complement. Needs further optimizations
  IndexResult Search(const AstNegateNode& node, string_view active_field) {
    auto matched = SearchGeneric(*node.node, active_field).Take().first;
    vector<DocId> all = indices_->GetAllDocs();

    // To negate a result, we have to find the complement of matched to all documents,
    // so we remove all matched documents from the set of all documents.
    auto pred = [&matched](DocId doc) {
      return binary_search(matched.begin(), matched.end(), doc);
    };
    all.erase(remove_if(all.begin(), all.end(), pred), all.end());
    return IndexResult{std::move(all)};
  }

  // logical query: unify all sub results
  IndexResult Search(const AstLogicalNode& node, string_view active_field) {
    auto mapping = [&](auto& node) { return SearchGeneric(node, active_field); };
    return UnifyResults(GetSubResults(node.nodes, mapping), node.op);
  }

  // @field: set active field for sub tree
  IndexResult Search(const AstFieldNode& node, string_view active_field) {
    DCHECK(active_field.empty());
    DCHECK(node.node);
    return SearchGeneric(*node.node, node.field);
  }

  // {tags | ...}: Unify results for all tags
  IndexResult Search(const AstTagsNode& node, string_view active_field) {
    auto* tag_index = GetIndex<TagIndex>(active_field);
    if (!tag_index)
      return IndexResult{};

    Overloaded ov{[tag_index](const AstTermNode& term) -> IndexResult {
                    return IndexResult{tag_index->Matching(term.affix)};
                  },
                  [tag_index, this](const AstPrefixNode& prefix) {
                    return CollectMatches(tag_index, prefix.affix, &TagIndex::MatchPrefix);
                  },
                  [tag_index, this](const AstSuffixNode& suffix) {
                    return CollectMatches(tag_index, suffix.affix, &TagIndex::MatchSuffix);
                  },
                  [tag_index, this](const AstInfixNode& infix) {
                    return CollectMatches(tag_index, infix.affix, &TagIndex::MatchInfix);
                  }};
    auto mapping = [ov](const auto& tag) { return visit(ov, tag); };
    return UnifyResults(GetSubResults(node.tags, mapping), LogicOp::OR);
  }

  void SearchKnnFlat(FlatVectorIndex* vec_index, const AstKnnNode& knn, IndexResult&& sub_results) {
    knn_distances_.reserve(sub_results.ApproximateSize());
    auto cb = [&](auto* set) {
      auto [dim, sim] = vec_index->Info();
      for (DocId matched_doc : *set) {
        const float* vec = vec_index->Get(matched_doc);
        if (!vec)
          continue;
        float dist = VectorDistance(knn.vec.first.get(), vec, dim, sim);
        knn_distances_.emplace_back(dist, matched_doc);
      }
    };
    visit(cb, sub_results.Borrowed());

    size_t prefix_size = min(knn.limit, knn_distances_.size());
    partial_sort(knn_distances_.begin(), knn_distances_.begin() + prefix_size,
                 knn_distances_.end());
    knn_distances_.resize(prefix_size);
  }

  void SearchVectorRangeFlat(FlatVectorIndex* vec_index, const AstVectorRangeNode& node) {
    const auto& all_docs = indices_->GetAllDocs();
    auto [dim, sim] = vec_index->Info();
    for (DocId doc : all_docs) {
      const float* vec = vec_index->Get(doc);
      if (!vec)
        continue;
      float dist = VectorDistance(node.vec.first.get(), vec, dim, sim);
      if (dist <= static_cast<float>(node.radius)) {
        knn_scores_.emplace_back(doc, dist);
      }
    }
  }

  // [@field:[VECTOR_RANGE r vec]=>{$YIELD_DISTANCE_AS: alias}]:
  // Return all docs within distance radius, storing distances in knn_scores_
  IndexResult Search(const AstVectorRangeNode& node, string_view active_field) {
    DCHECK(active_field.empty());

    auto* vec_index = GetIndex<BaseVectorIndex>(node.field);
    if (!vec_index)
      return IndexResult{};

    if (node.vec.second == 0)
      return IndexResult{};

    if (node.radius < 0 || std::isnan(node.radius)) {
      error_ = absl::StrCat("VECTOR_RANGE radius must be non-negative, got: ", node.radius);
      return IndexResult{};
    }

    if (auto [dim, _] = vec_index->Info(); dim != node.vec.second) {
      error_ = absl::StrCat("Wrong vector index dimensions, got: ", node.vec.second,
                            ", expected: ", dim);
      return IndexResult{};
    }

    knn_scores_.clear();

    // HNSW fields are not stored in FieldIndices::indices_, so GetIndex<BaseVectorIndex> above
    // returns nullptr for HNSW before we reach this point.
    // HNSW range search support is planned separately (see hnsw_index.h).
    if (auto* flat_index = dynamic_cast<FlatVectorIndex*>(vec_index); flat_index)
      SearchVectorRangeFlat(flat_index, node);

    vector<DocId> out(knn_scores_.size());
    for (size_t i = 0; i < knn_scores_.size(); i++)
      out[i] = knn_scores_[i].first;
    return IndexResult{std::move(out)};
  }

  // [KNN limit @field vec]: Compute distance from `vec` to all vectors keep closest `limit`
  IndexResult Search(const AstKnnNode& knn, string_view active_field) {
    DCHECK(active_field.empty());
    auto sub_results = SearchGeneric(*knn.filter, active_field);

    auto* vec_index = GetIndex<BaseVectorIndex>(knn.field);
    if (!vec_index)
      return IndexResult{};

    // If vector dimension is 0, treat as placeholder/invalid - return empty results
    // This allows tests to use dummy vector values like "<your_vector_blob>"
    if (knn.vec.second == 0)
      return IndexResult{};

    if (auto [dim, _] = vec_index->Info(); dim != knn.vec.second) {
      error_ =
          absl::StrCat("Wrong vector index dimensions, got: ", knn.vec.second, ", expected: ", dim);
      return IndexResult{};
    }

    knn_scores_.clear();

    if (auto flat_index = dynamic_cast<FlatVectorIndex*>(vec_index); flat_index)
      SearchKnnFlat(dynamic_cast<FlatVectorIndex*>(vec_index), knn, std::move(sub_results));

    vector<DocId> out(knn_distances_.size());
    knn_scores_.reserve(knn_distances_.size());

    for (size_t i = 0; i < knn_distances_.size(); i++) {
      knn_scores_.emplace_back(knn_distances_[i].second, knn_distances_[i].first);
      out[i] = knn_distances_[i].second;
    }

    return IndexResult{std::move(out)};
  }

  // Determine node type and call specific search function
  IndexResult SearchGeneric(const AstNode& node, string_view active_field, bool top_level = false) {
    if (!error_.empty())
      return IndexResult{};

    ProfileBuilder::Tp start = profile_builder_ ? profile_builder_->Start() : ProfileBuilder::Tp{};

    auto cb = [this, active_field](const auto& inner) { return Search(inner, active_field); };
    auto result = visit(cb, node.Variant());

    // Top level results don't need to be sorted, because they will be scored, sorted by fields or
    // used by knn
    DCHECK(top_level || holds_alternative<AstKnnNode>(node.Variant()) ||
           holds_alternative<AstGeoNode>(node.Variant()) ||
           holds_alternative<AstVectorRangeNode>(node.Variant()) ||
           visit([](auto* set) { return is_sorted(set->begin(), set->end()); }, result.Borrowed()));

    if (profile_builder_)
      profile_builder_->Finish(start, node, result);

    return result;
  }

  SearchResult Search(const AstNode& query, size_t cuttoff_limit) {
    IndexResult result = SearchGeneric(query, "", true);

    // Extract profile if enabled
    optional<AlgorithmProfile> profile =
        profile_builder_ ? make_optional(profile_builder_->Take()) : nullopt;

    auto [out, total_size] = result.Take(cuttoff_limit);
    return SearchResult{total_size, std::move(out), std::move(knn_scores_), std::move(profile),
                        std::move(error_)};
  }

  const FieldIndices* indices_;

  string error_;
  optional<ProfileBuilder> profile_builder_ = ProfileBuilder{};

  std::vector<pair<DocId, float>> knn_scores_;
  vector<pair<float, DocId>> knn_distances_;
};

#ifndef __clang__
#pragma GCC diagnostic pop
#endif

}  // namespace

AstNode OptionalNumericFilter::Node(std::string field) {
  return AstFieldNode{"@" + field, AstRangeNode(lo_, false, hi_, false)};
}

string_view Schema::LookupAlias(string_view alias) const {
  if (auto it = field_names.find(alias); it != field_names.end())
    return it->second;
  return alias;
}

string_view Schema::LookupIdentifier(string_view identifier) const {
  if (auto it = fields.find(identifier); it != fields.end())
    return it->second.short_name;
  return identifier;
}

IndicesOptions::IndicesOptions() {
  static absl::flat_hash_set<std::string> kDefaultStopwords{
      "a",    "is",    "the",  "an",    "and",   "are",  "as",   "at", "be",  "but",  "by",
      "for",  "if",    "in",   "into",  "it",    "no",   "not",  "of", "on",  "or",   "such",
      "that", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};

  stopwords = kDefaultStopwords;
}

FieldIndices::FieldIndices(const Schema& schema, const IndicesOptions& options,
                           PMR_NS::memory_resource* mr, const Synonyms* synonyms)
    : schema_{schema}, options_{options}, synonyms_{synonyms} {
  CreateIndices(mr);
  CreateSortIndices();
}

void FieldIndices::CreateIndices(PMR_NS::memory_resource* mr) {
  for (const auto& [field_ident, field_info] : schema_.fields) {
    if ((field_info.flags & SchemaField::NOINDEX) > 0)
      continue;

    switch (field_info.type) {
      case SchemaField::TEXT: {
        const auto& tparams = std::get<SchemaField::TextParams>(field_info.special_params);
        indices_[field_ident] =
            make_unique<TextIndex>(mr, &options_.stopwords, synonyms_, tparams.with_suffixtrie);
        break;
      }
      case SchemaField::NUMERIC: {
        const auto& nparams = std::get<SchemaField::NumericParams>(field_info.special_params);
        indices_[field_ident] = make_unique<NumericIndex>(nparams.block_size, mr);
        break;
      }
      case SchemaField::TAG: {
        const auto& tparams = std::get<SchemaField::TagParams>(field_info.special_params);
        indices_[field_ident] = make_unique<TagIndex>(mr, tparams);
        break;
      }
      case SchemaField::VECTOR: {
        unique_ptr<BaseVectorIndex> vector_index;

        DCHECK(holds_alternative<SchemaField::VectorParams>(field_info.special_params));
        const auto& vparams = std::get<SchemaField::VectorParams>(field_info.special_params);

        // Use global HNSW index
        if (vparams.use_hnsw)
          break;

        vector_index = make_unique<FlatVectorIndex>(vparams, mr);
        indices_[field_ident] = std::move(vector_index);

        break;
      }
      case SchemaField::GEO: {
        indices_[field_ident] = make_unique<GeoIndex>(mr);
        break;
      }
    }
  }
}

void FieldIndices::CreateSortIndices() {
  for (const auto& [field_ident, field_info] : schema_.fields) {
    if ((field_info.flags & SchemaField::SORTABLE) == 0)
      continue;

    switch (field_info.type) {
      case SchemaField::TAG:
      case SchemaField::TEXT:
        sort_indices_[field_ident] = make_unique<StringSortIndex>();
        break;
      case SchemaField::NUMERIC:
        sort_indices_[field_ident] = make_unique<NumericSortIndex>();
        break;
      case SchemaField::VECTOR:
      case SchemaField::GEO:
        break;
    }
  }
}

bool FieldIndices::Add(DocId doc, const DocumentAccessor& access) {
  bool was_added = true;

  std::vector<std::pair<std::string_view, BaseIndex*>> successfully_added_indices;
  successfully_added_indices.reserve(indices_.size() + sort_indices_.size());

  auto try_add = [&](const auto& indices_container) {
    for (auto& [field, index] : indices_container) {
      if (index->Add(doc, access, field)) {
        successfully_added_indices.emplace_back(field, index.get());
      } else {
        was_added = false;
        break;
      }
    }
  };

  try_add(indices_);

  if (was_added) {
    try_add(sort_indices_);
  }

  if (!was_added) {
    for (auto& [field, index] : successfully_added_indices) {
      index->Remove(doc, access, field);
    }
    return false;
  }

  all_ids_.insert(upper_bound(all_ids_.begin(), all_ids_.end(), doc), doc);
  return true;
}

void FieldIndices::Remove(DocId doc, const DocumentAccessor& access) {
  for (auto& [field, index] : indices_)
    index->Remove(doc, access, field);
  for (auto& [field, sort_index] : sort_indices_)
    sort_index->Remove(doc, access, field);

  auto it = lower_bound(all_ids_.begin(), all_ids_.end(), doc);
  DCHECK(it != all_ids_.end() && *it == doc);
  all_ids_.erase(it);
}

BaseIndex* FieldIndices::GetIndex(string_view field) const {
  auto it = indices_.find(schema_.LookupAlias(field));
  return it != indices_.end() ? it->second.get() : nullptr;
}

BaseSortIndex* FieldIndices::GetSortIndex(string_view field) const {
  auto it = sort_indices_.find(schema_.LookupAlias(field));
  return it != sort_indices_.end() ? it->second.get() : nullptr;
}

std::vector<TextIndex*> FieldIndices::GetAllTextIndices() const {
  vector<TextIndex*> out;
  for (const auto& [field_name, field_info] : schema_.fields) {
    if (field_info.type != SchemaField::TEXT || (field_info.flags & SchemaField::NOINDEX) > 0)
      continue;
    auto* index = dynamic_cast<TextIndex*>(GetIndex(field_name));
    DCHECK(index);
    out.push_back(index);
  }
  return out;
}

const vector<DocId>& FieldIndices::GetAllDocs() const {
  return all_ids_;
}

const Schema& FieldIndices::GetSchema() const {
  return schema_;
}

SortableValue FieldIndices::GetSortIndexValue(DocId doc, std::string_view field_identifier) const {
  auto it = sort_indices_.find(field_identifier);
  DCHECK(it != sort_indices_.end());
  return it->second->Lookup(doc);
}

void FieldIndices::FinalizeInitialization() {
  for (auto& [field, index] : indices_) {
    index->FinalizeInitialization();
  }
}

DefragmentResult FieldIndices::Defragment(PageUsage* page_usage) {
  auto defrag = [&](auto& indices, string* key) {
    DefragmentMap dm{indices, key};
    return dm.Defragment(page_usage);
  };

  DefragmentResult result = defrag(indices_, &next_defrag_field_);
  result.Merge(defrag(sort_indices_, &next_defrag_sort_field_));
  return result;
}

const Synonyms* FieldIndices::GetSynonyms() const {
  return synonyms_;
}

SearchAlgorithm::SearchAlgorithm() = default;
SearchAlgorithm::~SearchAlgorithm() = default;

bool SearchAlgorithm::Init(string_view query, const QueryParams* params,
                           const OptionalFilters* filters) {
  try {
    query_ = make_unique<AstExpr>(ParseQuery(query, params, filters));
  } catch (const Parser::syntax_error& se) {
    LOG(INFO) << "Failed to parse query \"" << query << "\":" << se.what();
    return false;
  } catch (...) {
    LOG_EVERY_T(INFO, 10) << "Unexpected query parser error \"" << query << "\"";
    return false;
  }

  if (holds_alternative<monostate>(*query_)) {
    LOG_EVERY_T(INFO, 10) << "Empty result after parsing query \"" << query << "\"";
    return false;
  }

  return true;
}

SearchResult SearchAlgorithm::Search(const FieldIndices* index, size_t cuttoff_limit) const {
  DCHECK(query_);

  auto bs = BasicSearch{index};
  if (profiling_enabled_)
    bs.EnableProfiling();
  return bs.Search(*query_, cuttoff_limit);
}

std::optional<KnnScoreSortOption> SearchAlgorithm::GetKnnScoreSortOption() const {
  // HNSW KNN query
  if (knn_hnsw_score_sort_option_) {
    return knn_hnsw_score_sort_option_;
  }

  // FLAT KNN query
  if (auto* knn = get_if<AstKnnNode>(query_.get()); knn)
    return KnnScoreSortOption{string_view{knn->score_alias}, knn->limit};

  return nullopt;
}

bool SearchAlgorithm::IsKnnQuery() const {
  DCHECK(query_);
  return std::holds_alternative<AstKnnNode>(*query_);
}

AstKnnNode* SearchAlgorithm::GetKnnNode() const {
  if (auto* knn = get_if<AstKnnNode>(query_.get()); knn) {
    return knn;
  }
  return nullptr;
}

std::unique_ptr<AstNode> SearchAlgorithm::PopKnnNode() {
  if (auto* knn = get_if<AstKnnNode>(query_.get()); knn) {
    // Save knn score sort option
    knn_hnsw_score_sort_option_ = KnnScoreSortOption{string_view{knn->score_alias}, knn->limit};
    auto node = std::move(query_);
    AstKnnNode* moved_knn_node = reinterpret_cast<AstKnnNode*>(node.get());
    if (!std::holds_alternative<AstStarNode>(*moved_knn_node->filter))
      query_.swap(moved_knn_node->filter);
    return node;
  }
  LOG(DFATAL) << "Should not reach here";
  return nullptr;
}

void SearchAlgorithm::EnableProfiling() {
  profiling_enabled_ = true;
}

const AstVectorRangeNode* SearchAlgorithm::GetVectorRangeNode() const {
  return get_if<AstVectorRangeNode>(query_.get());
}

}  // namespace dfly::search


================================================
FILE: src/core/search/search.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include <memory>
#include <optional>
#include <string>
#include <variant>

#include "base/pmr/memory_resource.h"
#include "core/search/base.h"
#include "core/search/range_tree.h"
#include "core/search/synonyms.h"

namespace dfly::search {

struct AstNode;
struct TextIndex;
struct AstKnnNode;
struct AstVectorRangeNode;

// Optional FILTER
struct OptionalNumericFilter : public OptionalFilterBase {
  OptionalNumericFilter(size_t lo, size_t hi) : empty_(false), lo_(lo), hi_(hi) {
  }

  bool IsEmpty() const override {
    return empty_;
  }

  AstNode Node(std::string field) override;

  void AddRange(size_t lo, size_t hi) {
    if (empty_) {
      return;
    }
    if ((hi_ < lo) || (hi < lo_)) {
      empty_ = true;
    } else {
      lo_ = std::max(lo_, lo);
      hi_ = std::min(hi_, hi);
    }
  }

 private:
  bool empty_;
  size_t lo_;
  size_t hi_;
};

// Describes a specific index field
struct SchemaField {
  enum FieldType { TAG, TEXT, NUMERIC, VECTOR, GEO };
  enum FieldFlags : uint8_t { NOINDEX = 1 << 0, SORTABLE = 1 << 1 };

  struct VectorParams {
    bool use_hnsw = false;

    size_t dim = 0u;                              // dimension of knn vectors
    VectorSimilarity sim = VectorSimilarity::L2;  // similarity type
    size_t capacity = 1000;                       // initial capacity
    size_t hnsw_ef_construction = 200;
    size_t hnsw_m = 16;
  };

  struct TagParams {
    char separator = ',';
    bool case_sensitive = false;
    bool with_suffixtrie = false;  // see TextParams
  };

  struct TextParams {
    // if enabled, suffix trie is build for efficient suffix and infix queries
    bool with_suffixtrie = false;
  };

  struct NumericParams {
    // Block size of the range tree
    // Check RangeTree for details.
    size_t block_size = RangeTree::kDefaultMaxRangeBlockSize;
  };

  bool IsIndexableHnswField() const {
    return type == VECTOR && !(flags & NOINDEX) && std::get<VectorParams>(special_params).use_hnsw;
  }

  using ParamsVariant =
      std::variant<std::monostate, VectorParams, TagParams, TextParams, NumericParams>;

  FieldType type;
  uint8_t flags;
  std::string short_name;  // equal to ident if none provided
  ParamsVariant special_params{std::monostate{}};
};

// Describes the fields of an index
struct Schema {
  // List of fields by identifier.
  absl::flat_hash_map<std::string /*identifier*/, SchemaField> fields;

  // Mapping for short field names (aliases).
  absl::flat_hash_map<std::string /* short name*/, std::string /*identifier*/> field_names;

  // Return identifier for alias if found, otherwise return passed value
  std::string_view LookupAlias(std::string_view alias) const;

  // Return alias for identifier if found, otherwise return passed value
  std::string_view LookupIdentifier(std::string_view identifier) const;
};

struct IndicesOptions {
  IndicesOptions();
  explicit IndicesOptions(absl::flat_hash_set<std::string> stopwords)
      : stopwords{std::move(stopwords)} {
  }

  absl::flat_hash_set<std::string> stopwords;
};

// Collection of indices for all fields in schema
class FieldIndices {
 public:
  // Create indices based on schema and options. Both must outlive the indices
  FieldIndices(const Schema& schema, const IndicesOptions& options, PMR_NS::memory_resource* mr,
               const Synonyms* synonyms);

  // Returns true if document was added
  bool Add(DocId doc, const DocumentAccessor& access);
  void Remove(DocId doc, const DocumentAccessor& access);

  BaseIndex* GetIndex(std::string_view field) const;
  BaseSortIndex* GetSortIndex(std::string_view field) const;
  std::vector<TextIndex*> GetAllTextIndices() const;

  const std::vector<DocId>& GetAllDocs() const;
  const Schema& GetSchema() const;

  const Synonyms* GetSynonyms() const;

  SortableValue GetSortIndexValue(DocId doc, std::string_view field_identifier) const;

  void FinalizeInitialization();

  DefragmentResult Defragment(PageUsage* page_usage);

 private:
  void CreateIndices(PMR_NS::memory_resource* mr);
  void CreateSortIndices();

  const Schema& schema_;
  const IndicesOptions& options_;
  std::vector<DocId> all_ids_;
  absl::flat_hash_map<std::string_view, std::unique_ptr<BaseIndex>> indices_;
  absl::flat_hash_map<std::string_view, std::unique_ptr<BaseSortIndex>> sort_indices_;
  const Synonyms* synonyms_;

  std::string next_defrag_field_;
  std::string next_defrag_sort_field_;
};

struct AlgorithmProfile {
  struct ProfileEvent {
    std::string descr;
    size_t micros;         // time event took in microseconds
    size_t depth;          // tree depth of event
    size_t num_processed;  // number of results processed by the event
  };

  std::vector<ProfileEvent> events;
};

// Represents a search result returned from the search algorithm.
struct SearchResult {
  size_t total;  // how many documents were matched in total

  // The ids of the matched documents
  std::vector<DocId> ids;

  // Contains final scores if an aggregation was present
  std::vector<std::pair<DocId, float>> knn_scores;

  // If profiling was enabled
  std::optional<AlgorithmProfile> profile;

  // If an error occurred, last recent one
  std::string error;
};

struct KnnScoreSortOption {
  std::string_view score_field_alias;
  size_t limit = std::numeric_limits<size_t>::max();
};

// SearchAlgorithm allows searching field indices with a query
class SearchAlgorithm {
 public:
  SearchAlgorithm();
  ~SearchAlgorithm();

  // Init with query and optional filters and return true if successful.
  bool Init(std::string_view query, const QueryParams* params,
            const OptionalFilters* filters = nullptr);

  // Search on given index with predefined limit for cutting off result ids
  SearchResult Search(const FieldIndices* index,
                      size_t cuttoff_limit = std::numeric_limits<size_t>::max()) const;

  std::optional<KnnScoreSortOption> GetKnnScoreSortOption() const;

  bool IsKnnQuery() const;

  AstKnnNode* GetKnnNode() const;

  std::unique_ptr<AstNode> PopKnnNode();

  const AstVectorRangeNode* GetVectorRangeNode() const;

  void EnableProfiling();

 private:
  bool profiling_enabled_ = false;
  std::unique_ptr<AstNode> query_;
  std::optional<KnnScoreSortOption> knn_hnsw_score_sort_option_;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/search_parser_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "base/gtest.h"
#include "base/logging.h"
#include "core/search/base.h"
#include "core/search/query_driver.h"
#include "core/search/search.h"

namespace dfly::search {

using namespace std;

class SearchParserTest : public ::testing::Test {
 protected:
  SearchParserTest() {
    query_driver_.scanner()->set_debug(1);
  }

  void SetInput(const std::string& str) {
    query_driver_.SetInput(str);
  }

  Parser::symbol_type Lex() {
    return query_driver_.Lex();
  }

  int Parse(const std::string& str) {
    query_driver_.ResetScanner();
    query_driver_.SetInput(str);

    return Parser(&query_driver_)();
  }

  void SetParams(const QueryParams* params) {
    query_driver_.SetParams(params);
  }

  QueryDriver query_driver_;
};

// tokens are not assignable, so we can not reuse them. This macros reduce the boilerplate.
#define NEXT_EQ(tok_enum, type, val)                    \
  {                                                     \
    auto tok = Lex();                                   \
    ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
    EXPECT_EQ(val, tok.value.as<type>());               \
  }

#define NEXT_TOK(tok_enum)                              \
  {                                                     \
    auto tok = Lex();                                   \
    ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
  }
#define NEXT_ERROR()                          \
  {                                           \
    bool caught = false;                      \
    try {                                     \
      auto tok = Lex();                       \
    } catch (const Parser::syntax_error& e) { \
      caught = true;                          \
    }                                         \
    ASSERT_TRUE(caught);                      \
  }

TEST_F(SearchParserTest, Scanner) {
  SetInput("ab cd");
  // 3.5.1 does not have name() method.
  // EXPECT_STREQ("term", tok.name());

  NEXT_EQ(TOK_TERM, string, "ab");
  NEXT_EQ(TOK_TERM, string, "cd");
  NEXT_TOK(TOK_YYEOF);

  SetInput("*");
  NEXT_TOK(TOK_STAR);

  SetInput("(5a 6) ");
  NEXT_TOK(TOK_LPAREN);
  NEXT_EQ(TOK_TERM, string, "5a");
  NEXT_EQ(TOK_UINT32, string, "6");
  NEXT_TOK(TOK_RPAREN);

  SetInput(R"( "hello\"world" )");
  NEXT_EQ(TOK_TERM, string, R"(hello"world)");

  SetInput("@field:hello");
  NEXT_EQ(TOK_FIELD, string, "@field");
  NEXT_TOK(TOK_COLON);
  NEXT_EQ(TOK_TERM, string, "hello");

  SetInput("@field:{ tag }");
  NEXT_EQ(TOK_FIELD, string, "@field");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TERM, string, "tag");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\,1\\\\\\$\\+}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, R"(blue,1\$+)");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\.1\\\"\\%\\=}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue.1\"%=");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\<1\\'\\^\\~}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue<1'^~");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\>1\\:\\&\\/}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue>1:&/");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\{1\\;\\*\\ }");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue{1;* ");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\}1\\!\\(}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue}1!(");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\[1\\@\\)}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue[1@)");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@color:{blue\\]1\\#\\-}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "blue]1#-");
  NEXT_TOK(TOK_RCURLBR);

  // Colon in tag value (unescaped)
  SetInput("@t:{Tag:value}");
  NEXT_EQ(TOK_FIELD, string, "@t");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TAG_VAL, string, "Tag:value");
  NEXT_TOK(TOK_RCURLBR);

  // Prefix simple
  SetInput("pre*");
  NEXT_EQ(TOK_PREFIX, string, "pre");

  // TODO: uncomment when we support escaped terms
  // Prefix escaped (redis doesn't support quoted prefix matches)
  // SetInput("pre\\**");
  // NEXT_EQ(TOK_PREFIX, string, "pre*");

  // Prefix in tag
  SetInput("@color:{prefix*}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_PREFIX, string, "prefix");
  NEXT_TOK(TOK_RCURLBR);

  // Prefix escaped star
  SetInput("@color:{\"prefix*\"}");
  NEXT_EQ(TOK_FIELD, string, "@color");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_TERM, string, "prefix*");
  NEXT_TOK(TOK_RCURLBR);

  // Prefix spaced with star
  SetInput("pre *");
  NEXT_EQ(TOK_TERM, string, "pre");
  NEXT_TOK(TOK_STAR);

  SetInput("почтальон Печкин");
  NEXT_EQ(TOK_TERM, string, "почтальон");
  NEXT_EQ(TOK_TERM, string, "Печкин");

  SetInput("33.3");
  NEXT_EQ(TOK_DOUBLE, string, "33.3");
}

TEST_F(SearchParserTest, EscapedTagPrefixes) {
  SetInput("@name:{escape\\-err*}");
  NEXT_EQ(TOK_FIELD, string, "@name");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_PREFIX, string, "escape-err");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@name:{escape\\+pre*}");
  NEXT_EQ(TOK_FIELD, string, "@name");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_PREFIX, string, "escape+pre");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@name:{escape\\.pre*}");
  NEXT_EQ(TOK_FIELD, string, "@name");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_PREFIX, string, "escape.pre");
  NEXT_TOK(TOK_RCURLBR);

  SetInput("@name:{complex\\-escape\\+with\\.many\\*chars*}");
  NEXT_EQ(TOK_FIELD, string, "@name");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LCURLBR);
  NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars");
  NEXT_TOK(TOK_RCURLBR);
}

TEST_F(SearchParserTest, Parse) {
  EXPECT_EQ(0, Parse(" foo bar (baz) "));
  EXPECT_EQ(0, Parse(" -(foo) @foo:bar @ss:[1 2]"));
  EXPECT_EQ(0, Parse("@foo:{ tag1 | tag2 }"));

  EXPECT_EQ(0, Parse("@foo:{1|2}"));
  EXPECT_EQ(0, Parse("@foo:{1|2.0|4|3.0}"));
  EXPECT_EQ(0, Parse("@foo:{1|hello|3.0|world|4}"));

  EXPECT_EQ(0, Parse("@name:{escape\\-err*}"));

  // Parenthesized star - used by LangChain for KNN queries (issue #6342)
  EXPECT_EQ(0, Parse("(*)"));
  EXPECT_EQ(0, Parse("((*))"));
  EXPECT_EQ(0, Parse("(((*)))"));

  // Colon in tag value
  EXPECT_EQ(0, Parse("@t:{Tag:value}"));
  EXPECT_EQ(0, Parse("@t:{Tag:*}"));
  EXPECT_EQ(0, Parse("@category:{Product:Electronics}"));

  EXPECT_EQ(1, Parse(" -(foo "));
  EXPECT_EQ(1, Parse(" foo:bar "));
  EXPECT_EQ(1, Parse(" @foo:@bar "));
  EXPECT_EQ(1, Parse(" @foo: "));

  EXPECT_EQ(0, Parse("*suffix"));
  EXPECT_EQ(0, Parse("*infix*"));

  EXPECT_EQ(1, Parse("pre***"));

  // Geo units
  EXPECT_EQ(0, Parse("@t:{km}"));
  EXPECT_EQ(0, Parse("@t:{Km|M}"));
  EXPECT_EQ(0, Parse("@t:{ft|mi}"));
  EXPECT_EQ(0, Parse("@location:[0.0 0.0 1 m]"));
  EXPECT_EQ(0, Parse("@location:[0.0 0.0 1 Km]"));
  EXPECT_EQ(1, Parse("@location:[0.0 0.0 1 yd]"));
}

TEST_F(SearchParserTest, ParseParams) {
  QueryParams params;
  params["k"] = "10";
  params["name"] = "alex";
  SetParams(&params);

  SetInput("$name $k");
  NEXT_EQ(TOK_TERM, string, "alex");
  NEXT_EQ(TOK_UINT32, string, "10");
}

TEST_F(SearchParserTest, Quotes) {
  SetInput(" \"fir  st\"  'sec@o@nd' \":third:\" 'four\\\"th' ");
  NEXT_EQ(TOK_TERM, string, "fir  st");
  NEXT_EQ(TOK_TERM, string, "sec@o@nd");
  NEXT_EQ(TOK_TERM, string, ":third:");
  NEXT_EQ(TOK_TERM, string, "four\"th");
}

TEST_F(SearchParserTest, Numeric) {
  SetInput("11 123123123123 '22'");
  NEXT_EQ(TOK_UINT32, string, "11");
  NEXT_EQ(TOK_DOUBLE, string, "123123123123");
  NEXT_EQ(TOK_TERM, string, "22");
}

TEST_F(SearchParserTest, VectorRange) {
  // Full vector range query tokenization
  SetInput("@vector:[VECTOR_RANGE $radius $vec]=>{$YIELD_DISTANCE_AS: dist}");
  NEXT_EQ(TOK_FIELD, string, "@vector");
  NEXT_TOK(TOK_COLON);
  NEXT_TOK(TOK_LBRACKET);
  NEXT_TOK(TOK_VECTOR_RANGE);
}

TEST_F(SearchParserTest, VectorRangeParse) {
  QueryParams params;
  params["radius"] = "1";
  // 4 bytes = one float dimension
  params["vec"] = std::string(4, '\0');
  SetParams(&params);

  // Basic syntax parses without error
  EXPECT_EQ(0, Parse("@f:[VECTOR_RANGE $radius $vec]=>{$YIELD_DISTANCE_AS: dist}"));
}

TEST_F(SearchParserTest, KNN) {
  SetInput("*=>[KNN 1 @vector field_vec]");
  NEXT_TOK(TOK_STAR);
  NEXT_TOK(TOK_ARROW);
  NEXT_TOK(TOK_LBRACKET);
}

TEST_F(SearchParserTest, KNNfull) {
  SetInput("*=>[Knn 1 @vector field_vec EF_Runtime 15 as vec_sort]");
  NEXT_TOK(TOK_STAR);
  NEXT_TOK(TOK_ARROW);
  NEXT_TOK(TOK_LBRACKET);

  NEXT_TOK(TOK_KNN);
  NEXT_EQ(TOK_UINT32, string, "1");
  NEXT_TOK(TOK_FIELD);
  NEXT_TOK(TOK_TERM);

  NEXT_TOK(TOK_EF_RUNTIME);
  NEXT_EQ(TOK_UINT32, string, "15");

  NEXT_TOK(TOK_AS);
  NEXT_EQ(TOK_TERM, string, "vec_sort");

  NEXT_TOK(TOK_RBRACKET);
}

}  // namespace dfly::search


================================================
FILE: src/core/search/search_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/search.h"

#include <absl/cleanup/cleanup.h>
#include <absl/container/flat_hash_map.h>
#include <absl/strings/escaping.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>
#include <benchmark/benchmark.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <mimalloc.h>

#include <algorithm>
#include <cmath>
#include <memory_resource>
#include <random>

#include "absl/base/macros.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "core/search/base.h"
#include "core/search/hnsw_index.h"
#include "core/search/query_driver.h"
#include "core/search/stateless_allocator.h"
#include "core/search/vector_utils.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {
namespace search {

using namespace std;

using ::testing::HasSubstr;

// Used for NumericIndex benchmarks.
// The value is used to determine the maximum size of a range block in the range tree.
constexpr size_t kMaxRangeBlockSize = 500000;

struct MockedDocument : public DocumentAccessor {
 public:
  using Map = absl::flat_hash_map<std::string, std::string>;

  MockedDocument() = default;
  MockedDocument(Map map) : fields_{map} {
  }
  MockedDocument(std::string test_field) : fields_{{"field", test_field}} {
  }

  std::optional<StringList> GetStrings(string_view field) const override {
    auto it = fields_.find(field);
    if (it == fields_.end()) {
      return EmptyAccessResult<StringList>();
    }
    return StringList{string_view{it->second}};
  }

  std::optional<StringList> GetTags(string_view field) const override {
    return GetStrings(field);
  }

  std::optional<VectorInfo> GetVector(string_view field, size_t dim) const override {
    auto strings_list = GetStrings(field);
    if (!strings_list)
      return std::nullopt;
    return !strings_list->empty() ? BytesToFtVectorSafe(strings_list->front()) : OwnedFtVector{};
  }

  std::optional<NumsList> GetNumbers(std::string_view field) const override {
    auto strings_list = GetStrings(field);
    if (!strings_list)
      return std::nullopt;

    NumsList nums_list;
    nums_list.reserve(strings_list->size());
    for (auto str : strings_list.value()) {
      auto num = ParseNumericField(str);
      if (!num) {
        return std::nullopt;
      }
      nums_list.push_back(num.value());
    }
    return nums_list;
  }

  string DebugFormat() {
    string out = "{";
    for (const auto& [field, value] : fields_)
      absl::StrAppend(&out, field, "=", value, ",");
    if (out.size() > 1)
      out.pop_back();
    out += "}";
    return out;
  }

  void Set(Map hset) {
    fields_ = hset;
  }

 private:
  Map fields_{};
};

IndicesOptions kEmptyOptions{{}};

struct SchemaFieldInitializer {
  SchemaFieldInitializer(std::string_view name, SchemaField::FieldType type)
      : name{name}, type{type} {
    switch (type) {
      case SchemaField::TAG:
        special_params = SchemaField::TagParams{};
        break;
      case SchemaField::TEXT:
        special_params = SchemaField::TextParams{};
        break;
      case SchemaField::NUMERIC:
        special_params = SchemaField::NumericParams{};
        break;
      case SchemaField::VECTOR:
        special_params = SchemaField::VectorParams{};
        break;
      case SchemaField::GEO:
        break;
    }
  }

  SchemaFieldInitializer(std::string_view name, SchemaField::FieldType type,
                         SchemaField::ParamsVariant special_params)
      : name{name}, type{type}, special_params{special_params} {
  }

  std::string_view name;
  SchemaField::FieldType type;
  SchemaField::ParamsVariant special_params{std::monostate{}};
};

Schema MakeSimpleSchema(initializer_list<SchemaFieldInitializer> ilist,
                        bool make_sortable = false) {
  Schema schema;
  uint8_t flags = make_sortable ? SchemaField::SORTABLE : 0;
  for (auto ifield : ilist) {
    auto& field = schema.fields[ifield.name];
    field = {ifield.type, flags, string{ifield.name}, ifield.special_params};
  }
  return schema;
}

class SearchTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    // Initialize SimSIMD runtime for tests that may exercise vector kernels
    InitSimSIMD();
  }

  SearchTest() {
    PrepareSchema({{"field", SchemaField::TEXT}});
  }

  ~SearchTest() {
    EXPECT_EQ(entries_.size(), 0u) << "Missing check";
  }

  void PrepareSchema(initializer_list<SchemaFieldInitializer> ilist) {
    schema_ = MakeSimpleSchema(ilist);
  }

  void PrepareQuery(string_view query) {
    query_ = query;
  }

  template <typename... Args> void ExpectAll(Args... args) {
    (entries_.emplace_back(args, true), ...);
  }

  template <typename... Args> void ExpectNone(Args... args) {
    (entries_.emplace_back(args, false), ...);
  }

  bool Check() {
    absl::Cleanup cl{[this] { entries_.clear(); }};

    FieldIndices index{schema_, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

    shuffle(entries_.begin(), entries_.end(), default_random_engine{});
    for (DocId i = 0; i < entries_.size(); i++)
      index.Add(i, entries_[i].first);
    index.FinalizeInitialization();

    SearchAlgorithm search_algo{};
    if (!search_algo.Init(query_, &params_)) {
      error_ = "Failed to parse query";
      return false;
    }

    auto matched = search_algo.Search(&index);

    if (!is_sorted(matched.ids.begin(), matched.ids.end()))
      LOG(FATAL) << "Search result is not sorted";

    for (DocId i = 0; i < entries_.size(); i++) {
      bool doc_matched = binary_search(matched.ids.begin(), matched.ids.end(), i);
      if (doc_matched != entries_[i].second) {
        error_ = "doc: \"" + entries_[i].first.DebugFormat() + "\"" + " was expected" +
                 (entries_[i].second ? "" : " not") + " to match" + " query: \"" + query_ + "\"";
        return false;
      }
    }

    return true;
  }

  string_view GetError() const {
    return error_;
  }

 private:
  using DocEntry = pair<MockedDocument, bool /*should_match*/>;

  QueryParams params_;
  Schema schema_;
  vector<DocEntry> entries_;
  string query_, error_;
};

TEST_F(SearchTest, MatchTerm) {
  PrepareQuery("foo");

  // Check basic cases
  ExpectAll("foo", "foo bar", "more foo bar");
  ExpectNone("wrong", "nomatch");

  // Check part of sentence + case.
  ExpectAll("Foo is cool.", "Where is foo?", "One. FOO!. More", "Foo is foo.");

  // Check part of word is not matched
  ExpectNone("foocool", "veryfoos", "ufoo", "morefoomore", "thefoo");

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, MatchNotTerm) {
  PrepareQuery("-foo");

  ExpectAll("faa", "definitielyright");
  ExpectNone("foo", "foo bar", "more foo bar");

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, MatchLogicalNode) {
  {
    PrepareQuery("foo bar");

    ExpectAll("foo bar", "bar foo", "more bar and foo");
    ExpectNone("wrong", "foo", "bar", "foob", "far");

    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("foo | bar");

    ExpectAll("foo bar", "foo", "bar", "foo and more", "or only bar");
    ExpectNone("wrong", "only far");

    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("foo bar baz");

    ExpectAll("baz bar foo", "bar and foo and baz");
    ExpectNone("wrong", "foo baz", "bar baz", "and foo");

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, MatchParenthesis) {
  PrepareQuery("( foo | oof ) ( bar | rab )");

  ExpectAll("foo bar", "oof rab", "foo rab", "oof bar", "foo oof bar rab");
  ExpectNone("wrong", "bar rab", "foo oof");

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, CheckNotPriority) {
  for (auto expr : {"-bar foo baz", "foo -bar baz", "foo baz -bar"}) {
    PrepareQuery(expr);

    ExpectAll("foo baz", "foo rab baz", "baz rab foo");
    ExpectNone("wrong", "bar", "foo bar baz", "foo baz bar");

    EXPECT_TRUE(Check()) << GetError();
  }

  for (auto expr : {"-bar | foo", "foo | -bar"}) {
    PrepareQuery(expr);

    ExpectAll("foo", "right", "foo bar");
    ExpectNone("bar", "bar baz");

    EXPECT_TRUE(Check()) << GetError();
  }

  for (auto expr : {"-bar far|-foo tam"}) {
    PrepareQuery(expr);

    ExpectAll("far baz", "far foo", "bar tam");
    ExpectNone("bar far", "foo tam", "bar foo", "far bar foo");

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, CheckParenthesisPriority) {
  {
    PrepareQuery("foo | -(bar baz)");

    ExpectAll("foo", "not b/r and b/z", "foo bar baz", "single bar", "only baz");
    ExpectNone("bar baz", "some more bar and baz");

    EXPECT_TRUE(Check()) << GetError();
  }
  {
    PrepareQuery("( foo (bar | baz) (rab | zab) ) | true");

    ExpectAll("true", "foo bar rab", "foo baz zab", "foo bar zab");
    ExpectNone("wrong", "foo bar baz", "foo rab zab", "foo bar what", "foo rab foo");

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, CheckPrefix) {
  {
    PrepareQuery("pre*");

    ExpectAll("pre", "prepre", "preachers", "prepared", "pRetty", "PRedators", "prEcisely!");
    ExpectNone("pristine", "represent", "repair", "depreciation");

    EXPECT_TRUE(Check()) << GetError();
  }
  {
    PrepareQuery("new*");

    ExpectAll("new", "New York", "Newham", "newbie", "news", "Welcome to Newark!");
    ExpectNone("ne", "renew", "nev", "ne-w", "notnew", "casino in neVada");

    EXPECT_TRUE(Check()) << GetError();
  }
}

using Map = MockedDocument::Map;

TEST_F(SearchTest, MatchField) {
  PrepareSchema({{"f1", SchemaField::TEXT}, {"f2", SchemaField::TEXT}, {"f3", SchemaField::TEXT}});
  PrepareQuery("@f1:foo @f2:bar @f3:baz");

  ExpectAll(Map{{"f1", "foo"}, {"f2", "bar"}, {"f3", "baz"}});
  ExpectNone(Map{{"f1", "foo"}, {"f2", "bar"}, {"f3", "last is wrong"}},
             Map{{"f1", "its"}, {"f2", "totally"}, {"f3", "wrong"}},
             Map{{"f1", "im foo but its only me and"}, {"f2", "bar"}});

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, MatchRange) {
  PrepareSchema({{"f1", SchemaField::NUMERIC}, {"f2", SchemaField::NUMERIC}});
  PrepareQuery("@f1:[1 10] @f2:[50 100]");

  ExpectAll(Map{{"f1", "5"}, {"f2", "50"}}, Map{{"f1", "1"}, {"f2", "100"}},
            Map{{"f1", "10"}, {"f2", "50"}});
  ExpectNone(Map{{"f1", "11"}, {"f2", "49"}}, Map{{"f1", "0"}, {"f2", "101"}});

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, MatchDoubleRange) {
  PrepareSchema({{"f1", SchemaField::NUMERIC}});

  {
    PrepareQuery("@f1: [100.03 199.97]");

    ExpectAll(Map{{"f1", "130"}}, Map{{"f1", "170"}}, Map{{"f1", "100.03"}}, Map{{"f1", "199.97"}});

    ExpectNone(Map{{"f1", "0"}}, Map{{"f1", "200"}}, Map{{"f1", "100.02999"}},
               Map{{"f1", "199.9700001"}});

    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@f1: [(100 (199.9]");

    ExpectAll(Map{{"f1", "150"}}, Map{{"f1", "100.00001"}}, Map{{"f1", "199.8999999"}});

    ExpectNone(Map{{"f1", "50"}}, Map{{"f1", "100"}}, Map{{"f1", "199.9"}}, Map{{"f1", "200"}});

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, MatchStar) {
  PrepareQuery("*");
  ExpectAll("one", "two", "three", "and", "all", "documents");
  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, CheckExprInField) {
  PrepareSchema({{"f1", SchemaField::TEXT}, {"f2", SchemaField::TEXT}, {"f3", SchemaField::TEXT}});
  {
    PrepareQuery("@f1:(a|b) @f2:(c d) @f3:-e");

    ExpectAll(Map{{"f1", "a"}, {"f2", "c and d"}, {"f3", "right"}},
              Map{{"f1", "b"}, {"f2", "d and c"}, {"f3", "ok"}});
    ExpectNone(Map{{"f1", "none"}, {"f2", "only d"}, {"f3", "ok"}},
               Map{{"f1", "b"}, {"f2", "d and c"}, {"f3", "it has an e"}});

    EXPECT_TRUE(Check()) << GetError();
  }
  {
    PrepareQuery({"@f1:(a (b | c) -(d | e)) @f2:-(a|b)"});

    ExpectAll(Map{{"f1", "a b w"}, {"f2", "c"}});
    ExpectNone(Map{{"f1", "a b d"}, {"f2", "c"}}, Map{{"f1", "a b w"}, {"f2", "a"}},
               Map{{"f1", "a w"}, {"f2", "c"}});

    EXPECT_TRUE(Check()) << GetError();
  }
  {
    PrepareQuery("@f1:(-a c|-b d)");

    ExpectAll(Map{{"f1", "c"}}, Map{{"f1", "d"}});
    ExpectNone(Map{{"f1", "a"}}, Map{{"f1", "b"}});

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, CheckTag) {
  PrepareSchema({{"f1", SchemaField::TAG}, {"f2", SchemaField::TAG}});

  PrepareQuery("@f1:{red | blue} @f2:{circle | square}");

  ExpectAll(Map{{"f1", "red"}, {"f2", "square"}}, Map{{"f1", "blue"}, {"f2", "square"}},
            Map{{"f1", "red"}, {"f2", "circle"}}, Map{{"f1", "red"}, {"f2", "circle, square"}},
            Map{{"f1", "red"}, {"f2", "triangle, circle"}},
            Map{{"f1", "red, green"}, {"f2", "square"}},
            Map{{"f1", "green, blue"}, {"f2", "circle"}});
  ExpectNone(Map{{"f1", "green"}, {"f2", "square"}}, Map{{"f1", "green"}, {"f2", "circle"}},
             Map{{"f1", "red"}, {"f2", "triangle"}}, Map{{"f1", "blue"}, {"f2", "line, triangle"}});

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, CheckTagPrefix) {
  PrepareSchema({{"color", SchemaField::TAG}});
  PrepareQuery("@color:{green* | orange | yellow*}");

  ExpectAll(Map{{"color", "green"}}, Map{{"color", "yellow"}}, Map{{"color", "greenish"}},
            Map{{"color", "yellowish"}}, Map{{"color", "green-forestish"}},
            Map{{"color", "yellowsunish"}}, Map{{"color", "orange"}});
  ExpectNone(Map{{"color", "red"}}, Map{{"color", "blue"}}, Map{{"color", "orangeish"}},
             Map{{"color", "darkgreen"}}, Map{{"color", "light-yellow"}});

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, IntegerTerms) {
  PrepareSchema({{"status", SchemaField::TAG}, {"title", SchemaField::TEXT}});

  PrepareQuery("@status:{1} @title:33");

  ExpectAll(Map{{"status", "1"}, {"title", "33 cars on the road"}});
  ExpectNone(Map{{"status", "0"}, {"title", "22 trains on the tracks"}});

  EXPECT_TRUE(Check()) << GetError();
}

TEST_F(SearchTest, StopWords) {
  auto schema = MakeSimpleSchema({{"title", SchemaField::TEXT}});
  IndicesOptions options{{"some", "words", "are", "left", "out"}};

  FieldIndices indices{schema, options, PMR_NS::get_default_resource(), nullptr};
  SearchAlgorithm algo{};
  QueryParams params;

  vector<string> documents = {"some words left out",      //
                              "some can be found",        //
                              "words are never matched",  //
                              "explicitly found!"};
  for (size_t i = 0; i < documents.size(); i++) {
    MockedDocument doc{{{"title", documents[i]}}};
    indices.Add(i, doc);
  }

  // words is a stopword
  algo.Init("words", &params);
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre());

  // some is a stopword
  algo.Init("some", &params);
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre());

  // found is not a stopword
  algo.Init("found", &params);
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1, 3));
}

class SearchRaxTest
    : public SearchTest,
      public testing::WithParamInterface<pair<bool /* build suffix trie */, bool /* tag index */>> {
};

TEST_P(SearchRaxTest, SuffixInfix) {
  auto [with_trie, use_tag] = GetParam();
  Schema schema = MakeSimpleSchema({{"title", use_tag ? SchemaField::TAG : SchemaField::TEXT}});
  if (use_tag) {
    schema.fields["title"].special_params = SchemaField::TagParams{.with_suffixtrie = with_trie};
  } else {
    schema.fields["title"].special_params = SchemaField::TextParams{.with_suffixtrie = with_trie};
  }

  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};
  SearchAlgorithm algo{};
  QueryParams params;

  vector<string> documents = {"Berries",     "BlueBeRRies", "Blackberries", "APPLES",
                              "CranbeRRies", "Wolfberry",   "StraWberry"};
  for (size_t i = 0; i < documents.size(); i++) {
    MockedDocument doc{{{"title", documents[i]}}};
    indices.Add(i, doc);
  }

  auto prepare = [&, use_tag = use_tag](string q) {
    if (use_tag)
      q = "@title:{"s + q + "}"s;
    algo.Init(q, &params);
  };

  // suffix queries

  prepare("*Es");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 3, 4));

  prepare("*beRRies");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 4));

  prepare("*les");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(3));

  prepare("*lueBERRies");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1));

  prepare("*berrY");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(5, 6));

  // infix queries

  prepare("*berr*");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 4, 5, 6));

  prepare("*ANB*");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(4));

  prepare("*berries*");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 4));

  prepare("*bL*");
  EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1, 2));
}

INSTANTIATE_TEST_SUITE_P(NoTrieText, SearchRaxTest, testing::Values(pair{false, false}));
INSTANTIATE_TEST_SUITE_P(WithTrieText, SearchRaxTest, testing::Values(pair{true, false}));
INSTANTIATE_TEST_SUITE_P(NoTrieTag, SearchRaxTest, testing::Values(pair{false, true}));
INSTANTIATE_TEST_SUITE_P(WithTrieTag, SearchRaxTest, testing::Values(pair{true, true}));

std::string ToBytes(absl::Span<const float> vec) {
  return string{reinterpret_cast<const char*>(vec.data()), sizeof(float) * vec.size()};
}

TEST_F(SearchTest, Errors) {
  auto schema = MakeSimpleSchema(
      {{"score", SchemaField::NUMERIC}, {"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo{};
  QueryParams params;

  // Non-existent field
  algo.Init("@cantfindme:[1 10]", &params);
  EXPECT_THAT(algo.Search(&indices).error, HasSubstr("Invalid field"));

  // Invalid type
  algo.Init("@even:[1 10]", &params);
  EXPECT_THAT(algo.Search(&indices).error, HasSubstr("Wrong access type"));

  // Wrong vector index dimensions
  params["vec"] = ToBytes({1, 2, 3, 4});
  algo.Init("* => [KNN 5 @pos $vec]", &params);
  EXPECT_THAT(algo.Search(&indices).error, HasSubstr("Wrong vector index dimensions"));
}

TEST_F(SearchTest, MatchNumericRangeWithCommas) {
  PrepareSchema({{"f1", SchemaField::NUMERIC}, {"draw_end", SchemaField::NUMERIC}});

  // Main tests for point range with identical values and different delimiters
  {
    PrepareQuery("@draw_end:[1742916180 1742916180]");
    ExpectAll(Map{{"draw_end", "1742916180"}});
    ExpectNone(Map{{"draw_end", "1742916181"}}, Map{{"draw_end", "1742916179"}});
    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@draw_end:[1742916180, 1742916180]");
    ExpectAll(Map{{"draw_end", "1742916180"}});
    ExpectNone(Map{{"draw_end", "1742916181"}}, Map{{"draw_end", "1742916179"}});
    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@draw_end:[1742916180 ,1742916180]");
    ExpectAll(Map{{"draw_end", "1742916180"}});
    ExpectNone(Map{{"draw_end", "1742916181"}}, Map{{"draw_end", "1742916179"}});
    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@draw_end:[1742916180   1742916180]");
    ExpectAll(Map{{"draw_end", "1742916180"}});
    ExpectNone(Map{{"draw_end", "1742916181"}}, Map{{"draw_end", "1742916179"}});
    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@f1:[100   ,     200]");
    ExpectAll(Map{{"f1", "100"}}, Map{{"f1", "150"}}, Map{{"f1", "200"}});
    ExpectNone(Map{{"f1", "99"}}, Map{{"f1", "201"}});
    EXPECT_TRUE(Check()) << GetError();
  }
}

class KnnTest : public SearchTest {};

class VectorRangeTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitSimSIMD();
  }
};

TEST_F(VectorRangeTest, FlatRange1D) {
  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // Place 10 points on a line: 1, 2, ..., 10 (avoid zero vector for doc 0)
  for (size_t i = 0; i < 10; i++) {
    MockedDocument doc{Map{{"pos", ToBytes({float(i + 1)})}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // Query at 5.0 with radius 1.5 → points at pos 4,5,6 → doc ids 3,4,5
  {
    params["vec"] = ToBytes({5.0f});
    algo.Init("@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: dist}", &params);
    auto result = algo.Search(&indices);
    EXPECT_THAT(result.ids, testing::UnorderedElementsAre(3, 4, 5));
  }

  // Exact match at pos 4.0 with radius 0 → only doc 3
  {
    params["vec"] = ToBytes({4.0f});
    algo.Init("@pos:[VECTOR_RANGE 0 $vec]=>{$YIELD_DISTANCE_AS: dist}", &params);
    auto result = algo.Search(&indices);
    EXPECT_THAT(result.ids, testing::UnorderedElementsAre(3));
  }

  // Large radius → all 10 points
  {
    params["vec"] = ToBytes({5.0f});
    algo.Init("@pos:[VECTOR_RANGE 100 $vec]=>{$YIELD_DISTANCE_AS: dist}", &params);
    auto result = algo.Search(&indices);
    EXPECT_EQ(result.ids.size(), 10u);
  }

  // Empty result when radius is too small
  {
    params["vec"] = ToBytes({5.5f});
    algo.Init("@pos:[VECTOR_RANGE 0.1 $vec]=>{$YIELD_DISTANCE_AS: dist}", &params);
    auto result = algo.Search(&indices);
    EXPECT_TRUE(result.ids.empty());
  }
}

TEST_F(VectorRangeTest, FlatRangeDistancesStoredInScores) {
  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // Use i+1 so doc positions are 1..5 (query radius 1.5 from pos 2.0 catches docs 0,1,2)
  for (size_t i = 0; i < 5; i++) {
    MockedDocument doc{Map{{"pos", ToBytes({float(i + 1)})}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;
  params["vec"] = ToBytes({2.0f});

  algo.Init("@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: vector_distance}", &params);
  ASSERT_NE(nullptr, algo.GetVectorRangeNode());
  EXPECT_STREQ("vector_distance", algo.GetVectorRangeNode()->score_alias.c_str());

  auto result = algo.Search(&indices);
  // Positions 1,2,3 (docs 0,1,2) are within L2 distance 1.5 from query pos 2.0
  EXPECT_THAT(result.ids, testing::UnorderedElementsAre(0, 1, 2));
  // knn_scores should contain distances for all matched docs
  EXPECT_EQ(result.knn_scores.size(), 3u);
}

TEST_F(VectorRangeTest, FlatStarQueryZeroVectorIsValid) {
  // Regression: @field:* on a FLAT vector index uses GetAllDocsWithNonNullValues(), which
  // incorrectly skips zero vectors. The zero vector [0.0,...,0.0] is a valid embedding.
  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 2};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // doc 0: zero vector [0.0, 0.0] — valid embedding, must not be skipped
  indices.Add(0, MockedDocument{Map{{"pos", ToBytes({0.0f, 0.0f})}}});
  // doc 1: non-zero vector [1.0, 0.0]
  indices.Add(1, MockedDocument{Map{{"pos", ToBytes({1.0f, 0.0f})}}});

  SearchAlgorithm algo{};
  QueryParams params;
  algo.Init("@pos:*", &params);
  auto result = algo.Search(&indices);
  // Both docs must appear — zero vector is NOT null
  EXPECT_THAT(result.ids, testing::UnorderedElementsAre(0, 1));
}

TEST_F(VectorRangeTest, FlatStarQueryRemovedDocNotMatched) {
  // Regression: @field:* on a FLAT vector index uses GetAllDocsWithNonNullValues(), which
  // iterates entries_ directly and does NOT respect all_ids_. After Remove(), the doc's
  // slot in entries_ is still non-zero, so the removed doc incorrectly appears in results.
  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  indices.Add(0, MockedDocument{Map{{"pos", ToBytes({1.0f})}}});
  indices.Add(1, MockedDocument{Map{{"pos", ToBytes({2.0f})}}});
  indices.Add(2, MockedDocument{Map{{"pos", ToBytes({3.0f})}}});

  // Remove doc 1
  MockedDocument doc1{Map{{"pos", ToBytes({2.0f})}}};
  indices.Remove(1, doc1);

  SearchAlgorithm algo{};
  QueryParams params;
  algo.Init("@pos:*", &params);
  auto result = algo.Search(&indices);
  // Doc 1 was removed, only docs 0 and 2 should appear
  EXPECT_THAT(result.ids, testing::UnorderedElementsAre(0, 2));
}

TEST_F(KnnTest, Simple1D) {
  auto schema = MakeSimpleSchema({{"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // Place points on a straight line
  for (size_t i = 0; i < 100; i++) {
    Map values{{{"even", i % 2 == 0 ? "YES" : "NO"}, {"pos", ToBytes({float(i)})}}};
    MockedDocument doc{values};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // Five closest to 50
  {
    params["vec"] = ToBytes({50.0});
    algo.Init("*=>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(48, 49, 50, 51, 52));
  }

  // Five closest to 0
  {
    params["vec"] = ToBytes({0.0});
    algo.Init("*=>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 3, 4));
  }

  // Five closest to 20, all even
  {
    params["vec"] = ToBytes({20.0});
    algo.Init("@even:{yes} =>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(16, 18, 20, 22, 24));
  }

  // Three closest to 31, all odd
  {
    params["vec"] = ToBytes({31.0});
    algo.Init("@even:{no} =>[KNN 3 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(29, 31, 33));
  }

  // Two closest to 70.5
  {
    params["vec"] = ToBytes({70.5});
    algo.Init("* =>[KNN 2 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(70, 71));
  }

  // Two closest to 70.5
  {
    params["vec"] = ToBytes({70.5});
    algo.Init("* =>[KNN 2 @pos $vec as vector_distance]", &params);
    EXPECT_EQ("vector_distance", algo.GetKnnScoreSortOption()->score_field_alias);
    SearchResult result = algo.Search(&indices);
    EXPECT_THAT(result.ids, testing::UnorderedElementsAre(70, 71));
  }
}

TEST_F(KnnTest, Simple2D) {
  // Square:
  // 3      2
  //    4
  // 0      1
  const pair<float, float> kTestCoords[] = {{0, 0}, {1, 0}, {1, 1}, {0, 1}, {0.5, 0.5}};

  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 2};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
    string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
    MockedDocument doc{Map{{"pos", coords}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // Single center
  {
    params["vec"] = ToBytes({0.5, 0.5});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(4));
  }

  // Lower left
  {
    params["vec"] = ToBytes({0, 0});
    algo.Init("* =>[KNN 4 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 3, 4));
  }

  // Upper right
  {
    params["vec"] = ToBytes({1, 1});
    algo.Init("* =>[KNN 4 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1, 2, 3, 4));
  }

  // Request more than there is
  {
    params["vec"] = ToBytes({0, 0});
    algo.Init("* => [KNN 10 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 3, 4));
  }

  // Test correct order: (0.7, 0.15)
  {
    params["vec"] = ToBytes({0.7, 0.15});
    algo.Init("* => [KNN 10 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::ElementsAre(1, 4, 0, 2, 3));
  }

  // Test correct order: (0.8, 0.9)
  {
    params["vec"] = ToBytes({0.8, 0.9});
    algo.Init("* => [KNN 10 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::ElementsAre(2, 4, 3, 1, 0));
  }
}

TEST_F(KnnTest, Cosine) {
  // Four arrows, closest cosing distance will be closes by angle
  // 0 🡢 1 🡣 2 🡠 3 🡡
  const pair<float, float> kTestCoords[] = {{1, 0}, {0, -1}, {-1, 0}, {0, 1}};

  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params =
      SchemaField::VectorParams{false, 2, VectorSimilarity::COSINE};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
    string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
    MockedDocument doc{Map{{"pos", coords}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // Point down
  {
    params["vec"] = ToBytes({-0.1, -10});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1));
  }

  // Point left
  {
    params["vec"] = ToBytes({-0.1, -0.01});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(2));
  }

  // Point up
  {
    params["vec"] = ToBytes({0, 5});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(3));
  }

  // Point right
  {
    params["vec"] = ToBytes({0.2, 0.05});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0));
  }
}

TEST_F(KnnTest, IP) {
  // Test with normalized unit vectors for IP distance
  // Using unit vectors pointing in different directions
  const pair<float, float> kTestCoords[] = {
      {1.0f, 0.0f}, {0.0f, 1.0f}, {-1.0f, 0.0f}, {0.0f, -1.0f}};

  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 2, VectorSimilarity::IP};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
    string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
    MockedDocument doc{Map{{"pos", coords}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // Query with vector pointing right - should find exact match (highest dot product)
  {
    params["vec"] = ToBytes({1.0f, 0.0f});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0));
  }

  // Query with vector pointing up - should find exact match (highest dot product)
  {
    params["vec"] = ToBytes({0.0f, 1.0f});
    algo.Init("* =>[KNN 1 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1));
  }
}

TEST_F(KnnTest, AddRemove) {
  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1, VectorSimilarity::L2};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  vector<MockedDocument> documents(10);
  for (size_t i = 0; i < 10; i++) {
    documents[i] = Map{{"pos", ToBytes({float(i)})}};
    indices.Add(i, documents[i]);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  // search leftmost 5
  {
    params["vec"] = ToBytes({-1.0});
    algo.Init("* =>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::ElementsAre(0, 1, 2, 3, 4));
  }

  // delete leftmost 5
  for (size_t i = 0; i < 5; i++)
    indices.Remove(i, documents[i]);

  // search leftmost 5 again
  {
    params["vec"] = ToBytes({-1.0});
    algo.Init("* =>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::ElementsAre(5, 6, 7, 8, 9));
  }

  // add removed elements
  for (size_t i = 0; i < 5; i++)
    indices.Add(i, documents[i]);

  // repeat first search
  {
    params["vec"] = ToBytes({-1.0});
    algo.Init("* =>[KNN 5 @pos $vec]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::ElementsAre(0, 1, 2, 3, 4));
  }
}

TEST_F(KnnTest, AutoResize) {
  // Make sure index resizes automatically even with a small initial capacity
  const size_t kInitialCapacity = 5;

  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params =
      SchemaField::VectorParams{false, 1, VectorSimilarity::L2, kInitialCapacity};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  for (size_t i = 0; i < 100; i++) {
    MockedDocument doc{Map{{"pos", ToBytes({float(i)})}}};
    indices.Add(i, doc);
  }

  EXPECT_EQ(indices.GetAllDocs().size(), 100);
}

// Parameterized HNSW serialization round-trip test.
// Parameters: {num_elements, dim, similarity}
struct HnswSerParam {
  size_t num_elements;
  size_t dim;
  VectorSimilarity sim;

  friend std::ostream& operator<<(std::ostream& os, const HnswSerParam& p) {
    const char* sim_name[] = {"L2", "IP", "COSINE"};
    return os << p.num_elements << "el_" << p.dim << "d_" << sim_name[static_cast<int>(p.sim)];
  }
};

class HnswSerializationTest : public ::testing::TestWithParam<HnswSerParam> {
 protected:
  void SetUp() override {
    InitTLSearchMR(PMR_NS::get_default_resource());
  }

  void TearDown() override {
    InitTLSearchMR(nullptr);
  }
};

TEST_P(HnswSerializationTest, RoundTrip) {
  const auto [num_elements, dim, sim] = GetParam();

  SchemaField::VectorParams params;
  params.use_hnsw = true;
  params.dim = dim;
  params.sim = sim;
  params.capacity = std::max<size_t>(num_elements, 10);
  params.hnsw_m = 16;
  params.hnsw_ef_construction = 200;

  HnswVectorIndex original(params, /*copy_vector=*/true);

  std::mt19937 rng(42);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  vector<MockedDocument> docs(num_elements);
  for (size_t i = 0; i < num_elements; i++) {
    vector<float> coords(dim);
    for (size_t d = 0; d < dim; d++)
      coords[d] = dist(rng);
    docs[i] = MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}};
    original.Add(i, docs[i], "vec");
  }

  // Serialize
  auto metadata = original.GetMetadata();
  ASSERT_EQ(metadata.cur_element_count, num_elements);

  std::vector<HnswNodeData> nodes;
  {
    auto lock = original.GetReadLock();
    nodes = original.GetNodesRange(0, metadata.cur_element_count);
  }
  ASSERT_EQ(nodes.size(), num_elements);

  // Verify node data integrity
  for (const auto& node : nodes) {
    EXPECT_EQ(node.levels_links.size(), static_cast<size_t>(node.level + 1));
    EXPECT_GT(node.TotalSize(), 0u);
  }

  // Deserialize into a fresh index
  HnswVectorIndex restored(params, /*copy_vector=*/true);
  restored.SetMetadata(metadata);
  restored.RestoreFromNodes(nodes, metadata);

  // Before UpdateVectorData, all nodes must be marked deleted.
  // KNN should safely return empty results (no crash from nullptr dereference).
  if (num_elements > 0) {
    vector<float> probe(dim, 0.5f);
    auto pre_results = restored.Knn(probe.data(), 10, std::nullopt);
    EXPECT_TRUE(pre_results.empty()) << "All nodes should be deleted before UpdateVectorData";
  }

  for (size_t i = 0; i < num_elements; i++)
    restored.UpdateVectorData(i, docs[i], "vec");

  // Metadata must match
  auto rm = restored.GetMetadata();
  EXPECT_EQ(rm.cur_element_count, metadata.cur_element_count);
  EXPECT_EQ(rm.maxlevel, metadata.maxlevel);
  EXPECT_EQ(rm.enterpoint_node, metadata.enterpoint_node);

  // Graph links must be identical
  std::vector<HnswNodeData> restored_nodes;
  {
    auto lock = restored.GetReadLock();
    restored_nodes = restored.GetNodesRange(0, rm.cur_element_count);
  }
  ASSERT_EQ(restored_nodes.size(), nodes.size());
  for (size_t i = 0; i < nodes.size(); i++) {
    EXPECT_EQ(restored_nodes[i].internal_id, nodes[i].internal_id);
    EXPECT_EQ(restored_nodes[i].global_id, nodes[i].global_id);
    EXPECT_EQ(restored_nodes[i].level, nodes[i].level);
    ASSERT_EQ(restored_nodes[i].levels_links.size(), nodes[i].levels_links.size());
    for (size_t lvl = 0; lvl < nodes[i].levels_links.size(); lvl++)
      EXPECT_EQ(restored_nodes[i].levels_links[lvl], nodes[i].levels_links[lvl]);
  }

  if (num_elements == 0)
    return;

  // KNN results must match for several queries
  auto compare_knn = [&](vector<float> query, size_t k) {
    auto orig = original.Knn(query.data(), k, std::nullopt);
    auto rest = restored.Knn(query.data(), k, std::nullopt);
    ASSERT_EQ(orig.size(), rest.size());
    for (size_t j = 0; j < orig.size(); j++) {
      EXPECT_EQ(orig[j].second, rest[j].second);
      EXPECT_NEAR(orig[j].first, rest[j].first, 1e-5);
    }
  };

  size_t k = std::min<size_t>(num_elements, 10);
  compare_knn(vector<float>(dim, 0.0f), k);
  compare_knn(vector<float>(dim, 0.5f), k);
  compare_knn(vector<float>(dim, 1.0f), k);

  // Filtered KNN must also match
  vector<GlobalDocId> allowed;
  for (size_t i = 0; i < num_elements; i += 2)
    allowed.push_back(i);
  size_t fk = std::min<size_t>(allowed.size(), 5);
  vector<float> q(dim, 0.5f);
  auto orig_f = original.Knn(q.data(), fk, std::nullopt, allowed);
  auto rest_f = restored.Knn(q.data(), fk, std::nullopt, allowed);
  ASSERT_EQ(orig_f.size(), rest_f.size());
  for (size_t i = 0; i < orig_f.size(); i++) {
    EXPECT_EQ(orig_f[i].second, rest_f[i].second);
    EXPECT_NEAR(orig_f[i].first, rest_f[i].first, 1e-5);
  }
}

INSTANTIATE_TEST_SUITE_P(HnswSer, HnswSerializationTest,
                         testing::Values(HnswSerParam{0, 2, VectorSimilarity::L2},
                                         HnswSerParam{10, 2, VectorSimilarity::L2},
                                         HnswSerParam{1000, 4, VectorSimilarity::L2},
                                         HnswSerParam{10000, 8, VectorSimilarity::L2},
                                         HnswSerParam{10, 3, VectorSimilarity::COSINE},
                                         HnswSerParam{1000, 4, VectorSimilarity::COSINE},
                                         HnswSerParam{10, 2, VectorSimilarity::IP},
                                         HnswSerParam{1000, 4, VectorSimilarity::IP}),
                         [](const testing::TestParamInfo<HnswSerParam>& info) {
                           std::ostringstream name;
                           name << info.param;
                           return name.str();
                         });

// Test fixture for HNSW deferred operations.
// Verifies that Add/Remove called while a read lock is held are properly
// deferred and replayed once the lock is released.
class HnswDeferredOpsTest : public ::testing::Test {
 protected:
  static constexpr size_t kDim = 4;
  static constexpr size_t kCapacity = 100;

  void SetUp() override {
    InitTLSearchMR(PMR_NS::get_default_resource());

    SchemaField::VectorParams params;
    params.use_hnsw = true;
    params.dim = kDim;
    params.sim = VectorSimilarity::L2;
    params.capacity = kCapacity;
    params.hnsw_m = 16;
    params.hnsw_ef_construction = 200;
    index_ = std::make_unique<HnswVectorIndex>(params, /*copy_vector=*/true);
  }

  void TearDown() override {
    index_.reset();
    InitTLSearchMR(nullptr);
  }

  MockedDocument MakeDoc(std::initializer_list<float> coords) {
    return MockedDocument::Map{{"vec", ToBytes(coords)}};
  }

  // Helper: run KNN for the zero vector and return the set of found GlobalDocIds.
  absl::flat_hash_set<GlobalDocId> KnnIds(size_t k) {
    vector<float> q(kDim, 0.0f);
    auto results = index_->Knn(q.data(), k, std::nullopt);
    absl::flat_hash_set<GlobalDocId> ids;
    for (auto& [dist, id] : results)
      ids.insert(id);
    return ids;
  }

  std::unique_ptr<HnswVectorIndex> index_;
};

TEST_F(HnswDeferredOpsTest, AddWhileReadLocked) {
  // Hold a read lock (simulating serialization), then add elements.
  auto doc0 = MakeDoc({1, 0, 0, 0});
  auto doc1 = MakeDoc({0, 1, 0, 0});

  {
    auto lock = index_->GetReadLock();

    // These Adds cannot acquire the write lock and must be deferred.
    index_->Add(0, doc0, "vec");
    index_->Add(1, doc1, "vec");

    // While the read lock is still held, KNN should not find the deferred docs.
    auto ids = KnnIds(10);
    EXPECT_TRUE(ids.empty());
  }

  // After the read lock is released, deferred ops should replay.
  // The next operation that touches the index triggers ProcessDeferred.
  auto ids = KnnIds(10);
  EXPECT_EQ(ids.size(), 2u);
  EXPECT_TRUE(ids.contains(0));
  EXPECT_TRUE(ids.contains(1));
}

TEST_F(HnswDeferredOpsTest, RemoveWhileReadLocked) {
  // Pre-populate the index.
  auto doc0 = MakeDoc({1, 0, 0, 0});
  auto doc1 = MakeDoc({0, 1, 0, 0});
  auto doc2 = MakeDoc({0, 0, 1, 0});
  index_->Add(0, doc0, "vec");
  index_->Add(1, doc1, "vec");
  index_->Add(2, doc2, "vec");

  {
    auto lock = index_->GetReadLock();

    // Remove doc1 while read-locked — should be deferred.
    index_->Remove(1, doc1, "vec");

    // doc1 is still visible because the remove is deferred.
    auto ids = KnnIds(10);
    EXPECT_EQ(ids.size(), 3u);
  }

  // After releasing the lock, removal should take effect.
  auto ids = KnnIds(10);
  EXPECT_EQ(ids.size(), 2u);
  EXPECT_TRUE(ids.contains(0));
  EXPECT_TRUE(ids.contains(2));
  EXPECT_FALSE(ids.contains(1));
}

TEST_F(HnswDeferredOpsTest, DuplicateDeferredOpsKeepLatest) {
  // Pre-populate with doc0.
  auto doc0 = MakeDoc({1, 0, 0, 0});
  index_->Add(0, doc0, "vec");

  auto doc1 = MakeDoc({0, 1, 0, 0});

  {
    auto lock = index_->GetReadLock();

    // Add doc1, then remove doc1 — both deferred for the same id.
    // Only the last operation (remove) should survive.
    index_->Add(1, doc1, "vec");
    index_->Remove(1, doc1, "vec");
  }

  // After lock release, doc1 should not exist (remove was last).
  auto ids = KnnIds(10);
  EXPECT_EQ(ids.size(), 1u);
  EXPECT_TRUE(ids.contains(0));
  EXPECT_FALSE(ids.contains(1));
}

TEST_F(HnswDeferredOpsTest, DuplicateDeferredOpsAddOverridesRemove) {
  // Pre-populate with doc0 and doc1.
  auto doc0 = MakeDoc({1, 0, 0, 0});
  auto doc1 = MakeDoc({0, 1, 0, 0});
  index_->Add(0, doc0, "vec");
  index_->Add(1, doc1, "vec");

  auto doc1_new = MakeDoc({0, 0, 1, 0});

  {
    auto lock = index_->GetReadLock();

    // Remove doc1, then re-add it with new data — the add should win.
    index_->Remove(1, doc1, "vec");
    index_->Add(1, doc1_new, "vec");
  }

  // After lock release, doc1 should still be present with updated data.
  auto ids = KnnIds(10);
  EXPECT_EQ(ids.size(), 2u);
  EXPECT_TRUE(ids.contains(0));
  EXPECT_TRUE(ids.contains(1));
}

// Verify that Remove without a read lock also works correctly.
TEST_F(HnswDeferredOpsTest, RemoveWithoutReadLock) {
  auto doc0 = MakeDoc({1, 0, 0, 0});
  auto doc1 = MakeDoc({0, 1, 0, 0});
  index_->Add(0, doc0, "vec");
  index_->Add(1, doc1, "vec");

  index_->Remove(1, doc1, "vec");

  auto ids = KnnIds(10);
  EXPECT_EQ(ids.size(), 1u);
  EXPECT_TRUE(ids.contains(0));
  EXPECT_FALSE(ids.contains(1));
}

class HnswSubsetKnnTest : public ::testing::TestWithParam<VectorSimilarity> {
 protected:
  void SetUp() override {
    InitTLSearchMR(PMR_NS::get_default_resource());
  }

  void TearDown() override {
    InitTLSearchMR(nullptr);
  }

  // Helper to create a simple index with vectors on a line for easy verification
  unique_ptr<HnswVectorIndex> CreateSimple1DIndex(size_t num_elements, VectorSimilarity sim) {
    SchemaField::VectorParams params;
    params.use_hnsw = true;
    params.dim = 1;
    params.sim = sim;
    params.capacity = std::max<size_t>(num_elements, 10);
    params.hnsw_m = 16;
    params.hnsw_ef_construction = 200;

    auto index = make_unique<HnswVectorIndex>(params, /*copy_vector=*/true);

    for (size_t i = 0; i < num_elements; i++) {
      vector<float> coords = {static_cast<float>(i)};
      auto doc = MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}};
      index->Add(i, MockedDocument(doc), "vec");
    }

    return index;
  }

  // Helper to create a 2D index with unit-circle vectors, for COSINE similarity testing.
  // Vector i is placed at angle i * (2π / num_elements), giving meaningful cosine distances.
  unique_ptr<HnswVectorIndex> CreateCircle2DIndex(size_t num_elements, VectorSimilarity sim) {
    SchemaField::VectorParams params;
    params.use_hnsw = true;
    params.dim = 2;
    params.sim = sim;
    params.capacity = std::max<size_t>(num_elements, 10);
    params.hnsw_m = 16;
    params.hnsw_ef_construction = 200;

    auto index = make_unique<HnswVectorIndex>(params, /*copy_vector=*/true);

    const float step = 2.0f * static_cast<float>(acos(-1.0)) / static_cast<float>(num_elements);
    for (size_t i = 0; i < num_elements; i++) {
      float angle = step * static_cast<float>(i);
      vector<float> coords = {cosf(angle), sinf(angle)};
      auto doc = MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}};
      index->Add(i, MockedDocument(doc), "vec");
    }

    return index;
  }
};

TEST_P(HnswSubsetKnnTest, CorrectResults) {
  // Test that SubsetKnn returns correct top-k from a subset
  auto sim = GetParam();
  auto index = CreateSimple1DIndex(100, sim);

  vector<float> query = {50.0f};
  vector<GlobalDocId> subset;

  // Create subset: only even numbers from 40 to 60
  for (size_t i = 40; i <= 60; i += 2) {
    subset.push_back(i);
  }

  // Ask for top 5
  auto results = index->SubsetKnn(query.data(), 5, subset);

  // Should get exactly 5 results
  ASSERT_EQ(results.size(), 5u);

  // All results should be from the subset
  for (const auto& [dist, id] : results) {
    EXPECT_TRUE(std::find(subset.begin(), subset.end(), id) != subset.end())
        << "Result ID " << id << " not in subset";
  }

  // For L2 similarity, verify the closest point is 50
  if (sim == VectorSimilarity::L2) {
    bool found_50 = false;
    for (const auto& [dist, id] : results) {
      if (id == 50) {
        found_50 = true;
        break;
      }
    }
    EXPECT_TRUE(found_50) << "For L2, point 50 should be in top 5 closest to query {50}";
  }
}

TEST_P(HnswSubsetKnnTest, EmptySubset) {
  // Test edge case: empty subset
  auto sim = GetParam();
  auto index = CreateSimple1DIndex(10, sim);

  vector<float> query = {5.0f};
  vector<GlobalDocId> empty_subset;

  auto results = index->SubsetKnn(query.data(), 5, empty_subset);
  EXPECT_TRUE(results.empty()) << "SubsetKnn with empty subset should return empty results";
}

TEST_P(HnswSubsetKnnTest, KEqualsZero) {
  // Test edge case: k = 0
  auto sim = GetParam();
  auto index = CreateSimple1DIndex(10, sim);

  vector<float> query = {5.0f};
  vector<GlobalDocId> subset = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};

  auto results = index->SubsetKnn(query.data(), 0, subset);
  EXPECT_TRUE(results.empty()) << "SubsetKnn with k=0 should return empty results";
}

TEST_P(HnswSubsetKnnTest, KGreaterThanSubsetSize) {
  // Test edge case: k > number of valid documents in subset
  auto sim = GetParam();
  auto index = CreateSimple1DIndex(10, sim);

  vector<float> query = {5.0f};
  vector<GlobalDocId> subset = {1, 3, 5};  // Only 3 elements

  auto results = index->SubsetKnn(query.data(), 10, subset);  // Ask for 10
  EXPECT_EQ(results.size(), 3u) << "SubsetKnn should return at most subset.size() results";

  // Verify all 3 are returned
  vector<GlobalDocId> result_ids;
  for (const auto& [dist, id] : results) {
    result_ids.push_back(id);
  }
  EXPECT_THAT(result_ids, testing::UnorderedElementsAre(1, 3, 5));
}

TEST_P(HnswSubsetKnnTest, NonExistentIds) {
  // Test that non-existent IDs in subset are gracefully ignored
  auto sim = GetParam();
  auto index = CreateSimple1DIndex(10, sim);

  vector<float> query = {5.0f};
  // Mix of valid (0-9) and invalid (100-105) IDs
  vector<GlobalDocId> subset = {100, 4, 101, 5, 102, 6, 103, 104, 105};

  auto results = index->SubsetKnn(query.data(), 3, subset);
  EXPECT_EQ(results.size(), 3u);

  // Should only return valid IDs: 5, 4, 6 (closest to 5)
  vector<GlobalDocId> result_ids;
  for (const auto& [dist, id] : results) {
    result_ids.push_back(id);
  }
  EXPECT_THAT(result_ids, testing::UnorderedElementsAre(4, 5, 6));
}

TEST_P(HnswSubsetKnnTest, AllDeletedDocuments) {
  // Test edge case: all documents in subset are marked deleted
  auto sim = GetParam();

  SchemaField::VectorParams params;
  params.use_hnsw = true;
  params.dim = 1;
  params.sim = sim;
  params.capacity = 10;
  params.hnsw_m = 16;
  params.hnsw_ef_construction = 200;

  HnswVectorIndex index(params, /*copy_vector=*/true);

  // Add and then remove documents
  vector<MockedDocument> docs;
  for (size_t i = 0; i < 5; i++) {
    vector<float> coords = {static_cast<float>(i)};
    docs.push_back(
        MockedDocument(MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}}));
    index.Add(i, docs[i], "vec");
  }

  // Delete all documents
  for (size_t i = 0; i < 5; i++) {
    index.Remove(i, docs[i], "vec");
  }

  vector<float> query = {2.5f};
  vector<GlobalDocId> subset = {0, 1, 2, 3, 4};

  auto results = index.SubsetKnn(query.data(), 3, subset);
  EXPECT_TRUE(results.empty()) << "SubsetKnn should return empty when all docs are deleted";
}

TEST_P(HnswSubsetKnnTest, MixedDeletedAndValidDocs) {
  // Test with a mix of deleted and valid documents
  auto sim = GetParam();

  SchemaField::VectorParams params;
  params.use_hnsw = true;
  params.dim = 1;
  params.sim = sim;
  params.capacity = 10;
  params.hnsw_m = 16;
  params.hnsw_ef_construction = 200;

  HnswVectorIndex index(params, /*copy_vector=*/true);

  // Add documents
  vector<MockedDocument> docs;
  for (size_t i = 0; i < 10; i++) {
    vector<float> coords = {static_cast<float>(i)};
    docs.push_back(
        MockedDocument(MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}}));
    index.Add(i, docs[i], "vec");
  }

  // Delete even documents
  for (size_t i = 0; i < 10; i += 2) {
    index.Remove(i, docs[i], "vec");
  }

  vector<float> query = {5.0f};
  // Subset includes both deleted (even) and valid (odd) docs
  vector<GlobalDocId> subset = {2, 3, 4, 5, 6, 7, 8};

  auto results = index.SubsetKnn(query.data(), 3, subset);
  EXPECT_EQ(results.size(), 3u);

  // Should only return odd (non-deleted) IDs: 5, 3, 7 (closest to 5)
  vector<GlobalDocId> result_ids;
  for (const auto& [dist, id] : results) {
    result_ids.push_back(id);
  }
  EXPECT_THAT(result_ids, testing::UnorderedElementsAre(3, 5, 7));
}

TEST_P(HnswSubsetKnnTest, CompareWithFilteredKnn) {
  // Integration test: verify SubsetKnn produces similar results to filtered Knn
  // SubsetKnn uses brute-force exact search, while Knn uses HNSW approximate search
  // So results may differ slightly, but should have significant overlap
  constexpr double kMinOverlapRatio = 0.7;  // 70% minimum overlap threshold

  auto sim = GetParam();

  // COSINE similarity is undefined for 1D positive vectors (all share the same direction,
  // so all cosine distances equal 0). Use 2D unit-circle vectors instead, where element i
  // is at angle i * 2π/100, giving each pair a distinct, meaningful cosine distance.
  unique_ptr<HnswVectorIndex> index;
  vector<float> query;
  if (sim == VectorSimilarity::COSINE) {
    constexpr size_t kNumElements = 100;
    index = CreateCircle2DIndex(kNumElements, sim);
    const float step = 2.0f * static_cast<float>(acos(-1.0)) / static_cast<float>(kNumElements);
    float angle = step * 50.0f;
    query = {cosf(angle), sinf(angle)};
  } else {
    index = CreateSimple1DIndex(100, sim);
    query = {50.0f};
  }

  vector<GlobalDocId> subset;

  // Create a small subset (well below typical 8192 threshold)
  for (size_t i = 40; i <= 60; i++) {
    subset.push_back(i);
  }

  size_t k = 10;

  // Get results from SubsetKnn (exact brute-force)
  auto subset_results = index->SubsetKnn(query.data(), k, subset);

  // Get results from regular filtered Knn (HNSW approximate)
  auto knn_results = index->Knn(query.data(), k, std::nullopt, subset);

  // Both should return k results (or fewer if subset is smaller)
  EXPECT_LE(subset_results.size(), k);
  EXPECT_LE(knn_results.size(), k);

  // Extract IDs from both
  std::set<GlobalDocId> subset_ids;
  for (const auto& [dist, id] : subset_results) {
    subset_ids.insert(id);
  }

  std::set<GlobalDocId> knn_ids;
  for (const auto& [dist, id] : knn_results) {
    knn_ids.insert(id);
  }

  // Count overlap - since HNSW is approximate, we expect good but not perfect overlap
  size_t overlap = 0;
  for (const auto& id : subset_ids) {
    if (knn_ids.count(id) > 0) {
      overlap++;
    }
  }

  // Expect at least kMinOverlapRatio overlap (HNSW is approximate, so some difference is expected)
  size_t min_overlap =
      static_cast<size_t>(std::min(subset_ids.size(), knn_ids.size()) * kMinOverlapRatio);
  EXPECT_GE(overlap, min_overlap) << "Expected at least " << min_overlap
                                  << " overlapping results, got " << overlap;
}

INSTANTIATE_TEST_SUITE_P(SubsetKnnSimilarities, HnswSubsetKnnTest,
                         testing::Values(VectorSimilarity::L2, VectorSimilarity::COSINE,
                                         VectorSimilarity::IP),
                         [](const testing::TestParamInfo<VectorSimilarity>& info) {
                           switch (info.param) {
                             case VectorSimilarity::L2:
                               return "L2";
                             case VectorSimilarity::COSINE:
                               return "COSINE";
                             case VectorSimilarity::IP:
                               return "IP";
                             default:
                               return "Unknown";
                           }
                         });

// Tests for HnswVectorIndex::RangeQuery
class HnswRangeQueryTest : public ::testing::TestWithParam<VectorSimilarity> {
 protected:
  void SetUp() override {
    InitTLSearchMR(PMR_NS::get_default_resource());
  }

  void TearDown() override {
    InitTLSearchMR(nullptr);
  }

  // 1-D index: doc i has vector {float(i)}, GlobalDocId = i
  unique_ptr<HnswVectorIndex> CreateSimple1DIndex(size_t num_elements) {
    SchemaField::VectorParams params;
    params.use_hnsw = true;
    params.dim = 1;
    params.sim = VectorSimilarity::L2;
    params.capacity = std::max<size_t>(num_elements, 10);
    params.hnsw_m = 16;
    params.hnsw_ef_construction = 200;

    auto index = make_unique<HnswVectorIndex>(params, /*copy_vector=*/true);
    for (size_t i = 0; i < num_elements; i++) {
      vector<float> coords = {static_cast<float>(i)};
      index->Add(i,
                 MockedDocument(MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}}),
                 "vec");
    }
    return index;
  }
};

TEST_P(HnswRangeQueryTest, BasicRange) {
  // 10 docs at positions 0..9. Query at 5.0 with radius 1.5 → docs 4,5,6 (dist 1.0,0.0,1.0)
  (void)GetParam();  // L2 only for 1-D
  auto index = CreateSimple1DIndex(10);

  vector<float> query = {5.0f};
  auto results = index->RangeQuery(query.data(), 1.5f);

  set<GlobalDocId> ids;
  for (const auto& [dist, id] : results)
    ids.insert(id);

  EXPECT_THAT(ids, testing::UnorderedElementsAre(4, 5, 6));
}

TEST_P(HnswRangeQueryTest, ExactMatch) {
  // Radius 0: only the doc at exact position
  (void)GetParam();
  auto index = CreateSimple1DIndex(10);

  vector<float> query = {3.0f};
  auto results = index->RangeQuery(query.data(), 0.0f);

  ASSERT_EQ(results.size(), 1u);
  EXPECT_EQ(results[0].second, GlobalDocId{3});
  EXPECT_FLOAT_EQ(results[0].first, 0.0f);
}

TEST_P(HnswRangeQueryTest, LargeRadiusReturnsAll) {
  (void)GetParam();
  auto index = CreateSimple1DIndex(20);

  vector<float> query = {10.0f};
  auto results = index->RangeQuery(query.data(), 1000.0f);

  EXPECT_EQ(results.size(), 20u);
}

TEST_P(HnswRangeQueryTest, EmptyResultOutsideRadius) {
  (void)GetParam();
  auto index = CreateSimple1DIndex(10);

  vector<float> query = {5.5f};
  auto results = index->RangeQuery(query.data(), 0.1f);

  EXPECT_TRUE(results.empty());
}

TEST_P(HnswRangeQueryTest, EmptyIndex) {
  (void)GetParam();
  auto index = CreateSimple1DIndex(0);

  vector<float> query = {0.0f};
  auto results = index->RangeQuery(query.data(), 100.0f);

  EXPECT_TRUE(results.empty());
}

TEST_P(HnswRangeQueryTest, DistancesCorrect) {
  // Verify returned distances match actual L2 distances
  (void)GetParam();
  auto index = CreateSimple1DIndex(10);

  vector<float> query = {5.0f};
  auto results = index->RangeQuery(query.data(), 2.0f);  // docs 3,4,5,6,7

  EXPECT_EQ(results.size(), 5u);
  for (const auto& [dist, id] : results) {
    float expected = std::abs(static_cast<float>(id) - 5.0f);
    // L2Distance returns sqrt(sum of squares); for 1-D: sqrt((a-b)²) = |a-b|
    EXPECT_FLOAT_EQ(dist, expected);
  }
}

TEST_P(HnswRangeQueryTest, DeletedDocNotReturned) {
  (void)GetParam();
  auto index = CreateSimple1DIndex(10);

  // Remove doc 5 (at position 5.0, distance 0 from query)
  index->Remove(5);

  vector<float> query = {5.0f};
  auto results = index->RangeQuery(query.data(), 1.5f);

  set<GlobalDocId> ids;
  for (const auto& [dist, id] : results)
    ids.insert(id);

  EXPECT_THAT(ids, testing::UnorderedElementsAre(4, 6));
  EXPECT_THAT(ids, testing::Not(testing::Contains(GlobalDocId{5})));
}

TEST_P(HnswRangeQueryTest, ConsistentWithBruteForce) {
  // Compare RangeQuery results against brute-force SubsetKnn-based check
  (void)GetParam();
  const size_t n = 50;
  auto index = CreateSimple1DIndex(n);

  vector<float> query = {25.0f};
  float radius = 5.0f;

  auto results = index->RangeQuery(query.data(), radius);

  // Brute force: collect all docs within radius.
  // L2Distance returns |a-b| for 1-D vectors (actual Euclidean, not squared).
  set<GlobalDocId> expected;
  for (size_t i = 0; i < n; i++) {
    float dist = std::abs(static_cast<float>(i) - 25.0f);
    if (dist <= radius)
      expected.insert(i);
  }

  set<GlobalDocId> got;
  for (const auto& [dist, id] : results)
    got.insert(id);

  EXPECT_EQ(got, expected);
}

INSTANTIATE_TEST_SUITE_P(HnswRangeL2, HnswRangeQueryTest, testing::Values(VectorSimilarity::L2),
                         [](const testing::TestParamInfo<VectorSimilarity>&) { return "L2"; });

TEST_F(SearchTest, GeoSearch) {
  auto schema = MakeSimpleSchema({{"name", SchemaField::TEXT}, {"location", SchemaField::GEO}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  indices.Add(0, MockedDocument(Map{{"name", "Mountain View"}, {"location", "-122.08, 37.386"}}));
  indices.Add(1, MockedDocument(Map{{"name", "Palo Alto"}, {"location", "-122.143, 37.444"}}));
  indices.Add(2, MockedDocument(Map{{"name", "San Jose"}, {"location", "-121.886, 37.338"}}));
  indices.Add(3, MockedDocument(Map{{"name", "San Francisco"}, {"location", "-122.419, 37.774"}}));

  SearchAlgorithm algo{};
  QueryParams params;

  // Search around Mount View 30 miles - San Francisco not included
  {
    algo.Init("@location:[-122.083 37.386 30 mi]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2));
  }

  // Search around Mount View 50 miles - all points included
  {
    algo.Init("@location:[-122.083 37.386 50 mi]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 3));
  }

  // Return all indexes
  {
    algo.Init("@location:*", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(0, 1, 2, 3));
  }

  // Search around Mount View 50 miles - all points included and filter on prefix
  {
    algo.Init("San* @location:[-122.083 37.386 50 mi]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(2, 3));
  }

  // Add duplicate point of San Francisco and search again to include this point also
  {
    indices.Add(4,
                MockedDocument(Map{{"name", "San Francisco"}, {"location", "-122.419, 37.774"}}));
    algo.Init("San* @location:[-122.083 37.386 50 mi]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(2, 3, 4));
  }

  // Remove first index of San Francisco (id = 3) and search
  {
    indices.Remove(
        3, MockedDocument(Map{{"name", "San Francisco"}, {"location", "-122.419, 37.774"}}));
    algo.Init("San* @location:[-122.083 37.386 50 mi]", &params);
    EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(2, 4));
  }
}

TEST_F(SearchTest, VectorDistanceBasic) {
  // Test basic vector distance calculations
  std::vector<float> vec1 = {1.0f, 2.0f, 3.0f};
  std::vector<float> vec2 = {4.0f, 5.0f, 6.0f};

  // Test L2 distance
  float l2_dist = VectorDistance(vec1.data(), vec2.data(), 3, VectorSimilarity::L2);
  EXPECT_GT(l2_dist, 0.0f);
  EXPECT_LT(l2_dist, 10.0f);  // Should be reasonable value

  // Test Cosine distance
  float cos_dist = VectorDistance(vec1.data(), vec2.data(), 3, VectorSimilarity::COSINE);
  EXPECT_GE(cos_dist, 0.0f);
  EXPECT_LE(cos_dist, 2.0f);  // Cosine distance range

  // Test IP distance
  float ip_dist = VectorDistance(vec1.data(), vec2.data(), 3, VectorSimilarity::IP);
  // IP distance can be negative for non-normalized vectors
  EXPECT_NE(ip_dist, 0.0f);  // Should be non-zero for different vectors

  // Test identical vectors
  float l2_same = VectorDistance(vec1.data(), vec1.data(), 3, VectorSimilarity::L2);
  EXPECT_NEAR(l2_same, 0.0f, 1e-6);

  float cos_same = VectorDistance(vec1.data(), vec1.data(), 3, VectorSimilarity::COSINE);
  EXPECT_NEAR(cos_same, 0.0f, 1e-6);

  float ip_same = VectorDistance(vec1.data(), vec1.data(), 3, VectorSimilarity::IP);
  // For identical vectors: IP = 1 - dot_product(v, v) = 1 - ||v||^2
  // For vec1 = {1, 2, 3}: ||v||^2 = 1 + 4 + 9 = 14, so IP = 1 - 14 = -13
  EXPECT_LT(ip_same, 0.0f);  // Should be negative for non-normalized vectors
}

TEST_F(SearchTest, VectorDistanceConsistency) {
  // Test that results are consistent across multiple calls
  std::vector<float> vec1 = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
  std::vector<float> vec2 = {0.6f, 0.7f, 0.8f, 0.9f, 1.0f};

  float l2_dist1 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::L2);
  float l2_dist2 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::L2);
  EXPECT_EQ(l2_dist1, l2_dist2);

  float cos_dist1 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::COSINE);
  float cos_dist2 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::COSINE);
  EXPECT_EQ(cos_dist1, cos_dist2);

  float ip_dist1 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::IP);
  float ip_dist2 = VectorDistance(vec1.data(), vec2.data(), 5, VectorSimilarity::IP);
  EXPECT_EQ(ip_dist1, ip_dist2);
}

static void BM_VectorSearch(benchmark::State& state) {
  // Ensure SimSIMD dynamic dispatch is initialized for the benchmark
  InitSimSIMD();
  unsigned ndims = state.range(0);
  unsigned nvecs = state.range(1);

  auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
  schema.fields["pos"].special_params = SchemaField::VectorParams{false, ndims};
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  auto random_vec = [ndims]() {
    vector<float> coords;
    for (size_t j = 0; j < ndims; j++)
      coords.push_back(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
    return coords;
  };

  for (size_t i = 0; i < nvecs; i++) {
    auto rv = random_vec();
    MockedDocument doc{Map{{"pos", ToBytes(rv)}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;

  auto rv = random_vec();
  params["vec"] = ToBytes(rv);
  algo.Init("* =>[KNN 1 @pos $vec]", &params);

  while (state.KeepRunningBatch(10)) {
    for (size_t i = 0; i < 10; i++)
      benchmark::DoNotOptimize(algo.Search(&indices));
  }
}

BENCHMARK(BM_VectorSearch)->Args({120, 10'000});

TEST_F(SearchTest, MatchNonNullField) {
  PrepareSchema({{"text_field", SchemaField::TEXT},
                 {"tag_field", SchemaField::TAG},
                 {"num_field", SchemaField::NUMERIC}});

  {
    PrepareQuery("@text_field:*");

    ExpectAll(Map{{"text_field", "any value"}}, Map{{"text_field", "another value"}},
              Map{{"text_field", "third"}, {"tag_field", "tag1"}});

    ExpectNone(Map{{"tag_field", "wrong field"}}, Map{{"num_field", "123"}}, Map{});

    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@tag_field:*");

    ExpectAll(Map{{"tag_field", "tag1"}}, Map{{"tag_field", "tag2"}},
              Map{{"text_field", "value"}, {"tag_field", "tag3"}});

    ExpectNone(Map{{"text_field", "wrong field"}}, Map{{"num_field", "456"}}, Map{});

    EXPECT_TRUE(Check()) << GetError();
  }

  {
    PrepareQuery("@num_field:*");

    ExpectAll(Map{{"num_field", "123"}}, Map{{"num_field", "456"}},
              Map{{"text_field", "value"}, {"num_field", "789"}});

    ExpectNone(Map{{"text_field", "wrong field"}}, Map{{"tag_field", "tag1"}}, Map{});

    EXPECT_TRUE(Check()) << GetError();
  }
}

TEST_F(SearchTest, InvalidVectorParameter) {
  search::Schema schema;
  schema.fields["v"] = search::SchemaField{
      search::SchemaField::VECTOR,
      0,   // flags
      "v"  // short_name
  };

  search::SchemaField::VectorParams params;
  params.use_hnsw = true;
  params.dim = 2;
  params.sim = search::VectorSimilarity::L2;
  params.capacity = 10;
  params.hnsw_m = 16;
  params.hnsw_ef_construction = 200;
  schema.fields["v"].special_params = params;

  search::IndicesOptions options;
  search::FieldIndices indices{schema, options, PMR_NS::get_default_resource(), nullptr};

  search::SearchAlgorithm algo;
  search::QueryParams query_params;

  query_params["b"] = "abcdefg";

  // Parser accepts any string as placeholder
  // Invalid vectors result in empty vector (dimension 0) which returns empty results
  ASSERT_TRUE(algo.Init("*=>[KNN 2 @v $b]", &query_params));

  // Search should return empty results for invalid vector
  auto result = algo.Search(&indices);
  EXPECT_TRUE(result.ids.empty());
}

class SortIndexTest : public testing::Test {
 protected:
  void SetUp() override {
    InitTLSearchMR(PMR_NS::get_default_resource());
  }

  void TearDown() override {
    InitTLSearchMR(nullptr);
  }
};

TEST_F(SortIndexTest, StringSort) {
  constexpr auto field = "name";
  const auto schema = MakeSimpleSchema({{field, SchemaField::TAG}}, true);
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  indices.Add(0, MockedDocument{Map{{field, "charlie"}}});
  indices.Add(1, MockedDocument{Map{{field, "alpha"}}});
  indices.Add(2, MockedDocument{Map{{field, "bravo"}}});

  std::vector<DocId> ids{0, 1, 2};
  constexpr bool desc = false;

  const auto index = indices.GetSortIndex(field);

  index->Sort(&ids, ids.size(), desc);
  std::vector<DocId> expected{1, 2, 0};
  EXPECT_EQ(ids, expected);

  index->Sort(&ids, ids.size(), !desc);
  expected = {0, 2, 1};
  EXPECT_EQ(ids, expected);

  // conversion from stateless to normal string
  auto lookup = index->Lookup(1);
  EXPECT_TRUE(std::holds_alternative<std::string>(lookup));
  EXPECT_EQ(std::get<std::string>(lookup), "alpha");
}

TEST_F(SortIndexTest, NumSort) {
  constexpr auto field = "cost";
  const auto schema = MakeSimpleSchema({{field, SchemaField::NUMERIC}}, true);
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  indices.Add(0, MockedDocument{Map{{field, "2999"}}});
  indices.Add(1, MockedDocument{Map{{field, "999"}}});
  indices.Add(2, MockedDocument{Map{{field, "12"}}});

  std::vector<DocId> ids{0, 1, 2};
  constexpr bool desc = false;

  auto index = indices.GetSortIndex(field);
  index->Sort(&ids, ids.size(), desc);
  std::vector<DocId> expected{2, 1, 0};
  EXPECT_EQ(ids, expected);

  index->Sort(&ids, ids.size(), !desc);
  expected = {0, 1, 2};
  EXPECT_EQ(ids, expected);

  auto lookup = index->Lookup(1);
  EXPECT_TRUE(std::holds_alternative<double>(lookup));
  EXPECT_EQ(std::get<double>(lookup), 999);
}

// Enumeration for different search types
enum class SearchType { PREFIX = 0, SUFFIX = 1, INFIX = 2 };

// Helper function to generate content with ASCII characters
static std::string GenerateWordSequence(size_t word_count, size_t doc_offset = 0) {
  std::string content;
  for (size_t i = 0; i < word_count; ++i) {
    std::string word;
    char start_char = 'a' + ((doc_offset + i) % 26);
    size_t word_len = 3 + (i % 5);  // Word length 3-7 chars

    for (size_t j = 0; j < word_len; ++j) {
      char c = start_char + (j % 26);
      if (c > 'z')
        c = 'a' + (c - 'z' - 1);
      word += c;
    }

    if (i > 0)
      content += " ";
    content += word;
  }
  return content;
}

// Helper function to generate pattern with variety
static std::string GeneratePattern(SearchType search_type, size_t pattern_len, bool use_uniform) {
  if (use_uniform) {
    // Original uniform pattern for comparison
    switch (search_type) {
      case SearchType::PREFIX:
        return std::string(pattern_len, 'p');
      case SearchType::SUFFIX:
        return std::string(pattern_len, 's');
      case SearchType::INFIX:
        return std::string(pattern_len, 'i');
    }
  } else {
    // Diverse ASCII pattern
    std::string pattern;
    char base_char = (search_type == SearchType::PREFIX)   ? 'p'
                     : (search_type == SearchType::SUFFIX) ? 's'
                                                           : 'i';

    for (size_t i = 0; i < pattern_len; ++i) {
      char c = base_char + (i % 10);  // Use variety of chars
      if (c > 'z')
        c = 'a' + (c - 'z' - 1);
      pattern += c;
    }
    return pattern;
  }
  return "";
}

static void BM_SearchByTypeImpl(benchmark::State& state, bool use_diverse_pattern) {
  size_t num_docs = state.range(0);
  size_t pattern_len = state.range(1);
  SearchType search_type = static_cast<SearchType>(state.range(2));

  auto schema = MakeSimpleSchema({{"title", SchemaField::TEXT}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // Generate pattern
  std::string pattern = GeneratePattern(search_type, pattern_len, !use_diverse_pattern);
  std::string search_type_name = (search_type == SearchType::PREFIX)   ? "prefix"
                                 : (search_type == SearchType::SUFFIX) ? "suffix"
                                                                       : "infix";

  // Generate test data with more realistic content
  for (size_t i = 0; i < num_docs; i++) {
    std::string content;
    if (i < num_docs / 2) {
      // Half documents have the pattern in appropriate position
      std::string base_content = GenerateWordSequence(5 + (i % 5), i);

      switch (search_type) {
        case SearchType::PREFIX:
          content = pattern + base_content;
          break;
        case SearchType::SUFFIX:
          content = base_content + pattern;
          break;
        case SearchType::INFIX:
          // Fix: embed pattern inside a word, not as separate word
          size_t split_pos = base_content.length() / 2;
          content = base_content.substr(0, split_pos) + pattern + base_content.substr(split_pos);
          break;
      }
    } else {
      // Half don't have the pattern - generate different content
      content = GenerateWordSequence(8 + (i % 3), i + 1000);
    }
    MockedDocument doc{Map{{"title", content}}};
    indices.Add(i, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;
  std::string query;

  // Generate query based on search type
  switch (search_type) {
    case SearchType::PREFIX:
      query = pattern + "*";
      break;
    case SearchType::SUFFIX:
      query = "*" + pattern;
      break;
    case SearchType::INFIX:
      query = "*" + pattern + "*";
      break;
  }

  if (!algo.Init(query, &params)) {
    state.SkipWithError("Failed to initialize " + search_type_name + " search");
    return;
  }

  while (state.KeepRunning()) {
    auto result = algo.Search(&indices);
    benchmark::DoNotOptimize(result);

    // If result has error, skip the benchmark
    if (!result.error.empty()) {
      state.SkipWithError(search_type_name + " search returned error: " + result.error);
      return;
    }
  }

  // Set counters for analysis
  state.counters["docs_total"] = num_docs;
  state.counters["pattern_length"] = pattern_len;
  state.counters["diverse_pattern"] = use_diverse_pattern ? 1 : 0;
  state.SetLabel(search_type_name + (use_diverse_pattern ? "_diverse" : "_uniform"));
}

// Instantiate template functions
static void BM_SearchByType_Uniform(benchmark::State& state) {
  BM_SearchByTypeImpl(state, false);
}

static void BM_SearchByType_Diverse(benchmark::State& state) {
  BM_SearchByTypeImpl(state, true);
}

// Benchmark to compare all search types - removed 100K docs per romange's suggestion
BENCHMARK(BM_SearchByType_Uniform)
    // Uniform patterns (original test)
    ->Args({1000, 3, static_cast<int>(SearchType::PREFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::PREFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::PREFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::PREFIX)})
    ->Args({1000, 3, static_cast<int>(SearchType::SUFFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::SUFFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::SUFFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::SUFFIX)})
    ->Args({1000, 3, static_cast<int>(SearchType::INFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::INFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::INFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::INFIX)})
    ->ArgNames({"docs", "pattern_len", "search_type"})
    ->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_SearchByType_Diverse)
    // Diverse patterns (new test with ASCII variety)
    ->Args({1000, 3, static_cast<int>(SearchType::PREFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::PREFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::PREFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::PREFIX)})
    ->Args({1000, 3, static_cast<int>(SearchType::SUFFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::SUFFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::SUFFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::SUFFIX)})
    ->Args({1000, 3, static_cast<int>(SearchType::INFIX)})
    ->Args({1000, 5, static_cast<int>(SearchType::INFIX)})
    ->Args({10000, 3, static_cast<int>(SearchType::INFIX)})
    ->Args({10000, 5, static_cast<int>(SearchType::INFIX)})
    ->ArgNames({"docs", "pattern_len", "search_type"})
    ->Unit(benchmark::kMicrosecond);

// Helper function to generate random vector
static std::vector<float> GenerateRandomVector(size_t dims, unsigned seed = 42) {
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dis(-1.0f, 1.0f);

  std::vector<float> vec(dims);
  for (size_t i = 0; i < dims; ++i) {
    vec[i] = dis(gen);
  }
  return vec;
}

static void BM_SearchDocIds(benchmark::State& state) {
  auto schema = MakeSimpleSchema({{"score", SchemaField::NUMERIC}, {"tag", SchemaField::TAG}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  default_random_engine rnd;
  const char* tag_vals[] = {"test", "example", "sample", "demo", "demo2"};
  uniform_int_distribution<size_t> tag_dist(0, ABSL_ARRAYSIZE(tag_vals) - 1);
  uniform_int_distribution<size_t> score_dist(0, 100);

  for (size_t i = 0; i < 1000; i++) {
    MockedDocument doc{
        Map{{"score", std::to_string(score_dist(rnd))}, {"tag", tag_vals[tag_dist(rnd)]}}};
    indices.Add(i, doc);
  }

  std::string queries[] = {"@tag:{test} @score:[10 50]", "@tag: *", "@score:*"};
  size_t query_type = state.range(0);
  CHECK_LT(query_type, ABSL_ARRAYSIZE(queries));
  CHECK(algo.Init(queries[query_type], &params));
  while (state.KeepRunning()) {
    auto result = algo.Search(&indices);
    CHECK(result.error.empty());
  }
}
BENCHMARK(BM_SearchDocIds)->Range(0, 2);

static void BM_SearchNumericIndexes(benchmark::State& state) {
  auto schema = MakeSimpleSchema({{"numeric", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  default_random_engine rnd;

  using NumericType = long long;
  uniform_int_distribution<NumericType> dist(std::numeric_limits<NumericType>::min(),
                                             std::numeric_limits<NumericType>::max());

  const size_t num_docs = state.range(0);
  for (size_t i = 0; i < num_docs; i++) {
    MockedDocument doc{Map{{"numeric", std::to_string(dist(rnd))}}};
    indices.Add(i, doc);
  }

  std::string queries[] = {"@numeric:[15 +inf]", "@numeric:[-inf 20]", "@numeric:[-inf +inf]",
                           "@numeric:[0 100000]"};

  std::unordered_map<size_t, std::vector<size_t>> expected_results_per_num_docs = {
      {10000, {4982, 5018, 10000, 0}},
      {100000, {49885, 50115, 100000, 0}},
      {1000000, {500853, 499147, 1000000, 0}},
  };

  while (state.KeepRunning()) {
    for (size_t i = 0; i < ABSL_ARRAYSIZE(queries); ++i) {
      const auto& query = queries[i];

      CHECK(algo.Init(query, &params));
      auto result = algo.Search(&indices);
      CHECK(result.error.empty());

      const size_t expected_result = expected_results_per_num_docs[num_docs][i];
      CHECK_EQ(result.total, expected_result);
      CHECK_EQ(result.ids.size(), expected_result);
    }
  }
}

BENCHMARK(BM_SearchNumericIndexes)->Arg(10000)->Arg(100000)->Arg(1000000)->ArgNames({"num_docs"});

static void BM_SearchNumericIndexesSmallRanges(benchmark::State& state) {
  auto schema = MakeSimpleSchema({{"numeric", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  default_random_engine rnd;

  using NumericType = uint16_t;
  uniform_int_distribution<NumericType> dist(0, std::numeric_limits<NumericType>::max());

  const size_t num_docs = state.range(0);
  // Insert zero values
  for (size_t i = 0; i < num_docs / 50; i++) {
    MockedDocument doc{Map{{"numeric", "0"}}};
    indices.Add(i, doc);
  }
  for (size_t i = num_docs / 50; i < num_docs; i++) {
    MockedDocument doc{Map{{"numeric", std::to_string(dist(rnd))}}};
    indices.Add(i, doc);
  }

  std::string queries[] = {"@numeric:[0 40000]", "@numeric:[-inf +inf]"};

  std::unordered_map<size_t, std::vector<size_t>> expected_results_per_num_docs = {
      {100000, {61939, 100000}},
      {1000000, {618365, 1000000}},
  };

  while (state.KeepRunning()) {
    for (size_t i = 0; i < ABSL_ARRAYSIZE(queries); ++i) {
      const auto& query = queries[i];

      CHECK(algo.Init(query, &params));
      auto result = algo.Search(&indices);
      CHECK(result.error.empty());

      const size_t expected_result = expected_results_per_num_docs[num_docs][i];
      CHECK_EQ(result.total, expected_result);
      CHECK_EQ(result.ids.size(), expected_result);
    }
  }
}

BENCHMARK(BM_SearchNumericIndexesSmallRanges)
    ->Arg(100000)   // One block
    ->Arg(1000000)  // Two blocks
    ->ArgNames({"num_docs"});

static void BM_SearchTwoNumericIndexes(benchmark::State& state) {
  auto schema = MakeSimpleSchema({
      {"numeric1", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
      {"numeric2", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
  });

  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  std::default_random_engine rnd;

  using NumericType = long long;
  uniform_int_distribution<NumericType> dist1(std::numeric_limits<NumericType>::min(),
                                              std::numeric_limits<NumericType>::max());
  uniform_int_distribution<NumericType> dist2(std::numeric_limits<NumericType>::min(),
                                              std::numeric_limits<NumericType>::max());

  const size_t num_docs = state.range(0);
  for (size_t i = 0; i < num_docs; ++i) {
    MockedDocument doc{Map{
        {"numeric1", std::to_string(dist1(rnd))},
        {"numeric2", std::to_string(dist2(rnd))},
    }};
    indices.Add(i, doc);
  }

  std::string queries[] = {absl::StrCat("@numeric1:[15 +inf] @numeric2:[-inf 20]"),
                           absl::StrCat("@numeric1:[-inf 20] @numeric2:[15 +inf]"),
                           absl::StrCat("@numeric1:[0 100000] @numeric2:[-100000 0]"),
                           absl::StrCat("@numeric1:[-100000 0] @numeric2:[0 100000]")};

  std::unordered_map<size_t, std::vector<size_t>> expected_results_per_num_docs = {
      {10000, {2508, 2507, 0, 0}},
      {100000, {25119, 25232, 0, 0}},
      {1000000, {250623, 250643, 0, 0}},
  };

  while (state.KeepRunning()) {
    for (size_t i = 0; i < ABSL_ARRAYSIZE(queries); ++i) {
      const auto& query = queries[i];

      CHECK(algo.Init(query, &params));
      auto result = algo.Search(&indices);
      CHECK(result.error.empty());

      const size_t expected_result = expected_results_per_num_docs[num_docs][i];
      CHECK_EQ(result.total, expected_result);
      CHECK_EQ(result.ids.size(), expected_result);
    }
  }
}

BENCHMARK(BM_SearchTwoNumericIndexes)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->ArgNames({"num_docs"});

static void BM_SearchNumericAndTagIndexes(benchmark::State& state) {
  auto schema = MakeSimpleSchema({{"tag", SchemaField::TAG},
                                  {"numeric", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  default_random_engine rnd;

  using NumericType = long long;
  uniform_int_distribution<NumericType> dist(std::numeric_limits<NumericType>::min(),
                                             std::numeric_limits<NumericType>::max());

  size_t tag_number = 0;
  const size_t max_tag_number = 1000;

  const size_t num_docs = state.range(0);
  for (size_t i = 0; i < num_docs; i++) {
    MockedDocument doc{
        Map{{"tag", absl::StrCat("tag", tag_number)}, {"numeric", std::to_string(dist(rnd))}}};
    indices.Add(i, doc);

    tag_number = (tag_number + 1) % max_tag_number;
  }

  std::string queries[] = {absl::StrCat("@tag:{tag230|tag3|tag942} @numeric:[15 +inf]"),
                           absl::StrCat("@tag:{tag1|tag829|tag236} @numeric:[-inf 20]"),
                           absl::StrCat("@tag:{tag0|tag999} @numeric:[-1000000 +inf]")};

  std::unordered_map<size_t, std::vector<size_t>> expected_results_per_num_docs = {
      {10000, {19, 16, 8}},
      {100000, {164, 157, 97}},
      {1000000, {1528, 1518, 1017}},
  };

  while (state.KeepRunning()) {
    for (size_t i = 0; i < ABSL_ARRAYSIZE(queries); ++i) {
      const auto& query = queries[i];

      CHECK(algo.Init(query, &params));
      auto result = algo.Search(&indices);
      CHECK(result.error.empty());

      const size_t expected_result = expected_results_per_num_docs[num_docs][i];
      CHECK_EQ(result.total, expected_result);
      CHECK_EQ(result.ids.size(), expected_result);
    }
  }
}

BENCHMARK(BM_SearchNumericAndTagIndexes)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->ArgNames({"num_docs"});

static void BM_SearchSeveralNumericAndTagIndexes(benchmark::State& state) {
  auto schema = MakeSimpleSchema({{"tag", SchemaField::TAG},
                                  {"numeric1", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
                                  {"numeric2", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
                                  {"numeric3", SchemaField::NUMERIC,
                                   SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  default_random_engine rnd;

  using NumericType = uint16_t;
  uniform_int_distribution<NumericType> dist(std::numeric_limits<NumericType>::min(),
                                             std::numeric_limits<NumericType>::max());

  const size_t num_docs = state.range(0);

  size_t tag_number = 0;
  const size_t max_tag_number = num_docs / 30;

  for (size_t i = 0; i < num_docs; i++) {
    MockedDocument doc{Map{{"tag", absl::StrCat("tag", tag_number)},
                           {"numeric1", std::to_string(dist(rnd))},
                           {"numeric2", std::to_string(dist(rnd))},
                           {"numeric3", std::to_string(dist(rnd))}}};
    indices.Add(i, doc);

    tag_number = (tag_number + 1) % max_tag_number;
  }

  std::string queries[] = {
      absl::StrCat(
          "@tag:{tag230|tag3} @numeric1:[0 10000] @numeric2:[20000 30000] @numeric3:[-1000 +inf]"),
      absl::StrCat("@tag:{tag829|tag236} @numeric1:[-inf 10000] @numeric2:[40000 +inf] "
                   "@numeric3:[10000 30000]"),
      absl::StrCat(
          "@tag:{tag0|tag999} @numeric1:[-inf +inf] @numeric2:[20 +inf] @numeric3:[1000 10000]")};

  std::unordered_map<size_t, std::vector<size_t>> expected_results_per_num_docs = {
      {10000, {1, 0, 4}},
      {100000, {1, 1, 10}},
      {1000000, {0, 1, 9}},
  };

  while (state.KeepRunning()) {
    for (size_t i = 0; i < ABSL_ARRAYSIZE(queries); ++i) {
      const auto& query = queries[i];

      CHECK(algo.Init(query, &params));
      auto result = algo.Search(&indices);
      CHECK(result.error.empty());

      const size_t expected_result = expected_results_per_num_docs[num_docs][i];
      CHECK_EQ(result.total, expected_result);
      CHECK_EQ(result.ids.size(), expected_result);
    }
  }
}

BENCHMARK(BM_SearchSeveralNumericAndTagIndexes)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->ArgNames({"num_docs"});

static void BM_SearchMergeEqualSets(benchmark::State& state) {
  auto schema = MakeSimpleSchema({
      {"numeric1", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
      {"numeric2", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
  });

  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  SearchAlgorithm algo;
  QueryParams params;
  std::default_random_engine rnd;

  using NumericType = long long;
  uniform_int_distribution<NumericType> dist1(std::numeric_limits<NumericType>::min(),
                                              std::numeric_limits<NumericType>::max());
  uniform_int_distribution<NumericType> dist2(std::numeric_limits<NumericType>::min(),
                                              std::numeric_limits<NumericType>::max());

  const size_t num_docs = state.range(0);
  for (size_t i = 0; i < num_docs; ++i) {
    MockedDocument doc{Map{
        {"numeric1", std::to_string(dist1(rnd))},
        {"numeric2", std::to_string(dist2(rnd))},
    }};
    indices.Add(i, doc);
  }

  std::string query = absl::StrCat("@numeric1:[-inf +inf] @numeric2:[-inf +inf]");

  while (state.KeepRunning()) {
    CHECK(algo.Init(query, &params));
    auto result = algo.Search(&indices);
    CHECK(result.error.empty());

    // All documents should match both conditions, so total should equal num_docs
    CHECK_EQ(result.total, num_docs);
    CHECK_EQ(result.ids.size(), num_docs);
  }
}

BENCHMARK(BM_SearchMergeEqualSets)
    ->Arg(100)
    ->Arg(1000)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->ArgNames({"num_docs"});

static void BM_SearchRangeTreeSplits(benchmark::State& state) {
  auto schema = MakeSimpleSchema({
      {"num", SchemaField::NUMERIC, SchemaField::NumericParams{}},
  });

  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  const size_t batch_size = state.range(0);
  std::default_random_engine rnd;

  using NumericType = long long;
  uniform_int_distribution<NumericType> dist(0, batch_size + 1);

  size_t doc_index = 0;
  while (state.KeepRunning()) {
    for (size_t i = 0; i < batch_size; i++) {
      MockedDocument doc{Map{{"num", std::to_string(dist(rnd))}}};
      indices.Add(doc_index++, doc);
    }
  }
}

BENCHMARK(BM_SearchRangeTreeSplits)
    ->Arg(100000)
    ->Arg(1000000)
    ->Arg(3000000)
    ->ArgNames({"batch_size"});

// Semantics test for cosine on zero vectors (independent of SimSIMD)
TEST(CosineDistanceTest, ZeroVectors) {
  const size_t dims = 128;
  std::vector<float> zero(dims, 0.0f);
  float d = VectorDistance(zero.data(), zero.data(), dims, VectorSimilarity::COSINE);
  EXPECT_EQ(d, 0.0f);
}

// Unified vector distance benchmarks using VectorDistance function
static void BM_VectorDistance(benchmark::State& state) {
  // Ensure SimSIMD dynamic dispatch is initialized for the benchmark
  InitSimSIMD();
  size_t dims = state.range(0);
  size_t num_pairs = state.range(1);
  VectorSimilarity sim = static_cast<VectorSimilarity>(state.range(2));

  std::vector<std::vector<float>> vectors_a, vectors_b;
  vectors_a.reserve(num_pairs);
  vectors_b.reserve(num_pairs);

  for (size_t i = 0; i < num_pairs; ++i) {
    vectors_a.push_back(GenerateRandomVector(dims, i));
    vectors_b.push_back(GenerateRandomVector(dims, i + 1000));
  }

  size_t pair_idx = 0;
  for (auto _ : state) {
    float distance =
        VectorDistance(vectors_a[pair_idx].data(), vectors_b[pair_idx].data(), dims, sim);
    benchmark::DoNotOptimize(distance);
    pair_idx = (pair_idx + 1) % num_pairs;
  }

  state.counters["dims"] = dims;
  state.counters["pairs"] = num_pairs;

  std::string sim_name = (sim == VectorSimilarity::L2)       ? "L2"
                         : (sim == VectorSimilarity::COSINE) ? "Cosine"
                                                             : "IP";
  state.SetLabel(sim_name);
}

// Intensive benchmark with batch processing
static void BM_VectorDistance_Intensive(benchmark::State& state) {
  // Ensure SimSIMD dynamic dispatch is initialized for the benchmark
  InitSimSIMD();
  size_t dims = 512;  // Fixed medium size
  size_t batch_size = 1000;
  VectorSimilarity sim = static_cast<VectorSimilarity>(state.range(0));

  std::vector<std::vector<float>> vectors_a, vectors_b;
  vectors_a.reserve(batch_size);
  vectors_b.reserve(batch_size);

  for (size_t i = 0; i < batch_size; ++i) {
    vectors_a.push_back(GenerateRandomVector(dims, i));
    vectors_b.push_back(GenerateRandomVector(dims, i + 4000));
  }

  size_t total_ops = 0;
  while (state.KeepRunning()) {
    for (size_t i = 0; i < batch_size; ++i) {
      float distance = VectorDistance(vectors_a[i].data(), vectors_b[i].data(), dims, sim);
      benchmark::DoNotOptimize(distance);
      ++total_ops;
    }
  }

  state.counters["ops"] = total_ops;
  state.counters["ops_per_sec"] = benchmark::Counter(total_ops, benchmark::Counter::kIsRate);

  std::string sim_name = (sim == VectorSimilarity::L2)       ? "L2"
                         : (sim == VectorSimilarity::COSINE) ? "Cosine"
                                                             : "IP";
  state.SetLabel(sim_name + "_Intensive");
}

// Benchmark declarations
BENCHMARK(BM_VectorDistance)
    // Small vectors - L2 Distance
    ->Args({32, 100, static_cast<int>(VectorSimilarity::L2)})
    ->Args({32, 1000, static_cast<int>(VectorSimilarity::L2)})
    ->Args({32, 10000, static_cast<int>(VectorSimilarity::L2)})
    // Medium vectors - L2 Distance
    ->Args({128, 100, static_cast<int>(VectorSimilarity::L2)})
    ->Args({128, 1000, static_cast<int>(VectorSimilarity::L2)})
    ->Args({128, 10000, static_cast<int>(VectorSimilarity::L2)})
    // Large vectors - L2 Distance
    ->Args({512, 100, static_cast<int>(VectorSimilarity::L2)})
    ->Args({512, 1000, static_cast<int>(VectorSimilarity::L2)})
    ->Args({512, 5000, static_cast<int>(VectorSimilarity::L2)})
    // Very large vectors - L2 Distance
    ->Args({1536, 100, static_cast<int>(VectorSimilarity::L2)})
    ->Args({1536, 1000, static_cast<int>(VectorSimilarity::L2)})

    // Small vectors - Cosine Distance
    ->Args({32, 100, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({32, 1000, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({32, 10000, static_cast<int>(VectorSimilarity::COSINE)})
    // Medium vectors - Cosine Distance
    ->Args({128, 100, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({128, 1000, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({128, 10000, static_cast<int>(VectorSimilarity::COSINE)})
    // Large vectors - Cosine Distance
    ->Args({512, 100, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({512, 1000, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({512, 5000, static_cast<int>(VectorSimilarity::COSINE)})
    // Very large vectors - Cosine Distance
    ->Args({1536, 100, static_cast<int>(VectorSimilarity::COSINE)})
    ->Args({1536, 1000, static_cast<int>(VectorSimilarity::COSINE)})

    // Small vectors - IP Distance
    ->Args({32, 100, static_cast<int>(VectorSimilarity::IP)})
    ->Args({32, 1000, static_cast<int>(VectorSimilarity::IP)})
    ->Args({32, 10000, static_cast<int>(VectorSimilarity::IP)})
    // Medium vectors - IP Distance
    ->Args({128, 100, static_cast<int>(VectorSimilarity::IP)})
    ->Args({128, 1000, static_cast<int>(VectorSimilarity::IP)})
    ->Args({128, 10000, static_cast<int>(VectorSimilarity::IP)})
    // Large vectors - IP Distance
    ->Args({512, 100, static_cast<int>(VectorSimilarity::IP)})
    ->Args({512, 1000, static_cast<int>(VectorSimilarity::IP)})
    ->Args({512, 5000, static_cast<int>(VectorSimilarity::IP)})
    // Very large vectors - IP Distance
    ->Args({1536, 100, static_cast<int>(VectorSimilarity::IP)})
    ->Args({1536, 1000, static_cast<int>(VectorSimilarity::IP)})
    ->ArgNames({"dims", "pairs", "similarity"})
    ->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_VectorDistance_Intensive)
    ->Arg(static_cast<int>(VectorSimilarity::L2))
    ->Arg(static_cast<int>(VectorSimilarity::COSINE))
    ->Arg(static_cast<int>(VectorSimilarity::IP))
    ->ArgNames({"similarity_type"})
    ->Unit(benchmark::kMicrosecond);

}  // namespace search
}  // namespace dfly


================================================
FILE: src/core/search/sort_indices.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/sort_indices.h"

#include <absl/strings/ascii.h>
#include <absl/strings/str_split.h>
#include <base/logging.h>

#include <algorithm>
#include <optional>
#include <type_traits>
#include <variant>

namespace dfly::search {

using namespace std;

namespace {
template <typename T>
using ScoreT = std::conditional_t<is_same_v<T, StatelessString>, std::string, T>;
}  // namespace

template <typename T> bool SimpleValueSortIndex<T>::ParsedSortValue::HasValue() const {
  return !std::holds_alternative<std::monostate>(value);
}

template <typename T> bool SimpleValueSortIndex<T>::ParsedSortValue::IsNullValue() const {
  return std::holds_alternative<std::nullopt_t>(value);
}

template <typename T> SortableValue SimpleValueSortIndex<T>::Lookup(DocId doc) const {
  DCHECK_LT(doc, occupied_.size());
  if (!occupied_[doc])
    return std::monostate{};

  DCHECK_LT(doc, values_.size());
  return ScoreT<T>{values_[doc]};
}

template <typename T>
std::vector<SortableValue> SimpleValueSortIndex<T>::Sort(std::vector<DocId>* ids, size_t limit,
                                                         bool desc) const {
  auto cb = [this, desc](const auto& lhs, const auto& rhs) {
    // null values are at the end
    auto p1 = make_pair(!occupied_[lhs], cref(values_[lhs]));
    auto p2 = make_pair(!occupied_[rhs], cref(values_[rhs]));
    return desc ? (p1 > p2) : (p1 < p2);
  };
  std::partial_sort(ids->begin(), ids->begin() + std::min(ids->size(), limit), ids->end(), cb);

  // Turn stateless string into std::string
  vector<SortableValue> out(min(ids->size(), limit));
  for (size_t i = 0; i < out.size(); i++)
    out[i] = ScoreT<T>{values_[(*ids)[i]]};
  return out;
}

template <typename T>
bool SimpleValueSortIndex<T>::Add(DocId id, const DocumentAccessor& doc, std::string_view field) {
  auto field_value = Get(doc, field);
  if (!field_value.HasValue()) {
    return false;
  }

  if (id >= values_.size()) {
    values_.resize(id + 1);
    occupied_.resize(id + 1);
  }

  if (!field_value.IsNullValue()) {
    values_[id] = std::move(std::get<T>(field_value.value));
    occupied_[id] = true;
  }
  return true;
}

template <typename T>
void SimpleValueSortIndex<T>::Remove(DocId id, const DocumentAccessor& doc,
                                     std::string_view field) {
  DCHECK_LT(id, values_.size());
  DCHECK_EQ(values_.size(), occupied_.size());
  values_[id] = T{};
  occupied_[id] = false;
}

template <typename T>
std::vector<DocId> SimpleValueSortIndex<T>::GetAllDocsWithNonNullValues() const {
  std::vector<DocId> result;
  result.reserve(values_.size());

  for (DocId id = 0; id < values_.size(); ++id) {
    if (occupied_[id])
      result.push_back(id);
  }

  return result;
}

template struct SimpleValueSortIndex<double>;
template struct SimpleValueSortIndex<StatelessString>;

SimpleValueSortIndex<double>::ParsedSortValue NumericSortIndex::Get(const DocumentAccessor& doc,
                                                                    std::string_view field) {
  auto numbers_list = doc.GetNumbers(field);
  if (!numbers_list) {
    return {};
  }
  if (numbers_list->empty()) {
    return ParsedSortValue{std::nullopt};
  }
  return ParsedSortValue{numbers_list->front()};
}

SimpleValueSortIndex<StatelessString>::ParsedSortValue StringSortIndex::Get(
    const DocumentAccessor& doc, std::string_view field) {
  auto strings_list = doc.GetTags(field);
  if (!strings_list) {
    return {};
  }
  if (strings_list->empty()) {
    return ParsedSortValue{std::nullopt};
  }
  return ParsedSortValue{StatelessString{strings_list->front()}};
}

}  // namespace dfly::search


================================================
FILE: src/core/search/sort_indices.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "core/search/base.h"
#include "core/search/stateless_allocator.h"

namespace dfly::search {

using StatelessString =
    std::basic_string<char, std::char_traits<char>, StatelessSearchAllocator<char>>;
static_assert(sizeof(StatelessString) == sizeof(std::string));

template <typename T> using StatelessVector = std::vector<T, StatelessSearchAllocator<T>>;
static_assert(sizeof(StatelessVector<StatelessString>) == sizeof(std::vector<std::string>));

template <typename T> struct SimpleValueSortIndex : BaseSortIndex {
 protected:
  struct ParsedSortValue {
    bool HasValue() const;
    bool IsNullValue() const;

    // std::monostate - no value was found.
    // std::nullopt - found value is null.
    // T - found value.
    std::variant<std::monostate, std::nullopt_t, T> value;
  };

 public:
  SortableValue Lookup(DocId doc) const override;
  std::vector<SortableValue> Sort(std::vector<DocId>* ids, size_t limit, bool desc) const override;

  bool Add(DocId id, const DocumentAccessor& doc, std::string_view field) override;
  void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;

  // Override GetAllResults to return all documents with non-null values
  std::vector<DocId> GetAllDocsWithNonNullValues() const override;

 protected:
  virtual ParsedSortValue Get(const DocumentAccessor& doc, std::string_view field_value) = 0;

 private:
  StatelessVector<T> values_;
  StatelessVector<bool> occupied_;  // instead of optional<T> in values to avoid memory overhead
};

struct NumericSortIndex : SimpleValueSortIndex<double> {
  ParsedSortValue Get(const DocumentAccessor& doc, std::string_view field) override;
};

// TODO: Map tags to integers for fast sort
struct StringSortIndex : SimpleValueSortIndex<StatelessString> {
  ParsedSortValue Get(const DocumentAccessor& doc, std::string_view field) override;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/stateless_allocator.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#pragma once

#include <cassert>

#include "base/pmr/memory_resource.h"
#include "core/detail/stateless_allocator.h"

namespace dfly {

namespace detail {
inline thread_local PMR_NS::memory_resource* search_tl_mr = nullptr;
}

template <typename T>
class StatelessSearchAllocator : public StatelessAllocatorBase<T, StatelessSearchAllocator<T>> {
 public:
  StatelessSearchAllocator() noexcept {
    assert(detail::search_tl_mr != nullptr);
  }

  template <typename U>
  StatelessSearchAllocator(const StatelessSearchAllocator<U>&) noexcept {  // NOLINT
  }

  static PMR_NS::memory_resource* resource() {
    return detail::search_tl_mr;
  }
};

template <typename T, typename U>
bool operator==(const StatelessSearchAllocator<T>&, const StatelessSearchAllocator<U>&) noexcept {
  return true;
}

template <typename T, typename U>
bool operator!=(const StatelessSearchAllocator<T>&, const StatelessSearchAllocator<U>&) noexcept {
  return false;
}

inline void InitTLSearchMR(PMR_NS::memory_resource* mr) {
  detail::search_tl_mr = mr;
}

}  // namespace dfly


================================================
FILE: src/core/search/synonyms.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "synonyms.h"

#include <absl/strings/str_cat.h>
#include <uni_algo/case.h>

namespace dfly::search {

const absl::flat_hash_map<std::string, Synonyms::Group>& Synonyms::GetGroups() const {
  return groups_;
}

void Synonyms::UpdateGroup(const std::string_view& id, const std::vector<std::string_view>& terms) {
  auto& group = groups_[id];

  // Convert all terms to lowercase before adding them to the group
  for (const std::string_view& term : terms) {
    group.insert(una::cases::to_lowercase_utf8(term));
  }
}

std::optional<std::string> Synonyms::GetGroupToken(std::string term) const {
  term = una::cases::to_lowercase_utf8(term);
  for (const auto& [id, group] : groups_) {
    if (group.count(term)) {
      // Add space before group id to avoid matching the term itself
      return absl::StrCat(" ", id);
    }
  }

  return std::nullopt;
}

}  // namespace dfly::search


================================================
FILE: src/core/search/synonyms.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

namespace dfly::search {

// Class that manages synonym groups for search indices.
// Allows defining groups of related terms that should be considered equivalent during search.
// All terms are converted to lowercase for normalization.
//
// When retrieving a group token via GetGroupToken, the group identifier is returned with a space
// prefix. The space is intentionally added to avoid matching with the term itself during text
// tokenization and to distinguish the group identifier from regular terms during search.
class Synonyms {
 public:
  // Represents a group of synonymous terms
  using Group = absl::flat_hash_set<std::string>;

  // Get all synonym groups
  const absl::flat_hash_map<std::string, Group>& GetGroups() const;

  // Update or create a synonym group
  void UpdateGroup(const std::string_view& id, const std::vector<std::string_view>& terms);

  // Get the group ID for a term
  std::optional<std::string> GetGroupToken(std::string term) const;

 private:
  // Maps group ID to synonym group
  absl::flat_hash_map<std::string, Group> groups_;
};

}  // namespace dfly::search


================================================
FILE: src/core/search/tag_types.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

namespace dfly {
namespace search {

enum class TagType { PREFIX, SUFFIX, INFIX, REGULAR };

}  // namespace search
}  // namespace dfly


================================================
FILE: src/core/search/vector_utils.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/search/vector_utils.h"

#include <cmath>
#include <memory>

#include "base/logging.h"

namespace dfly::search {

using namespace std;

namespace {

#ifdef WITH_SIMSIMD
#include <simsimd/simsimd.h>
#endif

#if defined(__GNUC__) && !defined(__clang__)
#define FAST_MATH __attribute__((optimize("fast-math")))
#else
#define FAST_MATH
#endif

OwnedFtVector ConvertToFtVector(string_view value) {
  // Value cannot be casted directly as it might be not aligned as a float (4 bytes).
  // Misaligned memory access is UB.
  size_t size = value.size() / sizeof(float);
  auto out = make_unique<float[]>(size);
  memcpy(out.get(), value.data(), size * sizeof(float));

  return OwnedFtVector{std::move(out), size};
}

}  // namespace

// Euclidean vector distance: sqrt( sum: (u[i] - v[i])^2  )
FAST_MATH float L2Distance(const float* u, const float* v, size_t dims) {
#ifdef WITH_SIMSIMD
  simsimd_distance_t distance = 0;
  simsimd_l2_f32(u, v, dims, &distance);
  return static_cast<float>(distance);
#else
  float sum = 0;
  for (size_t i = 0; i < dims; i++)
    sum += (u[i] - v[i]) * (u[i] - v[i]);
  return sqrt(sum);
#endif
}

// Inner product distance: 1 - dot_product(u, v)
// For normalized vectors, this is equivalent to cosine distance
FAST_MATH float IPDistance(const float* u, const float* v, size_t dims) {
#ifdef WITH_SIMSIMD
  // Use SimSIMD dot product and convert to inner product distance: 1 - dot(u, v).
  simsimd_distance_t dot = 0;
  simsimd_dot_f32(u, v, dims, &dot);
  return 1.0f - static_cast<float>(dot);
#else
  float sum_uv = 0;
  for (size_t i = 0; i < dims; i++)
    sum_uv += u[i] * v[i];
  return 1.0f - sum_uv;
#endif
}

// Cosine distance: 1 - (dot_product(u, v) / (||u|| * ||v||))
FAST_MATH float CosineDistance(const float* u, const float* v, size_t dims) {
#ifdef WITH_SIMSIMD
  simsimd_distance_t distance = 0;
  simsimd_cos_f32(u, v, dims, &distance);
  return static_cast<float>(distance);
#else
  float sum_uv = 0, sum_uu = 0, sum_vv = 0;
  for (size_t i = 0; i < dims; i++) {
    sum_uv += u[i] * v[i];
    sum_uu += u[i] * u[i];
    sum_vv += v[i] * v[i];
  }

  if (float denom = sum_uu * sum_vv; denom != 0.0f)
    return 1 - sum_uv / sqrt(denom);
  return 0.0f;
#endif
}

OwnedFtVector BytesToFtVector(string_view value) {
  DCHECK_EQ(value.size() % sizeof(float), 0u) << value.size();
  return ConvertToFtVector(value);
}

std::optional<OwnedFtVector> BytesToFtVectorSafe(string_view value) {
  if (value.size() % sizeof(float)) {
    return std::nullopt;
  }
  return ConvertToFtVector(value);
}

float VectorDistance(const float* u, const float* v, size_t dims, VectorSimilarity sim) {
  switch (sim) {
    case VectorSimilarity::L2:
      return L2Distance(u, v, dims);
    case VectorSimilarity::IP:
      return IPDistance(u, v, dims);
    case VectorSimilarity::COSINE:
      return CosineDistance(u, v, dims);
  };
  return 0.0f;
}

void InitSimSIMD() {
#if defined(WITH_SIMSIMD)
  (void)simsimd_capabilities();
#endif
}

}  // namespace dfly::search


================================================
FILE: src/core/search/vector_utils.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "core/search/base.h"

namespace dfly::search {

// Initializes SimSIMD runtime if dynamic dispatch is enabled.
void InitSimSIMD();

OwnedFtVector BytesToFtVector(std::string_view value);

// Returns std::nullopt if value can not be converted to the vector
// TODO: Remove unsafe version
std::optional<OwnedFtVector> BytesToFtVectorSafe(std::string_view value);

float L2Distance(const float* u, const float* v, size_t dims);
float IPDistance(const float* u, const float* v, size_t dims);
float CosineDistance(const float* u, const float* v, size_t dims);
float VectorDistance(const float* u, const float* v, size_t dims, VectorSimilarity sim);

}  // namespace dfly::search


================================================
FILE: src/core/segment_allocator.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/segment_allocator.h"

#define MI_BUILD_RELEASE 1
#include <mimalloc/types.h>

#include "base/logging.h"

namespace dfly {

SegmentAllocator::SegmentAllocator(mi_heap_t* heap) : heap_(heap) {
  // 256GB
  constexpr size_t limit = 1ULL << 35;
  static_assert((1ULL << (kSegmentIdBits + kSegmentShift)) == limit);
  // mimalloc uses 32MiB segments and we might need change this code if it changes.
  static_assert(kSegmentShift == MI_SEGMENT_SHIFT);
  static_assert((~kSegmentAlignMask) == (MI_SEGMENT_MASK));
}

void SegmentAllocator::ValidateMapSize() {
  if (address_table_.size() > (1u << kSegmentIdBits)) {
    // This can happen if we restrict dragonfly to small number of threads on high-memory machine,
    // for example.
    LOG(WARNING) << "address_table_ map is growing too large: " << address_table_.size();
  }
}

bool SegmentAllocator::CanAllocate() {
  return address_table_.size() < (1u << kSegmentIdBits);
}

}  // namespace dfly


================================================
FILE: src/core/segment_allocator.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>
#include <mimalloc.h>

/***
 * This class is tightly coupled with mimalloc segment allocation logic and is designed to provide
 * a compact pointer representation (4bytes ptr) over 64bit address space that gives you
 * 32GB of allocations with option to extend it to 32*256GB if needed.
 *
 */

namespace dfly {

/**
 * @brief Tightly coupled with mi_malloc 2.x implementation.
 *        Fetches 32MiB segment pointers from the allocated pointers.
 *        Provides own indexing of small pointers to real address space using the segment ptrs/
 */

class SegmentAllocator {
  // (2 ^ 10) total segments
  static constexpr uint32_t kSegmentIdBits = 10;
  static constexpr uint32_t kSegmentIdMask = (1u << kSegmentIdBits) - 1;
  // (2 ^ 25) total bytes per segment = 32MiB
  static constexpr uint32_t kSegmentShift = 25;

  // Segment range that we cover within a single segment.
  static constexpr uint64_t kSegmentAlignMask = ~((1ULL << kSegmentShift) - 1);

 public:
  using Ptr = uint32_t;

  SegmentAllocator(mi_heap_t* heap);
  bool CanAllocate();

  uint8_t* Translate(Ptr p) const {
    return address_table_[p & kSegmentIdMask] + Offset(p);
  }

  std::pair<Ptr, uint8_t*> Allocate(uint32_t size);

  void Free(Ptr ptr) {
    void* p = Translate(ptr);
    used_ -= mi_usable_size(p);
    mi_free(p);
  }

  mi_heap_t* heap() {
    return heap_;
  }

  size_t used() const {
    return used_;
  }

 private:
  static uint32_t Offset(Ptr p) {
    return (p >> kSegmentIdBits) * 8;
  }

  void ValidateMapSize();

  std::vector<uint8_t*> address_table_;
  absl::flat_hash_map<uint64_t, uint16_t> rev_indx_;
  mi_heap_t* heap_;
  size_t used_ = 0;
};

inline auto SegmentAllocator::Allocate(uint32_t size) -> std::pair<Ptr, uint8_t*> {
  void* ptr = mi_heap_malloc(heap_, size);
  if (!ptr)
    throw std::bad_alloc{};

  uint64_t iptr = (uint64_t)ptr;
  uint64_t seg_ptr = iptr & kSegmentAlignMask;

  // could be speed up using last used seg_ptr.
  auto [it, inserted] = rev_indx_.emplace(seg_ptr, address_table_.size());
  if (inserted) {
    ValidateMapSize();
    address_table_.push_back((uint8_t*)seg_ptr);
  }

  uint32_t seg_offset = (iptr - seg_ptr) / 8;
  Ptr res = (seg_offset << kSegmentIdBits) | it->second;
  used_ += mi_good_size(size);

  return std::make_pair(res, (uint8_t*)ptr);
}

}  // namespace dfly


================================================
FILE: src/core/size_tracking_channel.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <atomic>

#include "util/fibers/simple_channel.h"

namespace dfly {

// SimpleQueue-like interface, but also keeps track over the size of Ts it owns.
// It has a slightly less efficient TryPush() API as it forces construction of Ts even if they are
// not pushed.
// T must have a .size() method, which should return the heap-allocated size of T, excluding
// anything included in sizeof(T). We could generalize this in the future.
template <typename T, typename Queue = folly::ProducerConsumerQueue<T>> class SizeTrackingChannel {
 public:
  SizeTrackingChannel(size_t n, unsigned num_producers = 1) : queue_(n, num_producers) {
  }

  // Here and below, we must accept a T instead of building it from variadic args, as we need to
  // know its size in case it is added.
  size_t Push(T t) noexcept {
    size_t tsize = t.size();
    size_t res = size_.fetch_add(tsize, std::memory_order_relaxed);
    queue_.Push(std::move(t));
    return res + tsize;
  }

  bool TryPush(T t) noexcept {
    const size_t size = t.size();
    if (queue_.TryPush(std::move(t))) {
      size_.fetch_add(size, std::memory_order_relaxed);
      return true;
    }

    return false;
  }

  bool Pop(T& dest) {
    if (queue_.Pop(dest)) {
      size_.fetch_sub(dest.size(), std::memory_order_relaxed);
      return true;
    }

    return false;
  }

  void StartClosing() {
    queue_.StartClosing();
  }

  bool TryPop(T& dest) {
    if (queue_.TryPop(dest)) {
      size_.fetch_sub(dest.size(), std::memory_order_relaxed);
      return true;
    }

    return false;
  }

  bool IsClosing() const {
    return queue_.IsClosing();
  }

  size_t GetSize() const {
    return queue_.Capacity() * sizeof(T) + size_.load(std::memory_order_relaxed);
  }

 private:
  util::fb2::SimpleChannel<T, Queue> queue_;
  std::atomic<size_t> size_ = 0;
};

}  // namespace dfly


================================================
FILE: src/core/small_string.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/small_string.h"

#include <mimalloc.h>
#include <xxhash.h>

#include <memory>

#include "base/logging.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/segment_allocator.h"

namespace dfly {
using namespace std;

namespace {

class XXH3_Deleter {
 public:
  void operator()(XXH3_state_t* ptr) const {
    XXH3_freeState(ptr);
  }
};

struct TL {
  unique_ptr<XXH3_state_t, XXH3_Deleter> xxh_state;
  unique_ptr<SegmentAllocator> seg_alloc;
};

thread_local TL tl;

constexpr XXH64_hash_t kHashSeed = 24061983;  // same as in compact_object.cc

}  // namespace

void SmallString::InitThreadLocal(void* heap) {
  SegmentAllocator* ns = new SegmentAllocator((mi_heap_t*)heap);

  tl.seg_alloc.reset(ns);
  tl.xxh_state.reset(XXH3_createState());
  XXH3_64bits_reset_withSeed(tl.xxh_state.get(), kHashSeed);
}

bool SmallString::CanAllocate(size_t size) {
  return size <= kMaxSize && tl.seg_alloc->CanAllocate();
}

size_t SmallString::UsedThreadLocal() {
  return tl.seg_alloc ? tl.seg_alloc->used() : 0;
}

static_assert(sizeof(SmallString) == 16);

size_t SmallString::Assign(std::string_view s) {
  DCHECK_GT(s.size(), kPrefLen);
  DCHECK(CanAllocate(s.size()));
  uint8_t* realptr = nullptr;

  // reallocate if we need a larger allocation or it becomes space-inefficient
  size_t heap_len = s.size() - kPrefLen;
  if (size_t available = MallocUsed(); available < heap_len || heap_len * 2 < available) {
    Free();

    auto [sp, rp] = tl.seg_alloc->Allocate(heap_len);
    small_ptr_ = sp;
    realptr = rp;
  } else {
    realptr = tl.seg_alloc->Translate(small_ptr_);
  }

  size_ = s.size();
  memcpy(prefix_, s.data(), kPrefLen);
  memcpy(realptr, s.data() + kPrefLen, heap_len);
  return mi_malloc_usable_size(realptr);
}

void SmallString::Free() {
  if (size_)
    tl.seg_alloc->Free(small_ptr_);
  size_ = 0;
}

uint16_t SmallString::MallocUsed() const {
  if (size_)
    return mi_malloc_usable_size(tl.seg_alloc->Translate(small_ptr_));
  return 0;
}

bool SmallString::Equal(std::string_view o) const {
  if (size_ != o.size())
    return false;

  if (size_ == 0)
    return true;

  if (memcmp(prefix_, o.data(), kPrefLen) != 0)
    return false;

  uint8_t* realp = tl.seg_alloc->Translate(small_ptr_);
  return memcmp(realp, o.data() + kPrefLen, size_ - kPrefLen) == 0;
}

bool SmallString::Equal(const SmallString& os) const {
  if (size_ != os.size_)
    return false;

  return Get() == os.Get();
}

uint64_t SmallString::HashCode() const {
  array<string_view, 2> slice = Get();

  XXH3_state_t* state = tl.xxh_state.get();
  XXH3_64bits_reset_withSeed(state, kHashSeed);
  XXH3_64bits_update(state, slice[0].data(), slice[0].size());
  XXH3_64bits_update(state, slice[1].data(), slice[1].size());

  return XXH3_64bits_digest(state);
}

array<string_view, 2> SmallString::Get() const {
  DCHECK(size_);

  array<string_view, 2> dest;
  dest[0] = string_view{prefix_, kPrefLen};
  uint8_t* ptr = tl.seg_alloc->Translate(small_ptr_);
  dest[1] = string_view{reinterpret_cast<char*>(ptr), size_ - kPrefLen};
  return dest;
}

void SmallString::Get(char* out) const {
  auto strs = Get();
  memcpy(out, strs[0].data(), strs[0].size());
  memcpy(out + strs[0].size(), strs[1].data(), strs[1].size());
}

void SmallString::Get(std::string* dest) const {
  dest->resize(size_);
  Get(dest->data());
}

bool SmallString::DefragIfNeeded(PageUsage* page_usage) {
  uint8_t* cur_real_ptr = tl.seg_alloc->Translate(small_ptr_);
  if (!page_usage->IsPageForObjectUnderUtilized(tl.seg_alloc->heap(), cur_real_ptr))
    return false;

  if (!CanAllocate(size_ - kPrefLen))  // Forced
    return false;

  auto [sp, rp] = tl.seg_alloc->Allocate(size_ - kPrefLen);
  memcpy(rp, cur_real_ptr, size_ - kPrefLen);
  tl.seg_alloc->Free(small_ptr_);
  small_ptr_ = sp;

  return true;
}

}  // namespace dfly


================================================
FILE: src/core/small_string.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <array>
#include <cstdint>
#include <string_view>

namespace dfly {

class PageUsage;

// Efficient storage of strings longer than 10 bytes.
// Requires explicit memory management
class SmallString {
  static constexpr unsigned kPrefLen = 10;
  static constexpr unsigned kMaxSize = (1 << 8) - 1;

 public:
  static void InitThreadLocal(void* heap);
  static size_t UsedThreadLocal();
  static bool CanAllocate(size_t size);

  // Returns malloc used.
  size_t Assign(std::string_view s);
  void Free();

  bool Equal(std::string_view o) const;
  bool Equal(const SmallString& mps) const;

  uint64_t HashCode() const;
  uint16_t MallocUsed() const;

  std::array<std::string_view, 2> Get() const;
  void Get(char* out) const;
  void Get(std::string* dest) const;

  bool DefragIfNeeded(PageUsage* page_usage);

  size_t size() const {
    return size_;
  }

  uint8_t first_byte() const {
    return prefix_[0];
  }

 private:
  // The string is stored broken up into two parts, the first one - in this array
  char prefix_[kPrefLen];

  uint32_t small_ptr_;  // 32GB capacity because we ignore 3 lsb bits (i.e. x8).
  uint16_t size_;       // uint16_t - total size (including prefix)

} __attribute__((packed));

}  // namespace dfly


================================================
FILE: src/core/sorted_map.cc
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/sorted_map.h"

#include <absl/strings/str_cat.h>

#include <cmath>

extern "C" {
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/util.h"
#include "redis/zmalloc.h"
}

#include <double-conversion/double-to-string.h>

#include "base/endian.h"
#include "base/logging.h"

using namespace std;

namespace dfly {
namespace detail {

namespace {

double GetObjScore(const void* obj) {
  sds s = (sds)obj;
  char* ptr = s + sdslen(s) + 1;
  return absl::bit_cast<double>(absl::little_endian::Load64(ptr));
}

void SetObjScore(void* obj, double score) {
  sds s = (sds)obj;
  char* ptr = s + sdslen(s) + 1;
  absl::little_endian::Store64(ptr, absl::bit_cast<uint64_t>(score));
}

// buf must be at least 10 chars long.
void* BuildScoredKey(double score, char buf[]) {
  buf[0] = SDS_TYPE_5;  // length 0.
  buf[1] = 0;
  absl::little_endian::Store64(buf + 2, absl::bit_cast<uint64_t>(score));
  void* key = buf + 1;

  return key;
}

// Copied from t_zset.c
/* Returns 1 if the double value can safely be represented in long long without
 * precision loss, in which case the corresponding long long is stored in the out variable. */
static int double2ll(double d, long long* out) {
#if (DBL_MANT_DIG >= 52) && (DBL_MANT_DIG <= 63) && (LLONG_MAX == 0x7fffffffffffffffLL)
  /* Check if the float is in a safe range to be casted into a
   * long long. We are assuming that long long is 64 bit here.
   * Also we are assuming that there are no implementations around where
   * double has precision < 52 bit.
   *
   * Under this assumptions we test if a double is inside a range
   * where casting to long long is safe. Then using two castings we
   * make sure the decimal part is zero. If all this is true we can use
   * integer without precision loss.
   *
   * Note that numbers above 2^52 and below 2^63 use all the fraction bits as real part,
   * and the exponent bits are positive, which means the "decimal" part must be 0.
   * i.e. all double values in that range are representable as a long without precision loss,
   * but not all long values in that range can be represented as a double.
   * we only care about the first part here. */
  if (d < (double)(-LLONG_MAX / 2) || d > (double)(LLONG_MAX / 2))
    return 0;
  long long ll = d;
  if (ll == d) {
    *out = ll;
    return 1;
  }
#endif
  return 0;
}

/* Compare element in sorted set with given element. */
int zzlCompareElements(unsigned char* eptr, unsigned char* cstr, unsigned int clen) {
  unsigned char* vstr;
  unsigned int vlen;
  long long vlong;
  unsigned char vbuf[32];
  int minlen, cmp;

  vstr = lpGetValue(eptr, &vlen, &vlong);
  if (vstr == NULL) {
    /* Store string representation of long long in buf. */
    vlen = ll2string((char*)vbuf, sizeof(vbuf), vlong);
    vstr = vbuf;
  }

  minlen = (vlen < clen) ? vlen : clen;
  cmp = memcmp(vstr, cstr, minlen);
  if (cmp == 0)
    return vlen - clen;
  return cmp;
}

using double_conversion::DoubleToStringConverter;
constexpr unsigned kConvFlags = DoubleToStringConverter::UNIQUE_ZERO;

DoubleToStringConverter score_conv(kConvFlags, "inf", "nan", 'e', -6, 21, 6, 0);

// Copied from redis code but uses double_conversion to encode double values.
unsigned char* ZzlInsertAt(unsigned char* zl, unsigned char* eptr, std::string_view ele,
                           double score) {
  unsigned char* sptr;
  char scorebuf[128];
  unsigned scorelen = 0;
  long long lscore;
  int score_is_long = double2ll(score, &lscore);
  if (!score_is_long) {
    // Use double converter to get the shortest representation.
    double_conversion::StringBuilder sb(scorebuf, sizeof(scorebuf));
    score_conv.ToShortest(score, &sb);
    scorelen = sb.position();
    sb.Finalize();
    DCHECK_EQ(scorelen, strlen(scorebuf));
  }

  // Argument parsing converts empty strings to default initialized string views.
  // Such string views have a null data field, which if passed into lpAppend (via zzlInsertAt)
  // results in the replace operation being applied on the listpack. In addition to being wrong, it
  // also causes assertion failures. To circumvent this corner case we pass here a string view
  // pointing to an empty string on the stack, which has a non-null data field.
  if (ele.data() == nullptr) {
    ele = ""sv;
  }

  if (eptr == NULL) {
    zl = lpAppend(zl, (const unsigned char*)(ele.data()), ele.size());
    if (score_is_long)
      zl = lpAppendInteger(zl, lscore);
    else
      zl = lpAppend(zl, (unsigned char*)scorebuf, scorelen);
  } else {
    /* Insert member before the element 'eptr'. */
    zl = lpInsertString(zl, (const unsigned char*)ele.data(), ele.size(), eptr, LP_BEFORE, &sptr);

    /* Insert score after the member. */
    if (score_is_long)
      zl = lpInsertInteger(zl, lscore, sptr, LP_AFTER, NULL);
    else
      zl = lpInsertString(zl, (unsigned char*)scorebuf, scorelen, sptr, LP_AFTER, NULL);
  }
  return zl;
}

double ZzlStrtod(unsigned char* vstr, unsigned int vlen) {
  char buf[128];
  if (vlen > sizeof(buf))
    vlen = sizeof(buf);
  memcpy(buf, vstr, vlen);
  buf[vlen] = '\0';
  return strtod(buf, NULL);
}

/* Return a listpack element as an SDS string. */
sds LpGetObject(const uint8_t* sptr) {
  unsigned char* vstr;
  unsigned int vlen;
  long long vlong;

  serverAssert(sptr != NULL);
  vstr = lpGetValue(const_cast<uint8_t*>(sptr), &vlen, &vlong);

  if (vstr) {
    return sdsnewlen((char*)vstr, vlen);
  } else {
    return sdsfromlonglong(vlong);
  }
}

// static representation of sds strings
char kMinStrData[] =
    "\110"
    "minstring";
char kMaxStrData[] =
    "\110"
    "maxstring";

}  // namespace

double ZzlGetScore(const uint8_t* sptr) {
  unsigned char* vstr;
  unsigned int vlen;
  long long vlong;
  double score;

  DCHECK(sptr != NULL);
  vstr = lpGetValue(const_cast<uint8_t*>(sptr), &vlen, &vlong);

  if (vstr) {
    score = ZzlStrtod(vstr, vlen);
  } else {
    score = vlong;
  }

  return score;
}

/* Move to the previous entry based on the values in eptr and sptr. Both are
 * set to NULL when there is no prev entry. */
void ZzlPrev(const uint8_t* zl, uint8_t** eptr, uint8_t** sptr) {
  unsigned char *_eptr, *_sptr;
  serverAssert(*eptr != NULL && *sptr != NULL);

  _sptr = lpPrev(const_cast<uint8_t*>(zl), *eptr);
  if (_sptr != NULL) {
    _eptr = lpPrev(const_cast<uint8_t*>(zl), _sptr);
    DCHECK(_eptr != NULL);
  } else {
    /* No previous entry. */
    _eptr = NULL;
  }

  *eptr = _eptr;
  *sptr = _sptr;
}

/* Move to next entry based on the values in eptr and sptr. Both are set to
 * NULL when there is no next entry. */
void ZzlNext(const uint8_t* zl, uint8_t** eptr, uint8_t** sptr) {
  unsigned char *_eptr, *_sptr;
  DCHECK(*eptr != NULL && *sptr != NULL);

  _eptr = lpNext(const_cast<uint8_t*>(zl), *sptr);
  if (_eptr != NULL) {
    _sptr = lpNext(const_cast<uint8_t*>(zl), _eptr);
    DCHECK(_sptr != NULL);
  } else {
    /* No next entry. */
    _sptr = NULL;
  }

  *eptr = _eptr;
  *sptr = _sptr;
}

/* Free a lex range structure, must be called only after zslParseLexRange()
 * populated the structure with success (C_OK returned). */
void ZslFreeLexRange(const zlexrangespec* spec) {
  if (spec->min != cminstring && spec->min != cmaxstring)
    sdsfree(spec->min);
  if (spec->max != cminstring && spec->max != cmaxstring)
    sdsfree(spec->max);
}

/* This is just a wrapper to sdscmp() that is able to
 * handle shared.minstring and shared.maxstring as the equivalent of
 * -inf and +inf for strings */
int sdscmplex(sds a, sds b) {
  if (a == b)
    return 0;
  if (a == cminstring || b == cmaxstring)
    return -1;
  if (a == cmaxstring || b == cminstring)
    return 1;
  return sdscmp(a, b);
}

int zslLexValueGteMin(sds value, const zlexrangespec* spec) {
  return spec->minex ? (sdscmplex(value, spec->min) > 0) : (sdscmplex(value, spec->min) >= 0);
}

int zslLexValueLteMax(sds value, const zlexrangespec* spec) {
  return spec->maxex ? (sdscmplex(value, spec->max) < 0) : (sdscmplex(value, spec->max) <= 0);
}

int ZzlLexValueGteMin(unsigned char* p, const zlexrangespec* spec) {
  sds value = LpGetObject(p);
  int res = zslLexValueGteMin(value, spec);
  sdsfree(value);
  return res;
}

int ZzlLexValueLteMax(unsigned char* p, const zlexrangespec* spec) {
  sds value = LpGetObject(p);
  int res = zslLexValueLteMax(value, spec);
  sdsfree(value);
  return res;
}

/* Returns if there is a part of the zset is in range. Should only be used
 * internally by zzlFirstInRange and zzlLastInRange. */
int zzlIsInRange(unsigned char* zl, const zrangespec* range) {
  unsigned char* p;
  double score;

  /* Test for ranges that will always be empty. */
  if (range->min > range->max || (range->min == range->max && (range->minex || range->maxex)))
    return 0;

  p = lpSeek(zl, -1); /* Last score. */
  if (p == NULL)
    return 0; /* Empty sorted set */
  score = ZzlGetScore(p);
  if (!ZslValueGteMin(score, range))
    return 0;

  p = lpSeek(zl, 1); /* First score. */
  serverAssert(p != NULL);
  score = ZzlGetScore(p);
  if (!ZslValueLteMax(score, range))
    return 0;

  return 1;
}

/* Find pointer to the first element contained in the specified range.
 * Returns NULL when no element is contained in the range. */
unsigned char* ZzlFirstInRange(unsigned char* zl, const zrangespec* range) {
  unsigned char *eptr = lpSeek(zl, 0), *sptr;
  double score;

  /* If everything is out of range, return early. */
  if (!zzlIsInRange(zl, range))
    return NULL;

  while (eptr != NULL) {
    sptr = lpNext(zl, eptr);
    serverAssert(sptr != NULL);

    score = ZzlGetScore(sptr);
    if (ZslValueGteMin(score, range)) {
      /* Check if score <= max. */
      if (ZslValueLteMax(score, range))
        return eptr;
      return NULL;
    }

    /* Move to next element. */
    eptr = lpNext(zl, sptr);
  }

  return NULL;
}

/* Find pointer to the last element contained in the specified range.
 * Returns NULL when no element is contained in the range. */
unsigned char* ZzlLastInRange(unsigned char* zl, const zrangespec* range) {
  unsigned char *eptr = lpSeek(zl, -2), *sptr;
  double score;

  /* If everything is out of range, return early. */
  if (!zzlIsInRange(zl, range))
    return NULL;

  while (eptr != NULL) {
    sptr = lpNext(zl, eptr);
    serverAssert(sptr != NULL);

    score = ZzlGetScore(sptr);
    if (ZslValueLteMax(score, range)) {
      /* Check if score >= min. */
      if (ZslValueGteMin(score, range))
        return eptr;
      return NULL;
    }

    /* Move to previous element by moving to the score of previous element.
     * When this returns NULL, we know there also is no element. */
    sptr = lpPrev(zl, eptr);
    if (sptr != NULL)
      serverAssert((eptr = lpPrev(zl, sptr)) != NULL);
    else
      eptr = NULL;
  }

  return NULL;
}

/* Returns if there is a part of the zset is in range. Should only be used
 * internally by zzlFirstInRange and zzlLastInRange. */
int ZzlIsInLexRange(unsigned char* zl, const zlexrangespec* range) {
  unsigned char* p;

  /* Test for ranges that will always be empty. */
  int cmp = sdscmplex(range->min, range->max);
  if (cmp > 0 || (cmp == 0 && (range->minex || range->maxex)))
    return 0;

  p = lpSeek(zl, -2); /* Last element. */
  if (p == NULL)
    return 0;
  if (!ZzlLexValueGteMin(p, range))
    return 0;

  p = lpSeek(zl, 0); /* First element. */
  serverAssert(p != NULL);
  if (!ZzlLexValueLteMax(p, range))
    return 0;

  return 1;
}

/* Find pointer to the first element contained in the specified lex range.
 * Returns NULL when no element is contained in the range. */
unsigned char* ZzlFirstInLexRange(unsigned char* zl, const zlexrangespec* range) {
  unsigned char *eptr = lpSeek(zl, 0), *sptr;

  /* If everything is out of range, return early. */
  if (!ZzlIsInLexRange(zl, range))
    return NULL;

  while (eptr != NULL) {
    if (ZzlLexValueGteMin(eptr, range)) {
      /* Check if score <= max. */
      if (ZzlLexValueLteMax(eptr, range))
        return eptr;
      return NULL;
    }

    /* Move to next element. */
    sptr = lpNext(zl, eptr); /* This element score. Skip it. */
    serverAssert(sptr != NULL);
    eptr = lpNext(zl, sptr); /* Next element. */
  }

  return NULL;
}

/* Find pointer to the last element contained in the specified lex range.
 * Returns NULL when no element is contained in the range. */
unsigned char* ZzlLastInLexRange(unsigned char* zl, const zlexrangespec* range) {
  unsigned char *eptr = lpSeek(zl, -2), *sptr;

  /* If everything is out of range, return early. */
  if (!ZzlIsInLexRange(zl, range))
    return NULL;

  while (eptr != NULL) {
    if (ZzlLexValueLteMax(eptr, range)) {
      /* Check if score >= min. */
      if (ZzlLexValueGteMin(eptr, range))
        return eptr;
      return NULL;
    }

    /* Move to previous element by moving to the score of previous element.
     * When this returns NULL, we know there also is no element. */
    sptr = lpPrev(zl, eptr);
    if (sptr != NULL)
      serverAssert((eptr = lpPrev(zl, sptr)) != NULL);
    else
      eptr = NULL;
  }

  return NULL;
}

unsigned char* ZzlDeleteRangeByLex(unsigned char* zl, const zlexrangespec* range,
                                   unsigned long* deleted) {
  unsigned char *eptr, *sptr;
  unsigned long num = 0;

  if (deleted != NULL)
    *deleted = 0;

  eptr = ZzlFirstInLexRange(zl, range);
  if (eptr == NULL)
    return zl;

  /* When the tail of the listpack is deleted, eptr will be NULL. */
  while (eptr && (sptr = lpNext(zl, eptr)) != NULL) {
    if (ZzlLexValueLteMax(eptr, range)) {
      /* Delete both the element and the score. */
      zl = lpDeleteRangeWithEntry(zl, &eptr, 2);
      num++;
    } else {
      /* No longer in range. */
      break;
    }
  }

  if (deleted != NULL)
    *deleted = num;
  return zl;
}

unsigned char* ZzlDeleteRangeByScore(unsigned char* zl, const zrangespec* range,
                                     unsigned long* deleted) {
  unsigned char *eptr, *sptr;
  double score;
  unsigned long num = 0;

  if (deleted != NULL)
    *deleted = 0;

  eptr = ZzlFirstInRange(zl, range);
  if (eptr == NULL)
    return zl;

  /* When the tail of the listpack is deleted, eptr will be NULL. */
  while (eptr && (sptr = lpNext(zl, eptr)) != NULL) {
    score = ZzlGetScore(sptr);
    if (ZslValueLteMax(score, range)) {
      /* Delete both the element and the score. */
      zl = lpDeleteRangeWithEntry(zl, &eptr, 2);
      num++;
    } else {
      /* No longer in range. */
      break;
    }
  }

  if (deleted != NULL)
    *deleted = num;
  return zl;
}

/* Insert (element,score) pair in listpack. This function assumes the element is
 * not yet present in the list. */
unsigned char* ZzlInsert(unsigned char* zl, std::string_view ele, double score) {
  unsigned char *eptr = NULL, *sptr = lpSeek(zl, -1);
  double s;

  // Optimization: check first whether the new element should be the last.
  if (sptr != NULL) {
    s = ZzlGetScore(sptr);
    if (s >= score) {
      // It should not be the last, so fallback to the forward iteration.
      eptr = lpSeek(zl, 0);
    }
  }

  while (eptr != NULL) {
    sptr = lpNext(zl, eptr);
    s = ZzlGetScore(sptr);

    if (s > score) {
      /* First element with score larger than score for element to be
       * inserted. This means we should take its spot in the list to
       * maintain ordering. */
      return ZzlInsertAt(zl, eptr, ele, score);
    } else if (s == score) {
      /* Ensure lexicographical ordering for elements. */
      if (zzlCompareElements(eptr, (unsigned char*)ele.data(), ele.size()) > 0) {
        return ZzlInsertAt(zl, eptr, ele, score);
      }
    }

    /* Move to next element. */
    eptr = lpNext(zl, sptr);
  }

  /* Push on tail of list when it was not yet inserted. */
  return ZzlInsertAt(zl, NULL, ele, score);
}

unsigned char* ZzlFind(unsigned char* lp, std::string_view ele, double* score) {
  uint8_t *sptr, *eptr = lpFirst(lp);

  if (eptr == nullptr)
    return nullptr;
  eptr = lpFind(lp, eptr, (unsigned char*)ele.data(), ele.size(), 1);
  if (eptr) {
    sptr = lpNext(lp, eptr);
    serverAssert(sptr != NULL);

    /* Matching element, pull out score. */
    if (score != nullptr)
      *score = ZzlGetScore(sptr);
    return eptr;
  }

  return nullptr;
}

SortedMap::SortedMap()
    : score_map(new ScoreMap), score_tree(new ScoreTree(StatelessAllocator<char>::resource())) {
}

SortedMap::~SortedMap() {
  delete score_tree;
  delete score_map;
}

// Three way comparison of q and key.
// Compares scores first and then the keys, unless q.ignore_score is set.
// In that case only keys are compared.
// In order to support close/open intervals, we introduce a special flag for +inf strings.
// So, in case of score equality (or if scores are ignored), q.str_is_infinite means q > key,
// and 1 is returned.
int SortedMap::ScoreSdsPolicy::KeyCompareTo::operator()(Query q, ScoreSds key) const {
  sds sdsa = (sds)q.item;

  if (!q.ignore_score) {
    double sa = GetObjScore(sdsa);
    double sb = GetObjScore(key);

    if (sa < sb)
      return -1;
    if (sa > sb)
      return 1;
  }

  // if q.str_is_infinite is set, it means q > key at this point.
  if (q.str_is_infinite)
    return 1;

  return sdscmp(sdsa, (sds)key);
}

int SortedMap::AddElem(double score, std::string_view ele, int in_flags, int* out_flags,
                       double* newscore) {
  // does not take ownership over ele.
  DCHECK(!isnan(score));

  ScoreSds obj = nullptr;
  bool added = false;

  if (in_flags & ZADD_IN_XX) {
    obj = score_map->FindObj(ele);
    if (obj == nullptr) {
      *out_flags = ZADD_OUT_NOP;
      return 1;
    }
  } else {
    tie(obj, added) = score_map->AddOrSkip(ele, score);
  }

  if (added) {
    // Adding a new element.
    DCHECK_EQ(in_flags & ZADD_IN_XX, 0);

    *out_flags = ZADD_OUT_ADDED;
    *newscore = score;
    bool added = score_tree->Insert(obj);
    DCHECK(added);

    return 1;
  }

  // Updating an existing element.
  if ((in_flags & ZADD_IN_NX)) {
    // Updating an existing element.
    *out_flags = ZADD_OUT_NOP;
    return 1;
  }

  if (in_flags & ZADD_IN_INCR) {
    score += GetObjScore(obj);
    if (isnan(score)) {
      *out_flags = ZADD_OUT_NAN;
      return 0;
    }
  }

  // Update the score.
  CHECK(score_tree->Delete(obj));
  SetObjScore(obj, score);
  CHECK(score_tree->Insert(obj));
  *out_flags = ZADD_OUT_UPDATED;
  *newscore = score;
  return 1;
}

optional<double> SortedMap::GetScore(std::string_view ele) const {
  ScoreSds obj = score_map->FindObj(ele);
  if (obj != nullptr) {
    return GetObjScore(obj);
  }

  return std::nullopt;
}

bool SortedMap::InsertNew(double score, std::string_view member) {
  DVLOG(2) << "InsertNew " << score << " " << member;

  auto [newk, added] = score_map->AddOrSkip(member, score);
  if (!added)
    return false;

  added = score_tree->Insert(newk);
  CHECK(added);
  return true;
}

optional<unsigned> SortedMap::GetRank(std::string_view ele, bool reverse) const {
  ScoreSds obj = score_map->FindObj(ele);
  if (obj == nullptr)
    return std::nullopt;

  optional rank = score_tree->GetRank(obj, reverse);
  DCHECK(rank);
  return *rank;
}

SortedMap::ScoredArray SortedMap::GetRange(const zrangespec& range, unsigned offset, unsigned limit,
                                           bool reverse) const {
  ScoredArray arr;
  if (score_tree->Size() <= offset || limit == 0)
    return arr;

  char buf[16];
  if (reverse) {
    ScoreSds key = BuildScoredKey(range.max, buf);
    auto path = score_tree->LEQ(Query{key, false, !range.maxex});
    if (path.Empty())
      return arr;

    if (range.maxex && range.max == GetObjScore(path.Terminal())) {
      ++offset;
    }
    DCHECK_LE(GetObjScore(path.Terminal()), range.max);

    while (offset--) {
      if (!path.Prev())
        return arr;
    }

    while (limit--) {
      ScoreSds ele = path.Terminal();

      double score = GetObjScore(ele);
      if (range.min > score || (range.min == score && range.minex))
        break;
      arr.emplace_back(string{(sds)ele, sdslen((sds)ele)}, score);
      if (!path.Prev())
        break;
    }
  } else {
    ScoreSds key = BuildScoredKey(range.min, buf);
    auto path = score_tree->GEQ(Query{key, false, range.minex});
    if (path.Empty())
      return arr;

    while (offset--) {
      if (!path.Next())
        return arr;
    }

    auto path2 = path;
    size_t num_elems = 0;

    // Count the number of elements in the range.
    while (limit--) {
      ScoreSds ele = path.Terminal();

      double score = GetObjScore(ele);
      if (range.max < score || (range.max == score && range.maxex))
        break;
      ++num_elems;
      if (!path.Next())
        break;
    }

    // reserve enough space.
    arr.resize(num_elems);
    for (size_t i = 0; i < num_elems; ++i) {
      ScoreSds ele = path2.Terminal();
      arr[i] = {string{(sds)ele, sdslen((sds)ele)}, GetObjScore(ele)};
      path2.Next();
    }
  }

  return arr;
}

SortedMap::ScoredArray SortedMap::GetLexRange(const zlexrangespec& range, unsigned offset,
                                              unsigned limit, bool reverse) const {
  if (score_tree->Size() <= offset || limit == 0)
    return {};

  detail::BPTreePath<ScoreSds> path;
  ScoredArray arr;

  if (reverse) {
    if (range.max != cmaxstring) {
      path = score_tree->LEQ(Query{range.max, true});
      if (path.Empty())
        return {};

      if (range.maxex && sdscmp((sds)path.Terminal(), range.max) == 0) {
        ++offset;
      }
      while (offset--) {
        if (!path.Prev())
          return {};
      }
    } else {
      path = score_tree->FromRank(score_tree->Size() - offset - 1);
    }

    while (limit--) {
      ScoreSds ele = path.Terminal();

      if (range.min != cminstring) {
        int cmp = sdscmp((sds)ele, range.min);
        if (cmp < 0 || (cmp == 0 && range.minex))
          break;
      }
      arr.emplace_back(string{(sds)ele, sdslen((sds)ele)}, GetObjScore(ele));
      if (!path.Prev())
        break;
    }
  } else {
    if (range.min != cminstring) {
      path = score_tree->GEQ(Query{range.min, true});
      if (path.Empty())
        return {};

      if (range.minex && sdscmp((sds)path.Terminal(), range.min) == 0) {
        ++offset;
      }
      while (offset--) {
        if (!path.Next())
          return {};
      }
    } else {
      path = score_tree->FromRank(offset);
    }

    while (limit--) {
      ScoreSds ele = path.Terminal();

      if (range.max != cmaxstring) {
        int cmp = sdscmp((sds)ele, range.max);
        if (cmp > 0 || (cmp == 0 && range.maxex))
          break;
      }
      arr.emplace_back(string{(sds)ele, sdslen((sds)ele)}, GetObjScore(ele));
      if (!path.Next())
        break;
    }
  }
  return arr;
}

uint8_t* SortedMap::ToListPack() const {
  uint8_t* lp = lpNew(0);

  score_tree->Iterate(0, UINT32_MAX, [&](ScoreSds ele) {
    const std::string_view v{(sds)ele, sdslen((sds)ele)};
    lp = ZzlInsertAt(lp, NULL, v, GetObjScore(ele));
    return true;
  });

  return lp;
}

bool SortedMap::Delete(std::string_view ele) const {
  ScoreSds obj = score_map->FindObj(ele);
  if (obj == nullptr)
    return false;

  CHECK(score_tree->Delete(obj));
  CHECK(score_map->Erase(ele));
  return true;
}

size_t SortedMap::MallocSize() const {
  // TODO: add malloc used to BPTree.
  return score_map->SetMallocUsed() + score_map->ObjMallocUsed() + score_tree->NodeCount() * 256;
}

bool SortedMap::Reserve(size_t sz) {
  score_map->Reserve(sz);
  return true;
}

size_t SortedMap::DeleteRangeByRank(unsigned start, unsigned end) {
  DCHECK_LE(start, end);
  DCHECK_LT(end, score_tree->Size());

  for (uint32_t i = start; i <= end; ++i) {
    /* Ideally, we would want to advance path to the next item and delete the previous one.
     * However, we can not do that because the path is invalidated after the
     * deletion. So we have to recreate the path for each item using the same rank.
     * Note, it is probably could be improved, but it's much more complicated.
     */

    auto path = score_tree->FromRank(start);
    sds ele = (sds)path.Terminal();
    score_tree->Delete(path);
    score_map->Erase(ele);
  }

  return end - start + 1;
}

size_t SortedMap::DeleteRangeByScore(const zrangespec& range) {
  char buf[16] = {0};
  size_t deleted = 0;

  while (!score_tree->Empty()) {
    ScoreSds min_key = BuildScoredKey(range.min, buf);
    auto path = score_tree->GEQ(Query{min_key, false, range.minex});
    if (path.Empty())
      break;

    ScoreSds item = path.Terminal();
    double score = GetObjScore(item);

    if (range.minex) {
      DCHECK_GT(score, range.min);
    } else {
      DCHECK_GE(score, range.min);
    }
    if (score > range.max || (range.maxex && score == range.max))
      break;

    score_tree->Delete(item);
    ++deleted;
    score_map->Erase((sds)item);
  }

  return deleted;
}

size_t SortedMap::DeleteRangeByLex(const zlexrangespec& range) {
  if (score_tree->Size() == 0)
    return 0;

  size_t deleted = 0;

  uint32_t rank = 0;
  if (range.min != cminstring) {
    auto path = score_tree->GEQ(Query{range.min, true});
    if (path.Empty())
      return {};

    rank = path.Rank();
    if (range.minex && sdscmp((sds)path.Terminal(), range.min) == 0) {
      ++rank;
    }
  }

  while (rank < score_tree->Size()) {
    auto path = score_tree->FromRank(rank);
    ScoreSds item = path.Terminal();
    if (range.max != cmaxstring) {
      int cmp = sdscmp((sds)item, range.max);
      if (cmp > 0 || (cmp == 0 && range.maxex))
        break;
    }
    ++deleted;
    score_tree->Delete(path);
    score_map->Erase((sds)item);
  }

  return deleted;
}

SortedMap::ScoredArray SortedMap::PopTopScores(unsigned count, bool reverse) {
  DCHECK_GT(count, 0u);
  DCHECK_EQ(score_map->UpperBoundSize(), score_tree->Size());
  size_t sz = score_map->UpperBoundSize();

  ScoredArray res;

  DCHECK_GT(sz, 0u);  // Empty sets are not allowed.

  if (sz == 0 || count == 0)
    return res;

  if (count > sz)
    count = sz;

  res.reserve(count);

  auto cb = [&](ScoreSds obj) {
    res.emplace_back(string{(sds)obj, sdslen((sds)obj)}, GetObjScore(obj));

    // We can not delete from score_tree because we are in the middle of the iteration.
    CHECK(score_map->Erase((sds)obj));
    return true;  // continue with the iteration.
  };

  unsigned rank = 0;
  unsigned step = 0;
  if (reverse) {
    score_tree->IterateReverse(0, count - 1, std::move(cb));
    rank = score_tree->Size() - 1;
    step = 1;
  } else {
    score_tree->Iterate(0, count - 1, std::move(cb));
  }

  // We already deleted elements from score_map, so what's left is to delete from the tree.
  if (score_map->Empty()) {
    // Corner case optimization.
    score_tree->Clear();
  } else {
    for (unsigned i = 0; i < res.size(); ++i) {
      auto path = score_tree->FromRank(rank);
      score_tree->Delete(path);
      rank -= step;
    }
  }

  return res;
}

size_t SortedMap::Count(const zrangespec& range) const {
  DCHECK_LE(range.min, range.max);

  if (score_tree->Size() == 0)
    return 0;

  // build min key.
  char buf[16];

  ScoreSds range_key = BuildScoredKey(range.min, buf);
  auto path = score_tree->GEQ(Query{range_key, false, range.minex});
  if (path.Empty())
    return 0;

  ScoreSds bound = path.Terminal();

  if (range.minex) {
    DCHECK_GT(GetObjScore(bound), range.min);
  } else {
    DCHECK_GE(GetObjScore(bound), range.min);
  }

  uint32_t min_rank = path.Rank();

  // Now build the max key.
  // If we need to exclude the maximum score, set the key'sstring part to empty string,
  // otherwise set it to infinity.
  range_key = BuildScoredKey(range.max, buf);
  path = score_tree->GEQ(Query{range_key, false, !range.maxex});
  if (path.Empty()) {
    return score_tree->Size() - min_rank;
  }

  bound = path.Terminal();
  uint32_t max_rank = path.Rank();
  if (range.maxex || GetObjScore(bound) > range.max) {
    if (max_rank <= min_rank)
      return 0;
    --max_rank;
  }

  // max_rank could be less than min_rank, for example, if the range is [a, a).
  return max_rank < min_rank ? 0 : max_rank - min_rank + 1;
}

size_t SortedMap::LexCount(const zlexrangespec& range) const {
  if (score_tree->Size() == 0)
    return 0;

  // Ranges that will always be zero - (+inf, anything) or (anything, -inf)
  if (range.min == cmaxstring || range.max == cminstring) {
    return 0;
  }

  uint32_t min_rank = 0;
  detail::BPTreePath<ScoreSds> path;

  if (range.min != cminstring) {
    path = score_tree->GEQ(Query{range.min, true});
    if (path.Empty())
      return 0;

    min_rank = path.Rank();
    if (range.minex && sdscmp((sds)path.Terminal(), range.min) == 0) {
      ++min_rank;
      if (min_rank >= score_tree->Size())
        return 0;
    }
  }

  uint32_t max_rank = score_tree->Size() - 1;
  if (range.max != cmaxstring) {
    path = score_tree->GEQ(Query{range.max, true});
    if (!path.Empty()) {
      max_rank = path.Rank();

      // fix the max rank, if needed.
      int cmp = sdscmp((sds)path.Terminal(), range.max);
      DCHECK_GE(cmp, 0);
      if (cmp > 0 || range.maxex) {
        if (max_rank <= min_rank)
          return 0;
        --max_rank;
      }
    }
  }

  return max_rank < min_rank ? 0 : max_rank - min_rank + 1;
}

bool SortedMap::Iterate(unsigned start_rank, unsigned len, bool reverse,
                        std::function<bool(sds, double)> cb) const {
  DCHECK_GT(len, 0u);
  unsigned end_rank = start_rank + len - 1;
  bool success;
  if (reverse) {
    success = score_tree->IterateReverse(
        start_rank, end_rank, [&](ScoreSds obj) { return cb((sds)obj, GetObjScore(obj)); });
  } else {
    success = score_tree->Iterate(start_rank, end_rank,
                                  [&](ScoreSds obj) { return cb((sds)obj, GetObjScore(obj)); });
  }

  return success;
}

uint64_t SortedMap::Scan(uint64_t cursor,
                         absl::FunctionRef<void(std::string_view, double)> cb) const {
  auto scan_cb = [&cb](const void* obj) {
    sds ele = (sds)obj;
    cb(string_view{ele, sdslen(ele)}, GetObjScore(obj));
  };

  return this->score_map->Scan(cursor, std::move(scan_cb));
}

// taken from zsetConvert
SortedMap* SortedMap::FromListPack(PMR_NS::memory_resource* res, const uint8_t* lp) {
  uint8_t* zl = (uint8_t*)lp;
  unsigned char *eptr, *sptr;
  unsigned char* vstr;
  unsigned int vlen;
  long long vlong;

  void* ptr = res->allocate(sizeof(SortedMap), alignof(SortedMap));
  SortedMap* zs = new (ptr) SortedMap;

  eptr = lpSeek(zl, 0);
  if (eptr != NULL) {
    sptr = lpNext(zl, eptr);
    CHECK(sptr != NULL);
  }

  while (eptr != NULL) {
    double score = ZzlGetScore(sptr);
    vstr = lpGetValue(eptr, &vlen, &vlong);
    if (vstr == NULL) {
      CHECK(zs->InsertNew(score, absl::StrCat(vlong)));
    } else {
      CHECK(zs->InsertNew(score, string_view{reinterpret_cast<const char*>(vstr), vlen}));
    }

    ZzlNext(zl, &eptr, &sptr);
  }

  return zs;
}

bool SortedMap::DefragIfNeeded(PageUsage* page_usage) {
  auto cb = [this](sds old_obj, sds new_obj) { score_tree->ForceUpdate(old_obj, new_obj); };
  bool reallocated = false;

  for (auto it = score_map->begin(); it != score_map->end(); ++it) {
    reallocated |= it.ReallocIfNeeded(page_usage, cb);
  }

  return reallocated;
}

std::optional<SortedMap::RankAndScore> SortedMap::GetRankAndScore(std::string_view ele,
                                                                  bool reverse) const {
  ScoreSds obj = score_map->FindObj(ele);
  if (obj == nullptr)
    return std::nullopt;

  optional rank = score_tree->GetRank(obj, reverse);
  DCHECK(rank);

  return SortedMap::RankAndScore{*rank, GetObjScore(obj)};
}
}  // namespace detail

sds cminstring = detail::kMinStrData + 1;
sds cmaxstring = detail::kMaxStrData + 1;

}  // namespace dfly


================================================
FILE: src/core/sorted_map.h
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/functional/function_ref.h>

#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <variant>
#include <vector>

#include "core/bptree_set.h"
#include "core/score_map.h"

extern "C" {

/* Struct to hold an inclusive/exclusive range spec by score comparison. */
typedef struct {
  double min, max;
  int minex, maxex; /* are min or max exclusive? */
} zrangespec;

/* Struct to hold an inclusive/exclusive range spec by lexicographic comparison. */
typedef struct {
  sds min, max;     /* May be set to shared.(minstring|maxstring) */
  int minex, maxex; /* are min or max exclusive? */
} zlexrangespec;

}  // extern "C"

/* Input flags. */
#define ZADD_IN_NONE 0
#define ZADD_IN_INCR (1 << 0) /* Increment the score instead of setting it. */
#define ZADD_IN_NX (1 << 1)   /* Don't touch elements already existing. */
#define ZADD_IN_XX (1 << 2)   /* Only touch elements already existing. */
#define ZADD_IN_GT (1 << 3)   /* Only update existing when new scores are higher. */
#define ZADD_IN_LT (1 << 4)   /* Only update existing when new scores are lower. */

/* Output flags. */
#define ZADD_OUT_NOP (1 << 0)     /* Operation not performed because of conditionals.*/
#define ZADD_OUT_NAN (1 << 1)     /* Only touch elements already existing. */
#define ZADD_OUT_ADDED (1 << 2)   /* The element was new and was added. */
#define ZADD_OUT_UPDATED (1 << 3) /* The element already existed, score updated. */

namespace dfly {

class PageUsage;

// Copied from zset.h
extern sds cmaxstring;
extern sds cminstring;

namespace detail {

/**
 * @brief SortedMap is a sorted map implementation based on zset.h. It holds unique strings that
 * are ordered by score and lexicographically. The score is a double value and has higher priority.
 * The map is implemented as a skip list and a hash table. For more details see
 * zset.h and t_zset.c files in Redis.
 */
class SortedMap {
 public:
  using ScoredMember = std::pair<std::string, double>;
  using ScoredArray = std::vector<ScoredMember>;
  using ScoreSds = void*;
  using RankAndScore = std::pair<unsigned, double>;

  SortedMap();
  ~SortedMap();

  SortedMap(const SortedMap&) = delete;
  SortedMap& operator=(const SortedMap&) = delete;

  bool Reserve(size_t sz);
  int AddElem(double score, std::string_view ele, int in_flags, int* out_flags, double* newscore);

  // Inserts a new element. Returns false if the element already exists.
  // No score update is performed in this case.
  bool InsertNew(double score, std::string_view member);

  bool Delete(std::string_view ele) const;

  // Upper bound size of the set.
  // Note: Currently we do not allow member expiry in sorted sets, therefore it's exact
  // But if we decide to add expire, this method will provide an approximation from above.
  size_t Size() const {
    return score_map->UpperBoundSize();
  }

  size_t MallocSize() const;

  size_t DeleteRangeByRank(unsigned start, unsigned end);
  size_t DeleteRangeByScore(const zrangespec& range);
  size_t DeleteRangeByLex(const zlexrangespec& range);

  ScoredArray PopTopScores(unsigned count, bool reverse);

  std::optional<double> GetScore(std::string_view ele) const;
  std::optional<unsigned> GetRank(std::string_view ele, bool reverse) const;
  std::optional<RankAndScore> GetRankAndScore(std::string_view ele, bool reverse) const;
  ScoredArray GetRange(const zrangespec& r, unsigned offs, unsigned len, bool rev) const;
  ScoredArray GetLexRange(const zlexrangespec& r, unsigned o, unsigned l, bool rev) const;

  size_t Count(const zrangespec& range) const;
  size_t LexCount(const zlexrangespec& range) const;

  // Runs cb for each element in the range [start_rank, start_rank + len).
  // Stops iteration if cb returns false. Returns false in this case.
  bool Iterate(unsigned start_rank, unsigned len, bool reverse,
               std::function<bool(sds, double)> cb) const;

  uint64_t Scan(uint64_t cursor, absl::FunctionRef<void(std::string_view, double)> cb) const;

  uint8_t* ToListPack() const;
  static SortedMap* FromListPack(PMR_NS::memory_resource* res, const uint8_t* lp);

  bool DefragIfNeeded(PageUsage* page_usage);

 private:
  struct Query {
    ScoreSds item;
    bool ignore_score;
    bool str_is_infinite;

    Query(ScoreSds key, bool ign_score = false, int is_inf = 0)
        : item(key), ignore_score(ign_score), str_is_infinite(is_inf != 0) {
    }
  };

  struct ScoreSdsPolicy {
    using KeyT = ScoreSds;

    struct KeyCompareTo {
      int operator()(Query q, ScoreSds key) const;
    };
  };

  using ScoreTree = BPTree<ScoreSds, ScoreSdsPolicy>;

  // hash map from fields to scores.
  ScoreMap* score_map = nullptr;

  // sorted tree of (score,field) items.
  ScoreTree* score_tree = nullptr;
};

// Used by CompactObject.
unsigned char* ZzlInsert(unsigned char* zl, std::string_view ele, double score);
unsigned char* ZzlFind(unsigned char* lp, std::string_view ele, double* score);

// Used by SortedMap and ZsetFamily.
double ZzlGetScore(const uint8_t* sptr);
void ZzlNext(const uint8_t* zl, uint8_t** eptr, uint8_t** sptr);
void ZzlPrev(const uint8_t* zl, uint8_t** eptr, uint8_t** sptr);
void ZslFreeLexRange(const zlexrangespec* spec);
uint8_t* ZzlLastInRange(uint8_t* zl, const zrangespec* range);
uint8_t* ZzlFirstInRange(uint8_t* zl, const zrangespec* range);

uint8_t* ZzlFirstInLexRange(uint8_t* zl, const zlexrangespec* range);
uint8_t* ZzlLastInLexRange(uint8_t* zl, const zlexrangespec* range);

int ZzlLexValueGteMin(uint8_t* p, const zlexrangespec* spec);
int ZzlLexValueLteMax(uint8_t* p, const zlexrangespec* spec);

uint8_t* ZzlDeleteRangeByLex(uint8_t* zl, const zlexrangespec* range, unsigned long* deleted);
uint8_t* ZzlDeleteRangeByScore(uint8_t* zl, const zrangespec* range, unsigned long* deleted);

inline int ZslValueGteMin(double value, const zrangespec* spec) {
  return spec->minex ? (value > spec->min) : (value >= spec->min);
}

inline int ZslValueLteMax(double value, const zrangespec* spec) {
  return spec->maxex ? (value < spec->max) : (value <= spec->max);
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/core/sorted_map_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/sorted_map.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>
#include <mimalloc.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/mi_memory_resource.h"
#include "core/page_usage/page_usage_stats.h"

extern "C" {
#include "redis/zmalloc.h"
}

using namespace std;
using absl::StrCat;
using testing::ElementsAre;
using testing::Pair;
using testing::StrEq;

namespace dfly {
using detail::SortedMap;

class SortedMapTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    // configure redis lib zmalloc which requires mimalloc heap to work.
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitTLStatelessAllocMR(PMR_NS::get_default_resource());
  }

  SortedMap sm_;
};

TEST_F(SortedMapTest, Add) {
  int out_flags;
  double new_score;

  int res = sm_.AddElem(1.0, "a", 0, &out_flags, &new_score);
  EXPECT_EQ(1, res);
  EXPECT_EQ(ZADD_OUT_ADDED, out_flags);
  EXPECT_EQ(1, new_score);

  res = sm_.AddElem(2.0, "a", ZADD_IN_NX, &out_flags, &new_score);
  EXPECT_EQ(1, res);
  EXPECT_EQ(ZADD_OUT_NOP, out_flags);

  res = sm_.AddElem(2.0, "a", ZADD_IN_INCR, &out_flags, &new_score);
  EXPECT_EQ(1, res);
  EXPECT_EQ(ZADD_OUT_UPDATED, out_flags);
  EXPECT_EQ(3, new_score);
  sds ele = sdsnew("a");
  EXPECT_EQ(3, sm_.GetScore(ele));
  sdsfree(ele);
}

TEST_F(SortedMapTest, Scan) {
  for (unsigned i = 0; i < 972; ++i) {
    sm_.InsertNew(i, StrCat(i));
  }
  uint64_t cursor = 0;

  unsigned cnt = 0;
  do {
    cursor = sm_.Scan(cursor, [&](string_view str, double score) { ++cnt; });
  } while (cursor != 0);
  EXPECT_EQ(972, cnt);
}

TEST_F(SortedMapTest, InsertPop) {
  for (unsigned i = 0; i < 256; ++i) {
    ASSERT_TRUE(sm_.InsertNew(1000, StrCat("a", i)));
  }

  vector<sds> vec;
  bool res = sm_.Iterate(1, 2, false, [&](sds ele, double score) {
    vec.push_back(ele);
    return true;
  });
  EXPECT_TRUE(res);
  EXPECT_THAT(vec, ElementsAre(StrEq("a1"), StrEq("a10")));

  sds s = sdsnew("a1");
  EXPECT_EQ(1, sm_.GetRank(s, false));
  EXPECT_EQ(254, sm_.GetRank(s, true));
  sdsfree(s);

  auto top_scores = sm_.PopTopScores(3, false);
  EXPECT_THAT(top_scores, ElementsAre(Pair(StrEq("a0"), 1000), Pair(StrEq("a1"), 1000),
                                      Pair(StrEq("a10"), 1000)));
  top_scores = sm_.PopTopScores(3, true);
  EXPECT_THAT(top_scores, ElementsAre(Pair(StrEq("a99"), 1000), Pair(StrEq("a98"), 1000),
                                      Pair(StrEq("a97"), 1000)));
}

TEST_F(SortedMapTest, LexRanges) {
  for (unsigned i = 0; i < 100; ++i) {
    ASSERT_TRUE(sm_.InsertNew(1, StrCat("a", i)));
  }

  zlexrangespec range;
  range.max = sdsnew("a96");
  range.min = sdsnew("a93");
  range.maxex = 0;
  range.minex = 0;
  EXPECT_EQ(4, sm_.LexCount(range));
  auto array = sm_.GetLexRange(range, 1, 1000, false);
  ASSERT_EQ(3, array.size());
  EXPECT_THAT(array.front(), Pair("a94", 1));

  range.maxex = 1;
  EXPECT_EQ(3, sm_.LexCount(range));
  array = sm_.GetLexRange(range, 1, 1000, true);
  ASSERT_EQ(2, array.size());
  EXPECT_THAT(array.front(), Pair("a94", 1));

  range.minex = 1;
  EXPECT_EQ(2, sm_.LexCount(range));
  array = sm_.GetLexRange(range, 1, 1000, false);
  ASSERT_EQ(1, array.size());
  EXPECT_THAT(array.front(), Pair("a95", 1));
  sdsfree(range.min);

  range.min = range.max;
  EXPECT_EQ(0, sm_.LexCount(range));
  range.minex = 0;
  EXPECT_EQ(0, sm_.LexCount(range));
  sdsfree(range.max);

  range.maxex = 0;
  range.min = cminstring;
  range.max = sdsnew("a");
  EXPECT_EQ(0, sm_.LexCount(range));
  sdsfree(range.max);

  range.max = sdsnew("a0");
  EXPECT_EQ(1, sm_.LexCount(range));
  range.maxex = 1;
  EXPECT_EQ(0, sm_.LexCount(range));
  sdsfree(range.max);
}

TEST_F(SortedMapTest, ScoreRanges) {
  for (unsigned i = 0; i < 10; ++i) {
    ASSERT_TRUE(sm_.InsertNew(1, StrCat("a", i)));
  }

  for (unsigned i = 0; i < 10; ++i) {
    ASSERT_TRUE(sm_.InsertNew(2, StrCat("b", i)));
  }

  zrangespec range;
  range.max = 5;
  range.min = 1;
  range.maxex = 0;
  range.minex = 0;
  EXPECT_EQ(20, sm_.Count(range));
  detail::SortedMap::ScoredArray array = sm_.GetRange(range, 0, 1000, false);
  ASSERT_EQ(20, array.size());
  EXPECT_THAT(array.front(), Pair("a0", 1));
  EXPECT_THAT(array.back(), Pair("b9", 2));

  range.minex = 1;  // exclude all the "1" scores.
  EXPECT_EQ(10, sm_.Count(range));
  array = sm_.GetRange(range, 2, 1, false);
  ASSERT_EQ(1, array.size());
  EXPECT_THAT(array.front(), Pair("b2", 2));

  range.max = 1;
  range.minex = 0;
  range.min = -HUGE_VAL;
  EXPECT_EQ(10, sm_.Count(range));
  array = sm_.GetRange(range, 2, 2, true);
  ASSERT_EQ(2, array.size());
  EXPECT_THAT(array.back(), Pair("a6", 1));

  range.maxex = 1;
  EXPECT_EQ(0, sm_.Count(range));
  array = sm_.GetRange(range, 0, 2, true);
  ASSERT_EQ(0, array.size());

  range.min = 3;
  array = sm_.GetRange(range, 0, 2, true);
  ASSERT_EQ(0, array.size());
}

TEST_F(SortedMapTest, DeleteRange) {
  for (unsigned i = 0; i <= 100; ++i) {
    ASSERT_TRUE(sm_.InsertNew(i * 2, StrCat("a", i)));
  }

  zrangespec range;
  range.min = range.max = 200;
  range.minex = range.maxex = 1;
  EXPECT_EQ(0, sm_.DeleteRangeByScore(range));

  range.min = 199;
  EXPECT_EQ(0, sm_.DeleteRangeByScore(range));

  range.minex = 0;
  EXPECT_EQ(0, sm_.DeleteRangeByScore(range));

  range.max = 199;
  range.min = 198;
  EXPECT_EQ(1, sm_.DeleteRangeByScore(range));

  range.max = 197;
  range.min = 193;
  EXPECT_EQ(2, sm_.DeleteRangeByScore(range));

  EXPECT_EQ(2, sm_.DeleteRangeByRank(0, 1));

  zlexrangespec lex_range;
  lex_range.min = sdsnew("b");
  lex_range.max = sdsnew("c");
  EXPECT_EQ(0, sm_.DeleteRangeByLex(lex_range));

  sdsfree(lex_range.min);
  sdsfree(lex_range.max);
  lex_range.min = cminstring;
  lex_range.max = cmaxstring;
  EXPECT_EQ(96, sm_.DeleteRangeByLex(lex_range));
}

TEST_F(SortedMapTest, RangeBug) {
  constexpr size_t kArrLen = 80;
  for (unsigned i = 0; i < kArrLen; i++) {
    ASSERT_TRUE(sm_.InsertNew(i, StrCat("score", i)));
  }

  for (unsigned i = 0; i < kArrLen; i++) {
    zrangespec range;
    range.max = HUGE_VAL;
    range.min = i;
    range.minex = 0;
    range.maxex = 0;
    auto arr = sm_.GetRange(range, 0, 5, false);
    ASSERT_GT(arr.size(), 0) << i;
  }
}

uint64_t total_wasted_memory = 0;

TEST_F(SortedMapTest, ReallocIfNeeded) {
  auto build_str = [](size_t i) { return to_string(i) + string(131, 'a'); };

  auto count_waste = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                        size_t block_size, void* arg) {
    size_t used = block_size * area->used;
    total_wasted_memory += area->committed - used;
    return true;
  };

  for (size_t i = 0; i < 10'000; i++) {
    int out_flags;
    double new_val;
    auto str = build_str(i);
    sm_.AddElem(i, str, 0, &out_flags, &new_val);
  }

  for (size_t i = 0; i < 10'000; i++) {
    if (i % 10 == 0)
      continue;
    auto str = build_str(i);
    sds ele = sdsnew(str.c_str());
    sm_.Delete(ele);
    sdsfree(ele);
  }

  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_before = total_wasted_memory;

  PageUsage page_usage{CollectPageStats::NO, 9};
  ASSERT_TRUE(sm_.DefragIfNeeded(&page_usage));

  total_wasted_memory = 0;
  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_after = total_wasted_memory;

  // Check we waste significanlty less now
  EXPECT_GT(wasted_before, wasted_after * 2);

  ASSERT_EQ(sm_.Size(), 1000);
  auto cb = [i = 0, build_str](sds ele, double score) mutable -> bool {
    EXPECT_EQ(std::string_view(ele), build_str(i * 10));
    EXPECT_EQ((size_t)score, i * 10);
    ++i;
    return true;
  };

  sm_.Iterate(0, 10000, false, cb);
}

}  // namespace dfly


================================================
FILE: src/core/sse_port.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once
#if defined(__aarch64__)
#define SSE2NEON_SUPPRESS_WARNINGS
#include "base/sse2neon.h"
#elif defined(__riscv) || defined(__riscv__)
#include "base/sse2rvv.h"
#elif defined(__s390x__)
#include <vecintrin.h>
#else
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

namespace dfly {

#ifndef __s390x__
inline __m128i mm_loadu_si128(const __m128i* ptr) {
#if defined(__aarch64__)
  __m128i res;
  memcpy(&res, ptr, sizeof(res));
  return res;
// return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
#else
  return _mm_loadu_si128(ptr);
#endif
}
#endif

}  // namespace dfly


================================================
FILE: src/core/string_map.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/string_map.h"

#include "base/endian.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/sds_utils.h"

extern "C" {
#include "redis/zmalloc.h"
}

using namespace std;

namespace dfly {

namespace {

constexpr uint64_t kValTtlBit = 1ULL << 63;
constexpr uint64_t kValMask = ~kValTtlBit;

// Returns key, tagged value pair
pair<sds, uint64_t> CreateEntry(string_view field, string_view value, uint32_t time_now,
                                uint32_t ttl_sec) {
  // 8 additional bytes for a pointer to value.
  sds newkey;
  size_t meta_offset = field.size() + 1;
  sds sdsval = sdsnewlen(value.data(), value.size());
  uint64_t sdsval_tag = uint64_t(sdsval);

  if (ttl_sec == UINT32_MAX) {
    // The layout is:
    // key, '\0', 8-byte pointer to value
    newkey = AllocSdsWithSpace(field.size(), 8);
  } else {
    // The layout is:
    // key, '\0', 8-byte pointer to value, 4-byte absolute time.
    // the value pointer it tagged.
    newkey = AllocSdsWithSpace(field.size(), 8 + 4);
    uint32_t at = time_now + ttl_sec;
    absl::little_endian::Store32(newkey + meta_offset + 8, at);  // skip the value pointer.
    sdsval_tag |= kValTtlBit;
  }

  if (!field.empty()) {
    memcpy(newkey, field.data(), field.size());
  }

  absl::little_endian::Store64(newkey + meta_offset, sdsval_tag);
  return {newkey, sdsval_tag};
}

bool HasTtl(sds entry) {
  const uint64_t tag = absl::little_endian::Load64(entry + sdslen(entry) + 1);
  return (tag & kValTtlBit) != 0;
}

}  // namespace

StringMap::~StringMap() {
  Clear();
}

bool StringMap::AddOrUpdate(std::string_view field, std::string_view value, uint32_t ttl_sec,
                            bool keepttl) {
  sds prev = AddOrExchange(field, value, ttl_sec, keepttl);
  if (prev) {
    ObjDelete(prev, false);
    return false;
  }
  return true;
}

sds StringMap::AddOrExchange(std::string_view field, std::string_view value, uint32_t ttl_sec,
                             bool keepttl) {
  const uint32_t computed_ttl = ComputeTtl(field, ttl_sec, keepttl);
  auto [newkey, sdsval_tag] = CreateEntry(field, value, time_now(), computed_ttl);
  auto prev_entry = static_cast<sds>(AddOrReplaceObj(newkey, sdsval_tag & kValTtlBit));
  return prev_entry;
}

uint32_t StringMap::ComputeTtl(string_view field, uint32_t ttl_sec, bool keepttl) const {
  if (!keepttl)
    return ttl_sec;

  auto* prev = static_cast<sds>(FindInternal(&field, Hash(&field, 1), 1));
  if (!prev)
    return ttl_sec;

  if (!HasTtl(prev))
    return ttl_sec;

  return ObjExpireTime(prev) - time_now();
}

bool StringMap::AddOrSkip(std::string_view field, std::string_view value, uint32_t ttl_sec) {
  uint64_t hashcode = Hash(&field, 1);
  void* obj = FindInternal(&field, hashcode, 1);  // 1 - string_view

  if (obj)
    return false;

  auto [newkey, sdsval_tag] = CreateEntry(field, value, time_now(), ttl_sec);
  AddUnique(newkey, sdsval_tag & kValTtlBit, hashcode);
  return true;
}

bool StringMap::Erase(string_view key) {
  return EraseInternal(&key, 1);
}

StringMap::SdsEntry StringMap::Extract(string_view key) {
  return SdsEntry(static_cast<sds>(DetachInternal(const_cast<string_view*>(&key), 1)), DeleteEntry);
}

void StringMap::DeleteEntry(sds entry) {
  sds value = GetValue(entry);
  sdsfree(value);
  sdsfree(entry);
}

bool StringMap::Contains(string_view field) const {
  // 1 - means it's string_view. See ObjEqual for details.
  uint64_t hashcode = Hash(&field, 1);
  return FindInternal(&field, hashcode, 1) != nullptr;
}

optional<pair<sds, sds>> StringMap::RandomPair() {
  // Iteration may remove elements, and so we need to loop if we happen to reach the end
  while (true) {
    auto it = begin();

    // It may be that begin() will invalidate all elements, getting us to an Empty() state
    if (Empty()) {
      break;
    }

    it += rand() % UpperBoundSize();
    if (it != end()) {
      return std::make_pair(it->first, it->second);
    }
  }
  return nullopt;
}

void StringMap::RandomPairsUnique(unsigned int count, std::vector<sds>& keys,
                                  std::vector<sds>& vals, bool with_value) {
  unsigned int total_size = SizeSlow();
  unsigned int index = 0;
  if (count > total_size)
    count = total_size;

  auto itr = begin();
  uint32_t picked = 0, remaining = count;
  while (picked < count && itr != end()) {
    double random_double = ((double)rand()) / RAND_MAX;
    double threshold = ((double)remaining) / (total_size - index);
    if (random_double <= threshold) {
      keys.push_back(itr->first);
      if (with_value) {
        vals.push_back(itr->second);
      }
      remaining--;
      picked++;
    }
    ++itr;
    index++;
  }

  DCHECK(keys.size() == count);
  if (with_value)
    DCHECK(vals.size() == count);
}

void StringMap::RandomPairs(unsigned int count, std::vector<sds>& keys, std::vector<sds>& vals,
                            bool with_value) {
  using RandomPick = std::pair<unsigned int, unsigned int>;
  std::vector<RandomPick> picks;
  unsigned int total_size = SizeSlow();

  for (unsigned int i = 0; i < count; ++i) {
    RandomPick pick{rand() % total_size, i};
    picks.push_back(pick);
  }

  std::sort(picks.begin(), picks.end(), [](auto& x, auto& y) { return x.first < y.first; });

  unsigned int index = picks[0].first, pick_index = 0;
  auto itr = begin();
  for (unsigned int i = 0; i < index; ++i)
    ++itr;

  keys.resize(count);
  if (with_value)
    vals.resize(count);

  while (itr != end() && pick_index < count) {
    auto [key, val] = *itr;
    while (pick_index < count && index == picks[pick_index].first) {
      int store_order = picks[pick_index].second;
      keys[store_order] = key;
      if (with_value)
        vals[store_order] = val;
      ++pick_index;
    }
    ++index;
    ++itr;
  }
}

sds StringMap::GetValue(sds key) {
  char* valptr = key + sdslen(key) + 1;
  const uint64_t val = absl::little_endian::Load64(valptr);
  return (sds)(kValMask & val);
}

pair<sds, bool> StringMap::ReallocIfNeeded(void* obj, PageUsage* page_usage) {
  sds key = (sds)obj;
  size_t key_len = sdslen(key);

  auto* value_ptr = key + key_len + 1;
  uint64_t value_tag = absl::little_endian::Load64(value_ptr);
  sds value = (sds)(uint64_t(value_tag) & kValMask);

  bool realloced_value = false;

  // If the allocated value is underutilized, re-allocate it and update the pointer inside the key
  if (page_usage->IsPageForObjectUnderUtilized(value)) {
    size_t value_len = sdslen(value);
    sds new_value = sdsnewlen(value, value_len);
    memcpy(new_value, value, value_len);
    uint64_t new_value_tag = (uint64_t(new_value) & kValMask) | (value_tag & ~kValMask);
    absl::little_endian::Store64(value_ptr, new_value_tag);
    sdsfree(value);
    realloced_value = true;
  }

  if (!page_usage->IsPageForObjectUnderUtilized(key))
    return {key, realloced_value};

  size_t space_size = 8 /* value ptr */ + ((value_tag & kValTtlBit) ? 4 : 0) /* optional expiry */;

  sds new_key = AllocSdsWithSpace(key_len, space_size);
  memcpy(new_key, key, key_len + 1 /* \0 */ + space_size);
  sdsfree(key);

  return {new_key, true};
}

uint64_t StringMap::Hash(const void* obj, uint32_t cookie) const {
  DCHECK_LT(cookie, 2u);

  if (cookie == 0) {
    sds s = (sds)obj;
    return CompactObj::HashCode(string_view{s, sdslen(s)});
  }

  const string_view* sv = (const string_view*)obj;
  return CompactObj::HashCode(*sv);
}

bool StringMap::ObjEqual(const void* left, const void* right, uint32_t right_cookie) const {
  DCHECK_LT(right_cookie, 2u);

  sds s1 = (sds)left;
  if (right_cookie == 0) {
    sds s2 = (sds)right;

    if (sdslen(s1) != sdslen(s2)) {
      return false;
    }

    return sdslen(s1) == 0 || memcmp(s1, s2, sdslen(s1)) == 0;
  }

  const string_view* right_sv = (const string_view*)right;
  string_view left_sv{s1, sdslen(s1)};
  return left_sv == (*right_sv);
}

size_t StringMap::ObjectAllocSize(const void* obj) const {
  sds s1 = (sds)obj;
  size_t res = zmalloc_usable_size(sdsAllocPtr(s1));
  sds val = GetValue(s1);
  res += zmalloc_usable_size(sdsAllocPtr(val));

  return res;
}

uint32_t StringMap::ObjExpireTime(const void* obj) const {
  sds str = (sds)obj;
  const char* valptr = str + sdslen(str) + 1;

  uint64_t val = absl::little_endian::Load64(valptr);

  DCHECK(val & kValTtlBit);
  if (val & kValTtlBit) {
    return absl::little_endian::Load32(valptr + 8);
  }

  // Should not reach.
  return UINT32_MAX;
}

void StringMap::ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) {
  return SdsUpdateExpireTime(obj, time_now() + ttl_sec, 8);
}

void StringMap::ObjDelete(void* obj, bool has_ttl) const {
  sds s1 = (sds)obj;
  sds value = GetValue(s1);
  sdsfree(value);
  sdsfree(s1);
}

void* StringMap::ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const {
  uint32_t ttl_sec = add_ttl ? 0 : (has_ttl ? ObjExpireTime(obj) : UINT32_MAX);
  sds str = (sds)obj;
  auto pair = detail::SdsPair(str, GetValue(str));
  // Use explicit string_view constructor with length to preserve null characters
  string_view key_sv(pair->first, sdslen(pair->first));
  string_view value_sv(pair->second, sdslen(pair->second));
  auto [newkey, sdsval_tag] = CreateEntry(key_sv, value_sv, time_now(), ttl_sec);

  return (void*)newkey;
}

detail::SdsPair StringMap::iterator::BreakToPair(void* obj) {
  sds f = (sds)obj;
  return detail::SdsPair(f, GetValue(f));
}

bool StringMap::iterator::ReallocIfNeeded(PageUsage* page_usage) {
  auto* ptr = curr_entry_;
  if (ptr->IsLink()) {
    ptr = ptr->AsLink();
  }

  DCHECK(!ptr->IsEmpty());
  DCHECK(ptr->IsObject());

  auto* obj = ptr->GetObject();
  auto [new_obj, realloced] = static_cast<StringMap*>(owner_)->ReallocIfNeeded(obj, page_usage);
  ptr->SetObject(new_obj);

  return realloced;
}

}  // namespace dfly


================================================
FILE: src/core/string_map.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <optional>
#include <string_view>

#include "core/dense_set.h"

extern "C" {
#include "redis/sds.h"
}

namespace dfly {

class PageUsage;

namespace detail {

class SdsPair {
 public:
  SdsPair(sds k, sds v) : first(k), second(v) {
  }

  SdsPair* operator->() {
    return this;
  }

  const SdsPair* operator->() const {
    return this;
  }

  operator std::pair<std::string_view, std::string_view>() const {
    return {{first, sdslen(first)}, {second, sdslen(second)}};
  }

  const sds first;
  const sds second;
};

};  // namespace detail

class StringMap : public DenseSet {
 public:
  explicit StringMap(void* unused = nullptr) {
  }

  ~StringMap();

  class iterator : private DenseSet::IteratorBase {
    static detail::SdsPair BreakToPair(void* obj);

   public:
    iterator() : IteratorBase() {
    }

    explicit iterator(const IteratorBase& o) : IteratorBase(o) {
    }

    iterator(DenseSet* owner) : IteratorBase(owner, false) {
    }

    detail::SdsPair operator->() const {
      void* ptr = curr_entry_->GetObject();
      return BreakToPair(ptr);
    }

    detail::SdsPair operator*() const {
      void* ptr = curr_entry_->GetObject();
      return BreakToPair(ptr);
    }

    // Try reducing memory fragmentation of the value by re-allocating. Returns true if
    // re-allocation happened.
    bool ReallocIfNeeded(PageUsage* page_usage);

    iterator& operator++() {
      Advance();
      return *this;
    }

    // Advances at most `n` steps, but stops at end.
    iterator& operator+=(unsigned int n) {
      for (unsigned int i = 0; i < n; ++i) {
        if (curr_entry_ == nullptr) {
          break;
        }

        Advance();
      }
      return *this;
    }

    bool operator==(const iterator& b) const {
      if (owner_ == nullptr && b.owner_ == nullptr) {  // to allow comparison with end()
        return true;
      }
      return owner_ == b.owner_ && curr_entry_ == b.curr_entry_;
    }

    bool operator!=(const iterator& b) const {
      return !(*this == b);
    }

    using IteratorBase::ExpiryTime;
    using IteratorBase::HasExpiry;
    using IteratorBase::SetExpiryTime;
  };

  // Adds a new field or updates its value. Returns true if added, false if updated.
  bool AddOrUpdate(std::string_view field, std::string_view value, uint32_t ttl_sec = UINT32_MAX,
                   bool keepttl = false);

  // Like AddOrUpdate but on update returns the previous sds entry
  // instead of deleting it. Caller must free the returned entry via DeleteEntry().
  // Returns nullptr if a new field was added.
  sds AddOrExchange(std::string_view field, std::string_view value, uint32_t ttl_sec = UINT32_MAX,
                    bool keepttl = false);

  // Returns true if field was added
  // false, if already exists. In that case no update is done.
  bool AddOrSkip(std::string_view field, std::string_view value, uint32_t ttl_sec = UINT32_MAX);

  bool Erase(std::string_view s1);

  using SdsEntry = std::unique_ptr<char, void (*)(sds)>;

  // Removes and returns the sds entry for the given key without freeing it.
  // Returns nullptr if the key was not found.
  SdsEntry Extract(std::string_view s1);

  // Frees a StringMap sds entry (key + embedded value).
  static void DeleteEntry(sds entry);

  bool Contains(std::string_view s1) const;

  /// @brief  Returns value of the key or an empty iterator if key not found.
  /// @param key
  /// @return sds
  iterator Find(std::string_view member) {
    return iterator{FindIt(&member, 1)};
  }

  iterator begin() {
    return iterator{this};
  }

  iterator end() {
    return iterator{};
  }

  // Returns a random key value pair.
  // Returns key only if value is a nullptr.
  std::optional<std::pair<sds, sds>> RandomPair();

  // Randomly selects count of key value pairs. The selections are unique.
  // if count is larger than the total number of key value pairs, returns
  // every pair.
  // Executes at O(n) (i.e. slow for large sets).
  void RandomPairsUnique(unsigned int count, std::vector<sds>& keys, std::vector<sds>& vals,
                         bool with_value);

  // Randomly selects count of key value pairs. The select key value pairs
  // are allowed to have duplications.
  // Executes at O(n) (i.e. slow for large sets).
  void RandomPairs(unsigned int count, std::vector<sds>& keys, std::vector<sds>& vals,
                   bool with_value);

  static sds GetValue(sds key);

 private:
  // If keepttl is specified, performs a lookup for given field and computes ttl by comparing
  // existing expiry against time_now(). If keepttl is false, or field is not found, or it expires,
  // or the field has no ttl, returns ttl_sec. set_time() must have been called before computing
  // ttl.
  uint32_t ComputeTtl(std::string_view field, uint32_t ttl_sec, bool keepttl) const;

  // Reallocate key and/or value if their pages are underutilized.
  // Returns new pointer (stays same if key utilization is enough) and if reallocation happened.
  std::pair<sds, bool> ReallocIfNeeded(void* obj, PageUsage* page_usage);

  uint64_t Hash(const void* obj, uint32_t cookie) const final;
  bool ObjEqual(const void* left, const void* right, uint32_t right_cookie) const final;
  size_t ObjectAllocSize(const void* obj) const final;
  uint32_t ObjExpireTime(const void* obj) const final;
  void ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) override;
  void ObjDelete(void* obj, bool has_ttl) const override;
  void* ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const final;
};

}  // namespace dfly


================================================
FILE: src/core/string_map_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/string_map.h"

#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <gtest/gtest.h>
#include <mimalloc.h>

#include <algorithm>
#include <cstddef>
#include <memory_resource>
#include <random>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

#include "base/logging.h"
#include "core/compact_object.h"
#include "core/detail/stateless_allocator.h"
#include "core/page_usage/page_usage_stats.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {

using namespace std;

class StringMapTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitTLStatelessAllocMR(PMR_NS::get_default_resource());
  }

  static void TearDownTestSuite() {
    mi_heap_collect(mi_heap_get_backing(), true);

    auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                       size_t block_size, void* arg) {
      LOG(ERROR) << "Unfreed allocations: block_size " << block_size
                 << ", allocated: " << area->used * block_size;
      return true;
    };

    mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit,
                         nullptr);
  }

  StringMapTest() : mi_alloc_(mi_heap_get_backing()) {
  }

  void SetUp() override {
    sm_.reset(new StringMap(&mi_alloc_));
  }

  void TearDown() override {
    sm_.reset();
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
  }

  MiMemoryResource mi_alloc_;
  std::unique_ptr<StringMap> sm_;
};

TEST_F(StringMapTest, Basic) {
  EXPECT_TRUE(sm_->AddOrUpdate("foo", "bar"));
  EXPECT_TRUE(sm_->Contains("foo"));
  auto it = sm_->Find("foo");
  EXPECT_STREQ("bar", it->second);

  it = sm_->begin();
  EXPECT_STREQ("foo", it->first);
  EXPECT_STREQ("bar", it->second);
  ++it;
  EXPECT_TRUE(it == sm_->end());

  for (const auto& k_v : *sm_) {
    EXPECT_STREQ("foo", k_v.first);
    EXPECT_STREQ("bar", k_v.second);
  }

  size_t sz = sm_->ObjMallocUsed();
  EXPECT_FALSE(sm_->AddOrUpdate("foo", "baraaaaaaaaaaaa2"));
  EXPECT_GT(sm_->ObjMallocUsed(), sz);
  it = sm_->begin();
  EXPECT_STREQ("baraaaaaaaaaaaa2", it->second);

  EXPECT_FALSE(sm_->AddOrSkip("foo", "bar2"));
  EXPECT_STREQ("baraaaaaaaaaaaa2", it->second);
}

TEST_F(StringMapTest, EmptyFind) {
  sm_->Find("bar");
}

TEST_F(StringMapTest, Ttl) {
  EXPECT_TRUE(sm_->AddOrUpdate("bla", "val1", 1));
  EXPECT_FALSE(sm_->AddOrUpdate("bla", "val2", 1));
  sm_->set_time(1);
  EXPECT_TRUE(sm_->AddOrUpdate("bla", "val2", 1));
  EXPECT_EQ(1u, sm_->UpperBoundSize());

  EXPECT_FALSE(sm_->AddOrSkip("bla", "val3", 2));

  // set ttl to 2, meaning that the key will expire at time 3.
  EXPECT_TRUE(sm_->AddOrSkip("bla2", "val3", 2));
  EXPECT_TRUE(sm_->Contains("bla2"));

  sm_->set_time(3);
  auto it = sm_->begin();
  EXPECT_TRUE(it == sm_->end());
}

TEST_F(StringMapTest, IterateExpired) {
  EXPECT_TRUE(sm_->AddOrUpdate("k1", "v1", 1));
  EXPECT_TRUE(sm_->AddOrUpdate("k2", "v2", 1));
  sm_->set_time(1);
  auto it = sm_->begin();
  it += 1;
  EXPECT_EQ(it, sm_->end());
}

TEST_F(StringMapTest, SetFieldExpireHasExpiry) {
  EXPECT_TRUE(sm_->AddOrUpdate("k1", "v1", 5));
  auto k = sm_->Find("k1");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 5);
  k.SetExpiryTime(1);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 1);
}

TEST_F(StringMapTest, SetFieldExpireNoHasExpiry) {
  EXPECT_TRUE(sm_->AddOrUpdate("k1", "v1"));
  auto k = sm_->Find("k1");
  EXPECT_FALSE(k.HasExpiry());
  k.SetExpiryTime(1);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 1);
}

TEST_F(StringMapTest, Bug3973) {
  for (unsigned i = 0; i < 8; i++) {
    EXPECT_TRUE(sm_->AddOrUpdate(to_string(i), "val"));
  }
  for (unsigned i = 0; i < 8; i++) {
    auto k = sm_->Find(to_string(i));
    ASSERT_FALSE(k.HasExpiry());
    k.SetExpiryTime(1);
    EXPECT_EQ(k.ExpiryTime(), 1);
  }
  for (unsigned i = 100; i < 1000; i++) {
    EXPECT_TRUE(sm_->AddOrUpdate(to_string(i), "val"));
  }

  // make sure the first 8 keys have expiry set
  for (unsigned i = 0; i < 8; i++) {
    auto k = sm_->Find(to_string(i));
    ASSERT_TRUE(k.HasExpiry());
    EXPECT_EQ(k.ExpiryTime(), 1);
  }
}

TEST_F(StringMapTest, Bug3984) {
  for (unsigned i = 0; i < 6; i++) {
    EXPECT_TRUE(sm_->AddOrUpdate(to_string(i), "val"));
  }
  for (unsigned i = 0; i < 6; i++) {
    auto k = sm_->Find(to_string(i));
    ASSERT_FALSE(k.HasExpiry());
    k.SetExpiryTime(1);
    EXPECT_EQ(k.ExpiryTime(), 1);
  }

  for (unsigned i = 0; i < 6; i++) {
    EXPECT_FALSE(sm_->AddOrUpdate(to_string(i), "val"));
  }
}

unsigned total_wasted_memory = 0;

TEST_F(StringMapTest, ReallocIfNeeded) {
  auto build_str = [](size_t i) { return to_string(i) + string(131, 'a'); };

  auto count_waste = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                        size_t block_size, void* arg) {
    size_t used = block_size * area->used;
    total_wasted_memory += area->committed - used;
    return true;
  };

  for (size_t i = 0; i < 10'000; i++)
    sm_->AddOrUpdate(build_str(i), build_str(i + 1), i * 10 + 1);

  for (size_t i = 0; i < 10'000; i++) {
    if (i % 10 == 0)
      continue;
    sm_->Erase(build_str(i));
  }

  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_before = total_wasted_memory;

  size_t underutilized = 0;
  PageUsage page_usage{CollectPageStats::NO, 0.9};
  for (auto it = sm_->begin(); it != sm_->end(); ++it) {
    underutilized += page_usage.IsPageForObjectUnderUtilized(it->first);
    it.ReallocIfNeeded(&page_usage);
  }
  // Check there are underutilized pages
  CHECK_GT(underutilized, 0u);

  total_wasted_memory = 0;
  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_after = total_wasted_memory;

  // Check we waste significanlty less now
  EXPECT_GT(wasted_before, wasted_after * 2);

  EXPECT_EQ(sm_->UpperBoundSize(), 1000);
  for (size_t i = 0; i < 1000; i++)
    EXPECT_EQ(sm_->Find(build_str(i * 10))->second, build_str(i * 10 + 1));
}

TEST_F(StringMapTest, ExpiryChangesSize) {
  sm_->AddOrUpdate("field", "value");
  const size_t old_size = sm_->ObjMallocUsed();

  auto it = sm_->Find("field");
  it.SetExpiryTime(1);

  const size_t new_size = sm_->ObjMallocUsed();
  EXPECT_LT(old_size, new_size);

  sm_->AddOrUpdate("field", "value", 1);
  EXPECT_EQ(new_size, sm_->ObjMallocUsed());
}

TEST_F(StringMapTest, ExpiryWithMaxAndKeepTTL) {
  sm_->AddOrUpdate("field", "value", 100);
  auto k = sm_->Find("field");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 100);

  // ttl is copied from prev. if max value is supplied
  sm_->AddOrUpdate("field", "value", UINT32_MAX, true);
  k = sm_->Find("field");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 100);

  // max ttl value results in no expiry without keepttl
  sm_->AddOrUpdate("field", "value", UINT32_MAX);
  EXPECT_FALSE(sm_->Find("field").HasExpiry());

  // No prev. expiry, supplied ttl_sec value is used
  sm_->AddOrUpdate("field", "value", 10, true);
  k = sm_->Find("field");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 10);

  // object removed while adding due to expiry
  sm_->set_time(11);
  sm_->AddOrUpdate("field", "value", UINT32_MAX, true);
  k = sm_->Find("field");
  EXPECT_FALSE(k.HasExpiry());
}

TEST_F(StringMapTest, ExtractExisting) {
  sm_->AddOrUpdate("f1", "v1");
  sm_->AddOrUpdate("f2", "v2");
  EXPECT_EQ(sm_->UpperBoundSize(), 2u);

  auto entry = sm_->Extract("f1");
  ASSERT_TRUE(entry);

  // Verify the extracted entry has the correct value
  sds val = StringMap::GetValue(entry.get());
  EXPECT_EQ(string_view(val, sdslen(val)), "v1");

  // Verify it was removed from the map
  EXPECT_EQ(sm_->UpperBoundSize(), 1u);
  EXPECT_FALSE(sm_->Contains("f1"));
  EXPECT_TRUE(sm_->Contains("f2"));
}

TEST_F(StringMapTest, ExtractNonExisting) {
  sm_->AddOrUpdate("f1", "v1");
  auto entry = sm_->Extract("no_such_key");
  EXPECT_FALSE(entry);
  EXPECT_EQ(sm_->UpperBoundSize(), 1u);
}

TEST_F(StringMapTest, AddOrExchangeNew) {
  // Adding a new field returns nullptr (no previous entry)
  sds prev = sm_->AddOrExchange("f1", "v1");
  EXPECT_EQ(prev, nullptr);
  EXPECT_TRUE(sm_->Contains("f1"));
  EXPECT_STREQ(sm_->Find("f1")->second, "v1");
}

TEST_F(StringMapTest, AddOrExchangeReplace) {
  sm_->AddOrUpdate("f1", "old_value");
  EXPECT_EQ(sm_->UpperBoundSize(), 1u);

  sds prev = sm_->AddOrExchange("f1", "new_value");
  ASSERT_NE(prev, nullptr);

  // Verify the extracted entry has the old value
  sds val = StringMap::GetValue(prev);
  EXPECT_EQ(string_view(val, sdslen(val)), "old_value");

  // Verify map now has the new value
  EXPECT_STREQ(sm_->Find("f1")->second, "new_value");
  EXPECT_EQ(sm_->UpperBoundSize(), 1u);

  StringMap::DeleteEntry(prev);
}

TEST_F(StringMapTest, AddOrExchangeWithTtl) {
  sm_->AddOrUpdate("f1", "v1", 100);

  sds prev = sm_->AddOrExchange("f1", "v2", 200);
  ASSERT_NE(prev, nullptr);

  sds val = StringMap::GetValue(prev);
  EXPECT_EQ(string_view(val, sdslen(val)), "v1");

  // Make sure new entry has correct value and ttl
  auto it = sm_->Find("f1");
  EXPECT_STREQ(it->second, "v2");
  EXPECT_TRUE(it.HasExpiry());
  EXPECT_EQ(it.ExpiryTime(), 200u);

  StringMap::DeleteEntry(prev);
}

TEST_F(StringMapTest, ExtractMultiple) {
  for (unsigned i = 0; i < 20; i++) {
    sm_->AddOrUpdate(to_string(i), "val" + to_string(i));
  }
  EXPECT_EQ(sm_->UpperBoundSize(), 20u);

  // Extract every other entry
  vector<StringMap::SdsEntry> extracted;
  for (unsigned i = 0; i < 20; i += 2) {
    auto entry = sm_->Extract(to_string(i));
    ASSERT_TRUE(entry);
    extracted.push_back(std::move(entry));
  }

  EXPECT_EQ(sm_->UpperBoundSize(), 10u);

  // Verify remaining entries
  for (unsigned i = 1; i < 20; i += 2) {
    EXPECT_TRUE(sm_->Contains(to_string(i)));
  }
}

}  // namespace dfly


================================================
FILE: src/core/string_set.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/string_set.h"

#include "absl/flags/flag.h"
#include "core/compact_object.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/sds_utils.h"

extern "C" {
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

#include "base/logging.h"

using namespace std;

namespace dfly {

namespace {

inline bool MayHaveTtl(sds s) {
  char* alloc_ptr = (char*)sdsAllocPtr(s);
  return sdslen(s) + 1 + 4 <= zmalloc_usable_size(alloc_ptr);
}

sds AllocImmutableWithTtl(uint32_t len, uint32_t at) {
  sds res = AllocSdsWithSpace(len, sizeof(at));
  absl::little_endian::Store32(res + len + 1, at);  // Save TTL

  return res;
}

}  // namespace

StringSet::~StringSet() {
  Clear();
}

bool StringSet::Add(string_view src, uint32_t ttl_sec) {
  uint64_t hash = Hash(&src, 1);
  void* prev = FindInternal(&src, hash, 1);
  if (prev != nullptr) {
    return false;
  }

  sds newsds = MakeSetSds(src, ttl_sec);
  bool has_ttl = ttl_sec != UINT32_MAX;
  AddUnique(newsds, has_ttl, hash);
  return true;
}

unsigned StringSet::AddMany(absl::Span<std::string_view> span, uint32_t ttl_sec, bool keepttl) {
  std::string_view views[kMaxBatchLen];
  unsigned res = 0;
  if (BucketCount() < span.size()) {
    Reserve(span.size());
  }

  while (span.size() >= kMaxBatchLen) {
    for (size_t i = 0; i < kMaxBatchLen; i++)
      views[i] = span[i];

    span.remove_prefix(kMaxBatchLen);
    res += AddBatch(absl::MakeSpan(views), ttl_sec, keepttl);
  }

  if (span.size()) {
    for (size_t i = 0; i < span.size(); i++)
      views[i] = span[i];

    res += AddBatch(absl::MakeSpan(views, span.size()), ttl_sec, keepttl);
  }
  return res;
}

unsigned StringSet::AddBatch(absl::Span<std::string_view> span, uint32_t ttl_sec, bool keepttl) {
  uint64_t hash[kMaxBatchLen];
  bool has_ttl = ttl_sec != UINT32_MAX;
  unsigned count = span.size();
  unsigned res = 0;

  DCHECK_LE(count, kMaxBatchLen);

  for (size_t i = 0; i < count; i++) {
    hash[i] = CompactObj::HashCode(span[i]);
    Prefetch(hash[i]);
  }

  for (unsigned i = 0; i < count; ++i) {
    void* prev = FindInternal(&span[i], hash[i], 1);
    if (prev == nullptr) {
      ++res;
      sds field = MakeSetSds(span[i], ttl_sec);
      AddUnique(field, has_ttl, hash[i]);
    } else if (has_ttl && !keepttl) {
      ObjUpdateExpireTime(prev, ttl_sec);
    }
  }

  return res;
}

StringSet::iterator StringSet::GetRandomMember() {
  return iterator{DenseSet::GetRandomIterator()};
}

std::optional<std::string> StringSet::Pop() {
  sds str = (sds)PopInternal();

  if (str == nullptr) {
    return std::nullopt;
  }

  std::string ret{str, sdslen(str)};
  sdsfree(str);

  return ret;
}

uint32_t StringSet::Scan(uint32_t cursor, const std::function<void(const sds)>& func) const {
  return DenseSet::Scan(cursor, [func](const void* ptr) { func((sds)ptr); });
}

uint64_t StringSet::Hash(const void* ptr, uint32_t cookie) const {
  DCHECK_LT(cookie, 2u);

  if (cookie == 0) {
    sds s = (sds)ptr;
    return CompactObj::HashCode(string_view{s, sdslen(s)});
  }

  const string_view* sv = (const string_view*)ptr;
  return CompactObj::HashCode(*sv);
}

bool StringSet::ObjEqual(const void* left, const void* right, uint32_t right_cookie) const {
  DCHECK_LT(right_cookie, 2u);

  sds s1 = (sds)left;

  if (right_cookie == 0) {
    sds s2 = (sds)right;

    if (sdslen(s1) != sdslen(s2)) {
      return false;
    }

    return sdslen(s1) == 0 || memcmp(s1, s2, sdslen(s1)) == 0;
  }

  const string_view* right_sv = (const string_view*)right;
  string_view left_sv{s1, sdslen(s1)};
  return left_sv == (*right_sv);
}

size_t StringSet::ObjectAllocSize(const void* s1) const {
  return zmalloc_usable_size(sdsAllocPtr((sds)s1));
}

uint32_t StringSet::ObjExpireTime(const void* str) const {
  sds s = (sds)str;
  DCHECK(MayHaveTtl(s));

  char* ttlptr = s + sdslen(s) + 1;
  return absl::little_endian::Load32(ttlptr);
}

void StringSet::ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) {
  return SdsUpdateExpireTime(obj, time_now() + ttl_sec, 0);
}

void StringSet::ObjDelete(void* obj, bool has_ttl) const {
  sdsfree((sds)obj);
}

void* StringSet::ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const {
  sds src = (sds)obj;
  string_view sv{src, sdslen(src)};
  uint32_t ttl_sec = add_ttl ? 0 : (has_ttl ? ObjExpireTime(obj) : UINT32_MAX);
  return (void*)MakeSetSds(sv, ttl_sec);
}

sds StringSet::MakeSetSds(string_view src, uint32_t ttl_sec) const {
  if (ttl_sec != UINT32_MAX) {
    uint32_t at = time_now() + ttl_sec;

    sds newsds = AllocImmutableWithTtl(src.size(), at);
    if (!src.empty())
      memcpy(newsds, src.data(), src.size());
    return newsds;
  }

  return sdsnewlen(src.data(), src.size());
}

// Does not release obj. Callers must deallocate with sdsfree explicitly
pair<sds, bool> StringSet::DuplicateEntryIfFragmented(void* obj, PageUsage* page_usage) {
  sds key = (sds)obj;

  if (!page_usage->IsPageForObjectUnderUtilized(key))
    return {key, false};

  size_t key_len = sdslen(key);
  bool has_ttl = MayHaveTtl(key);

  if (has_ttl) {
    sds res = AllocSdsWithSpace(key_len, sizeof(uint32_t));
    std::memcpy(res, key, key_len + sizeof(uint32_t));
    return {res, true};
  }

  return {sdsnewlen(key, key_len), true};
}

bool StringSet::iterator::ReallocIfNeeded(PageUsage* page_usage) {
  auto* ptr = curr_entry_;
  if (ptr->IsLink()) {
    ptr = ptr->AsLink();
  }

  DCHECK(!ptr->IsEmpty());
  DCHECK(ptr->IsObject());

  auto* obj = ptr->GetObject();
  auto [new_obj, realloced] =
      static_cast<StringSet*>(owner_)->DuplicateEntryIfFragmented(obj, page_usage);

  if (realloced) {
    ptr->SetObject(new_obj);
    sdsfree((sds)obj);
  }

  return realloced;
}

}  // namespace dfly


================================================
FILE: src/core/string_set.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <cstdint>
#include <functional>
#include <optional>
#include <string_view>

#include "core/dense_set.h"

extern "C" {
#include "redis/sds.h"
}

namespace dfly {
class PageUsage;

class StringSet : public DenseSet {
 public:
  StringSet() = default;

  ~StringSet();

  // Returns true if elem was added.
  bool Add(std::string_view s1, uint32_t ttl_sec = UINT32_MAX);

  unsigned AddMany(absl::Span<std::string_view> span, uint32_t ttl_sec, bool keepttl);

  bool Erase(std::string_view str) {
    return EraseInternal(&str, 1);
  }

  bool Contains(std::string_view s1) const {
    return FindInternal(&s1, Hash(&s1, 1), 1) != nullptr;
  }

  class iterator : private IteratorBase {
   public:
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = sds;
    using pointer = sds*;
    using reference = sds&;

    explicit iterator(const IteratorBase& o) : IteratorBase(o) {
    }

    iterator() : IteratorBase() {
    }

    iterator(DenseSet* set) : IteratorBase(set, false) {
    }

    iterator& operator++() {
      Advance();
      return *this;
    }

    bool operator==(const iterator& b) const {
      if (owner_ == nullptr && b.owner_ == nullptr) {  // to allow comparison with end()
        return true;
      }
      return owner_ == b.owner_ && curr_entry_ == b.curr_entry_;
    }

    bool operator!=(const iterator& b) const {
      return !(*this == b);
    }

    value_type operator*() {
      return (value_type)curr_entry_->GetObject();
    }

    value_type operator->() {
      return (value_type)curr_entry_->GetObject();
    }

    using IteratorBase::ExpiryTime;
    using IteratorBase::HasExpiry;
    using IteratorBase::SetExpiryTime;

    // Try reducing memory fragmentation of the value by re-allocating. Returns true if
    // re-allocation happened.
    bool ReallocIfNeeded(PageUsage* page_usage);
  };

  iterator begin() {
    return iterator{this};
  }

  iterator end() {
    return iterator{};
  }

  // See DenseSet::GetRandomIterator
  iterator GetRandomMember();

  std::optional<std::string> Pop();

  uint32_t Scan(uint32_t, const std::function<void(sds)>&) const;

  iterator Find(std::string_view member) {
    return iterator{FindIt(&member, 1)};
  }

 protected:
  uint64_t Hash(const void* ptr, uint32_t cookie) const override;

  unsigned AddBatch(absl::Span<std::string_view> span, uint32_t ttl_sec, bool keepttl);

  bool ObjEqual(const void* left, const void* right, uint32_t right_cookie) const override;

  size_t ObjectAllocSize(const void* s1) const override;
  uint32_t ObjExpireTime(const void* obj) const override;
  void ObjUpdateExpireTime(const void* obj, uint32_t ttl_sec) override;
  void ObjDelete(void* obj, bool has_ttl) const override;
  void* ObjectClone(const void* obj, bool has_ttl, bool add_ttl) const override;
  sds MakeSetSds(std::string_view src, uint32_t ttl_sec) const;

 private:
  std::pair<sds, bool> DuplicateEntryIfFragmented(void* obj, PageUsage* page_usage);
};

}  // end namespace dfly


================================================
FILE: src/core/string_set_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/string_set.h"

#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <mimalloc.h>

#include <algorithm>
#include <memory_resource>
#include <random>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/page_usage/page_usage_stats.h"
#include "redis/sds.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly {

using namespace std;
using absl::StrCat;

class DenseSetAllocator : public PMR_NS::memory_resource {
 public:
  bool all_freed() const {
    return alloced_ == 0;
  }

  void* do_allocate(size_t bytes, size_t alignment) override {
    alloced_ += bytes;
    void* p = PMR_NS::new_delete_resource()->allocate(bytes, alignment);
    return p;
  }

  void do_deallocate(void* p, size_t bytes, size_t alignment) override {
    alloced_ -= bytes;
    return PMR_NS::new_delete_resource()->deallocate(p, bytes, alignment);
  }

  bool do_is_equal(const PMR_NS::memory_resource& other) const noexcept override {
    return PMR_NS::new_delete_resource()->is_equal(other);
  }

 private:
  size_t alloced_ = 0;
};

class StringSetTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    auto* tlh = mi_heap_get_backing();
    init_zmalloc_threadlocal(tlh);
    InitTLStatelessAllocMR(PMR_NS::get_default_resource());
  }

  static void TearDownTestSuite() {
  }

  void SetUp() override {
    ss_ = new StringSet;
    generator_.seed(0);
  }

  void TearDown() override {
    delete ss_;

    // ensure there are no memory leaks after every test
    EXPECT_TRUE(alloc_.all_freed());
    EXPECT_EQ(zmalloc_used_memory_tl, 0);
  }

  StringSet* ss_;
  DenseSetAllocator alloc_;
  mt19937 generator_;
};

TEST_F(StringSetTest, Basic) {
  EXPECT_TRUE(ss_->Add("foo"sv));
  EXPECT_TRUE(ss_->Add("bar"sv));
  EXPECT_FALSE(ss_->Add("foo"sv));
  EXPECT_FALSE(ss_->Add("bar"sv));
  EXPECT_TRUE(ss_->Contains("foo"sv));
  EXPECT_TRUE(ss_->Contains("bar"sv));
  EXPECT_EQ(2, ss_->UpperBoundSize());
}

TEST_F(StringSetTest, StandardAddErase) {
  EXPECT_TRUE(ss_->Add("@@@@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Add("A@@@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Add("AA@@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Add("AAA@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Add("AAAAAAAAA@@@@@@@"));
  EXPECT_TRUE(ss_->Add("AAAAAAAAAA@@@@@@"));
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAA@"));
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAAA"));
  EXPECT_TRUE(ss_->Add("AAAAAAAAAAAAAAAD"));
  EXPECT_TRUE(ss_->Add("BBBBBAAAAAAAAAAA"));
  EXPECT_TRUE(ss_->Add("BBBBBBBBAAAAAAAA"));
  EXPECT_TRUE(ss_->Add("CCCCCBBBBBBBBBBB"));

  // Remove link in the middle of chain
  EXPECT_TRUE(ss_->Erase("BBBBBBBBAAAAAAAA"));
  // Remove start of a chain
  EXPECT_TRUE(ss_->Erase("CCCCCBBBBBBBBBBB"));
  // Remove end of link
  EXPECT_TRUE(ss_->Erase("AAA@@@@@@@@@@@@@"));
  // Remove only item in chain
  EXPECT_TRUE(ss_->Erase("AA@@@@@@@@@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAA@@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAAA@@@@@@"));
  EXPECT_TRUE(ss_->Erase("AAAAAAAAAAAAAAA@"));
}

TEST_F(StringSetTest, DisplacedBug) {
  string_view vals[] = {"imY", "OVl", "NhH", "BCe", "YDL", "lpb",
                        "nhF", "xod", "zYR", "PSa", "hce", "cTR"};
  ss_->AddMany(absl::MakeSpan(vals), UINT32_MAX, false);

  ss_->Add("fIc");
  ss_->Erase("YDL");
  ss_->Add("fYs");
  ss_->Erase("hce");
  ss_->Erase("nhF");
  ss_->Add("dye");
  ss_->Add("xZT");
  ss_->Add("LVK");
  ss_->Erase("zYR");
  ss_->Erase("fYs");
  ss_->Add("ueB");
  ss_->Erase("PSa");
  ss_->Erase("OVl");
  ss_->Add("cga");
  ss_->Add("too");
  ss_->Erase("ueB");
  ss_->Add("HZe");
  ss_->Add("oQn");
  ss_->Erase("too");
  ss_->Erase("HZe");
  ss_->Erase("xZT");
  ss_->Erase("cga");
  ss_->Erase("cTR");
  ss_->Erase("BCe");
  ss_->Add("eua");
  ss_->Erase("lpb");
  ss_->Add("OXK");
  ss_->Add("QmO");
  ss_->Add("SzV");
  ss_->Erase("QmO");
  ss_->Add("jbe");
  ss_->Add("BPN");
  ss_->Add("OfH");
  ss_->Add("Muf");
  ss_->Add("CwP");
  ss_->Erase("Muf");
  ss_->Erase("xod");
  ss_->Add("Cis");
  ss_->Add("Xvd");
  ss_->Erase("SzV");
  ss_->Erase("eua");
  ss_->Add("DGb");
  ss_->Add("leD");
  ss_->Add("MVX");
  ss_->Add("HPq");
}

static string random_string(mt19937& rand, unsigned len) {
  const string_view alpanum = "1234567890abcdefghijklmnopqrstuvwxyz";
  string ret;
  ret.reserve(len);

  for (size_t i = 0; i < len; ++i) {
    ret += alpanum[rand() % alpanum.size()];
  }

  return ret;
}

TEST_F(StringSetTest, Resizing) {
  constexpr size_t num_strs = 4096;
  unordered_set<string> strs;
  while (strs.size() != num_strs) {
    auto str = random_string(generator_, 10);
    strs.insert(str);
  }

  unsigned size = 0;
  for (auto it = strs.begin(); it != strs.end(); ++it) {
    const auto& str = *it;
    EXPECT_TRUE(ss_->Add(str, 1));
    EXPECT_EQ(ss_->UpperBoundSize(), size + 1);

    // make sure we haven't lost any items after a grow
    // which happens every power of 2
    if ((size & (size - 1)) == 0) {
      for (auto j = strs.begin(); j != it; ++j) {
        const auto& str = *j;
        auto it = ss_->Find(str);
        ASSERT_TRUE(it != ss_->end());
        EXPECT_TRUE(it.HasExpiry());
        EXPECT_EQ(it.ExpiryTime(), ss_->time_now() + 1);
      }
    }
    ++size;
  }
}

TEST_F(StringSetTest, SimpleScan) {
  unordered_set<string_view> info = {"foo", "bar"};
  unordered_set<string_view> seen;

  for (auto str : info) {
    EXPECT_TRUE(ss_->Add(str));
  }

  uint32_t cursor = 0;
  do {
    cursor = ss_->Scan(cursor, [&](const sds ptr) {
      sds s = (sds)ptr;
      string_view str{s, sdslen(s)};
      EXPECT_TRUE(info.count(str));
      seen.insert(str);
    });
  } while (cursor != 0);

  EXPECT_TRUE(seen.size() == info.size() && equal(seen.begin(), seen.end(), info.begin()));
}

// Ensure REDIS scan guarantees are met
TEST_F(StringSetTest, ScanGuarantees) {
  unordered_set<string_view> to_be_seen = {"foo", "bar"};
  unordered_set<string_view> not_be_seen = {"AAA", "BBB"};
  unordered_set<string_view> maybe_seen = {"AA@@@@@@@@@@@@@@", "AAA@@@@@@@@@@@@@",
                                           "AAAAAAAAA@@@@@@@", "AAAAAAAAAA@@@@@@"};
  unordered_set<string_view> seen;

  auto scan_callback = [&](const sds ptr) {
    sds s = (sds)ptr;
    string_view str{s, sdslen(s)};
    EXPECT_TRUE(to_be_seen.count(str) || maybe_seen.count(str));
    EXPECT_FALSE(not_be_seen.count(str));
    if (to_be_seen.count(str)) {
      seen.insert(str);
    }
  };

  EXPECT_EQ(ss_->Scan(0, scan_callback), 0);

  for (auto str : not_be_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  for (auto str : not_be_seen) {
    EXPECT_TRUE(ss_->Erase(str));
  }

  for (auto str : to_be_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  // should reach at least the first item in the set
  uint32_t cursor = ss_->Scan(0, scan_callback);

  for (auto str : maybe_seen) {
    EXPECT_TRUE(ss_->Add(str));
  }

  while (cursor != 0) {
    cursor = ss_->Scan(cursor, scan_callback);
  }

  EXPECT_TRUE(seen.size() == to_be_seen.size());
}

TEST_F(StringSetTest, IntOnly) {
  constexpr size_t num_ints = 8192;
  unordered_set<unsigned int> numbers;
  for (size_t i = 0; i < num_ints; ++i) {
    numbers.insert(i);
    EXPECT_TRUE(ss_->Add(to_string(i)));
  }

  for (size_t i = 0; i < num_ints; ++i) {
    ASSERT_FALSE(ss_->Add(to_string(i)));
  }

  size_t num_remove = generator_() % 4096;
  unordered_set<string> removed;

  for (size_t i = 0; i < num_remove; ++i) {
    auto remove_int = generator_() % num_ints;
    auto remove = to_string(remove_int);
    if (numbers.count(remove_int)) {
      ASSERT_TRUE(ss_->Contains(remove)) << remove_int;
      EXPECT_TRUE(ss_->Erase(remove));
      numbers.erase(remove_int);
    } else {
      EXPECT_FALSE(ss_->Erase(remove));
    }

    EXPECT_FALSE(ss_->Contains(remove));
    removed.insert(remove);
  }

  size_t expected_seen = 0;
  auto scan_callback = [&](const sds ptr) {
    string str{ptr, sdslen(ptr)};
    EXPECT_FALSE(removed.count(str));

    if (numbers.count(atoi(str.data()))) {
      ++expected_seen;
    }
  };

  uint32_t cursor = 0;
  do {
    cursor = ss_->Scan(cursor, scan_callback);
    // randomly throw in some new numbers
    uint32_t val = generator_();
    VLOG(1) << "Val " << val;
    ss_->Add(to_string(val));
  } while (cursor != 0);

  EXPECT_GE(expected_seen + removed.size(), num_ints);
}

TEST_F(StringSetTest, XtremeScanGrow) {
  unordered_set<string> to_see, force_grow, seen;

  while (to_see.size() != 8) {
    to_see.insert(random_string(generator_, 10));
  }

  while (force_grow.size() != 8192) {
    string str = random_string(generator_, 10);

    if (to_see.count(str)) {
      continue;
    }

    force_grow.insert(random_string(generator_, 10));
  }

  for (auto& str : to_see) {
    EXPECT_TRUE(ss_->Add(str));
  }

  auto scan_callback = [&](const sds ptr) {
    sds s = (sds)ptr;
    string_view str{s, sdslen(s)};
    if (to_see.count(string(str))) {
      seen.insert(string(str));
    }
  };

  uint32_t cursor = ss_->Scan(0, scan_callback);

  // force approx 10 grows
  for (auto& s : force_grow) {
    EXPECT_TRUE(ss_->Add(s));
  }

  while (cursor != 0) {
    cursor = ss_->Scan(cursor, scan_callback);
  }

  EXPECT_EQ(seen.size(), to_see.size());
}

TEST_F(StringSetTest, Pop) {
  constexpr size_t num_items = 8;
  unordered_set<string> to_insert;

  while (to_insert.size() != num_items) {
    auto str = random_string(generator_, 10);
    if (to_insert.count(str)) {
      continue;
    }

    to_insert.insert(str);
    EXPECT_TRUE(ss_->Add(str));
  }

  while (!ss_->Empty()) {
    size_t size = ss_->UpperBoundSize();
    auto str = ss_->Pop();
    DCHECK(ss_->UpperBoundSize() == to_insert.size() - 1);
    DCHECK(str.has_value());
    DCHECK(to_insert.count(str.value()));
    DCHECK_EQ(ss_->UpperBoundSize(), size - 1);
    to_insert.erase(str.value());
  }

  DCHECK(ss_->Empty());
  DCHECK(to_insert.empty());
}

TEST_F(StringSetTest, Iteration) {
  ss_->Add("foo");
  for (const sds ptr : *ss_) {
    LOG(INFO) << ptr;
  }
  ss_->Clear();
  constexpr size_t num_items = 8192;
  unordered_set<string> to_insert;

  while (to_insert.size() != num_items) {
    auto str = random_string(generator_, 10);
    if (to_insert.count(str)) {
      continue;
    }

    to_insert.insert(str);
    EXPECT_TRUE(ss_->Add(str));
  }

  for (const sds ptr : *ss_) {
    string str{ptr, sdslen(ptr)};
    EXPECT_TRUE(to_insert.count(str));
    to_insert.erase(str);
  }

  EXPECT_EQ(to_insert.size(), 0);
}

TEST_F(StringSetTest, SetFieldExpireHasExpiry) {
  EXPECT_TRUE(ss_->Add("k1", 100));
  auto k = ss_->Find("k1");
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 100);
  k.SetExpiryTime(1);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 1);
}

TEST_F(StringSetTest, SetFieldExpireNoHasExpiry) {
  EXPECT_TRUE(ss_->Add("k1"));
  auto k = ss_->Find("k1");
  EXPECT_FALSE(k.HasExpiry());
  k.SetExpiryTime(10);
  EXPECT_TRUE(k.HasExpiry());
  EXPECT_EQ(k.ExpiryTime(), 10);
}

TEST_F(StringSetTest, Ttl) {
  EXPECT_TRUE(ss_->Add("bla"sv, 1));
  EXPECT_FALSE(ss_->Add("bla"sv, 1));
  auto it = ss_->Find("bla"sv);
  EXPECT_EQ(1u, it.ExpiryTime());

  ss_->set_time(1);
  EXPECT_TRUE(ss_->Add("bla"sv, 1));
  EXPECT_EQ(1u, ss_->UpperBoundSize());

  for (unsigned i = 0; i < 100; ++i) {
    EXPECT_TRUE(ss_->Add(StrCat("foo", i), 1));
  }
  EXPECT_EQ(101u, ss_->UpperBoundSize());
  it = ss_->Find("foo50");
  EXPECT_STREQ("foo50", *it);
  EXPECT_EQ(2u, it.ExpiryTime());

  ss_->set_time(2);
  for (unsigned i = 0; i < 100; ++i) {
    EXPECT_TRUE(ss_->Add(StrCat("bar", i)));
  }
  it = ss_->Find("bar50");
  EXPECT_FALSE(it.HasExpiry());

  for (auto it = ss_->begin(); it != ss_->end(); ++it) {
    ASSERT_TRUE(absl::StartsWith(*it, "bar")) << *it;
    string str = *it;
    VLOG(1) << *it;
  }
}

TEST_F(StringSetTest, Grow) {
  for (size_t j = 0; j < 10; ++j) {
    for (size_t i = 0; i < 4098; ++i) {
      ss_->Reserve(generator_() % 256);
      auto str = random_string(generator_, 3);
      ss_->Add(str);
    }
    ss_->Clear();
  }
}

TEST_F(StringSetTest, Reserve) {
  vector<string> strs;

  for (size_t i = 0; i < 10; ++i) {
    strs.push_back(random_string(generator_, 10));
    ss_->Add(strs.back());
  }

  for (size_t j = 2; j < 20; j += 3) {
    ss_->Reserve(j * 20);
    for (size_t i = 0; i < 10; ++i) {
      ASSERT_TRUE(ss_->Contains(strs[i]));
    }
  }
}

TEST_F(StringSetTest, Fill) {
  for (size_t i = 0; i < 100; ++i) {
    ss_->Add(random_string(generator_, 10));
  }
  StringSet s2;
  ss_->Fill(&s2);
  EXPECT_EQ(s2.UpperBoundSize(), ss_->UpperBoundSize());
  for (sds str : *ss_) {
    EXPECT_TRUE(s2.Contains(str));
  }
}

TEST_F(StringSetTest, ClearResetsObjMallocUsed) {
  // Add some items
  for (size_t i = 0; i < 100; ++i) {
    ss_->Add(random_string(generator_, 10));
  }

  // Verify ObjMallocUsed() > 0 after adding items
  EXPECT_GT(ss_->ObjMallocUsed(), 0u);
  EXPECT_GT(ss_->UpperBoundSize(), 0u);

  // Clear the set
  ss_->Clear();

  // Verify ObjMallocUsed() is reset to 0 after Clear
  EXPECT_EQ(ss_->ObjMallocUsed(), 0u);
  EXPECT_EQ(ss_->UpperBoundSize(), 0u);
}

TEST_F(StringSetTest, IterateEmpty) {
  for (const auto& s : *ss_) {
    // We're iterating to make sure there is no crash. However, if we got here, it's a bug
    CHECK(false) << "Found entry " << s << " in empty set";
  }
}

static size_t MemUsed(StringSet& obj) {
  return obj.ObjMallocUsed() + obj.SetMallocUsed();
}

void BM_Clone(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  StringSet ss1, ss2;
  unsigned elems = state.range(0);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, 10);
    ss1.Add(str);
  }
  ss2.Reserve(ss1.UpperBoundSize());
  while (state.KeepRunning()) {
    for (auto src : ss1) {
      ss2.Add(src);
    }
    state.PauseTiming();
    ss2.Clear();
    ss2.Reserve(ss1.UpperBoundSize());
    state.ResumeTiming();
  }
}
BENCHMARK(BM_Clone)->ArgName("elements")->Arg(32000);

void BM_Fill(benchmark::State& state) {
  unsigned elems = state.range(0);
  vector<string> strs;
  mt19937 generator(0);
  StringSet ss1, ss2;
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, 10);
    ss1.Add(str);
  }

  while (state.KeepRunning()) {
    ss1.Fill(&ss2);
    state.PauseTiming();
    ss2.Clear();
    state.ResumeTiming();
  }
}
BENCHMARK(BM_Fill)->ArgName("elements")->Arg(32000);

void BM_Clear(benchmark::State& state) {
  unsigned elems = state.range(0);
  mt19937 generator(0);
  StringSet ss;
  while (state.KeepRunning()) {
    state.PauseTiming();
    for (size_t i = 0; i < elems; ++i) {
      string str = random_string(generator, 16);
      ss.Add(str);
    }
    state.ResumeTiming();
    ss.Clear();
  }
}
BENCHMARK(BM_Clear)->ArgName("elements")->Arg(32000);

void BM_Add(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  StringSet ss;
  unsigned elems = state.range(0);
  unsigned keySize = state.range(1);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, keySize);
    strs.push_back(str);
  }
  ss.Reserve(elems);
  size_t mem_used = 0;
  while (state.KeepRunning()) {
    for (auto& str : strs)
      ss.Add(str);
    state.PauseTiming();
    mem_used += MemUsed(ss);
    ss.Clear();
    ss.Reserve(elems);
    state.ResumeTiming();
  }
  state.counters["Memory_Used"] = mem_used / state.iterations();
}
BENCHMARK(BM_Add)
    ->ArgNames({"elements", "Key Size"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_AddMany(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  StringSet ss;
  unsigned elems = state.range(0);
  unsigned keySize = state.range(1);
  for (size_t i = 0; i < elems; ++i) {
    string str = random_string(generator, keySize);
    strs.push_back(str);
  }
  ss.Reserve(elems);
  vector<string_view> svs;
  for (const auto& str : strs) {
    svs.push_back(str);
  }
  size_t mem_used = 0;
  while (state.KeepRunning()) {
    ss.AddMany(absl::MakeSpan(svs), UINT32_MAX, false);
    state.PauseTiming();
    CHECK_EQ(ss.UpperBoundSize(), elems);
    mem_used += MemUsed(ss);
    ss.Clear();
    ss.Reserve(elems);
    state.ResumeTiming();
  }
  state.counters["Memory_Used"] = mem_used / state.iterations();
}
BENCHMARK(BM_AddMany)
    ->ArgNames({"elements", "Key Size"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Erase(benchmark::State& state) {
  std::vector<std::string> strs;
  mt19937 generator(0);
  StringSet ss;
  auto elems = state.range(0);
  auto keySize = state.range(1);
  for (long int i = 0; i < elems; ++i) {
    std::string str = random_string(generator, keySize);
    strs.push_back(str);
    ss.Add(str);
  }
  state.counters["Memory_Before_Erase"] = MemUsed(ss);
  size_t mem_used = 0;
  while (state.KeepRunning()) {
    for (auto& str : strs) {
      ss.Erase(str);
    }
    state.PauseTiming();
    mem_used += MemUsed(ss);
    for (auto& str : strs) {
      ss.Add(str);
    }
    state.ResumeTiming();
  }
  state.counters["Memory_After_Erase"] = mem_used / state.iterations();
}
BENCHMARK(BM_Erase)
    ->ArgNames({"elements", "Key Size"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Get(benchmark::State& state) {
  std::vector<std::string> strs;
  mt19937 generator(0);
  StringSet ss;
  auto elems = state.range(0);
  auto keySize = state.range(1);
  for (long int i = 0; i < elems; ++i) {
    std::string str = random_string(generator, keySize);
    strs.push_back(str);
    ss.Add(str);
  }
  while (state.KeepRunning()) {
    for (auto& str : strs) {
      ss.Find(str);
    }
  }
}
BENCHMARK(BM_Get)
    ->ArgNames({"elements", "Key Size"})
    ->ArgsProduct({{1000, 10000, 100000}, {10, 100, 1000}});

void BM_Grow(benchmark::State& state) {
  vector<string> strs;
  mt19937 generator(0);
  StringSet src;
  unsigned elems = 1 << 18;
  for (size_t i = 0; i < elems; ++i) {
    src.Add(random_string(generator, 16), UINT32_MAX);
    strs.push_back(random_string(generator, 16));
  }

  while (state.KeepRunning()) {
    state.PauseTiming();
    StringSet tmp;
    src.Fill(&tmp);
    CHECK_EQ(tmp.BucketCount(), elems);
    state.ResumeTiming();
    for (const auto& str : strs) {
      tmp.Add(str);
      if (tmp.BucketCount() > elems) {
        break;  // we grew
      }
    }

    CHECK_GT(tmp.BucketCount(), elems);
  }
}
BENCHMARK(BM_Grow);

void BM_Spop1000(benchmark::State& state) {
  mt19937 generator(0);
  StringSet src;
  unsigned elems = 1 << 14;
  for (size_t i = 0; i < elems; ++i) {
    src.Add(random_string(generator, 16), UINT32_MAX);
  }

  auto sparseness = state.range(0);
  while (state.KeepRunning()) {
    state.PauseTiming();
    StringSet tmp;
    src.Fill(&tmp);
    tmp.Reserve(elems * sparseness);
    state.ResumeTiming();
    for (int i = 0; i < 1000; ++i) {
      tmp.Pop();
    }
  }
}
BENCHMARK(BM_Spop1000)->ArgName("sparseness")->ArgsProduct({{1, 4, 10}});

unsigned total_wasted_memory = 0;

TEST_F(StringSetTest, ReallocIfNeeded) {
  auto build_str = [](size_t i) { return to_string(i) + string(131, 'a'); };

  auto count_waste = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                        size_t block_size, void* arg) {
    size_t used = block_size * area->used;
    total_wasted_memory += area->committed - used;
    return true;
  };

  for (size_t i = 0; i < 10'000; i++)
    ss_->Add(build_str(i));

  for (size_t i = 0; i < 10'000; i++) {
    if (i % 10 == 0)
      continue;
    ss_->Erase(build_str(i));
  }

  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_before = total_wasted_memory;

  size_t underutilized = 0;
  PageUsage page_usage{CollectPageStats::NO, 0.9};
  for (auto it = ss_->begin(); it != ss_->end(); ++it) {
    underutilized += page_usage.IsPageForObjectUnderUtilized(*it);
    it.ReallocIfNeeded(&page_usage);
  }
  // Check there are underutilized pages
  CHECK_GT(underutilized, 0u);

  total_wasted_memory = 0;
  mi_heap_collect(mi_heap_get_backing(), true);
  mi_heap_visit_blocks(mi_heap_get_backing(), false, count_waste, nullptr);
  size_t wasted_after = total_wasted_memory;

  // Check we waste significanlty less now
  EXPECT_GT(wasted_before, wasted_after * 2);

  EXPECT_EQ(ss_->UpperBoundSize(), 1000);
  for (size_t i = 0; i < 1000; i++)
    EXPECT_EQ(*ss_->Find(build_str(i * 10)), build_str(i * 10));
}

TEST_F(StringSetTest, TransferTTLFlagLinkToObjectOnDelete) {
  for (size_t i = 0; i < 10; i++) {
    EXPECT_TRUE(ss_->Add(absl::StrCat(i), 1));
  }
  for (size_t i = 0; i < 9; i++) {
    EXPECT_TRUE(ss_->Erase(absl::StrCat(i)));
  }
  auto it = ss_->Find("9"sv);
  EXPECT_TRUE(it.HasExpiry());
  EXPECT_EQ(1u, it.ExpiryTime());
}

class ShrinkTest : public StringSetTest, public ::testing::WithParamInterface<size_t> {};

TEST_P(ShrinkTest, BasicShrink) {
  constexpr size_t num_strs = 1000000;
  size_t shrink_to = GetParam();

  vector<string> strs;
  for (size_t i = 0; i < num_strs; ++i) {
    strs.push_back(random_string(generator_, 10));
    EXPECT_TRUE(ss_->Add(strs.back()));
  }

  // Grow to a larger size
  ss_->Reserve(1 << 22);
  size_t original_bucket_count = ss_->BucketCount();
  EXPECT_EQ(original_bucket_count, 1u << 22);

  // Shrink to the parameterized size
  ss_->Shrink(shrink_to);

  EXPECT_EQ(ss_->BucketCount(), shrink_to);
  EXPECT_EQ(ss_->UpperBoundSize(), num_strs);

  // Verify all elements are still accessible
  for (const auto& str : strs) {
    EXPECT_TRUE(ss_->Contains(str)) << "Missing: " << str;
  }
}

INSTANTIATE_TEST_SUITE_P(ShrinkSizes, ShrinkTest,
                         ::testing::Values(1u << 21,   // 2M buckets (sparse)
                                           1u << 20,   // 1M buckets (~1 per bucket)
                                           1u << 19),  // 512K buckets (~2 per bucket)
                         [](const auto& info) { return absl::StrCat("buckets_", info.param); });

TEST_F(StringSetTest, ShrinkWithTTL) {
  constexpr size_t num_strs = 1000000;

  // Track elements by their TTL category
  vector<string> expired_strs;    // TTL 1-50, will expire
  vector<string> surviving_strs;  // TTL 51-100, will survive
  vector<string> no_ttl_strs;     // No TTL, will survive

  for (size_t i = 0; i < num_strs; ++i) {
    string str = random_string(generator_, 10);
    if (i % 3 == 0) {
      // No TTL
      EXPECT_TRUE(ss_->Add(str));
      no_ttl_strs.push_back(str);
    } else if (i % 3 == 1) {
      // TTL 1-50 (will expire when time=50)
      uint32_t ttl = (i % 50) + 1;
      EXPECT_TRUE(ss_->Add(str, ttl));
      expired_strs.push_back(str);
    } else {
      // TTL 51-100 (will survive when time=50)
      uint32_t ttl = (i % 50) + 51;
      EXPECT_TRUE(ss_->Add(str, ttl));
      surviving_strs.push_back(str);
    }
  }

  // Grow to larger size
  ss_->Reserve(1 << 22);

  // Set time to 50 - this will expire elements with TTL <= 50
  ss_->set_time(50);

  // Shrink
  ss_->Shrink(1 << 21);
  EXPECT_EQ(ss_->BucketCount(), 1u << 21);

  // Verify expired elements are gone
  for (const auto& str : expired_strs) {
    EXPECT_EQ(ss_->Find(str), ss_->end()) << "Should be expired: " << str;
  }

  // Verify surviving TTL elements are still accessible with correct TTL
  for (const auto& str : surviving_strs) {
    auto it = ss_->Find(str);
    ASSERT_NE(it, ss_->end()) << "Missing surviving TTL element: " << str;
    EXPECT_TRUE(it.HasExpiry());
    EXPECT_GT(it.ExpiryTime(), 50u);
  }

  // Verify no-TTL elements are still accessible
  for (const auto& str : no_ttl_strs) {
    auto it = ss_->Find(str);
    ASSERT_NE(it, ss_->end()) << "Missing no-TTL element: " << str;
    EXPECT_FALSE(it.HasExpiry());
  }
}

TEST_F(StringSetTest, ScanWithShrinkBetweenCalls) {
  // Test that cursor-based scanning works correctly when Grow and Shrink happen between Scan calls
  // This verifies SCAN guarantees: elements present at start and end of scan must be seen
  constexpr size_t num_strs = 1000000;
  vector<string> strs;
  unordered_set<string> must_see;

  // Add elements and track them
  for (size_t i = 0; i < num_strs; ++i) {
    strs.push_back(random_string(generator_, 10));
    EXPECT_TRUE(ss_->Add(strs.back()));
    must_see.insert(strs.back());
  }

  // Note initial bucket count (will be ~1M after adding 1M elements)
  size_t initial_bucket_count = ss_->BucketCount();

  unordered_set<string> seen;
  auto scan_callback = [&](const sds ptr) {
    string str{ptr, sdslen(ptr)};
    seen.insert(str);
  };

  // Start scanning BEFORE Grow
  uint32_t cursor = ss_->Scan(0, scan_callback);
  EXPECT_NE(cursor, 0u) << "Should not finish in one iteration";

  // Grow to large size in the middle of scanning
  ss_->Reserve(1 << 22);
  EXPECT_EQ(ss_->BucketCount(), 1u << 22);
  EXPECT_GT(ss_->BucketCount(), initial_bucket_count);

  // Continue scanning a bit after Grow
  cursor = ss_->Scan(cursor, scan_callback);

  // Now Shrink in the middle of scanning - this is the key test
  // Elements that existed at scan start must still be visible
  ss_->Shrink(1 << 21);
  EXPECT_EQ(ss_->BucketCount(), 1u << 21);

  // Continue scanning with the same cursor
  constexpr int max_iterations = 1 << 22;
  int iterations = 0;
  while (cursor != 0 && iterations < max_iterations) {
    cursor = ss_->Scan(cursor, scan_callback);
    iterations++;
  }
  EXPECT_LT(iterations, max_iterations) << "Hit iteration limit";
  EXPECT_EQ(cursor, 0u) << "Scan should complete";

  // Verify all original elements were seen
  for (const auto& str : must_see) {
    EXPECT_TRUE(seen.count(str)) << "Missing element after shrink: " << str;
  }
  EXPECT_EQ(seen.size(), must_see.size()) << "Should see exactly all original elements";
}

}  // namespace dfly


================================================
FILE: src/core/task_queue.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/task_queue.h"

#include <absl/strings/str_cat.h>

#include "base/logging.h"

using namespace std;
using namespace util::fb2;

namespace dfly {

__thread unsigned TaskQueue::blocked_submitters_ = 0;

TaskQueue::TaskQueue(unsigned queue_size, unsigned start_size, unsigned pool_max_size)
    : queue_(queue_size), consumer_fibers_(start_size) {
  CHECK_GT(start_size, 0u);
  CHECK_LE(start_size, pool_max_size);
}

void TaskQueue::Start(std::string_view base_name) {
  for (size_t i = 0; i < consumer_fibers_.size(); ++i) {
    auto& fb = consumer_fibers_[i];
    CHECK(!fb.IsJoinable());

    string name = absl::StrCat(base_name, "/", i);
    fb =
        Fiber(Fiber::Opts{.priority = FiberPriority::HIGH, .name = name}, [this] { queue_.Run(); });
  }
}

void TaskQueue::Shutdown() {
  queue_.Shutdown();
  for (auto& fb : consumer_fibers_)
    fb.JoinIfNeeded();
}

}  // namespace dfly


================================================
FILE: src/core/task_queue.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "util/fibers/fiberqueue_threadpool.h"
#include "util/fibers/fibers.h"

namespace dfly {

/**
 *  MPSC task-queue that is handled by a single consumer thread.
 *  The queue is just a wrapper around FiberQueue that manages its fiber itself.
 */
class TaskQueue {
 public:
  // TODO: to add a mechanism to moderate pool size. Currently it's static with pool_start_size.
  TaskQueue(unsigned queue_size, unsigned pool_start_size, unsigned pool_max_size);

  template <typename F> bool TryAdd(F&& f) {
    return queue_.TryAdd(std::forward<F>(f));
  }

  // Returns true if task queue was blocked when adding the task.
  template <typename F> bool Add(F&& f) {
    if (queue_.TryAdd(std::forward<F>(f)))
      return false;

    ++blocked_submitters_;
    auto res = queue_.Add(std::forward<F>(f));
    --blocked_submitters_;
    return res;
  }

  template <typename F> auto Await(F&& f) -> decltype(f()) {
    util::fb2::Done done;
    using ResultType = decltype(f());
    util::detail::ResultMover<ResultType> mover;

    ++blocked_submitters_;
    Add([&mover, f = std::forward<F>(f), done]() mutable {
      mover.Apply(f);
      done.Notify();
    });
    --blocked_submitters_;
    done.Wait();
    return std::move(mover).get();
  }

  /**
   * @brief Start running consumer loop in the caller thread by spawning fibers.
   *        Returns immediately.
   */
  void Start(std::string_view base_name);

  /**
   * @brief Notifies Run() function to empty the queue and to exit and waits for the consumer
   *        fiber to finish.
   */
  void Shutdown();

  static unsigned blocked_submitters() {
    return blocked_submitters_;
  }

 private:
  util::fb2::FiberQueue queue_;
  std::vector<util::fb2::Fiber> consumer_fibers_;

  static __thread unsigned blocked_submitters_;
};

}  // namespace dfly


================================================
FILE: src/core/tiering_types.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/tiering_types.h"

#include "redis/redis_aux.h"

namespace dfly::tiering {

auto FragmentRef::GetDescr(const CompactValue* pv) -> SerializationDescr {
  switch (pv->ObjType()) {
    case OBJ_STRING: {
      if (!pv->HasAllocated())
        return {};
      auto strs = pv->GetRawString();
      return {strs, CompactObj::ExternalRep::STRING};
    }
    case OBJ_HASH: {
      if (pv->Encoding() == kEncodingListPack) {
        return {static_cast<uint8_t*>(pv->RObjPtr()), CompactObj::ExternalRep::SERIALIZED_MAP};
      }
      return {};
    }
    default:
      return {};
  };
}

TieredCoolRecord* FragmentRef::GetCoolRecord() const {
  return std::visit(
      [](auto* pv) -> TieredCoolRecord* {
        return pv->IsExternal() && pv->IsCool() ? pv->GetCool().record : nullptr;
      },
      val_);
}

}  // namespace dfly::tiering


================================================
FILE: src/core/tiering_types.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <boost/intrusive/list_hook.hpp>

#include "core/compact_object.h"

namespace dfly::tiering {

// TieredCoolRecord is part of the cooling cache. It allows offloading values to disk
// while still keeping some of them in-memory to avoid disk reads in case they are requested again
// soon after offloading. When a value is moved to the cold storage, TieredCoolRecord and only
// the external reference is kept. When the value is warmed up, the record is removed from the cool
// storage and the value is read back to memory.
struct TieredCoolRecord : public ::boost::intrusive::list_base_hook<
                              boost::intrusive::link_mode<boost::intrusive::normal_link>> {
  uint64_t key_hash;  // Allows searching the entry in the dbslice.
  CompactValue value;
  uint16_t db_index;
  uint32_t page_index;
};
static_assert(sizeof(TieredCoolRecord) == 48);

class FragmentRef {
 public:
  // Describes how this fragment should be serialized for offloading.
  // Used by stashing flow.
  struct SerializationDescr {
    std::variant<std::array<std::string_view, 2>, uint8_t*> blob;
    CompactObj::ExternalRep rep = CompactObj::ExternalRep::STRING;
  };

  FragmentRef(CompactValue& pv) : val_(&pv) {  // NOLINT
  }

  FragmentRef(CompactValue* pv) : val_(pv) {  // NOLINT
  }

  bool IsOffloaded() const {
    return std::visit([](auto* pv) { return pv->IsExternal(); }, val_);
  }

  // Resets offloaded state for this fragment.
  void ClearOffloaded() {
    std::visit([](auto* pv) { pv->RemoveExternal(); }, val_);
  }

  bool HasStashPending() const {
    return std::visit([](auto* pv) { return pv->HasStashPending(); }, val_);
  }

  void ClearStashPending() {
    std::visit([](auto* pv) { pv->SetStashPending(false); }, val_);
  }

  CompactObjType ObjType() const {
    return std::visit([](auto* pv) { return pv->ObjType(); }, val_);
  }

  // Determine required byte size and encoding type based on value.
  SerializationDescr GetSerializationDescr() const {
    return std::visit([](auto* pv) { return GetDescr(pv); }, val_);
  }

  // Returns a pointer to TieredCoolRecord if this fragment is cool, and null otherwise.
  TieredCoolRecord* GetCoolRecord() const;

  // Returns the external slice of the offloaded value. Only valid if IsOffloaded() is true.
  std::pair<size_t, size_t> GetExternalSlice() const {
    return std::visit([](auto* pv) { return pv->GetExternalSlice(); }, val_);
  }

 private:
  static SerializationDescr GetDescr(const CompactValue* pv);

  // TODO: to support more types, for example Node* from qlist.h.
  std::variant<CompactValue*> val_;
};

}  // namespace dfly::tiering


================================================
FILE: src/core/top_keys.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/top_keys.h"

#include <xxhash.h>

#include "absl/numeric/bits.h"
#include "absl/random/distributions.h"
#include "base/logging.h"

namespace dfly {

using namespace std;

TopKeys::TopKeys(Options options)
    : options_(options), fingerprints_(options_.buckets * options_.depth) {
  if (options_.min_key_count_to_record < 2) {
    options_.min_key_count_to_record = 2;
  }
}

void TopKeys::Touch(std::string_view key) {
  auto ResetCell = [&](Cell& cell, uint64_t fingerprint) {
    cell.fingerprint = fingerprint;
    cell.count = 1;
    cell.key.clear();
  };

  uint64_t fingerprint = XXH3_64bits(key.data(), key.size());
  constexpr uint64_t kPrime = 0xff51afd7ed558ccd;
  for (uint64_t id = 0; id < options_.depth; ++id) {
    const unsigned bucket = fingerprint % options_.buckets;
    fingerprint *= kPrime;
    Cell& cell = GetCell(id, bucket);
    if (cell.count == 0) {
      // No fingerprint in cell.
      ResetCell(cell, fingerprint);
    } else if (cell.fingerprint == fingerprint) {
      // Same fingerprint, simply increment count.

      // We could make sure that, if !cell.key.empty(), then key == cell.key.empty() here. However,
      // what do we do in case they are different?
      ++cell.count;

      if (cell.count >= options_.min_key_count_to_record && cell.key.empty()) {
        cell.key = key;
      }
    } else {
      // Different fingerprint, apply exponential decay.
      const double rand = absl::Uniform(bitgen_, 0, 1.0);
      if (rand < std::pow(options_.decay_base, -static_cast<double>(cell.count))) {
        --cell.count;
        if (cell.count == 0) {
          ResetCell(cell, fingerprint);
        }
      }
    }
  }
}

absl::flat_hash_map<std::string, uint64_t> TopKeys::GetTopKeys() const {
  absl::flat_hash_map<std::string, uint64_t> results;
  for (unsigned array = 0; array < options_.depth; ++array) {
    for (unsigned bucket = 0; bucket < options_.buckets; ++bucket) {
      const Cell& cell = GetCell(array, bucket);
      if (!cell.key.empty()) {
        auto [it, added] = results.emplace(cell.key, cell.count);
        if (!added && it->second < cell.count) {
          it->second = cell.count;
        }
      }
    }
  }
  return results;
}

TopKeys::Cell& TopKeys::GetCell(uint32_t d, uint32_t bucket) {
  DCHECK(d < options_.depth);
  DCHECK(bucket < options_.buckets);
  return fingerprints_[d * options_.buckets + bucket];
}

const TopKeys::Cell& TopKeys::GetCell(uint32_t d, uint32_t bucket) const {
  DCHECK(d < options_.depth);
  DCHECK(bucket < options_.buckets);
  return fingerprints_[d * options_.buckets + bucket];
}

}  // end of namespace dfly


================================================
FILE: src/core/top_keys.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <string>
#include <string_view>
#include <vector>

#include "base/random.h"

namespace dfly {

// INTERNAL USE ONLY: This class is an optimized, O(1) probabilistic hot-key tracker designed
// specifically to run on the database's hot path (e.g., tracking hot keys using DEBUG TOPK).
// It cannot and should not be used for user-facing Redis TOPK commands. It intentionally
// omits a Min-Heap (preventing instant eviction reporting), does not support arbitrary
// increments, and does not use PMR allocators (which are required for strict memory
// tracking and RDB serialization of user data).
//
// For the public Redis TOPK module API, use the `TOPK` class defined in `core/topk.h`.
//
// TopKeys is a utility class that helps determine the most frequently used keys.
// Based on: HeavyKeeper paper,  https://www.usenix.org/conference/atc18/presentation/gong
//
// Usage:
// - Instantiate this class with proper options (see below)
// - For every used key k, call Touch(k)
// - At some point(s) in time, call GetTopKeys() to get an estimated list of top keys along with
//   their approximate count (i.e. how many times Touch() was invoked for them).
//
// Notes:
// - This class implements a slightly modified version of HeavyKeeper, a data structure designed
//   for a similar problem domain. The modification made is to store the keys directly within the
//   tables, when they meet a certain threshold, instead of using a min-heap.
// - This class is statistical in nature. Do *not* expect accurate counts.
// - When misconfigured, real top keys may be missing from GetTopKeys(). This can occur when there
//   are too few buckets, or when min_key_count_to_record is too high, depending on actual usage.
class TopKeys {
  TopKeys(const TopKeys&) = delete;
  TopKeys& operator=(const TopKeys&) = delete;

 public:
  struct Options {
    // HeavyKeeper options
    uint32_t buckets = 1 << 16;
    uint32_t depth = 4;

    // What is the minimum times Touch() has to be called for a given key in order for the key to be
    // saved. Use lower values when load is low, or higher values when load is high. The cost of a
    // low value for high load is frequent string copying and memory allocation.
    // Min value: 2
    uint32_t min_key_count_to_record = 50;

    double decay_base = 1.08;
  };

  explicit TopKeys(Options options);

  void Touch(std::string_view key);
  absl::flat_hash_map<std::string, uint64_t> GetTopKeys() const;

 private:
  // Each cell consists of a key-fingerprint, a count, and potentially the key itself, when it's
  // above options_.min_key_count_to_record.
  struct Cell {
    uint64_t fingerprint = 0;
    uint64_t count = 0;
    std::string key;
  };
  Cell& GetCell(uint32_t d, uint32_t bucket);
  const Cell& GetCell(uint32_t d, uint32_t bucket) const;

  Options options_;
  base::Xoroshiro128p bitgen_;

  // fingerprints_'s size is options_.buckets * options_.arrays. Always access fields via GetCell().
  std::vector<Cell> fingerprints_;
};

}  // end of namespace dfly


================================================
FILE: src/core/top_keys_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/top_keys.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>

#include "base/gtest.h"
#include "base/logging.h"

using ::testing::Pair;
using ::testing::UnorderedElementsAre;

namespace dfly {

TEST(TopKeysTest, Basic) {
  TopKeys top_keys({.min_key_count_to_record = 2});
  top_keys.Touch("key1");
  top_keys.Touch("key1");
  top_keys.Touch("key2");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 2)));
}

TEST(TopKeysTest, MultiTouch) {
  TopKeys top_keys({.min_key_count_to_record = 2});
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre());
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 2)));
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 3)));
}

TEST(TopKeysTest, MinKeyCountToRecord) {
  TopKeys top_keys({.min_key_count_to_record = 3});
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre());
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre());
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 3)));
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 4)));
  top_keys.Touch("key1");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 5)));
}

TEST(TopKeysTest, MultiKeys) {
  TopKeys top_keys({.min_key_count_to_record = 2});
  for (int i = 0; i < 2; ++i) {
    top_keys.Touch("key1");
    top_keys.Touch("key2");
  }
  top_keys.Touch("key3");
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 2), Pair("key2", 2)));
}

TEST(TopKeysTest, BucketCollision) {
  TopKeys top_keys({.buckets = 1, .min_key_count_to_record = 1});
  for (int i = 0; i < 5; ++i) {
    top_keys.Touch("key1");
  }
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 5)));

  for (int i = 0; i < 100; ++i) {
    top_keys.Touch("key2");
  }

  auto top_keys_table = top_keys.GetTopKeys();
  EXPECT_EQ(top_keys_table.size(), 1);
  EXPECT_LE(top_keys_table["key2"], 100);
  EXPECT_GE(top_keys_table["key2"], 50);

  // Touching "key1" should *not* replace "key2".
  top_keys.Touch("key1");
  EXPECT_FALSE(top_keys.GetTopKeys().contains("key1"));
}

TEST(TopKeysTest, BucketCollisionAggressiveDecay) {
  TopKeys top_keys({.buckets = 1, .min_key_count_to_record = 2, .decay_base = 1.0});
  for (int i = 0; i < 5; ++i) {
    top_keys.Touch("key1");
  }
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 5)));

  for (int i = 0; i < 100; ++i) {
    top_keys.Touch("key2");
  }
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key2", 96)));
}

TEST(TopKeysTest, BucketCollisionHesitantDecay) {
  TopKeys top_keys({.buckets = 1, .min_key_count_to_record = 2, .decay_base = 1000.0});
  for (int i = 0; i < 5; ++i) {
    top_keys.Touch("key1");
  }
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 5)));

  for (int i = 0; i < 100; ++i) {
    top_keys.Touch("key2");
  }
  // "key2" will never replace "key1", as the decay practically never happens (1000^-5)
  EXPECT_THAT(top_keys.GetTopKeys(), UnorderedElementsAre(Pair("key1", 5)));
}

TEST(TopKeysTest, SavedByMultipleArrays) {
  // This test is not trivial. It tests that having multiple arrays inside TopKeys saves keys in
  // case of collision. The way it does it is by inserting an arbitrary key (= "key"), and then (at
  // runtime) finding another key which *does* collide with that key.
  //
  // Once we've found such a key, we create another TopKeys instance, but this time with 10 arrays
  // which should mean that for some hash value, the keys won't be present in the same bucket.

  std::string collision_key;

  TopKeys::Options options(
      {.buckets = 2, .depth = 1, .min_key_count_to_record = 2, .decay_base = 1});
  {
    TopKeys top_keys(options);

    // Insert some key
    top_keys.Touch("key");
    top_keys.Touch("key");

    // Find a key with a collision
    int i = 0;
    while (true) {
      collision_key = absl::StrCat("key", i);
      top_keys.Touch(collision_key);
      if (!top_keys.GetTopKeys().contains(collision_key)) {
        break;
      }
      ++i;
    }
  }

  options.depth = 10;
  {
    TopKeys top_keys(options);

    // Insert some key
    top_keys.Touch("key");
    top_keys.Touch("key");

    // Insert collision key, expect result to be present
    top_keys.Touch(collision_key);
    top_keys.Touch(collision_key);
    EXPECT_THAT(top_keys.GetTopKeys(),
                UnorderedElementsAre(Pair("key", 2), Pair(collision_key, 2)));
  }
}

}  // end of namespace dfly


================================================
FILE: src/core/topk.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/topk.h"

#include <xxhash.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <utility>

#include "absl/random/distributions.h"
#include "base/logging.h"
#include "base/random.h"

namespace dfly {

namespace {

const std::array<double, TOPK::kDecayLookupSize>& GetDefaultDecayTable() {
  static const auto table = [] {
    std::array<double, TOPK::kDecayLookupSize> t{};
    for (size_t i = 0; i < TOPK::kDecayLookupSize; ++i) {
      t[i] = std::pow(TOPK::kDefaultDecay, static_cast<double>(i));
    }
    return t;
  }();
  return table;
}

}  // namespace

TOPK::TOPK(PMR_NS::memory_resource* mr, uint32_t k, uint32_t width, uint32_t depth, double decay)
    : k_(k),
      width_(width),
      depth_(depth),
      decay_(decay),
      counters_(static_cast<size_t>(width) * depth, 0, PMR_NS::polymorphic_allocator<uint32_t>(mr)),
      min_heap_(PMR_NS::polymorphic_allocator<HeapItem>(mr)) {
  DCHECK(mr != nullptr);
  DCHECK_GT(k_, 0u);
  DCHECK_GT(width_, 0u);
  DCHECK_GT(depth_, 0u);
  DCHECK_GE(decay_, 0.0);
  DCHECK_LE(decay_, 1.0);
  min_heap_.reserve(k_);

  if (std::abs(decay_ - TOPK::kDefaultDecay) < TOPK::kDecayEpsilon) {
    // default decay value: use shared static table to save memory and initialization time
    decay_lookup_ = &GetDefaultDecayTable();
  } else {
    // custom decay value: build a dedicated table for this instance
    custom_decay_table_ = std::make_unique<std::array<double, TOPK::kDecayLookupSize>>();
    for (size_t i = 0; i < TOPK::kDecayLookupSize; ++i) {
      (*custom_decay_table_)[i] = std::pow(decay_, static_cast<double>(i));
    }
    decay_lookup_ = custom_decay_table_.get();
  }
}

TOPK::TOPK(TOPK&& other) noexcept
    : k_(std::exchange(other.k_, 0)),
      width_(std::exchange(other.width_, 0)),
      depth_(std::exchange(other.depth_, 0)),
      decay_(std::exchange(other.decay_, 0.0)),
      decay_lookup_(std::exchange(other.decay_lookup_, nullptr)),
      custom_decay_table_(std::move(other.custom_decay_table_)),
      counters_(std::move(other.counters_)),
      min_heap_(std::move(other.min_heap_)) {
}

TOPK& TOPK::operator=(TOPK&& other) noexcept {
  if (this != &other) {
    k_ = std::exchange(other.k_, 0);
    width_ = std::exchange(other.width_, 0);
    depth_ = std::exchange(other.depth_, 0);
    decay_ = std::exchange(other.decay_, 0.0);
    decay_lookup_ = std::exchange(other.decay_lookup_, nullptr);
    custom_decay_table_ = std::move(other.custom_decay_table_);
    counters_ = std::move(other.counters_);
    min_heap_ = std::move(other.min_heap_);
  }
  return *this;
}

uint64_t TOPK::Hash(std::string_view item, uint32_t row) const {
  auto full_hash = XXH3_64bits_withSeed(item.data(), item.size(), row);

  // Lemire's Fast Range Reduction avoids the expensive CPU integer division penalty of the modulo
  // (%) operator. The main principle: multiplication is much faster than division, so we multiply
  // a 32-bit slice of the hash by the width, and then shift right by 32 bits to get the bucket
  // index. See: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
  uint32_t hash32 = static_cast<uint32_t>(full_hash);

  uint64_t bucket = (static_cast<uint64_t>(hash32) * width_) >> 32;
  DCHECK_LT(bucket, width_);
  return bucket;
}

double TOPK::ComputeDecayProbability(uint32_t count) const {
  DCHECK(decay_lookup_);
  DCHECK_GT(count, 0u);
  const auto& table = *decay_lookup_;
  if (count < kDecayLookupSize) {
    return table[count];
  }

  // If the probability is already less than kDecayEpsilon, the chance of decay is
  // statistically zero (see ShouldDecay). Skip the expensive std::pow extrapolation entirely.
  if (table[TOPK::kDecayLookupSize - 1] < TOPK::kDecayEpsilon) {
    return 0.0;
  }

  // Extrapolate probabilities for counts that exceed our lookup table's max index.
  // Let M = the maximum table index (kDecayLookupSize - 1)
  // Let Q = the quotient (count / M)
  // Let R = the remainder (count % M)
  //
  // Using the Laws of Exponents, we break down decay^count:
  // decay^count = decay^((Q * M) + R) = (decay^M)^Q * decay^R
  //
  // This translates directly to reusing our cached table:
  // std::pow(table[M], Q) * table[R]
  uint32_t quotient = count / (TOPK::kDecayLookupSize - 1);
  uint32_t remainder = count % (TOPK::kDecayLookupSize - 1);
  double base = table[TOPK::kDecayLookupSize - 1];
  return std::pow(base, static_cast<double>(quotient)) * table[remainder];
}

bool TOPK::ShouldDecay(uint32_t current_count) const {
  if (current_count == 0)
    return false;

  // Exponential decay probability: decay^count
  thread_local base::Xoroshiro128p bitgen;
  double prob = ComputeDecayProbability(current_count);
  return absl::Uniform(bitgen, 0.0, 1.0) < prob;
}

void TOPK::HeapifyUp(size_t index) {
  DCHECK_LT(index, min_heap_.size());
  // Restores the min-heap property by shifting the element at 'index' upward.
  // Triggered in two cases:
  // 1. Initial insertion: A new item is appended to the array and needs to bubble up.
  // 2. Count decrease: An existing item's count drops (becomes smaller), floating higher.
  while (index > 0) {
    size_t parent = (index - 1) / 2;
    if (min_heap_[parent].count <= min_heap_[index].count) {
      break;  // Heap property satisfied
    }

    // Swap with parent
    std::swap(min_heap_[parent], min_heap_[index]);
    index = parent;
  }
}

void TOPK::HeapifyDown(size_t index) {
  DCHECK_LT(index, min_heap_.size());
  // Restores the min-heap property by shifting the element at 'index' downward.
  // Triggered in two cases:
  // 1. Root replacement/removal: The minimum item is evicted/replaced and the new root must sink.
  // 2. Count increase: An existing item's count grows (becomes heavier), sinking lower.
  size_t size = min_heap_.size();

  while (true) {
    size_t left = (2 * index) + 1;
    size_t right = (2 * index) + 2;
    size_t smallest = index;

    if ((left < size) && (min_heap_[left].count) < (min_heap_[smallest].count)) {
      smallest = left;
    }
    if ((right < size) && (min_heap_[right].count) < (min_heap_[smallest].count)) {
      smallest = right;
    }

    if (smallest == index) {
      break;  // Heap property satisfied
    }

    // Swap with smallest child
    std::swap(min_heap_[smallest], min_heap_[index]);
    index = smallest;
  }
}

size_t TOPK::GetCounterIndex(std::string_view item, uint32_t row) const {
  DCHECK_LT(row, depth_);
  // Note:
  // - bucket is mathematically guaranteed to be in the range [0, width_ - 1]
  // - The max possible idx is depth * width - 1, which is within the bounds of our counters_
  // vector
  uint64_t bucket = Hash(item, row);
  size_t idx = static_cast<size_t>(row) * width_ + bucket;
  DCHECK_LT(idx, counters_.size());
  return idx;
}

uint32_t TOPK::Count(std::string_view item) const {
  uint32_t min_count = std::numeric_limits<uint32_t>::max();

  for (uint32_t row = 0; row < depth_; ++row) {
    size_t idx = GetCounterIndex(item, row);
    min_count = std::min(min_count, counters_[idx]);
  }

  return min_count;
}

std::optional<std::string> TOPK::IncrementInternal(std::string_view item, uint32_t increment) {
  uint32_t min_count = std::numeric_limits<uint32_t>::max();

  // Update counters using HeavyKeeper logic
  for (uint32_t row = 0; row < depth_; ++row) {
    size_t idx = GetCounterIndex(item, row);

    // HeavyKeeper: decay and increment are mutually exclusive.
    // - With probability decay^count, the counter is decremented (colliding items suppress each
    // other).
    // - Otherwise, the counter is incremented for the item being added.
    if ((counters_[idx] > 0) && ShouldDecay(counters_[idx])) {
      --counters_[idx];
    } else {
      counters_[idx] = static_cast<uint32_t>(
          std::min(static_cast<uint64_t>(counters_[idx]) + increment,
                   static_cast<uint64_t>(std::numeric_limits<uint32_t>::max())));
    }

    // Count-Min Sketch property: The minimum counter across all rows is the
    // most accurate, as it has suffered the fewest hash collisions.
    min_count = std::min(min_count, counters_[idx]);
  }

  return UpdateHeap(item, min_count);
}

std::optional<std::string> TOPK::Add(std::string_view item) {
  return IncrementInternal(item, 1);
}

std::optional<std::string> TOPK::IncrBy(std::string_view item, uint32_t increment) {
  if (increment < 1) {
    return std::nullopt;
  }
  return IncrementInternal(item, increment);
}

std::vector<TOPK::TopKItem> TOPK::List() const {
  std::vector<TopKItem> result;
  result.reserve(min_heap_.size());

  for (const auto& heap_item : min_heap_) {
    result.push_back({heap_item.key, heap_item.count});
  }

  // Sort by count (descending) for output
  std::sort(result.begin(), result.end(),
            [](const TopKItem& a, const TopKItem& b) { return a.count > b.count; });

  return result;
}

std::optional<std::string> TOPK::UpdateHeap(std::string_view item, uint32_t new_count) {
  // Fast path: O(K) linear scan.
  // For small K, this avoids hash map overhead. Short keys benefit from SSO
  // (Small String Optimization), keeping memory contiguous and cache-friendly.
  // TODO: Benchmark to find the crossover point where larger K OR long strings (SSO not applicable)
  // justify re-introducing a hash map.
  for (size_t i = 0; i < min_heap_.size(); ++i) {
    if (min_heap_[i].key == item) {
      uint32_t old_count = min_heap_[i].count;
      min_heap_[i].count = new_count;
      if (new_count > old_count) {
        HeapifyDown(i);
      } else if (new_count < old_count) {
        HeapifyUp(i);
      }
      return std::nullopt;
    }
  }

  // Fast reject: item doesn't qualify for the heap. Just exit without any memory allocations or
  // modifications.
  if ((min_heap_.size() >= k_) && (new_count <= min_heap_.front().count)) {
    return std::nullopt;
  }
  DCHECK_LE(min_heap_.size(), k_);

  // Slow path: item will enter the heap. Now allocate.
  std::string item_str(item);

  if (min_heap_.size() < k_) {
    // Heap not full, add the item, no eviction needed
    size_t new_idx = min_heap_.size();
    min_heap_.push_back({std::move(item_str), new_count});
    HeapifyUp(new_idx);
    return std::nullopt;
  }

  // Heap is full, evict minimum and add new item
  DCHECK_EQ(min_heap_.size(), k_);
  std::string old_key = std::move(min_heap_[0].key);
  min_heap_[0] = {std::move(item_str), new_count};
  HeapifyDown(0);
  return old_key;
}

size_t TOPK::MallocUsed() const {
  size_t size = 0;

  // Custom decay table (only for non-default decay values)
  if (custom_decay_table_) {
    size += sizeof(std::array<double, kDecayLookupSize>);
  }

  // Counter array
  size += counters_.capacity() * sizeof(uint32_t);

  // Heap items - calculate actual string sizes
  size += min_heap_.capacity() * sizeof(HeapItem);
  for (const auto& item : min_heap_) {
    size += item.key.capacity();
  }

  return size;
}

TOPK::SerializedData TOPK::Serialize() const {
  SerializedData data;
  data.k = k_;
  data.width = width_;
  data.depth = depth_;
  data.decay = decay_;

  // Serialize heap items
  data.heap_items.reserve(min_heap_.size());
  for (const auto& heap_item : min_heap_) {
    data.heap_items.push_back({heap_item.key, heap_item.count});
  }

  // Serialize counter array
  data.counters.assign(counters_.begin(), counters_.end());

  return data;
}

void TOPK::Deserialize(const SerializedData& data) {
  DCHECK_EQ(data.counters.size(), static_cast<size_t>(width_) * depth_);
  DCHECK_LE(data.heap_items.size(), k_);
  DCHECK_EQ(data.k, k_);
  DCHECK_EQ(data.width, width_);
  DCHECK_EQ(data.depth, depth_);
  DCHECK_EQ(data.decay, decay_);

  // Clear existing data
  min_heap_.clear();

  // Restore counters
  counters_.assign(data.counters.begin(), data.counters.end());

  // Restore heap
  min_heap_.reserve(data.heap_items.size());
  for (const auto& item : data.heap_items) {
    min_heap_.push_back({item.item, item.count});
  }

  // Rebuild heap property
  std::make_heap(min_heap_.begin(), min_heap_.end(), std::greater<HeapItem>());
}

}  // namespace dfly


================================================
FILE: src/core/topk.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <array>
#include <cstdint>
#include <memory>
#include <optional>
#include <queue>
#include <string>
#include <string_view>
#include <vector>

#include "base/pmr/memory_resource.h"

namespace dfly {

class TOPKTest;

//
// TOPK: User-Facing API Data Structure
//
// This class implements the data structure required to support the public Redis
// TOPK module API (e.g., TOPK.RESERVE, TOPK.ADD, TOPK.INCRBY).
//
// WHY WE HAVE TWO TOP-K IMPLEMENTATIONS:
// Dragonfly maintains two separate Top-K tracking structures to protect the
// performance of the database's hot path:
// 1. `TopKeys` (src/core/top_keys.h): An internal-only, hyper-optimized O(1)
//    tracker that runs on every single database command to detect hot keys.
//    It intentionally lacks a min-heap and uses standard memory allocation to
//    maximize raw speed and minimize instruction cache pollution.
// 2. `TOPK` (this file): The user-facing implementation. To comply with the Redis
//    API contract, this class MUST support instant eviction reporting (requiring an
//    O(log K) Min-Heap), arbitrary increments, and PMR allocators for strict
//    memory limit tracking and RDB snapshot serialization.
//
// Forcing the internal tracker to support Min-Heaps and PMR would severely
// degrade overall database throughput, hence the strict separation of concerns.
//
// Algorithm Deviation Note:
// While heavily inspired by the HeavyKeeper algorithm, this is NOT a strict
// implementation. The original HeavyKeeper paper requires storing a
// (fingerprint, count) pair in each cell so that decay only penalizes a specific
// item. This implementation uses a bare `uint32_t` counter grid, making it closer
// to a Count-Min Sketch coupled with a Min-Heap and a decay heuristic. This
// design safely overestimates counts (which is acceptable for Top-K bounds)
// while simplifying PMR memory layout and RDB serialization.
//
// TODO: Full PMR Integration for String Ownership
// Currently, min_heap_ and counters_ use the provided memory_resource, ensuring the
// dominant allocations are tracked. However, the std::string keys inside HeapItem
// use the default heap.
// Future optimization: Upgrade HeapItem to use PMR_NS::string with proper
// uses_allocator construction.
class TOPK {
  friend class TOPKTest;

 public:
  // Initializes a Top-K tracking sketch with the specified dimensions.
  //
  // mr: Pointer to the memory resource used for allocations (MUST NOT be null).
  // k: Maximum number of most frequent items to maintain in the min-heap.
  // width: Number of counter buckets per row in the hash grid (default: 8).
  // depth: Number of independent hash functions (rows) used (default: 7).
  // decay: Probability multiplier for exponential decay (must be 0.0 to 1.0, default: 0.9).
  TOPK(PMR_NS::memory_resource* mr, uint32_t k, uint32_t width = kDefaultWidth,
       uint32_t depth = kDefaultDepth, double decay = kDefaultDecay);

  TOPK(const TOPK&) = delete;
  TOPK& operator=(const TOPK&) = delete;
  TOPK(TOPK&& other) noexcept;
  TOPK& operator=(TOPK&& other) noexcept;
  ~TOPK() = default;

  static constexpr double kDefaultDecay = 0.9;
  static constexpr uint32_t kDefaultWidth = 8;
  static constexpr uint32_t kDefaultDepth = 7;
  static constexpr double kDecayEpsilon = 1e-9;
  // Size is 4097 so that (kDecayLookupSize - 1) equals exactly 4096 (2^12).
  // This allows the C++ compiler to optimize the division and modulo operations
  // in the extrapolation hot-path into very-fast bitwise shifts & ANDs.
  static constexpr size_t kDecayLookupSize = 4097;

  // Represents an item in the Top-K list with its estimated count
  struct TopKItem {
    std::string item;
    uint32_t count;
  };

  // Inserts a single item into the Top-K sketch, incrementing its estimated frequency by 1.
  //
  // Returns: The string of the evicted item if this insertion caused a resident
  //          item to be displaced from the Top-K min-heap, or std::nullopt
  //          if no eviction occurred.
  std::optional<std::string> Add(std::string_view item);

  // Increments an item's estimated frequency by a specific amount.
  //
  // If 'increment' is 0, this operation is a safe no-op and returns std::nullopt.
  // Otherwise, returns the string of the evicted item if this operation caused
  // a resident item to be displaced from the Top-K min-heap, or std::nullopt.
  std::optional<std::string> IncrBy(std::string_view item, uint32_t increment);

  // Queries whether an item currently resides in the Top-K min-heap.
  [[nodiscard]] bool Query(std::string_view item) const {
    return IsInHeap(item);
  }

  // Estimates the frequency count for an item using the underlying sketch.
  // Returns the minimum counter value across all hash rows (Count-Min Sketch estimate).
  [[nodiscard]] uint32_t Count(std::string_view item) const;

  // Retrieves the complete list of current Top-K high-frequency items.
  //
  // Returns: A vector of TopKItem structures (containing the key and its count),
  //          sorted in descending order by estimated frequency (highest first).
  [[nodiscard]] std::vector<TopKItem> List() const;

  // --------------------------------------------------------------------------
  // Accessors for Top-K Configuration Parameters
  // --------------------------------------------------------------------------

  // Returns the maximum capacity (K) of the Top-K min-heap.
  [[nodiscard]] uint32_t K() const {
    return k_;
  }

  // Returns the number of items currently tracked in the Top-K heap.
  [[nodiscard]] size_t Size() const {
    return min_heap_.size();
  }

  // Returns the width (number of columns/buckets) of the Count-Min Sketch array.
  [[nodiscard]] uint32_t Width() const {
    return width_;
  }

  // Returns the depth (number of rows/hash functions) of the Count-Min Sketch array.
  [[nodiscard]] uint32_t Depth() const {
    return depth_;
  }

  // Returns the exponential decay probability base used by the HeavyKeeper algorithm.
  [[nodiscard]] double Decay() const {
    return decay_;
  }

  // Calculates the total heap memory dynamically allocated by this Top-K instance,
  // including sketch counters, min-heap allocations, and hash map overhead.
  //
  // Returns: Total memory usage in bytes.
  [[nodiscard]] size_t MallocUsed() const;

  // --------------------------------------------------------------------------
  // Serialization and Persistence
  // --------------------------------------------------------------------------

  // Pod-like structure to hold the exact internal state of the Top-K instance.
  struct SerializedData {
    uint32_t k;
    uint32_t width;
    uint32_t depth;
    double decay;
    std::vector<TopKItem> heap_items;
    std::vector<uint32_t> counters;
  };

  // Extracts the current structural state of the sketch for RDB persistence.
  [[nodiscard]] SerializedData Serialize() const;

  // Reconstructs the internal state of the sketch from a previously serialized dataset.
  void Deserialize(const SerializedData& data);

 private:
  struct HeapItem {
    std::string key;
    uint32_t count;

    // Min heap comparator
    bool operator>(const HeapItem& other) const {
      return count > other.count;
    }
  };

  // Hash function for bucket selection in row
  [[nodiscard]] uint64_t Hash(std::string_view item, uint32_t row) const;

  // Exponential decay logic
  [[nodiscard]] bool ShouldDecay(uint32_t current_count) const;

  // Updates the min-heap with the new count for the given item.
  // Returns the evicted item's key if the heap is at capacity and a new item displaces an existing
  // one. Otherwise, returns std::nullopt.
  std::optional<std::string> UpdateHeap(std::string_view item, uint32_t new_count);

  // Check if an item is in the Top-K heap
  [[nodiscard]] bool IsInHeap(std::string_view item) const {
    for (const auto& heap_item : min_heap_) {
      if (heap_item.key == item)
        return true;
    }
    return false;
  }

  // Hashes the item for a specific row and calculates its flattened 1D index
  // within the counters_ array. Maps the 2D Count-Min Sketch grid (depth x width)
  // into a single contiguous block of memory for better CPU cache locality.
  size_t GetCounterIndex(std::string_view item, uint32_t row) const;

  // Shared increment logic
  std::optional<std::string> IncrementInternal(std::string_view item, uint32_t increment);

  // Compute decay probability using lookup table or extrapolation
  double ComputeDecayProbability(uint32_t count) const;

  // Heap maintenance functions
  // O(log k) ops
  void HeapifyUp(size_t index);
  void HeapifyDown(size_t index);

  uint32_t k_;      // Number of top items to track
  uint32_t width_;  // Hash table width (buckets per row)
  uint32_t depth_;  // Hash table depth (number of rows)
  double decay_;    // Decay constant (0.0-1.0, typically 0.9)

  // Pointer to the active decay lookup table. For the default decay (0.9), this points to
  // a process-wide shared static table (32KB, allocated once). For custom (non-default) decay
  // values, it points to custom_decay_table_ below. This pattern can help to avoid embedding a 32KB
  // array in every TOPK object.
  // Assumption: >99% of TOPK instances will use the default decay, so
  // this optimization can significantly reduce memory usage and improve startup performance by
  // avoiding the need to build a custom table for each instance.
  const std::array<double, kDecayLookupSize>* decay_lookup_ = nullptr;

  // Heap-allocated table for non-default decay values. Null for the common case (decay=0.9).
  std::unique_ptr<std::array<double, kDecayLookupSize>> custom_decay_table_;

  // HeavyKeeper data structures
  // Hash table: width × depth matrix of counters
  std::vector<uint32_t, PMR_NS::polymorphic_allocator<uint32_t>> counters_;

  // Min heap: vector of top-K items maintained as a min heap
  std::vector<HeapItem, PMR_NS::polymorphic_allocator<HeapItem>> min_heap_;
};

}  // namespace dfly


================================================
FILE: src/core/topk_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/topk.h"

#include <absl/strings/str_cat.h>

#include <cmath>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "base/gtest.h"

namespace dfly {

using namespace std;

class TOPKTest : public ::testing::Test {
 protected:
  // Use decay=0 to disable probabilistic decay, making tests deterministic.
  // With decay=0, ShouldDecay always returns false (0^count = 0 for count>0),
  // so counters only grow and are never decremented by colliding items.
  // Having a decay != 0 will cause probabilistic flakiness in tests, as items may be randomly
  // evicted due to decay rather than true count comparisons.
  TOPKTest() : topk_(PMR_NS::get_default_resource(), 5, 100, 5, 0.0) {
  }

  double ComputeDecayProbability(TOPK* topk, uint32_t count) const {
    return topk->ComputeDecayProbability(count);
  }
  TOPK topk_;
};

// ---------------------------------------------------------------------------
// Construction & Configuration
// ---------------------------------------------------------------------------

// Verify K(), Width(), Depth(), Decay() return the exact values passed to the constructor.
TEST(TOPKBasic, ConstructorStoresParameters) {
  TOPK topk(PMR_NS::get_default_resource(), 10, 200, 7, 0.85);
  EXPECT_EQ(topk.K(), 10u);
  EXPECT_EQ(topk.Width(), 200u);
  EXPECT_EQ(topk.Depth(), 7u);
  EXPECT_DOUBLE_EQ(topk.Decay(), 0.85);
}

// Verify that default decay reuses the static process-wide table (saving memory),
// while a custom decay value allocates its own ~32KB lookup table.
TEST(TOPKBasic, DecayTableMemoryAllocation) {
  TOPK default_topk(PMR_NS::get_default_resource(), 5, 100, 5, TOPK::kDefaultDecay);
  TOPK custom_topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.75);

  size_t default_mem = default_topk.MallocUsed();
  size_t custom_mem = custom_topk.MallocUsed();

  // Test that the custom one uses strictly more memory
  EXPECT_LT(default_mem, custom_mem);

  // Test that the difference in memory is exactly the size of the custom decay array
  size_t expected_table_size = TOPK::kDecayLookupSize * sizeof(double);
  EXPECT_GE(custom_mem - default_mem, expected_table_size);
}

// Move-construct a populated TOPK; source should be emptied and destination should hold the items.
TEST_F(TOPKTest, MoveConstructorTransfersOwnership) {
  topk_.Add("alpha");
  topk_.Add("beta");

  TOPK moved(std::move(topk_));

  EXPECT_EQ(moved.K(), 5u);
  auto list = moved.List();
  EXPECT_FALSE(list.empty());

  // Source is zeroed out.
  EXPECT_EQ(topk_.K(), 0u);
}

// Move-assign a populated TOPK into another; verify same post-conditions as move constructor.
TEST(TOPKBasic, MoveAssignmentTransfersOwnership) {
  TOPK src(PMR_NS::get_default_resource(), 3, 50, 3, 0.0);
  src.Add("x");
  src.Add("y");

  TOPK dst(PMR_NS::get_default_resource(), 1, 10, 1, 0.0);
  dst = std::move(src);

  EXPECT_EQ(dst.K(), 3u);
  EXPECT_EQ(dst.Width(), 50u);
  auto list = dst.List();
  EXPECT_EQ(list.size(), 2u);
  EXPECT_EQ(src.K(), 0u);
}

// ---------------------------------------------------------------------------
// Add
// ---------------------------------------------------------------------------

// Add exactly K distinct items; List() should return exactly K items with no evictions.
TEST_F(TOPKTest, AddFillsHeapUpToK) {
  for (uint32_t i{}; i < topk_.K(); ++i) {
    auto evicted = topk_.Add(absl::StrCat("item", i));
    EXPECT_FALSE(evicted.has_value()) << "Unexpected eviction at i=" << i;
  }
  EXPECT_EQ(topk_.List().size(), topk_.K());
}

// Each Add() while the heap has room returns std::nullopt.
// Note: adding a K+1th item with the same count as the minimum also returns nullopt,
// because the fast-reject path correctly requires new_count > min to trigger an eviction.
TEST_F(TOPKTest, AddReturnsNulloptWhileHeapNotFull) {
  for (uint32_t i{}; i < topk_.K(); ++i) {
    EXPECT_EQ(topk_.Add(absl::StrCat("item", i)), nullopt);
  }
}

// After filling the heap, IncrBy a new item with a large count to force an eviction.
TEST_F(TOPKTest, AddEvictsMinimumWhenHeapFull) {
  // Fill the heap with K items, each added once (count=1).
  for (uint32_t i{}; i < topk_.K(); ++i) {
    topk_.Add(absl::StrCat("filler", i));
  }

  // Force a new item in with a large count; it must evict the minimum.
  auto evicted = topk_.IncrBy("heavy_hitter", 1000);
  EXPECT_TRUE(evicted.has_value());
}

// After filling the heap, adding an item whose count can't exceed the minimum shouldn't evict.
TEST_F(TOPKTest, AddDoesNotEvictWhenNewItemScoreTooLow) {
  // Fill the heap with items pumped to high counts.
  for (uint32_t i{}; i < topk_.K(); ++i) {
    topk_.IncrBy(absl::StrCat("big", i), 1000);
  }

  // Single add of a brand-new item (count=1) won't beat any existing item.
  auto evicted = topk_.Add("tiny_newcomer");
  EXPECT_FALSE(evicted.has_value());
}

// Adding the same item repeatedly increases its count in the heap.
// Because decay=0.0 and there are no collisions, the count must be exactly 100.
TEST_F(TOPKTest, AddSameItemRepeatedlyIncreasesCount) {
  for (int i{}; i < 100; ++i) {
    topk_.Add("repeat");
  }

  auto list = topk_.List();
  bool found = false;
  for (const auto& item : list) {
    if (item.item == "repeat") {
      EXPECT_EQ(item.count, 100u);
      found = true;
    }
  }
  EXPECT_TRUE(found);
}

// ---------------------------------------------------------------------------
// IncrBy
// ---------------------------------------------------------------------------

// IncrBy with increment=0 must return nullopt and not modify state.
TEST_F(TOPKTest, IncrByZeroReturnsNullopt) {
  topk_.Add("existing");
  auto before = topk_.Count("existing");
  auto result = topk_.IncrBy("existing", 0);
  EXPECT_EQ(result, nullopt);
  auto after = topk_.Count("existing");
  EXPECT_EQ(before, after);
}

// IncrBy(item, 1) should behave the same as Add(item) — both increment by 1.
TEST(TOPKBasic, IncrByOneBehavesLikeAdd) {
  TOPK a(PMR_NS::get_default_resource(), 3, 100, 5, 0.0);
  TOPK b(PMR_NS::get_default_resource(), 3, 100, 5, 0.0);

  a.Add("x");
  b.IncrBy("x", 1);

  EXPECT_EQ(a.Count("x"), b.Count("x"));
}

// A single IncrBy with a large increment should immediately promote the item into the heap,
// evicting the current minimum.
TEST_F(TOPKTest, IncrByLargeValueCausesImmediateEviction) {
  for (uint32_t i{}; i < topk_.K(); ++i) {
    topk_.Add(absl::StrCat("base", i));
  }
  auto evicted = topk_.IncrBy("newcomer", 10000);
  EXPECT_TRUE(evicted.has_value());

  EXPECT_TRUE(topk_.Query("newcomer"));
}

// IncrBy on an item already in the heap should increase its count without eviction.
TEST_F(TOPKTest, IncrByExistingHeapItemUpdatesCount) {
  topk_.IncrBy("item_a", 50);
  auto count_before = topk_.Count("item_a");

  auto evicted = topk_.IncrBy("item_a", 100);
  EXPECT_EQ(evicted, nullopt);

  auto count_after = topk_.Count("item_a");
  EXPECT_GT(count_after, count_before);
}

// ---------------------------------------------------------------------------
// Query
// ---------------------------------------------------------------------------

// All K items currently in the heap should return true from Query.
TEST_F(TOPKTest, QueryReturnsTrueForHeapItems) {
  for (uint32_t i{}; i < topk_.K(); ++i) {
    string key = absl::StrCat("key", i);
    topk_.Add(key);
    EXPECT_TRUE(topk_.Query(key)) << key << " should be in heap";
  }
}

// Items that were never inserted should return false from Query.
TEST_F(TOPKTest, QueryReturnsFalseForNonHeapItems) {
  EXPECT_FALSE(topk_.Query("never_seen"));
  EXPECT_FALSE(topk_.Query("also_absent"));
  EXPECT_FALSE(topk_.Query("nope"));
}

// An item that was once in the heap but got evicted should return false from Query.
TEST_F(TOPKTest, QueryReturnsFalseForEvictedItems) {
  // Add our target victim. Count = 1.
  string victim = "low0";
  topk_.Add(victim);

  // Fill the rest of the heap (K=5) with items that are heavier.
  for (uint32_t i{1}; i < topk_.K(); ++i) {
    topk_.IncrBy(absl::StrCat("heavier", i), 50);
  }

  // Verify the victim is currently in the heap.
  EXPECT_TRUE(topk_.Query(victim));

  // Evict by adding a massive item.
  topk_.IncrBy("massive", 10000);

  // Strictly assert that the victim is gone.
  EXPECT_FALSE(topk_.Query(victim));
}

// Mixed: item in heap vs item not in heap.
TEST_F(TOPKTest, QueryMixedBatch) {
  topk_.IncrBy("inheap", 100);
  EXPECT_TRUE(topk_.Query("inheap"));
  EXPECT_FALSE(topk_.Query("notheap"));
}

// ---------------------------------------------------------------------------
// Count
// ---------------------------------------------------------------------------

// Items never inserted should return count 0.
TEST_F(TOPKTest, CountReturnsZeroForUnseen) {
  EXPECT_EQ(topk_.Count("never_added"), 0u);
  EXPECT_EQ(topk_.Count("also_missing"), 0u);
}

// Items that have been added should return a count >= 1.
TEST_F(TOPKTest, CountReturnsNonZeroForSeenItems) {
  topk_.Add("seen");
  EXPECT_GE(topk_.Count("seen"), 1u);
}

// The count from Count() for a heap item should match the count reported in List().
TEST_F(TOPKTest, CountForHeapItemMatchesListCount) {
  topk_.IncrBy("match_me", 50);
  auto count_val = topk_.Count("match_me");
  auto list = topk_.List();

  bool found = false;
  for (const auto& item : list) {
    if (item.item == "match_me") {
      EXPECT_EQ(item.count, count_val);
      found = true;
    }
  }
  EXPECT_TRUE(found);
}

// ---------------------------------------------------------------------------
// List
// ---------------------------------------------------------------------------

// List() returns an empty vector on a freshly constructed TOPK.
TEST(TOPKBasic, ListEmptyOnConstruction) {
  TOPK fresh(PMR_NS::get_default_resource(), 5, 100, 5, 0.0);
  EXPECT_TRUE(fresh.List().empty());
}

// List() output is sorted in descending order by count.
TEST_F(TOPKTest, ListReturnsSortedByCountDescending) {
  topk_.IncrBy("low", 10);
  topk_.IncrBy("mid", 50);
  topk_.IncrBy("high", 100);

  auto list = topk_.List();

  // 1. Guarantee the items actually returned
  ASSERT_EQ(list.size(), 3u);

  // 2. Exact match the deterministic order
  EXPECT_EQ(list[0].item, "high");
  EXPECT_EQ(list[0].count, 100u);

  EXPECT_EQ(list[1].item, "mid");
  EXPECT_EQ(list[1].count, 50u);

  EXPECT_EQ(list[2].item, "low");
  EXPECT_EQ(list[2].count, 10u);
}

// After inserting more than K distinct items, List().size() == K.
TEST_F(TOPKTest, ListNeverExceedsKItems) {
  for (int i{}; i < 100; ++i) {
    topk_.IncrBy(absl::StrCat("x", i), (i + 1) * 10);
  }
  // We inserted 100 items. The heap MUST be exactly full.
  EXPECT_EQ(topk_.List().size(), topk_.K());
}

// ---------------------------------------------------------------------------
// Decay & ComputeDecayProbability
// ---------------------------------------------------------------------------

// For count < kDecayLookupSize, ComputeDecayProbability equals std::pow(decay, count).
TEST_F(TOPKTest, ProbabilityBelowTableSize) {
  double decay_val = 0.85;
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, decay_val);

  // ComputeDecayProbability enforces DCHECK_GT(count, 0u), so we start at 1.
  for (uint32_t count = 1; count < TOPK::kDecayLookupSize; ++count) {
    double expected = std::pow(decay_val, static_cast<double>(count));

    // EXPECT_DOUBLE_EQ allows up to 4 ULPs of rounding difference.
    EXPECT_DOUBLE_EQ(ComputeDecayProbability(&topk, count), expected);
  }
}

// For count >= kDecayLookupSize, the extrapolation path should not crash or produce NaN.
TEST(TOPKBasic, ProbabilityAboveTableSizeNoCrash) {
  TOPK topk(PMR_NS::get_default_resource(), 3, 10, 3, 0.999);

  // Push counter safely above kDecayLookupSize (4097)
  topk.IncrBy("big", 5000);

  // 2. NOW call Add. This forces ShouldDecay(5000) to execute!
  // It shouldn't crash, segfault, or produce NaN.
  for (int i = 0; i < 10; ++i) {
    topk.Add("big");
  }

  // Just verify the state isn't corrupted (count is still around 5000)
  EXPECT_GT(topk.Count("big"), 4000u);
}

// For an extremely large count with a small decay, probability drops to effectively zero.
// This means ShouldDecay always returns false for very high counts, so counters aren't decremented.
TEST(TOPKBasic, VeryHighCountApproachesZero) {
  // decay=0.5: 0.5^4096 is astronomically small (< kDecayEpsilon). The extrapolation
  // path should return 0.0, meaning no decay fires for counts above the table range.
  TOPK topk(PMR_NS::get_default_resource(), 3, 10, 3, 0.5);
  topk.IncrBy("stable", 10000);
  auto count_before = topk.Count("stable");
  // Adding more items should not decay "stable"'s counter because the decay
  // probability for such high counts is effectively zero.
  for (int i{}; i < 100; ++i) {
    topk.Add(absl::StrCat("other", i));
  }
  auto count_after = topk.Count("stable");
  // Count may increase from hash collisions but should never decrease.
  EXPECT_GE(count_after, count_before);
}

// With decay=0.0, the decay probability is always 0 (0^n = 0 for n>0),
// so counters should grow monotonically.
TEST(TOPKBasic, ZeroDecayNeverDecays) {
  TOPK topk(PMR_NS::get_default_resource(), 3, 50, 3, 0.0);
  topk.IncrBy("mono", 100);
  auto count1 = topk.Count("mono");
  topk.IncrBy("mono", 50);
  auto count2 = topk.Count("mono");
  EXPECT_GE(count2, count1);
  EXPECT_EQ(count2, 150u);
}

// With decay=1.0, every non-zero counter has ShouldDecay probability exactly 1.0 (1^n = 1).
// Because this implementation uses no fingerprints (unlike the original HeavyKeeper paper),
// decay fires even when re-adding the same item to its own non-zero counter.
// The counter therefore oscillates: 0 → 1 (add to zero-counter) → 0 (decay fires) → repeat.
// It is mathematically impossible for the counter to exceed 1.
TEST(TOPKBasic, DecayOneAlwaysDecays) {
  TOPK topk(PMR_NS::get_default_resource(), 3, 10, 3, 1.0);

  for (int i{}; i < 1000; ++i) {
    topk.Add("suppressed");
  }

  // Because decay is 100%, the counter just oscillates between 0 and 1.
  // It is mathematically impossible for it to exceed 1.
  EXPECT_LE(topk.Count("suppressed"), 1u);
}

// ---------------------------------------------------------------------------
// MallocUsed
// ---------------------------------------------------------------------------

// MallocUsed() after filling the heap should be larger than right after construction.
TEST(TOPKBasic, MallocUsedIncreaseWithHeapGrowth) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.0);
  size_t before = topk.MallocUsed();
  for (int i{}; i < 5; ++i) {
    topk.IncrBy(absl::StrCat("item_with_a_long_name_", i), 100);
  }
  size_t after = topk.MallocUsed();
  EXPECT_GT(after, before);
}

// ---------------------------------------------------------------------------
// Serialize / Deserialize
// ---------------------------------------------------------------------------

// After Serialize() + Deserialize(), K(), Width(), Depth(), Decay() are unchanged.
TEST_F(TOPKTest, SerializeRoundTripPreservesConfiguration) {
  topk_.IncrBy("a", 10);
  auto data = topk_.Serialize();

  TOPK restored(PMR_NS::get_default_resource(), data.k, data.width, data.depth, data.decay);
  restored.Deserialize(data);

  EXPECT_EQ(restored.K(), topk_.K());
  EXPECT_EQ(restored.Width(), topk_.Width());
  EXPECT_EQ(restored.Depth(), topk_.Depth());
  EXPECT_DOUBLE_EQ(restored.Decay(), topk_.Decay());
}

// After round-trip, List() returns the same items with the same counts.
TEST_F(TOPKTest, SerializeRoundTripPreservesHeapItems) {
  topk_.IncrBy("alpha", 100);
  topk_.IncrBy("beta", 50);
  topk_.IncrBy("gamma", 25);

  auto data = topk_.Serialize();
  TOPK restored(PMR_NS::get_default_resource(), data.k, data.width, data.depth, data.decay);
  restored.Deserialize(data);

  auto orig_list = topk_.List();
  auto rest_list = restored.List();
  ASSERT_EQ(orig_list.size(), rest_list.size());
  for (size_t i{}; i < orig_list.size(); ++i) {
    EXPECT_EQ(orig_list[i].item, rest_list[i].item);
    EXPECT_EQ(orig_list[i].count, rest_list[i].count);
  }
}

// After round-trip, Count() returns the same estimated frequencies.
TEST_F(TOPKTest, SerializeRoundTripPreservesCounters) {
  topk_.IncrBy("foo", 42);
  topk_.IncrBy("bar", 77);

  auto data = topk_.Serialize();
  TOPK restored(PMR_NS::get_default_resource(), data.k, data.width, data.depth, data.decay);
  restored.Deserialize(data);

  EXPECT_EQ(topk_.Count("foo"), restored.Count("foo"));
  EXPECT_EQ(topk_.Count("bar"), restored.Count("bar"));
}

// After Deserialize(), subsequent Add() calls work correctly and evictions are reported.
TEST_F(TOPKTest, DeserializeRebuildsValidHeapProperty) {
  for (uint32_t i{}; i < topk_.K(); ++i) {
    topk_.IncrBy(absl::StrCat("pre", i), 10);
  }

  auto data = topk_.Serialize();
  TOPK restored(PMR_NS::get_default_resource(), data.k, data.width, data.depth, data.decay);
  restored.Deserialize(data);

  // The restored heap is full (K items). A heavy new item should evict the minimum.
  auto evicted = restored.IncrBy("post_restore_big", 10000);
  EXPECT_TRUE(evicted.has_value());
  EXPECT_TRUE(restored.Query("post_restore_big"));
}

// Serializing a fresh TOPK produces empty heap_items and a zero-filled counters vector.
TEST(TOPKBasic, SerializeEmptyTOPK) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.0);
  auto data = topk.Serialize();

  EXPECT_TRUE(data.heap_items.empty());
  EXPECT_EQ(data.counters.size(), 100u * 5);
  for (auto c : data.counters) {
    EXPECT_EQ(c, 0u);
  }
}

// ---------------------------------------------------------------------------
// PMR Allocator
// ---------------------------------------------------------------------------

// Explicitly passing get_default_resource() works correctly without crashing.
TEST(TOPKBasic, PMRExplicitDefaultResourceWorks) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.9);
  topk.Add("works");
  EXPECT_EQ(topk.List().size(), 1u);
}

// ---------------------------------------------------------------------------
// Statistical / Accuracy
// ---------------------------------------------------------------------------

// Verify that the Top-K correctly identifies "Hot" items even when
// the sketch is flooded with "Cold" noise (many items seen only once).
//
// SETUP:
// 1. We disable Decay (decay=0.0) to make the test 100% predictable (no RNG).
// 2. We use IncrBy to give 5 "Hot" items a guaranteed high score of 1000.
// 3. We use Add to insert 200 "Cold" items once each (score of 1).
//
// WHY INCRBY?
// In a real-world scenario with decay, an item's count eventually hits a
// "ceiling" where decay and growth balance out. By using IncrBy and decay=0,
// we bypass that math to ensure our "Hot" items are strictly,
// deterministically larger than the noise.
TEST(TOPKBasic, TopKItemsIdentifiedUnderHeavyLoad) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 500, 5, 0.0);
  // Hot items get a large, deterministic count via IncrBy.
  for (int h{}; h < 5; ++h) {
    topk.IncrBy(absl::StrCat("hot", h), 1000);
  }
  // Cold items are each seen only once.
  for (int c{}; c < 200; ++c) {
    topk.Add(absl::StrCat("cold", c));
  }

  auto list = topk.List();
  ASSERT_EQ(list.size(), 5u);
  // All 5 hot items should be present in the top-K list.
  for (int h{}; h < 5; ++h) {
    string hot_key = absl::StrCat("hot", h);
    bool found{};
    for (const auto& item : list) {
      if (item.item == hot_key) {
        found = true;
        break;
      }
    }
    EXPECT_TRUE(found) << hot_key << " should be in the top-K list";
  }
}

// With k=1, only the single most-frequent item survives in the heap.
// Uses decay=0.0 and IncrBy so "dominant" has a deterministically high count
// that minor items (each added once, count=1) can never exceed.
TEST(TOPKBasic, KEqualsOneTracksOnlyTopItem) {
  TOPK topk(PMR_NS::get_default_resource(), 1, 500, 5, 0.0);

  // "dominant" gets a large, fixed count.
  topk.IncrBy("dominant", 1000);
  // Minor items are each seen only once; count=1 < 1000, so none can displace dominant.
  for (int i{}; i < 50; ++i) {
    topk.Add(absl::StrCat("minor", i));
  }

  auto list = topk.List();
  ASSERT_EQ(list.size(), 1u);
  EXPECT_EQ(list[0].item, "dominant");
}

// ---------------------------------------------------------------------------
// Deserialization Heap Repair
// ---------------------------------------------------------------------------

// Deserialize() must call std::make_heap to restore the min-heap invariant even when
// heap_items are stored out-of-order in the RDB snapshot (e.g. saved in List() order).
TEST(TOPKBasic, DeserializeRestoresHeapProperty) {
  TOPK::SerializedData data;
  data.k = 5;
  data.width = 100;
  data.depth = 5;
  data.decay = 0.0;
  data.counters.resize(500, 0);

  // Items deliberately out of min-heap order: smallest must end up at the root.
  data.heap_items.push_back({"heavy", 1000});
  data.heap_items.push_back({"medium", 500});
  data.heap_items.push_back({"light", 10});

  TOPK restored(PMR_NS::get_default_resource(), 5, 100, 5, 0.0);
  restored.Deserialize(data);

  // List() sorts descending — correct only if make_heap built a valid heap.
  auto list = restored.List();
  ASSERT_EQ(list.size(), 3u);
  EXPECT_EQ(list[0].item, "heavy");
  EXPECT_EQ(list[1].item, "medium");
  EXPECT_EQ(list[2].item, "light");

  // Heap is not yet full (3 of 5 slots used), so fill it to capacity.
  restored.IncrBy("filler1", 20);
  restored.IncrBy("filler2", 30);

  // Now heap is full (5 items: light=10, filler1=20, filler2=30, medium=500, heavy=1000).
  // A new item with count > 10 must evict "light" — the min-heap root.
  auto evicted = restored.IncrBy("newcomer", 50);
  ASSERT_TRUE(evicted.has_value());
  EXPECT_EQ(evicted.value(), "light");
}

// ---------------------------------------------------------------------------
// Counter Saturation (Overflow Prevention)
// ---------------------------------------------------------------------------

// IncrBy must saturate at UINT32_MAX rather than wrapping around to 0.
// A wrap-around would trick the heap into evicting a top item — a correctness
// and security issue (malicious TOPK.INCRBY with a huge increment).
TEST_F(TOPKTest, CounterSaturationPreventsOverflow) {
  const uint32_t max_val = numeric_limits<uint32_t>::max();
  topk_.IncrBy("max_item", max_val);
  EXPECT_EQ(topk_.Count("max_item"), max_val);

  // Adding more must not wrap the counter back to a small number.
  topk_.IncrBy("max_item", 100);
  EXPECT_EQ(topk_.Count("max_item"), max_val);
}

// ---------------------------------------------------------------------------
// Death Tests (DCHECKs active in debug builds only)
// ---------------------------------------------------------------------------

#ifndef NDEBUG
// k=0 violates DCHECK_GT(k_, 0u) in the constructor.
TEST(TOPKDeathTest, ZeroKCrashes) {
  EXPECT_DEBUG_DEATH(TOPK(PMR_NS::get_default_resource(), 0, 100, 5, 0.9), "k_ > 0");
}

// width=0 violates DCHECK_GT(width_, 0u) in the constructor.
TEST(TOPKDeathTest, ZeroWidthCrashes) {
  EXPECT_DEBUG_DEATH(TOPK(PMR_NS::get_default_resource(), 5, 0, 5, 0.9), "width_ > 0");
}

// decay=1.5 violates DCHECK_LE(decay_, 1.0) in the constructor.
TEST(TOPKDeathTest, DecayAboveOneCrashes) {
  EXPECT_DEBUG_DEATH(TOPK(PMR_NS::get_default_resource(), 5, 100, 5, 1.5), "decay_ <= 1.0");
}

// Deserializing data with a mismatched k violates DCHECK_EQ(data.k, k_).
TEST(TOPKDeathTest, DeserializeDimensionMismatchCrashes) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.9);
  TOPK::SerializedData bad;
  bad.k = 10;  // Mismatch: object was constructed with k=5.
  bad.width = 100;
  bad.depth = 5;
  bad.decay = 0.9;
  bad.counters.resize(500, 0);
  EXPECT_DEBUG_DEATH(topk.Deserialize(bad), "data.k == k_");
}

// Deserializing data with a mismatched decay violates DCHECK_EQ(data.decay, decay_).
TEST(TOPKDeathTest, DeserializeDecayMismatchCrashes) {
  TOPK topk(PMR_NS::get_default_resource(), 5, 100, 5, 0.9);
  TOPK::SerializedData bad;
  bad.k = 5;
  bad.width = 100;
  bad.depth = 5;
  bad.decay = 0.5;  // Mismatch: object was constructed with decay=0.9.
  bad.counters.resize(500, 0);
  EXPECT_DEBUG_DEATH(topk.Deserialize(bad), "data.decay == decay_");
}
#endif

}  // namespace dfly


================================================
FILE: src/core/tx_queue.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/tx_queue.h"

#include "base/logging.h"

namespace dfly {

TxQueue::TxQueue(std::function<uint64_t(const Transaction*)> sf)
    : score_fun_(sf), vec_(32) {
  for (size_t i = 0; i < vec_.size(); ++i) {
    vec_[i].next = i + 1;
  }
}

auto TxQueue::Insert(Transaction* t) -> Iterator {
  if (next_free_ >= vec_.size()) {
    Grow();
  }
  DCHECK_LT(next_free_, vec_.size());
  DCHECK_EQ(FREE_TAG, vec_[next_free_].tag);

  Iterator res = next_free_;
  vec_[next_free_].u.trans = t;
  vec_[next_free_].tag = TRANS_TAG;
  DVLOG(1) << "Insert " << next_free_ << " " << t;
  LinkFree(score_fun_(t));
  return res;
}

auto TxQueue::Insert(uint64_t val) -> Iterator {
  if (next_free_ >= vec_.size()) {
    Grow();
  }
  DCHECK_LT(next_free_, vec_.size());

  Iterator res = next_free_;

  vec_[next_free_].u.uval = val;
  vec_[next_free_].tag = UINT_TAG;

  LinkFree(val);
  return res;
}

void TxQueue::LinkFree(uint64_t weight) {
  uint32_t taken = next_free_;
  next_free_ = vec_[taken].next;

  if (size_ == 0) {
    head_ = taken;
    vec_[head_].next = vec_[head_].prev = head_;
  } else {
    uint32_t cur = vec_[head_].prev;
    while (true) {
      if (Rank(vec_[cur]) < weight) {
        Link(cur, taken);
        break;
      }
      if (cur == head_) {
        Link(vec_[head_].prev, taken);
        head_ = taken;
        break;
      }
      cur = vec_[cur].prev;
    }
  }
  ++size_;
}

void TxQueue::Grow() {
  size_t start = vec_.size();
  DVLOG(1) << "Grow from " << start << " to " << start * 2;

  vec_.resize(start * 2);
  for (size_t i = start; i < vec_.size(); ++i) {
    vec_[i].next = i + 1;
  }
}

void TxQueue::Remove(Iterator it) {
  DCHECK_GT(size_, 0u);
  DCHECK_LT(it, vec_.size());
  DCHECK_NE(FREE_TAG, vec_[it].tag);

  DVLOG(1) << "Remove " << it << " " << vec_[it].u.trans;
  Iterator next = kEnd;
  if (size_ > 1) {
    Iterator prev = vec_[it].prev;
    next = vec_[it].next;

    vec_[prev].next = next;
    vec_[next].prev = prev;
  }
  --size_;
  vec_[it].next = next_free_;
  vec_[it].tag = FREE_TAG;
  next_free_ = it;
  if (head_ == it) {
    head_ = next;
  }
}

uint64_t TxQueue::Rank(const QRecord& r) const {
  switch (r.tag) {
    case UINT_TAG:
      return r.u.uval;
    case TRANS_TAG:
      return score_fun_(r.u.trans);
  }
  return 0;
}

}  // namespace dfly


================================================
FILE: src/core/tx_queue.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <cstdint>
#include <functional>
#include <variant>
#include <vector>

namespace dfly {

class Transaction;

// TxQueue implemmented as a circular doubly-linked list.
class TxQueue {
  void Link(uint32_t p, uint32_t n) {
    uint32_t next = vec_[p].next;
    vec_[n].next = next;
    vec_[n].prev = p;
    vec_[p].next = n;
    vec_[next].prev = n;
  }

 public:
  // uint64_t is used for unit-tests.
  using ValueType = std::variant<Transaction*, uint64_t>;
  using Iterator = uint32_t;
  enum { kEnd = Iterator(-1) };

  TxQueue(std::function<uint64_t(const Transaction*)> score_fun = nullptr);

  // returns iterator to that item the list
  Iterator Insert(Transaction* t);

  Iterator Insert(uint64_t val);
  void Remove(Iterator);

  ValueType At(Iterator it) const {
    switch (vec_[it].tag) {
      case TRANS_TAG:
        return vec_[it].u.trans;
      case UINT_TAG:
        return vec_[it].u.uval;
    }
    return 0u;
  }

  ValueType Front() const {
    return At(head_);
  }

  void PopFront() {
    Remove(head_);
  }

  size_t size() const {
    return size_;
  }

  bool Empty() const {
    return size_ == 0;
  }

  //! returns the score of the tail record. Can be called only if !Empty().
  uint64_t TailScore() const {
    return Rank(vec_[vec_[head_].prev]);
  }

  //! returns the score of the head record. Can be called only if !Empty().
  uint64_t HeadScore() const {
    return Rank(vec_[head_]);
  }

  //! Can be called only if !Empty().
  Iterator Head() const {
    return head_;
  }

  // Returns the next iterator, it's circular so it always returns a valid
  // iterator. Can be called only if !Empty().
  Iterator Next(Iterator it) const {
    return vec_[it].next;
  }

 private:
  enum { TRANS_TAG = 0, UINT_TAG = 11, FREE_TAG = 12 };

  void Grow();
  void LinkFree(uint64_t rank);

  struct QRecord {
    union {
      Transaction* trans;
      uint64_t uval;
    } u;

    uint32_t tag : 8;
    uint32_t next : 24;
    uint32_t prev;

    QRecord() : tag(FREE_TAG), prev(kEnd) {
    }
  };

  static_assert(sizeof(QRecord) == 16, "");

  uint64_t Rank(const QRecord& r) const;

  std::function<uint64_t(const Transaction*)> score_fun_;
  std::vector<QRecord> vec_;
  uint32_t next_free_ = 0, head_ = kEnd;
  size_t size_ = 0;

  TxQueue(const TxQueue&) = delete;
};

}  // namespace dfly


================================================
FILE: src/core/zstd_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/base/macros.h>
#include <gmock/gmock.h>
#include <zstd.h>

#include <random>

#include "base/logging.h"

namespace dfly {

using namespace std;

constexpr unsigned kLevel = 1;

class ZStdTest : public ::testing::Test {
 protected:
  string Compress(const string& src, const ZSTD_CDict* cdict) {
    ZSTD_CCtx* cctx = ZSTD_createCCtx();
    size_t c_buffer_size = ZSTD_compressBound(src.size());
    string res(c_buffer_size, '\0');
    size_t compressed_size =
        ZSTD_compress_usingCDict(cctx, res.data(), c_buffer_size, src.c_str(), src.size(), cdict);

    ZSTD_freeCCtx(cctx);
    res.resize(compressed_size);
    return res;
  }

  string Decompress(const string& src, const ZSTD_DDict* ddict, size_t decompressed_size) {
    string res(decompressed_size, '\0');
    ZSTD_DCtx* dctx = ZSTD_createDCtx();
    size_t decompressed_size_actual = ZSTD_decompress_usingDDict(
        dctx, res.data(), decompressed_size, src.c_str(), src.size(), ddict);
    CHECK_EQ(decompressed_size, decompressed_size_actual);
    ZSTD_freeDCtx(dctx);
    return res;
  }

  string CompressNoDict(const string& src) {
    ZSTD_CCtx* cctx = ZSTD_createCCtx();
    size_t c_buffer_size = ZSTD_compressBound(src.size());
    string res(c_buffer_size, '\0');
    size_t compressed_size =
        ZSTD_compressCCtx(cctx, res.data(), c_buffer_size, src.c_str(), src.size(), kLevel);
    ZSTD_freeCCtx(cctx);
    res.resize(compressed_size);
    return res;
  }
};

// Dictionary works well for small messages where we do not have enough data to reference
// previous stream to have significant savings.
// For large messages, it may not be less beneficial.
TEST_F(ZStdTest, Dict) {
  const char* kRandomPieces[] = {"ABCD", "EFGH", "IJKL", "MNOP", "QRST", "UVWX", "YZAB", "CDEF"};
  string dict_source;
  random_device rd;

  for (unsigned i = 0; i < 1000; ++i) {
    dict_source += kRandomPieces[rd() % ABSL_ARRAYSIZE(kRandomPieces)];
  }
  LOG(INFO) << "Creating CDICT from " << dict_source.size() << " bytes of random data";
  ZSTD_CDict* cdict = ZSTD_createCDict(dict_source.data(), dict_source.size(), 7);
  ASSERT_TRUE(cdict);
  size_t actual_dict_size = ZSTD_sizeof_CDict(cdict);
  LOG(INFO) << "ZSTD_CDict created, size: " << actual_dict_size << " bytes";

  ZSTD_DDict* ddict = ZSTD_createDDict(dict_source.data(), dict_source.size());
  ASSERT_TRUE(ddict);
  size_t actual_ddict_size = ZSTD_sizeof_DDict(ddict);
  LOG(INFO) << "ZSTD_DDict created, size: " << actual_ddict_size << " bytes";

  // 3. Data to compress
  std::string data_to_compress;
  for (unsigned j = 0; j < 30; ++j) {
    data_to_compress += kRandomPieces[rd() % ABSL_ARRAYSIZE(kRandomPieces)];
  }
  size_t data_to_compress_size = data_to_compress.size();

  // 4. Compress data
  string compressed = Compress(data_to_compress, cdict);

  LOG(INFO) << "Compressed data size: " << compressed.size() << " bytes vs "
            << data_to_compress_size << " bytes of original data";

  string compress_no_dict = CompressNoDict(data_to_compress);
  LOG(INFO) << "Compressed data size without dict: " << compress_no_dict.size() << " bytes";

  // 5. Decompress data
  string decompressed = Decompress(compressed, ddict, data_to_compress_size);
  ASSERT_EQ(data_to_compress, decompressed);

  // 7. Free memory
  ZSTD_freeCDict(cdict);
  ZSTD_freeDDict(ddict);
}

}  // namespace dfly

================================================
FILE: src/external_libs.cmake
================================================
add_third_party(
  lua
  GIT_REPOSITORY https://github.com/dragonflydb/lua
  GIT_TAG Dragonfly-5.4.6a
  CONFIGURE_COMMAND echo
  BUILD_IN_SOURCE 1
  BUILD_COMMAND ${DFLY_TOOLS_MAKE} all
  INSTALL_COMMAND cp <SOURCE_DIR>/liblua.a ${THIRD_PARTY_LIB_DIR}/lua/lib/
  COMMAND cp <SOURCE_DIR>/lualib.h <SOURCE_DIR>/lua.h <SOURCE_DIR>/lauxlib.h
          <SOURCE_DIR>/luaconf.h ${THIRD_PARTY_LIB_DIR}/lua/include
)


if (APPLE OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
  set(SED_REPL sed "-i" '')
else()
  set(SED_REPL sed "-i")
endif()

add_third_party(
  dconv
  GIT_REPOSITORY https://github.com/google/double-conversion
  # URL https://github.com/google/double-conversion/archive/refs/tags/v3.3.1.tar.gz
  GIT_TAG 0604b4c
  PATCH_COMMAND ${SED_REPL} "/static const std::ctype/d"
                <SOURCE_DIR>/double-conversion/string-to-double.cc
  COMMAND ${SED_REPL} "/std::use_facet</d" <SOURCE_DIR>/double-conversion/string-to-double.cc
  COMMAND ${SED_REPL} "s/cType.tolower/std::tolower/g" <SOURCE_DIR>/double-conversion/string-to-double.cc
  LIB libdouble-conversion.a
)

add_third_party(
  reflex
  URL https://github.com/Genivia/RE-flex/archive/refs/tags/v5.2.2.tar.gz
  PATCH_COMMAND autoreconf -fi
  CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx2 --prefix=${THIRD_PARTY_LIB_DIR}/reflex
          CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER}
)

set(REFLEX "${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex")

add_third_party(
  jsoncons
  GIT_REPOSITORY https://github.com/dragonflydb/jsoncons
  GIT_TAG Dragonfly1.5.0
  GIT_SHALLOW 1
  CMAKE_PASS_FLAGS "-DJSONCONS_BUILD_TESTS=OFF -DJSONCONS_HAS_POLYMORPHIC_ALLOCATOR=ON"
  LIB "none"
)

add_third_party(
  lz4
  URL https://github.com/lz4/lz4/archive/refs/tags/v1.10.0.tar.gz

  BUILD_IN_SOURCE 1
  CONFIGURE_COMMAND echo skip
  BUILD_COMMAND ${DFLY_TOOLS_MAKE} lib-release
  INSTALL_COMMAND ${DFLY_TOOLS_MAKE} install BUILD_SHARED=no PREFIX=${THIRD_PARTY_LIB_DIR}/lz4
)

set(MIMALLOC_ROOT_DIR ${THIRD_PARTY_LIB_DIR}/mimalloc2)
set(MIMALLOC_INCLUDE_DIR ${MIMALLOC_ROOT_DIR}/include)
set(MIMALLOC_PATCH_DIR ${CMAKE_CURRENT_LIST_DIR}/../patches/mimalloc-v2.2.4)
set(MIMALLOC_C_FLAGS "-O3 -g -DMI_STAT=1 -DNDEBUG")
file(MAKE_DIRECTORY ${MIMALLOC_INCLUDE_DIR})

ExternalProject_Add(mimalloc2_project
  URL https://github.com/microsoft/mimalloc/archive/refs/tags/v2.2.4.tar.gz
  DOWNLOAD_DIR ${THIRD_PARTY_DIR}/mimalloc2
  SOURCE_DIR ${THIRD_PARTY_DIR}/mimalloc2
  # INSTALL_DIR ${MIMALLOC_ROOT_DIR}
  UPDATE_COMMAND ""

  PATCH_COMMAND
      patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/0_base.patch
      COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/1_add_stat_type.patch
      COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/2_return_stat.patch
      COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/3_track_full_size.patch
      COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/4_fix_heap_collect.patch
  BUILD_COMMAND make mimalloc-static

  INSTALL_COMMAND make install
  # Copy internal types like mi_page_usage_stats_s and mi_heap_s
  COMMAND cp -r <SOURCE_DIR>/include/mimalloc ${MIMALLOC_INCLUDE_DIR}/

  LOG_INSTALL ON
  LOG_DOWNLOAD ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_PATCH ON
  LOG_UPDATE ON
  DOWNLOAD_EXTRACT_TIMESTAMP YES

  CMAKE_GENERATOR "Unix Makefiles"

  # Add -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-O0 to debug, and set BUILD_BYPRODUCTS to
  # libmimalloc-debug.a

  BUILD_BYPRODUCTS ${MIMALLOC_ROOT_DIR}/lib/libmimalloc.a

  CMAKE_ARGS -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY:PATH=${MIMALLOC_ROOT_DIR}/lib
        -DCMAKE_LIBRARY_OUTPUT_DIRECTORY:PATH=${MIMALLOC_ROOT_DIR}/lib
        -DCMAKE_BUILD_TYPE:STRING=Release
        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
        -DMI_INSTALL_TOPLEVEL=ON
        -DMI_OVERRIDE=OFF
        -DMI_NO_PADDING=ON
        -DMI_BUILD_TESTS=OFF
        -DMI_BUILD_SHARED=OFF
        -DMI_BUILD_OBJECT=OFF
        -DCMAKE_C_FLAGS=${MIMALLOC_C_FLAGS}
        -DCMAKE_INSTALL_PREFIX:PATH=${MIMALLOC_ROOT_DIR}
)

add_library(TRDP::mimalloc2 STATIC IMPORTED)
add_dependencies(TRDP::mimalloc2 mimalloc2_project)
set_target_properties(TRDP::mimalloc2 PROPERTIES IMPORTED_LOCATION ${MIMALLOC_ROOT_DIR}/lib/libmimalloc.a
                      INTERFACE_INCLUDE_DIRECTORIES ${MIMALLOC_ROOT_DIR}/include)

add_third_party(
  croncpp
  URL https://github.com/mariusbancila/croncpp/archive/refs/tags/v2023.03.30.tar.gz
  LIB "none"
)

if (WITH_SEARCH)
  add_third_party(
    uni-algo
    URL https://github.com/uni-algo/uni-algo/archive/refs/tags/v1.0.0.tar.gz

    CMAKE_PASS_FLAGS "-DCMAKE_CXX_STANDARD:STRING=20"
  )

  add_third_party(
    hnswlib
    GIT_REPOSITORY https://github.com/dragonflydb/hnswlib.git
    # HEAD of dragonfly branch
    GIT_TAG d07dd1da2bf48b85d2f03b8396193ad7120f75c2

    BUILD_COMMAND echo SKIP
    INSTALL_COMMAND cp -R <SOURCE_DIR>/hnswlib ${THIRD_PARTY_LIB_DIR}/hnswlib/include/
    LIB "none"
  )
endif()

add_third_party(
  fast_float
  URL https://github.com/fastfloat/fast_float/archive/refs/tags/v5.2.0.tar.gz
  LIB "none"
)

add_third_party(
  flatbuffers
  URL https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.tar.gz
  CMAKE_PASS_FLAGS "-DFLATBUFFERS_BUILD_TESTS=OFF -DFLATBUFFERS_LIBCXX_WITH_CLANG=OFF
                    -DFLATBUFFERS_BUILD_FLATC=OFF"
)

add_third_party(
  hdr_histogram
  GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c/
  GIT_TAG 652d51bcc36744fd1a6debfeb1a8a5f58b14022c
  CMAKE_PASS_FLAGS "-DHDR_LOG_REQUIRED=OFF -DHDR_HISTOGRAM_BUILD_PROGRAMS=OFF
                    -DHDR_HISTOGRAM_INSTALL_SHARED=OFF"
  LIB libhdr_histogram_static.a
)

if(WITH_SIMSIMD)
  # Compute integer macros for native half-precision support.
  set(SIMSIMD_NATIVE_F16_VAL 0)
  set(SIMSIMD_NATIVE_BF16_VAL 0)
  if(SIMSIMD_NATIVE_F16)
    set(SIMSIMD_NATIVE_F16_VAL 1)
    set(SIMSIMD_NATIVE_BF16_VAL 1)
  endif()

  # Build statically via add_third_party using the C shim with dynamic dispatch.
  add_third_party(
    simsimd
    URL https://github.com/ashvardanian/SimSIMD/archive/refs/tags/v6.5.3.tar.gz
    BUILD_IN_SOURCE 1
    CONFIGURE_COMMAND echo skip
    BUILD_COMMAND bash -c "\
      mkdir -p ${THIRD_PARTY_LIB_DIR}/simsimd/lib && \
      ${CMAKE_C_COMPILER} -O3 -fPIC -DNDEBUG \
        -DSIMSIMD_DYNAMIC_DISPATCH=1 \
        -DSIMSIMD_NATIVE_F16=${SIMSIMD_NATIVE_F16_VAL} \
        -DSIMSIMD_NATIVE_BF16=${SIMSIMD_NATIVE_BF16_VAL} \
        -I<SOURCE_DIR>/include -c <SOURCE_DIR>/c/lib.c -o <SOURCE_DIR>/lib.o && \
      ar rcs <SOURCE_DIR>/libsimsimd.a <SOURCE_DIR>/lib.o"
    INSTALL_COMMAND bash -c "\
      mkdir -p ${THIRD_PARTY_LIB_DIR}/simsimd/include ${THIRD_PARTY_LIB_DIR}/simsimd/lib && \
      cp -R <SOURCE_DIR>/include/* ${THIRD_PARTY_LIB_DIR}/simsimd/include/ && \
      cp <SOURCE_DIR>/libsimsimd.a ${THIRD_PARTY_LIB_DIR}/simsimd/lib/"
    LIB libsimsimd.a
  )
endif()


add_library(TRDP::jsoncons INTERFACE IMPORTED)
add_dependencies(TRDP::jsoncons jsoncons_project)
set_target_properties(TRDP::jsoncons PROPERTIES
                      INTERFACE_INCLUDE_DIRECTORIES "${JSONCONS_INCLUDE_DIR}")

add_library(TRDP::croncpp INTERFACE IMPORTED)
add_dependencies(TRDP::croncpp croncpp_project)
set_target_properties(TRDP::croncpp PROPERTIES
                      INTERFACE_INCLUDE_DIRECTORIES "${CRONCPP_INCLUDE_DIR}")

if (WITH_SEARCH)
  add_library(TRDP::hnswlib INTERFACE IMPORTED)
  add_dependencies(TRDP::hnswlib hnswlib_project)
  set_target_properties(TRDP::hnswlib PROPERTIES
                        INTERFACE_INCLUDE_DIRECTORIES "${HNSWLIB_INCLUDE_DIR}")
endif()

add_library(TRDP::fast_float INTERFACE IMPORTED)
add_dependencies(TRDP::fast_float fast_float_project)
set_target_properties(TRDP::fast_float PROPERTIES
                      INTERFACE_INCLUDE_DIRECTORIES "${FAST_FLOAT_INCLUDE_DIR}")


================================================
FILE: src/facade/CMakeLists.txt
================================================
add_library(dfly_parser_lib redis_parser.cc resp_expr.cc resp_parser.cc
            resp_srv_parser.cc)
cxx_link(dfly_parser_lib base strings_lib redis_lib)

add_library(dfly_facade dragonfly_listener.cc dragonfly_connection.cc facade.cc
            memcache_parser.cc reply_builder.cc op_status.cc parsed_command.cc service_interface.cc
            reply_capture.cc cmd_arg_parser.cc tls_helpers.cc socket_utils.cc disk_backed_queue.cc)

if (DF_USE_SSL)
  set(TLS_LIB tls_lib)
  target_compile_definitions(dfly_facade PRIVATE DFLY_USE_SSL)
endif()

cxx_link(dfly_facade dfly_parser_lib http_server_lib fibers2
         ${TLS_LIB} TRDP::mimalloc2 TRDP::dconv redis_lib)

add_library(facade_test facade_test.cc resp_expr_test_utils.cc)
cxx_link(facade_test dfly_facade gtest_main_ext)

helio_cxx_test(memcache_parser_test dfly_facade LABELS DFLY)
helio_cxx_test(redis_parser_test facade_test LABELS DFLY)
helio_cxx_test(resp_srv_parser_test facade_test LABELS DFLY)
helio_cxx_test(reply_builder_test facade_test LABELS DFLY)
helio_cxx_test(resp_parser_test facade_test LABELS DFLY)
helio_cxx_test(cmd_arg_parser_test facade_test LABELS DFLY)
helio_cxx_test(disk_backed_queue_test facade_test LABELS DFLY)

add_executable(ok_backend ok_main.cc)
cxx_link(ok_backend dfly_facade)

add_executable(resp_validator resp_validator.cc)
cxx_link(resp_validator dfly_parser_lib)


================================================
FILE: src/facade/README.md
================================================
## A facade library

The library is responsible for opening dragonfly-like TCP client connections.
I call it facade because "client" term is often abused.

It should be separated from the rest of dragonfly server logic and should be self-contained, i.e
no redis-lib or server dependencies are allowed.


================================================
FILE: src/facade/cmd_arg_parser.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/cmd_arg_parser.h"

#include <absl/strings/ascii.h>

#include "base/logging.h"
#include "facade/error.h"

namespace facade {

void CmdArgParser::ExpectTag(std::string_view tag) {
  if (cur_i_ >= args_.size()) {
    Report(OUT_OF_BOUNDS, cur_i_);
    return;
  }

  auto idx = cur_i_++;
  auto val = ToSV(args_[idx]);
  if (!absl::EqualsIgnoreCase(val, tag)) {
    Report(INVALID_NEXT, idx);
  }
}

CmdArgParser::ErrorInfo CmdArgParser::TakeError() {
  return std::exchange(error_, {});
}

ErrorReply CmdArgParser::ErrorInfo::MakeReply() const {
  DCHECK(operator bool());
  switch (type) {
    case INVALID_INT:
      return ErrorReply{kInvalidIntErr};
    case INVALID_FLOAT:
      return ErrorReply{kInvalidFloatErr};
    default:
      return ErrorReply{kSyntaxErr};
  };
  return ErrorReply{kSyntaxErr};
}

CmdArgParser::~CmdArgParser() {
  DCHECK(!error_) << "Parsing error occured but not checked";
  // TODO DCHECK(!HasNext()) << "Not all args were processed";
}

}  // namespace facade


================================================
FILE: src/facade/cmd_arg_parser.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/strings/match.h>
#include <absl/strings/numbers.h>

#include <optional>
#include <string_view>
#include <utility>

#include "facade/facade_types.h"

namespace facade {

// Helper class for numerical range restriction during parsing
template <auto min, auto max> struct FInt {
  decltype(min) value = {};
  operator decltype(min)() {
    return value;
  }

  static_assert(std::is_same_v<decltype(min), decltype(max)>, "inconsistent types");
  static constexpr auto kMin = min;
  static constexpr auto kMax = max;
};

template <class T> constexpr bool is_fint = false;

template <auto min, auto max> constexpr bool is_fint<FInt<min, max>> = true;

// Utility class for easily parsing command options from argument lists.
struct CmdArgParser {
  enum ErrorType {
    NO_ERROR,
    OUT_OF_BOUNDS,
    SHORT_OPT_TAIL,
    INVALID_INT,
    INVALID_FLOAT,
    INVALID_CASES,
    INVALID_NEXT,
    UNPROCESSED,
    CUSTOM_ERROR  // should be the last one
  };

  struct ErrorInfo {
    int type = NO_ERROR;
    size_t index = 0;

    operator bool() const {
      return type != ErrorType::NO_ERROR;
    }
    ErrorReply MakeReply() const;
  };

 public:
  CmdArgParser(ArgSlice args) : args_{args} {
  }

  // Debug asserts sure error was consumed
  ~CmdArgParser();

  // Get next value without consuming it
  std::string_view Peek() {
    return SafeSV(cur_i_);
  }

  // Consume next value
  template <class T = std::string_view, class... Ts> auto Next() {
    if (cur_i_ + sizeof...(Ts) >= args_.size()) {
      Report(OUT_OF_BOUNDS, cur_i_);
      return std::conditional_t<sizeof...(Ts) == 0, T, std::tuple<T, Ts...>>();
    }

    if constexpr (sizeof...(Ts) == 0) {
      auto idx = cur_i_++;
      return Convert<T>(idx);
    } else {
      std::tuple<T, Ts...> res;
      NextImpl<0>(&res);
      cur_i_ += sizeof...(Ts) + 1;
      return res;
    }
  }

  // returns next value if exists or default value
  template <class T = std::string_view> auto NextOrDefault(T default_value = {}) {
    return HasNext() ? Next<T>() : default_value;
  }

  // check next value ignoring case and consume it
  void ExpectTag(std::string_view tag);

  // Consume next value
  template <class... Cases> auto MapNext(Cases&&... cases) {
    if (cur_i_ >= args_.size()) {
      Report(OUT_OF_BOUNDS, cur_i_);
      return typename decltype(MapImpl(std::string_view(),
                                       std::forward<Cases>(cases)...))::value_type{};
    }

    auto idx = cur_i_++;
    auto res = MapImpl(SafeSV(idx), std::forward<Cases>(cases)...);
    if (!res) {
      Report(INVALID_CASES, idx);
      return typename decltype(res)::value_type{};
    }
    return *res;
  }

  // Consume next value if can map it and return mapped result or return nullopt
  template <class... Cases>
  auto TryMapNext(Cases&&... cases)
      -> std::optional<std::tuple_element_t<1, std::tuple<Cases...>>> {
    if (cur_i_ >= args_.size()) {
      return std::nullopt;
    }

    auto res = MapImpl(SafeSV(cur_i_), std::forward<Cases>(cases)...);
    cur_i_ = res ? cur_i_ + 1 : cur_i_;
    return res;
  }

  // Check if the next value is equal to a specific tag. If equal, its consumed.
  template <class... Args> bool Check(std::string_view tag, Args*... args) {
    if (cur_i_ + sizeof...(Args) >= args_.size())
      return false;

    std::string_view arg = SafeSV(cur_i_);
    if (!absl::EqualsIgnoreCase(arg, tag))
      return false;

    ((*args = Convert<Args>(++cur_i_)), ...);

    ++cur_i_;

    return true;
  }

  // Skip specified number of arguments
  CmdArgParser& Skip(size_t n) {
    if (cur_i_ + n > args_.size()) {
      Report(OUT_OF_BOUNDS, cur_i_);
    } else {
      cur_i_ += n;
    }
    return *this;
  }

  // Expect no more arguments and return if no error has occured
  bool Finalize() {
    if (HasNext()) {
      Report(UNPROCESSED, cur_i_);
      return false;
    }
    return !HasError();
  }

  // Return remaining arguments
  ArgSlice Tail() const {
    return args_.subspan(cur_i_);
  }

  // Return true if arguments are left and no errors occured
  bool HasNext() {
    return cur_i_ < args_.size() && !error_;
  }

  bool HasError() const {
    return bool(error_);
  }

  ErrorInfo TakeError();

  bool HasAtLeast(size_t i) const {
    return cur_i_ + i <= args_.size() && !error_;
  }

  size_t GetCurrentIndex() const {
    return cur_i_;
  }

  // Custom error_type should start from CUSTOM_ERROR
  void Report(int error_type) {
    // we use previous index, because the check was done outside and it's done after element is
    // processed
    Report(error_type, cur_i_ - 1);
  }

 private:
  void Report(int error_type, size_t idx) {
    if (!error_) {
      error_ = {error_type, idx};
      cur_i_ = args_.size();
    }
  }

  template <class T, class... Cases>
  std::optional<std::decay_t<T>> MapImpl(std::string_view arg, std::string_view tag, T&& value,
                                         Cases&&... cases) {
    if (absl::EqualsIgnoreCase(arg, tag))
      return std::forward<T>(value);

    if constexpr (sizeof...(cases) > 0)
      return MapImpl(arg, cases...);

    return std::nullopt;
  }

  template <size_t shift, class Tuple> void NextImpl(Tuple* t) {
    std::get<shift>(*t) = Convert<std::tuple_element_t<shift, Tuple>>(cur_i_ + shift);
    if constexpr (constexpr auto next = shift + 1; next < std::tuple_size_v<Tuple>)
      NextImpl<next>(t);
  }

  template <class T> T Convert(size_t idx) {
    static_assert(
        std::is_arithmetic_v<T> || std::is_constructible_v<T, std::string_view> || is_fint<T>,
        "incorrect type");
    if constexpr (std::is_arithmetic_v<T>) {
      return Num<T>(idx);
    } else if constexpr (std::is_constructible_v<T, std::string_view>) {
      return static_cast<T>(SafeSV(idx));
    } else if constexpr (is_fint<T>) {
      return {ConvertFInt<T::kMin, T::kMax>(idx)};
    }
  }

  template <auto min, auto max> FInt<min, max> ConvertFInt(size_t idx) {
    auto res = Num<decltype(min)>(idx);
    if (res < min || res > max) {
      Report(INVALID_INT, idx);
      return {};
    }
    return {res};
  }

  std::string_view SafeSV(size_t i) const {
    using namespace std::literals::string_view_literals;
    if (i >= args_.size())
      return ""sv;
    return args_[i].empty() ? ""sv : ToSV(args_[i]);
  }

  template <typename T> T Num(size_t idx) {
    auto arg = SafeSV(idx);
    T out;
    if constexpr (std::is_same_v<T, float>) {
      if (absl::SimpleAtof(arg, &out))
        return out;
    } else if constexpr (std::is_same_v<T, double>) {
      if (absl::SimpleAtod(arg, &out))
        return out;
    } else if constexpr (std::is_integral_v<T> && sizeof(T) >= sizeof(int32_t)) {
      if (absl::SimpleAtoi(arg, &out))
        return out;
    } else if constexpr (std::is_integral_v<T> && sizeof(T) < sizeof(int32_t)) {
      int32_t tmp;
      if (absl::SimpleAtoi(arg, &tmp)) {
        out = tmp;  // out can not store the whole tmp
        if (tmp == out)
          return out;
      }
    }

    if constexpr (std::is_floating_point_v<T>) {
      Report(INVALID_FLOAT, idx);
    } else {
      Report(INVALID_INT, idx);
    }
    return {};
  }

 private:
  size_t cur_i_ = 0;
  ArgSlice args_;

  ErrorInfo error_;
};

}  // namespace facade


================================================
FILE: src/facade/cmd_arg_parser_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/cmd_arg_parser.h"

#include <absl/base/casts.h>
#include <gmock/gmock.h>

#include "facade/memcache_parser.h"

using namespace testing;
using namespace std;

namespace facade {

class CmdArgParserTest : public testing::Test {
 public:
  CmdArgParser Make(absl::Span<const std::string_view> args) {
    storage_.assign(args.begin(), args.end());
    arg_vec_.clear();
    for (auto& s : storage_)
      arg_vec_.push_back(MutableSlice{s.data(), s.size()});
    return CmdArgParser{absl::MakeSpan(arg_vec_)};
  }

 private:
  CmdArgVec arg_vec_;
  std::vector<std::string> storage_;
};

TEST_F(CmdArgParserTest, BasicTypes) {
  auto parser = Make({"STRING", "VIEW", "11", "22", "33", "44"});

  EXPECT_TRUE(parser.HasNext());

  EXPECT_EQ(parser.Next<string>(), "STRING"s);
  EXPECT_EQ(parser.Next<string_view>(), "VIEW"sv);

  EXPECT_EQ(parser.Next<size_t>(), 11u);
  EXPECT_EQ(parser.Next<size_t>(), 22u);
  auto [a, b] = parser.Next<size_t, size_t>();
  EXPECT_EQ(a, 33u);
  EXPECT_EQ(b, 44u);

  EXPECT_FALSE(parser.HasNext());
  EXPECT_FALSE(parser.HasError());
}

TEST_F(CmdArgParserTest, BoundError) {
  auto parser = Make({});

  EXPECT_EQ(absl::implicit_cast<string_view>(parser.Next()), ""sv);

  auto err = parser.TakeError();
  EXPECT_TRUE(err);
  EXPECT_EQ(err.type, CmdArgParser::OUT_OF_BOUNDS);
  EXPECT_EQ(err.index, 0);
}

#ifndef __APPLE__
TEST_F(CmdArgParserTest, IntError) {
  auto parser = Make({"NOTANINT"});

  EXPECT_EQ(parser.Next<size_t>(), 0u);

  auto err = parser.TakeError();
  EXPECT_TRUE(err);
  EXPECT_EQ(err.type, CmdArgParser::INVALID_INT);
  EXPECT_EQ(err.index, 0);
}
#endif

TEST_F(CmdArgParserTest, Check) {
  auto parser = Make({"TAG", "TAG_2", "22"});

  EXPECT_FALSE(parser.Check("NOT_TAG"));
  EXPECT_TRUE(parser.Check("TAG"));

  EXPECT_FALSE(parser.Check("NOT_TAG_2"));
  EXPECT_TRUE(parser.Check("TAG_2"));
  EXPECT_EQ(parser.Next<int>(), 22);
}

TEST_F(CmdArgParserTest, NextStatement) {
  auto parser = Make({"TAG", "tag_2", "tag_3"});

  parser.ExpectTag("TAG");
  EXPECT_FALSE(parser.TakeError());

  parser.ExpectTag("TAG_2");
  EXPECT_FALSE(parser.TakeError());

  parser.ExpectTag("TAG_2");
  EXPECT_TRUE(parser.TakeError());
}

TEST_F(CmdArgParserTest, CheckTailFail) {
  auto parser = Make({"TAG", "11", "22", "TAG", "text"});

  int first;
  string_view second;
  EXPECT_TRUE(parser.Check("TAG", &first, &second));
  EXPECT_EQ(first, 11);
  EXPECT_EQ(second, "22");

  EXPECT_FALSE(parser.Check("TAG", &first, &second));
  EXPECT_TRUE(parser.Check("TAG", &first));
  EXPECT_TRUE(parser.TakeError());
}

TEST_F(CmdArgParserTest, Map) {
  auto parser = Make({"TWO", "NONE"});

  EXPECT_EQ(parser.MapNext("ONE", 1, "TWO", 2), 2);

  EXPECT_EQ(parser.MapNext("ONE", 1, "TWO", 2), 0);
  auto err = parser.TakeError();
  EXPECT_TRUE(err);
  EXPECT_EQ(err.type, CmdArgParser::INVALID_CASES);
  EXPECT_EQ(err.index, 1);
}

TEST_F(CmdArgParserTest, TryMapNext) {
  auto parser = Make({"TWO", "GREEN"});

  EXPECT_EQ(parser.TryMapNext("ONE", 1, "TWO", 2), std::make_optional(2));

  EXPECT_EQ(parser.TryMapNext("ONE", 1, "TWO", 2), std::nullopt);
  EXPECT_FALSE(parser.HasError());
  EXPECT_EQ(parser.TryMapNext("green", 1, "yellow", 2), std::make_optional(1));
  EXPECT_FALSE(parser.HasError());
}

TEST_F(CmdArgParserTest, IgnoreCase) {
  auto parser = Make({"hello", "marker", "taail", "world"});

  EXPECT_EQ(absl::implicit_cast<string_view>(parser.Next()), "hello"sv);

  EXPECT_TRUE(parser.Check("MARKER"sv));
  parser.Skip(1);

  EXPECT_EQ(absl::implicit_cast<string_view>(parser.Next()), "world"sv);
}

TEST_F(CmdArgParserTest, FixedRangeInt) {
  {
    auto parser = Make({"10", "-10", "12"});

    EXPECT_EQ((parser.Next<FInt<-11, 11>>().value), 10);
    EXPECT_EQ((parser.Next<FInt<-11, 11>>().value), -10);
    EXPECT_EQ((parser.Next<FInt<-11, 11>>().value), 0);

    auto err = parser.TakeError();
    EXPECT_TRUE(err);
    EXPECT_EQ(err.type, CmdArgParser::INVALID_INT);
    EXPECT_EQ(err.index, 2);
  }

  {
    auto parser = Make({"-12"});
    EXPECT_EQ((parser.Next<FInt<-11, 11>>().value), 0);

    auto err = parser.TakeError();
    EXPECT_TRUE(err);
    EXPECT_EQ(err.type, CmdArgParser::INVALID_INT);
    EXPECT_EQ(err.index, 0);
  }
}

}  // namespace facade


================================================
FILE: src/facade/command_id.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <string>
#include <string_view>

namespace facade {

class CommandId {
 public:
  /**
   * @brief Construct a new Command Id object
   *
   * When creating a new command use the https://github.com/redis/redis/tree/unstable/src/commands
   * files to find the right arguments.
   *
   * @param name
   * @param mask
   * @param arity -     positive if command has fixed number of required arguments including
   *                    the command, negative if command has minimum number of required arguments,
   *                    but may have more.
   * @param first_key - position of first key in argument list
   * @param last_key  - position of last key in argument list,
   *                    -1 means the last key index is (arg_length - 1), -2 means that the last key
   * index is (arg_length - 2).
   * @param acl_categories - bitfield for acl categories of the command
   */
  CommandId(const char* name, uint32_t mask, int8_t arity, int8_t first_key, int8_t last_key,
            uint32_t acl_categories);

  std::string_view name() const {
    return name_;
  }

  int arity() const {
    return arity_;
  }

  uint32_t opt_mask() const {
    return opt_mask_;
  }

  int8_t first_key_pos() const {
    return first_key_;
  }

  int8_t last_key_pos() const {
    return last_key_;
  }

  uint32_t acl_categories() const {
    return acl_categories_;
  }

  void SetFamily(size_t fam) {
    family_ = fam;
  }

  void SetBitIndex(uint64_t bit) {
    bit_index_ = bit;
  }

  size_t GetFamily() const {
    return family_;
  }

  uint64_t GetBitIndex() const {
    return bit_index_;
  }

  // Returns true if the command can only be used by admin connections, false
  // otherwise.
  bool IsRestricted() const {
    return restricted_;
  }

  void SetRestricted(bool restricted) {
    restricted_ = restricted;
  }

  void SetFlag(uint32_t flag) {
    opt_mask_ |= flag;
  }

 protected:
  std::string name_;

  uint32_t opt_mask_;
  int8_t arity_;
  int8_t first_key_;
  int8_t last_key_;

  // Acl categories
  uint32_t acl_categories_;
  // Acl commands indices
  size_t family_;
  uint64_t bit_index_;

  // Whether the command can only be used by admin connections.
  bool restricted_ = false;
};

}  // namespace facade


================================================
FILE: src/facade/conn_context.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_set.h>

#include <string_view>

namespace facade {

class Connection;

class ConnectionContext {
 public:
  explicit ConnectionContext(Connection* owner) : owner_(owner) {
    conn_closing = false;
    req_auth = false;
    replica_conn = false;
    authenticated = false;
    async_dispatch = false;
    sync_dispatch = false;
    paused = false;
    blocked = false;

    subscriptions = 0;
  }

  virtual ~ConnectionContext() {
  }

  Connection* conn() {
    return owner_;
  }

  const Connection* conn() const {
    return owner_;
  }

  virtual size_t UsedMemory() const {
    return 0;
  }

  // Noop.
  virtual void Unsubscribe(std::string_view channel) {
  }

  // connection state / properties.
  bool conn_closing : 1;
  bool req_auth : 1;
  bool replica_conn : 1;  // whether it's a replica connection on the master side.
  bool authenticated : 1;
  bool async_dispatch : 1;  // whether this connection is amid an async dispatch
  bool sync_dispatch : 1;   // whether this connection is amid a sync dispatch

  bool paused = false;  // whether this connection is paused due to CLIENT PAUSE
  // whether it's blocked on blocking commands like BLPOP, needs to be addressable
  bool blocked = false;

  // How many async subscription sources are active: monitor and/or pubsub - at most 2.
  uint8_t subscriptions;

 private:
  Connection* owner_;
};

}  // namespace facade


================================================
FILE: src/facade/connection_ref.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <memory>

namespace facade {

class Connection;

// Weak reference to a connection, invalidated upon connection close.
// Used to dispatch async operations for the connection without worrying about pointer lifetime.
struct ConnectionRef {
 public:
  // Get residing thread of connection. Thread-safe.
  unsigned LastKnownThreadId() const {
    return last_known_thread_id_;
  }
  // Get pointer to connection if still valid, nullptr if expired.
  // Can only be called from connection's thread. Validity is guaranteed
  // only until the next suspension point.
  Connection* Get() const;

  // Returns true if the reference expired. Thread-safe.
  bool IsExpired() const;

  // Returns client id.Thread-safe.
  uint32_t GetClientId() const;

  bool operator<(const ConnectionRef& other) const;
  bool operator==(const ConnectionRef& other) const;

 private:
  friend class Connection;

  ConnectionRef(const std::shared_ptr<Connection>& ptr, unsigned thread_id, uint32_t client_id);

  std::weak_ptr<Connection> ptr_;
  unsigned last_known_thread_id_;
  uint32_t client_id_;
};

}  // namespace facade


================================================
FILE: src/facade/disk_backed_queue.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
//
// See LICENSE for licensing terms.
//

#include "facade/disk_backed_queue.h"

#include <absl/strings/str_cat.h>
#include <fcntl.h>

#include <cerrno>
#include <cstring>
#include <string>

#include "base/flags.h"
#include "base/logging.h"
#include "facade/facade_types.h"
#include "io/io.h"
#include "util/fibers/uring_file.h"
#include "util/fibers/uring_proactor.h"

using facade::operator""_MB;

ABSL_FLAG(std::string, disk_backpressure_folder, "/tmp/",
          "Folder to store disk-backed connection backpressure");

ABSL_FLAG(size_t, disk_backpressure_file_max_bytes, 50_MB,
          "Maximum size of the backing file. When max size is reached, connection will "
          "stop offloading backpressure to disk and block on client read.");

namespace facade {

DiskBackedQueue::DiskBackedQueue(uint32_t conn_id)
    : max_backing_size_(absl::GetFlag(FLAGS_disk_backpressure_file_max_bytes)), id_(conn_id) {
}

std::error_code DiskBackedQueue::Init() {
  std::string backing_name = absl::StrCat(absl::GetFlag(FLAGS_disk_backpressure_folder), id_);
  // Open a single O_RDWR file so the same fd serves writes, reads, and fallocate punch holes.
  // Kernel transparently handles buffering via the page cache.
  auto res = util::fb2::OpenLinux(backing_name, O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, 0600);
  if (!res) {
    return res.error();
  }
  file_ = std::move(*res);

  VLOG(3) << "Created backing for connection " << this << " " << backing_name;

  return {};
}

DiskBackedQueue::~DiskBackedQueue() {
  DCHECK_EQ(in_flight_callbacks_, 0ul);
}

std::error_code DiskBackedQueue::Close() {
  if (file_) {
    auto ec = file_->Close();
    LOG_IF(WARNING, ec) << ec.message();

    std::string backing = absl::StrCat(absl::GetFlag(FLAGS_disk_backpressure_folder), id_);
    int errc = unlink(backing.c_str());
    LOG_IF(ERROR, errc != 0) << "Failed to unlink backing file: "
                             << std::error_code{errc, std::system_category()};
    return ec;
  }

  return {};
}

// Check if backing file is empty, i.e. backing file has 0 bytes.
bool DiskBackedQueue::Empty() const {
  return total_backing_bytes_ == 0;
}

bool DiskBackedQueue::HasEnoughBackingSpaceFor(size_t bytes) const {
  return (bytes + total_backing_bytes_) < max_backing_size_;
}

void DiskBackedQueue::MaybePunchHole() {
  // Punch holes over the aligned region we have fully read past so the OS can reclaim pages.
  // Both offset and length must be multiples of the filesystem block size: XFS returns EINVAL
  // otherwise, and ext4/tmpfs only zero partial blocks rather than freeing them.
  // We assume 4096-byte blocks (correct for virtually all deployments); a fully robust
  // implementation would query the actual block size via fstatfs(file_->GetFd(), &fsst) and
  // align to fsst.f_bsize instead.
  const size_t aligned_end = (next_read_offset_ / 4096) * 4096;
  if (aligned_end > punch_offset_) {
    int res = fallocate(file_->GetFd(), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, punch_offset_,
                        aligned_end - punch_offset_);
    DCHECK_EQ(res, 0) << "fallocate punch failed: " << strerror(errno);
    punch_offset_ = aligned_end;
  }
}

void DiskBackedQueue::PushAsync(io::Bytes bytes, AsyncPushCallback cb) {
  const size_t offset = write_offset_;
  const size_t size = bytes.size();
  ++in_flight_callbacks_;

  file_->WriteAsync(bytes, offset, [this, size, cb = std::move(cb)](int res) {
    --in_flight_callbacks_;
    if (res < 0) {
      std::error_code ec{-res, std::system_category()};
      VLOG(2) << "Failed to offload blob of size " << size << " to backing with error: " << ec;
      cb(ec);
      return;
    }

    write_offset_ += size;
    total_backing_bytes_ += size;
    VLOG(2) << "Offload connection " << this << " backpressure of " << size;
    cb({});
  });
}

void DiskBackedQueue::PopAsync(io::MutableBytes out, AsyncPopCallback cb) {
  const size_t to_read = std::min(total_backing_bytes_, out.size());
  const size_t offset = next_read_offset_;
  ++in_flight_callbacks_;

  // Capture a subset of out for the actual read size
  io::MutableBytes read_buf = out.subspan(0, to_read);

  file_->ReadAsync(read_buf, offset, [this, to_read, offset, cb = std::move(cb)](int res) {
    --in_flight_callbacks_;
    if (res < 0) {
      std::error_code ec{-res, std::system_category()};
      LOG(ERROR) << "Could not load item at offset " << offset << " of size " << to_read
                 << " from disk with error: " << ec.value() << " " << ec.message();
      cb(nonstd::make_unexpected(ec));
      return;
    }

    size_t bytes_read = static_cast<size_t>(res);
    next_read_offset_ += bytes_read;
    total_backing_bytes_ -= bytes_read;

    VLOG(2) << "Loaded item with offset " << offset << " of size " << bytes_read
            << " for connection " << this;

    MaybePunchHole();

    cb(bytes_read);
  });
}

}  // namespace facade


================================================
FILE: src/facade/disk_backed_queue.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <functional>
#include <memory>
#include <string_view>
#include <system_error>

#include "io/io.h"
#include "util/fibers/uring_file.h"

namespace facade {

class DiskBackedQueue {
 public:
  explicit DiskBackedQueue(uint32_t conn_id);
  ~DiskBackedQueue();

  std::error_code Init();

  // Check if we can offload bytes to backing file.
  bool HasEnoughBackingSpaceFor(size_t bytes) const;

  using AsyncPushCallback = std::function<void(std::error_code)>;

  void PushAsync(io::Bytes bytes, AsyncPushCallback cb);

  using AsyncPopCallback = std::function<void(io::Result<size_t>)>;

  // Async read variant. Callback is invoked with Result containing bytes read or error.
  void PopAsync(io::MutableBytes out, AsyncPopCallback cb);

  // Check if backing file is empty, i.e. backing file has 0 bytes.
  bool Empty() const;

  std::error_code Close();

 private:
  // Punch holes over the aligned region we have fully read past so the OS can reclaim pages.
  void MaybePunchHole();

  // Single O_RDWR file used for both writes and reads, avoiding a separate fd for fallocate.
  std::unique_ptr<util::fb2::LinuxFile> file_;

  size_t write_offset_ = 0;
  size_t total_backing_bytes_ = 0;
  size_t next_read_offset_ = 0;
  // Tracks how far into the file holes have been punched (always 4096-aligned).
  size_t punch_offset_ = 0;

  // Read only constants
  const size_t max_backing_size_ = 0;

  // same as connection id. Used to uniquely identify the backed file
  const size_t id_ = 0;
  size_t in_flight_callbacks_ = 0;
};

}  // namespace facade


================================================
FILE: src/facade/disk_backed_queue_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/disk_backed_queue.h"

#include <absl/strings/str_cat.h>
#include <fcntl.h>
#include <gmock/gmock.h>
#include <unistd.h>

#include <memory>
#include <string>
#include <vector>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "io/io.h"
#include "util/fibers/pool.h"

namespace dfly {
namespace {

using namespace facade;

class DiskBackedQueueTest : public testing::Test {
 protected:
  void SetUp() override {
    pp_.reset(util::fb2::Pool::IOUring(16, 1));
    pp_->Run();
  }

  void TearDown() override {
    pp_->Stop();
    pp_.reset();
  }

  std::unique_ptr<util::ProactorPool> pp_;
};

// Verifies that after reading >= 4096 bytes, punch_hole is called correctly
// and disk space is reclaimed.
TEST_F(DiskBackedQueueTest, PunchHoleReleasesSpace) {
  pp_->at(0)->Await([]() {
    // Use id=2 to avoid collision with ReadWrite test.
    DiskBackedQueue backing(2);
    ASSERT_FALSE(backing.Init());

    // Write 3 pages (12288 bytes) so the punch logic is triggered on reads.
    std::string data(12288, 'x');
    {
      util::fb2::Done done;
      backing.PushAsync(io::MutableBytes(reinterpret_cast<uint8_t*>(data.data()), data.size()),
                        [&done](std::error_code ec) {
                          ASSERT_FALSE(ec);
                          done.Notify();
                        });
      done.Wait();
    }

    // Read all data back in 4096-byte chunks.
    std::string results;
    while (!backing.Empty()) {
      std::string buf(4096, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        done.Notify();
      });
      done.Wait();
    }
    EXPECT_EQ(results, data);

    // After reading all 3 pages the punch should have freed the first 3 aligned pages.
    // SEEK_HOLE at offset 0 returns 0 when a hole starts at the beginning of the file.
    int check_fd = open("/tmp/2", O_RDONLY);
    ASSERT_GE(check_fd, 0);
    off_t hole_start = lseek(check_fd, 0, SEEK_HOLE);
    close(check_fd);
    EXPECT_EQ(hole_start, 0) << "Expected hole at start of file - punch_hole did not free space";

    ASSERT_FALSE(backing.Close());
  });
}

// Verifies that reading across multiple pages advances the punch offset correctly so that
// successive reads keep freeing space (not re-punching offset 0 or skipping blocks).
TEST_F(DiskBackedQueueTest, PunchHoleAdvancesOffset) {
  pp_->at(0)->Await([]() {
    DiskBackedQueue backing(3);
    ASSERT_FALSE(backing.Init());

    // Write 8 pages so we can do several reads and check the hole grows.
    std::string data(32768, 'y');
    {
      util::fb2::Done done;
      backing.PushAsync(io::MutableBytes(reinterpret_cast<uint8_t*>(data.data()), data.size()),
                        [&done](std::error_code ec) {
                          ASSERT_FALSE(ec);
                          done.Notify();
                        });
      done.Wait();
    }

    // Read exactly 4096 bytes (1 page).
    {
      std::string buf(4096, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        done.Notify();
      });
      done.Wait();
    }

    // After 1 page read the hole should start at 0 and the first non-hole (data) should be at
    // offset 4096 (i.e., lseek SEEK_DATA starting from 0 skips the punched hole).
    int check_fd = open("/tmp/3", O_RDONLY);
    ASSERT_GE(check_fd, 0);
    off_t first_hole = lseek(check_fd, 0, SEEK_HOLE);
    off_t first_data = lseek(check_fd, 0, SEEK_DATA);
    close(check_fd);

    EXPECT_EQ(first_hole, 0) << "Hole should begin at offset 0 after first page read";
    EXPECT_EQ(first_data, 4096) << "Non-hole data should start at 4096 after punching first page";

    ASSERT_FALSE(backing.Close());
  });
}

// Verifies that unaligned writes and reads correctly punch holes at aligned boundaries.
// Punch should only occur when we've fully read past 4096-byte boundaries.
TEST_F(DiskBackedQueueTest, PunchHoleUnalignedReadsAndWrites) {
  pp_->at(0)->Await([]() {
    DiskBackedQueue backing(4);
    ASSERT_FALSE(backing.Init());

    // Write 10000 bytes (not a multiple of 4096).
    // This is 2 full pages (8192 bytes) + 1808 partial bytes.
    std::string data(10000, 'z');
    {
      util::fb2::Done done;
      backing.PushAsync(io::MutableBytes(reinterpret_cast<uint8_t*>(data.data()), data.size()),
                        [&done](std::error_code ec) {
                          ASSERT_FALSE(ec);
                          done.Notify();
                        });
      done.Wait();
    }

    // Read 3000 bytes (unaligned, less than 1 page).
    // next_read_offset_ will be 3000, but aligned_end = (3000/4096)*4096 = 0.
    // So no punch should happen yet.
    std::string results;
    {
      std::string buf(3000, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        done.Notify();
      });
      done.Wait();
    }

    // Check that no hole exists yet (first 3000 bytes read but not 4096-aligned).
    int check_fd = open("/tmp/4", O_RDONLY);
    ASSERT_GE(check_fd, 0);
    off_t hole_at_start = lseek(check_fd, 0, SEEK_HOLE);
    // SEEK_HOLE from offset 0 should jump to EOF if no hole exists at start.
    EXPECT_GT(hole_at_start, 0) << "No hole should exist yet after reading 3000 bytes";
    close(check_fd);

    // Read another 2000 bytes (total read = 5000 bytes).
    // next_read_offset_ will be 5000, aligned_end = (5000/4096)*4096 = 4096.
    // Now the first page (0-4095) should be punched.
    {
      std::string buf(2000, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        done.Notify();
      });
      done.Wait();
    }

    // Verify first page is now a hole.
    check_fd = open("/tmp/4", O_RDONLY);
    ASSERT_GE(check_fd, 0);
    off_t first_hole = lseek(check_fd, 0, SEEK_HOLE);
    off_t first_data = lseek(check_fd, 0, SEEK_DATA);
    EXPECT_EQ(first_hole, 0) << "Hole should start at offset 0 after reading past 4096 bytes";
    EXPECT_EQ(first_data, 4096) << "Data should start at 4096 (second page)";

    // Read another 3500 bytes (total read = 8500 bytes).
    // next_read_offset_ will be 8500, aligned_end = (8500/4096)*4096 = 8192.
    // Now the first two pages (0-8191) should be punched.
    {
      std::string buf(3500, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        done.Notify();
      });
      done.Wait();
    }

    // Verify first two pages are holes.
    first_hole = lseek(check_fd, 0, SEEK_HOLE);
    first_data = lseek(check_fd, 0, SEEK_DATA);
    close(check_fd);
    EXPECT_EQ(first_hole, 0) << "Hole should start at offset 0";
    EXPECT_EQ(first_data, 8192) << "Data should start at 8192 (third page)";

    // Read remaining data and verify results match.
    while (!backing.Empty()) {
      std::string buf(4096, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());
      util::fb2::Done done;
      backing.PopAsync(out, [&done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        done.Notify();
      });
      done.Wait();
    }
    EXPECT_EQ(results, data);

    ASSERT_FALSE(backing.Close());
  });
}

TEST_F(DiskBackedQueueTest, AsyncReadWrite) {
  pp_->at(0)->Await([]() {
    DiskBackedQueue backing(5 /* id */);
    EXPECT_FALSE(backing.Init());

    std::string commands;
    for (size_t i = 0; i < 100; ++i) {
      auto cmd = absl::StrCat("SET FOO", i, " BAR");
      commands += cmd;
    }

    // Async write all commands
    util::fb2::Fiber write_fiber = util::fb2::Fiber("writer", [&]() {
      for (size_t i = 0; i < 100; ++i) {
        auto cmd = absl::StrCat("SET FOO", i, " BAR");
        auto bytes = io::MutableBytes(reinterpret_cast<uint8_t*>(cmd.data()), cmd.size());

        util::fb2::Done done;
        backing.PushAsync(bytes, [&done](std::error_code ec) {
          EXPECT_FALSE(ec);
          done.Notify();
        });
        done.Wait();
      }
    });

    write_fiber.Join();

    // Async read all results
    std::string results;
    util::fb2::Fiber read_fiber = util::fb2::Fiber("reader", [&]() {
      while (!backing.Empty()) {
        std::string buf(1024, 'c');
        auto bytes = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());

        util::fb2::Done done;
        backing.PopAsync(bytes, [&done, &results, &buf](io::Result<size_t> res) {
          EXPECT_TRUE(res);
          results.append(buf.data(), *res);
          done.Notify();
        });
        done.Wait();
      }
    });

    read_fiber.Join();

    EXPECT_EQ(results.size(), commands.size());
    EXPECT_EQ(results, commands);

    EXPECT_FALSE(backing.Close());
  });
}

TEST_F(DiskBackedQueueTest, AsyncPunchHole) {
  pp_->at(0)->Await([]() {
    DiskBackedQueue backing(6);
    ASSERT_FALSE(backing.Init());

    // Write 3 pages (12288 bytes) asynchronously
    std::string data(12288, 'x');

    util::fb2::Done write_done;
    backing.PushAsync(io::MutableBytes(reinterpret_cast<uint8_t*>(data.data()), data.size()),
                      [&write_done](std::error_code ec) {
                        ASSERT_FALSE(ec);
                        write_done.Notify();
                      });
    write_done.Wait();

    // Async read all data back in 4096-byte chunks
    std::string results;
    while (!backing.Empty()) {
      std::string buf(4096, '\0');
      auto out = io::MutableBytes(reinterpret_cast<uint8_t*>(buf.data()), buf.size());

      util::fb2::Done read_done;
      backing.PopAsync(out, [&read_done, &results, &buf](io::Result<size_t> res) {
        ASSERT_TRUE(res);
        results.append(buf.data(), *res);
        read_done.Notify();
      });
      read_done.Wait();
    }
    EXPECT_EQ(results, data);

    // Verify punch hole freed space
    int check_fd = open("/tmp/6", O_RDONLY);
    ASSERT_GE(check_fd, 0);
    off_t hole_start = lseek(check_fd, 0, SEEK_HOLE);
    close(check_fd);
    EXPECT_EQ(hole_start, 0) << "Expected hole at start of file - async punch did not free space";

    ASSERT_FALSE(backing.Close());
  });
}

}  // namespace
}  // namespace dfly


================================================
FILE: src/facade/dragonfly_connection.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
//
// See LICENSE for licensing terms.
//

#include "facade/dragonfly_connection.h"

#include <absl/cleanup/cleanup.h>
#include <absl/container/flat_hash_map.h>
#include <absl/strings/escaping.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/time/time.h>

#include <numeric>
#include <variant>

#include "base/cycle_clock.h"
#include "base/flag_utils.h"
#include "base/flags.h"
#include "base/histogram.h"
#include "base/io_buf.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "common/heap_size.h"
#include "facade/conn_context.h"
#include "facade/dragonfly_listener.h"
#include "facade/facade_types.h"
#include "facade/memcache_parser.h"
#include "facade/redis_parser.h"
#include "facade/reply_builder.h"
#include "facade/resp_srv_parser.h"
#include "facade/service_interface.h"
#include "facade/socket_utils.h"
#include "io/file.h"
#include "strings/human_readable.h"
#include "util/fiber_socket_base.h"
#include "util/fibers/fibers.h"
#include "util/fibers/proactor_base.h"

#ifdef DFLY_USE_SSL
#include "util/tls/tls_socket.h"
#endif

#ifdef __linux__
#include "util/fibers/uring_file.h"
#include "util/fibers/uring_proactor.h"
#include "util/fibers/uring_socket.h"
#endif

using namespace std;
using facade::operator""_MB;

ABSL_FLAG(bool, tcp_nodelay, true,
          "Configures dragonfly connections with socket option TCP_NODELAY");
ABSL_FLAG(bool, primary_port_http_enabled, true,
          "If true allows accessing http console on main TCP port");

ABSL_FLAG(uint16_t, admin_port, 0,
          "If set, would enable admin access to console on the assigned port. "
          "This supports both HTTP and RESP protocols");

ABSL_FLAG(string, admin_bind, "",
          "If set, the admin consol TCP connection would be bind the given address. "
          "This supports both HTTP and RESP protocols");

ABSL_FLAG(strings::MemoryBytesFlag, request_cache_limit, 64_MB,
          "Amount of memory to use for request cache in bytes - per IO thread.");

ABSL_FLAG(strings::MemoryBytesFlag, pipeline_buffer_limit, 128_MB,
          "Amount of memory to use for storing pipeline requests - per IO thread."
          "Please note that clients that send excecissively huge pipelines, "
          "may deadlock themselves. See https://github.com/dragonflydb/dragonfly/discussions/3997"
          "for details.");

ABSL_FLAG(uint32_t, pipeline_queue_limit, 10000,
          "Pipeline queue max length, the server will stop reading from the client socket"
          " once its pipeline queue crosses this limit, and will resume once it processes "
          "excessive requests. This is to prevent OOM states. Users of huge pipelines sizes "
          "may require increasing this limit to prevent the risk of deadlocking."
          "See https://github.com/dragonflydb/dragonfly/discussions/3997 for details");

ABSL_FLAG(strings::MemoryBytesFlag, publish_buffer_limit, 128_MB,
          "Amount of memory to use for storing pub commands in bytes - per IO thread");

ABSL_FLAG(uint32_t, pipeline_squash, 1,
          "Number of queued pipelined commands above which squashing is enabled, 0 means disabled");

// When changing this constant, also update `test_large_cmd` test in connection_test.py.
ABSL_FLAG(uint32_t, max_multi_bulk_len, 1u << 16,
          "Maximum multi-bulk (array) length that is "
          "allowed to be accepted when parsing RESP protocol");

ABSL_FLAG(uint64_t, max_bulk_len, 2u << 30,
          "Maximum bulk length that is "
          "allowed to be accepted when parsing RESP protocol");

ABSL_FLAG(strings::MemoryBytesFlag, max_client_iobuf_len, 1u << 16,
          "Maximum io buffer length that is used to read client requests.");

ABSL_FLAG(bool, migrate_connections, true,
          "When enabled, Dragonfly will try to migrate connections to the target thread on which "
          "they operate. Currently this is only supported for Lua script invocations, and can "
          "happen at most once per connection.");

ABSL_FLAG(uint32_t, max_busy_read_usec, 200,
          "Maximum time we read and parse from "
          "a socket without yielding. In microseconds.");

ABSL_FLAG(size_t, squashed_reply_size_limit, 0,
          "Max bytes allowed for squashing_current_reply_size. If this limit is reached, "
          "connections dispatching pipelines won't squash them.");

ABSL_FLAG(bool, always_flush_pipeline, false,
          "if true will flush pipeline response after each pipeline squashing");

ABSL_FLAG(uint32_t, async_dispatch_quota, 100,
          "Maximum number of consecutive async dispatch messages to process before either "
          "yielding to I/O when the pipeline appears empty or forcibly processing a queued "
          "pipelined command to prevent starvation. Set to 0 to disable this mechanism.");

ABSL_FLAG(uint32_t, pipeline_squash_limit, 1 << 30, "Limit on the size of a squashed pipeline. ");
ABSL_FLAG(uint32_t, pipeline_wait_batch_usec, 0,
          "If non-zero, waits for this time for more I/O "
          " events to come for the connection in case there is only one command in the pipeline. ");

ABSL_FLAG(bool, experimental_io_loop_v2, true, "new io loop");

using namespace util;
using namespace std;
using absl::GetFlag;
using base::CycleClock;
using nonstd::make_unexpected;

namespace facade {

namespace {

void SendProtocolError(RespSrvParser::Result pres, SinkReplyBuilder* builder) {
  constexpr string_view res = "-ERR Protocol error: "sv;
  if (pres == RespSrvParser::BAD_BULKLEN) {
    builder->SendProtocolError(absl::StrCat(res, "invalid bulk length"));
  } else if (pres == RespSrvParser::BAD_ARRAYLEN) {
    builder->SendProtocolError(absl::StrCat(res, "invalid multibulk length"));
  } else {
    builder->SendProtocolError(absl::StrCat(res, "parse error"));
  }
}

// TODO: to implement correct matcher according to HTTP spec
// https://www.w3.org/Protocols/rfc2616/rfc2616-sec5.html
// One place to find a good implementation would be https://github.com/h2o/picohttpparser
bool MatchHttp11Line(string_view line) {
  return (absl::StartsWith(line, "GET ") || absl::StartsWith(line, "POST ")) &&
         absl::EndsWith(line, "HTTP/1.1");
}

void UpdateIoBufCapacity(const io::IoBuf& io_buf, ConnectionStats* stats,
                         absl::FunctionRef<void()> f) {
  const size_t prev_capacity = io_buf.Capacity();
  f();
  const size_t capacity = io_buf.Capacity();
  if (prev_capacity != capacity) {
    VLOG(2) << "Grown io_buf to " << capacity;
    stats->read_buf_capacity += capacity - prev_capacity;
  }
}

size_t UsedMemoryInternal(const ParsedCommand& msg) {
  return msg.GetSize() + msg.HeapMemory();
}

struct TrafficLogger {
  // protects agains closing the file while writing or data races when opening the file.
  // Also, makes sure that LogTraffic are executed atomically.
  fb2::Mutex mutex;
  unique_ptr<io::WriteFile> log_file;

  void ResetLocked();
  // Returns true if Write succeeded, false if it failed and the recording should be aborted.
  bool Write(string_view blob);
  bool Write(iovec* blobs, size_t len);
};

void TrafficLogger::ResetLocked() {
  if (log_file) {
    std::ignore = log_file->Close();
    log_file.reset();
  }
}

// Returns true if Write succeeded, false if it failed and the recording should be aborted.
bool TrafficLogger::Write(string_view blob) {
  auto ec = log_file->Write(io::Buffer(blob));
  if (ec) {
    LOG(ERROR) << "Error writing to traffic log: " << ec;
    ResetLocked();
    return false;
  }
  return true;
}

bool TrafficLogger::Write(iovec* blobs, size_t len) {
  auto ec = log_file->Write(blobs, len);
  if (ec) {
    LOG(ERROR) << "Error writing to traffic log: " << ec;
    ResetLocked();
    return false;
  }
  return true;
}

thread_local TrafficLogger tl_traffic_logger{};
thread_local base::Histogram* io_req_size_hist = nullptr;

thread_local const size_t reply_size_limit = absl::GetFlag(FLAGS_squashed_reply_size_limit);
thread_local uint32 pipeline_wait_batch_usec = absl::GetFlag(FLAGS_pipeline_wait_batch_usec);

void OpenTrafficLogger(string_view base_path) {
  unique_lock lk{tl_traffic_logger.mutex};
  if (tl_traffic_logger.log_file)
    return;

#ifdef __linux__
  // Open file with append mode, without it concurrent fiber writes seem to conflict
  string path = absl::StrCat(
      base_path, "-", absl::Dec(ProactorBase::me()->GetPoolIndex(), absl::kZeroPad3), ".bin");
  auto file = util::fb2::OpenWrite(path, io::WriteFile::Options{/*.append = */ false});
  if (!file) {
    LOG(ERROR) << "Error opening a file " << path << " for traffic logging: " << file.error();
    return;
  }
  tl_traffic_logger.log_file = unique_ptr<io::WriteFile>{file.value()};
#else
  LOG(WARNING) << "Traffic logger is only supported on Linux";
#endif

  // Write version, incremental numbering :)
  uint8_t version[1] = {2};
  std::ignore = tl_traffic_logger.log_file->Write(version);
}

void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args,
                ServiceInterface::ContextInfo ci) {
  string_view cmd = args.Front();
  if (absl::EqualsIgnoreCase(cmd, "debug"sv))
    return;

  DVLOG(2) << "Recording " << cmd;

  char stack_buf[1024];
  char* next = stack_buf;

  // We write id, timestamp, db_index, has_more, num_parts, part_len, part_len, part_len, ...
  // And then all the part blobs concatenated together.
  auto write_u32 = [&next](uint32_t i) {
    absl::little_endian::Store32(next, i);
    next += 4;
  };

  // id
  write_u32(id);

  // timestamp
  absl::little_endian::Store64(next, absl::GetCurrentTimeNanos());
  next += 8;

  // db_index
  write_u32(ci.db_index);

  // has_more, num_parts
  write_u32(has_more ? 1 : 0);
  write_u32(uint32_t(args.size()));

  // Grab the lock and check if the file is still open.
  lock_guard lk{tl_traffic_logger.mutex};
  if (!tl_traffic_logger.log_file)
    return;

  // part_len, ...
  for (auto part : args) {
    if (size_t(next - stack_buf + 4) > sizeof(stack_buf)) {
      if (!tl_traffic_logger.Write(string_view{stack_buf, size_t(next - stack_buf)})) {
        return;
      }
      next = stack_buf;
    }
    write_u32(part.size());
  }

  // Write the data itself.
  array<iovec, 16> blobs;
  unsigned index = 0;
  if (next != stack_buf) {
    blobs[index++] = iovec{.iov_base = stack_buf, .iov_len = size_t(next - stack_buf)};
  }

  for (auto part : args) {
    if (auto blob_len = part.size(); blob_len > 0) {
      blobs[index++] = iovec{.iov_base = const_cast<char*>(part.data()), .iov_len = blob_len};

      if (index >= blobs.size()) {
        if (!tl_traffic_logger.Write(blobs.data(), blobs.size())) {
          return;
        }
        index = 0;
      }
    }
  }

  if (index) {
    tl_traffic_logger.Write(blobs.data(), index);
  }
}

constexpr size_t kMinReadSize = 256;

const char* kPhaseName[Connection::NUM_PHASES] = {"SETUP", "READ", "PROCESS", "SHUTTING_DOWN",
                                                  "PRECLOSE"};

// Keeps track of total per-thread sizes of dispatch queues to limit memory taken up by messages
// in these queues.
struct QueueBackpressure {
  QueueBackpressure() {
  }

  // Block until subscriber memory usage is below limit, can be called from any thread.
  void EnsureBelowLimit();

  // Checks if backpressure should be applied.
  // 'size' should be the total bytes currently consumed by all connections on this thread.
  // 'q_len' should be the length of the pipeline queue for the current connection.
  //
  // Returns true if EITHER:
  // 1. Thread-local: memory limit (on all thread's connections) is exceeded (protects server from
  // OOM).
  // 2. Per-Connection queue length limit is exceeded (protects against single-client abuse).
  bool IsPipelineBufferOverLimit(size_t size, uint32_t q_len) const {
    return size >= (pipeline_buffer_limit) || (q_len > pipeline_queue_max_len);
  }

  // Checks if usage has dropped below the limit in at least one criteria.
  // Used to determine if we should notify waiters.
  // 'size' should be the total bytes currently consumed by all connections on this thread.
  // 'q_len' should be the length of the pipeline queue for the current connection.
  //
  // Returns true if EITHER:
  // 1. Thread-Global memory is now under the limit (allows neighbors to wake up).
  // 2. Per-Connection queue length is now within the limit (allows self to wake up).
  bool IsPipelineBufferUnderLimit(size_t size, uint32_t q_len) const {
    return (size < pipeline_buffer_limit) || (q_len <= pipeline_queue_max_len);
  }

  // Used by publisher/subscriber actors to make sure we do not publish too many messages
  // into the queue. Thread-safe to allow safe access in EnsureBelowLimit.
  util::fb2::EventCount pubsub_ec;
  atomic_size_t subscriber_bytes = 0;

  // Used by pipelining/execution fiber to throttle the incoming pipeline messages.
  // Used together with pipeline_buffer_limit to limit the pipeline usage per thread.
  util::fb2::CondVarAny pipeline_cnd;

  size_t publish_buffer_limit = 0;        // cached flag publish_buffer_limit
  size_t pipeline_cache_limit = 0;        // cached flag pipeline_cache_limit
  size_t pipeline_buffer_limit = 0;       // cached flag for buffer size in bytes
  uint32_t pipeline_queue_max_len = 256;  // cached flag for pipeline queue max length.
};

void QueueBackpressure::EnsureBelowLimit() {
  pubsub_ec.await(
      [this] { return subscriber_bytes.load(memory_order_relaxed) <= publish_buffer_limit; });
}

// Global array for each io thread to keep track of the total memory usage of the dispatch queues.
QueueBackpressure* thread_queue_backpressure = nullptr;

QueueBackpressure& GetQueueBackpressure() {
  DCHECK(thread_queue_backpressure != nullptr);

  return thread_queue_backpressure[ProactorBase::me()->GetPoolIndex()];
}

// A special accessor for accessing thread local ConnectionStats that is robust to fiber-thread
// migrations. Compiler optimizations can cache a stale thread local pointer, and not refresh it
// after HandleMigrateRequest() is called. This function should be used to force loading
// the variable from memory every time, preventing such bugs.
ConnectionStats& __attribute__((noinline)) GetLocalConnStats() {
  // https://stackoverflow.com/a/75622732
  asm volatile("");

  return tl_facade_stats->conn_stats;
}

thread_local uint64_t max_busy_read_cycles_cached = 1ULL << 32;
thread_local bool always_flush_pipeline_cached = absl::GetFlag(FLAGS_always_flush_pipeline);
thread_local uint32_t pipeline_squash_limit_cached = absl::GetFlag(FLAGS_pipeline_squash_limit);

}  // namespace

thread_local vector<Connection::PipelineMessagePtr> Connection::pipeline_req_pool_;

class PipelineCacheSizeTracker {
 public:
  bool CheckAndUpdateWatermark(size_t pipeline_sz) {
    const auto now = absl::Now();
    const auto elapsed = now - last_check_;
    min_ = std::min(min_, pipeline_sz);
    if (elapsed < absl::Milliseconds(10)) {
      return false;
    }

    const bool watermark_reached = (min_ > 0);
    min_ = Limits::max();
    last_check_ = absl::Now();

    return watermark_reached;
  }

 private:
  using Limits = std::numeric_limits<size_t>;

  absl::Time last_check_ = absl::Now();
  size_t min_ = Limits::max();
};

thread_local PipelineCacheSizeTracker tl_pipe_cache_sz_tracker;

size_t Connection::MessageHandle::UsedMemory() const {
  struct MessageSize {
    size_t operator()(const PubMessagePtr& msg) {
      return sizeof(PubMessage) + (msg->channel.size() + msg->message.size());
    }
    size_t operator()(const MonitorMessage& msg) {
      return msg.capacity();
    }
    size_t operator()(const MigrationRequestMessage& msg) {
      return 0;
    }
    size_t operator()(const CheckpointMessage& msg) {
      return 0;  // no access to internal type, memory usage negligible
    }
    size_t operator()(const InvalidationMessage& msg) {
      return 0;
    }
  };

  return sizeof(MessageHandle) + visit(MessageSize{}, this->handle);
}

bool Connection::MessageHandle::IsReplying() const {
  return IsPubMsg() || holds_alternative<MonitorMessage>(handle);
}

struct Connection::AsyncOperations {
  AsyncOperations(SinkReplyBuilder* b, Connection* me) : builder{b}, self(me) {
  }

  void operator()(const PubMessage& msg);
  void operator()(ParsedCommand& msg);
  void operator()(const MonitorMessage& msg);
  void operator()(const MigrationRequestMessage& msg);
  void operator()(CheckpointMessage msg);
  void operator()(const InvalidationMessage& msg);

  template <typename T, typename D> void operator()(unique_ptr<T, D>& ptr) {
    operator()(*ptr.get());
  }

  SinkReplyBuilder* builder = nullptr;
  Connection* self = nullptr;
};

void Connection::AsyncOperations::operator()(const MonitorMessage& msg) {
  RedisReplyBuilder* rbuilder = (RedisReplyBuilder*)builder;
  rbuilder->SendSimpleString(msg);
}

void Connection::AsyncOperations::operator()(const PubMessage& pub_msg) {
  RedisReplyBuilder* rb = static_cast<RedisReplyBuilder*>(builder);

  // Discard stale messages to not break the protocol after exiting "pubsub" mode.
  // Even after removing all subscriptions, we still can receive messages delayed
  // by inter-thread dispatches or backpressure.
  // TODO: filter messages from channels the client unsubscribed from
  if (self->cntx()->subscriptions == 0 &&
      !base::_in(pub_msg.channel, {"unsubscribe", "punsubscribe"}))
    return;

  if (pub_msg.force_unsubscribe) {
    rb->StartCollection(3, CollectionType::PUSH);
    rb->SendBulkString("sunsubscribe");
    rb->SendBulkString(pub_msg.channel);
    rb->SendLong(0);
    self->cntx()->Unsubscribe(pub_msg.channel);
    return;
  }

  unsigned i = 0;
  array<string_view, 4> arr;
  if (pub_msg.pattern.empty()) {
    arr[i++] = pub_msg.is_sharded ? "smessage" : "message";
  } else {
    arr[i++] = "pmessage";
    arr[i++] = pub_msg.pattern;
  }

  arr[i++] = pub_msg.channel;
  arr[i++] = pub_msg.message;

  rb->SendBulkStrArr(absl::Span<string_view>{arr.data(), i}, CollectionType::PUSH);
}

void Connection::AsyncOperations::operator()(ParsedCommand& cmd) {
  DVLOG(2) << "Dispatching pipeline: " << cmd.Front();

  ++self->local_stats_.cmds;
  self->service_->DispatchCommand(ParsedArgs{cmd}, &cmd, facade::AsyncPreference::ONLY_SYNC);

  self->last_interaction_ = time(nullptr);
  self->skip_next_squashing_ = false;
}

void Connection::AsyncOperations::operator()(const MigrationRequestMessage& msg) {
  // no-op
}

void Connection::AsyncOperations::operator()(CheckpointMessage msg) {
  VLOG(2) << "Decremented checkpoint at " << self->DebugInfo();

  msg.bc->Dec();
}

void Connection::AsyncOperations::operator()(const InvalidationMessage& msg) {
  RedisReplyBuilder* rbuilder = (RedisReplyBuilder*)builder;
  DCHECK(rbuilder->IsResp3());
  rbuilder->StartCollection(2, facade::CollectionType::PUSH);
  rbuilder->SendBulkString("invalidate");
  if (msg.invalidate_due_to_flush) {
    rbuilder->SendNull();
  } else {
    string_view keys[] = {msg.key};
    rbuilder->SendBulkStrArr(keys);
  }
}

namespace {
thread_local absl::flat_hash_map<string, uint64_t> g_libname_ver_map;

void UpdateLibNameVerMap(const string& name, const string& ver, int delta) {
  string key = absl::StrCat(name, ":", ver);
  uint64_t& val = g_libname_ver_map[key];
  val += delta;
  if (val == 0) {
    g_libname_ver_map.erase(key);
  }
}
}  // namespace

void Connection::Init(unsigned io_threads) {
  CHECK(thread_queue_backpressure == nullptr);
  thread_queue_backpressure = new QueueBackpressure[io_threads];

  for (unsigned i = 0; i < io_threads; ++i) {
    auto& qbp = thread_queue_backpressure[i];
    qbp.publish_buffer_limit = GetFlag(FLAGS_publish_buffer_limit);
    qbp.pipeline_cache_limit = GetFlag(FLAGS_request_cache_limit);
    qbp.pipeline_buffer_limit = GetFlag(FLAGS_pipeline_buffer_limit);
    qbp.pipeline_queue_max_len = GetFlag(FLAGS_pipeline_queue_limit);

    if (qbp.publish_buffer_limit == 0 || qbp.pipeline_cache_limit == 0 ||
        qbp.pipeline_buffer_limit == 0 || qbp.pipeline_queue_max_len == 0) {
      LOG(ERROR) << "pipeline flag limit is 0";
      exit(-1);
    }
  }
}

void Connection::Shutdown() {
  delete[] thread_queue_backpressure;
  thread_queue_backpressure = nullptr;
}

Connection::Connection(Protocol protocol, util::HttpListenerBase* http_listener, SSL_CTX* ctx,
                       ServiceInterface* service)
    : io_buf_(kMinReadSize),
      protocol_(protocol),
      http_listener_(http_listener),
      ssl_ctx_(ctx),
      service_(service),
      flags_(0) {
  static atomic_uint32_t next_id{1};

  constexpr size_t kReqSz = sizeof(ParsedCommand);
  static_assert(kReqSz <= 256);

  // TODO: to move parser initialization to where we initialize the reply builder.
  switch (protocol) {
    case Protocol::REDIS:
      redis_parser_.reset(
          new RespSrvParser(GetFlag(FLAGS_max_multi_bulk_len), GetFlag(FLAGS_max_bulk_len)));
      break;
    case Protocol::MEMCACHE:
      memcache_parser_ =
          make_unique<MemcacheParser>(std::min<uint64_t>(GetFlag(FLAGS_max_bulk_len), UINT32_MAX));
      break;
  }

  creation_time_ = time(nullptr);
  last_interaction_ = creation_time_;
  id_ = next_id.fetch_add(1, memory_order_relaxed);

  migration_enabled_ = GetFlag(FLAGS_migrate_connections);

  // Create shared_ptr with empty value and associate it with `this` pointer (aliasing constructor).
  // We use it for reference counting and accessing `this` (without managing it).
  self_ = {make_shared<std::monostate>(), this};

#ifdef DFLY_USE_SSL
  // Increment reference counter so Listener won't free the context while we're
  // still using it.
  if (ctx) {
    SSL_CTX_up_ref(ctx);
  }
#endif

  UpdateLibNameVerMap(lib_name_, lib_ver_, +1);
  migration_allowed_to_register_ = false;
}

Connection::~Connection() {
#ifdef DFLY_USE_SSL
  SSL_CTX_free(ssl_ctx_);
#endif
  UpdateLibNameVerMap(lib_name_, lib_ver_, -1);
}

bool Connection::IsSending() const {
  return reply_builder_ && reply_builder_->IsSendActive();
}

void Connection::MarkForClose() {
  if (reply_builder_) {
    reply_builder_->CloseConnection();
  }
  request_shutdown_ = true;
}

// Called from Connection::Shutdown() right after socket_->Shutdown call.
void Connection::OnShutdown() {
  VLOG(1) << "Connection::OnShutdown";

  BreakOnce(POLLHUP);
  io_ec_ = make_error_code(errc::connection_aborted);
  io_event_.notify();
}

void Connection::OnPreMigrateThread() {
  DVLOG(1) << "OnPreMigrateThread " << GetClientId();

  CHECK(!cc_->conn_closing);

  DCHECK(!migration_in_process_);

  // CancelOnErrorCb is a preemption point, so we make sure the Migration start
  // is marked beforehand.
  migration_in_process_ = true;

  // Mark as not owned by any thread as it going through the dark hole
  self_.reset();

  socket_->CancelOnErrorCb();
  DCHECK(!async_fb_.IsJoinable()) << GetClientId();

  DecreaseConnStats();
}

void Connection::OnPostMigrateThread() {
  DVLOG(1) << "[" << id_ << "] OnPostMigrateThread";

  // Once we migrated, we should rearm OnBreakCb callback.
  if (breaker_cb_ && socket()->IsOpen()) {
    socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
  }

  if (ioloop_v2_ && socket_ && socket_->IsOpen() && migration_allowed_to_register_) {
    socket_->RegisterOnRecv([this](const FiberSocketBase::RecvNotification& n) {
      DoReadOnRecv(n);
      io_event_.notify();
    });
  }

  migration_in_process_ = false;
  self_ = {make_shared<std::monostate>(), this};  // Recreate shared_ptr to self.
  DCHECK(!async_fb_.IsJoinable());

  // If someone had sent Async during the migration, we must create async_fb_.
  if (HasPendingMessages()) {
    LaunchAsyncFiberIfNeeded();
  }

  IncreaseConnStats();
}

void Connection::OnConnectionStart() {
  SetName(absl::StrCat(id_));

  // is null in unit-tests.
  if (const Listener* lsnr = static_cast<Listener*>(listener()); lsnr) {
    is_main_ = lsnr->IsMainInterface();
  }

  if (GetFlag(FLAGS_tcp_nodelay) && !socket_->IsUDS()) {
    int val = 1;
    int res = setsockopt(socket_->native_handle(), IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
    DCHECK_EQ(res, 0);
  }
}

void Connection::HandleRequests() {
  VLOG(1) << "[" << id_ << "] HandleRequests";
  DCHECK(tl_facade_stats);
  auto& conn_stats = tl_facade_stats->conn_stats;

  auto remote_ep = RemoteEndpointStr();

#ifdef DFLY_USE_SSL
  if (ssl_ctx_) {
    // Early TLS connection filter
    //
    // Before entering the expensive OpenSSL handshake we pre-read the 5-byte TLS Record Layer
    // header on the raw TCP socket. This serves two purposes:
    //
    //  1. Wrong-client detection:
    //     Clients that forgot to enable TLS (e.g. a plaintext Redis client connecting to the TLS
    //     port) will not send a valid TLS Record Layer header.  We detect this immediately and
    //     reply with a human-readable "-ERR" message before disconnecting, instead of letting
    //     OpenSSL produce a cryptic handshake failure.
    //
    //  2. Zombie-connection rejection:
    //     Zombie connections —— open a TCP socket but never send any data.  By demanding at least
    //     the 5-byte header before allocating any SSL state, we drop these cheaply on the raw
    //     socket instead of tying up an OpenSSL context and handshake state machine that will never
    //     complete.
    //
    // The pre-read header bytes are injected into the TlsSocket via InitSSL(), which writes them
    // into OpenSSL's internal BIO so that Accept() can drive the normal handshake from there.
    //
    // Reminder: TLS Record Layer header structure (universal across TLS 1.0 – 1.3):
    // - Byte 0: ContentType (0x16 = Handshake)
    // - Bytes 1–2: ProtocolVersion. While the minor version varies (0x01 for TLS 1.0,
    //   0x03 for TLS 1.2/1.3), the major version is consistently 0x03 for all
    //   modern TLS versions.
    // - Bytes 3–4: Length (uint16 BE) — payload length, max 2^14 = 16384
    uint8_t buf[5];  // universal TLS Record Header size is 5 bytes
    auto read_sz = socket_->Read(io::MutableBytes(buf));
    if (!read_sz || *read_sz < sizeof(buf)) {
      auto msg = read_sz ? absl::StrCat(*read_sz, " < ", sizeof(buf)) : read_sz.error().message();
      LOG_EVERY_T(INFO, 1) << "Error reading from peer " << remote_ep << " " << msg
                           << ", socket state: " + dfly::GetSocketInfo(socket_->native_handle());
      conn_stats.tls_accept_disconnects++;
      return;
    }

    // Byte 0: ContentType must be 0x16 (Handshake).
    // Byte 1: major ProtocolVersion — always 0x03 for TLS 1.0 through TLS 1.3.
    // Byte 2: minor ProtocolVersion — 0x01 (TLS 1.0), 0x02 (TLS 1.1), 0x03 (TLS 1.2/1.3).
    //         SSL 3.0 (0x00) is deprecated (RFC 7568) and rejected.
    if ((buf[0] != 0x16) || (buf[1] != 0x03) || (buf[2] < 0x01) || (buf[2] > 0x03)) {
      VLOG(1) << "Bad TLS header "
              << absl::StrCat(absl::Hex(buf[0], absl::kZeroPad2),
                              absl::Hex(buf[1], absl::kZeroPad2),
                              absl::Hex(buf[2], absl::kZeroPad2));
      std::ignore =
          socket_->Write(io::Buffer("-ERR Bad TLS header, double check "
                                    "if you enabled TLS for your client.\r\n"));
      conn_stats.tls_accept_disconnects++;
      return;
    }

    // Must be done atomically before the preemption point in Accept so that at any
    // point in time, the socket_ is defined.
    {
      FiberAtomicGuard fg;
      unique_ptr<tls::TlsSocket> tls_sock = make_unique<tls::TlsSocket>(std::move(socket_));
      tls_sock->InitSSL(ssl_ctx_, buf);
      SetSocket(tls_sock.release());
    }
    FiberSocketBase::AcceptResult aresult = socket_->Accept();

    if (!aresult) {
      // This can flood the logs -- don't change
      LOG_EVERY_T(INFO, 1) << "Error handshaking " << aresult.error().message()
                           << ", socket state: " + dfly::GetSocketInfo(socket_->native_handle());
      conn_stats.tls_accept_disconnects++;
      return;
    }
    is_tls_ = 1;
    VLOG(1) << "TLS handshake succeeded";
  }
#endif

  io::Result<bool> http_res{false};

  http_res = CheckForHttpProto();

  // We need to check if the socket is open because the server might be
  // shutting down. During the shutdown process, the server iterates over
  // the connections of each shard and shuts down their socket. Since the
  // main listener dispatches the connection into the next proactor, we
  // allow a schedule order that first shuts down the socket and then calls
  // this function which triggers a DCHECK on the socket while it tries to
  // RegisterOnErrorCb. Furthermore, we can get away with one check here
  // because both Write and Recv internally check if the socket was shut
  // down and return with an error accordingly.
  if (http_res && socket_->IsOpen()) {
    cc_.reset(service_->CreateContext(this));

    if (*http_res) {
      VLOG(1) << "HTTP1.1 identified";
      is_http_ = true;
      HttpConnection http_conn{http_listener_};
      http_conn.SetSocket(socket_.get());
      http_conn.set_user_data(cc_.get());

      // We validate the http request using basic-auth inside HttpConnection::HandleSingleRequest.
      cc_->authenticated = true;
      auto ec = http_conn.ParseFromBuffer(io_buf_.InputBuffer());
      io_buf_.ConsumeInput(io_buf_.InputLen());
      if (!ec) {
        http_conn.HandleRequests();
      }

      // Release the ownership of the socket from http_conn so it would stay with
      // this connection.
      http_conn.ReleaseSocket();
    } else {  // non-http
      // ioloop_v2 not supported for TLS & redis connections yet.
      ioloop_v2_ =
          GetFlag(FLAGS_experimental_io_loop_v2) && !is_tls_ && protocol_ == Protocol::MEMCACHE;

      if (breaker_cb_) {
        socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
      }
      switch (protocol_) {
        case Protocol::REDIS:
          reply_builder_.reset(new RedisReplyBuilder(socket_.get()));
          break;
        case Protocol::MEMCACHE:
          reply_builder_.reset(new MCReplyBuilder(socket_.get()));
          break;
        default:
          break;
      }
      parsed_cmd_ = CreateParsedCommand();
      ConnectionFlow();

      socket_->CancelOnErrorCb();  // noop if nothing is registered.
      VLOG(1) << "Closed connection for peer "
              << GetClientInfo(fb2::ProactorBase::me()->GetPoolIndex());
      reply_builder_.reset();
      DestroyParsedQueue();
    }
    cc_.reset();
  }
}

unsigned Connection::GetSendWaitTimeSec() const {
  if (reply_builder_ && reply_builder_->IsSendActive()) {
    return (util::fb2::ProactorBase::GetMonotonicTimeNs() - reply_builder_->GetLastSendTimeNs()) /
           1'000'000'000;
  }

  return 0;
}

void Connection::RegisterBreakHook(BreakerCb breaker_cb) {
  breaker_cb_ = std::move(breaker_cb);
}

void Connection::FlushReplies() {  // NOLINT must not be const due to flush side effect
  DCHECK(reply_builder_);
  reply_builder_->Flush();
}

pair<string, string> Connection::GetClientInfoBeforeAfterTid() const {
  if (!socket_) {
    LOG(DFATAL) << "unexpected null socket_ "
                << " phase " << unsigned(phase_) << ", is_http: " << unsigned(is_http_);
    return {};
  }

  CHECK_LT(unsigned(phase_), NUM_PHASES);

  string before;
  auto le = LocalBindStr();
  auto re = RemoteEndpointStr();
  time_t now = time(nullptr);

  int cpu = 0;
  socklen_t len = sizeof(cpu);
  getsockopt(socket_->native_handle(), SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);

#ifdef __APPLE__
  int my_cpu_id = -1;  // __APPLE__ does not have sched_getcpu()
#else
  int my_cpu_id = sched_getcpu();
#endif

  static constexpr string_view PHASE_NAMES[] = {"setup", "readsock", "process", "shutting_down",
                                                "preclose"};
  static_assert(NUM_PHASES == ABSL_ARRAYSIZE(PHASE_NAMES));
  static_assert(PHASE_NAMES[SHUTTING_DOWN] == "shutting_down");

  absl::StrAppend(&before, "id=", id_, " addr=", re, " laddr=", le);
  absl::StrAppend(&before, " fd=", socket_->native_handle());
  if (is_http_) {
    absl::StrAppend(&before, " http=true");
  } else {
    absl::StrAppend(&before, " name=", name_);
  }
#ifdef DFLY_USE_SSL
  if (is_tls_) {
    tls::TlsSocket* tls_sock = static_cast<tls::TlsSocket*>(socket_.get());
    string_view proto_version = SSL_get_version(tls_sock->ssl_handle());
    const SSL_CIPHER* cipher = SSL_get_current_cipher(tls_sock->ssl_handle());
    absl::StrAppend(&before, " tls=", proto_version, "|", SSL_CIPHER_get_name(cipher));
  }
#endif
  string after;
  absl::StrAppend(&after, " irqmatch=", int(cpu == my_cpu_id));
  if (parsed_cmd_q_len_ > 0) {
    absl::StrAppend(&after, " pipeline=", parsed_cmd_q_len_);
    absl::StrAppend(&after, " pbuf=", parsed_cmd_q_bytes_);
  }
  absl::StrAppend(&after, " age=", now - creation_time_, " idle=", now - last_interaction_);
  string_view phase_name = PHASE_NAMES[phase_];

  absl::StrAppend(&after, " tot-cmds=", local_stats_.cmds,
                  " tot-net-in=", local_stats_.net_bytes_in,
                  " tot-read-calls=", local_stats_.read_cnt,
                  " tot-dispatches=", local_stats_.dispatch_entries_added);

  if (cc_) {
    string cc_info = service_->GetContextInfo(cc_.get()).Format();

    // reply_builder_ may be null if the connection is in the setup phase, for example.
    if (reply_builder_ && reply_builder_->IsSendActive())
      phase_name = "send";
    absl::StrAppend(&after, " ", cc_info);
  }
  absl::StrAppend(&after, " phase=", phase_name);

  if (IsSending()) {
    absl::StrAppend(&before, " send-wait-time=", GetSendWaitTimeSec());
  }

  return {std::move(before), std::move(after)};
}

string Connection::GetClientInfo(unsigned thread_id) const {
  auto [before, after] = GetClientInfoBeforeAfterTid();
  absl::StrAppend(&before, " tid=", thread_id);
  absl::StrAppend(&before, after);
  absl::StrAppend(&before, " lib-name=", lib_name_, " lib-ver=", lib_ver_);
  return before;
}

string Connection::GetClientInfo() const {
  auto [before, after] = GetClientInfoBeforeAfterTid();
  absl::StrAppend(&before, after);
  // The following are dummy fields and users should not rely on those unless
  // we decide to implement them.
  // This is only done because the redis pyclient parser for the field "client-info"
  // for the command ACL LOG hardcodes the expected values. This behaviour does not
  // conform to the actual expected values, since it's missing half of them.
  // That is, even for redis-server, issuing an ACL LOG command via redis-cli and the pyclient
  // will return different results! For example, the fields:
  // addr=127.0.0.1:57275
  // laddr=127.0.0.1:6379
  // are missing from the pyclient.

  absl::StrAppend(&before, " qbuf=0 ", "qbuf-free=0 ", "obl=0 ", "argv-mem=0 ");
  absl::StrAppend(&before, "oll=0 ", "omem=0 ", "tot-mem=0 ", "multi=0 ");
  absl::StrAppend(&before, "psub=0 ", "sub=0");
  return before;
}

uint32_t Connection::GetClientId() const {
  return id_;
}

bool Connection::IsPrivileged() const {
  return static_cast<Listener*>(listener())->IsPrivilegedInterface();
}

bool Connection::IsMain() const {
  return is_main_;
}

bool Connection::IsMainOrMemcache() const {
  return is_main_ || protocol_ == Protocol::MEMCACHE;
}

void Connection::SetName(string name) {
  util::ThisFiber::SetName(absl::StrCat("DflyConn_", name));
  name_ = std::move(name);
}

void Connection::SetLibName(string name) {
  UpdateLibNameVerMap(lib_name_, lib_ver_, -1);
  lib_name_ = std::move(name);
  UpdateLibNameVerMap(lib_name_, lib_ver_, +1);
}

void Connection::SetLibVersion(string version) {
  UpdateLibNameVerMap(lib_name_, lib_ver_, -1);
  lib_ver_ = std::move(version);
  UpdateLibNameVerMap(lib_name_, lib_ver_, +1);
}

const absl::flat_hash_map<string, uint64_t>& Connection::GetLibStatsTL() {
  return g_libname_ver_map;
}

io::Result<bool> Connection::CheckForHttpProto() {
  if (!IsPrivileged() && !IsMain()) {
    return false;
  }

  const bool primary_port_enabled = GetFlag(FLAGS_primary_port_http_enabled);
  if (!primary_port_enabled && !IsPrivileged()) {
    return false;
  }

  size_t last_len = 0;
  auto* peer = socket_.get();
  auto& conn_stats = tl_facade_stats->conn_stats;
  do {
    auto buf = io_buf_.AppendBuffer();
    DCHECK(!buf.empty());

    ::io::Result<size_t> recv_sz = peer->Recv(buf);
    if (!recv_sz) {
      return make_unexpected(recv_sz.error());
    }
    if (recv_sz == 0) {
      // Peer closed connection.
      return false;
    }

    io_buf_.CommitWrite(*recv_sz);
    string_view ib = io::View(io_buf_.InputBuffer());
    if (ib.size() >= 2 && ib[0] == 22 && ib[1] == 3) {
      // We matched the TLS handshake raw data, which means "peer" is a TCP socket.
      // Reject the connection.
      return make_unexpected(make_error_code(errc::protocol_not_supported));
    }

    ib = ib.substr(last_len);
    size_t pos = ib.find('\n');
    if (pos != string_view::npos) {
      ib = io::View(io_buf_.InputBuffer().first(last_len + pos));
      if (ib.size() < 10 || ib.back() != '\r')
        return false;

      ib.remove_suffix(1);
      return MatchHttp11Line(ib);
    }
    last_len = io_buf_.InputLen();
    UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() { io_buf_.EnsureCapacity(128); });
  } while (last_len < 1024);

  return false;
}

void Connection::ConnectionFlow() {
  DCHECK(reply_builder_);
  auto& conn_stats = tl_facade_stats->conn_stats;

  // Register the new connection with the thread-local statistics.
  // At this point (connection birth), local queue stats/luggage are 0,
  // so only connection counts and buffer capacities are incremented.
  IncreaseConnStats();
  ++conn_stats.conn_received_cnt;

  ++local_stats_.read_cnt;
  local_stats_.net_bytes_in += io_buf_.InputLen();

  ParserStatus parse_status = OK;

  // At the start we read from the socket to determine the HTTP/Memstore protocol.
  // Therefore we may already have some data in the buffer.
  if (io_buf_.InputLen() > 0) {
    phase_ = PROCESS;
    if (redis_parser_) {
      parse_status = ParseRedis(10000);
    } else {
      DCHECK(memcache_parser_);
      parse_status = ParseLoop();
    }
  }

  error_code ec = reply_builder_->GetError();

  // Main loop.
  if (parse_status != ERROR && !ec) {
    UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() { io_buf_.EnsureCapacity(64); });
    variant<error_code, Connection::ParserStatus> res;
    if (ioloop_v2_) {
      // Everything above the IoLoopV2 is fiber blocking. A connection can migrate before
      // it reaches here and will cause a double RegisterOnRecv check fail. To avoid this,
      // a migration shall only call RegisterOnRecv if it reached the main IoLoopV2 below.
      migration_allowed_to_register_ = true;
      res = IoLoopV2();
    } else {
      res = IoLoop();
    }

    if (holds_alternative<error_code>(res)) {
      ec = get<error_code>(res);
    } else {
      parse_status = get<ParserStatus>(res);
    }
  }

  // After the client disconnected.
  cc_->conn_closing = true;  // Signal dispatch to close.
  cnd_.notify_one();
  phase_ = SHUTTING_DOWN;
  VLOG(2) << "Before dispatch_fb.join()";
  async_fb_.JoinIfNeeded();
  VLOG(2) << "After dispatch_fb.join()";

  phase_ = PRECLOSE;

  ClearPipelinedMessages();
  DCHECK(!HasPendingMessages());

  service_->OnConnectionClose(cc_.get());

  // We have already cleared the queues above (ClearPipelinedMessages), so local queue stats
  // (dispatch_q_bytes_, etc.) represent 0 usage. DecreaseConnStats will safely subtract 0 for those
  // stats, while correctly removing this connection from the global connection counts and buffer
  // capacity tracking.
  DecreaseConnStats();

  if (ioloop_v2_) {
    socket_->ResetOnRecvHook();
  }

  // We wait for dispatch_fb to finish writing the previous replies before replying to the last
  // offending request.
  if (parse_status == ERROR) {
    VLOG(1) << "Error parser status " << parser_error_;

    if (redis_parser_) {
      SendProtocolError(RespSrvParser::Result(parser_error_), reply_builder_.get());
    } else {
      DCHECK(memcache_parser_);
      reply_builder_->SendProtocolError("bad command line format");
    }

    // Shut down the servers side of the socket to send a FIN to the client
    // then keep draining the socket (discarding any received data) until
    // the client closes the connection.
    //
    // Otherwise the clients write could fail (or block), so they would never
    // read the above protocol error (see issue #1327).
    // TODO: we have a bug that can potentially deadlock the code below.
    // If the socket does not close the socket on the other side, the while loop will never finish.
    // to reproduce: nc localhost 6379  and then run invalid sequence: *1 <enter> *1 <enter>
    error_code ec2 = socket_->Shutdown(SHUT_WR);
    LOG_IF(WARNING, ec2) << "Could not shutdown socket " << ec2;
    while (!ec2) {
      // Discard any received data.
      io_buf_.Clear();
      auto recv_sz = socket_->Recv(io_buf_.AppendBuffer());
      if (!recv_sz || *recv_sz == 0) {
        break;  // Peer closed connection.
      }
    }
  }

  if (ec && !FiberSocketBase::IsConnClosed(ec)) {
    string conn_info = service_->GetContextInfo(cc_.get()).Format();
    LOG_EVERY_T(WARNING, 1) << "Socket error for connection " << conn_info << " " << GetName()
                            << " during phase " << kPhaseName[phase_] << " : " << ec << " "
                            << ec.message();
  }
}

void Connection::DispatchSingle(bool has_more, absl::FunctionRef<void()> invoke_cb,
                                absl::FunctionRef<void()> enqueue_cmd_cb) {
  // Unconditional return when closing:
  // else, non-throttled connections skip the check below and enqueue data even if they are closing.
  // No one will read that data anyway.
  if (cc_->conn_closing)
    return;
  auto can_dispatch_sync_fn = [this]() {
    return !cc_->async_dispatch && !HasPendingMessages() && (cc_->subscriptions == 0);
  };
  bool optimize_for_async = has_more;
  bool can_dispatch_sync = can_dispatch_sync_fn();
  QueueBackpressure& qbp = GetQueueBackpressure();
  ConnectionStats* conn_stats = &tl_facade_stats->conn_stats;
  if ((optimize_for_async || !can_dispatch_sync) &&
      qbp.IsPipelineBufferOverLimit(conn_stats->pipeline_queue_bytes, parsed_cmd_q_len_)) {
    conn_stats->pipeline_throttle_count++;
    LOG_EVERY_T(WARNING, 10) << "Pipeline buffer over limit."
                             << ", Thread pipeline_queue_bytes: "
                             << conn_stats->pipeline_queue_bytes
                             << ", Thread pipeline_queue_entries: "
                             << conn_stats->pipeline_queue_entries
                             << ", Connection parsed_cmd_q_bytes_: " << parsed_cmd_q_bytes_
                             << ", Connection parsed commands queue size: " << parsed_cmd_q_len_
                             << ", consider increasing pipeline_buffer_limit/pipeline_queue_limit";
    fb2::NoOpLock noop;
    qbp.pipeline_cnd.wait(noop, [this, &qbp, &can_dispatch_sync_fn] {
      // Wait until at least one is true:
      // 1) Connection is closing.
      // 2) Can dispatch synchronously.
      // 3) Not over limits (for an async dispatch).
      bool can_dispatch_sync = can_dispatch_sync_fn();
      if (can_dispatch_sync)
        return true;
      bool over_limits = qbp.IsPipelineBufferOverLimit(
          tl_facade_stats->conn_stats.pipeline_queue_bytes, parsed_cmd_q_len_);
      return !over_limits || cc_->conn_closing;
    });

    // prefer synchronous dispatching to save memory.
    optimize_for_async = false;
    last_interaction_ = time(nullptr);
  }

  // Avoid sync dispatch if we can interleave with an ongoing async dispatch.
  can_dispatch_sync = can_dispatch_sync_fn();

  // Dispatch async if we're handling a pipeline or if we can't dispatch sync.
  if (optimize_for_async || !can_dispatch_sync) {
    LaunchAsyncFiberIfNeeded();
    enqueue_cmd_cb();
  } else {
    ShrinkPipelinePool();  // Gradually release pipeline request pool.
    {
      ++local_stats_.cmds;
      cc_->sync_dispatch = true;
      invoke_cb();
      cc_->sync_dispatch = false;
    }
    last_interaction_ = time(nullptr);

    // We might have blocked the dispatch queue from processing, wake it up.
    if (HasPendingMessages())
      cnd_.notify_one();
  }
}

Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles, bool enqueue_only) {
  uint32_t consumed = 0;
  RespSrvParser::Result result = RespSrvParser::OK;

  auto dispatch_sync = [this] {
    service_->DispatchCommand(ParsedArgs{*parsed_cmd_}, parsed_cmd_,
                              facade::AsyncPreference::ONLY_SYNC);
  };
  auto dispatch_async = [this]() -> void {
    PipelineMessagePtr ptr = GetFromPoolOrCreate();
    // parsed_cmd_ holds the parsed arguments. Move it to 'cmd' to be enqueued and set it with a new
    // empty ParsedCommand for the next parse.
    auto* cmd = std::exchange(parsed_cmd_, ptr.release());
    EnqueueParsedCommand(cmd);
  };
  io::Bytes read_buffer = io_buf_.InputBuffer();
  // Keep track of total bytes consumed/parsed. The do/while{} loop below preempts,
  // and InputBuffer() size might change between preemption points. There is a corner case,
  // that ConsumeInput() will strip a portion of the request which makes the test_publish_stuck
  // test fail.
  // TODO(kostas): follow up on this
  size_t total_consumed = 0;
  do {
    DCHECK(parsed_cmd_);
    result = redis_parser_->Parse(read_buffer, &consumed, parsed_cmd_);
    request_consumed_bytes_ += consumed;
    total_consumed += consumed;
    if (result == RespSrvParser::OK) {
      DCHECK(!parsed_cmd_->empty());
      DVLOG(2) << "Got Args with first token " << parsed_cmd_->Front();

      if (io_req_size_hist)
        io_req_size_hist->Add(request_consumed_bytes_);
      request_consumed_bytes_ = 0;
      bool has_more = consumed < read_buffer.size();

      if (tl_traffic_logger.log_file && IsMain() /* log only on the main interface */) {
        LogTraffic(id_, has_more, *parsed_cmd_, service_->GetContextInfo(cc_.get()));
      }

      if (enqueue_only)
        dispatch_async();
      else
        DispatchSingle(has_more, dispatch_sync, dispatch_async);
    }
    if (result != RespSrvParser::OK && result != RespSrvParser::INPUT_PENDING) {
      // We do not expect that a replica sends an invalid command so we log if it happens.
      LOG_IF(WARNING, cntx()->replica_conn)
          << "Redis parser error: " << result << " during parse: " << io::View(read_buffer);
    }
    read_buffer.remove_prefix(consumed);

    // We must yield from time to time to allow other fibers to run.
    // Specifically, if a client sends a huge chunk of data resulting in a very long pipeline,
    // we want to yield to allow AsyncFiber to actually execute on the pending pipeline.
    if (ThisFiber::GetRunningTimeCycles() > max_busy_cycles) {
      GetLocalConnStats().num_read_yields++;
      ThisFiber::Yield();
    }
  } while (RespSrvParser::OK == result && read_buffer.size() > 0 && !reply_builder_->GetError());

  io_buf_.ConsumeInput(total_consumed);

  parser_error_ = result;
  if (result == RespSrvParser::OK)
    return OK;

  if (result == RespSrvParser::INPUT_PENDING) {
    DCHECK_EQ(read_buffer.size(), 0u);

    return NEED_MORE;
  }

  VLOG(1) << "Parser error " << result;

  return ERROR;
}

auto Connection::ParseLoop() -> ParserStatus {
  auto parse_func =
      protocol_ == Protocol::MEMCACHE ? &Connection::ParseMCBatch : &Connection::ParseRedisBatch;

  bool commands_parsed = false;
  do {
    commands_parsed = (this->*parse_func)();

    if (!ExecuteBatch())
      return ERROR;

    if (!ReplyBatch())
      return ERROR;
  } while (commands_parsed && io_buf_.InputLen() > 0);

  return commands_parsed ? OK : NEED_MORE;
}

void Connection::OnBreakCb(int32_t mask) {
  if (mask <= 0)
    return;  // we cancelled the poller, which means we do not need to break from anything.

  if (!cc_) {
    LOG(ERROR) << "Unexpected event " << mask;
    return;
  }

  DCHECK(reply_builder_) << "[" << id_ << "] " << phase_ << " " << migration_in_process_;

  VLOG(1) << "[" << id_ << "] Got event " << mask << " " << phase_ << " "
          << reply_builder_->IsSendActive() << " " << reply_builder_->GetError();

  cc_->conn_closing = true;
  BreakOnce(mask);
  cnd_.notify_one();  // Notify dispatch fiber.
}

void Connection::HandleMigrateRequest() {
  if (cc_->conn_closing || !migration_request_) {
    return;
  }
  ProactorBase* dest = migration_request_;

  if (async_fb_.IsJoinable()) {
    SendAsync({MigrationRequestMessage{}});
    async_fb_.Join();
  }

  // We don't support migrating with subscriptions as it would require moving thread local
  // handles. We can't check above, as the queue might have contained a subscribe request.

  if (cc_->subscriptions == 0) {
    // RegisterOnErrorCb might be called on POLLHUP and the join above is a preemption point.
    // So, it could be the case that after this fiber wakes up the connection might be closing.
    if (cc_->conn_closing) {
      return;
    }

    tl_facade_stats->conn_stats.num_migrations++;
    migration_request_ = nullptr;

    // We need to return early as the socket is closing and IoLoop will clean up.
    // The reason that this is true is because of the following DCHECK
    DCHECK(!async_fb_.IsJoinable());

    // which can never trigger since we Joined on the async_fb_ above and we are
    // atomic in respect to our proactor meaning that no other fiber will
    // launch the DispatchFiber.
    std::ignore = !this->Migrate(dest);
  }
}

io::Result<size_t> Connection::HandleRecvSocket() {
  phase_ = READ_SOCKET;
  auto& conn_stats = tl_facade_stats->conn_stats;

  io::MutableBytes append_buf = io_buf_.AppendBuffer();
  DCHECK(!append_buf.empty());
  ::io::Result<size_t> recv_sz = socket_->Recv(append_buf);
  last_interaction_ = time(nullptr);

  // In case the socket was closed orderly, we get 0 bytes read.
  if (recv_sz && *recv_sz) {
    size_t commit_sz = *recv_sz;
    io_buf_.CommitWrite(commit_sz);

    conn_stats.io_read_bytes += commit_sz;
    local_stats_.net_bytes_in += commit_sz;

    ++conn_stats.io_read_cnt;
    ++local_stats_.read_cnt;
  }
  return recv_sz;
}

variant<error_code, Connection::ParserStatus> Connection::IoLoop() {
  error_code ec;
  ParserStatus parse_status = OK;
  size_t max_iobfuf_len = GetFlag(FLAGS_max_client_iobuf_len);

  auto* peer = socket_.get();
  recv_buf_.res_len = 0;

  do {
    HandleMigrateRequest();
    auto recv_sz = HandleRecvSocket();
    if (!recv_sz) {
      LOG_IF(WARNING, cntx()->replica_conn) << "HandleRecvSocket() error: " << recv_sz.error();
      return recv_sz.error();
    }
    if (*recv_sz == 0) {
      break;
    }

    phase_ = PROCESS;
    bool is_iobuf_full = io_buf_.AppendLen() == 0;

    if (redis_parser_) {
      parse_status = ParseRedis(max_busy_read_cycles_cached);
    } else {
      DCHECK(memcache_parser_);
      parse_status = ParseLoop();
    }

    if (reply_builder_->GetError()) {
      return reply_builder_->GetError();
    }

    if (parse_status == NEED_MORE) {
      parse_status = OK;

      size_t capacity = io_buf_.Capacity();
      if (capacity < max_iobfuf_len) {
        size_t parser_hint = 0;
        if (redis_parser_)
          parser_hint = redis_parser_->parselen_hint();  // Could be done for MC as well.

        // If we got a partial request and we managed to parse its
        // length, make sure we have space to store it instead of
        // increasing space incrementally.
        // (Note: The buffer object is only working in power-of-2 sizes,
        // so there's no danger of accidental O(n^2) behavior.)
        if (parser_hint > capacity) {
          auto& conn_stats = GetLocalConnStats();
          UpdateIoBufCapacity(io_buf_, &conn_stats,
                              [&]() { io_buf_.Reserve(std::min(max_iobfuf_len, parser_hint)); });
        }

        // If we got a partial request because iobuf was full, grow it up to
        // a reasonable limit to save on Recv() calls.
        if (is_iobuf_full && capacity < max_iobfuf_len / 2) {
          auto& conn_stats = GetLocalConnStats();
          // Last io used most of the io_buf to the end.
          UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() {
            io_buf_.Reserve(capacity * 2);  // Valid growth range.
          });
        }

        if (io_buf_.AppendLen() == 0U) {
          // it can happen with memcached but not for RedisParser, because RedisParser fully
          // consumes the passed buffer
          LOG_EVERY_T(WARNING, 10)
              << "Maximum io_buf length reached, consider to increase max_client_iobuf_len flag";
        }
      }
    } else if (parse_status != OK) {
      break;
    }
  } while (peer->IsOpen());

  return parse_status;
}

bool Connection::ShouldEndAsyncFiber(const MessageHandle& msg) {
  if (!holds_alternative<MigrationRequestMessage>(msg.handle)) {
    return false;
  }

  if (!HasPendingMessages()) {
    // Migration requests means we should terminate this function (and allow the fiber to
    // join), so that we can re-launch the fiber in the new thread.
    // We intentionally return and not break in order to keep the connection open.
    return true;
  }

  // There shouldn't be any other migration requests in the queue, but it's worth checking
  // as otherwise it would lead to an endless loop.
  bool has_migration_req =
      any_of(dispatch_q_.begin(), dispatch_q_.end(), [](const MessageHandle& msg) {
        return holds_alternative<MigrationRequestMessage>(msg.handle);
      });
  if (!has_migration_req) {
    SendAsync({MigrationRequestMessage{}});
  }

  return false;
}

void Connection::SquashPipeline() {
  DCHECK_EQ(GetPendingMessageCount(), parsed_cmd_q_len_);
  DCHECK_EQ(reply_builder_->GetProtocol(), Protocol::REDIS);  // Only Redis is supported.
  unsigned pipeline_count = std::min<uint32_t>(parsed_cmd_q_len_, pipeline_squash_limit_cached);
  auto& conn_stats = tl_facade_stats->conn_stats;

  uint64_t start = CycleClock::Now();

  // Define a "Feeder" Lambda
  // This lambda advances a temporary pointer exec_cmd_ptr to feed the execution engine.
  // We do not modify parsed_to_execute_ yet, in case execution throws/fails.
  auto exec_cmd_ptr{parsed_to_execute_};
  auto get_next_fn = [&exec_cmd_ptr]() mutable -> ParsedArgs {
    DCHECK(exec_cmd_ptr);
    return ParsedArgs{*std::exchange(exec_cmd_ptr, exec_cmd_ptr->next)};
  };

  // async_dispatch is a guard to prevent concurrent writes into reply_builder_, hence
  // it must guard the Flush() as well.
  cc_->async_dispatch = true;

  DispatchManyResult result =
      service_->DispatchManyCommands(get_next_fn, pipeline_count, reply_builder_.get(), cc_.get());

  local_stats_.cmds += result.processed;
  last_interaction_ = time(nullptr);
  uint32_t num_dispatched_cmds = result.processed;
  uint64_t flush_start_cycle_cnt = CycleClock::Now();
  //
  // TODO: to investigate if always flushing will improve P99 latency because otherwise we
  // wait for the next batch to finish before fully flushing the current response.
  if (parsed_cmd_q_len_ == pipeline_count ||
      always_flush_pipeline_cached) {  // Flush if no new commands appeared
    reply_builder_->Flush();
    reply_builder_->SetBatchMode(false);  // in case the next dispatch is sync
  } else {
    conn_stats.skip_pipeline_flushing++;
  }

  cc_->async_dispatch = false;

  if (result.account_in_stats) {
    conn_stats.pipeline_dispatch_calls++;
    conn_stats.pipeline_dispatch_commands += num_dispatched_cmds;
    conn_stats.pipeline_dispatch_flush_usec +=
        CycleClock::ToUsec(CycleClock::Now() - flush_start_cycle_cnt);
  }

  auto* current{parsed_head_};
  for (size_t i = 0; (i < num_dispatched_cmds) && current; ++i) {
    auto* next{current->next};

    if (result.account_in_stats) {
      conn_stats.pipelined_wait_latency += CycleClock::ToUsec(start - current->parsed_cycle);
    }

    ReleaseParsedCommand(current, result.account_in_stats /* is_pipelined */);
    current = next;
  }
  parsed_head_ = current;
  if (!parsed_head_) {
    parsed_tail_ = nullptr;
  }
  parsed_to_execute_ = parsed_head_;

  // If interrupted due to pause, fall back to regular dispatch
  skip_next_squashing_ = (num_dispatched_cmds != pipeline_count);
}

void Connection::ClearPipelinedMessages() {
  AsyncOperations async_op{reply_builder_.get(), this};

  // First, clear dispatch queue
  // Recycle messages even from disconnecting client to keep properly track of memory stats
  // As well as to avoid pubsub backpressure leakage.
  for (auto& msg : dispatch_q_) {
    FiberAtomicGuard guard;  // don't suspend when concluding to avoid getting new messages
    if (msg.IsCheckPoint())
      visit(async_op, msg.handle);  // to not miss checkpoints
    UpdateDispatchStats(msg, false /* subtract */);
  }

  dispatch_q_.clear();

  // Second, drain the pending pipeline queue: release memory and update stats without executing
  // commands.
  while (parsed_head_) {
    auto* curr{parsed_head_};
    parsed_head_ = parsed_head_->next;

    // Wait for the in-flight async commands processing by consumer to finish before recycling.
    if (curr->IsDeferredReply() && !curr->CanReply()) {
      curr->Blocker()->Wait();
    }

    ReleaseParsedCommand(curr, false);
  }

  DCHECK_EQ(parsed_cmd_q_len_, 0u);
  DCHECK_EQ(parsed_cmd_q_bytes_, 0u);
  parsed_tail_ = nullptr;
  parsed_to_execute_ = nullptr;

  QueueBackpressure& qbp = GetQueueBackpressure();
  qbp.pipeline_cnd.notify_all();
  qbp.pubsub_ec.notifyAll();
}

string Connection::DebugInfo() const {
  string info = "{";

  absl::StrAppend(&info, "id=", id_, ", ");
  absl::StrAppend(&info, "phase=", phase_, ", ");
  if (cc_) {
    // In some rare cases cc_ can be null, see https://github.com/dragonflydb/dragonfly/pull/3873
    absl::StrAppend(&info, "dispatch(s/a)=", cc_->sync_dispatch, " ", cc_->async_dispatch, ", ");
    absl::StrAppend(&info, "closing=", cc_->conn_closing, ", ");
  }
  absl::StrAppend(&info, "df:joinable=", async_fb_.IsJoinable(), ", ");

  absl::StrAppend(&info, "dq:size=", dispatch_q_.size(), ", ");
  absl::StrAppend(&info, "pq:parsed_cmd_q_len=", parsed_cmd_q_len_, ", ");
  absl::StrAppend(&info, "pq:is_empty=", (parsed_head_ == nullptr), ", ");

  if (cc_) {
    absl::StrAppend(&info, "state=");
    if (cc_->paused)
      absl::StrAppend(&info, "p");
    if (cc_->blocked)
      absl::StrAppend(&info, "b");
  }
  time_t now = time(nullptr);
  absl::StrAppend(&info, " age=", now - creation_time_, " idle=", now - last_interaction_, "}");

  return info;
}

bool Connection::ProcessAdminMessage(MessageHandle* msg, AsyncOperations* async_op) {
  // Guard: Automatically subtract stats when this scope exits (via return or exception).
  absl::Cleanup stats_guard = [this, msg] { UpdateDispatchStats(*msg, false /* subtract */); };
  bool is_replying = msg->IsReplying();

  // Pre-execution Flush
  // If this is a non-replying control message (e.g. Migration) and it's the last item,
  // we MUST flush the buffer now. Otherwise, previous pipelined replies might wait
  // indefinitely or be lost if the fiber terminates.
  if (!HasPendingMessages() && !is_replying) {
    reply_builder_->Flush();
  }

  // Fiber Termination Check
  if (ShouldEndAsyncFiber(*msg)) {
    CHECK(!HasPendingMessages()) << DebugInfo();
    GetQueueBackpressure().pipeline_cnd.notify_all();
    return true;  // Signal to terminate AsyncFiber
  }

  // Execution
  auto replies_recorded_before = reply_builder_->RepliesRecorded();
  cc_->async_dispatch = true;
  std::visit(*async_op, msg->handle);
  cc_->async_dispatch = false;

  // Post-execution Flush
  // We force a flush If the message is supposed to reply (e.g. PubSub) but didn't write to the
  // buffer (e.g. subscription filter), and the queues are empty.
  if (!HasPendingMessages() && is_replying &&
      (replies_recorded_before == reply_builder_->RepliesRecorded())) {
    reply_builder_->Flush();
  }
  return false;
}

void Connection::ProcessPipelineCommand() {
  DCHECK(parsed_head_ && parsed_to_execute_) << DebugInfo();
  auto* cmd = parsed_to_execute_;
  parsed_to_execute_ = cmd->next;
  parsed_head_ = parsed_to_execute_;
  if (!parsed_head_) {
    parsed_tail_ = nullptr;
  }

  tl_facade_stats->conn_stats.pipelined_wait_latency +=
      CycleClock::ToUsec(CycleClock::Now() - cmd->parsed_cycle);

  cc_->async_dispatch = true;
  local_stats_.cmds++;
  service_->DispatchCommand(ParsedArgs{*cmd}, cmd, facade::AsyncPreference::ONLY_SYNC);
  last_interaction_ = time(nullptr);
  skip_next_squashing_ = false;
  cc_->async_dispatch = false;

  ReleaseParsedCommand(cmd, true);

  // If we drained the pipeline and no admin messages are waiting, flush.
  if (!HasPendingMessages()) {
    reply_builder_->Flush();
  }
}

// AsyncFiber acts as the consumer for all asynchronous connection tasks.
//
// It operates on a producer-consumer model where the InputLoop parses socket data
// and routes it into two distinct streams:
// 1. Data Path: Pipelined commands are queued in a Parsed Commands linked list
// 2. Control Path: Admin events (Migrations, Checkpoints, PubSub) use a deque (dispatch_q_)
//
// AsyncFiber drains these queues according to system prioritization, ensuring
// high-priority events are handled promptly while preventing priority inversion
// during thread migrations. For simple requests, the InputLoop may bypass this
// fiber and dispatch synchronously to minimize latency.
void Connection::AsyncFiber() {
  ThisFiber::SetName("AsyncFiber");

  AsyncOperations async_op{reply_builder_.get(), this};
  size_t squashing_threshold = GetFlag(FLAGS_pipeline_squash);
  uint64_t prev_epoch = fb2::FiberSwitchEpoch();
  fb2::NoOpLock noop_lk;
  QueueBackpressure& qbp = GetQueueBackpressure();
  auto& conn_stats = tl_facade_stats->conn_stats;
  uint32_t dispatch_q_cmd_processed = 0;
  uint32_t async_dispatch_quota = GetFlag(FLAGS_async_dispatch_quota);

  while (!reply_builder_->GetError()) {
    DCHECK_EQ(socket()->proactor(), ProactorBase::me());
    cnd_.wait(noop_lk, [this] {
      if (cc_->conn_closing)
        return true;

      // If we are currently executing a synchronous dispatch (e.g. inside IoLoop),
      // we must wait until it finishes to avoid race conditions.
      if (cc_->sync_dispatch)
        return false;

      // For Memcache, we ONLY wake up for Admin messages (dispatch_q_) as we process
      // parsed_head_  in the connection fiber. For RESP, we wake up for both queues.
      if (protocol_ == Protocol::MEMCACHE) {
        return !dispatch_q_.empty();
      }
      return HasPendingMessages();
    });

    if (cc_->conn_closing)
      break;

    // We really want to have batching in the builder if possible. This is especially
    // critical in situations where Nagle's algorithm can introduce unwanted high
    // latencies. However we can only batch if we're sure that there are more commands
    // on the way that will trigger a flush. To know if there are, we sometimes yield before
    // executing the last command in the queue and let the producer fiber push more commands if it
    // wants to.
    // As an optimization, we only yield if the fiber was not suspended since the last dispatch.
    uint64_t cur_epoch = fb2::FiberSwitchEpoch();
    if ((GetPendingMessageCount() == 1) && (cur_epoch == prev_epoch)) {
      if (pipeline_wait_batch_usec > 0) {
        ThisFiber::SleepFor(chrono::microseconds(pipeline_wait_batch_usec));
      } else {
        ThisFiber::Yield();
      }
      DVLOG(2) << "After yielding to producer, parsed_cmd_q_len_=" << parsed_cmd_q_len_
               << " dispatch_q size=" << dispatch_q_.size();
      if (cc_->conn_closing)
        break;
    }
    prev_epoch = cur_epoch;

    reply_builder_->SetBatchMode(GetPendingMessageCount() > 1);

    bool subscriber_over_limit =
        conn_stats.dispatch_queue_subscriber_bytes >= qbp.publish_buffer_limit;

    // The below if/else conditionally choose between 3 message processing policies:
    // 1. Pipeline squashing
    // 2. Process pipeline queue
    // 3. Process admin queue
    //
    // Special case: if the dispatch queue accumulated a big number of commands,
    // we can try to squash them
    // It is only enabled if the threshold is reached and the whole dispatch queue
    // consists only of commands (no pubsub or monitor messages)
    bool squashing_enabled = squashing_threshold > 0;
    bool threshold_reached = parsed_cmd_q_len_ > squashing_threshold;
    if (squashing_enabled && threshold_reached && dispatch_q_.empty() && !skip_next_squashing_ &&
        !IsReplySizeOverLimit()) {  // 1. Pipeline squashing
      SquashPipeline();
      dispatch_q_cmd_processed = 0;
    } else {
      MessageHandle msg;

      // If the front message is a Migration Request, but we still have pipeline data
      // (parsed_head_), we must block the migration and process the pipeline messages first.
      bool is_migration_req =
          !dispatch_q_.empty() &&
          std::holds_alternative<MigrationRequestMessage>(dispatch_q_.front().handle);

      // If the quota is reached but the pipeline appears empty, we must yield to the IoLoop
      // (producer). This allows the discovery and parsing of commands potentially sitting in the
      // TCP buffer. Without this yield, AsyncFiber would monopolize the CPU, starving the IoLoop
      // and remaining blind to pending pipeline data.
      bool quota_reached =
          (async_dispatch_quota > 0) && (dispatch_q_cmd_processed >= async_dispatch_quota);
      if (quota_reached && (parsed_head_ == nullptr)) {
        ThisFiber::Yield();

        // If it is STILL empty after IoLoop got a chance to run, the client hasn't sent anything.
        // Reset the counter so we don't yield on every single loop.
        if (parsed_head_ == nullptr) {
          dispatch_q_cmd_processed = 0;
        }
      }

      // We prioritize pipeline execution over the admin queue in two distinct cases (Pipeline queue
      // must be non-empty for both cases):
      // 1. A migration is requested (Redis only), but we must drain the existing
      // pipeline first.
      // 2.  The dispatch quota was reached, forcing a pipeline execution to prevent
      // starvation.
      bool prefer_pipeline_execution = false;
      if (parsed_head_ != nullptr) {
        prefer_pipeline_execution =
            quota_reached || (is_migration_req && (protocol_ == Protocol::REDIS));
      }
      if (dispatch_q_.empty() || prefer_pipeline_execution) {  // 2. Process pipeline Queue
        VLOG_IF(1, prefer_pipeline_execution)
            << "[" << id_ << "] Preferring pipeline execution over admin queue. "
            << "Migration requested: " << is_migration_req
            << ", dispatch quota reached: " << quota_reached
            << ", async_dispatch_quota: " << async_dispatch_quota
            << ", dispatch_q_cmd_processed: " << dispatch_q_cmd_processed;
        ProcessPipelineCommand();
        dispatch_q_cmd_processed = 0;
      } else {  // 3. Process admin Queue
        msg = std::move(dispatch_q_.front());
        dispatch_q_.pop_front();
        dispatch_q_cmd_processed++;

        // Execute and check if we need to terminate the fiber
        if (ProcessAdminMessage(&msg, &async_op)) {
          return;  // don't set conn closing flag
        }
      }
    }

    // Notify waiters if backpressure constraints are relieved.
    // 1. Global memory (bytes) is under limit -> Wakes up neighbors on this thread.
    // 2. Local queue (length) is under limit -> Wakes up this connection's producer.
    if (qbp.IsPipelineBufferUnderLimit(conn_stats.pipeline_queue_bytes, parsed_cmd_q_len_) ||
        !HasPendingMessages()) {
      qbp.pipeline_cnd.notify_all();
    }

    if (subscriber_over_limit &&
        conn_stats.dispatch_queue_subscriber_bytes < qbp.publish_buffer_limit)
      qbp.pubsub_ec.notify();
  }

  DCHECK(cc_->conn_closing || reply_builder_->GetError());

  cc_->conn_closing = true;
  qbp.pipeline_cnd.notify_all();

  // If shutdown was requested, we need to break the receive call in case the i/o fiber
  // is blocked there. With io loop v2, we can have a different mechanism to break from recv flow.
  if (request_shutdown_) {
    ShutdownSelfBlocking();
  }
}

void Connection::ShrinkPipelinePool() {
  if (pipeline_req_pool_.empty())
    return;
  auto& conn_stats = tl_facade_stats->conn_stats;

  if (tl_pipe_cache_sz_tracker.CheckAndUpdateWatermark(pipeline_req_pool_.size())) {
    conn_stats.pipeline_cmd_cache_bytes -= UsedMemoryInternal(*pipeline_req_pool_.back());
    pipeline_req_pool_.pop_back();
  }
}

Connection::PipelineMessagePtr Connection::GetFromPoolOrCreate() {
  if (pipeline_req_pool_.empty())
    return PipelineMessagePtr{CreateParsedCommand()};
  auto& conn_stats = tl_facade_stats->conn_stats;

  auto ptr = std::move(pipeline_req_pool_.back());
  pipeline_req_pool_.pop_back();

  conn_stats.pipeline_cmd_cache_bytes -= UsedMemoryInternal(*ptr);
  ptr->ResetForReuse();

  ptr->Init(reply_builder_.get(), cc_.get());
  ptr->ConfigureMCExtension(protocol_ == Protocol::MEMCACHE);

  return ptr;
}

void Connection::ShutdownSelfBlocking() {
  util::Connection::Shutdown();
}

bool Connection::Migrate(util::fb2::ProactorBase* dest) {
  // Migrate is used only by replication, so it doesn't have properties of full-fledged
  // connections
  CHECK(!cc_->async_dispatch);
  CHECK_EQ(cc_->subscriptions, 0);  // are bound to thread local caches
  CHECK_EQ(self_.use_count(), 1u);  // references cache our thread and backpressure
                                    //
  if (ioloop_v2_ && socket_ && socket_->IsOpen()) {
    socket_->ResetOnRecvHook();
  }

  // Migrate is only used by DFLY Thread and Flow command which both check against
  // the result of Migration and handle it explicitly in their flows so this can act
  // as a weak if condition instead of a crash prone CHECK.
  if (async_fb_.IsJoinable() || cc_->conn_closing) {
    return false;
  }

  listener()->Migrate(this, dest);

  // After we migrate, it could be the case the connection was shut down. We should
  // act accordingly.
  if (!socket()->IsOpen()) {
    return false;
  }

  return true;
}

Connection::WeakRef Connection::Borrow() {
  DCHECK(self_);

  return {self_, unsigned(socket_->proactor()->GetPoolIndex()), id_};
}

void Connection::ShutdownThreadLocal() {
  pipeline_req_pool_.clear();
}

bool Connection::IsCurrentlyDispatching() const {
  if (!cc_)
    return false;

  return cc_->async_dispatch || cc_->sync_dispatch;
}

void Connection::SendPubMessageAsync(PubMessage msg) {
  SendAsync({make_unique<PubMessage>(std::move(msg))});
}

void Connection::SendMonitorMessageAsync(string msg) {
  SendAsync({MonitorMessage{std::move(msg)}});
}

void Connection::SendCheckpoint(fb2::BlockingCounter bc, bool ignore_paused, bool ignore_blocked) {
  if (!IsCurrentlyDispatching())
    return;

  if (cc_->paused && ignore_paused)
    return;

  if (cc_->blocked && ignore_blocked)
    return;

  VLOG(2) << "Sent checkpoint to " << DebugInfo();

  bc->Add(1);
  SendAsync({CheckpointMessage{bc}});
}

void Connection::SendInvalidationMessageAsync(InvalidationMessage msg) {
  SendAsync({std::move(msg)});
}

void Connection::LaunchAsyncFiberIfNeeded() {
  if (!async_fb_.IsJoinable() && !migration_in_process_) {
    VLOG(1) << "[" << id_ << "] LaunchAsyncFiberIfNeeded ";
    async_fb_ = fb2::Fiber(fb2::Launch::post, "connection_dispatch", [this]() { AsyncFiber(); });
  }
}

// SendAsync is now strictly for the Control Path (Admin/Events).
// Pipeline commands are handled separately via EnqueueParsedCommand to maintain
// clean separation between Data and Control paths.
// Note: Should never block - the callers may run in as a brief callback.
void Connection::SendAsync(MessageHandle msg) {
  DCHECK(cc_);
  DCHECK(listener());
  DCHECK_EQ(ProactorBase::me(), socket_->proactor());
  auto& conn_stats = tl_facade_stats->conn_stats;

  // "Closing" connections might be still processing commands, as we don't interrupt them.
  // So we still want to deliver control messages to them (like checkpoints) if
  // async_fb_ is running (joinable).
  if (cc_->conn_closing && (!msg.IsCheckPoint() || !async_fb_.IsJoinable()))
    return;

  // If we launch while closing, it won't be awaited. Control messages will be processed on cleanup.
  if (!cc_->conn_closing) {
    LaunchAsyncFiberIfNeeded();
  }
  DCHECK_NE(phase_, PRECLOSE);  // No more messages are processed after this point

  // Close MONITOR connection if we overflow limits.
  // We must check the Thread-Global memory usage of BOTH:
  // 1. The Control Path (dispatch_queue_bytes)
  // 2. The Data Path (pipeline_queue_bytes)
  if (msg.IsMonitor()) {
    if (GetQueueBackpressure().IsPipelineBufferOverLimit(
            conn_stats.dispatch_queue_bytes + conn_stats.pipeline_queue_bytes,
            GetPendingMessageCount())) {
      cc_->conn_closing = true;
      request_shutdown_ = true;
      // We don't shutdown here. The reason is that TLS socket is preemptive
      // and SendAsync is atomic.
      cnd_.notify_one();
      return;
    }
  }

  local_stats_.dispatch_entries_added++;
  UpdateDispatchStats(msg, true /* add */);
  msg.dispatch_cycle = CycleClock::Now();

  // Admin Queueing Rules:
  // Checkpoints go to the front (after existing checkpoints), while all others to the back.
  bool had_pending_messages = HasPendingMessages();  // check the queues before enqueuing
  if (msg.IsCheckPoint()) {
    auto it = dispatch_q_.begin();
    while (it < dispatch_q_.end() && it->IsCheckPoint())
      ++it;
    dispatch_q_.insert(it, std::move(msg));
  } else {
    dispatch_q_.push_back(std::move(msg));
  }

  // Control Path Notification:
  // We need to wake up the AsyncFiber only if it is currently sleeping.
  // 1. Memcache: Sleeps if dispatch_q_ is empty. Must notify on 0->1 transition.
  // 2. Redis: Sleeps if BOTH queues are empty. If pipeline has items, it's already awake.
  bool should_notify = false;
  if (protocol_ == Protocol::REDIS) {
    if (!had_pending_messages) {
      should_notify = true;
    }
  } else {  // MEMCACHE
    should_notify = (dispatch_q_.size() == 1);
  }

  if (should_notify && !cc_->sync_dispatch) {
    cnd_.notify_one();
  }
}

void Connection::UpdateDispatchStats(const MessageHandle& msg, bool add) {
  size_t mem = msg.UsedMemory();
  auto& qbp = GetQueueBackpressure();
  auto& conn_stats = tl_facade_stats->conn_stats;
  if (add) {
    conn_stats.dispatch_queue_entries++;
    conn_stats.dispatch_queue_bytes += mem;
    dispatch_q_bytes_ += mem;
    if (msg.IsPubMsg()) {
      qbp.subscriber_bytes.fetch_add(mem, std::memory_order_relaxed);
      conn_stats.dispatch_queue_subscriber_bytes += mem;
      dispatch_q_subscriber_bytes_ += mem;
    }
  } else {
    DCHECK_GT(conn_stats.dispatch_queue_entries, 0u);
    DCHECK_GE(conn_stats.dispatch_queue_bytes, mem);
    conn_stats.dispatch_queue_entries--;
    conn_stats.dispatch_queue_bytes -= mem;
    dispatch_q_bytes_ -= mem;
    if (msg.IsPubMsg()) {
      DCHECK_GE(conn_stats.dispatch_queue_subscriber_bytes, mem);
      DCHECK_GE(qbp.subscriber_bytes.load(std::memory_order_relaxed), mem);
      qbp.subscriber_bytes.fetch_sub(mem, std::memory_order_relaxed);
      conn_stats.dispatch_queue_subscriber_bytes -= mem;
      dispatch_q_subscriber_bytes_ -= mem;
    }
  }
}

std::string Connection::LocalBindStr() const {
  if (socket_->IsUDS())
    return "unix-domain-socket";

  auto le = socket_->LocalEndpoint();
  return absl::StrCat(le.address().to_string(), ":", le.port());
}

std::string Connection::LocalBindAddress() const {
  if (socket_->IsUDS())
    return "unix-domain-socket";

  auto le = socket_->LocalEndpoint();
  return le.address().to_string();
}

std::string Connection::RemoteEndpointStr() const {
  if (socket_->IsUDS())
    return "unix-domain-socket";

  auto re = socket_->RemoteEndpoint();
  return absl::StrCat(re.address().to_string(), ":", re.port());
}

std::string Connection::RemoteEndpointAddress() const {
  if (socket_->IsUDS())
    return "unix-domain-socket";

  auto re = socket_->RemoteEndpoint();
  return re.address().to_string();
}

facade::ConnectionContext* Connection::cntx() {
  return cc_.get();
}

void Connection::RequestAsyncMigration(util::fb2::ProactorBase* dest, bool force) {
  if ((!force && !migration_enabled_) || cc_ == nullptr) {
    return;
  }

  // Connections can migrate at most once.
  migration_enabled_ = false;
  migration_request_ = dest;
}

void Connection::StartTrafficLogging(string_view path) {
  OpenTrafficLogger(path);
}

void Connection::StopTrafficLogging() {
  lock_guard lk(tl_traffic_logger.mutex);
  tl_traffic_logger.ResetLocked();
}

bool Connection::IsHttp() const {
  return is_http_;
}

size_t Connection::GetMemoryUsage() const {
  size_t mem = sizeof(*this) + cmn::HeapSize(name_) + cmn::HeapSize(memcache_parser_) +
               cmn::HeapSize(redis_parser_) + cmn::HeapSize(cc_) + cmn::HeapSize(reply_builder_);

  // parsed_cmd_ can be null when dispatching a command, or for http connections.
  if (parsed_cmd_) {
    mem += UsedMemoryInternal(*parsed_cmd_);
  }

  // We add a hardcoded 9k value to accommodate for the part of the Fiber stack that is in use.
  // The allocated stack is actually larger (~130k), but only a small fraction of that (9k
  // according to our checks) is actually part of the RSS.
  mem += 9'000;

  return mem;
}

void Connection::IncreaseConnStats() {
  DCHECK(tl_facade_stats);
  auto& conn_stats = tl_facade_stats->conn_stats;
  if (IsMainOrMemcache())
    ++conn_stats.num_conns_main;
  else
    ++conn_stats.num_conns_other;
  conn_stats.read_buf_capacity += io_buf_.Capacity();

  conn_stats.dispatch_queue_entries += dispatch_q_.size();
  conn_stats.dispatch_queue_bytes += dispatch_q_bytes_;
  conn_stats.pipeline_queue_entries += parsed_cmd_q_len_;
  conn_stats.pipeline_queue_bytes += parsed_cmd_q_bytes_;
  if (dispatch_q_subscriber_bytes_ > 0) {
    auto& qbp = GetQueueBackpressure();
    conn_stats.dispatch_queue_subscriber_bytes += dispatch_q_subscriber_bytes_;
    qbp.subscriber_bytes.fetch_add(dispatch_q_subscriber_bytes_, std::memory_order_relaxed);
  }
}

void Connection::DecreaseConnStats() {
  DCHECK(tl_facade_stats);
  auto& conn_stats = tl_facade_stats->conn_stats;
  if (IsMainOrMemcache()) {
    DCHECK_GT(conn_stats.num_conns_main, 0u);
    --conn_stats.num_conns_main;
  } else {
    DCHECK_GT(conn_stats.num_conns_other, 0u);
    --conn_stats.num_conns_other;
  }
  DCHECK_GE(conn_stats.read_buf_capacity, io_buf_.Capacity());
  conn_stats.read_buf_capacity -= io_buf_.Capacity();

  DCHECK_GE(conn_stats.dispatch_queue_entries, dispatch_q_.size());
  conn_stats.dispatch_queue_entries -= dispatch_q_.size();
  DCHECK_GE(conn_stats.dispatch_queue_bytes, dispatch_q_bytes_);
  conn_stats.dispatch_queue_bytes -= dispatch_q_bytes_;
  if (dispatch_q_subscriber_bytes_ > 0) {
    auto& qbp = GetQueueBackpressure();
    DCHECK_GE(conn_stats.dispatch_queue_subscriber_bytes, dispatch_q_subscriber_bytes_);
    conn_stats.dispatch_queue_subscriber_bytes -= dispatch_q_subscriber_bytes_;
    DCHECK_GE(qbp.subscriber_bytes.load(std::memory_order_relaxed), dispatch_q_subscriber_bytes_);
    qbp.subscriber_bytes.fetch_sub(dispatch_q_subscriber_bytes_, std::memory_order_relaxed);
  }
  DCHECK_GE(conn_stats.pipeline_queue_entries, parsed_cmd_q_len_);
  conn_stats.pipeline_queue_entries -= parsed_cmd_q_len_;
  DCHECK_GE(conn_stats.pipeline_queue_bytes, parsed_cmd_q_bytes_);
  conn_stats.pipeline_queue_bytes -= parsed_cmd_q_bytes_;
}

void Connection::BreakOnce(uint32_t ev_mask) {
  if (breaker_cb_) {
    DVLOG(1) << "[" << id_ << "] Connection::breaker_cb_ " << ev_mask;
    auto fun = std::move(breaker_cb_);
    DCHECK(!breaker_cb_);
    fun(ev_mask);
  }
}

bool Connection::IsReplySizeOverLimit() const {
  std::atomic<size_t>& reply_sz = tl_facade_stats->reply_stats.squashing_current_reply_size;
  size_t current = reply_sz.load(std::memory_order_acquire);
  const bool over_limit = reply_size_limit != 0 && current > 0 && current > reply_size_limit;
  if (over_limit) {
    LOG_EVERY_T(INFO, 10) << "Commands squashing current reply size is overlimit: " << current
                          << "/" << reply_size_limit
                          << ". Falling back to single command dispatch (instead of squashing)";
    // Used by testing. Should not be used in production, therefore debug log level 5.
    DVLOG(5) << "Commands squashing current reply size is overlimit: " << current << "/"
             << reply_size_limit
             << ". Falling back to single command dispatch (instead of squashing)";
  }
  return over_limit;
}

bool Connection::ParseRedisBatch() {
  return ParseRedis(max_busy_read_cycles_cached, true) == ParserStatus::OK;
}

bool Connection::ParseMCBatch() {
  CHECK(io_buf_.InputLen() > 0);

  do {
    if (parsed_cmd_ == nullptr) {
      // Happens with pipelined commands after the first one.
      PipelineMessagePtr ptr = GetFromPoolOrCreate();
      parsed_cmd_ = ptr.release();
    }
    uint32_t consumed = 0;
    memcache_parser_->set_last_unix_time(time(nullptr));
    MemcacheParser::Result result = memcache_parser_->Parse(io::View(io_buf_.InputBuffer()),
                                                            &consumed, parsed_cmd_->mc_command());
    io_buf_.ConsumeInput(consumed);

    DVLOG(2) << "mc_result " << unsigned(result) << " consumed: " << consumed << " type "
             << unsigned(parsed_cmd_->mc_command()->type);
    if (result == MemcacheParser::INPUT_PENDING)
      return false;

    // We push the command to the parsed queue even in case of parse errors,
    // so that we can reply in order.
    EnqueueParsedCommand(parsed_cmd_);
    parsed_cmd_ = nullptr;  // ownership transferred.

    if (result != MemcacheParser::OK) {
      // We can not just reply directly to parse error, as we may have pipelined commands before.
      // Fill the reply_payload into parsed_tail_ with the error and continue parsing.
      memcache_parser_->Reset();
      // TODO(vlad): Use Proper SendError calls instead of SendSimpleString and error building
      auto client_error = [](string_view msg) { return absl::StrCat("CLIENT_ERROR ", msg); };

      parsed_tail_->SetDeferredReply();
      switch (result) {
        case MemcacheParser::UNKNOWN_CMD:
          parsed_tail_->SendSimpleString("ERROR");
          break;
        case MemcacheParser::PARSE_ERROR:
          parsed_tail_->SendSimpleString(client_error("bad data chunk"));
          break;
        case MemcacheParser::BAD_DELTA:
          parsed_tail_->SendSimpleString(client_error("invalid numeric delta argument"));
          break;
        default:
          parsed_tail_->SendSimpleString(client_error("bad command line format"));
          break;
      }
    }
  } while (parsed_cmd_q_len_ < 128 && io_buf_.InputLen() > 0);
  return true;
}

bool Connection::ExecuteBatch() {
  auto& conn_stats = tl_facade_stats->conn_stats;
  auto advance_head = [this]() -> ParsedCommand* {
    auto* cmd = parsed_head_;
    parsed_head_ = cmd->next;
    ReleaseParsedCommand(cmd, parsed_head_ != nullptr /* is_pipelined */);
    return parsed_head_;
  };

  auto dispatch = protocol_ == Protocol::MEMCACHE ? &ServiceInterface::DispatchMC
                                                  : &ServiceInterface::DispatchCommandSimple;

  // Execute sequentially all parsed commands.
  for (auto& cmd = parsed_to_execute_; cmd != nullptr;) {
    if (reply_builder_->GetError())
      return false;
    bool is_head = cmd == parsed_head_;

    // parser errors are stored as deferred replies
    if (cmd->IsDeferredReply() && cmd->CanReply()) {
      if (is_head) {
        cmd->SendReply();
        cmd = advance_head();
      } else {
        cmd = cmd->next;
      }
      continue;
    }

    // We must continue with async execution if we already have executing commands
    auto mode = is_head ? AsyncPreference::PREFER_ASYNC : AsyncPreference::ONLY_ASYNC;

    if (!ioloop_v2_)  // only v2 loop supports any async commands so far
      mode = AsyncPreference::ONLY_SYNC;

    auto dispatch_res = (service_->*dispatch)(cmd, mode);

    // Enforce the pipeline invariant between the IO loop (producer) and AsyncFiber (consumer).
    // To prevent stream corruption, the command state must satisfy ONE of these rules:
    // 1. It is the head command (safely writes to the socket directly).
    // 2. It did not stall the pipeline (dispatch_res != WOULD_BLOCK) and therefore
    //    must have buffered its reply locally (is_deferred == true).
    // 3. It stalled the pipeline because it requires synchronous execution
    //    (dispatch_res == WOULD_BLOCK) and therefore must NOT have buffered
    //    a reply (is_deferred == false).
    bool is_deferred = cmd->IsDeferredReply();
    DCHECK(is_head || (is_deferred == (dispatch_res != DispatchResult::WOULD_BLOCK)))
        << "Pipeline contract breach! Invalid state for non-head command. "
        << "DispatchResult: " << static_cast<int>(dispatch_res) << ", IsDeferred: " << is_deferred
        << ", Command Type: " << cmd->mc_command()->type;

    if (dispatch_res == DispatchResult::WOULD_BLOCK)
      break;  // Sync command. Wait for current async commands to finish

    conn_stats.pipeline_dispatch_commands++;
    if (is_head)
      conn_stats.pipeline_dispatch_calls++;

    if (cmd->IsDeferredReply()) {
      cmd = cmd->next;
    } else {
      DCHECK(is_head);       // only head can execute sync
      cmd = advance_head();  // advance it
    }
  }

  if (parsed_head_ == nullptr)
    parsed_tail_ = nullptr;
  return true;
}

bool Connection::ReplyBatch() {
  reply_builder_->SetBatchMode(true);
  for (auto& cmd = parsed_head_; cmd != parsed_to_execute_;) {
    if (!cmd->CanReply())
      break;

    current_wait_.reset();  // we must free waiter before proceeding with other commands
    cmd->SendReply();

    auto* prev = exchange(cmd, cmd->next);
    ReleaseParsedCommand(prev, cmd != parsed_to_execute_ /* is_pipelined */);
    if (reply_builder_->GetError())
      return false;
  }

  if (parsed_head_ == nullptr)
    parsed_tail_ = nullptr;

  reply_builder_->SetBatchMode(false);
  reply_builder_->Flush();
  return !reply_builder_->GetError();
}

ParsedCommand* Connection::CreateParsedCommand() {
  auto* res = service_->AllocateParsedCommand();
  res->Init(reply_builder_.get(), cc_.get());
  res->ConfigureMCExtension(protocol_ == Protocol::MEMCACHE);
  return res;
}

void Connection::EnqueueParsedCommand(ParsedCommand* cmd) {
  DCHECK(cmd);
  cmd->next = nullptr;
  auto& conn_stats = tl_facade_stats->conn_stats;

  cmd->parsed_cycle = base::CycleClock::Now();

  if (parsed_head_ == nullptr) {
    parsed_head_ = cmd;
    parsed_to_execute_ = cmd;
  } else {
    parsed_tail_->next = cmd;
    if (parsed_to_execute_ == nullptr) {
      // we've executed all the parsed commands so far.
      parsed_to_execute_ = cmd;
    }
  }
  parsed_tail_ = cmd;

  size_t used_mem = cmd->UsedMemory();
  parsed_cmd_q_len_++;
  parsed_cmd_q_bytes_ += used_mem;
  local_stats_.dispatch_entries_added++;
  conn_stats.pipeline_queue_entries++;
  conn_stats.pipeline_queue_bytes += used_mem;

  // AsyncFiber for Memcache only wakes up on dispatch_q_, notify only redis as this is the parse
  // commands queue.
  if ((!cc_->sync_dispatch) && (protocol_ == Protocol::REDIS)) {
    cnd_.notify_one();
  }
}

void Connection::ReleaseParsedCommand(ParsedCommand* cmd, bool is_pipelined) {
  size_t used_mem = cmd->UsedMemory();
  auto& conn_stats = tl_facade_stats->conn_stats;

  DCHECK_GT(parsed_cmd_q_len_, 0u);
  DCHECK_GE(parsed_cmd_q_bytes_, used_mem);
  DCHECK_GT(conn_stats.pipeline_queue_entries, 0u);
  DCHECK_GE(conn_stats.pipeline_queue_bytes, used_mem);
  parsed_cmd_q_len_--;
  parsed_cmd_q_bytes_ -= used_mem;

  conn_stats.pipeline_queue_entries--;
  conn_stats.pipeline_queue_bytes -= used_mem;

  if (is_pipelined) {
    conn_stats.pipelined_cmd_cnt++;
    uint64_t latency_usec = CycleClock::ToUsec(CycleClock::Now() - cmd->parsed_cycle);
    conn_stats.pipelined_cmd_latency += latency_usec;
    conn_stats.pipelined_latency_hist.Add(latency_usec);
    // Decay the histogram every kPipelineLatencyDecayPeriod samples to
    // approximate a moving-window distribution; older observations contribute
    // half as much after each decay period.
    constexpr uint64_t kPipelineLatencyDecayPeriod = 1 << 14;  // 16384
    if ((conn_stats.pipelined_latency_hist.count() & (kPipelineLatencyDecayPeriod - 1)) == 0) {
      conn_stats.pipelined_latency_hist.Decay();
    }
  }

  if (parsed_cmd_ == nullptr) {
    parsed_cmd_ = cmd;
    parsed_cmd_->ResetForReuse();
  } else {
    // If we are over the limit, destroy the command instead of caching it.
    size_t cmd_mem = UsedMemoryInternal(*cmd);
    QueueBackpressure& qbp = GetQueueBackpressure();
    if (conn_stats.pipeline_cmd_cache_bytes + cmd_mem <= qbp.pipeline_cache_limit) {
      conn_stats.pipeline_cmd_cache_bytes += cmd_mem;
      pipeline_req_pool_.emplace_back(cmd);
    } else {
      delete cmd;
    }
  }
}

void Connection::DestroyParsedQueue() {
  while (parsed_head_ != nullptr) {
    auto* cmd = parsed_head_;
    parsed_head_ = cmd->next;

    // Being able to drop an in-flight transaction would require it keeping no pointers
    // at all to any context data - too costly for now! (maybe let it own the arguments?)
    if (cmd->IsDeferredReply() && !cmd->CanReply())
      cmd->Blocker()->Wait();  // explicitly wait for it to finish
    ReleaseParsedCommand(cmd, false);
  }

  parsed_tail_ = nullptr;
  CHECK_EQ(parsed_cmd_q_len_, 0u);
  CHECK_EQ(parsed_cmd_q_bytes_, 0u);
  delete parsed_cmd_;
  parsed_cmd_ = nullptr;
}

void Connection::UpdateFromFlags() {
  unsigned tid = fb2::ProactorBase::me()->GetPoolIndex();
  thread_queue_backpressure[tid].pipeline_queue_max_len = GetFlag(FLAGS_pipeline_queue_limit);
  thread_queue_backpressure[tid].pipeline_buffer_limit = GetFlag(FLAGS_pipeline_buffer_limit);
  thread_queue_backpressure[tid].pipeline_cnd.notify_all();

  max_busy_read_cycles_cached = base::CycleClock::FromUsec(GetFlag(FLAGS_max_busy_read_usec));
  always_flush_pipeline_cached = GetFlag(FLAGS_always_flush_pipeline);
  pipeline_squash_limit_cached = GetFlag(FLAGS_pipeline_squash_limit);
  pipeline_wait_batch_usec = GetFlag(FLAGS_pipeline_wait_batch_usec);
}

std::vector<std::string> Connection::GetMutableFlagNames() {
  return base::GetFlagNames(FLAGS_pipeline_queue_limit, FLAGS_pipeline_buffer_limit,
                            FLAGS_max_busy_read_usec, FLAGS_always_flush_pipeline,
                            FLAGS_pipeline_squash_limit, FLAGS_pipeline_wait_batch_usec);
}

void Connection::GetRequestSizeHistogramThreadLocal(std::string* hist) {
  if (io_req_size_hist)
    *hist = io_req_size_hist->ToString();
}

void Connection::TrackRequestSize(bool enable) {
  if (enable && !io_req_size_hist) {
    io_req_size_hist = new base::Histogram;
  } else if (!enable && io_req_size_hist) {
    delete io_req_size_hist;
    io_req_size_hist = nullptr;
  }
}

void Connection::EnsureMemoryBudget(unsigned tid) {
  thread_queue_backpressure[tid].EnsureBelowLimit();
}

ConnectionRef::ConnectionRef(const std::shared_ptr<Connection>& ptr, unsigned thread_id,
                             uint32_t client_id)
    : ptr_{ptr}, last_known_thread_id_{thread_id}, client_id_{client_id} {
}

Connection* ConnectionRef::Get() const {
  auto sptr = ptr_.lock();

  //  The connection can only be deleted on this thread, so
  //  this pointer is valid until the next suspension.
  //  Note: keeping a shared_ptr doesn't prolong the lifetime because
  //  it doesn't manage the underlying connection. See definition of `self_`.
  return sptr.get();
}

bool Connection::WeakRef::IsExpired() const {
  return ptr_.expired();
}

uint32_t Connection::WeakRef::GetClientId() const {
  return client_id_;
}

bool ConnectionRef::operator<(const ConnectionRef& other) const {
  return client_id_ < other.client_id_;
}

bool ConnectionRef::operator==(const ConnectionRef& other) const {
  return client_id_ == other.client_id_;
}

void Connection::DoReadOnRecv(const util::FiberSocketBase::RecvNotification& n) {
  if (std::holds_alternative<std::error_code>(n.read_result)) {
    io_ec_ = std::get<std::error_code>(n.read_result);
    return;
  }

  using RecvNoti = util::FiberSocketBase::RecvNotification::RecvCompletion;
  if (std::holds_alternative<RecvNoti>(n.read_result)) {
    if (!std::get<RecvNoti>(n.read_result)) {
      io_ec_ = make_error_code(errc::connection_aborted);
      return;
    }

    if (io_buf_.AppendLen() == 0) {
      // We will regrow in IoLoopV2
      return;
    }

    io::MutableBytes buf = io_buf_.AppendBuffer();
    io::Result<size_t> res = socket_->TryRecv(buf);

    if (res) {
      if (*res > 0) {
        // A recv call can return fewer bytes than requested even if the
        // socket buffer actually contains enough data to satisfy the full request.
        // TODO maybe worth looping here and try another recv call until it fails
        // with EAGAIN or EWOULDBLOCK. The problem there is that we need to handle
        // resizing if AppendBuffer is zero.
        io_buf_.CommitWrite(*res);
        return;
      }
      // *res == 0
      io_ec_ = make_error_code(errc::connection_aborted);
      return;
    }

    // error path (!res)
    auto ec = res.error();
    // EAGAIN and EWOULDBLOCK
    if (ec == errc::resource_unavailable_try_again || ec == errc::operation_would_block) {
      return;
    }

    io_ec_ = ec;
  } else if (std::holds_alternative<io::MutableBytes>(n.read_result)) {  // provided buffer.
    io::MutableBytes buf = std::get<io::MutableBytes>(n.read_result);
    UpdateIoBufCapacity(io_buf_, &tl_facade_stats->conn_stats,
                        [&]() { io_buf_.WriteAndCommit(buf.data(), buf.size()); });
  } else {
    LOG(FATAL) << "Should not reach here";
  }
}

void Connection::CheckIoBufCapacity(bool is_iobuf_full) {
  auto& conn_stats = tl_facade_stats->conn_stats;
  size_t max_io_buf_len = GetFlag(FLAGS_max_client_iobuf_len);

  size_t capacity = io_buf_.Capacity();
  if (capacity < max_io_buf_len) {
    size_t parser_hint = 0;
    if (redis_parser_)
      parser_hint = redis_parser_->parselen_hint();  // Could be done for MC as well.

    // If we got a partial request and we managed to parse its
    // length, make sure we have space to store it instead of
    // increasing space incrementally.
    // (Note: The buffer object is only working in power-of-2 sizes,
    // so there's no danger of accidental O(n^2) behavior.)
    if (parser_hint > capacity) {
      UpdateIoBufCapacity(io_buf_, &conn_stats,
                          [&]() { io_buf_.Reserve(std::min(max_io_buf_len, parser_hint)); });
    }

    // If we got a partial request because iobuf was full, grow it up to
    // a reasonable limit to save on Recv() calls.
    if (is_iobuf_full && capacity < max_io_buf_len / 2) {
      // Last io used most of the io_buf to the end.
      UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() {
        io_buf_.Reserve(capacity * 2);  // Valid growth range.
      });
    }

    if (io_buf_.AppendLen() == 0U) {
      // it can happen with memcached but not for RedisParser, because RedisParser fully
      // consumes the passed buffer
      LOG_EVERY_T(WARNING, 10) << "Maximum io_buf length reached " << io_buf_.Capacity()
                               << ", consider to increase max_client_iobuf_len flag";
    }
  }
}

variant<error_code, Connection::ParserStatus> Connection::IoLoopV2() {
  DCHECK(memcache_parser_) << "Not supported for redis yet";

  auto* peer = socket_.get();
  recv_buf_.res_len = 0;

  // Don't proceed with RegisterOnRecv() if socket is closed (possible cancellation)
  if (!peer->IsOpen())
    return ParserStatus::OK;

  if (fb2::ProactorBase::me()->GetKind() == fb2::ProactorBase::Kind::IOURING) {
#ifdef __linux__
    fb2::UringProactor* up = static_cast<fb2::UringProactor*>(fb2::ProactorBase::me());
    if (up->BufRingEntrySize(kRecvSockGid) > 0 && !is_tls_) {
      static_cast<fb2::UringSocket*>(peer)->EnableRecvMultishot();
    }
#endif
  }

  peer->RegisterOnRecv([this](const FiberSocketBase::RecvNotification& n) {
    DVLOG(2) << "Calling DoReadOnRecv iobuf_len: " << io_buf_.InputLen();
    DoReadOnRecv(n);
    io_event_.notify();
  });

  ParserStatus parse_status = OK;

  // Waiter that is passed to the current async command head to be notified on completion
  auto ioevent_cb = [this]() { io_event_.notify(); };
  util::fb2::detail::Waiter ioevent_waiter{ioevent_cb};  // takes callback by reference
  absl::Cleanup waiter_cleanup = [this] { current_wait_.reset(); };

  do {
    HandleMigrateRequest();

    // Register completion for current head if its pending and we don't wait
    if (auto* cmd = parsed_head_; cmd && cmd != parsed_to_execute_ && !current_wait_.has_value()) {
      current_wait_.emplace(cmd, &ioevent_waiter);
    }

    if (io_buf_.InputLen() == 0) {
      // Poll again for readiness. The event handler registered above is edge triggered
      // We should read from the socket until EAGAIN or EWOULDBLOCK
      // to make sure we consume all available data.
      // See "Do I need to continuously read/write" question
      // under https://man7.org/linux/man-pages/man7/epoll.7.html
      // The exception is when we use io_uring with multishot recv enabled, in which case
      // we rely on the kernel to keep feeding us data until we multishot is disabled.
      DoReadOnRecv(FiberSocketBase::RecvNotification{true});
      io_event_.await([this]() {
        // TODO: optimize CanReply with looking up waiter key
        bool cmd_executable = parsed_head_ && parsed_head_ == parsed_to_execute_;
        bool cmd_ready = !cmd_executable && parsed_head_ && parsed_head_->CanReply();
        return io_buf_.InputLen() > 0 || cmd_ready || cmd_executable || io_ec_;
      });
    }

    if (io_ec_) {
      LOG_IF(WARNING, cntx()->replica_conn) << "async io error: " << io_ec_;
      return std::exchange(io_ec_, {});
    }

    phase_ = PROCESS;
    bool is_iobuf_full = io_buf_.AppendLen() == 0;

    if (io_buf_.InputLen() > 0) {
      parse_status = ParseLoop();
    } else {
      parse_status = NEED_MORE;

      if (parsed_head_) {
        if (parsed_head_ == parsed_to_execute_)
          ExecuteBatch();
        ReplyBatch();
      }
    }

    if (reply_builder_->GetError()) {
      return reply_builder_->GetError();
    }

    if (parse_status == NEED_MORE) {
      parse_status = OK;
      CheckIoBufCapacity(is_iobuf_full);
    } else if (parse_status != OK) {
      break;
    }
  } while (peer->IsOpen());

  return parse_status;
}

Connection::WaitEvent::WaitEvent(ParsedCommand* cmd, util::fb2::detail::Waiter* w)
    : key(cmd->Blocker()->OnCompletion(w)) {
}

void ResetStats() {
  auto& cstats = tl_facade_stats->conn_stats;
  cstats.pipelined_cmd_cnt = 0;
  cstats.conn_received_cnt = 0;
  cstats.command_cnt_main = 0;
  cstats.command_cnt_other = 0;
  cstats.io_read_cnt = 0;
  cstats.io_read_bytes = 0;

  tl_facade_stats->reply_stats = {};
  if (io_req_size_hist)
    io_req_size_hist->Clear();
}

}  // namespace facade


================================================
FILE: src/facade/dragonfly_connection.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/fixed_array.h>
#include <sys/socket.h>

#include <deque>
#include <memory>
#include <string_view>
#include <utility>
#include <variant>

#include "facade/connection_ref.h"
#include "facade/facade_types.h"
#include "facade/parsed_command.h"
#include "io/io_buf.h"
#include "util/connection.h"
#include "util/fibers/fibers.h"
#include "util/fibers/synchronization.h"

typedef struct ssl_ctx_st SSL_CTX;

// need to declare for older linux distributions like CentOS 7
#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49
#endif

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56
#endif

#ifdef ABSL_HAVE_ADDRESS_SANITIZER
constexpr size_t kReqStorageSize = 88;
#else
constexpr size_t kReqStorageSize = 120;
#endif

namespace util {
class HttpListenerBase;
}  // namespace util

namespace facade {

struct ConnectionStats;
class ConnectionContext;
class ServiceInterface;
class SinkReplyBuilder;
class RespSrvParser;

// Connection represents an active connection for a client.
//
// It directly dispatches regular commands from the io-loop.
// For pipelined requests, monitor and pubsub messages it uses
// a separate dispatch queue that is processed on a separate fiber.
class Connection : public util::Connection {
 public:
  static void Init(unsigned io_threads);
  static void Shutdown();
  static void ShutdownThreadLocal();

  Connection(Protocol protocol, util::HttpListenerBase* http_listener, SSL_CTX* ctx,
             ServiceInterface* service);
  ~Connection();

  // A callback called by Listener::OnConnectionStart in the same thread where
  // HandleRequests will run.
  void OnConnectionStart();

  using BreakerCb = std::function<void(uint32_t)>;
  using ShutdownCb = std::function<void()>;

  // PubSub message, either incoming message for active subscription or reply for new subscription.
  struct PubMessage {
    std::string pattern;                // non-empty for pattern subscriber
    std::shared_ptr<char[]> buf;        // stores channel name and message
    std::string_view channel, message;  // channel and message parts from buf
    bool is_sharded = false;

    // Unsubscribe simultaneously when sending unsubscribe message. Used for cluster migrations
    bool force_unsubscribe = false;
  };

  // Monitor message, carries a simple payload with the registered event to be sent.
  struct MonitorMessage : public std::string {};

  // Migration request message, the async fiber stops to give way for thread migration.
  struct MigrationRequestMessage {};

  // Checkpoint message, used to track when the connection finishes executing the current command.
  struct CheckpointMessage {
    util::fb2::BlockingCounter bc;  // Decremented counter when processed
  };

  struct InvalidationMessage {
    std::string key;
    bool invalidate_due_to_flush = false;
  };

  // Pipeline message, accumulated Redis command to be executed.
  using PipelineMessagePtr = std::unique_ptr<ParsedCommand>;
  using PubMessagePtr = std::unique_ptr<PubMessage>;

  // Variant wrapper around different message types
  struct MessageHandle {
    size_t UsedMemory() const;  // How much bytes this handle takes up in total.

    // Checkpoint messages put themselves at the front of the queue, but only in relative
    // order to the rest of the messages in the queue.
    bool IsCheckPoint() const {
      return std::holds_alternative<CheckpointMessage>(handle);
    }

    bool IsPubMsg() const {
      return std::holds_alternative<PubMessagePtr>(handle);
    }

    bool IsMonitor() const {
      return std::holds_alternative<MonitorMessage>(handle);
    }

    bool IsReplying() const;  // control messages don't reply, messages carrying data do

    std::variant<MonitorMessage, PubMessagePtr, MigrationRequestMessage, CheckpointMessage,
                 InvalidationMessage>
        handle;

    // time when the message was dispatched to the dispatch queue as reported by
    // CycleClock::Now()
    uint64_t dispatch_cycle = 0;
  };

  static_assert(sizeof(MessageHandle) <= 80,
                "Big structs should use indirection to avoid wasting deque space!");

  enum Phase : uint8_t { SETUP, READ_SOCKET, PROCESS, SHUTTING_DOWN, PRECLOSE, NUM_PHASES };

  using WeakRef = ConnectionRef;

  // Add PubMessage to dispatch queue.
  // Virtual because behavior is overridden in test_utils.
  virtual void SendPubMessageAsync(PubMessage);

  // Add monitor message to dispatch queue.
  void SendMonitorMessageAsync(std::string);

  // If any dispatch is currently in progress, increment counter and send checkpoint message to
  // decrement it once finished.
  void SendCheckpoint(util::fb2::BlockingCounter bc, bool ignore_paused = false,
                      bool ignore_blocked = false);

  // Add InvalidationMessage to dispatch queue.
  virtual void SendInvalidationMessageAsync(InvalidationMessage);

  // Register hook that is executen when the connection breaks.
  void RegisterBreakHook(BreakerCb breaker_cb);

  void FlushReplies();

  // Manually shutdown self.
  void ShutdownSelfBlocking();

  // Migrate this connecton to a different thread.
  // Return true if Migrate succeeded
  // Return false if dispatch_fb_ is active
  bool Migrate(util::fb2::ProactorBase* dest);

  // Borrow weak reference to connection. Can be called from any thread.
  WeakRef Borrow();

  bool IsCurrentlyDispatching() const;

  std::string GetClientInfo(unsigned thread_id) const;
  std::string GetClientInfo() const;

  virtual std::string RemoteEndpointStr() const;  // virtual because overwritten in test_utils
  std::string RemoteEndpointAddress() const;

  std::string LocalBindStr() const;
  std::string LocalBindAddress() const;

  uint32_t GetClientId() const;

  virtual bool IsPrivileged() const;  // virtual because overwritten in test_utils

  bool IsMain() const;

  // In addition to the listener role being main, also returns true if the protocol is Memcached.
  // This method returns true for customer facing listeners.
  bool IsMainOrMemcache() const;

  void SetName(std::string name);

  void SetLibName(std::string name);
  void SetLibVersion(std::string version);

  // Returns a map of 'libname:libver'->count, thread local data
  static const absl::flat_hash_map<std::string, uint64_t>& GetLibStatsTL();

  std::string_view GetName() const {
    return name_;
  }

  // Returns protocol type of this connection
  Protocol GetProtocol() const {
    return protocol_;
  }

  // Returns memory usage of this connection's auxiliary members in bytes.
  size_t GetMemoryUsage() const;

  ConnectionContext* cntx();

  // Requests that at some point, this connection will be migrated to `dest` thread.
  // If force is false, the connection will migrate at most once,
  // and only when the flag --migrate_connections is true.
  void RequestAsyncMigration(util::fb2::ProactorBase* dest, bool force);

  // Starts traffic logging in the calling thread. Must be a proactor thread.
  // Each thread creates its own log file combining requests from all the connections in
  // that thread. A noop if the thread is already logging.
  static void StartTrafficLogging(std::string_view base_path);

  // Stops traffic logging in this thread. A noop if the thread is not logging.
  static void StopTrafficLogging();

  // Get quick debug info for logs
  std::string DebugInfo() const;

  bool IsHttp() const;

  static void UpdateFromFlags();                          // Set values from flags
  static std::vector<std::string> GetMutableFlagNames();  // Triggers UpdateFromFlags

  static void TrackRequestSize(bool enable);
  static void EnsureMemoryBudget(unsigned tid);
  static void GetRequestSizeHistogramThreadLocal(std::string* hist);

  unsigned idle_time() const {
    return time(nullptr) - last_interaction_;
  }

  unsigned GetSendWaitTimeSec() const;

  Phase phase() const {
    return phase_;
  }

  bool IsSending() const;

  void Notify() {
    io_event_.notify();
  }

  void MarkForClose();

 protected:
  void OnShutdown() override;
  void OnPreMigrateThread() override;
  void OnPostMigrateThread() override;

  std::unique_ptr<ConnectionContext> cc_;  // Null for http connections

 private:
  enum ParserStatus : uint8_t { OK, NEED_MORE, ERROR };

  struct AsyncOperations;

  // Check protocol and handle connection.
  void HandleRequests() final;

  // Start dispatch fiber and run IoLoop.
  void ConnectionFlow();

  // Main loop reading client messages and passing requests to dispatch queue.
  std::variant<std::error_code, ParserStatus> IoLoop();

  void DoReadOnRecv(const util::FiberSocketBase::RecvNotification& n);

  void CheckIoBufCapacity(bool is_iobuf_full);

  // Main loop reading client messages and passing requests to dispatch queue.
  std::variant<std::error_code, ParserStatus> IoLoopV2();

  // Returns true if HTTP header is detected.
  io::Result<bool> CheckForHttpProto();

  // Dispatches a single (Redis or MC) command.
  // `has_more` should indicate whether the io buffer has more commands
  // (pipelining in progress). Performs async dispatch if forced (already in async mode) or if
  // has_more is true, otherwise uses synchronous dispatch.
  void DispatchSingle(bool has_more, absl::FunctionRef<void()> invoke_cb,
                      absl::FunctionRef<void()> enqueue_cmd_cb);

  // Handles events from the dispatch queue.
  void AsyncFiber();

  // Processes a single Admin/Control message from dispatch_q_.
  // Returns true if the fiber should terminate (e.g. Migration).
  bool ProcessAdminMessage(MessageHandle* msg, AsyncOperations* async_op);

  // Processes the next Pipeline command from parsed_head_.
  void ProcessPipelineCommand();

  void SendAsync(MessageHandle msg);

  // Updates Control Path statistics and backpressure counters for administrative
  // events, monitor messages, and PubSub notifications.
  // If add is true, stats are incremented, otherwise decremented.
  void UpdateDispatchStats(const MessageHandle& msg, bool add);

  ParserStatus ParseRedis(unsigned max_busy_cycles, bool enqueue_only = false);

  void OnBreakCb(int32_t mask);

  // Shrink pipeline pool by a little while handling regular commands.
  void ShrinkPipelinePool();

  // Returns non-null request ptr if pool has vacant entries.
  PipelineMessagePtr GetFromPoolOrCreate();

  void HandleMigrateRequest();
  io::Result<size_t> HandleRecvSocket();

  bool ShouldEndAsyncFiber(const MessageHandle& msg);

  void LaunchAsyncFiberIfNeeded();  // Async fiber is started lazily

  // Squashes pipelined commands from the dispatch queue to spread load over all threads
  void SquashPipeline();

  // Clear pipelined messages, disaptching only intrusive ones.
  void ClearPipelinedMessages();

  std::pair<std::string, std::string> GetClientInfoBeforeAfterTid() const;

  void IncreaseConnStats();
  void DecreaseConnStats();
  void BreakOnce(uint32_t ev_mask);

  // The read buffer with read data that needs to be parsed and processed.
  // For io_uring bundles we may have available_bytes larger than slice.size()
  // which means that there are more buffers available to read.
  struct ReadBuffer {
    size_t available_bytes;
    io::Bytes slice;

    void Consume(size_t len) {
      available_bytes -= len;
      slice.remove_prefix(len);
    }
  };

  bool IsReplySizeOverLimit() const;

  // Returns true if one or more commands were parsed from the read buffer,
  // and false if no complete commands could be parsed (for example, when
  // parsing is pending more input).
  bool ParseMCBatch();

  bool ParseRedisBatch();

  // Call appropriate ParseBatch function, proceed with Execute and Reply all why input is remaining
  ParserStatus ParseLoop();

  // Loop over enqueued async commands and enqueue them for async execution.
  // If async execution is not possible, handle them in synchronous mode one by one.
  // Returns true on successful execution, false on reply builder error.
  bool ExecuteBatch();

  // Loop over finished async commands and let them reply.
  // Returns true on successful execution, false on reply builder error.
  bool ReplyBatch();

  // Guard of the current subscription to a parsed commands async task blocker
  struct WaitEvent {
    explicit WaitEvent(ParsedCommand* cmd, util::fb2::detail::Waiter* w);

    std::optional<util::fb2::EventCount::SubKey> key;
  };

  ParsedCommand* CreateParsedCommand();
  void EnqueueParsedCommand(ParsedCommand* cmd);

  // Releases the command memory back to the pool.
  // - Set is_pipelined=true if the command was successfully executed and should be counted
  // in latency/throughput stats.
  // - Set is_pipelined=false if the command is being dropped/cleaned up without execution or should
  // not be counted in stats.
  void ReleaseParsedCommand(ParsedCommand* cmd, bool is_pipelined);

  void DestroyParsedQueue();

  // Dispatch Queue - Queue for the Control Path.
  // Handles asynchronous administrative tasks, events, and high-priority control
  // messages (e.g., PubSub, Monitor, Migration requests, Checkpoints) processed
  // by the AsyncFiber.
  std::deque<MessageHandle> dispatch_q_;    // dispatch queue
  util::fb2::CondVarAny cnd_;               // dispatch queue waker
  util::fb2::Fiber async_fb_;               // async fiber (if started)
  size_t dispatch_q_bytes_ = 0;             // total bytes in dispatch queue
  size_t dispatch_q_subscriber_bytes_ = 0;  // total bytes from subscribers in dispatch queue

  std::error_code io_ec_;
  util::fb2::EventCount io_event_;
  std::optional<WaitEvent> current_wait_;

  // how many bytes of the current request have been consumed
  size_t request_consumed_bytes_ = 0;

  util::FiberSocketBase::ProvidedBuffer recv_buf_;
  io::IoBuf io_buf_;  // used in io loop and parsers
  std::unique_ptr<RespSrvParser> redis_parser_;
  std::unique_ptr<MemcacheParser> memcache_parser_;
  ParsedCommand* parsed_cmd_ = nullptr;

  // Parsed Commands Queue - Queue for the Data Path.
  //
  // Commands move through the following stages in a single linked list:
  //   1) parsed but not yet dispatched        : [parsed_to_execute_, ..., parsed_tail_]
  //   2) dispatched but not yet completed     : between parsed_head_ and parsed_to_execute_
  //   3) completed (replies ready to send)    : a prefix of [parsed_head_, ..., parsed_to_execute_)
  //   4) replied and removed                  : before parsed_head_ (no longer in the list)
  //
  // Logical order diagram:
  //   head -> ... -> (dispatched, waiting for completion) -> ... -> parsed_to_execute_ -> ... ->
  //   tail
  //
  // parsed_to_execute_ is advanced as commands are dispatched for execution.
  // Executed (completed) commands are kept in the queue until their replies are sent,
  // in order to preserve reply ordering.
  // ReplyMCBatch walks from parsed_head_ up to (but not including) parsed_to_execute_,
  // replies commands that have completed, and removes only those replied commands from
  // the queue, advancing parsed_head_ accordingly.
  ParsedCommand* parsed_head_ = nullptr;
  ParsedCommand* parsed_tail_ = nullptr;
  ParsedCommand* parsed_to_execute_ = nullptr;
  // Total number of commands in parsed command queue
  size_t parsed_cmd_q_len_ = 0;
  // Total bytes used by commands in parsed command queue
  size_t parsed_cmd_q_bytes_ = 0;
  // Returns true if there are any commands pending in the parsed command queue or dispatch queue.
  bool HasPendingMessages() const {
    return parsed_head_ || !dispatch_q_.empty();
  }

  // Returns total count of commands pending in the parsed command queue and dispatch queue.
  size_t GetPendingMessageCount() const {
    return parsed_cmd_q_len_ + dispatch_q_.size();
  }

  uint32_t id_;
  Protocol protocol_;
  Phase phase_ = SETUP;

  struct {
    size_t read_cnt = 0;                // total number of read calls
    size_t net_bytes_in = 0;            // total number of bytes read
    size_t dispatch_entries_added = 0;  // total number of dispatch queue entries
    size_t cmds = 0;                    // total number of commands executed
  } local_stats_;

  std::unique_ptr<SinkReplyBuilder> reply_builder_;
  util::HttpListenerBase* http_listener_;
  SSL_CTX* ssl_ctx_;

  ServiceInterface* service_;

  time_t creation_time_, last_interaction_;
  std::string name_;

  std::string lib_name_;
  std::string lib_ver_;

  unsigned parser_error_ = 0;

  BreakerCb breaker_cb_;

  // Used to keep track of borrowed references. Does not really own itself
  std::shared_ptr<Connection> self_;

  util::fb2::ProactorBase* migration_request_ = nullptr;

  // Pooled pipeline messages per-thread
  // Aggregated while handling pipelines, gradually released while handling regular commands.
  static thread_local std::vector<PipelineMessagePtr> pipeline_req_pool_;

  union {
    uint16_t flags_;
    struct {
      // a flag indicating whether the client has turned on client tracking.
      bool tracking_enabled_ : 1;
      bool skip_next_squashing_ : 1;  // Forcefully skip next squashing

      // Connection migration vars, see RequestAsyncMigration() above.
      bool migration_enabled_ : 1;
      bool migration_in_process_ : 1;
      bool is_http_ : 1;

      // whether the connection is TLS. We can be sure our socket is TlsSocket
      // if the flag is set.
      bool is_tls_ : 1;
      bool is_main_ : 1;
      bool ioloop_v2_ : 1;  // whether this connection is running on ioloop v2

      // If post migration is allowed to call RegisterRecv
      bool migration_allowed_to_register_ : 1;
    };
  };

  bool request_shutdown_ = false;
};

}  // namespace facade


================================================
FILE: src/facade/dragonfly_listener.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/dragonfly_listener.h"

#include <mimalloc.h>
#include <netinet/tcp.h>
#include <openssl/err.h>

#include <memory>

#include "absl/functional/bind_front.h"
#include "facade/tls_helpers.h"

#ifdef DFLY_USE_SSL
#include <openssl/ssl.h>
#endif
#include "base/flags.h"
#include "base/logging.h"
#include "facade/dragonfly_connection.h"
#include "facade/service_interface.h"
#include "util/proactor_pool.h"

using namespace std;

ABSL_FLAG(uint32_t, conn_io_threads, 0, "Number of threads used for handing server connections");
ABSL_FLAG(uint32_t, conn_io_thread_start, 0, "Starting thread id for handling server connections");
ABSL_FLAG(bool, tls, false, "");
ABSL_FLAG(bool, no_tls_on_admin_port, false, "Allow non-tls connections on admin port");
ABSL_FLAG(bool, enable_tcp_defer_accept, true, "Enable TCP_DEFER_ACCEPT option on server sockets");

ABSL_FLAG(bool, conn_use_incoming_cpu, false,
          "If true uses incoming cpu of a socket in order to distribute"
          " incoming connections");

ABSL_DECLARE_FLAG(std::string, tls_cert_file);
ABSL_DECLARE_FLAG(std::string, tls_key_file);
ABSL_DECLARE_FLAG(std::string, tls_ca_cert_file);
ABSL_DECLARE_FLAG(std::string, tls_ca_cert_dir);

ABSL_FLAG(uint32_t, tcp_keepalive, 300,
          "the period in seconds of inactivity after which keep-alives are triggerred,"
          "the duration until an inactive connection is terminated is twice the specified time");
ABSL_FLAG(uint32_t, tcp_user_timeout, 0,
          "the maximum period in milliseconds that transimitted data may stay unacknowledged "
          "before TCP aborts the connection. 0 means OS default timeout");

ABSL_DECLARE_FLAG(bool, primary_port_http_enabled);

#if 0
enum TlsClientAuth {
  CL_AUTH_NO = 0,
  CL_AUTH_YES = 1,
  CL_AUTH_OPTIONAL = 2,
};

facade::ConfigEnum tls_auth_clients_enum[] = {
    {"no", CL_AUTH_NO},
    {"yes", CL_AUTH_YES},
    {"optional", CL_AUTH_OPTIONAL},
};

static int tls_auth_clients_opt = CL_AUTH_YES;

CONFIG_enum(tls_auth_clients, "yes", "", tls_auth_clients_enum, tls_auth_clients_opt);
#endif

namespace facade {

// See dragonfly_listener.h
std::atomic<bool> g_shutdown_fast{false};

using namespace util;
using util::detail::SafeErrorMessage;

using absl::GetFlag;

namespace {

bool ConfigureKeepAlive(int fd) {
  int val = 1;
  if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) < 0)
    return false;

  val = absl::GetFlag(FLAGS_tcp_keepalive);
#ifdef __APPLE__
  if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &val, sizeof(val)) < 0)
    return false;
#else
  if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0)
    return false;
#endif

  /* Send next probes after the specified interval. Note that we set the
   * delay as interval / 3, as we send three probes before detecting
   * an error (see the next setsockopt call). */
  val = std::max(val / 3, 1);
  if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0)
    return false;

  /* Consider the socket in error state after three we send three ACK
   * probes without getting a reply. */
  val = 3;
  if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0)
    return false;

  return true;
}

struct ListenerStats {
  size_t tls_allocated_bytes = 0;
  uint64_t refused_conn_maxclients_reached_cnt = 0;
};

thread_local ListenerStats listener_tl_stats;
atomic_int ssl_init_refcount = 0;

void* OverriddenSSLMalloc(size_t size, const char* file, int line) {
  void* res = mi_malloc(size);
  listener_tl_stats.tls_allocated_bytes += mi_malloc_usable_size(res);
  return res;
}

void* OverriddenSSLRealloc(void* addr, size_t size, const char* file, int line) {
  size_t prev_size = mi_malloc_usable_size(addr);
  void* res = mi_realloc(addr, size);
  listener_tl_stats.tls_allocated_bytes += mi_malloc_usable_size(res);
  listener_tl_stats.tls_allocated_bytes -= prev_size;
  return res;
}

void OverriddenSSLFree(void* addr, const char* file, int line) {
  listener_tl_stats.tls_allocated_bytes -= mi_malloc_usable_size(addr);
  mi_free(addr);
}

}  // namespace

Listener::Listener(Protocol protocol, ServiceInterface* si, Role role)
    : service_(si), role_(role), protocol_(protocol) {
#ifdef DFLY_USE_SSL
  if (ssl_init_refcount.fetch_add(1) == 0) {
    CRYPTO_set_mem_functions(&OverriddenSSLMalloc, &OverriddenSSLRealloc, &OverriddenSSLFree);
  }

  // Always initialise OpenSSL so we can enable TLS at runtime.
  OPENSSL_init_ssl(OPENSSL_INIT_SSL_DEFAULT, nullptr);
  // Print this only for main interface
  if (IsMainInterface()) {
    std::string_view ssl_version = SSLeay_version(SSLEAY_VERSION);
    LOG(INFO) << "SSL version: " << ssl_version;
  }
  if (!ReconfigureTLS()) {
    exit(-1);
  }
#endif

  // We only set the HTTP interface for:
  // 1. Privileged users (on privileged listener)
  // 2. Main listener (if enabled)
  const bool is_main_enabled = GetFlag(FLAGS_primary_port_http_enabled);
  if (IsPrivilegedInterface() || (IsMainInterface() && is_main_enabled)) {
    http_base_ = std::make_unique<HttpListener<>>();
    http_base_->set_resource_prefix("http://static.dragonflydb.io/data-plane");
    si->ConfigureHttpHandlers(http_base_.get(), IsPrivilegedInterface());
  }
}

Listener::~Listener() {
#ifdef DFLY_USE_SSL
  SSL_CTX_free(ctx_);

  if (ssl_init_refcount.fetch_sub(1) == 1) {
    OPENSSL_cleanup();
  }
#endif
}

util::Connection* Listener::NewConnection(ProactorBase* proactor) {
  return new Connection{protocol_, http_base_.get(), ctx_, service_};
}

error_code Listener::ConfigureServerSocket(int fd) {
  int val = 1;

  if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) < 0) {
    LOG(WARNING) << "Could not set reuse addr on socket " << SafeErrorMessage(errno);
  }

#ifdef TCP_DEFER_ACCEPT  // TCP_DEFER_ACCEPT is only for Linux, and defined by Linux OS-Kernel
  if (GetFlag(FLAGS_enable_tcp_defer_accept)) {
    sockaddr_storage addr;
    socklen_t len = sizeof(addr);
    // TCP_DEFER_ACCEPT is only applicable to TCP (IPv4/IPv6) sockets, not Unix domain sockets
    // (UDS).
    if (getsockname(fd, reinterpret_cast<sockaddr*>(&addr), &len) == 0 &&
        (addr.ss_family == AF_INET || addr.ss_family == AF_INET6)) {
      // Instruct the kernel to defer waking up accept() until actual payload data arrives,
      // with a timeout of 1 second.
      // This provides a kernel-level shield against "Pure Zombie" storms - where malicious or
      // misconfigured clients complete the TCP 3-way handshake but never send data (or immediately
      // send FIN/RST). The kernel will silently clean up these empty connections without
      // consuming Dragonfly fibers or OpenSSL memory.
      // This imposes zero latency penalty on well-behaved clients, as the kernel instantly
      // yields the connection to user-space the moment their first byte (e.g., TLS ClientHello
      // or RESP command) arrives.
      static constexpr int kDeferAcceptTimeoutSec = 1;
      if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &kDeferAcceptTimeoutSec,
                     sizeof(kDeferAcceptTimeoutSec)) < 0) {
        LOG(WARNING) << "Could not set TCP_DEFER_ACCEPT " << SafeErrorMessage(errno);
      }
    }
  }
#endif
  bool success = ConfigureKeepAlive(fd);

#ifdef __linux__
  int user_timeout = absl::GetFlag(FLAGS_tcp_user_timeout);
  if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &user_timeout, sizeof(int)) < 0) {
    LOG(WARNING) << "Could not set user timeout on socket " << SafeErrorMessage(errno);
  }
#endif

  if (!success) {
#ifndef __APPLE__
    int myerr = errno;

    int socket_type;
    socklen_t length = sizeof(socket_type);

    // Ignore the error on UDS.
    if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &socket_type, &length) != 0 ||
        socket_type != AF_UNIX) {
      LOG(WARNING) << "Could not configure keep alive " << SafeErrorMessage(myerr);
    }
#endif
  }

  return error_code{};
}

bool Listener::ReconfigureTLS() {
#ifdef DFLY_USE_SSL
  SSL_CTX* prev_ctx = ctx_;
  const bool tls_on_privileged_port = !GetFlag(FLAGS_no_tls_on_admin_port);

  if (GetFlag(FLAGS_tls) && (!IsPrivilegedInterface() || tls_on_privileged_port)) {
    SSL_CTX* ctx = CreateSslCntx(facade::TlsContextRole::SERVER);
    if (!ctx) {
      return false;
    }
    ctx_ = ctx;
  } else {
    ctx_ = nullptr;
  }

  if (prev_ctx) {
    // SSL_CTX is reference counted so if other connections have a reference
    // to the context it won't be freed yet.
    SSL_CTX_free(prev_ctx);
  }
#endif
  return true;
}

size_t Listener::TLSUsedMemoryThreadLocal() {
  return listener_tl_stats.tls_allocated_bytes;
}

uint64_t Listener::RefusedConnectionMaxClientsCount() {
  return listener_tl_stats.refused_conn_maxclients_reached_cnt;
}

void Listener::PreAcceptLoop(util::ProactorBase* pb) {
}

bool Listener::IsPrivilegedInterface() const {
  return role_ == Role::PRIVILEGED;
}

bool Listener::IsMainInterface() const {
  return role_ == Role::MAIN;
}

void Listener::PreShutdown() {
  // If NOW/FORCE requested, expedite shutdown without waiting.
  if (g_shutdown_fast.load(std::memory_order_acquire)) {
    return;
  }

  // Otherwise: Iterate on all connections and allow them to finish their commands for
  // a short period.
  // Executed commands can be visible in snapshots or replicas, but if we close the client
  // connections too fast we might not send the acknowledgment for those commands.
  // This shouldn't take a long time: All clients should reject incoming commands
  // at this stage since we're in SHUTDOWN mode.
  // If a command is running for too long we give up and proceed.
  DispatchTracker tracker{
      {this}, nullptr, false /* paused connections */, false /* blocking connections*/};
  tracker.TrackAll();

  if (!tracker.Wait(absl::Milliseconds(10))) {
    LOG(WARNING) << "Some commands are still being dispatched but didn't conclude in time. "
                    "Proceeding in shutdown.";
  }
}

void Listener::PostShutdown() {
}

void Listener::OnConnectionStart(util::Connection* conn) {
  facade::Connection* facade_conn = static_cast<facade::Connection*>(conn);
  VLOG(1) << "Opening connection " << facade_conn->GetClientId();

  facade_conn->OnConnectionStart();
}

void Listener::OnConnectionClose(util::Connection* conn) {
  Connection* facade_conn = static_cast<Connection*>(conn);
  VLOG(1) << "Closing connection " << facade_conn->GetClientId();
}

void Listener::OnMaxConnectionsReached(util::FiberSocketBase* sock) {
  listener_tl_stats.refused_conn_maxclients_reached_cnt++;
  sock->Write(io::Buffer("-ERR max number of clients reached\r\n"));
}

// We can limit number of threads handling dragonfly connections.
ProactorBase* Listener::PickConnectionProactor(util::FiberSocketBase* sock) {
  util::ProactorPool* pp = pool();

  uint32_t res_id = kuint32max;

  if (!sock->IsUDS()) {
    int fd = sock->native_handle();

    int cpu, napi_id;
    socklen_t len = sizeof(cpu);

    // I suspect that the advantage of using SO_INCOMING_NAPI_ID is that
    // we can also track the affinity changes during the lifetime of the process
    // i.e. when a different CPU is assigned to handle the RX traffic.
    // On some distributions (WSL1, for example), SO_INCOMING_CPU is not supported.
    if (0 == getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len)) {
      VLOG(1) << "CPU for connection " << fd << " is " << cpu;
      // Avoid CHECKINGing success, it sometimes fail on WSL
      // https://github.com/dragonflydb/dragonfly/issues/2090
      if (0 == getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len)) {
        VLOG(1) << "NAPI for connection " << fd << " is " << napi_id;
      }

      if (GetFlag(FLAGS_conn_use_incoming_cpu)) {
        // We choose a thread that is running on the incoming CPU. Usually there is
        // a single thread per CPU. SO_INCOMING_CPU returns the CPU that the kernel
        // uses to steer the packets to. In order to make
        // conn_use_incoming_cpu effective, we should make sure that the receive packets are
        // steered to enough CPUs. This can be done by setting the RPS mask in
        // /sys/class/net/<dev>/queues/rx-<n>/rps_cpus. For more details, see
        // https://docs.kernel.org/networking/scaling.html#rps-configuration
        // Please note that if conn_use_incoming_cpu is true, connections will be handled only
        // on the CPUs that handle the softirqs for the incoming packets.
        // To avoid imbalance in CPU load, RPS tuning is strongly advised.
        const vector<unsigned>& ids = pool()->MapCpuToThreads(cpu);
        if (!ids.empty()) {
          res_id = ids[0];
        }
      }
    }
  }

  if (res_id == kuint32max) {
    uint32_t total = GetFlag(FLAGS_conn_io_threads);
    uint32_t start = GetFlag(FLAGS_conn_io_thread_start) % pp->size();

    if (total == 0 || total + start > pp->size()) {
      total = pp->size() - start;
    }

    res_id = start + (next_id_.fetch_add(1, std::memory_order_relaxed) % total);
  }

  return pp->at(res_id);
}

DispatchTracker::DispatchTracker(absl::Span<facade::Listener* const> listeners,
                                 facade::Connection* issuer, bool ignore_paused,
                                 bool ignore_blocked)
    : listeners_{listeners.begin(), listeners.end()},
      issuer_{issuer},
      ignore_paused_{ignore_paused},
      ignore_blocked_{ignore_blocked} {
}

void DispatchTracker::TrackOnThread() {
  for (auto* listener : listeners_) {
    listener->TraverseConnectionsOnThread(
        [this](unsigned thread_index, util::Connection* conn) { Handle(thread_index, conn); },
        UINT32_MAX, nullptr);
  }
}

bool DispatchTracker::Wait(absl::Duration duration) {
  bool res = bc_->WaitFor(absl::ToChronoMilliseconds(duration));
  if (!res && ignore_blocked_) {
    LOG(INFO) << "Retrying DispatchTracker::Wait, as bc=" << bc_->DEBUG_Count();
    // We track all connections again because a connection might became blocked between the time
    // we call tracking the last time.
    bc_ = BlockingCounter{0};
    TrackAll();
    res = bc_->WaitFor(absl::ToChronoMilliseconds(duration));
    LOG_IF(INFO, !res) << "DispatchTracker::Wait failed again, bc=" << bc_->DEBUG_Count();
  }
  return res;
}

void DispatchTracker::TrackAll() {
  for (auto* listener : listeners_)
    listener->TraverseConnections(absl::bind_front(&DispatchTracker::Handle, this));
}

void DispatchTracker::Handle(unsigned thread_index, util::Connection* conn) {
  if (auto* fconn = static_cast<facade::Connection*>(conn); fconn != issuer_)
    fconn->SendCheckpoint(bc_, ignore_paused_, ignore_blocked_);
}

}  // namespace facade


================================================
FILE: src/facade/dragonfly_listener.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/base/internal/spinlock.h>
#include <absl/time/time.h>

#include <atomic>
#include <memory>
#include <system_error>
#include <vector>

#include "facade/facade_types.h"
#include "util/fiber_socket_base.h"
#include "util/fibers/proactor_base.h"
#include "util/http/http_handler.h"
#include "util/listener_interface.h"

typedef struct ssl_ctx_st SSL_CTX;

namespace facade {

class ServiceInterface;
class Connection;

class Listener : public util::ListenerInterface {
 public:
  // The Role PRIVILEGED is for admin port/listener
  // The Role MAIN is for the main listener on main port
  // The Role OTHER is for all the other listeners
  enum class Role { PRIVILEGED, MAIN, OTHER };
  Listener(Protocol protocol, ServiceInterface*, Role role = Role::OTHER);
  ~Listener();

  std::error_code ConfigureServerSocket(int fd) final;

  // Wait until all command dispatches that are currently in progress finish,
  // ignore commands from issuer connection.
  bool AwaitCurrentDispatches(absl::Duration timeout, util::Connection* issuer);

  // ReconfigureTLS MUST be called from the same proactor as the listener.
  bool ReconfigureTLS();

  // Returns thread-local dynamic memory usage by TLS.
  static size_t TLSUsedMemoryThreadLocal();
  static uint64_t RefusedConnectionMaxClientsCount();

  bool IsPrivilegedInterface() const;
  bool IsMainInterface() const;

  Protocol protocol() const {
    return protocol_;
  }

 private:
  util::Connection* NewConnection(ProactorBase* proactor) final;
  ProactorBase* PickConnectionProactor(util::FiberSocketBase* sock) final;

  void OnConnectionStart(util::Connection* conn) final;
  void OnConnectionClose(util::Connection* conn) final;
  void OnMaxConnectionsReached(util::FiberSocketBase* sock) final;
  void PreAcceptLoop(ProactorBase* pb) final;

  void PreShutdown() final;
  void PostShutdown() final;

  std::unique_ptr<util::HttpListenerBase> http_base_;

  ServiceInterface* service_;

  std::atomic_uint32_t next_id_{0};

  Role role_;

  uint32_t conn_cnt_{0};

  Protocol protocol_;
  SSL_CTX* ctx_ = nullptr;
};

// Dispatch tracker allows tracking the dispatch state of connections and blocking until all
// detected busy connections finished dispatching. Ignores issuer connection.
//
// Mostly used to detect when global state changes (takeover, pause, cluster config update) are
// visible to all commands and no commands are still running according to the old state / config.
class DispatchTracker {
 public:
  DispatchTracker(absl::Span<facade::Listener* const>, facade::Connection* issuer,
                  bool ignore_paused, bool ignore_blocked);

  void TrackAll();       // Track busy connection on all threads
  void TrackOnThread();  // Track busy connections on current thread

  // Wait until all tracked connections finished dispatching.
  // Returns true on success, false if timeout was reached.
  bool Wait(absl::Duration timeout);

 private:
  void Handle(unsigned thread_index, util::Connection* conn);

  std::vector<facade::Listener*> listeners_;
  facade::Connection* issuer_;
  util::fb2::BlockingCounter bc_{0};  // tracks number of pending checkpoints
  bool ignore_paused_;
  bool ignore_blocked_;
};

// Global shutdown tuning flag, controlled by SHUTDOWN options.
// When true, listeners perform expedited shutdown without waiting for
// in-flight dispatches (used by NOW/FORCE).
extern std::atomic<bool> g_shutdown_fast;

}  // namespace facade


================================================
FILE: src/facade/error.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>
#include <string_view>

namespace facade {

std::string WrongNumArgsError(std::string_view cmd);
std::string ConfigSetFailed(std::string_view config_name);
std::string InvalidExpireTime(std::string_view cmd);
std::string UnknownSubCmd(std::string_view subcmd, std::string_view cmd);

inline constexpr char kSyntaxErr[] = "syntax error";
inline constexpr char kWrongTypeErr[] =
    "-WRONGTYPE Operation against a key holding the wrong kind of value";
inline constexpr char kWrongJsonTypeErr[] = "-WRONGTYPE wrong JSON type of path value";
inline constexpr char kKeyNotFoundErr[] = "no such key";
inline constexpr char kInvalidIntErr[] = "value is not an integer or out of range";
inline constexpr char kInvalidFloatErr[] = "value is not a valid float";
inline constexpr char kUintErr[] = "value is out of range, must be positive";
inline constexpr char kIncrOverflow[] = "increment or decrement would overflow";
inline constexpr char kDbIndOutOfRangeErr[] = "DB index is out of range";
inline constexpr char kInvalidDbIndErr[] = "invalid DB index";
inline constexpr char kScriptNotFound[] = "-NOSCRIPT No matching script. Please use EVAL.";
inline constexpr char kAuthRejected[] =
    "-WRONGPASS invalid username-password pair or user is disabled.";
inline constexpr char kExpiryOutOfRange[] = "expiry is out of range";
inline constexpr char kIndexOutOfRange[] = "index out of range";
inline constexpr char kOutOfMemory[] = "Out of memory";
inline constexpr char kInvalidNumericResult[] = "result is not a number";
inline constexpr char kClusterNotConfigured[] = "Cluster is not yet configured";
inline constexpr char kLoadingErr[] = "-LOADING Dragonfly is loading the dataset in memory";
inline constexpr char kUndeclaredKeyErr[] = "script tried accessing undeclared key";
inline constexpr char kInvalidDumpValueErr[] = "DUMP payload version or checksum are wrong";
inline constexpr char kInvalidJsonPathErr[] = "invalid JSON path";
inline constexpr char kJsonParseError[] = "failed to parse JSON";
inline constexpr char kNanOrInfDuringIncr[] = "increment would produce NaN or Infinity";
inline constexpr char kCrossSlotError[] = "-CROSSSLOT Keys in request don't hash to the same slot";
inline constexpr char kTieredIoError[] = "IO error when reading value from tiered storage";
inline constexpr char kInvalidHllError[] = "Key is not a valid HyperLogLog string value";

inline constexpr char kSyntaxErrType[] = "syntax_error";
inline constexpr char kScriptErrType[] = "script_error";
inline constexpr char kConfigErrType[] = "config_error";
inline constexpr char kSearchErrType[] = "search_error";
inline constexpr char kWrongTypeErrType[] = "wrong_type";
inline constexpr char kRestrictDenied[] = "restrict_denied";
inline constexpr char kNoGroupErrType[] = "no_group_error";
inline constexpr char kNoAuthErrType[] = "no_auth";

}  // namespace facade


================================================
FILE: src/facade/facade.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>

#include "base/logging.h"
#include "facade/command_id.h"
#include "facade/error.h"
#include "facade/facade_stats.h"
#include "facade/parsed_command.h"
#include "facade/reply_builder.h"
#include "facade/resp_expr.h"
#include "strings/human_readable.h"

namespace facade {

using namespace std;

#define ADD(x) (x) += o.x

constexpr size_t kSizeConnStats = sizeof(ConnectionStats);

ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
  static_assert(kSizeConnStats == 272);

  ADD(read_buf_capacity);
  ADD(dispatch_queue_entries);
  ADD(dispatch_queue_bytes);
  ADD(pipeline_queue_entries);
  ADD(pipeline_queue_bytes);
  ADD(dispatch_queue_subscriber_bytes);
  ADD(pipeline_cmd_cache_bytes);
  ADD(io_read_cnt);
  ADD(io_read_bytes);
  ADD(command_cnt_main);
  ADD(command_cnt_other);
  ADD(pipelined_cmd_cnt);
  ADD(pipelined_cmd_latency);
  pipelined_latency_hist.Merge(o.pipelined_latency_hist);
  ADD(pipelined_wait_latency);
  ADD(conn_received_cnt);
  ADD(num_conns_main);
  ADD(num_conns_other);
  ADD(num_blocked_clients);
  ADD(num_read_yields);
  ADD(num_migrations);
  ADD(num_recv_provided_calls);
  ADD(pipeline_throttle_count);
  ADD(tls_accept_disconnects);
  ADD(handshakes_started);
  ADD(handshakes_completed);
  ADD(pipeline_dispatch_calls);
  ADD(pipeline_dispatch_commands);
  ADD(pipeline_dispatch_flush_usec);
  ADD(skip_pipeline_flushing);

  return *this;
}

ReplyStats::ReplyStats(ReplyStats&& other) noexcept {
  *this = other;
}

ReplyStats& ReplyStats::operator+=(const ReplyStats& o) {
  static_assert(sizeof(ReplyStats) == 80u + kSanitizerOverhead);
  ADD(io_write_cnt);
  ADD(io_write_bytes);

  for (const auto& k_v : o.err_count) {
    err_count[k_v.first] += k_v.second;
  }

  ADD(script_error_count);

  send_stats += o.send_stats;
  squashing_current_reply_size.fetch_add(o.squashing_current_reply_size.load(memory_order_relaxed),
                                         memory_order_relaxed);
  return *this;
}

#undef ADD

ReplyStats& ReplyStats::operator=(const ReplyStats& o) {
  static_assert(sizeof(ReplyStats) == 80u + kSanitizerOverhead);

  if (this == &o) {
    return *this;
  }

  send_stats = o.send_stats;
  io_write_cnt = o.io_write_cnt;
  io_write_bytes = o.io_write_bytes;
  err_count = o.err_count;
  script_error_count = o.script_error_count;
  squashing_current_reply_size.store(o.squashing_current_reply_size.load(memory_order_relaxed),
                                     memory_order_relaxed);
  return *this;
}

string WrongNumArgsError(string_view cmd) {
  return absl::StrCat("wrong number of arguments for '", absl::AsciiStrToLower(cmd), "' command");
}

string InvalidExpireTime(string_view cmd) {
  return absl::StrCat("invalid expire time in '", absl::AsciiStrToLower(cmd), "' command");
}

string UnknownSubCmd(string_view subcmd, string_view cmd) {
  return absl::StrCat("Unknown subcommand or wrong number of arguments for '", subcmd, "'. Try ",
                      cmd, " HELP.");
}

string ConfigSetFailed(string_view config_name) {
  return absl::StrCat("CONFIG SET failed (possibly related to argument '", config_name, "').");
}

const char* RespExpr::TypeName(Type t) {
  switch (t) {
    case STRING:
      return "string";
    case INT64:
      return "int";
    case DOUBLE:
      return "double";
    case ARRAY:
      return "array";
    case NIL_ARRAY:
      return "nil-array";
    case NIL:
      return "nil";
    case ERROR:
      return "error";
  }
  ABSL_UNREACHABLE();
}

CommandId::CommandId(const char* name, uint32_t mask, int8_t arity, int8_t first_key,
                     int8_t last_key, uint32_t acl_categories)
    : name_(name),
      opt_mask_(mask),
      arity_(arity),
      first_key_(first_key),
      last_key_(last_key),
      acl_categories_(acl_categories) {
}

}  // namespace facade

namespace std {

using facade::ArgS;

ostream& operator<<(ostream& os, facade::CmdArgList ras) {
  os << "[";
  if (!ras.empty()) {
    for (size_t i = 0; i < ras.size() - 1; ++i) {
      os << absl::CHexEscape(ArgS(ras, i)) << ",";
    }
    os << absl::CHexEscape(ArgS(ras, ras.size() - 1));
  }
  os << "]";

  return os;
}

ostream& operator<<(ostream& os, const facade::RespExpr& e) {
  using facade::RespExpr;
  using facade::ToSV;

  switch (e.type) {
    case RespExpr::INT64:
      os << "i" << get<int64_t>(e.u);
      break;
    case RespExpr::DOUBLE:
      os << "d" << get<double>(e.u);
      break;
    case RespExpr::STRING:
      os << "'" << ToSV(get<RespExpr::Buffer>(e.u)) << "'";
      break;
    case RespExpr::NIL:
      os << "nil";
      break;
    case RespExpr::NIL_ARRAY:
      os << "[]";
      break;
    case RespExpr::ARRAY:
      os << facade::RespSpan{*get<RespExpr::Vec*>(e.u)};
      break;
    case RespExpr::ERROR:
      os << "e(" << ToSV(get<RespExpr::Buffer>(e.u)) << ")";
      break;
  }

  return os;
}

ostream& operator<<(ostream& os, facade::RespSpan ras) {
  os << "[";
  if (!ras.empty()) {
    for (size_t i = 0; i < ras.size() - 1; ++i) {
      os << ras[i] << ",";
    }
    os << ras.back();
  }
  os << "]";

  return os;
}

ostream& operator<<(ostream& os, facade::Protocol p) {
  switch (p) {
    case facade::Protocol::REDIS:
      os << "REDIS";
      break;
    case facade::Protocol::MEMCACHE:
      os << "MEMCACHE";
      break;
  }

  return os;
}

}  // namespace std


================================================
FILE: src/facade/facade_stats.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <atomic>
#include <cstdint>

#include "base/histogram.h"
namespace facade {

struct ConnectionStats {
  size_t read_buf_capacity = 0;  // total capacity of input buffers
  // Count of pending messages in dispatch queue
  uint64_t dispatch_queue_entries = 0;
  // Memory used by pending messages in dispatch queue
  size_t dispatch_queue_bytes = 0;
  // Count of pending parsed commands in the pipeline queue (Data Path)
  uint64_t pipeline_queue_entries = 0;
  // Memory used by pending parsed commands in the pipeline queue (Data Path)
  size_t pipeline_queue_bytes = 0;
  // total size of all publish messages (subset of dispatch_queue_bytes)
  size_t dispatch_queue_subscriber_bytes = 0;

  size_t pipeline_cmd_cache_bytes = 0;

  uint64_t io_read_cnt = 0;
  size_t io_read_bytes = 0;

  uint64_t command_cnt_main = 0;
  uint64_t command_cnt_other = 0;
  uint64_t pipelined_cmd_cnt = 0;
  uint64_t pipelined_cmd_latency = 0;      // in microseconds
  base::Histogram pipelined_latency_hist;  // distribution of per-command latencies (usec)

  // in microseconds, time spent waiting for the pipelined commands to start executing
  uint64_t pipelined_wait_latency = 0;
  uint64_t conn_received_cnt = 0;

  uint32_t num_conns_main = 0;
  uint32_t num_conns_other = 0;
  uint32_t num_blocked_clients = 0;

  // number of times the connection yielded due to max_busy_read_usec limit
  uint32_t num_read_yields = 0;
  uint64_t num_migrations = 0;
  uint64_t num_recv_provided_calls = 0;

  // Number of times the tls connection was closed by the time we started reading from it.
  uint64_t tls_accept_disconnects = 0;  // number of TLS socket disconnects during the handshake
                                        //
  uint64_t handshakes_started = 0;
  uint64_t handshakes_completed = 0;

  // Number of events when the pipeline queue was over the limit and was throttled.
  uint64_t pipeline_throttle_count = 0;
  uint64_t pipeline_dispatch_calls = 0;
  uint64_t pipeline_dispatch_commands = 0;
  uint64_t pipeline_dispatch_flush_usec = 0;

  uint64_t skip_pipeline_flushing = 0;  // number of times we skipped flushing the pipeline

  ConnectionStats& operator+=(const ConnectionStats& o);
};

struct ReplyStats {
  struct SendStats {
    int64_t count = 0;
    int64_t total_duration = 0;

    SendStats& operator+=(const SendStats& other) {
      static_assert(sizeof(SendStats) == 16u);

      count += other.count;
      total_duration += other.total_duration;
      return *this;
    }
  };

  // Send() operations that are written to sockets
  SendStats send_stats;

  size_t io_write_cnt = 0;
  size_t io_write_bytes = 0;
  absl::flat_hash_map<std::string, uint64_t> err_count;
  size_t script_error_count = 0;

  // This variable can be updated directly from shard threads when they allocate memory for replies.
  std::atomic<size_t> squashing_current_reply_size{0};

  ReplyStats() = default;
  ReplyStats(ReplyStats&& other) noexcept;
  ReplyStats& operator+=(const ReplyStats& other);
  ReplyStats& operator=(const ReplyStats& other);
};

struct FacadeStats {
  ConnectionStats conn_stats;
  ReplyStats reply_stats;

  FacadeStats& operator+=(const FacadeStats& other) {
    conn_stats += other.conn_stats;
    reply_stats += other.reply_stats;
    return *this;
  }
};

inline thread_local FacadeStats* tl_facade_stats = nullptr;

}  // namespace facade


================================================
FILE: src/facade/facade_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/facade_test.h"

#include <absl/strings/match.h>
#include <absl/strings/numbers.h>

#include "base/logging.h"

namespace facade {

using namespace testing;
using namespace std;

bool RespMatcher::MatchAndExplain(RespExpr e, MatchResultListener* listener) const {
  if (e.type != type_) {
    if (e.type == RespExpr::STRING && type_ == RespExpr::DOUBLE) {
      // Doubles are encoded as strings, unless RESP3 is selected. So parse string and try to
      // compare it.
      double d = 0;
      if (!absl::SimpleAtod(e.GetString(), &d)) {
        *listener << "\nCan't parse as double: " << e.GetString();
        return false;
      }
      e.type = RespExpr::DOUBLE;
      e.u = d;
    } else {
      *listener << "\nWrong type: " << RespExpr::TypeName(e.type);
      return false;
    }
  }

  if (type_ == RespExpr::STRING || type_ == RespExpr::ERROR) {
    RespExpr::Buffer ebuf = e.GetBuf();
    std::string_view actual{reinterpret_cast<const char*>(ebuf.data()), ebuf.size()};

    if (type_ == RespExpr::ERROR && !absl::StrContains(actual, exp_str_)) {
      *listener << "Actual does not contain '" << exp_str_ << "'";
      return false;
    }
    if (type_ == RespExpr::STRING && exp_str_ != actual) {
      *listener << "\nActual string: " << actual;
      return false;
    }
  } else if (type_ == RespExpr::INT64) {
    auto actual = get<int64_t>(e.u);
    if (exp_int_ != actual) {
      *listener << "\nActual : " << actual << " expected: " << exp_int_;
      return false;
    }
  } else if (type_ == RespExpr::DOUBLE) {
    auto actual = get<double>(e.u);
    if (abs(exp_double_ - actual) > 0.0001) {
      *listener << "\nActual : " << actual << " expected: " << exp_double_;
      return false;
    }
  } else if (type_ == RespExpr::ARRAY) {
    size_t len = get<RespVec*>(e.u)->size();
    if (len != size_t(exp_int_)) {
      *listener << "Actual length " << len << ", expected: " << exp_int_;
      return false;
    }
  }

  return true;
}

void RespMatcher::DescribeTo(std::ostream* os) const {
  *os << "is ";
  switch (type_) {
    case RespExpr::STRING:
    case RespExpr::ERROR:
      *os << exp_str_;
      break;

    case RespExpr::INT64:
      *os << exp_str_;
      break;
    case RespExpr::ARRAY:
      *os << "array of length " << exp_int_;
      break;
    case RespExpr::DOUBLE:
      *os << exp_double_;
      break;
    default:
      *os << "TBD";
      break;
  }
}

void RespMatcher::DescribeNegationTo(std::ostream* os) const {
  *os << "is not ";
}

bool RespTypeMatcher::MatchAndExplain(const RespExpr& e, MatchResultListener* listener) const {
  if (e.type != type_) {
    *listener << "\nWrong type: " << RespExpr::TypeName(e.type);
    return false;
  }

  return true;
}

void RespTypeMatcher::DescribeTo(std::ostream* os) const {
  *os << "is " << RespExpr::TypeName(type_);
}

void RespTypeMatcher::DescribeNegationTo(std::ostream* os) const {
  *os << "is not " << RespExpr::TypeName(type_);
}

void PrintTo(const RespExpr::Vec& vec, std::ostream* os) {
  *os << "Vec: [";
  if (!vec.empty()) {
    for (size_t i = 0; i < vec.size() - 1; ++i) {
      *os << vec[i] << ",";
    }
    *os << vec.back();
  }
  *os << "]\n";
}

}  // namespace facade


================================================
FILE: src/facade/facade_test.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <gmock/gmock.h>

#include <ostream>
#include <string>
#include <string_view>

#include "facade/resp_expr.h"

namespace facade {

class RespMatcher {
 public:
  RespMatcher(std::string_view val, RespExpr::Type t = RespExpr::STRING) : type_(t), exp_str_(val) {
  }

  RespMatcher(int64_t val, RespExpr::Type t = RespExpr::INT64) : type_(t), exp_int_(val) {
  }

  RespMatcher(double_t val, RespExpr::Type t = RespExpr::DOUBLE) : type_(t), exp_double_(val) {
  }
  using is_gtest_matcher = void;

  bool MatchAndExplain(RespExpr e, testing::MatchResultListener*) const;

  void DescribeTo(std::ostream* os) const;

  void DescribeNegationTo(std::ostream* os) const;

 private:
  RespExpr::Type type_;

  std::string exp_str_;
  int64_t exp_int_ = 0;
  double_t exp_double_ = 0;
};

class RespTypeMatcher {
 public:
  RespTypeMatcher(RespExpr::Type type) : type_(type) {
  }

  using is_gtest_matcher = void;

  bool MatchAndExplain(const RespExpr& e, testing::MatchResultListener*) const;

  void DescribeTo(std::ostream* os) const;

  void DescribeNegationTo(std::ostream* os) const;

 private:
  RespExpr::Type type_;
};

inline ::testing::PolymorphicMatcher<RespMatcher> ErrArg(std::string_view str) {
  return ::testing::MakePolymorphicMatcher(RespMatcher(str, RespExpr::ERROR));
}

inline ::testing::PolymorphicMatcher<RespMatcher> IntArg(int64_t ival) {
  return ::testing::MakePolymorphicMatcher(RespMatcher(ival));
}

inline ::testing::PolymorphicMatcher<RespMatcher> DoubleArg(double_t dval) {
  return ::testing::MakePolymorphicMatcher(RespMatcher(dval));
}

inline ::testing::PolymorphicMatcher<RespMatcher> ArrLen(size_t len) {
  return ::testing::MakePolymorphicMatcher(RespMatcher((int64_t)len, RespExpr::ARRAY));
}

inline ::testing::PolymorphicMatcher<RespTypeMatcher> ArgType(RespExpr::Type t) {
  return ::testing::MakePolymorphicMatcher(RespTypeMatcher(t));
}

MATCHER_P(RespArray, value, "") {
  return ExplainMatchResult(
      testing::AllOf(ArgType(RespExpr::ARRAY), testing::Property(&RespExpr::GetVec, value)), arg,
      result_listener);
}

template <typename... Args> auto RespElementsAre(const Args&... matchers) {
  return RespArray(::testing::ElementsAre(matchers...));
}

inline bool operator==(const RespExpr& left, std::string_view s) {
  return left.type == RespExpr::STRING && ToSV(left.GetBuf()) == s;
}

inline bool operator==(const RespExpr& left, int64_t val) {
  return left.type == RespExpr::INT64 && left.GetInt() == val;
}

inline bool operator!=(const RespExpr& left, std::string_view s) {
  return !(left == s);
}

inline bool operator==(std::string_view s, const RespExpr& right) {
  return right == s;
}

inline bool operator!=(std::string_view s, const RespExpr& right) {
  return !(right == s);
}

void PrintTo(const RespExpr::Vec& vec, std::ostream* os);

}  // namespace facade


================================================
FILE: src/facade/facade_types.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>
#include <string>
#include <string_view>
#include <variant>

#include "common/arg_range.h"
#include "common/backed_args.h"
#include "facade/op_status.h"

namespace facade {

#if defined(__clang__)
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
constexpr size_t kSanitizerOverhead = 24u;
#else
constexpr size_t kSanitizerOverhead = 0u;
#endif
#endif
#else
#ifdef __SANITIZE_ADDRESS__
constexpr size_t kSanitizerOverhead = 24u;
#else
constexpr size_t kSanitizerOverhead = 0u;
#endif
#endif

enum class Protocol : uint8_t { MEMCACHE = 1, REDIS = 2 };
enum class CollectionType : uint8_t { ARRAY, SET, MAP, PUSH };

using MutableSlice = std::string_view;
using CmdArgVec = std::vector<std::string_view>;
using cmn::ArgSlice;
using CmdArgList = cmn::ArgSlice;
using cmn::ArgRange;

class ParsedArgs {
 public:
  ParsedArgs() = default;

  // References backed arguments. The object must outlive this ParsedArgs.
  ParsedArgs(const cmn::BackedArguments& bargs)  // NOLINT google-explicit-constructor
      : args_(&bargs) {
  }

  ParsedArgs(ArgSlice slice)  // NOLINT google-explicit-constructor
      : args_(slice) {
  }

  ParsedArgs(const ParsedArgs& other) = default;
  ParsedArgs& operator=(const ParsedArgs& bargs) = default;

  size_t size() const {
    return std::visit([](const auto& args) { return args.size(); }, args_);
  }

  bool empty() const {
    return size() == 0;
  }

  ParsedArgs Tail() const {
    return std::visit([](const auto& args) { return args.Tail(); }, args_);
  }

  std::string_view Front() const {
    return std::visit([](const auto& args) { return args.front(); }, args_);
  }

  ArgSlice ToSlice(CmdArgVec* scratch) const {
    return std::visit([scratch](const auto& args) { return args.ToSlice(scratch); }, args_);
  }

  void ToVec(CmdArgVec* vec) const {
    std::visit([vec](const auto& args) { return args.ToVec(vec); }, args_);
  }

 private:
  struct WrapperBacked {
    WrapperBacked(const cmn::BackedArguments* args) : args_(args) {  // NOLINT
    }

    const cmn::BackedArguments* args_;
    uint32_t index_ = 0;

    ParsedArgs Tail() const {
      ParsedArgs res(*args_);
      WrapperBacked* wb = std::get_if<WrapperBacked>(&res.args_);
      wb->index_ = index_ + 1;
      return res;
    };

    size_t size() const {
      return args_->size() - index_;
    }

    std::string_view front() const {
      return args_->at(index_);
    }

    ArgSlice ToSlice(CmdArgVec* scratch) const {
      ToVec(scratch);
      return *scratch;
    }

    void ToVec(CmdArgVec* vec) const {
      vec->assign(args_->begin() + index_, args_->end());
    }
  };

  struct Slice : public ArgSlice {
    using ArgSlice::ArgSlice;
    Slice(ArgSlice other) : ArgSlice(other) {  // NOLINT
    }

    ParsedArgs Tail() const {
      return ParsedArgs{subspan(1)};
    }

    ArgSlice ToSlice(void* /*scratch*/) const {
      return *this;
    }

    void ToVec(CmdArgVec* vec) const {
      vec->assign(begin(), end());
    }
  };
  std::variant<Slice, WrapperBacked> args_;
};

inline std::string_view ToSV(std::string_view slice) {
  return slice;
}

inline std::string_view ToSV(const std::string& slice) {
  return slice;
}

inline std::string_view ToSV(std::string&& slice) = delete;

inline std::string_view ArgS(ArgSlice args, size_t i) {
  return args[i];
}

struct ErrorReply {
  explicit ErrorReply(std::string&& msg, std::string_view kind = {})
      : message{std::move(msg)}, kind{kind} {
  }
  explicit ErrorReply(std::string_view msg, std::string_view kind = {}) : message{msg}, kind{kind} {
  }
  explicit ErrorReply(const char* msg,
                      std::string_view kind = {})  // to resolve ambiguity of constructors above
      : message{std::string_view{msg}}, kind{kind} {
  }

  ErrorReply(OpStatus status)  // NOLINT google-explicit-constructor)
      : status{status} {
  }

  std::string_view ToSv() const {
    return std::visit(cmn::kToSV, message);
  }

  std::variant<std::string, std::string_view> message;
  std::string_view kind;
  std::optional<OpStatus> status{std::nullopt};
};

struct MemcacheCmdFlags {
  MemcacheCmdFlags() : raw(0) {
  }

  union {
    uint16_t raw = 0;
    struct {
      uint16_t no_reply : 1;  // q
      uint16_t meta : 1;

      // meta flags
      uint16_t base64 : 1;              // b
      uint16_t return_flags : 1;        // f
      uint16_t return_value : 1;        // v
      uint16_t return_ttl : 1;          // t
      uint16_t return_access_time : 1;  // l
      uint16_t return_hit : 1;          // h
      uint16_t return_cas : 1;          // c
    };
  };
};

static_assert(sizeof(MemcacheCmdFlags) == 2);

constexpr unsigned long long operator""_MB(unsigned long long x) {
  return 1024L * 1024L * x;
}

constexpr unsigned long long operator""_KB(unsigned long long x) {
  return 1024L * x;
}

void ResetStats();

// Constants for socket bufring.
constexpr uint16_t kRecvSockGid = 0;

// Size of the buffer in bufring (kRecvSockGid).
constexpr size_t kRecvBufSize = 1500;

}  // namespace facade

namespace std {
ostream& operator<<(ostream& os, cmn::ArgSlice args);
ostream& operator<<(ostream& os, facade::Protocol protocol);

}  // namespace std


================================================
FILE: src/facade/memcache_parser.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/memcache_parser.h"

#include <absl/container/flat_hash_map.h>
#include <absl/container/inlined_vector.h>
#include <absl/strings/ascii.h>
#include <absl/strings/escaping.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>
#include <absl/types/span.h>

#include "base/logging.h"
#include "base/stl_util.h"
#include "facade/facade_types.h"

namespace facade {
using namespace std;
using MP = MemcacheParser;

namespace {

int64_t ToAbsolute(uint32_t ts, uint64_t now) {
  // if expire_ts is greater than month it's a unix timestamp
  // https://github.com/memcached/memcached/blob/master/doc/protocol.txt#L139
  constexpr uint32_t kExpireLimit = 60 * 60 * 24 * 30;
  int64_t expire_ts = ts && ts <= kExpireLimit ? ts + now : ts;
  return expire_ts;
}

MP::CmdType From(string_view token) {
  static absl::flat_hash_map<string_view, MP::CmdType> cmd_map{
      {"set", MP::SET},       {"add", MP::ADD},         {"replace", MP::REPLACE},
      {"append", MP::APPEND}, {"prepend", MP::PREPEND}, {"cas", MP::CAS},
      {"get", MP::GET},       {"gets", MP::GETS},       {"gat", MP::GAT},
      {"gats", MP::GATS},     {"stats", MP::STATS},     {"incr", MP::INCR},
      {"decr", MP::DECR},     {"delete", MP::DELETE},   {"flush_all", MP::FLUSHALL},
      {"quit", MP::QUIT},     {"version", MP::VERSION},
  };

  if (token.size() == 2) {
    // META_COMMANDS
    if (token[0] != 'm')
      return MP::INVALID;
    switch (token[1]) {
      case 's':
        return MP::META_SET;
      case 'g':
        return MP::META_GET;
      case 'd':
        return MP::META_DEL;
      case 'a':
        return MP::META_ARITHM;
      case 'n':
        return MP::META_NOOP;
      case 'e':
        return MP::META_DEBUG;
    }
    return MP::INVALID;
  }

  if (token.size() > 2) {
    auto it = cmd_map.find(token);
    if (it == cmd_map.end())
      return MP::INVALID;
    return it->second;
  }
  return MP::INVALID;
}

MP::Result ParseStore(ArgSlice tokens, int64_t now, MP::Command* res, uint32_t max_value_len) {
  DCHECK_EQ(res->size(), 0u);

  const size_t num_tokens = tokens.size();
  unsigned opt_pos = 4;
  if (res->type == MP::CAS) {
    if (num_tokens <= opt_pos)
      return MP::PARSE_ERROR;
    ++opt_pos;
  }

  // tokens[0] is key
  uint32_t bytes_len = 0;
  uint32_t flags;
  uint32_t expire_ts;
  if (!absl::SimpleAtoi(tokens[1], &flags) || !absl::SimpleAtoi(tokens[2], &expire_ts) ||
      !absl::SimpleAtoi(tokens[3], &bytes_len))
    return MP::BAD_INT;

  if (bytes_len > max_value_len) {
    LOG_EVERY_T(WARNING, 1) << "Memcache value size " << bytes_len << " exceeds max_bulk_len "
                            << max_value_len;
    return MP::PARSE_ERROR;
  }

  res->expire_ts = ToAbsolute(expire_ts, now);

  if (res->type == MP::CAS && !absl::SimpleAtoi(tokens[4], &res->cas_unique)) {
    return MP::BAD_INT;
  }

  res->flags = flags;
  if (num_tokens == opt_pos + 1) {
    if (tokens[opt_pos] == "noreply") {
      res->cmd_flags.no_reply = true;
    } else {
      return MP::PARSE_ERROR;
    }
  } else if (num_tokens > opt_pos + 1) {
    return MP::PARSE_ERROR;
  }

  string_view key = tokens[0];
  res->backed_args->PushArg(key);
  res->backed_args->PushArg(bytes_len);

  return MP::OK;
}

MP::Result ParseValueless(ArgSlice tokens, int64_t now, MP::Command* res) {
  const size_t num_tokens = tokens.size();
  size_t key_pos = 0;
  uint32_t expire_ts;
  if (res->type == MP::GAT || res->type == MP::GATS) {
    if (!absl::SimpleAtoi(tokens[0], &expire_ts)) {
      return MP::BAD_INT;
    }
    res->expire_ts = ToAbsolute(expire_ts, now);
    ++key_pos;
  }

  // We support only `flushall` or `flushall 0`
  if (key_pos < num_tokens && res->type == MP::FLUSHALL) {
    DCHECK_EQ(res->size(), 0u);

    int delay = 0;
    if (key_pos + 1 == num_tokens && absl::SimpleAtoi(tokens[key_pos], &delay) && delay == 0)
      return MP::OK;
    return MP::PARSE_ERROR;
  }

  if (key_pos >= num_tokens)
    return MP::PARSE_ERROR;

  res->cmd_flags.return_cas = (res->type == MP::GETS || res->type == MP::GATS);
  res->cmd_flags.return_value = true;
  res->cmd_flags.return_flags = true;

  res->backed_args->PushArg(tokens[key_pos++]);

  if (key_pos < num_tokens && res->type == MP::STATS)
    return MP::PARSE_ERROR;  // we don't support additional arguments to stats for now

  if (res->type == MP::INCR || res->type == MP::DECR) {
    if (key_pos == num_tokens)
      return MP::PARSE_ERROR;

    if (!absl::SimpleAtoi(tokens[key_pos], &res->delta))
      return MP::BAD_DELTA;
    ++key_pos;
  }

  while (key_pos < num_tokens) {
    res->backed_args->PushArg(tokens[key_pos++]);
  }

  if (res->type >= MP::DELETE) {  // write commands
    if (res->size() > 1 && res->backed_args->back() == "noreply") {
      res->cmd_flags.no_reply = true;
      res->backed_args->PopArg();
    }
  }

  return MP::OK;
}

bool ParseMetaMode(char m, MP::Command* res) {
  if (res->type == MP::SET) {
    switch (m) {
      case 'E':
        res->type = MP::ADD;
        break;
      case 'A':
        res->type = MP::APPEND;
        break;
      case 'R':
        res->type = MP::REPLACE;
        break;
      case 'P':
        res->type = MP::PREPEND;
        break;
      case 'S':
        break;
      default:
        return false;
    }
    return true;
  }

  if (res->type == MP::INCR) {
    switch (m) {
      case 'I':
      case '+':
        break;
      case 'D':
      case '-':
        res->type = MP::DECR;
        break;
      default:
        return false;
    }
    return true;
  }
  return false;
}

// See https://raw.githubusercontent.com/memcached/memcached/refs/heads/master/doc/protocol.txt
MP::Result ParseMeta(ArgSlice tokens, int64_t now, MP::Command* res, uint32_t max_value_len) {
  DCHECK(!tokens.empty());

  if (res->type == MP::META_DEBUG) {
    LOG(ERROR) << "meta debug not yet implemented";
    return MP::PARSE_ERROR;
  }

  if (tokens[0].size() > 250)
    return MP::PARSE_ERROR;

  res->cmd_flags.meta = true;
  res->flags = 0;
  res->expire_ts = 0;

  string_view arg0 = tokens[0];
  tokens.remove_prefix(1);
  uint32_t bytes_len = 0;

  // We emulate the behavior by returning the high level commands.
  // TODO: we should reverse the interface in the future, so that a high level command
  // will be represented in MemcacheParser::Command by a meta command with flags.
  // high level commands should not be part of the interface in the future.
  switch (res->type) {
    case MP::META_GET:
      res->type = MP::GET;
      break;
    case MP::META_DEL:
      res->type = MP::DELETE;
      break;
    case MP::META_SET:
      if (tokens.empty())
        return MP::PARSE_ERROR;
      if (!absl::SimpleAtoi(tokens[0], &bytes_len))
        return MP::BAD_INT;
      if (bytes_len > max_value_len) {
        LOG_EVERY_T(WARNING, 1) << "Memcache value size " << bytes_len << " exceeds max_bulk_len "
                                << max_value_len;
        return MP::PARSE_ERROR;
      }

      res->type = MP::SET;
      tokens.remove_prefix(1);
      break;
    case MP::META_ARITHM:
      res->type = MP::INCR;
      res->delta = 1;
      break;
    default:
      return MP::PARSE_ERROR;
  }

  string blob;
  uint32_t expire_ts;
  for (size_t i = 0; i < tokens.size(); ++i) {
    string_view token = tokens[i];

    switch (token[0]) {
      case 'T':
        if (!absl::SimpleAtoi(token.substr(1), &expire_ts))
          return MP::BAD_INT;
        res->expire_ts = ToAbsolute(expire_ts, now);
        if (res->type == MP::GET)
          res->type = MP::GAT;
        break;
      case 'b':
        if (token.size() != 1)
          return MP::PARSE_ERROR;
        if (!absl::Base64Unescape(arg0, &blob))
          return MP::PARSE_ERROR;
        arg0 = blob;
        res->cmd_flags.base64 = true;
        break;
      case 'F':
        if (!absl::SimpleAtoi(token.substr(1), &res->flags))
          return MP::BAD_INT;
        break;
      case 'M':
        if (token.size() != 2 || !ParseMetaMode(token[1], res))
          return MP::PARSE_ERROR;
        break;
      case 'D':
        if (!absl::SimpleAtoi(token.substr(1), &res->delta))
          return MP::BAD_INT;
        break;
      case 'q':
        res->cmd_flags.no_reply = true;
        break;
      case 'f':
        res->cmd_flags.return_flags = true;
        break;
      case 'v':
        res->cmd_flags.return_value = true;
        break;
      case 't':
        res->cmd_flags.return_ttl = true;
        break;
      case 'l':
        res->cmd_flags.return_access_time = true;
        break;
      case 'h':
        res->cmd_flags.return_hit = true;
        break;
      case 'c':
        res->cmd_flags.return_cas = true;
        break;
      default:
        LOG(WARNING) << "unknown meta flag: " << token;  // not yet implemented
        return MP::PARSE_ERROR;
    }
  }
  res->backed_args->PushArg(arg0);
  if (MP::IsStoreCmd(res->type)) {
    res->backed_args->PushArg(bytes_len);
  }
  return MP::OK;
}

}  // namespace

auto MP::Parse(string_view str, uint32_t* consumed, Command* cmd) -> Result {
  DVLOG(1) << "Parsing memcache input: [" << str << "]";

  *consumed = 0;

  if (val_len_to_read_ > 0) {
    return ConsumeValue(str, consumed, cmd);
  }

  cmd->cmd_flags.raw = 0;  // re-initialize

  size_t pos = str.find('\n');
  if (pos == string_view::npos) {
    // We need more data to parse the command. For get/gets commands this line can be very long.
    // we limit maximum buffer capacity in the higher levels using max_client_iobuf_len.
    tmp_buf_.append(str);
    *consumed = str.size();
    return INPUT_PENDING;
  }

  *consumed = pos + 1;
  string_view main_cmd;

  if (tmp_buf_.empty()) {
    main_cmd = str.substr(0, pos);
  } else {
    tmp_buf_.append(str.substr(0, pos));
    main_cmd = tmp_buf_;
  }

  // main_cmd is \n stripped, so it should end with \r.
  if (main_cmd.empty() || main_cmd.back() != '\r') {
    return PARSE_ERROR;
  }
  main_cmd.remove_suffix(1);  // remove trailing \r

  // cas <key> <flags> <exptime> <bytes> <cas unique> [noreply]\r\n
  // get <key>*\r\n
  // ms <key> <datalen> <flags>*\r\n
  absl::InlinedVector<string_view, 32> tokens =
      absl::StrSplit(main_cmd, ' ', absl::SkipWhitespace());

  Result res = ParseInternal(absl::MakeSpan(tokens), cmd);
  tmp_buf_.clear();
  if (val_len_to_read_ > 0)
    return ConsumeValue(str.substr(pos + 1), consumed, cmd);
  return res;
};

auto MP::ParseInternal(ArgSlice tokens_view, Command* cmd) -> Result {
  if (tokens_view.empty())
    return PARSE_ERROR;

  cmd->type = From(tokens_view[0]);
  if (cmd->type == INVALID) {
    return UNKNOWN_CMD;
  }

  tokens_view.remove_prefix(1);
  cmd->backed_args->clear();

  if (cmd->type <= CAS) {                                         // Store command
    if (tokens_view.size() < 4 || tokens_view[0].size() > 250) {  // key length limit
      return MP::PARSE_ERROR;
    }

    auto res = ParseStore(tokens_view, last_unix_time_, cmd, max_value_len_);
    if (res != MP::OK)
      return res;
    val_len_to_read_ = cmd->value().size() + 2;
    return MP::OK;
  }

  if (cmd->type >= META_SET) {
    if (tokens_view.empty())
      return MP::PARSE_ERROR;

    auto res = ParseMeta(tokens_view, last_unix_time_, cmd, max_value_len_);
    if (res != MP::OK)
      return res;

    if (IsStoreCmd(cmd->type)) {
      val_len_to_read_ = cmd->value().size() + 2;
      res = MP::OK;
    }
    return res;
  }

  if (tokens_view.empty()) {
    if (base::_in(cmd->type, {MP::STATS, MP::FLUSHALL, MP::QUIT, MP::VERSION, MP::META_NOOP})) {
      return MP::OK;
    }
    return MP::PARSE_ERROR;
  }

  return ParseValueless(tokens_view, last_unix_time_, cmd);
}

auto MP::ConsumeValue(std::string_view str, uint32_t* consumed, Command* dest) -> Result {
  DCHECK_EQ(dest->size(), 2u);  // key and value
  DCHECK_GT(val_len_to_read_, 0u);

  if (val_len_to_read_ > 2) {
    uint32_t need_copy = val_len_to_read_ - 2;
    uint32_t dest_len = dest->backed_args->elem_len(1);
    DCHECK_GE(dest_len, need_copy);  // should be ensured during parsing

    char* start = dest->value_ptr() + (dest_len - need_copy);
    uint32_t to_fill = std::min<uint32_t>(need_copy, str.size());
    if (to_fill) {
      memcpy(start, str.data(), to_fill);
      val_len_to_read_ -= to_fill;
      *consumed += to_fill;
      str.remove_prefix(to_fill);
    }
  }

  if (str.empty()) {
    return MP::INPUT_PENDING;
  }

  DCHECK(val_len_to_read_ <= 2u && val_len_to_read_ > 0);
  // consume \r\n
  char end[] = "\r\n";

  do {
    if (str.front() != end[2 - val_len_to_read_])  // val_len_to_read_ 2 -> '\r', 1 -> '\n'
      return MP::PARSE_ERROR;

    ++(*consumed);
    --val_len_to_read_;
    str.remove_prefix(1);
  } while (val_len_to_read_ && !str.empty());

  return val_len_to_read_ > 0 ? MP::INPUT_PENDING : MP::OK;
}

}  // namespace facade


================================================
FILE: src/facade/memcache_parser.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <string>
#include <string_view>
#include <vector>

#include "common/backed_args.h"
#include "facade/facade_types.h"

namespace facade {

// Memcache parser does not parse value blobs, only the commands.
// The expectation is that the caller will parse the command and
// then will follow up with reading the blob data directly from source.
class MemcacheParser {
 public:
  explicit MemcacheParser(uint32_t max_value_len = UINT32_MAX) : max_value_len_(max_value_len) {
  }

  enum CmdType : uint8_t {
    INVALID = 0,
    SET = 1,
    ADD = 2,
    REPLACE = 3,
    APPEND = 4,
    PREPEND = 5,
    CAS = 6,

    // Retrieval
    GET = 10,
    GETS = 11,
    GAT = 12,
    GATS = 13,
    STATS = 14,

    QUIT = 20,
    VERSION = 21,

    // The rest of write commands.
    DELETE = 31,
    INCR = 32,
    DECR = 33,
    FLUSHALL = 34,

    // META_COMMANDS
    META_NOOP = 50,
    META_SET = 51,
    META_DEL = 52,
    META_ARITHM = 53,
    META_GET = 54,
    META_DEBUG = 55,
  };

  // According to https://github.com/memcached/memcached/wiki/Commands#standard-protocol
  struct Command {
    Command() = default;
    Command(const Command&) = delete;
    Command(Command&&) noexcept = default;

    CmdType type = INVALID;

    std::string_view key() const {
      return backed_args->empty() ? std::string_view{} : backed_args->Front();
    }

    // For STORE commands, value is at index 1.
    // For both key and value we provide convenience accessors that return empty string_view
    // if not present.
    std::string_view value() const {
      return backed_args->size() < 2 ? std::string_view{} : backed_args->at(1);
    }

    size_t size() const {
      return backed_args->size();
    }

    char* value_ptr() {  // NOLINT
      return backed_args->data(1);
    }

    union {
      uint64_t cas_unique = 0;  // for CAS COMMAND
      uint64_t delta;           // for DECR/INCR commands.
    };

    int64_t expire_ts = 0;  // unix time (expire_ts > month) in seconds

    // flags for STORE commands
    uint32_t flags = 0;

    MemcacheCmdFlags cmd_flags;

    // Does not own this object, only references it.
    cmn::BackedArguments* backed_args = nullptr;
  };

  static_assert(sizeof(Command) == 40);

  enum Result : uint8_t {
    OK,
    INPUT_PENDING,
    UNKNOWN_CMD,
    BAD_INT,
    PARSE_ERROR,  // request parse error, but can continue parsing within the same connection.
    BAD_DELTA,
  };

  static bool IsStoreCmd(CmdType type) {
    return type >= SET && type <= CAS;
  }

  size_t UsedMemory() const {
    return tmp_buf_.capacity();
  }

  void Reset() {
    val_len_to_read_ = 0;
    tmp_buf_.clear();
  }

  Result Parse(std::string_view str, uint32_t* consumed, Command* res);

  void set_last_unix_time(int64_t t) {
    last_unix_time_ = t;
  }

 private:
  Result ConsumeValue(std::string_view str, uint32_t* consumed, Command* dest);
  Result ParseInternal(ArgSlice tokens_view, Command* cmd);

  uint32_t val_len_to_read_ = 0;
  uint32_t max_value_len_ = UINT32_MAX;
  std::string tmp_buf_;
  int64_t last_unix_time_ = 0;
};

}  // namespace facade


================================================
FILE: src/facade/memcache_parser_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/memcache_parser.h"

#include <gmock/gmock.h>

#include "absl/strings/str_cat.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"

using namespace testing;
using namespace std;

namespace facade {

class MCParserTest : public testing::Test {
 protected:
  MCParserTest() {
    cmd_.backed_args = &backed_args_;
  }
  MemcacheParser::Result Parse(string_view input) {
    parser_.Reset();
    return parser_.Parse(input, &consumed_, &cmd_);
  }

  vector<string_view> ToArgs() const {
    return {cmd_.backed_args->begin(), cmd_.backed_args->end()};
  }

  MemcacheParser parser_;
  cmn::BackedArguments backed_args_;
  MemcacheParser::Command cmd_;
  uint32_t consumed_;
};

TEST_F(MCParserTest, Basic) {
  MemcacheParser::Result st = Parse("set a 1 20 3\r\n");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ("a", cmd_.key());
  EXPECT_EQ(1, cmd_.flags);
  EXPECT_EQ(20, cmd_.expire_ts);
  EXPECT_EQ(3, cmd_.value().size());
  EXPECT_EQ(MemcacheParser::SET, cmd_.type);

  st = Parse("quit\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(MemcacheParser::QUIT, cmd_.type);
}

TEST_F(MCParserTest, Incr) {
  MemcacheParser::Result st = Parse("incr a\r\n");
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, st);

  st = Parse("incr a 1\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(MemcacheParser::INCR, cmd_.type);
  EXPECT_EQ("a", cmd_.key());
  EXPECT_EQ(1, cmd_.delta);
  EXPECT_FALSE(cmd_.cmd_flags.no_reply);

  st = Parse("incr a -1\r\n");
  EXPECT_EQ(MemcacheParser::BAD_DELTA, st);

  st = Parse("decr b 10 noreply\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(MemcacheParser::DECR, cmd_.type);
  EXPECT_EQ(10, cmd_.delta);
}

TEST_F(MCParserTest, Stats) {
  MemcacheParser::Result st = Parse("stats foo\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(consumed_, 11);
  EXPECT_EQ(cmd_.type, MemcacheParser::STATS);
  EXPECT_EQ("foo", cmd_.key());

  st = Parse("stats  \r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(consumed_, 9);
  EXPECT_EQ(cmd_.type, MemcacheParser::STATS);
  EXPECT_EQ(0, cmd_.size());

  st = Parse("stats  fpp bar\r\n");
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, st);
}

TEST_F(MCParserTest, NoreplyBasic) {
  MemcacheParser::Result st = Parse("set mykey 1 2 3 noreply\r\n");

  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ("mykey", cmd_.key());
  EXPECT_EQ(1, cmd_.flags);
  EXPECT_EQ(2, cmd_.expire_ts);
  EXPECT_EQ(3, cmd_.value().size());
  EXPECT_EQ(MemcacheParser::SET, cmd_.type);
  EXPECT_TRUE(cmd_.cmd_flags.no_reply);

  st = Parse("set mykey2 4 5 6\r\n");

  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ("mykey2", cmd_.key());
  EXPECT_EQ(4, cmd_.flags);
  EXPECT_EQ(5, cmd_.expire_ts);
  EXPECT_EQ(6, cmd_.value().size());
  EXPECT_EQ(MemcacheParser::SET, cmd_.type);
  EXPECT_FALSE(cmd_.cmd_flags.no_reply);
}

TEST_F(MCParserTest, Meta) {
  MemcacheParser::Result st = Parse("ms key1 ");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(8, consumed_);
  st = parser_.Parse("6 T1 F2\r\naaaaaa\r\n", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(17, consumed_);
  EXPECT_EQ(MemcacheParser::SET, cmd_.type);
  EXPECT_EQ("key1", cmd_.key());
  EXPECT_EQ(2, cmd_.flags);
  EXPECT_EQ(1, cmd_.expire_ts);
  st = Parse("ms 16nXnNeV150= 5 b ME\r\nbbbbb");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(29, consumed_);
  EXPECT_EQ(MemcacheParser::ADD, cmd_.type);
  EXPECT_EQ("שלום", cmd_.key());
  EXPECT_EQ(5, cmd_.value().size());

  st = Parse("mg 16nXnNeV150= b\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(19, consumed_);
  EXPECT_EQ(MemcacheParser::GET, cmd_.type);
  EXPECT_EQ("שלום", cmd_.key());

  st = Parse("ma val b\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(10, consumed_);
  EXPECT_EQ(MemcacheParser::INCR, cmd_.type);

  st = Parse("ma val M- D10\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(15, consumed_);
  EXPECT_EQ(MemcacheParser::DECR, cmd_.type);
  EXPECT_EQ(10, cmd_.delta);

  st = Parse("mg key f v t l h\r\n");
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(18, consumed_);
  EXPECT_EQ(MemcacheParser::GET, cmd_.type);
  EXPECT_EQ("key", cmd_.key());
  EXPECT_TRUE(cmd_.cmd_flags.return_flags);
  EXPECT_TRUE(cmd_.cmd_flags.return_value);
  EXPECT_TRUE(cmd_.cmd_flags.return_ttl);
  EXPECT_TRUE(cmd_.cmd_flags.return_access_time);
  EXPECT_TRUE(cmd_.cmd_flags.return_hit);
}

TEST_F(MCParserTest, Gat) {
  auto res = Parse("gat 1000 foo bar baz\r\n");
  EXPECT_EQ(MemcacheParser::OK, res);
  EXPECT_EQ(consumed_, 22);
  EXPECT_EQ(cmd_.type, MemcacheParser::GAT);
  EXPECT_THAT(ToArgs(), ElementsAre("foo", "bar", "baz"));
  EXPECT_EQ(cmd_.expire_ts, 1000);

  res = Parse("gat foo bar\r\n");
  EXPECT_EQ(MemcacheParser::BAD_INT, res);

  res = Parse("gats 1000 foo bar baz\r\n");
  EXPECT_EQ(MemcacheParser::OK, res);
  EXPECT_EQ(consumed_, 23);
  EXPECT_EQ(cmd_.type, MemcacheParser::GATS);
  EXPECT_THAT(ToArgs(), ElementsAre("foo", "bar", "baz"));
  EXPECT_EQ(cmd_.expire_ts, 1000);

  parser_.set_last_unix_time(2000);
  res = Parse("gats 1000 foo bar baz\r\n");
  EXPECT_EQ(MemcacheParser::OK, res);
  EXPECT_EQ(cmd_.expire_ts, 3000);

  res = Parse("gats 100\r\n");
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, res);

  res = Parse("gat 100\r\n");
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, res);
}

TEST_F(MCParserTest, ValueState) {
  auto st = Parse("ms key1 6\r\nabc");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(consumed_, 14);
  st = parser_.Parse("de", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(consumed_, 2);

  st = parser_.Parse("f\r", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(consumed_, 2);
  EXPECT_EQ(cmd_.value(), "abcdef");

  st = parser_.Parse("\n", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(consumed_, 1);
}

TEST_F(MCParserTest, MaxValueLen) {
  MemcacheParser capped_parser(10);
  cmn::BackedArguments ba;
  MemcacheParser::Command cmd;
  cmd.backed_args = &ba;
  uint32_t consumed;

  // Value within limit — accepted.
  auto st = capped_parser.Parse("set k 0 0 10\r\n", &consumed, &cmd);
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);

  // Value exceeds limit — rejected.
  capped_parser.Reset();
  st = capped_parser.Parse("set k 0 0 11\r\n", &consumed, &cmd);
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, st);

  // Meta set within limit.
  capped_parser.Reset();
  st = capped_parser.Parse("ms key 10\r\n", &consumed, &cmd);
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);

  // Meta set exceeds limit.
  capped_parser.Reset();
  st = capped_parser.Parse("ms key 11\r\n", &consumed, &cmd);
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, st);
}

TEST_F(MCParserTest, ParseError) {
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, Parse("ms key1 3\r\nabcd"));
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, Parse("ms key1 3\r\nabc"));
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, parser_.Parse("\ra", &consumed_, &cmd_));
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, Parse("ms key1 3\r\nabc\r"));
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, parser_.Parse("\r", &consumed_, &cmd_));
}

// Test for the bug where \r\n command line terminator split across TCP packets
// would cause parse errors.
TEST_F(MCParserTest, SplitCRLFInCommandLine) {
  // Simulate TCP fragmentation where command line ends with \r but \n comes in next packet
  auto st = Parse("set k10 0 0 3 noreply\r");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);
  EXPECT_EQ(consumed_, 22);

  // Now the \n arrives followed by the value and another command
  st = parser_.Parse("\nd10\r\nget k11\r\n", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::OK, st);
  EXPECT_EQ(consumed_, 6);  // \n + d10\r\n
  EXPECT_EQ(cmd_.type, MemcacheParser::SET);
  EXPECT_EQ(cmd_.key(), "k10");
  EXPECT_EQ(cmd_.value(), "d10");
  EXPECT_TRUE(cmd_.cmd_flags.no_reply);
}

// Test edge case: empty command line when \r\n split
TEST_F(MCParserTest, SplitCRLFEmptyCommand) {
  // Just \r with nothing before it
  auto st = Parse("\r");
  EXPECT_EQ(MemcacheParser::INPUT_PENDING, st);

  // Now \n arrives - should be parse error since command line is empty
  st = parser_.Parse("\nget key\r\n", &consumed_, &cmd_);
  EXPECT_EQ(MemcacheParser::PARSE_ERROR, st);
}

class MCParserNoreplyTest : public MCParserTest {
 protected:
  void RunTest(string_view str, bool noreply,
               MemcacheParser::Result expected_res = MemcacheParser::OK) {
    MemcacheParser::Result st = Parse(str);

    EXPECT_EQ(expected_res, st);
    EXPECT_EQ(cmd_.cmd_flags.no_reply, noreply);
  }
};

TEST_F(MCParserNoreplyTest, StoreCommands) {
  RunTest("set mykey 0 0 3 noreply\r\n", true, MemcacheParser::INPUT_PENDING);
  RunTest("set mykey 0 0 3\r\n", false, MemcacheParser::INPUT_PENDING);
  RunTest("add mykey 0 0 3\r\n", false, MemcacheParser::INPUT_PENDING);
  RunTest("replace mykey 0 0 3\r\n", false, MemcacheParser::INPUT_PENDING);
  RunTest("append mykey 0 0 3\r\n", false, MemcacheParser::INPUT_PENDING);
  RunTest("prepend mykey 0 0 3\r\n", false, MemcacheParser::INPUT_PENDING);
}

TEST_F(MCParserNoreplyTest, Other) {
  RunTest("quit\r\n", false);
  RunTest("delete mykey\r\n", false);
  RunTest("incr mykey 1\r\n", false);
  RunTest("decr mykey 1\r\n", false);
  RunTest("flush_all\r\n", false);
}

TEST_F(MCParserNoreplyTest, LargeGetRequest) {
  std::string large_request = "get";
  for (size_t i = 0; i < 100; ++i) {
    absl::StrAppend(&large_request, " mykey", i, " ");
  }
  absl::StrAppend(&large_request, "\r\n");

  RunTest(large_request, false);

  EXPECT_EQ(cmd_.type, MemcacheParser::CmdType::GET);
  auto keys = ToArgs();
  EXPECT_TRUE(std::all_of(keys.begin(), keys.end(), [i = 0u](const auto& elem) mutable {
    return elem == absl::StrCat("mykey", i++);
  }));
}

}  // namespace facade


================================================
FILE: src/facade/ok_main.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "base/init.h"
#include "facade/conn_context.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/reply_builder.h"
#include "facade/service_interface.h"
#include "util/accept_server.h"
#include "util/fibers/pool.h"

ABSL_FLAG(uint32_t, port, 6379, "server port");

using namespace util;
using namespace std;
using absl::GetFlag;

namespace facade {

namespace {

struct CmdContext : public facade::ParsedCommand {
  void ReuseInternal() final {
  }
};

class OkService : public ServiceInterface {
 public:
  DispatchResult DispatchCommand(ParsedArgs args, ParsedCommand* cmd, AsyncPreference) final {
    cmd->rb()->SendOk();
    return DispatchResult::OK;
  }

  DispatchManyResult DispatchManyCommands(std::function<ParsedArgs()> arg_gen, unsigned count,
                                          SinkReplyBuilder* builder,
                                          ConnectionContext* cntx) final {
    for (unsigned i = 0; i < count; i++) {
      ParsedArgs args = arg_gen();
      ParsedCommand* cmd = AllocateParsedCommand();
      cmd->Init(builder, cntx);

      DispatchCommand(args, cmd, AsyncPreference::ONLY_SYNC);
      delete cmd;
    }
    DispatchManyResult result{
        .processed = static_cast<uint32_t>(count),
        .account_in_stats = true,
    };
    return result;
  }

  DispatchResult DispatchMC(ParsedCommand* cmd, AsyncPreference) final {
    cmd->rb()->SendError("");
    return DispatchResult::OK;
  }

  ConnectionContext* CreateContext(Connection* owner) final {
    return new ConnectionContext{owner};
  }

  ParsedCommand* AllocateParsedCommand() final {
    return new CmdContext{};
  }
};

void RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
  OkService service;

  Connection::Init(pool->size());
  pool->Await([](auto*) { tl_facade_stats = new FacadeStats; });

  acceptor->AddListener(GetFlag(FLAGS_port), new Listener{Protocol::REDIS, &service});

  acceptor->Run();
  acceptor->Wait();
}

}  // namespace

}  // namespace facade

#ifdef __linux__
#define USE_URING 1
#else
#define USE_URING 0
#endif

int main(int argc, char* argv[]) {
  MainInitGuard guard(&argc, &argv);

  CHECK_GT(GetFlag(FLAGS_port), 0u);

#if USE_URING
  unique_ptr<util::ProactorPool> pp(fb2::Pool::IOUring(1024));
#else
  unique_ptr<util::ProactorPool> pp(fb2::Pool::Epoll());
#endif
  pp->Run();

  AcceptServer acceptor(pp.get());
  facade::RunEngine(pp.get(), &acceptor);

  pp->Stop();

  return 0;
}


================================================
FILE: src/facade/op_status.cc
================================================
#include "facade/op_status.h"

#include "base/logging.h"
#include "facade/error.h"
#include "facade/resp_expr.h"

namespace facade {

std::string_view StatusToMsg(OpStatus status) {
  switch (status) {
    case OpStatus::OK:
      return "OK";
    case OpStatus::KEY_NOTFOUND:
      return kKeyNotFoundErr;
    case OpStatus::WRONG_TYPE:
      return kWrongTypeErr;
    case OpStatus::WRONG_JSON_TYPE:
      return kWrongJsonTypeErr;
    case OpStatus::OUT_OF_RANGE:
      return kIndexOutOfRange;
    case OpStatus::INVALID_FLOAT:
      return kInvalidFloatErr;
    case OpStatus::INVALID_INT:
      return kInvalidIntErr;
    case OpStatus::SYNTAX_ERR:
      return kSyntaxErr;
    case OpStatus::OUT_OF_MEMORY:
      return kOutOfMemory;
    case OpStatus::CORRUPTED_HLL:
      return "-INVALIDOBJ Corrupted HLL object detected.";
    case OpStatus::BUSY_GROUP:
      return "-BUSYGROUP Consumer Group name already exists";
    case OpStatus::INVALID_NUMERIC_RESULT:
      return kInvalidNumericResult;
    case OpStatus::AT_LEAST_ONE_KEY:
      return "at least 1 input key is needed for this command";
    case OpStatus::MEMBER_NOTFOUND:
      return kKeyNotFoundErr;
    case OpStatus::INVALID_JSON_PATH:
      return kInvalidJsonPathErr;
    case OpStatus::INVALID_JSON:
      return kJsonParseError;
    case OpStatus::NAN_OR_INF_DURING_INCR:
      return kNanOrInfDuringIncr;
    case OpStatus::IO_ERROR:
      return kTieredIoError;
    default:
      LOG(ERROR) << "Unsupported status " << status;
      return "Internal error";
  }
}

}  // namespace facade


================================================
FILE: src/facade/op_status.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <ostream>

namespace facade {

enum class OpStatus : uint16_t {
  OK,
  KEY_EXISTS,
  KEY_NOTFOUND,
  KEY_MOVED,
  SKIPPED,
  INVALID_VALUE,
  CORRUPTED_HLL,
  OUT_OF_RANGE,
  WRONG_TYPE,
  WRONG_JSON_TYPE,
  TIMED_OUT,
  OUT_OF_MEMORY,
  INVALID_FLOAT,
  INVALID_INT,
  SYNTAX_ERR,
  BUSY_GROUP,
  STREAM_ID_SMALL,
  INVALID_NUMERIC_RESULT,
  CANCELLED,
  AT_LEAST_ONE_KEY,
  MEMBER_NOTFOUND,
  INVALID_JSON_PATH,
  INVALID_JSON,
  IO_ERROR,
  NAN_OR_INF_DURING_INCR,
};

class OpResultBase {
 public:
  OpResultBase(OpStatus st = OpStatus::OK) : st_(st) {
  }

  constexpr explicit operator bool() const {
    return st_ == OpStatus::OK;
  }

  OpStatus status() const {
    return st_;
  }

  bool operator==(OpStatus st) const {
    return st_ == st;
  }

  bool ok() const {
    return st_ == OpStatus::OK;
  }

  const char* DebugFormat() const;

 private:
  OpStatus st_;
};

template <typename V> class OpResult : public OpResultBase {
 public:
  using Type = V;

  OpResult(V&& v) : v_(std::move(v)) {
  }

  OpResult(const V& v) : v_(v) {
  }

  using OpResultBase::OpResultBase;

  const V& value() const {
    return v_;
  }

  V& value() {
    return v_;
  }

  V value_or(V v) const {
    return status() == OpStatus::OK ? v_ : v;
  }

  V* operator->() {
    return &v_;
  }

  V& operator*() & {
    return v_;
  }

  V&& operator*() && {
    return std::move(v_);
  }

  const V* operator->() const {
    return &v_;
  }

  const V& operator*() const& {
    return v_;
  }

 private:
  V v_{};
};

template <> class OpResult<void> : public OpResultBase {
 public:
  using OpResultBase::OpResultBase;
};

inline bool operator==(OpStatus st, const OpResultBase& ob) {
  return ob.operator==(st);
}

std::string_view StatusToMsg(OpStatus status);

}  // namespace facade

namespace std {

template <typename T> std::ostream& operator<<(std::ostream& os, const facade::OpResult<T>& res) {
  os << res.status();
  return os;
}

inline std::ostream& operator<<(std::ostream& os, const facade::OpStatus op) {
  os << int(op);
  return os;
}

}  // namespace std


================================================
FILE: src/facade/parsed_command.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/parsed_command.h"

#include "base/logging.h"
#include "core/overloaded.h"
#include "facade/conn_context.h"
#include "facade/dragonfly_connection.h"
#include "facade/reply_builder.h"
#include "facade/reply_capture.h"
#include "facade/reply_payload.h"

namespace facade {

using namespace std;

string MCRender::RenderNotFound() const {
  if (flags_.no_reply)
    return {};
  return flags_.meta ? "NF" : "NOT_FOUND";
}

string MCRender::RenderGetEnd() const {
  if (flags_.no_reply || flags_.meta)
    return {};
  return "END";
}

std::string MCRender::RenderStored(bool ok) const {
  if (flags_.no_reply)
    return {};
  if (ok)
    return flags_.meta ? "HD" : "STORED";
  return flags_.meta ? "NS" : "NOT_STORED";
}

string MCRender::RenderMiss() const {
  if (flags_.no_reply || !flags_.meta)
    return {};
  return "EN";
}

string MCRender::RenderDeleted() const {
  if (flags_.no_reply)
    return {};
  return flags_.meta ? "HD" : "DELETED";
}

void ParsedCommand::ResetForReuse() {
  is_deferred_reply_ = false;
  reply_ = std::monostate{};

  offsets_.clear();
  if (HeapMemory() > 1024) {
    storage_.clear();  // also deallocates the heap.
    offsets_.shrink_to_fit();
  }
  ReuseInternal();
}

void ParsedCommand::SendError(std::string_view str, std::string_view type) {
  if (!is_deferred_reply_) {
    rb_->SendError(str, type);
  } else {
    reply_ = payload::make_error(str, type);
  }
}

void ParsedCommand::SendError(facade::OpStatus status) {
  if (!is_deferred_reply_) {
    rb_->SendError(status);
  } else {
    if (status == OpStatus::OK)
      reply_ = payload::SimpleString{"OK"};
    else
      reply_ = payload::make_error(StatusToMsg(status));
  }
}

void ParsedCommand::SendError(const facade::ErrorReply& error) {
  if (error.status)
    return SendError(*error.status);
  SendError(error.ToSv(), error.kind);
}

void ParsedCommand::SendSimpleString(std::string_view str) {
  if (!is_deferred_reply_) {
    rb_->SendSimpleString(str);
  } else {
    reply_ = payload::make_simple_or_noreply(str);
  }
}

void ParsedCommand::SendLong(long val) {
  DCHECK(!is_deferred_reply_);
  rb_->SendLong(val);
}

bool ParsedCommand::CanReply() const {
  DCHECK(is_deferred_reply_);
  dfly::Overloaded ov{[](const payload::Payload& pl) { return pl.index() > 0 /* not monostate */; },
                      [](const SuspendedCommand& task) { return task.blocker->IsCompleted(); }};
  return std::visit(ov, reply_);
}

void ParsedCommand::SendReply() {
  auto payload_handler = [this](payload::Payload& pl) {
    CapturingReplyBuilder::Apply(std::move(pl), rb_);
  };
  auto task_handler = [](SuspendedCommand& task) {
    DCHECK(task.coro);
    task.coro.resume();
    task.coro = {};
  };
  std::visit(dfly::Overloaded{task_handler, payload_handler}, reply_);
}

ParsedCommand::SuspendedCommand::~SuspendedCommand() {
  if (coro) {
    coro.destroy();
    coro = {};
  }
}

}  // namespace facade


================================================
FILE: src/facade/parsed_command.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <coroutine>
#include <variant>

#include "base/function2.hpp"
#include "common/backed_args.h"
#include "facade/memcache_parser.h"
#include "facade/reply_payload.h"
#include "util/fibers/synchronization.h"

namespace facade {

class ConnectionContext;
class SinkReplyBuilder;

// Renders simple string responses based on flags.
// Returns empty string if no response is to be sent.
class MCRender {
 public:
  explicit MCRender(MemcacheCmdFlags flags) : flags_(flags) {
  }

  std::string RenderNotFound() const;
  std::string RenderMiss() const;
  std::string RenderDeleted() const;
  std::string RenderGetEnd() const;
  std::string RenderStored(bool ok) const;

 private:
  MemcacheCmdFlags flags_;
};

// ParsedCommand is a protocol-agnostic holder for parsed request state.
// It wraps cmn::BackedArguments so the facade can populate RESP arguments and
// optionally attach a MemcacheParser::Command, complementing the arguments
// with memcache-specific data.
// The purpose of ParsedCommand is to hold the entire state of a parsed request
// during its lifetime, from parsing to dispatching and reply building including
// any async dispatching.
class ParsedCommand : public cmn::BackedArguments {
  friend class ServiceInterface;

 protected:
  SinkReplyBuilder* rb_ = nullptr;  // either RedisReplyBuilder or MCReplyBuilder
  ConnectionContext* conn_cntx_ = nullptr;

  std::unique_ptr<MemcacheParser::Command> mc_cmd_;  // only for memcache protocol

  ParsedCommand() = default;

  // Helper function to get the only argument type
  template <typename C, typename Arg> static Arg OnlyArgType(void (C::*)(Arg) const);

 public:
  using ReplyFunc = fu2::function_base<true, false, fu2::capacity_fixed<16, 8>, false, false,
                                       void(SinkReplyBuilder*)>;

  virtual ~ParsedCommand() = default;

  virtual size_t GetSize() const {
    return sizeof(ParsedCommand);
  }

  // time when the message was parsed as reported by CycleClock::Now()
  // Also serves as the enqueue timestamp for calculating pipeline wait latency.
  uint64_t parsed_cycle = 0;
  ParsedCommand* next = nullptr;

  void Init(SinkReplyBuilder* rb, ConnectionContext* conn_cntx) {
    rb_ = rb;
    conn_cntx_ = conn_cntx;
  }

  // If true, creates mc specific fields, false - destroys them.
  void ConfigureMCExtension(bool is_mc) {
    if (is_mc && !mc_cmd_) {
      mc_cmd_ = std::make_unique<MemcacheParser::Command>();
      mc_cmd_->backed_args = this;
    } else if (!is_mc) {
      mc_cmd_.reset();
    }
  }

  SinkReplyBuilder* rb() const {
    return rb_;
  }

  ConnectionContext* conn_cntx() const {
    return conn_cntx_;
  }
  MemcacheParser::Command* mc_command() const {
    return mc_cmd_.get();
  }

  size_t UsedMemory() const {
    size_t sz = HeapMemory() + GetSize();
    if (mc_cmd_) {
      sz += sizeof(*mc_cmd_);
    }
    return sz;
  }

  // Marks this command as having reply stored in its payload instead of being sent directly.
  void SetDeferredReply() {
    is_deferred_reply_ = true;
  }

  bool IsDeferredReply() const {
    return is_deferred_reply_;
  }

  void ResetForReuse();

  void SendError(std::string_view str, std::string_view type = std::string_view{});
  void SendError(facade::OpStatus status);
  void SendError(const facade::ErrorReply& error);

  void SendSimpleString(std::string_view str);
  void SendOk() {
    SendSimpleString("OK");
  }

  void SendLong(long val);
  template <typename F> void ReplyWith(F&& func) {
    assert(!is_deferred_reply_);
    using RbType = decltype(OnlyArgType(&std::decay_t<F>::operator()));
    func(static_cast<RbType>(rb_));
  }

  // Below are main commands for the async api and all assume that the command defers replies

  // Whether SendReply() can be called. If not, it must be waited via Blocker()
  bool CanReply() const;

  // Reaching zero on blocker means CanReply() turns true
  util::fb2::EmbeddedBlockingCounter* Blocker() const {
    return std::get<SuspendedCommand>(reply_).blocker;
  }

  // Assumes CanReply() is true. Sends reply
  void SendReply();

  // Resolve deferred command with reply
  void Resolve(const facade::ErrorReply& error) {
    SendError(error);
  }

  // Resolve deferred command with async task
  void Resolve(util::fb2::EmbeddedBlockingCounter* blocker, std::coroutine_handle<> coro) {
    reply_ = SuspendedCommand{blocker, coro};
  }

 protected:
  virtual void ReuseInternal() = 0;

 private:
  // Suspended asynchronous command. Once blocker is done, the coroutine can be resumed.
  // Deletes the coroutine on drop.
  struct SuspendedCommand {
    SuspendedCommand(util::fb2::EmbeddedBlockingCounter* blocker, std::coroutine_handle<> coro)
        : blocker{blocker}, coro{coro} {
    }

    SuspendedCommand(SuspendedCommand&& other) noexcept
        : blocker{other.blocker}, coro{std::exchange(other.coro, {})} {
    }

    SuspendedCommand& operator=(SuspendedCommand&& other) noexcept {
      blocker = other.blocker;
      coro = std::exchange(other.coro, {});
      return *this;
    }

    // To destroy the coroutine when cancelling (as the handle is non owning)
    ~SuspendedCommand();

    util::fb2::EmbeddedBlockingCounter* blocker;
    std::coroutine_handle<> coro;
  };

  // if false then the reply was sent directly to reply builder,
  // otherwise, moved asynchronously into reply_payload_
  bool is_deferred_reply_ = false;

  std::variant<payload::Payload, SuspendedCommand> reply_;
};

#ifdef __linux__
static_assert(sizeof(ParsedCommand) == 232);
#endif

}  // namespace facade


================================================
FILE: src/facade/redis_parser.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/redis_parser.h"

#include <absl/strings/escaping.h>
#include <absl/strings/numbers.h>

#include "base/logging.h"
#include "common/heap_size.h"

namespace facade {

using namespace std;

auto RedisParser::Parse(Buffer str, uint32_t* consumed, RespExpr::Vec* res) -> Result {
  DCHECK(!str.empty());
  *consumed = 0;
  res->clear();

  DVLOG(2) << "Parsing: "
           << absl::CHexEscape(string_view{reinterpret_cast<const char*>(str.data()), str.size()});

  if (state_ == CMD_COMPLETE_S) {
    if (InitStart(str[0], res)) {
      // We recognized a non-INLINE state, starting with a special char.
      str.remove_prefix(1);
      *consumed += 1;
      if (server_mode_ && state_ == PARSE_ARG_S) {  // server requests start with ARRAY_LEN_S.
        state_ = CMD_COMPLETE_S;                    // reject and reset the state.
        return BAD_ARRAYLEN;
      }
      if (str.empty())
        return INPUT_PENDING;
    }
  } else {  // INLINE mode, aka PING\n
    // We continue parsing in the middle.
    if (!cached_expr_)
      cached_expr_ = res;
  }
  DCHECK(state_ != CMD_COMPLETE_S);

  ResultConsumed resultc{OK, 0};

  do {
    switch (state_) {
      case MAP_LEN_S:
      case ARRAY_LEN_S:
        resultc = ConsumeArrayLen(str);
        break;
      case PARSE_ARG_TYPE:
        arg_c_ = str[0];
        if (server_mode_ && arg_c_ != '$')  // server side only supports bulk strings.
          return BAD_BULKLEN;
        resultc.second = 1;
        state_ = PARSE_ARG_S;
        break;
      case PARSE_ARG_S:
        resultc = ParseArg(str);
        break;
      case INLINE_S:
        DCHECK(parse_stack_.empty());
        resultc = ParseInline(str);
        break;
      case BULK_STR_S:
        resultc = ConsumeBulk(str);
        break;
      case SLASH_N_S:
        if (str[0] != '\n') {
          resultc.first = BAD_STRING;
        } else {
          resultc = {OK, 1};
          if (arg_c_ == '_') {
            cached_expr_->emplace_back(RespExpr::NIL);
            cached_expr_->back().u = Buffer{};
          }
          HandleFinishArg();
        }
        break;
      default:
        LOG(FATAL) << "Unexpected state " << int(state_);
    }

    *consumed += resultc.second;
    str.remove_prefix(exchange(resultc.second, 0));
  } while (state_ != CMD_COMPLETE_S && resultc.first == OK && !str.empty());

  if (state_ != CMD_COMPLETE_S) {
    if (resultc.first == OK) {
      resultc.first = INPUT_PENDING;
    }

    if (resultc.first == INPUT_PENDING) {
      // TODO: we still need to handle ':' and ',' cases for client mode
      // to consume them completely.
      if (server_mode_ && !str.empty()) {
        LOG(DFATAL) << "Did not consume all input: "
                    << absl::CHexEscape({reinterpret_cast<const char*>(str.data()), str.size()})
                    << ", state: " << int(state_) << " smallbuf: "
                    << absl::CHexEscape(
                           {reinterpret_cast<const char*>(small_buf_.data()), small_len_});
      }
      StashState(res);
    }
    return resultc.first;
  }

  if (resultc.first == OK) {
    DCHECK(cached_expr_);
    DCHECK_EQ(0, small_len_);

    if (res != cached_expr_) {
      DCHECK(!stash_.empty());

      *res = *cached_expr_;
    }
  }

  return resultc.first;
}

bool RedisParser::InitStart(char prefix_b, RespExpr::Vec* res) {
  buf_stash_.clear();
  stash_.clear();
  cached_expr_ = res;
  parse_stack_.clear();
  last_stashed_level_ = 0;
  last_stashed_index_ = 0;

  switch (prefix_b) {
    case '$':
    case ':':
    case '+':
    case '-':
    case '_':  // Resp3 NULL
    case ',':  // Resp3 DOUBLE
      state_ = PARSE_ARG_S;
      parse_stack_.emplace_back(1, cached_expr_);  // expression of length 1.
      arg_c_ = prefix_b;
      return true;
    case '*':
    case '~':  // Resp3 SET
      state_ = ARRAY_LEN_S;
      return true;
    case '%':  // Resp3 MAP
      state_ = MAP_LEN_S;
      return true;
  }

  state_ = INLINE_S;
  return false;
}

void RedisParser::StashState(RespExpr::Vec* res) {
  if (cached_expr_->empty() && stash_.empty()) {
    cached_expr_ = nullptr;
    return;
  }

  if (cached_expr_ == res) {
    stash_.emplace_back(new RespExpr::Vec(*res));
    cached_expr_ = stash_.back().get();
  }

  DCHECK_LT(last_stashed_level_, stash_.size());
  while (true) {
    auto& cur = *stash_[last_stashed_level_];

    for (; last_stashed_index_ < cur.size(); ++last_stashed_index_) {
      auto& e = cur[last_stashed_index_];
      if (RespExpr::STRING == e.type) {
        Buffer& ebuf = get<Buffer>(e.u);
        if (ebuf.empty() && last_stashed_index_ + 1 == cur.size())
          break;
        if (!ebuf.empty() && !e.has_support) {
          Blob blob(ebuf.size());
          memcpy(blob.data(), ebuf.data(), ebuf.size());
          ebuf = Buffer{blob.data(), blob.size()};
          buf_stash_.push_back(std::move(blob));
          e.has_support = true;
        }
      }
    }

    if (last_stashed_level_ + 1 == stash_.size())
      break;
    ++last_stashed_level_;
    last_stashed_index_ = 0;
  }
}

auto RedisParser::ParseInline(Buffer str) -> ResultConsumed {
  DCHECK(!str.empty());

  const uint8_t* ptr = str.begin();
  const uint8_t* end = str.end();
  const uint8_t* token_start = ptr;

  auto find_token_end = [](const uint8_t* ptr, const uint8_t* end) {
    while (ptr != end && *ptr > 32)
      ++ptr;
    return ptr;
  };

  if (is_broken_token_) {
    ptr = find_token_end(ptr, end);
    size_t len = ptr - token_start;

    ExtendLastString(Buffer(token_start, len));
    if (ptr == end) {
      return {INPUT_PENDING, ptr - token_start};
    }
    is_broken_token_ = false;
  }

  while (ptr != end) {
    // For inline input we only require \n.
    if (*ptr == '\n') {
      if (cached_expr_->empty()) {
        ++ptr;
        continue;  // skip empty line
      }
      break;
    }

    if (*ptr <= 32) {  // skip ws/control chars
      ++ptr;
      continue;
    }

    // token start
    DCHECK(!is_broken_token_);

    token_start = ptr;
    ptr = find_token_end(ptr, end);

    cached_expr_->emplace_back(RespExpr::STRING);
    cached_expr_->back().u = Buffer{token_start, size_t(ptr - token_start)};
  }

  uint32_t last_consumed = ptr - str.data();
  if (ptr == end) {  // we have not finished parsing.
    if (cached_expr_->empty()) {
      state_ = CMD_COMPLETE_S;  // have not found anything besides whitespace.
    } else {
      is_broken_token_ = ptr[-1] > 32;  // we stopped in the middle of the token.
    }
    return {INPUT_PENDING, last_consumed};
  }

  DCHECK_EQ('\n', *ptr);

  ++last_consumed;  // consume \n as well.
  state_ = CMD_COMPLETE_S;

  return {OK, last_consumed};
}

// Parse lines like:'$5\r\n' or '*2\r\n'. The first character is already consumed by the caller.
auto RedisParser::ParseLen(Buffer str, int64_t* res) -> ResultConsumed {
  DCHECK(!str.empty());

  const char* s = reinterpret_cast<const char*>(str.data());
  const char* pos = reinterpret_cast<const char*>(memchr(s, '\n', str.size()));
  if (!pos) {
    if (str.size() + small_len_ < small_buf_.size()) {
      memcpy(&small_buf_[small_len_], str.data(), str.size());
      small_len_ += str.size();
      return {INPUT_PENDING, str.size()};
    }
    LOG(WARNING) << "Unexpected format " << string_view{s, str.size()};
    return ResultConsumed{BAD_ARRAYLEN, 0};
  }

  unsigned consumed = pos - s + 1;
  if (small_len_ > 0) {
    if (small_len_ + consumed >= small_buf_.size()) {
      return ResultConsumed{BAD_ARRAYLEN, consumed};
    }
    memcpy(&small_buf_[small_len_], str.data(), consumed);
    small_len_ += consumed;
    s = small_buf_.data();
    pos = s + small_len_ - 1;
    small_len_ = 0;
  }

  if (pos[-1] != '\r') {
    return {BAD_ARRAYLEN, consumed};
  }

  // Skip 2 last characters (\r\n).
  string_view len_token{s, size_t(pos - 1 - s)};
  bool success = absl::SimpleAtoi(len_token, res);

  if (success && *res >= -1) {
    return ResultConsumed{OK, consumed};
  }

  LOG(ERROR) << "Failed to parse len " << absl::CHexEscape(len_token) << " "
             << absl::CHexEscape(string_view{reinterpret_cast<const char*>(str.data()), str.size()})
             << " " << consumed << " " << int(s == small_buf_.data());
  return ResultConsumed{BAD_ARRAYLEN, consumed};
}

auto RedisParser::ConsumeArrayLen(Buffer str) -> ResultConsumed {
  int64_t len;

  ResultConsumed res = ParseLen(str, &len);
  if (res.first != OK) {
    return res;
  }

  if (state_ == MAP_LEN_S) {
    // Map starts with %N followed by an array of 2*N elements.
    // Even elements are keys, odd elements are values.
    len *= 2;
  }

  if (len > max_arr_len_) {
    LOG(WARNING) << "Multibulk len is too large " << len;

    return {BAD_ARRAYLEN, res.second};
  }

  if (server_mode_ && (!parse_stack_.empty() || !cached_expr_->empty()))
    return {BAD_STRING, res.second};

  if (len <= 0) {
    if (len < 0) {
      cached_expr_->emplace_back(RespExpr::NIL_ARRAY);
      cached_expr_->back().u.emplace<RespVec*>(nullptr);  // nil
    } else {
      static RespVec empty_vec;
      cached_expr_->emplace_back(RespExpr::ARRAY);
      cached_expr_->back().u = &empty_vec;
    }
    if (parse_stack_.empty()) {
      state_ = CMD_COMPLETE_S;
    } else {
      HandleFinishArg();
    }

    return {OK, res.second};
  }

  if (state_ == PARSE_ARG_S) {
    DCHECK(!server_mode_);

    cached_expr_->emplace_back(RespExpr::ARRAY);
    stash_.emplace_back(new RespExpr::Vec());
    RespExpr::Vec* arr = stash_.back().get();
    arr->reserve(len);
    cached_expr_->back().u = arr;
    cached_expr_ = arr;
  }
  state_ = PARSE_ARG_TYPE;

  DVLOG(1) << "PushStack: (" << len << ", " << cached_expr_ << ")";
  parse_stack_.emplace_back(len, cached_expr_);

  return {OK, res.second};
}

auto RedisParser::ParseArg(Buffer str) -> ResultConsumed {
  DCHECK(!str.empty());

  if (arg_c_ == '$') {
    int64_t len;

    ResultConsumed res = ParseLen(str, &len);
    if (res.first != OK) {
      return res;
    }

    if (len > 0 && static_cast<uint64_t>(len) > max_bulk_len_) {
      LOG_EVERY_T(WARNING, 1) << "Threshold reached with bulk len: " << len
                              << ", consider increasing max_bulk_len";
      return {BAD_ARRAYLEN, res.second};
    }

    if (len == -1) {  // Resp2 NIL
      cached_expr_->emplace_back(RespExpr::NIL);
      cached_expr_->back().u = Buffer{};
      HandleFinishArg();
    } else {
      DVLOG(1) << "String(" << len << ")";

      cached_expr_->emplace_back(RespExpr::STRING);
      cached_expr_->back().u = Buffer{};
      bulk_len_ = len;
      state_ = BULK_STR_S;
    }

    return {OK, res.second};
  }

  DCHECK(!server_mode_);

  if (arg_c_ == '_') {  // Resp3 NIL
    // "_\r\n", with '_' consumed into arg_c_.
    DCHECK_LT(small_len_, 2u);  // must be because we never fill here with more than 2 bytes.
    DCHECK_GE(str.size(), 1u);

    if (str[0] != '\r' || (str.size() > 1 && str[1] != '\n')) {
      return {BAD_STRING, 0};
    }

    if (str.size() == 1) {
      state_ = SLASH_N_S;
      return {INPUT_PENDING, 1};
    }

    cached_expr_->emplace_back(RespExpr::NIL);
    cached_expr_->back().u = Buffer{};
    HandleFinishArg();
    return {OK, 2};
  }

  if (arg_c_ == '*') {
    return ConsumeArrayLen(str);
  }

  const char* s = reinterpret_cast<const char*>(str.data());
  const char* eol = reinterpret_cast<const char*>(memchr(s, '\n', str.size()));

  if (arg_c_ == '+' || arg_c_ == '-') {  // Simple string or error.
    DCHECK(!server_mode_);
    if (!eol) {
      // if eol is not found we should still read input as bulk string
      cached_expr_->emplace_back(RespExpr::STRING);
      cached_expr_->back().u = Buffer{};
      bulk_len_ = str.length();
      // eol is not found but if '\r' is present decrease bulk_len
      if (s[bulk_len_ - 1] == '\r')
        bulk_len_--;
      state_ = BULK_STR_S;
      Result r = str.size() < 256 ? OK : BAD_STRING;
      return {r, 0};
    }

    if (eol[-1] != '\r')
      return {BAD_STRING, 0};

    cached_expr_->emplace_back(arg_c_ == '+' ? RespExpr::STRING : RespExpr::ERROR);
    cached_expr_->back().u = Buffer{reinterpret_cast<const uint8_t*>(s), size_t((eol - 1) - s)};
  } else if (arg_c_ == ':') {
    DCHECK(!server_mode_);
    if (!eol) {
      Result r = str.size() < 32 ? INPUT_PENDING : BAD_INT;
      return {r, 0};
    }
    int64_t ival;
    std::string_view tok{s, size_t((eol - s) - 1)};

    if (eol[-1] != '\r' || !absl::SimpleAtoi(tok, &ival))
      return {BAD_INT, 0};

    cached_expr_->emplace_back(RespExpr::INT64);
    cached_expr_->back().u = ival;
  } else if (arg_c_ == ',') {
    DCHECK(!server_mode_);
    if (!eol) {
      Result r = str.size() < 32 ? INPUT_PENDING : BAD_DOUBLE;
      return {r, 0};
    }
    double_t dval;
    std::string_view tok{s, size_t((eol - s) - 1)};

    if (eol[-1] != '\r' || !absl::SimpleAtod(tok, &dval))
      return {BAD_DOUBLE, 0};

    cached_expr_->emplace_back(RespExpr::DOUBLE);
    cached_expr_->back().u = dval;
  } else {
    return {BAD_STRING, 0};
  }

  HandleFinishArg();

  return {OK, (eol - s) + 1};
}

auto RedisParser::ConsumeBulk(Buffer str) -> ResultConsumed {
  DCHECK_EQ(small_len_, 0);
  uint32_t consumed = 0;
  auto& bulk_str = get<Buffer>(cached_expr_->back().u);

  bool extend = false;
  // Handle split simple message or error in client mode
  if (!server_mode_ && (arg_c_ == '+' || arg_c_ == '-') && !bulk_len_) {
    // Search first '\r' in next partial message which ends bulk string
    const char* s = reinterpret_cast<const char*>(str.data());
    const char* pos = reinterpret_cast<const char*>(memchr(s, '\r', str.size()));
    bulk_len_ = pos ? pos - s : str.size();
    extend = true;
  }

  if (str.size() >= bulk_len_) {
    consumed = bulk_len_;
    if (bulk_len_) {
      // is_broken_token_ can be false, if we just parsed the bulk length but have
      // not parsed the token itself.
      if (is_broken_token_) {
        memcpy(const_cast<uint8_t*>(bulk_str.end()), str.data(), bulk_len_);
        bulk_str = Buffer{bulk_str.data(), bulk_str.size() + bulk_len_};
      } else if (extend) {
        ExtendBulkString(Buffer(str.begin(), bulk_len_));
      } else {
        bulk_str = str.subspan(0, bulk_len_);
      }
      str.remove_prefix(exchange(bulk_len_, 0));
      is_broken_token_ = false;
    }

    if (str.size() >= 2) {
      if (str[0] != '\r' || str[1] != '\n') {
        return {BAD_STRING, consumed};
      }
      HandleFinishArg();
      return {OK, consumed + 2};
    } else if (str.size() == 1) {
      if (str[0] != '\r') {
        return {BAD_STRING, consumed};
      }
      state_ = SLASH_N_S;
      consumed++;
    }
    return {INPUT_PENDING, consumed};
  }

  DCHECK(bulk_len_);
  size_t len = std::min<size_t>(str.size(), bulk_len_);

  if (is_broken_token_) {
    memcpy(const_cast<uint8_t*>(bulk_str.end()), str.data(), len);
    bulk_str = Buffer{bulk_str.data(), bulk_str.size() + len};
    DVLOG(1) << "Extending bulk stash to size " << bulk_str.size();
  } else {
    DVLOG(1) << "New bulk stash size " << bulk_len_;
    vector<uint8_t> nb(bulk_len_);
    memcpy(nb.data(), str.data(), len);
    bulk_str = Buffer{nb.data(), len};
    buf_stash_.emplace_back(std::move(nb));
    is_broken_token_ = true;
    cached_expr_->back().has_support = true;
  }
  consumed = len;
  bulk_len_ -= len;

  return {INPUT_PENDING, consumed};
}

void RedisParser::HandleFinishArg() {
  DCHECK(!parse_stack_.empty());
  DCHECK_GT(parse_stack_.back().first, 0u);

  state_ = PARSE_ARG_TYPE;
  while (true) {
    --parse_stack_.back().first;
    if (parse_stack_.back().first != 0)
      break;
    auto* arr = parse_stack_.back().second;
    DVLOG(1) << "PopStack (" << arr << ")";
    parse_stack_.pop_back();  // pop 0.
    if (parse_stack_.empty()) {
      state_ = CMD_COMPLETE_S;
      break;
    }
    cached_expr_ = parse_stack_.back().second;
  }
  small_len_ = 0;
}

void RedisParser::ExtendLastString(Buffer str) {
  DCHECK(!cached_expr_->empty() && cached_expr_->back().type == RespExpr::STRING);
  DCHECK(!buf_stash_.empty());

  Buffer& last_str = get<Buffer>(cached_expr_->back().u);

  DCHECK(last_str.data() == buf_stash_.back().data());

  vector<uint8_t> nb(last_str.size() + str.size());
  memcpy(nb.data(), last_str.data(), last_str.size());
  memcpy(nb.data() + last_str.size(), str.data(), str.size());
  last_str = RespExpr::Buffer{nb.data(), last_str.size() + str.size()};
  buf_stash_.back() = std::move(nb);
}

void RedisParser::ExtendBulkString(Buffer str) {
  DCHECK(!cached_expr_->empty() && cached_expr_->back().type == RespExpr::STRING);

  Buffer& bulk_str = get<Buffer>(cached_expr_->back().u);

  DCHECK(bulk_str.data() == buf_stash_.back().data());

  vector<uint8_t> nb(bulk_str.size() + str.size());
  memcpy(nb.data(), bulk_str.data(), bulk_str.size());
  memcpy(nb.data() + bulk_str.size(), str.data(), str.size());
  bulk_str = RespExpr::Buffer{nb.data(), bulk_str.size() + str.size()};
  buf_stash_.back() = std::move(nb);
}

size_t RedisParser::UsedMemory() const {
  return cmn::HeapSize(parse_stack_) + cmn::HeapSize(stash_) + cmn::HeapSize(buf_stash_);
}

}  // namespace facade


================================================
FILE: src/facade/redis_parser.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <memory>
#include <utility>
#include <vector>

#include "facade/resp_expr.h"

namespace facade {

/**
 * @brief Zero-copy (best-effort) parser.
 * Note: The client-mode parsing is buggy and should not be used in production.
 *       Currently we only use server-mode parsing in production and client-mode in tests.
 *       It works because tests do not do any incremental parsing.
 *
 */
class RedisParser {
 public:
  enum Result : uint8_t {
    OK,
    INPUT_PENDING,
    BAD_ARRAYLEN,
    BAD_BULKLEN,
    BAD_STRING,
    BAD_INT,
    BAD_DOUBLE
  };
  using Buffer = RespExpr::Buffer;
  enum Mode : uint8_t { SERVER, CLIENT };

  explicit RedisParser(Mode mode = Mode::SERVER, uint32_t max_arr_len = UINT32_MAX,
                       uint64_t max_bulk_len = UINT64_MAX)
      : server_mode_(mode == Mode::SERVER), max_arr_len_(max_arr_len), max_bulk_len_(max_bulk_len) {
  }

  /**
   * @brief Parses str into res. "consumed" stores number of bytes consumed from str.
   *
   * A caller should not invalidate str if the parser returns RESP_OK as long as he continues
   * accessing res. However, if parser returns INPUT_PENDING a caller may discard consumed
   * part of str because parser caches the intermediate state internally according to 'consumed'
   * result.
   *
   *
   */

  Result Parse(Buffer str, uint32_t* consumed, RespVec* res);

  void SetClientMode() {
    server_mode_ = false;
  }

  size_t parselen_hint() const {
    return bulk_len_;
  }

  size_t stash_size() const {
    return stash_.size();
  }
  const std::vector<std::unique_ptr<RespVec>>& stash() const {
    return stash_;
  }

  size_t UsedMemory() const;

 private:
  using ResultConsumed = std::pair<Result, uint32_t>;

  // Returns true if this is a RESP message, false if INLINE.
  bool InitStart(char prefix_b, RespVec* res);
  void StashState(RespVec* res);

  // Skips the first character (*).
  ResultConsumed ConsumeArrayLen(Buffer str);
  ResultConsumed ParseArg(Buffer str);
  ResultConsumed ConsumeBulk(Buffer str);
  ResultConsumed ParseInline(Buffer str);
  ResultConsumed ParseLen(Buffer str, int64_t* res);

  void HandleFinishArg();
  void ExtendLastString(Buffer str);
  void ExtendBulkString(Buffer str);

  enum State : uint8_t {
    INLINE_S,
    ARRAY_LEN_S,
    MAP_LEN_S,
    PARSE_ARG_TYPE,  // Parse [$:+-]
    PARSE_ARG_S,     // Parse string\r\n
    BULK_STR_S,
    SLASH_N_S,
    CMD_COMPLETE_S,
  };

  State state_ = CMD_COMPLETE_S;
  bool is_broken_token_ = false;  // true, if a token (inline or bulk) is broken during the parsing.
  bool server_mode_ = true;
  uint8_t small_len_ = 0;
  char arg_c_ = 0;

  uint32_t bulk_len_ = 0;
  uint32_t last_stashed_level_ = 0, last_stashed_index_ = 0;
  uint32_t max_arr_len_;
  uint64_t max_bulk_len_;

  // Points either to the result passed by the caller or to the stash.
  RespVec* cached_expr_ = nullptr;

  // expected expression length, pointer to expression vector.
  // For server mode, the length is at most 1.
  absl::InlinedVector<std::pair<uint32_t, RespVec*>, 4> parse_stack_;
  std::vector<std::unique_ptr<RespVec>> stash_;

  using Blob = std::vector<uint8_t>;
  std::vector<Blob> buf_stash_;
  std::array<char, 32> small_buf_;
};

}  // namespace facade


================================================
FILE: src/facade/redis_parser_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/redis_parser.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>

#include "absl/strings/str_cat.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "common/heap_size.h"
#include "facade/facade_test.h"

using namespace testing;
using namespace std;
namespace facade {

MATCHER_P(ArrArg, expected, absl::StrCat(negation ? "is not" : "is", " equal to:\n", expected)) {
  if (arg.type != RespExpr::ARRAY) {
    *result_listener << "\nWrong type: " << arg.type;
    return false;
  }
  size_t exp_sz = expected;
  size_t actual = get<RespVec*>(arg.u)->size();

  if (exp_sz != actual) {
    *result_listener << "\nActual size: " << actual;
    return false;
  }
  return true;
}

class RedisParserTest : public testing::Test {
 protected:
  static void SetUpTestSuite() {
  }

  RedisParser::Result Parse(std::string_view str);

  RedisParser parser_;
  RespExpr::Vec args_;
  uint32_t consumed_;

  unique_ptr<uint8_t[]> stash_;
};

RedisParser::Result RedisParserTest::Parse(std::string_view str) {
  stash_.reset(new uint8_t[str.size()]);
  auto* ptr = stash_.get();
  memcpy(ptr, str.data(), str.size());
  return parser_.Parse(RedisParser::Buffer{ptr, str.size()}, &consumed_, &args_);
}

TEST_F(RedisParserTest, Inline) {
  RespExpr e{RespExpr::STRING};
  ASSERT_EQ(RespExpr::STRING, e.type);

  const char kCmd1[] = "KEY   VAL\r\n";

  ASSERT_EQ(RedisParser::OK, Parse(kCmd1));
  EXPECT_EQ(strlen(kCmd1), consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "VAL"));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("KEY"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(" FOO "));
  EXPECT_EQ(5, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(" BAR"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse(" \r\n "));
  EXPECT_EQ(3, consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "FOO", "BAR"));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(" 1 2"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(" 45"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("1", "2", "45"));

  // Empty queries return INPUT_PENDING.
  EXPECT_EQ(RedisParser::INPUT_PENDING, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
}

TEST_F(RedisParserTest, InlineEscaping) {
  LOG(ERROR) << "TBD: to be compliant with sdssplitargs";  // TODO:
}

TEST_F(RedisParserTest, Multi1) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("*1\r\n"));
  EXPECT_EQ(4, consumed_);
  EXPECT_EQ(0, parser_.parselen_hint());

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("$4\r\n"));
  EXPECT_EQ(4, consumed_);
  EXPECT_EQ(4, parser_.parselen_hint());

  ASSERT_EQ(RedisParser::OK, Parse("PING\r\n"));
  EXPECT_EQ(6, consumed_);
  EXPECT_EQ(0, parser_.parselen_hint());
  EXPECT_THAT(args_, ElementsAre("PING"));
}

TEST_F(RedisParserTest, Multi2) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("*1\r\n$"));
  EXPECT_EQ(5, consumed_);

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("4\r\nMSET"));
  EXPECT_EQ(7, consumed_);

  ASSERT_EQ(RedisParser::OK, Parse("\r\n*2\r\n"));
  EXPECT_EQ(2, consumed_);

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("*2\r\n$3\r\nKEY\r\n$3\r\nVAL"));
  EXPECT_EQ(20, consumed_);

  ASSERT_EQ(RedisParser::OK, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "VAL"));
}

TEST_F(RedisParserTest, Multi3) {
  const char kFirst[] = "*3\r\n$3\r\nSET\r\n$16\r\nkey:";
  const char kSecond[] = "000002273458\r\n$3\r\nVXK";
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(kFirst));
  ASSERT_EQ(strlen(kFirst), consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(kSecond));
  ASSERT_EQ(strlen(kSecond), consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\r\n*3\r\n$3\r\nSET"));
  ASSERT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("SET", "key:000002273458", "VXK"));
}

TEST_F(RedisParserTest, ClientMode) {
  parser_.SetClientMode();

  ASSERT_EQ(RedisParser::OK, Parse(":-1\r\n"));
  EXPECT_THAT(args_, ElementsAre(IntArg(-1)));

  ASSERT_EQ(RedisParser::OK, Parse("+OK\r\n"));
  EXPECT_EQ(args_[0], "OK");

  ASSERT_EQ(RedisParser::OK, Parse("-ERR foo bar\r\n"));
  EXPECT_THAT(args_, ElementsAre(ErrArg("ERR foo")));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("_"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\r"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  EXPECT_THAT(args_, ElementsAre(ArgType(RespExpr::NIL)));
  ASSERT_EQ(RedisParser::OK, Parse("*2\r\n_\r\n_\r\n"));
  ASSERT_EQ(10, consumed_);

  ASSERT_EQ(RedisParser::OK, Parse("*3\r\n+OK\r\n$1\r\n1\r\n*2\r\n$1\r\n1\r\n$-1\r\n"));
  ASSERT_THAT(args_, ElementsAre("OK", "1", ArrLen(2)));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("+O"));
  EXPECT_EQ(2, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("K\r"));
  EXPECT_EQ(2, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
  ASSERT_THAT(args_, ElementsAre("OK"));
  EXPECT_EQ(1, consumed_);

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("+OK\r"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
  ASSERT_THAT(args_, ElementsAre("OK"));
  EXPECT_EQ(1, consumed_);

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("+"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("O"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("K"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\r"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  ASSERT_THAT(args_, ElementsAre("OK"));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("-"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("ERR\r\n"));
  EXPECT_EQ(5, consumed_);
  ASSERT_THAT(args_, ElementsAre(ErrArg("ERR")));

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("-ERR foo"));
  EXPECT_EQ(8, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
  ASSERT_THAT(args_, ElementsAre("ERR foo"));
}

TEST_F(RedisParserTest, Hierarchy) {
  parser_.SetClientMode();

  const char* kThirdArg = "*2\r\n$3\r\n100\r\n$3\r\n200\r\n";
  string resp = absl::StrCat("*3\r\n$3\r\n900\r\n$3\r\n800\r\n", kThirdArg);
  ASSERT_EQ(RedisParser::OK, Parse(resp));
  ASSERT_THAT(args_, ElementsAre("900", "800", ArrArg(2)));
  EXPECT_THAT(args_[2].GetVec(), ElementsAre("100", "200"));

  ASSERT_EQ(RedisParser::OK, Parse("*2\r\n*1\r\n$3\r\n1-0\r\n*1\r\n$2\r\nf1\r\n"));
  ASSERT_THAT(args_, ElementsAre(ArrLen(1), ArrLen(1)));
}

TEST_F(RedisParserTest, InvalidMult1) {
  ASSERT_EQ(RedisParser::BAD_BULKLEN, Parse("*2\r\n$3\r\nFOO\r\nBAR\r\n"));
}

TEST_F(RedisParserTest, Empty) {
  ASSERT_EQ(RedisParser::OK, Parse("*2\r\n$0\r\n\r\n$0\r\n\r\n"));
}

TEST_F(RedisParserTest, LargeBulk) {
  string_view prefix("*1\r\n$1024\r\n");

  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(prefix));
  ASSERT_EQ(prefix.size(), consumed_);
  ASSERT_GE(parser_.parselen_hint(), 1024);

  string half(512, 'a');
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(512, consumed_);
  ASSERT_GE(parser_.parselen_hint(), 512);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(512, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\r"));
  ASSERT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
  EXPECT_EQ(1, consumed_);

  string part1 = absl::StrCat(prefix, half);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(part1));
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(RedisParser::OK, Parse("\r\n"));

  prefix = "*1\r\n$270000000\r\n";
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(prefix));
  ASSERT_EQ(prefix.size(), consumed_);
  string chunk(1000000, 'a');
  for (unsigned i = 0; i < 270; ++i) {
    ASSERT_EQ(RedisParser::INPUT_PENDING, Parse(chunk));
    ASSERT_EQ(chunk.size(), consumed_);
  }
  ASSERT_EQ(RedisParser::OK, Parse("\r\n"));
  ASSERT_THAT(args_, ElementsAre(ArgType(RespExpr::STRING)));
  EXPECT_EQ(270000000, args_[0].GetBuf().size());
}

TEST_F(RedisParserTest, NILs) {
  ASSERT_EQ(RedisParser::BAD_ARRAYLEN, Parse("_\r\n"));
  parser_.SetClientMode();
  ASSERT_EQ(RedisParser::OK, Parse("_\r\nfooobar"));
  EXPECT_EQ(3, consumed_);
}

TEST_F(RedisParserTest, NestedArray) {
  parser_.SetClientMode();

  // [[['foo'],['bar']],['car']]
  ASSERT_EQ(RedisParser::OK,
            Parse("*2\r\n*2\r\n*1\r\n$3\r\nfoo\r\n*1\r\n$3\r\nbar\r\n*1\r\n$3\r\ncar\r\n"));

  ASSERT_THAT(args_, ElementsAre(ArrArg(2), ArrArg(1)));
  ASSERT_THAT(args_[0].GetVec(), ElementsAre(ArrArg(1), ArrArg(1)));
  ASSERT_THAT(args_[1].GetVec(), ElementsAre("car"));
}

TEST_F(RedisParserTest, UsedMemory) {
  vector<vector<uint8_t>> blobs;
  for (size_t i = 0; i < 100; ++i) {
    blobs.emplace_back(vector<uint8_t>(200));
  }
  EXPECT_GT(cmn::HeapSize(blobs), 20000);

  std::vector<std::unique_ptr<RespVec>> stash;
  RespVec vec;
  for (unsigned i = 0; i < 10; ++i) {
    vec.emplace_back(RespExpr::STRING);
    vec.back().u = RespExpr::Buffer(nullptr, 0);
  }

  for (unsigned i = 0; i < 100; i++) {
    stash.emplace_back(new RespExpr::Vec(vec));
  }
  EXPECT_GT(cmn::HeapSize(stash), 30000);
}

TEST_F(RedisParserTest, Eol) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("*1\r"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\n$5\r\n"));
  EXPECT_EQ(5, consumed_);
}

TEST_F(RedisParserTest, BulkSplit) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("*1\r\n$4\r\nSADD\r"));
  ASSERT_EQ(13, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\n"));
}

TEST_F(RedisParserTest, InlineSplit) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("\nPING\n\n"));
  EXPECT_EQ(6, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("P"));
  ASSERT_EQ(RedisParser::OK, Parse("ING\n"));
}

TEST_F(RedisParserTest, InlineReset) {
  ASSERT_EQ(RedisParser::INPUT_PENDING, Parse("\t \r\n"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RedisParser::OK, Parse("*1\r\n$3\r\nfoo\r\n"));
  EXPECT_EQ(13, consumed_);
}

}  // namespace facade


================================================
FILE: src/facade/reply_builder.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/reply_builder.h"

#include <absl/cleanup/cleanup.h>
#include <absl/container/fixed_array.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_cat.h>
#include <double-conversion/double-to-string.h>

#include <limits>

#include "absl/strings/escaping.h"
#include "absl/types/span.h"
#include "base/logging.h"
#include "facade/error.h"
#include "util/fibers/proactor_base.h"

#ifdef __APPLE__
#ifndef UIO_MAXIOV
// Some versions of MacOSX dont have IOV_MAX
#define UIO_MAXIOV 1024
#endif
#endif

using namespace std;
using namespace double_conversion;

namespace facade {

namespace {

constexpr char kCRLF[] = "\r\n";
constexpr char kSimplePref[] = "+";
constexpr char kLengthPrefix[] = "$";
constexpr char kDoublePref[] = ",";
constexpr char kLongPref[] = ":";
constexpr char kNullStringR2[] = "$-1\r\n";
constexpr char kNullStringR3[] = "_\r\n";

constexpr unsigned kConvFlags =
    DoubleToStringConverter::UNIQUE_ZERO | DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN;

DoubleToStringConverter dfly_conv(kConvFlags, "inf", "nan", 'e', -6, 21, 6, 0);

template <typename T> size_t piece_size(const T& v) {
  if constexpr (is_array_v<T>)
    return ABSL_ARRAYSIZE(v) - 1;  // expect null terminated
  else if constexpr (is_integral_v<T>)
    return absl::numbers_internal::kFastToBufferSize;
  else  // string_view
    return v.size();
}

template <size_t S> char* write_piece(const char (&arr)[S], char* dest) {
  return (char*)memcpy(dest, arr, S - 1) + (S - 1);
}

template <typename T> enable_if_t<is_integral_v<T>, char*> write_piece(T num, char* dest) {
  static_assert(!is_same_v<T, char>, "Use arrays for single chars");
  return absl::numbers_internal::FastIntToBuffer(num, dest);
}

char* write_piece(string_view str, char* dest) {
  return (char*)memcpy(dest, str.data(), str.size()) + str.size();
}

}  // namespace

thread_local SinkReplyBuilder::PendingList SinkReplyBuilder::pending_list;

SinkReplyBuilder::ReplyAggregator::~ReplyAggregator() {
  rb->batched_ = prev;
  if (!prev)
    rb->Flush();
}

SinkReplyBuilder::ReplyScope::~ReplyScope() {
  rb->scoped_ = prev;
  if (!prev)
    rb->FinishScope();
}

void SinkReplyBuilder::SendError(ErrorReply error) {
  if (error.status)
    return SendError(*error.status);
  SendError(error.ToSv(), error.kind);
}

void SinkReplyBuilder::SendError(OpStatus status) {
  if (status == OpStatus::OK)
    return SendSimpleString("OK");
  SendError(StatusToMsg(status));
}

void SinkReplyBuilder::CloseConnection() {
  if (!ec_)
    ec_ = std::make_error_code(std::errc::connection_aborted);
}

template <typename... Ts> void SinkReplyBuilder::WritePieces(Ts&&... pieces) {
  if (size_t required = (piece_size(pieces) + ...); buffer_.AppendLen() <= required)
    Flush(required);

  auto iovec_end = [](const iovec& v) { return reinterpret_cast<char*>(v.iov_base) + v.iov_len; };

  // Ensure last iovec points to buffer segment
  char* dest = reinterpret_cast<char*>(buffer_.AppendBuffer().data());
  if (vecs_.empty()) {
    vecs_.push_back(iovec{dest, 0});
  } else if (iovec_end(vecs_.back()) != dest) {
    if (vecs_.size() >= IOV_MAX - 2)
      Flush();
    dest = reinterpret_cast<char*>(buffer_.AppendBuffer().data());
    vecs_.push_back(iovec{dest, 0});
  }

  DCHECK(iovec_end(vecs_.back()) == dest);
  char* ptr = dest;
  ([&]() { ptr = write_piece(pieces, ptr); }(), ...);

  size_t written = ptr - dest;
  buffer_.CommitWrite(written);
  vecs_.back().iov_len += written;
  total_size_ += written;
}

void SinkReplyBuilder::WriteRef(std::string_view str) {
  if (vecs_.size() >= IOV_MAX - 2)
    Flush();
  vecs_.push_back(iovec{const_cast<char*>(str.data()), str.size()});
  total_size_ += str.size();
}

void SinkReplyBuilder::Flush(size_t expected_buffer_cap) {
  if (!vecs_.empty())
    Send();

  // Grow backing buffer if was at least half full and still below it's max size
  if (buffer_.InputLen() * 2 > buffer_.Capacity() && buffer_.Capacity() * 2 <= kMaxBufferSize)
    expected_buffer_cap = max(expected_buffer_cap, buffer_.Capacity() * 2);

  total_size_ = 0;
  buffer_.Clear();
  vecs_.clear();
  guaranteed_pieces_ = 0;

  DCHECK_LE(expected_buffer_cap, kMaxBufferSize);  // big strings should be enqueued as iovecs

  if (expected_buffer_cap > buffer_.Capacity())
    buffer_.Reserve(expected_buffer_cap);
}

uint64_t SinkReplyBuilder::GetLastSendTimeNs() const {
  return send_time_ns_;
}

void SinkReplyBuilder::Send() {
  DCHECK(sink_ != nullptr);
  DCHECK(!vecs_.empty());
  auto& reply_stats = tl_facade_stats->reply_stats;

  send_time_ns_ = util::fb2::ProactorBase::GetMonotonicTimeNs();
  PendingPin pin(send_time_ns_);

  pending_list.push_back(pin);

  reply_stats.io_write_cnt++;
  reply_stats.io_write_bytes += total_size_;
  DVLOG(2) << "Writing " << total_size_ << " bytes";
  if (auto ec = sink_->Write(vecs_.data(), vecs_.size()); ec)
    ec_ = ec;

  auto it = PendingList::s_iterator_to(pin);
  pending_list.erase(it);

  send_time_ns_ = 0;

  uint64_t after_ns = util::fb2::ProactorBase::GetMonotonicTimeNs();
  reply_stats.send_stats.count++;
  reply_stats.send_stats.total_duration += (after_ns - pin.timestamp_ns);
  DVLOG(2) << "Finished writing " << total_size_ << " bytes";
}

void SinkReplyBuilder::FinishScope() {
  replies_recorded_++;

  if (!batched_ || total_size_ * 2 >= kMaxBufferSize /* copying isn't worth it */)
    return Flush();

  // Check if we have enough space to copy all refs to buffer
  size_t ref_bytes = total_size_ - buffer_.InputLen();
  if (ref_bytes > buffer_.AppendLen())
    return Flush(ref_bytes);

  // Copy all external references to buffer to safely keep batching
  for (size_t i = guaranteed_pieces_; i < vecs_.size(); i++) {
    auto ib = buffer_.InputBuffer();
    if (vecs_[i].iov_base >= ib.data() && vecs_[i].iov_base <= ib.data() + ib.size())
      continue;  // this is a piece

    DCHECK_LE(vecs_[i].iov_len, buffer_.AppendLen());
    void* dest = buffer_.AppendBuffer().data();
    memcpy(dest, vecs_[i].iov_base, vecs_[i].iov_len);
    buffer_.CommitWrite(vecs_[i].iov_len);
    vecs_[i].iov_base = dest;
  }
  guaranteed_pieces_ = vecs_.size();  // all vecs are pieces
}

MCReplyBuilder::MCReplyBuilder(::io::Sink* sink) : SinkReplyBuilder(sink) {
}

void MCReplyBuilder::SendValue(MemcacheCmdFlags cmd_flags, std::string_view key,
                               std::string_view value, uint64_t mc_token, uint32_t mc_flag,
                               uint32_t ttl_sec) {
  ReplyScope scope(this);
  if (cmd_flags.meta) {
    string flags;
    if (cmd_flags.return_flags)
      absl::StrAppend(&flags, " f", mc_flag);
    if (cmd_flags.return_cas)
      absl::StrAppend(&flags, " c", mc_token);
    if (cmd_flags.return_ttl)
      absl::StrAppend(&flags, " t", ttl_sec);

    if (cmd_flags.return_value) {
      WritePieces("VA ", value.size(), flags, kCRLF);
      if (value.size() <= kMaxInlineSize) {
        WritePieces(value, kCRLF);
      } else {
        WriteRef(value);
        WritePieces(kCRLF);
      }
    } else {
      WritePieces("HD ", flags, kCRLF);
    }
  } else {
    WritePieces("VALUE ", key, " ", mc_flag, " ", value.size());
    if (cmd_flags.return_cas)
      WritePieces(" ", mc_token);

    if (value.size() <= kMaxInlineSize) {
      WritePieces(kCRLF, value, kCRLF);
    } else {
      WritePieces(kCRLF);
      WriteRef(value);
      WritePieces(kCRLF);
    }
  }
}

void MCReplyBuilder::SendSimpleString(std::string_view str) {
  if (str.empty())
    return;
  ReplyScope scope(this);
  WritePieces(str, kCRLF);
}

void MCReplyBuilder::SendLong(long val) {
  SendSimpleString(absl::StrCat(val));
}

void MCReplyBuilder::SendError(string_view str, std::string_view type) {
  last_error_ = str;
  SendSimpleString(absl::StrCat("SERVER_ERROR ", str));
}

void MCReplyBuilder::SendProtocolError(std::string_view str) {
  SendSimpleString(absl::StrCat("CLIENT_ERROR ", str));
}

void MCReplyBuilder::SendClientError(string_view str) {
  SendSimpleString(absl::StrCat("CLIENT_ERROR ", str));
}

void MCReplyBuilder::SendRaw(std::string_view str) {
  ReplyScope scope(this);
  WriteRef(str);
}

void RedisReplyBuilderBase::SendNull() {
  ReplyScope scope(this);
  IsResp3() ? WritePieces(kNullStringR3) : WritePieces(kNullStringR2);
}

void RedisReplyBuilderBase::SendSimpleString(std::string_view str) {
  ReplyScope scope(this);
  if (str.size() <= kMaxInlineSize * 2)
    return WritePieces(kSimplePref, str, kCRLF);

  WritePieces(kSimplePref);
  WriteRef(str);
  WritePieces(kCRLF);
}

void RedisReplyBuilderBase::SendBulkString(std::string_view str) {
  ReplyScope scope(this);
  if (str.size() <= kMaxInlineSize)
    return WritePieces(kLengthPrefix, uint32_t(str.size()), kCRLF, str, kCRLF);

  DVLOG(1) << "SendBulk " << str.size();
  WritePieces(kLengthPrefix, uint32_t(str.size()), kCRLF);
  WriteRef(str);
  WritePieces(kCRLF);
}

void RedisReplyBuilderBase::SendLong(long val) {
  ReplyScope scope(this);
  WritePieces(kLongPref, val, kCRLF);
}

void RedisReplyBuilderBase::SendDouble(double val) {
  char buf[DoubleToStringConverter::kBase10MaximalLength + 8];  // +8 to be on the safe side.
  static_assert(ABSL_ARRAYSIZE(buf) < kMaxInlineSize, "Write temporary string from buf inline");
  string_view val_str = FormatDouble(val, buf, ABSL_ARRAYSIZE(buf));

  if (!IsResp3())
    return SendBulkString(val_str);

  ReplyScope scope(this);
  WritePieces(kDoublePref, val_str, kCRLF);
}

void RedisReplyBuilderBase::SendNullArray() {
  ReplyScope scope(this);
  WritePieces("*-1", kCRLF);
}

constexpr static const char START_SYMBOLS2[4][2] = {"*", "~", "%", ">"};
static_assert(START_SYMBOLS2[unsigned(CollectionType::MAP)][0] == '%' &&
              START_SYMBOLS2[unsigned(CollectionType::SET)][0] == '~');

void RedisReplyBuilderBase::StartCollection(unsigned len, CollectionType ct) {
  if (!IsResp3()) {  // RESP2 supports only arrays
    if (ct == CollectionType::MAP)
      len *= 2;
    ct = CollectionType::ARRAY;
  }
  ReplyScope scope(this);
  WritePieces(START_SYMBOLS2[unsigned(ct)], len, kCRLF);
}

void RedisReplyBuilderBase::SendError(std::string_view str, std::string_view type) {
  ReplyScope scope(this);

  if (type.empty()) {
    type = str;
    if (type == kSyntaxErr)
      type = kSyntaxErrType;
  }
  tl_facade_stats->reply_stats.err_count[type]++;
  last_error_ = str;

  if (str[0] != '-') {
    WritePieces("-ERR ");
  }
  if (str.size() <= kMaxInlineSize) {
    WritePieces(str, kCRLF);
  } else {
    WriteRef(str);
    WritePieces(kCRLF);
  }
}

void RedisReplyBuilderBase::SendProtocolError(std::string_view str) {
  SendError(absl::StrCat("-ERR Protocol error: ", str), "protocol_error");
}

char* RedisReplyBuilderBase::FormatDouble(double d, char* dest, unsigned len) {
  StringBuilder sb(dest, len);
  CHECK(dfly_conv.ToShortest(d, &sb));
  return sb.Finalize();
}

void RedisReplyBuilderBase::SendVerbatimString(std::string_view str, VerbatimFormat format) {
  DCHECK(format <= VerbatimFormat::MARKDOWN);
  if (!IsResp3())
    return SendBulkString(str);

  ReplyScope scope(this);
  WritePieces("=", str.size() + 4, kCRLF, format == VerbatimFormat::MARKDOWN ? "mkd:" : "txt:");
  if (str.size() <= kMaxInlineSize)
    WritePieces(str);
  else
    WriteRef(str);
  WritePieces(kCRLF);
}

std::string RedisReplyBuilderBase::SerializeCommand(std::string_view command) {
  return string{command} + kCRLF;
}

void RedisReplyBuilder::SendSimpleStrArr(const facade::ArgRange& strs) {
  ReplyScope scope(this);
  StartArray(strs.Size());
  for (std::string_view str : strs)
    SendSimpleString(str);
}

void RedisReplyBuilder::SendBulkStrArr(const facade::ArgRange& strs, CollectionType ct) {
  ReplyScope scope(this);
  StartCollection(ct == CollectionType::MAP ? strs.Size() / 2 : strs.Size(), ct);
  for (std::string_view str : strs)
    SendBulkString(str);
}

void RedisReplyBuilder::SendScoredArray(ScoredArray arr, bool with_scores) {
  ReplyScope scope(this);
  StartArray((with_scores && !IsResp3()) ? arr.size() * 2 : arr.size());
  for (const auto& [str, score] : arr) {
    if (with_scores && IsResp3())
      StartArray(2);
    SendBulkString(str);
    if (with_scores)
      SendDouble(score);
  }
}

void RedisReplyBuilder::SendLabeledScoredArray(std::string_view arr_label, ScoredArray arr) {
  ReplyScope scope(this);

  StartArray(2);

  SendBulkString(arr_label);
  StartArray(arr.size());
  for (const auto& [str, score] : arr) {
    StartArray(2);
    SendBulkString(str);
    SendDouble(score);
  }
}

template <typename I> void RedisReplyBuilder::SendLongArr(absl::Span<const I> longs) {
  static_assert(std::is_integral_v<I>, "Must use integral type");
  ReplyScope scope(this);
  StartArray(longs.size());
  for (auto v : longs) {
    if constexpr (std::is_unsigned_v<I>)
      DCHECK_LE(uint64_t(v), uint64_t(std::numeric_limits<long>::max()));
    SendLong(v);
  }
}

template void RedisReplyBuilder::SendLongArr<long>(absl::Span<const long>);
template void RedisReplyBuilder::SendLongArr<int32_t>(absl::Span<const int32_t>);
template void RedisReplyBuilder::SendLongArr<uint32_t>(absl::Span<const uint32_t>);
template void RedisReplyBuilder::SendLongArr<uint64_t>(absl::Span<const uint64_t>);

void RedisReplyBuilder::StartArray(unsigned len) {
  StartCollection(len, CollectionType::ARRAY);
}

void RedisReplyBuilder::SendEmptyArray() {
  StartArray(0);
}

}  // namespace facade


================================================
FILE: src/facade/reply_builder.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>

#include <boost/intrusive/list.hpp>
#include <optional>
#include <string_view>

#include "facade/facade_stats.h"
#include "facade/facade_types.h"
#include "io/io.h"

namespace facade {

enum class RespVersion { kResp2, kResp3 };

// Base class for all reply builders. Offer a simple high level interface for controlling output
// modes and sending basic response types.
class SinkReplyBuilder {
  struct GuardBase {
    bool prev;
    SinkReplyBuilder* rb;
  };

 public:
  constexpr static size_t kMaxInlineSize = 32;
  constexpr static size_t kMaxBufferSize = 8192;

  struct PendingPin : public boost::intrusive::list_base_hook<
                          ::boost::intrusive::link_mode<::boost::intrusive::normal_link>> {
    uint64_t timestamp_ns;

    PendingPin(uint64_t v = 0) : timestamp_ns(v) {
    }
  };

  using PendingList =
      boost::intrusive::list<PendingPin, boost::intrusive::constant_time_size<false>,
                             boost::intrusive::cache_last<false>>;

  static thread_local PendingList pending_list;

  explicit SinkReplyBuilder(io::Sink* sink) : sink_(sink) {
  }

  virtual ~SinkReplyBuilder() = default;

  // USE WITH CARE! ReplyScope assumes that all string views in Send calls keep valid for the scopes
  // lifetime. This allows the builder to avoid copies by enqueueing long strings directly for
  // vectorized io.
  struct ReplyScope : GuardBase {
    explicit ReplyScope(SinkReplyBuilder* rb) : GuardBase{std::exchange(rb->scoped_, true), rb} {
    }

    ~ReplyScope();
  };

  // Aggregator reduces the number of raw send calls by copying data in an intermediate buffer.
  // Prefer ReplyScope if possible to additionally reduce the number of copies.
  struct ReplyAggregator : GuardBase {
    explicit ReplyAggregator(SinkReplyBuilder* rb)
        : GuardBase{std::exchange(rb->batched_, true), rb} {
    }

    ~ReplyAggregator();
  };

  void Flush(size_t expected_buffer_cap = 0);  // Send all accumulated data and reset to clear state

  std::error_code GetError() const {
    return ec_;
  }

  size_t UsedMemory() const {
    return buffer_.Capacity();
  }

  size_t RepliesRecorded() const {
    return replies_recorded_;
  }

  bool IsSendActive() const {
    return send_time_ns_ > 0;
  }

  void SetBatchMode(bool b) {
    batched_ = b;
  }

  void CloseConnection();

  static const ReplyStats& GetThreadLocalStats() {
    return tl_facade_stats->reply_stats;
  }

 public:  // High level interface
  virtual Protocol GetProtocol() const = 0;

  virtual void SendLong(long val) = 0;
  virtual void SendSimpleString(std::string_view str) = 0;

  void SendOk() {
    SendSimpleString("OK");
  }

  virtual void SendError(std::string_view str, std::string_view type = {}) = 0;  // MC and Redis
  void SendError(OpStatus status);
  void SendError(ErrorReply error);
  virtual void SendProtocolError(std::string_view str) = 0;

  std::string ConsumeLastError() {
    return std::exchange(last_error_, {});
  }

  uint64_t GetLastSendTimeNs() const;

 protected:
  template <typename... Ts>
  void WritePieces(Ts&&... pieces);     // Copy pieces into buffer and reference buffer
  void WriteRef(std::string_view str);  // Add iovec bypassing buffer

  void FinishScope();  // Called when scope ends to flush buffer if needed
  void Send();

 protected:
  size_t replies_recorded_ = 0;
  std::string last_error_;

 private:
  io::Sink* sink_;
  std::error_code ec_;

  bool scoped_ = false, batched_ = false;

  size_t total_size_ = 0;  // sum of vec_ lengths
  base::IoBuf buffer_;     // backing buffer for pieces

  // Stores iovecs for a single writev call. Can reference either the buffer (WritePiece) or
  // external data (WriteRef). Validity is ensured by FinishScope that either flushes before ref
  // lifetime ends or copies refs to the buffer.
  absl::InlinedVector<iovec, 16> vecs_;
  size_t guaranteed_pieces_ = 0;  // length of prefix of vecs_ that are guaranteed to be pieces
  uint64_t send_time_ns_ = 0;
};

class MCReplyBuilder : public SinkReplyBuilder {
 public:
  explicit MCReplyBuilder(::io::Sink* sink);

  ~MCReplyBuilder() override = default;

  Protocol GetProtocol() const final {
    return Protocol::MEMCACHE;
  }

  void SendError(std::string_view str, std::string_view type = std::string_view{}) final;

  void SendLong(long val) final;

  void SendClientError(std::string_view str);
  void SendValue(MemcacheCmdFlags cmd_flags, std::string_view key, std::string_view value,
                 uint64_t mc_token, uint32_t mc_flag, uint32_t ttl_sec);
  void SendSimpleString(std::string_view str) final;
  void SendProtocolError(std::string_view str) final;

  void SendRaw(std::string_view str);
};

// Redis reply builder interface for sending RESP data.
class RedisReplyBuilderBase : public SinkReplyBuilder {
 public:
  enum VerbatimFormat : uint8_t { TXT, MARKDOWN };

  explicit RedisReplyBuilderBase(io::Sink* sink) : SinkReplyBuilder(sink) {
  }

  ~RedisReplyBuilderBase() override = default;

  Protocol GetProtocol() const final {
    return Protocol::REDIS;
  }

  virtual void SendNull();

  void SendSimpleString(std::string_view str) override;
  virtual void SendBulkString(std::string_view str);  // RESP: Blob String

  void SendLong(long val) override;
  virtual void SendDouble(double val);  // RESP: Number

  virtual void SendNullArray();
  virtual void StartCollection(unsigned len, CollectionType ct);

  using SinkReplyBuilder::SendError;
  void SendError(std::string_view str, std::string_view type = {}) override;
  void SendProtocolError(std::string_view str) override;

  virtual void SendVerbatimString(std::string_view str, VerbatimFormat format = TXT);

  static char* FormatDouble(double d, char* dest, unsigned len);
  static std::string SerializeCommand(std::string_view command);

  bool IsResp3() const {
    return resp_ == RespVersion::kResp3;
  }

  void SetRespVersion(RespVersion resp_version) {
    resp_ = resp_version;
  }

  RespVersion GetRespVersion() {
    return resp_;
  }

 private:
  RespVersion resp_ = RespVersion::kResp2;
};

// Non essential redis reply builder functions implemented on top of the base resp protocol
class RedisReplyBuilder : public RedisReplyBuilderBase {
 public:
  using ScoredArray = absl::Span<const std::pair<std::string, double>>;

  RedisReplyBuilder(io::Sink* sink) : RedisReplyBuilderBase(sink) {
  }

  ~RedisReplyBuilder() override = default;

  // One-liner for ReplyScope + StartArray
  struct ArrayScope : ReplyScope {
    ArrayScope(RedisReplyBuilder* rb, size_t len) : ReplyScope(rb) {
      rb->StartArray(len);
    }
  };

  void SendSimpleStrArr(const facade::ArgRange& strs);
  void SendBulkStrArr(const facade::ArgRange& strs, CollectionType ct = CollectionType::ARRAY);
  template <typename I> void SendLongArr(absl::Span<const I> longs);

  void SendScoredArray(ScoredArray arr, bool with_scores);
  void SendLabeledScoredArray(std::string_view arr_label, ScoredArray arr);
  void StartArray(unsigned len);
  void SendEmptyArray();
};

#define RETURN_ON_PARSE_ERROR(parser, rb)       \
  do {                                          \
    if (auto err = (parser).TakeError(); err) { \
      return (rb)->SendError(err.MakeReply());  \
    }                                           \
  } while (0)

}  // namespace facade


================================================
FILE: src/facade/reply_builder_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/reply_builder.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <facade/resp_parser.h>
#include <mimalloc.h>

#include <random>

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/error.h"
#include "facade/facade_test.h"
#include "facade/redis_parser.h"
#include "facade/reply_capture.h"
#include "facade/resp_expr_test_utils.h"

using namespace testing;
using namespace std;

namespace facade {

namespace {

const std::string_view kErrorStrPreFix = "-ERR ";
constexpr std::string_view kCRLF = "\r\n";
constexpr char kErrorStartChar = '-';
constexpr char kStringStartChar = '+';
constexpr std::string_view kOKMessage = "+OK\r\n";
constexpr char kArrayStart = '*';
constexpr char kBulkString = '$';
constexpr char kIntStart = ':';
const std::string_view kIntStartString = ":";
const std::string_view kNullBulkString = "$-1\r\n";
const std::string_view kBulkStringStart = "$";
const std::string_view kStringStart = "+";
const std::string_view kErrorStart = "-";
const std::string_view kArrayStartString = "*";
constexpr std::size_t kMinPayloadLen = 3;  // the begin type char and "\r\n" at the end

std::string BuildExpectedErrorString(std::string_view msg) {
  if (msg.at(0) == kErrorStartChar) {
    return absl::StrCat(msg, kCRLF);
  } else {
    return absl::StrCat(kErrorStrPreFix, msg, kCRLF);
  }
}

std::string_view GetErrorType(std::string_view err) {
  return err == kSyntaxErr ? kSyntaxErrType : err;
}

}  // namespace

class RedisReplyBuilderTest : public testing::Test {
 public:
  struct ParsingResults {
    RedisParser::Result result = RedisParser::OK;
    RespExpr::Vec args;
    std::uint32_t consumed = 0;

    ParsingResults(std::optional<RESPObj> obj = std::nullopt, size_t buf_pos = 0) {
      if (!obj.has_value() || obj->Empty()) {
        return;
      }

      holder_.emplace(std::move(*obj));

      result = RedisParser::OK;
      consumed = buf_pos;

      if (holder_->GetType() == RESPObj::Type::ARRAY) {
        auto arr = holder_->As<RESPArray>();
        if (!arr.has_value()) {
          result = RedisParser::BAD_ARRAYLEN;
          return;
        }

        args.reserve(arr->Size());
        for (size_t i = 0; i < arr->Size(); ++i) {
          args.push_back(expr_builder_.BuildExpr((*arr)[i]));
        }
        return;
      }

      args.push_back(expr_builder_.BuildExpr(*holder_));
    }

    bool Verify(std::uint32_t expected) const {
      return consumed == expected && result == RedisParser::OK;
    }

    bool IsError() const {
      return result != RedisParser::OK || (args.size() == 1 && args[0].type == RespExpr::ERROR);
    }

    bool IsOk() const {
      return IsString();
    }

    bool IsNull() const {
      return result == RedisParser::OK && args.size() == 1 && args.at(0).type == RespExpr::NIL;
    }

    bool IsString() const {
      return args.size() == 1 && result == RedisParser::OK && args[0].type == RespExpr::STRING;
    }

   private:
    std::optional<RESPObj> holder_;
    RespExprBuilder expr_builder_;
  };

  void SetUp() {
    sink_.Clear();
    builder_.reset(new RedisReplyBuilder(&sink_));
    ResetStats();
  }

  static void SetUpTestSuite() {
    tl_facade_stats = new FacadeStats;
    init_zmalloc_threadlocal(mi_heap_get_backing());
  }

 protected:
  std::vector<std::string_view> RawTokenizedMessage() const {
    CHECK(!str().empty());
    return absl::StrSplit(str(), kCRLF);
  }

  std::string_view str() const {
    return sink_.str();
  }

  std::string TakePayload() {
    std::string ret = sink_.str();
    sink_.Clear();
    return ret;
  }

  std::size_t SinkSize() const {
    return str().size();
  }

  unsigned GetError(string_view err) const {
    const auto& map = SinkReplyBuilder::GetThreadLocalStats().err_count;
    auto it = map.find(err);
    return it == map.end() ? 0 : it->second;
  }

  static bool NoErrors() {
    return tl_facade_stats->reply_stats.err_count.empty();
  }

  static const ReplyStats& GetReplyStats() {
    return tl_facade_stats->reply_stats;
  }

  // Breaks the string we have in sink into tokens.
  // In  RESP each token is build up from series of bytes follow by "\r\n"
  // This function don't try to parse the message, only to break the strings based
  // on the delimiter "\r\n". It is up to the test to verify these tokens
  std::vector<std::string_view> TokenizeMessage() const;

  // Call the redis parser with the data in the sink
  ParsingResults Parse();

  io::StringSink sink_;
  std::unique_ptr<RedisReplyBuilder> builder_;
  std::unique_ptr<std::uint8_t[]> parser_buffer_;
};

std::vector<std::string_view> RedisReplyBuilderTest::TokenizeMessage() const {
  std::vector<std::string_view> message_tokens = RawTokenizedMessage();
  CHECK(message_tokens.back().empty());  // we're expecting to last to be empty as it only has \r\n
  message_tokens.pop_back();             // remove this empty entry
  std::string_view data = str();
  switch (data[0]) {
    case kArrayStart:
      // in the case of array. we cannot tell the expected tokens number without doing parsing for
      // sub elements
      break;
    case kBulkString:
      if (data == kNullBulkString) {
        CHECK(message_tokens.size() == 1)
            << "NULL bulk string should only have one token, got " << message_tokens.size();
      } else {
        CHECK(message_tokens.size() == 2)
            << "bulk string should only have two tokens, got " << message_tokens.size();
      }
      break;
    case kErrorStartChar:
    case kStringStartChar:
    case kIntStart:
      // for errors and string and ints we don't really need to split as there must be only one
      // entry for \r\n
      CHECK(message_tokens.size() == 1)
          << "string/error message must have only one token got " << message_tokens.size();
      break;
    default:
      LOG(FATAL) << "invalid start char [" << data[0] << "]";
      break;
  }
  return message_tokens;
}

std::ostream& operator<<(std::ostream& os, const RedisReplyBuilderTest::ParsingResults& res) {
  os << "result{consumed bytes:" << res.consumed << ", status: " << res.result << " result count "
     << res.args.size() << ", first entry result: ";
  if (!res.args.empty()) {
    if (res.args.size() > 1) {
      os << "ARRAY: ";
    }

    for (const auto& e : res.args) {
      os << e << "\n";
    }
  } else {
    os << "NILL";
  }
  return os << "}";
}

RedisReplyBuilderTest::ParsingResults RedisReplyBuilderTest::Parse() {
  parser_buffer_.reset(new uint8_t[SinkSize()]);
  auto* ptr = parser_buffer_.get();
  memcpy(ptr, str().data(), SinkSize());
  RESPParser parser;
  auto resp_obj = parser.Feed(reinterpret_cast<char*>(ptr), SinkSize());
  size_t buf_pos = parser.BufferPos();
  buf_pos =
      resp_obj && !buf_pos ? SinkSize() : buf_pos;  // after parsing if success buf_pos can be 0

  ParsingResults result(std::move(resp_obj), buf_pos);
  return result;
}

///////////////////////////////////////////////////////////////////////////////

TEST_F(RedisReplyBuilderTest, MessageSend) {
  // Test each message that is "sent" to the sink
  builder_->SinkReplyBuilder::SendOk();
  ASSERT_EQ(TakePayload(), kOKMessage);
  builder_->StartArray(10);

  std::string_view hello_msg = "hello";
  builder_->SendBulkString(hello_msg);
  std::string expected_bulk_string = absl::StrCat(
      "*10\r\n", kBulkStringStart, std::to_string(hello_msg.size()), kCRLF, hello_msg, kCRLF);
  ASSERT_EQ(TakePayload(), expected_bulk_string);
}

TEST_F(RedisReplyBuilderTest, SimpleError) {
  // test with simple error case. This means that we must comply to
  // https://redis.io/docs/reference/protocol-spec/#resp-errors
  std::string_view error = "my error";
  std::string_view empty_type;

  builder_->SendError(error, empty_type);
  // must start with "-" and ends with "\r\n"
  // ASSERT_EQ(sink_.str().at(0), kErrorStartChar);
  ASSERT_TRUE(absl::StartsWith(str(), kErrorStart));
  ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
  ASSERT_EQ(GetError(error), 1);
  ASSERT_EQ(str(), BuildExpectedErrorString(error))
      << " error different from expected - '" << str() << "'";
  auto parsing = Parse();
  ASSERT_TRUE(parsing.Verify(SinkSize()));
  ASSERT_TRUE(parsing.IsError()) << " result: " << parsing;
  EXPECT_THAT(parsing.args, ElementsAre(ErrArg(absl::StrCat("ERR ", error))));

  sink_.Clear();
  builder_->SendError(OpStatus::OK);  // in this case we should not have an error string
  ASSERT_TRUE(absl::StartsWith(str(), kStringStart));
  ASSERT_EQ(str(), kOKMessage);

  ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
  ASSERT_EQ(GetError(error), 1);

  parsing = Parse();
  ASSERT_TRUE(parsing.Verify(SinkSize()));
  ASSERT_TRUE(parsing.IsOk()) << " result: " << parsing;
  EXPECT_THAT(parsing.args, ElementsAre("OK"));
}

TEST_F(RedisReplyBuilderTest, VeryLongError) {
  std::string long_error(10 * 1024, 'X');  // 10KB error
  std::string_view empty_type;

  builder_->SendError(long_error, empty_type);

  ASSERT_TRUE(absl::StartsWith(str(), kErrorStart));
  ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
}

TEST_F(RedisReplyBuilderTest, ErrorBuiltInMessage) {
  OpStatus error_codes[] = {
      OpStatus::KEY_NOTFOUND,  OpStatus::OUT_OF_RANGE,  OpStatus::WRONG_TYPE,
      OpStatus::OUT_OF_MEMORY, OpStatus::INVALID_FLOAT, OpStatus::INVALID_INT,
      OpStatus::SYNTAX_ERR,    OpStatus::BUSY_GROUP,    OpStatus::INVALID_NUMERIC_RESULT};
  for (const auto& err : error_codes) {
    const std::string_view error_name = StatusToMsg(err);
    const std::string_view error_type = GetErrorType(error_name);

    sink_.Clear();
    builder_->SendError(err);
    ASSERT_TRUE(absl::StartsWith(str(), kErrorStart)) << " invalid start char for " << err;
    ASSERT_TRUE(absl::EndsWith(str(), kCRLF)) << " failed to find correct termination at " << err;
    ASSERT_EQ(GetError(error_type), 1) << " number of error count is invalid for " << err;
    ASSERT_EQ(str(), BuildExpectedErrorString(error_name))
        << " error different from expected - '" << str() << "'";

    auto parsing_output = Parse();
    ASSERT_TRUE(parsing_output.Verify(SinkSize()))
        << " verify for the result is invalid for " << err;
    ASSERT_TRUE(parsing_output.IsError()) << " expecting error for " << err;
  }
}

TEST_F(RedisReplyBuilderTest, ErrorReplyBuiltInMessage) {
  ErrorReply err{OpStatus::OUT_OF_RANGE};
  builder_->SendError(err);
  ASSERT_TRUE(absl::StartsWith(str(), kErrorStart));
  ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
  ASSERT_EQ(GetError(kIndexOutOfRange), 1);
  ASSERT_EQ(str(), BuildExpectedErrorString(kIndexOutOfRange));

  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.Verify(SinkSize()));
  ASSERT_TRUE(parsing_output.IsError());
  sink_.Clear();

  err = ErrorReply{"e1", "e2"};
  builder_->SendError(err);
  ASSERT_TRUE(absl::StartsWith(str(), kErrorStart));
  ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
  ASSERT_EQ(GetError("e2"), 1);
  ASSERT_EQ(str(), BuildExpectedErrorString("e1"));

  parsing_output = Parse();
  ASSERT_TRUE(parsing_output.Verify(SinkSize()));
  ASSERT_TRUE(parsing_output.IsError());
}

TEST_F(RedisReplyBuilderTest, ErrorNoneBuiltInMessage) {
  // All these op codes creating the same error message
  OpStatus none_unique_codes[] = {OpStatus::SKIPPED, OpStatus::KEY_EXISTS, OpStatus::INVALID_VALUE,
                                  OpStatus::TIMED_OUT, OpStatus::STREAM_ID_SMALL};
  uint64_t error_count = 0;
  for (const auto& err : none_unique_codes) {
    const std::string_view error_name = StatusToMsg(err);
    const std::string_view error_type = GetErrorType(error_name);

    sink_.Clear();
    builder_->SendError(err);
    ASSERT_TRUE(absl::StartsWith(str(), kErrorStart)) << " invalid start char for " << err;
    ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
    auto current_error_count = GetError(error_type);
    error_count++;
    ASSERT_EQ(current_error_count, error_count) << " number of error count is invalid for " << err;
    auto parsing_output = Parse();
    ASSERT_TRUE(parsing_output.Verify(SinkSize()))
        << " verify for the result is invalid for " << err;

    ASSERT_TRUE(parsing_output.IsError()) << " expecting error for " << err;
  }
}

TEST_F(RedisReplyBuilderTest, StringMessage) {
  // This would test a message that contain a string in it
  // For string this is simple, any string message should start with + and ends with \r\n
  // there can never be more than single \r\n in it as well as no special chars
  const std::string_view payloads[] = {
      "this is a string message", "$$$$$", "12334", "1v%6&*", "@@@", "----", "!!!"};
  for (auto payload : payloads) {
    const std::size_t expected_len = payload.size() + kCRLF.size() + 1;  // include '+' at the start
    sink_.Clear();
    builder_->SendSimpleString(payload);
    ASSERT_EQ(SinkSize(), expected_len);
    ASSERT_TRUE(absl::StartsWith(str(), kStringStart));
    ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
    // auto message_payload = SimpleStringPayload();
    //  ASSERT_EQ(message_payload, payload);
    ASSERT_TRUE(absl::StartsWith(str(), kStringStart));
    ASSERT_TRUE(absl::EndsWith(str(), kCRLF));
    auto data = str();
    data.remove_suffix(kCRLF.size());
    ASSERT_TRUE(absl::EndsWith(data, payload));
  }
}

TEST_F(RedisReplyBuilderTest, EmptyArray) {
  // This test would build an array and try sending it over the "wire"
  // The array starts with the '*', then the number of elements in the array
  // then "\r\n", then each element inside is encoded accordingly
  // an empty array has this "*0\r\n" form
  const std::string_view empty_array = "*0\r\n";
  const std::string_view null_array = "*-1\r\n";
  builder_->StartArray(0);
  ASSERT_EQ(str(), empty_array);

  sink_.Clear();
  builder_->SendNullArray();
  ASSERT_EQ(null_array, str());

  sink_.Clear();
  builder_->SendEmptyArray();
  ASSERT_EQ(str(), empty_array);
}

TEST_F(RedisReplyBuilderTest, StrArray) {
  std::vector<std::string_view> string_vector{"hello", "world", "111", "@3#$^&*~"};
  builder_->StartArray(string_vector.size());
  std::size_t expected_size = kCRLF.size() + 2;
  for (auto s : string_vector) {
    builder_->SendSimpleString(s);
    expected_size += s.size() + kCRLF.size() + 1;
    ASSERT_TRUE(NoErrors());
  }
  ASSERT_EQ(SinkSize(), expected_size);
  // ASSERT_EQ(kArrayStart, str().at(0));
  ASSERT_TRUE(absl::StartsWith(str(), absl::StrCat(kArrayStartString, 4)));
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.Verify(SinkSize()))
      << " invalid parsing for the array message by the parser: " << parsing_output;

  ASSERT_EQ(string_vector.size(), parsing_output.args.size());
  ASSERT_THAT(parsing_output.args,
              ElementsAre(string_vector[0], string_vector[1], string_vector[2], string_vector[3]));

  std::vector<std::string_view> message_tokens = TokenizeMessage();
  ASSERT_THAT(message_tokens, ElementsAre("*4", absl::StrCat(kStringStart, string_vector[0]),
                                          absl::StrCat(kStringStart, string_vector[1]),
                                          absl::StrCat(kStringStart, string_vector[2]),
                                          absl::StrCat(kStringStart, string_vector[3])));
}

TEST_F(RedisReplyBuilderTest, SendSimpleStrArr) {
  // This would send array of strings, but with different API than TestStrArray test
  const std::string_view kArrayMessage[] = {
      // random values
      "+++", "---", "$$$", "~~~~", "@@@", "^^^", "1234", "foo"};
  const std::size_t kArrayLen = sizeof(kArrayMessage) / sizeof(kArrayMessage[0]);
  builder_->SendSimpleStrArr(kArrayMessage);
  ASSERT_TRUE(NoErrors());
  // Tokenize the message and verify content
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  ASSERT_THAT(message_tokens, ElementsAre(absl::StrCat(kArrayStartString, kArrayLen),
                                          absl::StrCat(kStringStart, kArrayMessage[0]),
                                          absl::StrCat(kStringStart, kArrayMessage[1]),
                                          absl::StrCat(kStringStart, kArrayMessage[2]),
                                          absl::StrCat(kStringStart, kArrayMessage[3]),
                                          absl::StrCat(kStringStart, kArrayMessage[4]),
                                          absl::StrCat(kStringStart, kArrayMessage[5]),
                                          absl::StrCat(kStringStart, kArrayMessage[6]),
                                          absl::StrCat(kStringStart, kArrayMessage[7])));

  auto parsed_message = Parse();
  ASSERT_THAT(parsed_message.args,
              ElementsAre(kArrayMessage[0], kArrayMessage[1], kArrayMessage[2], kArrayMessage[3],
                          kArrayMessage[4], kArrayMessage[5], kArrayMessage[6], kArrayMessage[7]));
}

TEST_F(RedisReplyBuilderTest, SendStringViewArr) {
  // This would send array of strings, but with different API than TestStrArray test
  const std::vector<std::string_view> kArrayMessage{
      // random values
      "(((", "}}}", "&&&&", "####", "___", "+++", "0.1234", "bar"};
  builder_->SendBulkStrArr(kArrayMessage);
  ASSERT_TRUE(NoErrors());
  // verify content
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  // the form of this is *<array size>\r\n$<string1 size>\r\n<string1>..$<stringN
  // size>\r\n<stringN>\r\n
  ASSERT_THAT(
      message_tokens,
      ElementsAre(absl::StrCat(kArrayStartString, kArrayMessage.size()),  // array size
                                                                          // size + string 0..N
                  absl::StrCat(kBulkStringStart, kArrayMessage[0].size()), kArrayMessage[0],
                  absl::StrCat(kBulkStringStart, kArrayMessage[1].size()), kArrayMessage[1],
                  absl::StrCat(kBulkStringStart, kArrayMessage[2].size()), kArrayMessage[2],
                  absl::StrCat(kBulkStringStart, kArrayMessage[3].size()), kArrayMessage[3],
                  absl::StrCat(kBulkStringStart, kArrayMessage[4].size()), kArrayMessage[4],
                  absl::StrCat(kBulkStringStart, kArrayMessage[5].size()), kArrayMessage[5],
                  absl::StrCat(kBulkStringStart, kArrayMessage[6].size()), kArrayMessage[6],
                  absl::StrCat(kBulkStringStart, kArrayMessage[7].size()), kArrayMessage[7]));

  // Check the parsed message
  auto parsed_message = Parse();
  ASSERT_THAT(parsed_message.args,
              ElementsAre(kArrayMessage[0], kArrayMessage[1], kArrayMessage[2], kArrayMessage[3],
                          kArrayMessage[4], kArrayMessage[5], kArrayMessage[6], kArrayMessage[7]));
}

TEST_F(RedisReplyBuilderTest, SendBulkStringArr) {
  // This would send array of strings, but with different API than TestStrArray test
  const std::vector<std::string> kArrayMessage{
      // Test this one with large values
      std::string(1024, '.'), std::string(2048, ','), std::string(4096, ' ')};
  builder_->SendBulkStrArr(kArrayMessage);
  ASSERT_TRUE(NoErrors());
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  // the form of this is *<array size>\r\n$<string1 size>\r\n<string1>..$<stringN
  // size>\r\n<stringN>\r\n
  ASSERT_THAT(
      message_tokens,
      ElementsAre(absl::StrCat(kArrayStartString, kArrayMessage.size()),  // array size
                                                                          // size + string 0..N
                  absl::StrCat(kBulkStringStart, kArrayMessage[0].size()), kArrayMessage[0],
                  absl::StrCat(kBulkStringStart, kArrayMessage[1].size()), kArrayMessage[1],
                  absl::StrCat(kBulkStringStart, kArrayMessage[2].size()), kArrayMessage[2]));
  // Check the parsed message
  auto parsed_message = Parse();
  ASSERT_TRUE(parsed_message.Verify(SinkSize()))
      << "message was not successfully parsed: " << parsed_message;
  ASSERT_THAT(parsed_message.args,
              ElementsAre(kArrayMessage[0], kArrayMessage[1], kArrayMessage[2]));
}

TEST_F(RedisReplyBuilderTest, NullBulkString) {
  // null bulk string == "$-1\r\n" i.e. '$' + -1 + \r + \n
  builder_->SendNull();
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(str(), kNullBulkString);
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.Verify(SinkSize()));
  ASSERT_TRUE(parsing_output.IsNull());
  ASSERT_THAT(parsing_output.args, ElementsAre(ArgType(RespExpr::NIL)));
}

TEST_F(RedisReplyBuilderTest, EmptyBulkString) {
  // empty bulk string is in the form of "$0\r\n\r\n", i.e. length 0 after $ follow by \r\n*2
  const std::string_view kEmptyBulkString = "$0\r\n\r\n";
  builder_->SendBulkString(std::string_view{});
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(str(), kEmptyBulkString);
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.Verify(SinkSize()));
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(std::string_view{}));
}

TEST_F(RedisReplyBuilderTest, NoAsciiBulkString) {
  // Bulk string may contain none ascii chars
  const char random_bytes[] = {0x12, 0x25, 0x37};
  std::size_t data_size = sizeof(random_bytes) / sizeof(random_bytes[0]);
  std::string_view none_ascii_payload{random_bytes, data_size};
  builder_->SendBulkString(none_ascii_payload);
  ASSERT_TRUE(NoErrors());
  const std::string expected_payload =
      absl::StrCat(kBulkStringStart, data_size, kCRLF, none_ascii_payload, kCRLF);
  ASSERT_EQ(str(), expected_payload);
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  ASSERT_EQ(message_tokens.size(), 2);  // length and payload
  ASSERT_THAT(message_tokens,
              ElementsAre(absl::StrCat(kBulkStringStart, data_size), none_ascii_payload));
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(none_ascii_payload));
}

TEST_F(RedisReplyBuilderTest, BulkStringWithCRLF) {
  // Verify bulk string that contains the \r\n as payload
  std::string_view crlf_chars{"\r\n"};
  builder_->SendBulkString(crlf_chars);
  ASSERT_TRUE(NoErrors());
  // the expected message in this case is $2\r\n\r\n\r\n
  std::string expected_message =
      absl::StrCat(kBulkStringStart, crlf_chars.size(), kCRLF, crlf_chars, kCRLF);
  ASSERT_EQ(str(), expected_message);
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(crlf_chars));
}

TEST_F(RedisReplyBuilderTest, BulkStringWithStartBulkString) {
  // check a bulk string that contains $<number> as payload
  std::string message = absl::StrCat(kBulkStringStart, "10");
  std::string expected_message =
      absl::StrCat(kBulkStringStart, message.size(), kCRLF, message, kCRLF);
  builder_->SendBulkString(message);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(str(), expected_message);

  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(message));
}

TEST_F(RedisReplyBuilderTest, BulkStringWithStarString) {
  std::string message = absl::StrCat(kStringStart, "a string message");
  std::string expected_message =
      absl::StrCat(kBulkStringStart, message.size(), kCRLF, message, kCRLF);
  builder_->SendBulkString(message);
  ASSERT_EQ(str(), expected_message);
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(message));
}

TEST_F(RedisReplyBuilderTest, BulkStringWithErrorString) {
  std::string message = absl::StrCat(kErrorStrPreFix, kSyntaxErrType);
  std::string expected_message =
      absl::StrCat(kBulkStringStart, message.size(), kCRLF, message, kCRLF);
  builder_->SendBulkString(message);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(str(), expected_message);
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(message));
}

TEST_F(RedisReplyBuilderTest, Int) {
  // message in the form of ":0\r\n" and ":1000\r\n"
  // this message just starts with ':' and ends with \r\n
  // and the payload must be successfully parsed into int type
  const long kPayloadInt = 12345;
  const std::string expected_output = absl::StrCat(kIntStartString, kPayloadInt, kCRLF);
  builder_->SendLong(kPayloadInt);
  ASSERT_EQ(str(), expected_output);
  long value = 0;
  std::string_view expected_payload = str().substr(1, SinkSize() - kMinPayloadLen);
  ASSERT_TRUE(absl::SimpleAtoi(expected_payload, &value));
  ASSERT_EQ(value, kPayloadInt);
  auto parsing_output = Parse();
  ASSERT_THAT(parsing_output.args, ElementsAre(IntArg(kPayloadInt)));
}

TEST_F(RedisReplyBuilderTest, Double) {
  // There is no direct support for double types in RESP
  // to send this, it is sent as bulk string
  const std::string_view kPayloadStr = "23.456";
  double double_value = 0;
  CHECK(absl::SimpleAtod(kPayloadStr, &double_value));
  const std::string expected_payload =
      absl::StrCat(kBulkStringStart, kPayloadStr.size(), kCRLF, kPayloadStr, kCRLF);
  builder_->SendDouble(double_value);
  ASSERT_TRUE(NoErrors());
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  ASSERT_EQ(str(), expected_payload);
  ASSERT_THAT(message_tokens,
              ElementsAre(absl::StrCat(kBulkStringStart, kPayloadStr.size()), kPayloadStr));
  auto parsing_output = Parse();
  ASSERT_TRUE(parsing_output.IsString());
  ASSERT_THAT(parsing_output.args, ElementsAre(kPayloadStr));
}

TEST_F(RedisReplyBuilderTest, MixedTypeArray) {
  // For arrays, we can send an array that contains more than a single type (string/bulk
  // string/simple string/null..) In this test we are verifying that this is actually working. note
  // that this is not part of class RedisReplyBuilder API
  // The entries are:
  // array start
  // bulk string
  // int
  // int
  // simple string
  // simple string
  // empty bulk string
  // double (bulk string)
  std::string long_string(1024, '-');
  const unsigned int kArraySize = 6;
  const char random_bytes[] = {0x12, 0x15, 0x2F};
  const std::string_view kFirstBulkString{random_bytes, 3};
  const long kFirstLongValue = 54321;
  const long kSecondLongValue = 87654321;
  const std::string_view kLongSimpleString{long_string};
  const std::string_view kPayloadDoubleStr = "9987654321.0123";
  double double_value = 0;
  CHECK(absl::SimpleAtod(kPayloadDoubleStr, &double_value));

  builder_->StartArray(kArraySize);
  builder_->SendBulkString(kFirstBulkString);
  builder_->SendLong(kFirstLongValue);
  builder_->SendLong(kSecondLongValue);
  builder_->SendSimpleString(kLongSimpleString);
  // builder_->SendNull();
  builder_->SendBulkString(std::string_view{});
  builder_->SendDouble(double_value);
  const std::string_view output_msg = str();
  ASSERT_FALSE(output_msg.empty());
  ASSERT_TRUE(NoErrors());
  std::vector<std::string_view> message_tokens = TokenizeMessage();
  ASSERT_THAT(
      message_tokens,
      ElementsAre(absl::StrCat(kArrayStartString, kArraySize),  // the length
                  absl::StrCat(kBulkStringStart, kFirstBulkString.size()), kFirstBulkString,
                  absl::StrCat(kIntStartString, kFirstLongValue),
                  absl::StrCat(kIntStartString, kSecondLongValue),
                  absl::StrCat(kStringStart, kLongSimpleString),  // ArgType(RespExpr::NIL),
                  absl::StrCat(kBulkStringStart, "0"), std::string_view{},
                  absl::StrCat(kBulkStringStart, kPayloadDoubleStr.size()), kPayloadDoubleStr));

  // // Now we need to parse it and make sure that its a valid message by the parser as well
  auto parsed_message = Parse();
  ASSERT_THAT(
      parsed_message.args,
      ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::INT64), ArgType(RespExpr::INT64),
                  ArgType(RespExpr::STRING), ArgType(RespExpr::STRING), ArgType(RespExpr::STRING)));
}

TEST_F(RedisReplyBuilderTest, BatchMode) {
  GTEST_SKIP() << "Some differences";

  // Test that when the batch mode is enabled, we are getting the same correct results
  builder_->SetBatchMode(true);
  // Some random values and sizes
  const std::vector<std::string> kInputArray{
      std::string(10, 'p'),  std::string(48, 'o'),  std::string(67, 'y'),
      std::string(167, 'e'), std::string(478, '*'), std::string(164, 't'),
  };
  builder_->StartArray(kInputArray.size());
  ASSERT_EQ(SinkSize(), 0);
  int count = 0;
  std::size_t total_bytes = 0;
  for (const auto& val : kInputArray) {
    builder_->SendBulkString(val);
    ASSERT_EQ(SinkSize(), 0) << " sink is not empty at iteration number " << count;
    ASSERT_EQ(GetReplyStats().io_write_bytes, 0);
    ASSERT_EQ(GetReplyStats().io_write_cnt, 0);
    total_bytes += val.size();
    ++count;
  }
  // in order to actually see the message, we need to disable the batching, then
  // write something
  builder_->SetBatchMode(false);
  builder_->SendBulkString(std::string_view{});
  ASSERT_EQ(GetReplyStats().io_write_cnt, 1);
  // We expecting to have more than the total bytes we count,
  // since we are not counting the \r\n and the type char as well
  // as length entries
  ASSERT_GT(GetReplyStats().io_write_bytes, total_bytes);
  std::vector<std::string_view> array_members = TokenizeMessage();
  ASSERT_THAT(array_members,
              ElementsAre(absl::StrCat(kArrayStartString, kInputArray.size()),
                          absl::StrCat(kBulkStringStart, kInputArray[0].size()), kInputArray[0],
                          absl::StrCat(kBulkStringStart, kInputArray[1].size()), kInputArray[1],
                          absl::StrCat(kBulkStringStart, kInputArray[2].size()), kInputArray[2],
                          absl::StrCat(kBulkStringStart, kInputArray[3].size()), kInputArray[3],
                          absl::StrCat(kBulkStringStart, kInputArray[4].size()), kInputArray[4],
                          absl::StrCat(kBulkStringStart, kInputArray[5].size()), kInputArray[5],
                          absl::StrCat(kBulkStringStart, "0"), std::string_view{}));
}

TEST_F(RedisReplyBuilderTest, Resp3Double) {
  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendDouble(5.5);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(str(), ",5.5\r\n");
}

TEST_F(RedisReplyBuilderTest, Resp3NullString) {
  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendNull();
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "_\r\n");
}

TEST_F(RedisReplyBuilderTest, SendStringArrayAsMap) {
  const std::vector<std::string> map_array{"k1", "v1", "k2", "v2"};

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendBulkStrArr(map_array, CollectionType::MAP);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "*4\r\n$2\r\nk1\r\n$2\r\nv1\r\n$2\r\nk2\r\n$2\r\nv2\r\n")
      << "SendStringArrayAsMap Resp2 Failed.";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendBulkStrArr(map_array, CollectionType::MAP);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "%2\r\n$2\r\nk1\r\n$2\r\nv1\r\n$2\r\nk2\r\n$2\r\nv2\r\n")
      << "SendStringArrayAsMap Resp3 Failed.";
}

TEST_F(RedisReplyBuilderTest, SendStringArrayAsSet) {
  const std::vector<std::string> set_array{"e1", "e2", "e3"};

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendBulkStrArr(set_array, CollectionType::SET);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n")
      << "SendStringArrayAsSet Resp2 Failed.";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendBulkStrArr(set_array, CollectionType::SET);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "~3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n")
      << "SendStringArrayAsSet Resp3 Failed.";
}

TEST_F(RedisReplyBuilderTest, SendScoredArray) {
  const std::vector<std::pair<std::string, double>> scored_array{
      {"e1", 1.1}, {"e2", 2.2}, {"e3", 3.3}};

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendScoredArray(scored_array, false);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n")
      << "Resp2 WITHOUT scores failed.";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendScoredArray(scored_array, false);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n")
      << "Resp3 WITHOUT scores failed.";

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendScoredArray(scored_array, true);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(),
            "*6\r\n$2\r\ne1\r\n$3\r\n1.1\r\n$2\r\ne2\r\n$3\r\n2.2\r\n$2\r\ne3\r\n$3\r\n3.3\r\n")
      << "Resp3 WITHSCORES failed.";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendScoredArray(scored_array, true);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(),
            "*3\r\n*2\r\n$2\r\ne1\r\n,1.1\r\n*2\r\n$2\r\ne2\r\n,2.2\r\n*2\r\n$2\r\ne3\r\n,3.3\r\n")
      << "Resp3 WITHSCORES failed.";
}

TEST_F(RedisReplyBuilderTest, SendLabeledScoredArray) {
  const std::vector<std::pair<std::string, double>> scored_array{
      {"e1", 1.1}, {"e2", 2.2}, {"e3", 3.3}};

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendLabeledScoredArray("foobar", scored_array);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(),
            "*2\r\n$6\r\nfoobar\r\n*3\r\n*2\r\n$2\r\ne1\r\n$3\r\n1.1\r\n*2\r\n$2\r\ne2\r\n$3\r\n2."
            "2\r\n*2\r\n$2\r\ne3\r\n$3\r\n3.3\r\n")
      << "Resp3 failed.\n";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendLabeledScoredArray("foobar", scored_array);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(),
            "*2\r\n$6\r\nfoobar\r\n*3\r\n*2\r\n$2\r\ne1\r\n,1.1\r\n*2\r\n$2\r\ne2\r\n,2.2\r\n*"
            "2\r\n$2\r\ne3\r\n,3.3\r\n")
      << "Resp3 failed.";
}

TEST_F(RedisReplyBuilderTest, BasicCapture) {
  using namespace std;
  string_view kTestSws[] = {"a1"sv, "a2"sv, "a3"sv, "a4"sv};

  CapturingReplyBuilder crb{};
  using RRB = RedisReplyBuilder;

  auto big_arr_cb = [](RRB* r) {
    r->StartArray(4);
    {
      r->StartArray(2);
      r->SendLong(1);
      r->StartArray(2);
      {
        r->SendLong(2);
        r->SendLong(3);
      }
    }
    r->SendLong(4);
    {
      r->StartArray(2);
      {
        r->StartArray(2);
        r->SendLong(5);
        r->SendLong(6);
      }
      r->SendLong(7);
    }
    r->SendLong(8);
  };

  function<void(RRB*)> funcs[] = {
      [](RRB* r) { r->SendNull(); },
      [](RRB* r) { r->SendLong(1L); },
      [](RRB* r) { r->SendDouble(6.7); },
      [](RRB* r) { r->SendSimpleString("ok"); },
      [](RRB* r) { r->SendEmptyArray(); },
      [](RRB* r) { r->SendNullArray(); },
      [](RRB* r) { r->SendError("e1", "e2"); },
      [kTestSws](RRB* r) { r->SendSimpleStrArr(kTestSws); },
      [kTestSws](RRB* r) { r->SendBulkStrArr(kTestSws); },
      [kTestSws](RRB* r) { r->SendBulkStrArr(kTestSws, CollectionType::SET); },
      [kTestSws](RRB* r) { r->SendBulkStrArr(kTestSws, CollectionType::MAP); },
      [kTestSws](RRB* r) {
        r->StartArray(3);
        r->SendLong(1L);
        r->SendDouble(2.5);
        r->SendSimpleStrArr(kTestSws);
      },
      big_arr_cb,
  };

  crb.SetRespVersion(RespVersion::kResp3);
  builder_->SetRespVersion(RespVersion::kResp3);

  // Run generator functions on both a regular redis builder
  // and the capturing builder with its capture applied.
  for (auto& f : funcs) {
    f(builder_.get());
    auto expected = TakePayload();
    f(&crb);
    CapturingReplyBuilder::Apply(crb.Take(), builder_.get());
    auto actual = TakePayload();
    EXPECT_EQ(expected, actual);
  }

  builder_->SetRespVersion(RespVersion::kResp2);
}

TEST_F(RedisReplyBuilderTest, FormatDouble) {
  char buf[64];

  auto format = [&](double d) { return RedisReplyBuilder::FormatDouble(d, buf, sizeof(buf)); };

  EXPECT_STREQ("0.1", format(0.1));
  EXPECT_STREQ("0.2", format(0.2));
  EXPECT_STREQ("0.8", format(0.8));
  EXPECT_STREQ("1.1", format(1.1));
  EXPECT_STREQ("inf", format(INFINITY));
  EXPECT_STREQ("-inf", format(-INFINITY));
  EXPECT_STREQ("0", format(-0.0));
  EXPECT_STREQ("1e-7", format(0.0000001));
  EXPECT_STREQ("111111111111111110000", format(111111111111111111111.0));
  EXPECT_STREQ("1.1111111111111111e+21", format(1111111111111111111111.0));
  EXPECT_STREQ("1e-23", format(1e-23));
}

TEST_F(RedisReplyBuilderTest, VerbatimString) {
  // test resp3
  std::string str = "A simple string!";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendVerbatimString(str, RedisReplyBuilder::VerbatimFormat::TXT);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "=20\r\ntxt:A simple string!\r\n") << "Resp3 VerbatimString TXT failed.";

  builder_->SetRespVersion(RespVersion::kResp3);
  builder_->SendVerbatimString(str, RedisReplyBuilder::VerbatimFormat::MARKDOWN);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "=20\r\nmkd:A simple string!\r\n") << "Resp3 VerbatimString TXT failed.";

  builder_->SetRespVersion(RespVersion::kResp2);
  builder_->SendVerbatimString(str);
  ASSERT_TRUE(NoErrors());
  ASSERT_EQ(TakePayload(), "$16\r\nA simple string!\r\n") << "Resp3 VerbatimString TXT failed.";
}

TEST_F(RedisReplyBuilderTest, Issue3449) {
  vector<string> records;
  for (unsigned i = 0; i < 10'000; ++i) {
    records.push_back(absl::StrCat(i));
  }
  builder_->SendBulkStrArr(records);
  ASSERT_TRUE(NoErrors());
  ParsingResults parse_result = Parse();
  ASSERT_FALSE(parse_result.IsError());
  EXPECT_EQ(10000, parse_result.args.size());
}

TEST_F(RedisReplyBuilderTest, Issue4424) {
  vector<string> records;
  for (unsigned i = 0; i < 800; ++i) {
    records.push_back(string(100, 'a'));
  }

  for (unsigned j = 0; j < 2; ++j) {
    builder_->SendBulkStrArr(records);
    ASSERT_TRUE(NoErrors());
    ParsingResults parse_result = Parse();
    ASSERT_FALSE(parse_result.IsError()) << int(parse_result.result);
    ASSERT_TRUE(parse_result.Verify(SinkSize()));
    EXPECT_EQ(800, parse_result.args.size());
    sink_.Clear();
  }
}

TEST_F(RedisReplyBuilderTest, MCMetaGetLargeValue) {
  io::StringSink mc_sink;
  MCReplyBuilder mc_builder(&mc_sink);

  MemcacheCmdFlags flags;
  flags.meta = true;
  flags.return_value = true;

  string large_val(16000, 'x');
  mc_builder.SendValue(flags, "key", large_val, 0, 0, 0);

  string_view output = mc_sink.str();
  EXPECT_THAT(output, HasSubstr("VA 16000"));
  EXPECT_THAT(output, HasSubstr(large_val));
}

static void BM_FormatDouble(benchmark::State& state) {
  vector<double> values;
  char buf[64];

  uniform_real_distribution<double> unif(0, 1e9);
  default_random_engine re;
  for (unsigned i = 0; i < 100; i++) {
    values.push_back(unif(re));
  }

  while (state.KeepRunning()) {
    for (auto d : values) {
      RedisReplyBuilder::FormatDouble(d, buf, sizeof(buf));
    }
  }
}
BENCHMARK(BM_FormatDouble);

}  // namespace facade


================================================
FILE: src/facade/reply_capture.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/reply_capture.h"

#include "absl/types/span.h"
#include "base/logging.h"
#include "reply_capture.h"

#define SKIP_LESS(needed)     \
  replies_recorded_++;        \
  if (reply_mode_ < needed) { \
    current_ = monostate{};   \
    return;                   \
  }
namespace facade {

using namespace std;
using namespace payload;

void CapturingReplyBuilder::SendError(std::string_view str, std::string_view type) {
  last_error_ = str;
  SKIP_LESS(ReplyMode::ONLY_ERR);
  Capture(make_error(str, type));
}

void CapturingReplyBuilder::SendNullArray() {
  SKIP_LESS(ReplyMode::FULL);
  Capture(unique_ptr<CollectionPayload>{nullptr});
}

void CapturingReplyBuilder::SendNull() {
  SKIP_LESS(ReplyMode::FULL);
  Capture(nullptr_t{});
}

void CapturingReplyBuilder::SendLong(long val) {
  SKIP_LESS(ReplyMode::FULL);
  Capture(val);
}

void CapturingReplyBuilder::SendDouble(double val) {
  SKIP_LESS(ReplyMode::FULL);
  Capture(val);
}

void CapturingReplyBuilder::SendSimpleString(std::string_view str) {
  SKIP_LESS(ReplyMode::FULL);
  Capture(SimpleString{string{str}});
}

void CapturingReplyBuilder::SendBulkString(std::string_view str) {
  SKIP_LESS(ReplyMode::FULL);
  Capture(BulkString{string{str}});
}

void CapturingReplyBuilder::StartCollection(unsigned len, CollectionType type) {
  SKIP_LESS(ReplyMode::FULL);
  stack_.emplace(make_unique<CollectionPayload>(len, type),
                 type == CollectionType::MAP ? len * 2 : len);

  // If we added an empty collection, it must be collapsed immediately.
  CollapseFilledCollections();
}

CapturingReplyBuilder::Payload CapturingReplyBuilder::Take() {
  CHECK(stack_.empty());
  Payload pl = std::move(current_);
  current_ = monostate{};
  return pl;
}

void CapturingReplyBuilder::SendDirect(Payload&& val) {
  replies_recorded_ += !holds_alternative<monostate>(val);
  bool is_err = holds_alternative<Error>(val);
  ReplyMode min_mode = is_err ? ReplyMode::ONLY_ERR : ReplyMode::FULL;
  if (reply_mode_ >= min_mode) {
    DCHECK_EQ(current_.index(), 0u);
    current_ = std::move(val);
  } else {
    current_ = monostate{};
  }
}

void CapturingReplyBuilder::Capture(Payload val, bool collapse_if_needed) {
  if (!stack_.empty()) {
    auto& last = stack_.top();
    last.first->arr.push_back(std::move(val));
    if (last.second-- == 1 && collapse_if_needed) {
      CollapseFilledCollections();
    }
  } else {
    DCHECK_EQ(current_.index(), 0u);
    current_ = std::move(val);
  }
}

void CapturingReplyBuilder::CollapseFilledCollections() {
  while (!stack_.empty() && stack_.top().second == 0) {
    auto pl = std::move(stack_.top());
    stack_.pop();
    Capture(std::move(pl.first), false);
  }
}

struct CaptureVisitor {
  void operator()(monostate) {
  }

  void operator()(long v) {
    rb->SendLong(v);
  }

  void operator()(double v) {
    static_cast<RedisReplyBuilder*>(rb)->SendDouble(v);
  }

  void operator()(const payload::SimpleString& ss) {
    rb->SendSimpleString(ss);
  }

  void operator()(const payload::BulkString& bs) {
    static_cast<RedisReplyBuilder*>(rb)->SendBulkString(bs);
  }

  void operator()(payload::Null) {
    static_cast<RedisReplyBuilder*>(rb)->SendNull();
  }

  void operator()(const payload::Error& err) {
    rb->SendError(err->first, err->second);
  }

  void operator()(const unique_ptr<payload::CollectionPayload>& cp) {
    auto* builder = static_cast<RedisReplyBuilder*>(rb);
    if (!cp) {
      builder->SendNullArray();
      return;
    }
    if (cp->len == 0 && cp->type == CollectionType::ARRAY) {
      builder->SendEmptyArray();
      return;
    }
    builder->StartCollection(cp->len, cp->type);
    for (auto& pl : cp->arr)
      visit(*this, std::move(pl));
  }

  SinkReplyBuilder* rb;
};

void CapturingReplyBuilder::Apply(Payload&& pl, SinkReplyBuilder* rb) {
  if (auto* crb = dynamic_cast<CapturingReplyBuilder*>(rb); crb != nullptr) {
    crb->SendDirect(std::move(pl));
    return;
  }

  CaptureVisitor cv{rb};
  visit(cv, std::move(pl));
}

void CapturingReplyBuilder::SetReplyMode(ReplyMode mode) {
  reply_mode_ = mode;
  current_ = monostate{};
}

optional<CapturingReplyBuilder::ErrorRef> CapturingReplyBuilder::TryExtractError(
    const Payload& pl) {
  if (auto* err = get_if<Error>(&pl); err != nullptr) {
    return ErrorRef{(*err)->first, (*err)->second};
  }
  return nullopt;
}

}  // namespace facade


================================================
FILE: src/facade/reply_capture.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <stack>
#include <string_view>
#include <variant>

#include "facade/reply_builder.h"
#include "facade/reply_mode.h"
#include "facade/reply_payload.h"

namespace facade {

struct CaptureVisitor;

// CapturingReplyBuilder allows capturing replies and retrieveing them with Take().
// Those replies can be stored standalone and sent with
// CapturingReplyBuilder::Apply() to another reply builder.
class CapturingReplyBuilder : public RedisReplyBuilder {
  friend struct CaptureVisitor;

 public:
  using RedisReplyBuilder::SendError;
  void SendError(std::string_view str, std::string_view type) override;

  void SendLong(long val) override;
  void SendDouble(double val) override;
  void SendSimpleString(std::string_view str) override;
  void SendBulkString(std::string_view str) override;

  void StartCollection(unsigned len, CollectionType type) override;
  void SendNullArray() override;
  void SendNull() override;

  explicit CapturingReplyBuilder(ReplyMode mode = ReplyMode::FULL,
                                 RespVersion resp_v = RespVersion::kResp2)
      : RedisReplyBuilder{nullptr}, reply_mode_{mode} {
    SetRespVersion(resp_v);
  }

  using Payload = payload::Payload;

  // Non owned Error based on SendError arguments (msg, type)
  using ErrorRef = std::pair<std::string_view, std::string_view>;

  void SetReplyMode(ReplyMode mode);

  // Take payload and clear state.
  Payload Take();

  // Send payload to builder.
  static void Apply(Payload&& pl, SinkReplyBuilder* builder);

  // If an error is stored inside payload, get a reference to it.
  static std::optional<ErrorRef> TryExtractError(const Payload& pl);

 private:
  // Send payload directly, bypassing external interface. For efficient passing between two
  // captures.
  void SendDirect(Payload&& val);

  // Capture value and store eiter in current topmost collection or as a standalone value.
  void Capture(Payload val, bool collapse_if_needed = true);

  // While topmost collection in stack is full, finalize it and add it as a regular value.
  void CollapseFilledCollections();

  ReplyMode reply_mode_;

  // List of nested active collections that are being built.
  std::stack<std::pair<std::unique_ptr<payload::CollectionPayload>, int>> stack_;

  // Root payload.
  Payload current_;
};

}  // namespace facade


================================================
FILE: src/facade/reply_mode.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

namespace facade {

// Reply mode allows filtering replies.
enum class ReplyMode {
  NONE,      // No replies are recorded
  ONLY_ERR,  // Only errors are recorded
  FULL       // All replies are recorded
};

class RedisReplyBuilder;

}  // namespace facade


================================================
FILE: src/facade/reply_payload.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <string>
#include <variant>

#include "base/function2.hpp"
#include "facade/facade_types.h"

namespace facade {

class SinkReplyBuilder;
namespace payload {

// SendError (msg, type)
using Error = std::unique_ptr<std::pair<std::string, std::string>>;
using Null = std::nullptr_t;  // SendNull or SendNullArray

struct CollectionPayload;
struct SimpleString : public std::string {};  // SendSimpleString
struct BulkString : public std::string {};    // SendBulkString

using Payload = std::variant<std::monostate, Null, Error, long, double, SimpleString, BulkString,
                             std::unique_ptr<CollectionPayload>>;

#ifdef __linux__
static_assert(sizeof(Payload) == 40);
#endif

struct CollectionPayload {
  CollectionPayload(unsigned _len, CollectionType _type) : len{_len}, type{_type} {
    arr.reserve(type == CollectionType::MAP ? len * 2 : len);
  }

  unsigned len;
  CollectionType type;
  std::vector<Payload> arr;
};

inline Error make_error(std::string_view msg, std::string_view type = "") {
  return std::make_unique<std::pair<std::string, std::string>>(msg, type);
}

inline Payload make_simple_or_noreply(std::string_view resp) {
  if (resp.empty())
    return std::monostate{};
  else
    return SimpleString{std::string(resp)};
}

}  // namespace payload
}  // namespace facade


================================================
FILE: src/facade/resp_expr.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/resp_expr.h"

#include "base/logging.h"

namespace facade {

void FillBackedArgs(const RespVec& src, cmn::BackedArguments* dest) {
  auto map = [](const RespExpr& expr) { return expr.GetView(); };
  auto range = base::it::Transform(map, base::it::Range(src.begin(), src.end()));

  dest->Assign(range.begin(), range.end(), src.size());
}

}  // namespace facade


================================================
FILE: src/facade/resp_expr.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/strings/ascii.h>
#include <absl/types/span.h>

#include <optional>
#include <string_view>
#include <variant>
#include <vector>

#include "facade/facade_types.h"

namespace facade {

class RespExpr {
 public:
  using Buffer = absl::Span<const uint8_t>;

  enum Type : uint8_t { STRING, ARRAY, INT64, DOUBLE, NIL, NIL_ARRAY, ERROR };

  using Vec = std::vector<RespExpr>;
  Type type;
  bool has_support;  // whether pointers in this item are supported by the external storage.

  std::variant<int64_t, double, Buffer, Vec*> u;

  RespExpr(Type t = NIL) : type(t), has_support(false) {
  }

  static Buffer buffer(std::string* s) {
    return Buffer{reinterpret_cast<uint8_t*>(s->data()), s->size()};
  }

  std::string_view GetView() const {
    Buffer buffer = GetBuf();
    return {reinterpret_cast<const char*>(buffer.data()), buffer.size()};
  }

  std::string GetString() const {
    return std::string(GetView());
  }

  Buffer GetBuf() const {
    return std::get<Buffer>(u);
  }

  const Vec& GetVec() const {
    return *std::get<Vec*>(u);
  }

  std::optional<int64_t> GetInt() const {
    return std::holds_alternative<int64_t>(u) ? std::make_optional(std::get<int64_t>(u))
                                              : std::nullopt;
  }

  size_t UsedMemory() const {
    return 0;
  }

  static const char* TypeName(Type t);
};

using RespVec = RespExpr::Vec;
using RespSpan = absl::Span<const RespExpr>;

inline std::string_view ToSV(RespExpr::Buffer buf) {
  return std::string_view{reinterpret_cast<const char*>(buf.data()), buf.size()};
}

void FillBackedArgs(const RespVec& src, cmn::BackedArguments* dest);

}  // namespace facade

namespace std {

ostream& operator<<(ostream& os, const facade::RespExpr& e);
ostream& operator<<(ostream& os, facade::RespSpan rspan);

}  // namespace std


================================================
FILE: src/facade/resp_expr_test_utils.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/resp_expr_test_utils.h"

#include <cstddef>
#include <cstring>

namespace facade {

RespExpr RespExprBuilder::BuildExpr(const RESPObj& obj) {
  RespExpr expr{RespExpr::NIL};

  switch (obj.GetType()) {
    case RESPObj::Type::INTEGER: {
      expr.type = RespExpr::INT64;
      expr.u = obj.As<int64_t>().value();
      break;
    }
    case RESPObj::Type::DOUBLE: {
      expr.type = RespExpr::DOUBLE;
      expr.u = obj.As<double>().value();
      break;
    }
    case RESPObj::Type::NIL: {
      expr.type = RespExpr::NIL;
      break;
    }
    case RESPObj::Type::ERROR: {
      expr.type = RespExpr::ERROR;
      SetStringPayload(obj, &expr);
      break;
    }
    case RESPObj::Type::STRING:
    case RESPObj::Type::REPLY_STATUS: {
      expr.type = RespExpr::STRING;
      SetStringPayload(obj, &expr);
      break;
    }
    case RESPObj::Type::ARRAY:
    case RESPObj::Type::MAP:
    case RESPObj::Type::SET: {
      auto arr = obj.As<RESPArray>();
      if (arr.has_value()) {
        // Check if this is a null array (elements == SIZE_MAX which represents -1)
        if (arr->Size() == SIZE_MAX) {
          expr.type = RespExpr::NIL_ARRAY;
          expr.u.emplace<RespExpr::Vec*>(nullptr);
        } else {
          expr.type = RespExpr::ARRAY;
          auto vec = std::make_unique<RespExpr::Vec>();
          vec->reserve(arr->Size());
          for (size_t i = 0; i < arr->Size(); ++i) {
            vec->push_back(BuildExpr((*arr)[i]));
          }
          expr.u = vec.get();
          owned_arrays_.emplace_back(std::move(vec));
          expr.has_support = true;
        }
      }
      break;
    }
  }

  return expr;
}

void RespExprBuilder::SetStringPayload(const RESPObj& obj, RespExpr* expr) {
  auto sv = obj.As<std::string_view>().value_or(std::string_view{});
  // Copy the string data so we don't hold references into zmalloc-allocated
  // hiredis replies. The replies can then be freed on their allocating thread.
  auto owned = std::make_unique<char[]>(sv.size());
  memcpy(owned.get(), sv.data(), sv.size());
  expr->u = RespExpr::Buffer{reinterpret_cast<const uint8_t*>(owned.get()), sv.size()};
  expr->has_support = true;
  owned_strings_.emplace_back(std::move(owned));
}

}  // namespace facade


================================================
FILE: src/facade/resp_expr_test_utils.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <optional>
#include <vector>

#include "facade/resp_expr.h"
#include "facade/resp_parser.h"

namespace facade {

class RespExprBuilder {
 public:
  RespExpr BuildExpr(const RESPObj& obj);

  void Clear() {
    owned_arrays_.clear();
    // Note: owned_strings_ is NOT cleared here because test code may still hold
    // string_view/Buffer references to data from prior ParseResponse calls
    // (e.g., SHA values, DUMP payloads). This mirrors the old behavior where
    // tmp_str_vec_ accumulated across calls within a test.
  }

 private:
  void SetStringPayload(const RESPObj& obj, RespExpr* expr);

  std::vector<std::unique_ptr<RespExpr::Vec>> owned_arrays_;
  // Own copies of string data so we don't hold references to zmalloc-allocated
  // hiredis replies (which must be freed on the same thread they were allocated).
  std::vector<std::unique_ptr<char[]>> owned_strings_;
};

}  // namespace facade


================================================
FILE: src/facade/resp_parser.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/resp_parser.h"

#include <cstring>

#include "base/logging.h"

extern "C" {
#include "redis/hiredis.h"
}

namespace facade {

RESPParser::RESPParser() {
  reader_ = redisReaderCreate();
}

RESPObj::RESPObj(RESPObj&& other) noexcept
    : reply_(other.reply_), needs_to_free_(other.needs_to_free_) {
  other.reply_ = nullptr;
  other.needs_to_free_ = false;
}

RESPObj& RESPObj::operator=(RESPObj&& other) noexcept {
  std::swap(needs_to_free_, other.needs_to_free_);
  std::swap(reply_, other.reply_);
  return *this;
}

RESPObj::~RESPObj() {
  if (needs_to_free_)
    freeReplyObject(reply_);
}

RESPObj::Type RESPObj::GetType() const {
  DCHECK(reply_);
  return static_cast<Type>(reply_->type);
}

size_t RESPObj::Size() const {
  if (!reply_)
    return 0;
  Type type = GetType();
  return (type == Type::ARRAY || type == Type::MAP || type == Type::SET) ? reply_->elements : 1;
}

std::optional<RESPObj> RESPParser::Feed(const char* data, size_t len) {
  int status = REDIS_OK;
  if (len != 0) {  // if no new data we check is previoud data produced a reply
    status = redisReaderFeed(reader_, data, len);
    if (status != REDIS_OK) {
      LOG(ERROR) << "RESP parser error: " << status << " description: " << reader_->errstr
                 << " data: " << std::string_view{data, len};
      return std::nullopt;
    }
  }
  void* reply_obj = nullptr;
  status = redisReaderGetReply(reader_, &reply_obj);
  if (status != REDIS_OK) {
    LOG(ERROR) << "RESP parser error: " << status << " description: " << reader_->errstr
               << " data: " << data;
    return std::nullopt;
  }

  return RESPObj(static_cast<redisReply*>(reply_obj), reply_obj != nullptr);
}

std::ostream& operator<<(std::ostream& os, const RESPObj& obj) {
  if (obj.Empty()) {
    os << "nullptr RESPObj";
    return os;
  }
  switch (obj.GetType()) {
    // because we check type we don't expect As<T> to return nullopt here
    case RESPObj::Type::INTEGER: {
      os << *obj.As<std::int64_t>();
      break;
    }
    case RESPObj::Type::DOUBLE: {
      os << *obj.As<double>();
      break;
    }
    case RESPObj::Type::ARRAY: {
      os << *obj.As<RESPArray>();
      break;
    }
    case RESPObj::Type::MAP:
      [[fallthrough]];
    case RESPObj::Type::SET: {
      os << *obj.As<RESPArray>();
      break;
    }
    case RESPObj::Type::STRING:
      [[fallthrough]];
    case RESPObj::Type::NIL:
      [[fallthrough]];
    case RESPObj::Type::ERROR:
      [[fallthrough]];
    case RESPObj::Type::REPLY_STATUS: {
      os << *obj.As<std::string_view>();
      break;
    }
    default:
      os << "Unknown RESPObj type: " << static_cast<int>(obj.GetType());
  }
  return os;
}

std::ostream& operator<<(std::ostream& os, const RESPArray& arr) {
  os << "[";
  for (int64_t i = 0; i < (int64_t)arr.Size() - 1; ++i) {
    os << arr[i] << ", ";
  }
  os << arr[arr.Size() - 1] << "]";
  return os;
}

}  // namespace facade


================================================
FILE: src/facade/resp_parser.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <cassert>
#include <memory>
#include <optional>
#include <tuple>

#include "io/io.h"
extern "C" {
#include "redis/hiredis.h"
}

namespace facade {

class RESPArray;
class RESPIterator;

class RESPObj {
 public:
  enum class Type {
    STRING = REDIS_REPLY_STRING,
    ARRAY = REDIS_REPLY_ARRAY,
    INTEGER = REDIS_REPLY_INTEGER,
    NIL = REDIS_REPLY_NIL,
    REPLY_STATUS = REDIS_REPLY_STATUS,
    DOUBLE = REDIS_REPLY_DOUBLE,
    ERROR = REDIS_REPLY_ERROR,
    MAP = REDIS_REPLY_MAP,
    SET = REDIS_REPLY_SET,
  };
  RESPObj() = default;
  RESPObj(redisReply* reply, bool needs_to_free) : reply_(reply), needs_to_free_(needs_to_free) {
  }

  // TODO remove copy ctor, because it is not a deep copy
  RESPObj(const RESPObj& other) : reply_(other.reply_), needs_to_free_(false) {
  }
  RESPObj& operator=(const RESPObj& other) = delete;

  RESPObj(RESPObj&& other) noexcept;
  RESPObj& operator=(RESPObj&& other) noexcept;

  ~RESPObj();

  bool Empty() const {
    return reply_ == nullptr;
  }

  size_t Size() const;

  Type GetType() const;

  template <class T> std::optional<T> As() const;

 private:
  redisReply* reply_ = nullptr;
  bool needs_to_free_ = true;
};

class RESPArray {
 public:
  RESPArray(redisReply* arr_obj = nullptr) : arr_obj_(arr_obj) {
  }

  size_t Size() const {
    return arr_obj_->elements;
  }

  bool Empty() const {
    return Size() == 0;
  }

  RESPObj operator[](size_t index) const {
    return RESPObj(arr_obj_->element[index], false);
  }

 private:
  redisReply* arr_obj_ = nullptr;
};

class RESPParser {
 public:
  RESPParser();
  ~RESPParser() {
    redisReaderFree(reader_);
  }

  std::optional<RESPObj> Feed(const char* data, size_t len);

  size_t BufferPos() const {
    return reader_->pos;
  }

 private:
  redisReader* reader_;
};

std::ostream& operator<<(std::ostream& os, const RESPObj& obj);
std::ostream& operator<<(std::ostream& os, const RESPArray& arr);

class RESPIterator {
 public:
  RESPIterator() = default;
  RESPIterator(const RESPObj& obj) : obj_(obj) {
  }

  RESPIterator(RESPIterator&&) = default;
  RESPIterator& operator=(RESPIterator&&) = default;

  bool HasNext() const {
    return index_ < obj_.Size();
  }

  bool HasError() const {
    return index_ == std::numeric_limits<decltype(index_)>::max();
  }

  // Consume next values and return as tuple or single value
  // if extraction fails, set error state
  template <class T = std::string_view, class... Ts> auto Next() {
    std::conditional_t<sizeof...(Ts) == 0, T, std::tuple<T, Ts...>> res{};
    bool success = true;
    if constexpr (sizeof...(Ts) == 0) {
      success = Check(&res);
    } else {
      success = std::apply([this](auto&... args) { return Check<T, Ts...>(&args...); }, res);
    }
    SetError(!success);
    return res;
  }

  // increase index only if all args are successfully extracted
  template <class Arg, class... Args> bool Check(Arg* arg, Args*... args) {
    auto tmp_index = index_;
    if (index_ + sizeof...(Args) < obj_.Size()) {
      if (auto arr = obj_.As<RESPArray>(); arr.has_value()) {
        if (GetEntry(*arr, index_++, arg) && (GetEntry(*arr, index_++, args) && ...)) {
          return true;
        }
      } else if (auto val = obj_.As<Arg>(); val.has_value()) {
        assert(sizeof...(Args) == 0 && index_ == 0);
        *arg = std::move(*val);
        return true;
      }
    }
    index_ = tmp_index;
    return false;
  }

  void SetError(bool set = true) {
    if (set)
      index_ = std::numeric_limits<decltype(index_)>::max();
  }

 private:
  template <class Arg> bool GetEntry(const RESPArray& arr, size_t idx, Arg* arg) {
    if (auto val = arr[idx].As<Arg>(); val.has_value()) {
      *arg = std::move(*val);

      return true;
    }
    return false;
  }

 private:
  RESPObj obj_;
  size_t index_ = 0;
};

template <class T> std::optional<T> RESPObj::As() const {
  if (!reply_) {
    return std::nullopt;
  }
  if constexpr (std::is_constructible_v<T, std::string_view>) {
    if (reply_->type == REDIS_REPLY_STRING || reply_->type == REDIS_REPLY_ERROR ||
        reply_->type == REDIS_REPLY_STATUS) {
      return T{std::string_view{reply_->str, reply_->len}};
    } else if (reply_->type == REDIS_REPLY_NIL) {
      return T{std::string_view("NIL")};
    }
  } else if constexpr (std::is_integral_v<T>) {
    if (reply_->type == REDIS_REPLY_INTEGER) {
      return static_cast<T>(reply_->integer);
    }
  } else if constexpr (std::is_floating_point_v<T>) {
    if (reply_->type == REDIS_REPLY_DOUBLE) {
      return static_cast<T>(reply_->dval);
    }
  } else if constexpr (std::is_same_v<T, RESPArray>) {
    // MAP and SET use the same elements/element structure as ARRAY in hiredis
    if (reply_->type == REDIS_REPLY_ARRAY || reply_->type == REDIS_REPLY_MAP ||
        reply_->type == REDIS_REPLY_SET) {
      return RESPArray(reply_);
    }
  } else if constexpr (std::is_same_v<T, RESPObj>) {
    return RESPObj(reply_, false);
  } else if constexpr (std::is_same_v<T, RESPIterator>) {
    return RESPIterator(RESPObj(reply_, false));
  }

  // TODO add other types and errors processing
  return std::nullopt;
}

}  // namespace facade


================================================
FILE: src/facade/resp_parser_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/resp_parser.h"

#include <mimalloc.h>

#include "base/gtest.h"
#include "base/logging.h"

using namespace testing;
using namespace std;
namespace facade {

class RESPParserTest : public testing::Test {
 protected:
  static void SetUpTestSuite() {
    init_zmalloc_threadlocal(mi_heap_get_backing());
  }
};

TEST_F(RESPParserTest, BaseRespTypesTest) {
  using Fields = std::map<std::string, std::string>;
  using Docs = std::map<std::string, Fields>;

  std::string msg1 =
      "*17\r\n:8\r\n$2\r\ns0\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "0\r\n$2\r\ns3\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "3\r\n$2\r\ns7\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "7\r\n$2\r\ns8\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "8\r\n$2\r\ns4\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "4\r\n$2\r\ns9\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 9\r\n";

  std::string msg2 =
      "$2\r\ns1\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "1\r\n$2\r\ns5\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 5\r\n";

  RESPParser reader;
  auto reply = reader.Feed(msg1.c_str(), msg1.size());
  ASSERT_TRUE(reply->Empty());

  reply = reader.Feed(msg2.c_str(), msg2.size());
  ASSERT_FALSE(reply->Empty());

  EXPECT_EQ(reply->GetType(), RESPObj::Type::ARRAY);
  auto array = *reply->As<RESPArray>();
  EXPECT_GE(array.Size(), 1);
  EXPECT_EQ(array[0].GetType(), RESPObj::Type::INTEGER);

  Docs search_results;
  for (size_t i = 1; i < array.Size(); i += 2) {
    auto& fields = search_results[*array[i].As<std::string>()];

    auto field_array = *array[i + 1].As<RESPArray>();

    for (size_t j = 0; j < field_array.Size(); j += 2) {
      std::string field_name = *field_array[j].As<std::string>();
      std::string field_value = *field_array[j + 1].As<std::string>();

      fields[field_name] = field_value;
    }
  }

  EXPECT_EQ(search_results.size(), 8);

  EXPECT_EQ(search_results["s0"]["title"], "test 0");
  EXPECT_EQ(search_results["s1"]["title"], "test 1");
  EXPECT_EQ(search_results["s3"]["title"], "test 3");
  EXPECT_EQ(search_results["s4"]["title"], "test 4");
  EXPECT_EQ(search_results["s5"]["title"], "test 5");
  EXPECT_EQ(search_results["s7"]["title"], "test 7");
  EXPECT_EQ(search_results["s8"]["title"], "test 8");
  EXPECT_EQ(search_results["s9"]["title"], "test 9");
}

TEST_F(RESPParserTest, RESPIteratorTest) {
  using Fields = std::map<std::string, std::string>;
  using Docs = std::map<std::string, Fields>;

  std::string msg1 =
      "*17\r\n:8\r\n$2\r\ns0\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "0\r\n$2\r\ns3\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "3\r\n$2\r\ns7\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "7\r\n$2\r\ns8\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "8\r\n$2\r\ns4\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "4\r\n$2\r\ns9\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 9\r\n";

  std::string msg2 =
      "$2\r\ns1\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "1\r\n$2\r\ns5\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 5\r\n";

  RESPParser reader;
  auto reply = reader.Feed(msg1.c_str(), msg1.size());
  ASSERT_TRUE(reply->Empty());

  reply = reader.Feed(msg2.c_str(), msg2.size());
  ASSERT_FALSE(reply->Empty());

  RESPIterator it(*reply);
  EXPECT_EQ(it.Next<size_t>(), 8);

  Docs search_results;
  while (it.HasNext()) {
    auto [doc_id, field_it] = it.Next<std::string, RESPIterator>();
    auto& fields = search_results[std::move(doc_id)];

    while (field_it.HasNext()) {
      auto [field_name, field_value] = field_it.Next<std::string_view, std::string_view>();
      fields.emplace(field_name, field_value);
    }
  }

  EXPECT_EQ(search_results.size(), 8);

  EXPECT_EQ(search_results["s0"]["title"], "test 0");
  EXPECT_EQ(search_results["s1"]["title"], "test 1");
  EXPECT_EQ(search_results["s3"]["title"], "test 3");
  EXPECT_EQ(search_results["s4"]["title"], "test 4");
  EXPECT_EQ(search_results["s5"]["title"], "test 5");
  EXPECT_EQ(search_results["s7"]["title"], "test 7");
  EXPECT_EQ(search_results["s8"]["title"], "test 8");
  EXPECT_EQ(search_results["s9"]["title"], "test 9");
}

}  // namespace facade


================================================
FILE: src/facade/resp_srv_parser.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/resp_srv_parser.h"

#include <absl/strings/escaping.h>
#include <absl/strings/numbers.h>

#include "base/logging.h"
#include "common/backed_args.h"
#include "common/heap_size.h"

namespace facade {

using namespace std;

auto RespSrvParser::Parse(Buffer str, uint32_t* consumed, cmn::BackedArguments* args) -> Result {
  DCHECK(!str.empty());
  *consumed = 0;
  DVLOG(2) << "Parsing: "
           << absl::CHexEscape(string_view{reinterpret_cast<const char*>(str.data()), str.size()});

  if (state_ == CMD_COMPLETE_S) {
    args->clear();
    buf_stash_.clear();

    if (str[0] == '*') {
      // We recognized a non-INLINE state, starting with '*'
      str.remove_prefix(1);
      *consumed += 1;
      state_ = ARRAY_LEN_S;
      if (str.empty())
        return INPUT_PENDING;
    } else {  // INLINE mode, aka PING\n
      state_ = INLINE_S;
    }
  }

  ResultConsumed resultc{OK, 0};
  do {
    switch (state_) {
      case ARRAY_LEN_S:
        resultc = ConsumeArrayLen(str, args);
        break;
      case PARSE_ARG_TYPE:
        if (str[0] != '$')  // server side only supports bulk strings.
          return BAD_BULKLEN;
        resultc.second = 1;
        state_ = PARSE_ARG_S;
        break;
      case PARSE_ARG_S:
        resultc = ParseArg(str, args);
        break;
      case INLINE_S:
        resultc = ParseInline(str, args);
        break;
      case BULK_STR_S:
        resultc = ConsumeBulk(str, args);
        break;
      case SLASH_N_S:
        if (str[0] != '\n') {
          resultc.first = BAD_STRING;
        } else {
          resultc = {OK, 1};
          HandleFinishArg();
        }
        break;
      default:
        LOG(FATAL) << "Unexpected state " << int(state_);
    }

    *consumed += resultc.second;
    str.remove_prefix(exchange(resultc.second, 0));
  } while (state_ != CMD_COMPLETE_S && resultc.first == OK && !str.empty());

  if (state_ != CMD_COMPLETE_S) {
    if (resultc.first == OK) {
      resultc.first = INPUT_PENDING;
    }

    if (resultc.first == INPUT_PENDING) {
      if (!str.empty()) {
        LOG(DFATAL) << "Did not consume all input: "
                    << absl::CHexEscape({reinterpret_cast<const char*>(str.data()), str.size()})
                    << ", state: " << int(state_) << " smallbuf: "
                    << absl::CHexEscape(
                           {reinterpret_cast<const char*>(small_buf_.data()), small_len_});
      }
    }
    return resultc.first;
  }

  return resultc.first;
}

auto RespSrvParser::ParseInline(Buffer str, cmn::BackedArguments* args) -> ResultConsumed {
  DCHECK(!str.empty());

  const uint8_t* ptr = str.begin();
  const uint8_t* end = str.end();
  const uint8_t* token_start = ptr;

  auto find_token_end = [](const uint8_t* ptr, const uint8_t* end) {
    while (ptr != end && *ptr > 32)
      ++ptr;
    return ptr;
  };

  if (!buf_stash_.empty()) {
    ptr = find_token_end(ptr, end);
    size_t len = ptr - token_start;

    buf_stash_.append(reinterpret_cast<const char*>(token_start), len);
    if (ptr == end) {
      return {INPUT_PENDING, ptr - token_start};
    }

    args->PushArg(buf_stash_);
    buf_stash_.clear();
  }

  while (ptr != end) {
    // For inline input we only require \n.
    if (*ptr == '\n') {
      if (args->empty()) {
        ++ptr;
        continue;  // skip empty line
      }
      break;
    }

    if (*ptr <= 32) {  // skip ws/control chars
      ++ptr;
      continue;
    }

    // token start
    DCHECK(buf_stash_.empty());

    token_start = ptr;
    ptr = find_token_end(ptr, end);
    if (ptr != end) {
      args->PushArg(
          string_view{reinterpret_cast<const char*>(token_start), size_t(ptr - token_start)});
    }
  }

  uint32_t last_consumed = ptr - str.data();
  if (ptr == end) {                       // we have not finished parsing.
    bool is_broken_token = ptr[-1] > 32;  // we stopped in the middle of the token.
    if (is_broken_token) {
      DCHECK(buf_stash_.empty());
      buf_stash_.append(reinterpret_cast<const char*>(token_start), size_t(ptr - token_start));
    } else if (args->empty()) {
      state_ = CMD_COMPLETE_S;  // have not found anything besides whitespace.
    }
    return {INPUT_PENDING, last_consumed};
  }

  DCHECK_EQ('\n', *ptr);

  ++last_consumed;  // consume \n as well.
  state_ = CMD_COMPLETE_S;

  return {OK, last_consumed};
}

// Parse lines like:'$5\r\n' or '*2\r\n'. The first character is already consumed by the caller.
auto RespSrvParser::ParseLen(Buffer str, int64_t* res) -> ResultConsumed {
  DCHECK(!str.empty());

  const char* s = reinterpret_cast<const char*>(str.data());
  const char* pos = reinterpret_cast<const char*>(memchr(s, '\n', str.size()));
  if (!pos) {
    if (str.size() + small_len_ < small_buf_.size()) {
      memcpy(&small_buf_[small_len_], str.data(), str.size());
      small_len_ += str.size();
      return {INPUT_PENDING, str.size()};
    }
    LOG(WARNING) << "Unexpected format " << string_view{s, str.size()};
    return ResultConsumed{BAD_ARRAYLEN, 0};
  }

  unsigned consumed = pos - s + 1;
  if (small_len_ > 0) {
    if (small_len_ + consumed >= small_buf_.size()) {
      return ResultConsumed{BAD_ARRAYLEN, consumed};
    }
    memcpy(&small_buf_[small_len_], str.data(), consumed);
    small_len_ += consumed;
    s = small_buf_.data();
    pos = s + small_len_ - 1;
    small_len_ = 0;
  }

  if (pos[-1] != '\r') {
    return {BAD_ARRAYLEN, consumed};
  }

  // Skip 2 last characters (\r\n).
  string_view len_token{s, size_t(pos - 1 - s)};
  bool success = absl::SimpleAtoi(len_token, res);

  if (success && *res >= -1) {
    return ResultConsumed{OK, consumed};
  }

  LOG(ERROR) << "Failed to parse len " << absl::CHexEscape(len_token) << " "
             << absl::CHexEscape(string_view{reinterpret_cast<const char*>(str.data()), str.size()})
             << " " << consumed << " " << int(s == small_buf_.data());
  return ResultConsumed{BAD_ARRAYLEN, consumed};
}

auto RespSrvParser::ConsumeArrayLen(Buffer str, cmn::BackedArguments* args) -> ResultConsumed {
  int64_t len;

  ResultConsumed res = ParseLen(str, &len);
  if (res.first != OK) {
    return res;
  }

  if (len <= 0) {
    return {BAD_ARRAYLEN, res.second};
  }

  if (len > max_arr_len_) {
    LOG(WARNING) << "Multibulk len is too large " << len;

    return {BAD_ARRAYLEN, res.second};
  }

  state_ = PARSE_ARG_TYPE;
  arg_len_ = len;
  args->Reserve(len, 0);
  return {OK, res.second};
}

auto RespSrvParser::ParseArg(Buffer str, cmn::BackedArguments* args) -> ResultConsumed {
  DCHECK(!str.empty());

  int64_t len;

  ResultConsumed res = ParseLen(str, &len);
  if (res.first != OK) {
    return res;
  }

  if (len > 0 && static_cast<uint64_t>(len) > max_bulk_len_) {
    LOG_EVERY_T(WARNING, 1) << "Threshold reached with bulk len: " << len
                            << ", consider increasing max_bulk_len";
    return {BAD_BULKLEN, res.second};
  }

  if (len < 0) {
    return {BAD_BULKLEN, res.second};
  }

  bulk_len_ = len;
  state_ = BULK_STR_S;
  args->PushArg(size_t(len));

  return {OK, res.second};
}

auto RespSrvParser::ConsumeBulk(Buffer str, cmn::BackedArguments* args) -> ResultConsumed {
  DCHECK_EQ(small_len_, 0);
  uint32_t consumed = 0;

  if (str.size() >= bulk_len_) {
    consumed = bulk_len_;
    if (bulk_len_) {
      char* last_arg = args->data(args->size() - 1);  // Get pointer to last argument.
      DCHECK_GE(args->elem_len(args->size() - 1), bulk_len_);
      char* start = last_arg + (args->elem_len(args->size() - 1) - bulk_len_);
      memcpy(start, str.data(), bulk_len_);
      str.remove_prefix(exchange(bulk_len_, 0));
    }

    if (str.size() >= 2) {
      if (str[0] != '\r' || str[1] != '\n') {
        return {BAD_STRING, consumed};
      }
      HandleFinishArg();
      return {OK, consumed + 2};
    }

    if (str.size() == 1) {
      if (str[0] != '\r') {
        return {BAD_STRING, consumed};
      }
      state_ = SLASH_N_S;
      consumed++;
    }
    return {INPUT_PENDING, consumed};
  }

  DCHECK(bulk_len_);
  DCHECK_GE(args->elem_len(args->size() - 1), bulk_len_);
  size_t len = std::min<size_t>(str.size(), bulk_len_);
  char* last_arg = args->data(args->size() - 1);  // Get pointer to last argument.
  char* start = last_arg + (args->elem_len(args->size() - 1) - bulk_len_);
  memcpy(start, str.data(), len);
  consumed = len;
  bulk_len_ -= len;

  return {INPUT_PENDING, consumed};
}

void RespSrvParser::HandleFinishArg() {
  state_ = (--arg_len_ == 0) ? CMD_COMPLETE_S : PARSE_ARG_TYPE;

  small_len_ = 0;
}

size_t RespSrvParser::UsedMemory() const {
  return cmn::HeapSize(buf_stash_);
}

}  // namespace facade


================================================
FILE: src/facade/resp_srv_parser.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/types/span.h>

#include <memory>
#include <utility>
#include <vector>

#include "common/backed_args.h"

namespace facade {

/**
 * @brief RESP server-side parser.
 */
class RespSrvParser {
 public:
  enum Result : uint8_t {
    OK,
    INPUT_PENDING,
    BAD_ARRAYLEN,
    BAD_BULKLEN,
    BAD_STRING,
  };
  using Buffer = absl::Span<const uint8_t>;
  explicit RespSrvParser(uint32_t max_arr_len = UINT32_MAX, uint32_t max_bulk_len = UINT32_MAX)
      : max_arr_len_(max_arr_len), max_bulk_len_(max_bulk_len) {
  }

  /**
   * @brief Parses str into res. "consumed" stores number of bytes consumed from str.
   *
   * A caller should not invalidate str if the parser returns RESP_OK as long as he continues
   * accessing res. However, if parser returns INPUT_PENDING a caller may discard consumed
   * part of str because parser caches the intermediate state internally according to 'consumed'
   * result.
   */

  Result Parse(Buffer str, uint32_t* consumed, cmn::BackedArguments* dest);

  size_t parselen_hint() const {
    return bulk_len_;
  }

  size_t UsedMemory() const;

 private:
  using ResultConsumed = std::pair<Result, uint32_t>;

  // Skips the first character (*).
  ResultConsumed ConsumeArrayLen(Buffer str, cmn::BackedArguments* args);
  ResultConsumed ParseArg(Buffer str, cmn::BackedArguments* args);
  ResultConsumed ConsumeBulk(Buffer str, cmn::BackedArguments* args);
  ResultConsumed ParseInline(Buffer str, cmn::BackedArguments* args);
  ResultConsumed ParseLen(Buffer str, int64_t* res);

  void HandleFinishArg();

  enum State : uint8_t {
    INLINE_S,
    ARRAY_LEN_S,
    PARSE_ARG_TYPE,
    PARSE_ARG_S,  // Parse string\r\n
    BULK_STR_S,
    SLASH_N_S,
    CMD_COMPLETE_S,
  };

  State state_ = CMD_COMPLETE_S;
  uint8_t small_len_ = 0;

  uint32_t bulk_len_ = 0, arg_len_ = 0;
  uint32_t max_arr_len_;
  uint32_t max_bulk_len_;

  std::string buf_stash_;
  std::array<char, 32> small_buf_;
};

}  // namespace facade


================================================
FILE: src/facade/resp_srv_parser_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/resp_srv_parser.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>

#include "base/gtest.h"
#include "base/logging.h"

using namespace testing;
using namespace std;
namespace facade {

// Custom printer for RespSrvParser::Result to make test output more readable
void PrintTo(const RespSrvParser::Result& result, std::ostream* os) {
  switch (result) {
    case RespSrvParser::OK:
      *os << "OK";
      break;
    case RespSrvParser::INPUT_PENDING:
      *os << "INPUT_PENDING";
      break;
    case RespSrvParser::BAD_ARRAYLEN:
      *os << "BAD_ARRAYLEN";
      break;
    case RespSrvParser::BAD_BULKLEN:
      *os << "BAD_BULKLEN";
      break;
    case RespSrvParser::BAD_STRING:
      *os << "BAD_STRING";
      break;
    default:
      *os << "UNKNOWN(" << static_cast<int>(result) << ")";
      break;
  }
}

class RespSrvParserTest : public testing::Test {
 protected:
  RespSrvParser::Result Parse(std::string_view str);

  RespSrvParser parser_;
  cmn::BackedArguments args_;
  uint32_t consumed_;
};

RespSrvParser::Result RespSrvParserTest::Parse(std::string_view str) {
  RespSrvParser::Buffer buf{reinterpret_cast<const uint8_t*>(str.data()), str.size()};
  return parser_.Parse(buf, &consumed_, &args_);
}

TEST_F(RespSrvParserTest, Inline) {
  const char kCmd1[] = "KEY   VAL\r\n";

  ASSERT_EQ(RespSrvParser::OK, Parse(kCmd1));
  EXPECT_EQ(strlen(kCmd1), consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "VAL"));

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("KEY"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(" FOO "));
  EXPECT_EQ(5, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(" BAR"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse(" \r\n "));
  EXPECT_EQ(3, consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "FOO", "BAR"));

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(" 1 2"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(" 45"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("1", "2", "45"));

  // Empty queries return INPUT_PENDING.
  EXPECT_EQ(RespSrvParser::INPUT_PENDING, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);

  ASSERT_EQ(RespSrvParser::OK, Parse("_\r\n"));
  EXPECT_THAT(args_, ElementsAre("_"));
}

TEST_F(RespSrvParserTest, Multi1) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("*1\r\n"));
  EXPECT_EQ(4, consumed_);
  EXPECT_EQ(0, parser_.parselen_hint());

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("$4\r\n"));
  EXPECT_EQ(4, consumed_);
  EXPECT_EQ(4, parser_.parselen_hint());

  ASSERT_EQ(RespSrvParser::OK, Parse("PING\r\n"));
  EXPECT_EQ(6, consumed_);
  EXPECT_EQ(0, parser_.parselen_hint());
  EXPECT_THAT(args_, ElementsAre("PING"));
}

TEST_F(RespSrvParserTest, Multi2) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("*1\r\n$"));
  EXPECT_EQ(5, consumed_);

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("4\r\nMSET"));
  EXPECT_EQ(7, consumed_);

  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n*2\r\n"));
  EXPECT_EQ(2, consumed_);

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("*2\r\n$3\r\nKEY\r\n$3\r\nVAL"));
  EXPECT_EQ(20, consumed_);

  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n"));
  EXPECT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("KEY", "VAL"));
}

TEST_F(RespSrvParserTest, Multi3) {
  const char kFirst[] = "*3\r\n$3\r\nSET\r\n$16\r\nkey:";
  const char kSecond[] = "000002273458\r\n$3\r\nVXK";
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(kFirst));
  ASSERT_EQ(strlen(kFirst), consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(kSecond));
  ASSERT_EQ(strlen(kSecond), consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n*3\r\n$3\r\nSET"));
  ASSERT_EQ(2, consumed_);
  EXPECT_THAT(args_, ElementsAre("SET", "key:000002273458", "VXK"));
}

TEST_F(RespSrvParserTest, InvalidMult1) {
  ASSERT_EQ(RespSrvParser::BAD_BULKLEN, Parse("*2\r\n$3\r\nFOO\r\nBAR\r\n"));
}

TEST_F(RespSrvParserTest, Empty) {
  ASSERT_EQ(RespSrvParser::OK, Parse("*2\r\n$0\r\n\r\n$0\r\n\r\n"));
}

TEST_F(RespSrvParserTest, LargeBulk) {
  string_view prefix("*1\r\n$1024\r\n");

  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(prefix));
  ASSERT_EQ(prefix.size(), consumed_);
  ASSERT_GE(parser_.parselen_hint(), 1024);

  string half(512, 'a');
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(512, consumed_);
  ASSERT_GE(parser_.parselen_hint(), 512);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(512, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("\r"));
  ASSERT_EQ(1, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("\n"));
  EXPECT_EQ(1, consumed_);

  string part1 = absl::StrCat(prefix, half);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(part1));
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(half));
  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n"));

  prefix = "*1\r\n$27000000\r\n";
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(prefix));
  ASSERT_EQ(prefix.size(), consumed_);
  string chunk(1000000, 'a');
  for (unsigned i = 0; i < 27; ++i) {
    ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse(chunk));
    ASSERT_EQ(chunk.size(), consumed_);
  }
  ASSERT_EQ(RespSrvParser::OK, Parse("\r\n"));
  ASSERT_EQ(args_.size(), 1);
  EXPECT_EQ(27000000u, args_[0].size());
}

TEST_F(RespSrvParserTest, Eol) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("*1\r"));
  EXPECT_EQ(3, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("\n$5\r\n"));
  EXPECT_EQ(5, consumed_);
}

TEST_F(RespSrvParserTest, BulkSplit) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("*1\r\n$4\r\nSADD\r"));
  ASSERT_EQ(13, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("\n"));
}

TEST_F(RespSrvParserTest, InlineSplit) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("\nPING\n\n"));
  EXPECT_EQ(6, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("\n"));
  EXPECT_EQ(1, consumed_);
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("P"));
  ASSERT_EQ(RespSrvParser::OK, Parse("ING\n"));
}

TEST_F(RespSrvParserTest, InlineReset) {
  ASSERT_EQ(RespSrvParser::INPUT_PENDING, Parse("\t \r\n"));
  EXPECT_EQ(4, consumed_);
  ASSERT_EQ(RespSrvParser::OK, Parse("*1\r\n$3\r\nfoo\r\n"));
  EXPECT_EQ(13, consumed_);
}

}  // namespace facade


================================================
FILE: src/facade/resp_validator.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/escaping.h>

#include <cstdint>
#include <fstream>
#include <iostream>

#include "base/flags.h"
#include "base/init.h"
#include "facade/redis_parser.h"
#include "io/io.h"

using namespace facade;
using namespace std;

ABSL_FLAG(string, input, "", "If not empty - reads data from the file instead of stdin. ");

// Validates RESP3 server responses by using RespParser.
// Server traffic can be recorded using:
// tcpflow  -i any port 6379 -o /tmp/tcp_flow
int main(int argc, char* argv[]) {
  MainInitGuard guard(&argc, &argv);

  RedisParser parser(RedisParser::Mode::CLIENT);
  RedisParser::Result parse_result = RedisParser::OK;
  char buf[1024];
  istream* input_stream = &cin;
  if (!absl::GetFlag(FLAGS_input).empty()) {
    input_stream = new ifstream(absl::GetFlag(FLAGS_input), ios::binary);
    if (!input_stream->good()) {
      cerr << "Failed to open input file: " << absl::GetFlag(FLAGS_input) << "\n";
      return -1;
    }
  }
  size_t len = 0, offset = 0;
  do {
    input_stream->read(buf + len, sizeof(buf) - len);
    size_t read = input_stream->gcount();
    if (read == 0) {
      if (parse_result != RedisParser::OK) {
        cerr << "unexpected: " << parse_result << "\n";
      }
      break;
    }
    DVLOG(1) << "Read " << read << " bytes from input stream, offset: " << offset;
    len += read;

    RespExpr::Vec args;
    uint32_t consumed = 0;
    char* next = buf;
    while (len) {
      string_view sv{next, len};
      parse_result = parser.Parse(io::Buffer(sv), &consumed, &args);
      if (parse_result != RedisParser::OK && parse_result != RedisParser::INPUT_PENDING) {
        cerr << "Parse error: " << int(parse_result) << " at offset " << offset
             << " when parsing: " << absl::CHexEscape({reinterpret_cast<const char*>(next), len})
             << "\n";
        return -1;
      }

      if (consumed == 0) {  // not enough data to parse.
        DVLOG(1) << "No data consumed, waiting for more input.";
        memcpy(buf, next, len);  // move the remaining data to the start of the buffer.
        break;
      }
      len -= consumed;
      next += consumed;
      offset += consumed;
    }
  } while (!input_stream->eof());

  if (input_stream != &cin) {
    delete input_stream;
  }
  cout << "LGTM\n";
  return 0;
}


================================================
FILE: src/facade/service_interface.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "facade/service_interface.h"

#include <absl/strings/str_cat.h>

#include "facade/facade_types.h"

namespace facade {

std::string ServiceInterface::ContextInfo::Format() const {
  char buf[16] = {0};
  std::string res = absl::StrCat("db=", db_index);

  unsigned index = 0;

  if (async_dispatch)
    buf[index++] = 'a';

  if (conn_closing)
    buf[index++] = 't';

  if (subscribers)
    buf[index++] = 'P';

  if (blocked)
    buf[index++] = 'b';

  if (index)
    absl::StrAppend(&res, " flags=", buf);
  return res;
}

DispatchResult ServiceInterface::DispatchCommandSimple(ParsedCommand* cmd, AsyncPreference mode) {
  return DispatchCommand(ParsedArgs{*cmd}, cmd, mode);
}

}  // namespace facade


================================================
FILE: src/facade/service_interface.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>

#include "facade/facade_types.h"
#include "facade/parsed_command.h"
#include "util/fiber_socket_base.h"

namespace util {
class HttpListenerBase;
}  // namespace util

namespace facade {

class ConnectionContext;
class Connection;
class SinkReplyBuilder;
class MCReplyBuilder;

// Controls asynchronicity of command dispatch
enum class AsyncPreference : uint8_t {
  ONLY_SYNC,     // Caller supports only synchronous dispatch
  PREFER_ASYNC,  // Prefer async if available
  ONLY_ASYNC,    // Only async execution is possible (command is dispatched in pipeline)
};

enum class DispatchResult : uint8_t {
  OK,
  OOM,
  ERROR,
  WOULD_BLOCK  // Returned if ONLY_ASYNC was set, but only synchronous execution is possible
};

struct DispatchManyResult {
  uint32_t processed;  // how many commands out of passed were actually processed

  // whether to account the processed commands in stats. This is needed to consistently
  // account commands that were included based on squash_stats_latency_lower_limit filter.
  bool account_in_stats;
};

class ServiceInterface {
 public:
  virtual ~ServiceInterface() {
  }

  virtual DispatchResult DispatchCommand(ParsedArgs args, ParsedCommand* cmd, AsyncPreference) = 0;
  DispatchResult DispatchCommandSimple(ParsedCommand* cmd, AsyncPreference mode);

  virtual DispatchManyResult DispatchManyCommands(std::function<ParsedArgs()> arg_gen,
                                                  unsigned count, SinkReplyBuilder* builder,
                                                  ConnectionContext* cntx) = 0;

  virtual DispatchResult DispatchMC(ParsedCommand* cmd, AsyncPreference) = 0;

  virtual ConnectionContext* CreateContext(Connection* owner) = 0;

  virtual ParsedCommand* AllocateParsedCommand() = 0;

  virtual void ConfigureHttpHandlers(util::HttpListenerBase* base, bool is_privileged) {
  }

  virtual void OnConnectionClose(ConnectionContext* cntx) {
  }

  struct ContextInfo {
    std::string Format() const;

    unsigned db_index;
    bool async_dispatch, conn_closing, subscribers, blocked;
  };

  virtual ContextInfo GetContextInfo(ConnectionContext* cntx) const {
    return {};
  }
};

}  // namespace facade


================================================
FILE: src/facade/socket_utils.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "socket_utils.h"

#include <arpa/inet.h>
#include <sys/socket.h>

#ifdef __linux__
#include <sys/stat.h>
#include <unistd.h>

#include "absl/strings/str_cat.h"
#include "io/proc_reader.h"

#endif

namespace {

int get_socket_family(int fd) {
  struct sockaddr_storage ss;
  socklen_t len = sizeof(ss);

  if (getsockname(fd, (struct sockaddr*)&ss, &len) == -1) {
    return -1;  // Indicate an error
  }

  return ss.ss_family;
}

}  // namespace

namespace dfly {

// Returns information about the TCP socket state by its descriptor
std::string GetSocketInfo(int socket_fd) {
  if (socket_fd < 0)
    return "invalid socket";

#ifdef __linux__
  struct stat sock_stat;
  if (fstat(socket_fd, &sock_stat) != 0) {
    return "could not stat socket";
  }

  io::Result<io::TcpInfo> tcp_info;
  int family = get_socket_family(socket_fd);
  if (family == AF_INET) {
    tcp_info = io::ReadTcpInfo(sock_stat.st_ino);
  } else if (family == AF_INET6) {
    tcp_info = io::ReadTcp6Info(sock_stat.st_ino);
  } else {
    return "unsupported socket family";
  }

  if (!tcp_info) {
    return "socket not found in /proc/net/tcp or /proc/net/tcp6";
  }

  std::string state_str = io::TcpStateToString(tcp_info->state);

  if (tcp_info->is_ipv6) {
    char local_ip[INET6_ADDRSTRLEN], remote_ip[INET6_ADDRSTRLEN];
    inet_ntop(AF_INET6, &tcp_info->local_addr6, local_ip, sizeof(local_ip));
    inet_ntop(AF_INET6, &tcp_info->remote_addr6, remote_ip, sizeof(remote_ip));
    return absl::StrCat("State: ", state_str, ", Local: [", local_ip, "]:", tcp_info->local_port,
                        ", Remote: [", remote_ip, "]:", tcp_info->remote_port,
                        ", Inode: ", tcp_info->inode);
  } else {
    char local_ip[INET_ADDRSTRLEN], remote_ip[INET_ADDRSTRLEN];
    struct in_addr addr;
    addr.s_addr = htonl(tcp_info->local_addr);
    inet_ntop(AF_INET, &addr, local_ip, sizeof(local_ip));
    addr.s_addr = htonl(tcp_info->remote_addr);
    inet_ntop(AF_INET, &addr, remote_ip, sizeof(remote_ip));
    return absl::StrCat("State: ", state_str, ", Local: ", local_ip, ":", tcp_info->local_port,
                        ", Remote: ", remote_ip, ":", tcp_info->remote_port,
                        ", Inode: ", tcp_info->inode);
  }
#else
  return "socket info not available on this platform";
#endif
}

}  // namespace dfly


================================================
FILE: src/facade/socket_utils.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>

namespace dfly {

// Returns information about the TCP socket state by its descriptor
std::string GetSocketInfo(int socket_fd);

}  // namespace dfly


================================================
FILE: src/facade/tls_helpers.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "tls_helpers.h"

#include <openssl/err.h>

#ifdef DFLY_USE_SSL
#include <openssl/ssl.h>
#endif

#include <absl/functional/bind_front.h>

#include <string>

#include "base/flags.h"
#include "base/logging.h"
#include "facade/facade_stats.h"
#include "facade/facade_types.h"

ABSL_FLAG(std::string, tls_cert_file, "", "cert file for tls connections");
ABSL_FLAG(std::string, tls_key_file, "", "key file for tls connections");
ABSL_FLAG(std::string, tls_ca_cert_file, "", "ca signed certificate to validate tls connections");
ABSL_FLAG(std::string, tls_ca_cert_dir, "",
          "ca signed certificates directory. Use c_rehash before, read description in "
          "https://www.openssl.org/docs/man3.0/man1/c_rehash.html");
ABSL_FLAG(std::string, tls_ciphers, "DEFAULT:!MEDIUM", "TLS ciphers configuration for tls1.2");
ABSL_FLAG(std::string, tls_cipher_suites, "", "TLS ciphers configuration for tls1.3");
ABSL_FLAG(bool, tls_prefer_server_ciphers, false,
          "If true, prefer server ciphers over client ciphers");
ABSL_FLAG(bool, tls_session_caching, false, "If true enables session caching and tickets");
ABSL_FLAG(size_t, tls_session_cache_size, 20 * 1024, "Size of the cache for tls sessions");
ABSL_FLAG(size_t, tls_session_cache_timeout, 300, "Timeout for each session/ticket");

namespace facade {

#ifdef DFLY_USE_SSL

// Creates the TLS context. Returns nullptr if the TLS configuration is invalid.
// To connect: openssl s_client -state -crlf -connect 127.0.0.1:6380
SSL_CTX* CreateSslCntx(TlsContextRole role) {
  using absl::GetFlag;
  const auto& tls_key_file = GetFlag(FLAGS_tls_key_file);
  if (tls_key_file.empty()) {
    LOG(ERROR) << "To use TLS, a server certificate must be provided with the --tls_key_file flag!";
    return nullptr;
  }

  SSL_CTX* ctx;

  if (role == TlsContextRole::SERVER) {
    ctx = SSL_CTX_new(TLS_server_method());
  } else {
    ctx = SSL_CTX_new(TLS_client_method());
  }
  unsigned mask = SSL_VERIFY_NONE;

  if (SSL_CTX_use_PrivateKey_file(ctx, tls_key_file.c_str(), SSL_FILETYPE_PEM) != 1) {
    LOG(ERROR) << "Failed to load TLS key";
    return nullptr;
  }
  const auto& tls_cert_file = GetFlag(FLAGS_tls_cert_file);

  if (!tls_cert_file.empty()) {
    // TO connect with redis-cli you need both tls-key-file and tls-cert-file
    // loaded. Use `redis-cli --tls -p 6380 --insecure  PING` to test
    if (SSL_CTX_use_certificate_chain_file(ctx, tls_cert_file.c_str()) != 1) {
      LOG(ERROR) << "Failed to load TLS certificate";
      return nullptr;
    }
  }

  const auto tls_ca_cert_file = GetFlag(FLAGS_tls_ca_cert_file);
  const auto tls_ca_cert_dir = GetFlag(FLAGS_tls_ca_cert_dir);
  if (!tls_ca_cert_file.empty() || !tls_ca_cert_dir.empty()) {
    const auto* file = tls_ca_cert_file.empty() ? nullptr : tls_ca_cert_file.data();
    const auto* dir = tls_ca_cert_dir.empty() ? nullptr : tls_ca_cert_dir.data();
    if (SSL_CTX_load_verify_locations(ctx, file, dir) != 1) {
      LOG(ERROR) << "Failed to load TLS verify locations (CA cert file or CA cert dir)";
      return nullptr;
    }
    mask = SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT;
  }

  if (!GetFlag(FLAGS_tls_ciphers).empty()) {
    DFLY_SSL_CHECK(1 == SSL_CTX_set_cipher_list(ctx, GetFlag(FLAGS_tls_ciphers).c_str()));
  }

  // Relevant only for TLS 1.3 connections.
  if (!GetFlag(FLAGS_tls_cipher_suites).empty()) {
    SSL_CTX_set_ciphersuites(ctx, GetFlag(FLAGS_tls_cipher_suites).c_str());
  }

  SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION);

  SSL_CTX_set_options(ctx, SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS);

  SSL_CTX_set_verify(ctx, mask, NULL);

  DFLY_SSL_CHECK(1 == SSL_CTX_set_dh_auto(ctx, 1));

  if (GetFlag(FLAGS_tls_prefer_server_ciphers)) {
    SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE);
  }

  if (GetFlag(FLAGS_tls_session_caching)) {
    SSL_CTX_set_session_cache_mode(ctx, SSL_SESS_CACHE_SERVER);
    SSL_CTX_sess_set_cache_size(ctx, GetFlag(FLAGS_tls_session_cache_size));
    SSL_CTX_set_timeout(ctx, GetFlag(FLAGS_tls_session_cache_timeout));
    SSL_CTX_set_session_id_context(ctx, (const unsigned char*)"dragonfly", 9);
  }

  SSL_CTX_set_info_callback(ctx, [](const SSL* ssl, int where, int ret) {
    // When we skip the handshake we never reach this state.
    if (where & SSL_CB_HANDSHAKE_START) {
      ++tl_facade_stats->conn_stats.handshakes_started;
    }
    // When we skip the handshake, we never reach this state.
    if (where & SSL_CB_HANDSHAKE_DONE) {
      ++tl_facade_stats->conn_stats.handshakes_completed;
    }
  });

  return ctx;
}

void PrintSSLError() {
  ERR_print_errors_cb(
      [](const char* str, size_t len, void* u) {
        LOG(ERROR) << std::string_view(str, len);
        return 1;
      },
      nullptr);
}

#endif
}  // namespace facade


================================================
FILE: src/facade/tls_helpers.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#ifdef DFLY_USE_SSL
#include <openssl/ssl.h>
#endif

namespace facade {

#ifdef DFLY_USE_SSL
enum class TlsContextRole { SERVER, CLIENT };

SSL_CTX* CreateSslCntx(TlsContextRole role);

void PrintSSLError();

#define DFLY_SSL_CHECK(condition)               \
  if (!(condition)) {                           \
    LOG(ERROR) << "OpenSSL Error: " #condition; \
    PrintSSLError();                            \
    exit(17);                                   \
  }

#endif

}  // namespace facade


================================================
FILE: src/huff/LICENSE
================================================
BSD License

For Zstandard software

Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name Facebook, nor Meta, nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: src/huff/README.md
================================================
The code in this folder exposes internal functions that are used by ZSTD.
These functions are part of https://github.com/Cyan4973/FiniteStateEntropy project.

Since we already link to ZSTD, it is convenient that we get this functionality for free.

================================================
FILE: src/huff/hist.h
================================================
/* ******************************************************************
 * hist : Histogram functions
 * part of Finite State Entropy project
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 *  You can contact the author at :
 *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
 *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
****************************************************************** */

/* --- dependencies --- */
#include <stddef.h>  /* size_t */


/* --- simple histogram functions --- */

/*! HIST_count():
 *  Provides the precise count of each byte within a table 'count'.
 * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
 *  Updates *maxSymbolValuePtr with actual largest symbol value detected.
 * @return : count of the most frequent symbol (which isn't identified).
 *           or an error code, which can be tested using HIST_isError().
 *           note : if return == srcSize, there is only one symbol.
 */
size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
                  const void* src, size_t srcSize);

unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */


/* --- advanced histogram functions --- */

#define HIST_WKSP_SIZE_U32 1024
#define HIST_WKSP_SIZE    (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
/** HIST_count_wksp() :
 *  Same as HIST_count(), but using an externally provided scratch buffer.
 *  Benefit is this function will use very little stack space.
 * `workSpace` is a writable buffer which must be 4-bytes aligned,
 * `workSpaceSize` must be >= HIST_WKSP_SIZE
 */
size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
                       const void* src, size_t srcSize,
                       void* workSpace, size_t workSpaceSize);

/** HIST_countFast() :
 *  same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
 *  This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
 */
size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
                      const void* src, size_t srcSize);

/** HIST_countFast_wksp() :
 *  Same as HIST_countFast(), but using an externally provided scratch buffer.
 * `workSpace` is a writable buffer which must be 4-bytes aligned,
 * `workSpaceSize` must be >= HIST_WKSP_SIZE
 */
size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
                           const void* src, size_t srcSize,
                           void* workSpace, size_t workSpaceSize);

/*! HIST_count_simple() :
 *  Same as HIST_countFast(), this function is unsafe,
 *  and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
 *  It is also a bit slower for large inputs.
 *  However, it does not need any additional memory (not even on stack).
 * @return : count of the most frequent symbol.
 *  Note this function doesn't produce any error (i.e. it must succeed).
 */
unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
                           const void* src, size_t srcSize);

/*! HIST_add() :
 *  Lowest level: just add nb of occurrences of characters from @src into @count.
 *  @count is not reset. @count array is presumed large enough (i.e. 1 KB).
 @  This function does not need any additional stack memory.
 */
void HIST_add(unsigned* count, const void* src, size_t srcSize);


================================================
FILE: src/huff/huf.h
================================================
/* ******************************************************************
 * huff0 huffman codec,
 * part of Finite State Entropy library
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * You can contact the author at :
 * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
****************************************************************** */

#ifndef HUF_H_298734234
#define HUF_H_298734234

/* *** Dependencies *** */
#include <stddef.h>  /* size_t */
#include "mem.h"          /* U32 */

/* ***   Tool functions *** */
#define HUF_BLOCKSIZE_MAX (128 * 1024)   /**< maximum input size for a single block compressed with HUF_compress */
size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */

/* Error Management */
unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */


#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))

/* *** Constants *** */
#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
#define HUF_SYMBOLVALUE_MAX  255

#define HUF_TABLELOG_ABSOLUTEMAX  12  /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
#  error "HUF_TABLELOG_MAX is too large !"
#endif


/* ****************************************
*  Static allocation
******************************************/
/* HUF buffer bounds */
#define HUF_CTABLEBOUND 129
#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic */
#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */

/* static allocation of HUF's Compression Table */
/* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */
typedef size_t HUF_CElt;   /* consider it an incomplete type */
#define HUF_CTABLE_SIZE_ST(maxSymbolValue)   ((maxSymbolValue)+2)   /* Use tables of size_t, for proper alignment */
#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
    HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */

/* static allocation of HUF's DTable */
typedef U32 HUF_DTable;
#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }


/* ****************************************
*  Advanced decompression functions
******************************************/

/**
 * Huffman flags bitset.
 * For all flags, 0 is the default value.
 */
typedef enum {
    /**
     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
     * Otherwise: Ignored.
     */
    HUF_flags_bmi2 = (1 << 0),
    /**
     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
     * If unset: Use heuristic to find the table depth.
     */
    HUF_flags_optimalDepth = (1 << 1),
    /**
     * If set: If the previous table can encode the input, always reuse the previous table.
     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
     */
    HUF_flags_preferRepeat = (1 << 2),
    /**
     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
     * If unset: Always histogram the entire input.
     */
    HUF_flags_suspectUncompressible = (1 << 3),
    /**
     * If set: Don't use assembly implementations
     * If unset: Allow using assembly implementations
     */
    HUF_flags_disableAsm = (1 << 4),
    /**
     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
     * If unset: Use the fast decoding loop when possible.
     */
    HUF_flags_disableFast = (1 << 5)
} HUF_flags_e;


/* ****************************************
 *  HUF detailed API
 * ****************************************/
#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra

/*! HUF_compress() does the following:
 *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
 *  2. (optional) refine tableLog using HUF_optimalTableLog()
 *  3. build Huffman table from count using HUF_buildCTable()
 *  4. save Huffman table to memory buffer using HUF_writeCTable()
 *  5. encode the data stream using HUF_compress4X_usingCTable()
 *
 *  The following API allows targeting specific sub-functions for advanced tasks.
 *  For example, it's possible to compress several blocks using the same 'CTable',
 *  or to save and regenerate 'CTable' using external methods.
 */
unsigned HUF_minTableLog(unsigned symbolCardinality);
unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
 size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);

typedef enum {
   HUF_repeat_none,  /**< Cannot use the previous table */
   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
 } HUF_repeat;

/** HUF_compress4X_repeat() :
 *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
 *  If it uses hufTable it does not modify hufTable or repeat.
 *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
 *  If preferRepeat then the old table will always be used if valid.
 *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned tableLog,
                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);

/** HUF_buildCTable_wksp() :
 *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
 * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
 */
#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
size_t HUF_buildCTable_wksp (HUF_CElt* tree,
                       const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
                             void* workSpace, size_t wkspSize);

/*! HUF_readStats() :
 *  Read compact Huffman tree, saved by HUF_writeCTable().
 * `huffWeight` is destination buffer.
 * @return : size read from `src` , or an error Code .
 *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                     const void* src, size_t srcSize);

/*! HUF_readStats_wksp() :
 * Same as HUF_readStats() but takes an external workspace which must be
 * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
 * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
 */
#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                          U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                          const void* src, size_t srcSize,
                          void* workspace, size_t wkspSize,
                          int flags);

/** HUF_readCTable() :
 *  Loading a CTable saved with HUF_writeCTable() */
size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);

/** HUF_getNbBitsFromCTable() :
 *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
 *  Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0
 *  Note 2 : is not inlined, as HUF_CElt definition is private
 */
U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);

typedef struct {
    BYTE tableLog;
    BYTE maxSymbolValue;
    BYTE unused[sizeof(size_t) - 2];
} HUF_CTableHeader;

/** HUF_readCTableHeader() :
 *  @returns The header from the CTable specifying the tableLog and the maxSymbolValue.
 */
HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable);

/*
 * HUF_decompress() does the following:
 * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
 * 2. build Huffman table from save, using HUF_readDTableX?()
 * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
 */

/** HUF_selectDecoder() :
 *  Tells which decoder is likely to decode faster,
 *  based on a set of pre-computed metrics.
 * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
 *  Assumption : 0 < dstSize <= 128 KB */
U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);

/**
 *  The minimum workspace size for the `workSpace` used in
 *  HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
 *
 *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
 *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
 *  Buffer overflow errors may potentially occur if code modifications result in
 *  a required workspace size greater than that specified in the following
 *  macro.
 */
#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))


/* ====================== */
/* single stream variants */
/* ====================== */

size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
/** HUF_compress1X_repeat() :
 *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
 *  If it uses hufTable it does not modify hufTable or repeat.
 *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
 *  If preferRepeat then the old table will always be used if valid.
 *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned tableLog,
                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);

size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X1
size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /**< double-symbols decoder */
#endif

/* BMI2 variants.
 * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
 */
size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X2
size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#endif
size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X2
size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
#endif
#ifndef HUF_FORCE_DECOMPRESS_X1
size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
#endif

#endif   /* HUF_H_298734234 */


================================================
FILE: src/huff/mem.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#ifndef MEM_H_MODULE
#define MEM_H_MODULE

/*-****************************************
*  Dependencies
******************************************/
#include <stddef.h>  /* size_t, ptrdiff_t */
#include <stdint.h> /* intptr_t */
#define MEM_STATIC

typedef  uint32_t U32;
typedef   uint8_t BYTE;

#endif /* MEM_H_MODULE */


================================================
FILE: src/redis/CMakeLists.txt
================================================
option(REDIS_ZMALLOC_MI "Implement zmalloc layer using mimalloc allocator" ON)

if (REDIS_ZMALLOC_MI)
  set(ZMALLOC_SRC "zmalloc_mi.c")
  set(ZMALLOC_DEPS "TRDP::mimalloc2")
else()
  set(ZMALLOC_SRC "zmalloc.c")
  set(ZMALLOC_DEPS "")
endif()

add_library(redis_lib crc16.c crc64.c crcspeed.c debug.c  intset.c geo.c 
            geohash.c geohash_helper.c hiredis.c read.c
            listpack.c lzf_c.c lzf_d.c sds.c
            rax.c redis_aux.c t_stream.c 
            util.c ziplist.c hyperloglog.c ${ZMALLOC_SRC})

cxx_link(redis_lib  ${ZMALLOC_DEPS})

add_library(redis_test_lib dict.c siphash.c)
cxx_link(redis_test_lib redis_lib)

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
  target_compile_options(redis_lib PRIVATE -Wno-maybe-uninitialized)
endif()

if (REDIS_ZMALLOC_MI)
  target_compile_definitions(redis_lib PUBLIC USE_ZMALLOC_MI)
endif()

add_subdirectory(lua)


================================================
FILE: src/redis/LICENSE.redis
================================================
Copyright (c) 2006-2020, Salvatore Sanfilippo
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: src/redis/config.h
================================================
/*
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __CONFIG_H
#define __CONFIG_H

#ifdef __APPLE__
#include <AvailabilityMacros.h>
#endif

#ifdef __linux__
#include <features.h>
#endif

/* Define redis_fstat to fstat or fstat64() */
#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
#define redis_fstat fstat64
#define redis_stat stat64
#else
#define redis_fstat fstat
#define redis_stat stat
#endif

/* Test for proc filesystem */
#ifdef __linux__
#define HAVE_PROC_STAT 1
#define HAVE_PROC_MAPS 1
#define HAVE_PROC_SMAPS 1
#define HAVE_PROC_SOMAXCONN 1
#define HAVE_PROC_OOM_SCORE_ADJ 1
#endif

/* Test for task_info() */
#if defined(__APPLE__)
#define HAVE_TASKINFO 1
#endif

/* Test for backtrace() */
#if defined(__APPLE__) || (defined(__linux__) && defined(__GLIBC__)) || \
    defined(__FreeBSD__) || ((defined(__OpenBSD__) || defined(__NetBSD__)) && defined(USE_BACKTRACE))\
 || defined(__DragonFly__) || (defined(__UCLIBC__) && defined(__UCLIBC_HAS_BACKTRACE__))
#define HAVE_BACKTRACE 1
#endif

/* MSG_NOSIGNAL. */
#ifdef __linux__
#define HAVE_MSG_NOSIGNAL 1
#endif

/* Test for polling API */
#ifdef __linux__
#define HAVE_EPOLL 1
#endif

#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
#define HAVE_KQUEUE 1
#endif

#ifdef __sun
#include <sys/feature_tests.h>
#ifdef _DTRACE_VERSION
#define HAVE_EVPORT 1
#define HAVE_PSINFO 1
#endif
#endif

/* Define redis_fsync to fdatasync() in Linux and fsync() for all the rest */
#ifdef __linux__
#define redis_fsync fdatasync
#else
#define redis_fsync fsync
#endif

#if __GNUC__ >= 4
#define valkey_unreachable __builtin_unreachable
#else
#define valkey_unreachable abort
#endif
#if __GNUC__ >= 3
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif

/* Define rdb_fsync_range to sync_file_range() on Linux, otherwise we use
 * the plain fsync() call. */
#if (defined(__linux__) && defined(SYNC_FILE_RANGE_WAIT_BEFORE))
#define rdb_fsync_range(fd,off,size) sync_file_range(fd,off,size,SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE)
#else
#define rdb_fsync_range(fd,off,size) fsync(fd)
#endif

/* Check if we can use setproctitle().
 * BSD systems have support for it, we provide an implementation for
 * Linux and osx. */
#if (defined __NetBSD__ || defined __FreeBSD__ || defined __OpenBSD__)
#define USE_SETPROCTITLE
#endif

#if defined(__HAIKU__)
#define ESOCKTNOSUPPORT 0
#endif

#if (defined __linux || defined __APPLE__)
#define USE_SETPROCTITLE
#define INIT_SETPROCTITLE_REPLACEMENT
void spt_init(int argc, char *argv[]);
void setproctitle(const char *fmt, ...);
#endif

/* Byte ordering detection */
#include <sys/types.h> /* This will likely define BYTE_ORDER */

#ifndef BYTE_ORDER
#if (BSD >= 199103)
# include <machine/endian.h>
#else
#if defined(linux) || defined(__linux__)
# include <endian.h>
#else
#define	LITTLE_ENDIAN	1234	/* least-significant byte first (vax, pc) */
#define	BIG_ENDIAN	4321	/* most-significant byte first (IBM, net) */
#define	PDP_ENDIAN	3412	/* LSB first in word, MSW first in long (pdp)*/

#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || \
   defined(vax) || defined(ns32000) || defined(sun386) || \
   defined(MIPSEL) || defined(_MIPSEL) || defined(BIT_ZERO_ON_RIGHT) || \
   defined(__alpha__) || defined(__alpha)
#define BYTE_ORDER    LITTLE_ENDIAN
#endif

#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \
    defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
    defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\
    defined(apollo) || defined(__convex__) || defined(_CRAY) || \
    defined(__hppa) || defined(__hp9000) || \
    defined(__hp9000s300) || defined(__hp9000s700) || \
    defined (BIT_ZERO_ON_LEFT) || defined(m68k) || defined(__sparc)
#define BYTE_ORDER	BIG_ENDIAN
#endif
#endif /* linux */
#endif /* BSD */
#endif /* BYTE_ORDER */

/* Sometimes after including an OS-specific header that defines the
 * endianness we end with __BYTE_ORDER but not with BYTE_ORDER that is what
 * the Redis code uses. In this case let's define everything without the
 * underscores. */
#ifndef BYTE_ORDER
#ifdef __BYTE_ORDER
#if defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN __LITTLE_ENDIAN
#endif
#ifndef BIG_ENDIAN
#define BIG_ENDIAN __BIG_ENDIAN
#endif
#if (__BYTE_ORDER == __LITTLE_ENDIAN)
#define BYTE_ORDER LITTLE_ENDIAN
#else
#define BYTE_ORDER BIG_ENDIAN
#endif
#endif
#endif
#endif

#if !defined(BYTE_ORDER) || \
    (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN)
	/* you must determine what the correct bit order is for
	 * your compiler - the next line is an intentional error
	 * which will force your compiles to bomb until you fix
	 * the above macros.
	 */
#error "Undefined or invalid BYTE_ORDER"
#endif

#if (__i386 || __amd64 || __powerpc__) && __GNUC__
#define GNUC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if defined(__clang__)
#define HAVE_ATOMIC
#endif
#if (defined(__GLIBC__) && defined(__GLIBC_PREREQ))
#if (GNUC_VERSION >= 40100 && __GLIBC_PREREQ(2, 6))
#define HAVE_ATOMIC
#endif
#endif
#endif

/* Make sure we can test for ARM just checking for __arm__, since sometimes
 * __arm is defined but __arm__ is not. */
#if defined(__arm) && !defined(__arm__)
#define __arm__
#endif
#if defined (__aarch64__) && !defined(__arm64__)
#define __arm64__
#endif

/* Make sure we can test for SPARC just checking for __sparc__. */
#if defined(__sparc) && !defined(__sparc__)
#define __sparc__
#endif

#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif

/* Define for redis_set_thread_title */
#ifdef __linux__
#define redis_set_thread_title(name) pthread_setname_np(pthread_self(), name)
#else
#if (defined __FreeBSD__ || defined __OpenBSD__)
#include <pthread_np.h>
#define redis_set_thread_title(name) pthread_set_name_np(pthread_self(), name)
#elif defined __NetBSD__
#include <pthread.h>
#define redis_set_thread_title(name) pthread_setname_np(pthread_self(), "%s", name)
#elif defined __HAIKU__
#include <kernel/OS.h>
#define redis_set_thread_title(name) rename_thread(find_thread(0), name)
#else
#if (defined __APPLE__ && defined(MAC_OS_X_VERSION_10_7))
int pthread_setname_np(const char *name);
#include <pthread.h>
#define redis_set_thread_title(name) pthread_setname_np(name)
#else
#define redis_set_thread_title(name)
#endif
#endif
#endif

/* Check if we can use setcpuaffinity(). */
#if (defined __linux || defined __NetBSD__ || defined __FreeBSD__ || defined __DragonFly__)
#define USE_SETCPUAFFINITY
void setcpuaffinity(const char *cpulist);
#endif

#endif


================================================
FILE: src/redis/crc16.c
================================================


#include "crc16.h"

/*
 * Copyright 2001-2010 Georges Menie (www.menie.org)
 * Copyright 2010-2012 Salvatore Sanfilippo (adapted to Redis coding style)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the University of California, Berkeley nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* CRC16 implementation according to CCITT standards.
 *
 * Note by @antirez: this is actually the XMODEM CRC 16 algorithm, using the
 * following parameters:
 *
 * Name                       : "XMODEM", also known as "ZMODEM", "CRC-16/ACORN"
 * Width                      : 16 bit
 * Poly                       : 1021 (That is actually x^16 + x^12 + x^5 + 1)
 * Initialization             : 0000
 * Reflect Input byte         : False
 * Reflect Output CRC         : False
 * Xor constant to output CRC : 0000
 * Output for "123456789"     : 31C3
 */

static const uint16_t crc16tab[256] = {
    0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b,
    0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
    0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, 0x2462, 0x3443, 0x0420, 0x1401,
    0x64e6, 0x74c7, 0x44a4, 0x5485, 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
    0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, 0xb75b, 0xa77a, 0x9719, 0x8738,
    0xf7df, 0xe7fe, 0xd79d, 0xc7bc, 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
    0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, 0x5af5, 0x4ad4, 0x7ab7, 0x6a96,
    0x1a71, 0x0a50, 0x3a33, 0x2a12, 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
    0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, 0xedae, 0xfd8f, 0xcdec, 0xddcd,
    0xad2a, 0xbd0b, 0x8d68, 0x9d49, 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
    0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, 0x9188, 0x81a9, 0xb1ca, 0xa1eb,
    0xd10c, 0xc12d, 0xf14e, 0xe16f, 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
    0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, 0x02b1, 0x1290, 0x22f3, 0x32d2,
    0x4235, 0x5214, 0x6277, 0x7256, 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
    0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, 0xa7db, 0xb7fa, 0x8799, 0x97b8,
    0xe75f, 0xf77e, 0xc71d, 0xd73c, 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
    0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, 0x5844, 0x4865, 0x7806, 0x6827,
    0x18c0, 0x08e1, 0x3882, 0x28a3, 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
    0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d,
    0xbdaa, 0xad8b, 0x9de8, 0x8dc9, 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
    0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, 0x6e17, 0x7e36, 0x4e55, 0x5e74,
    0x2e93, 0x3eb2, 0x0ed1, 0x1ef0};

uint16_t crc16(const char* buf, int len) {
  int counter;
  uint16_t crc = 0;
  for (counter = 0; counter < len; counter++)
    crc = (crc << 8) ^ crc16tab[((crc >> 8) ^ *buf++) & 0x00FF];
  return crc;
}


================================================
FILE: src/redis/crc16.h
================================================
#ifndef CRC16_H
#define CRC16_H

#include <stdint.h>

uint16_t crc16(const char* buf, int len);

#endif


================================================
FILE: src/redis/crc64.c
================================================
/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
 * Copyright (c) 2020, Amazon Web Services
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. */

#include "crc64.h"
#include "crcspeed.h"
static uint64_t crc64_table[8][256] = {{0}};

#define POLY UINT64_C(0xad93d23594c935a9)
/******************** BEGIN GENERATED PYCRC FUNCTIONS ********************/
/**
 * Generated on Sun Dec 21 14:14:07 2014,
 * by pycrc v0.8.2, https://www.tty1.net/pycrc/
 *
 * LICENSE ON GENERATED CODE:
 * ==========================
 * As of version 0.6, pycrc is released under the terms of the MIT licence.
 * The code generated by pycrc is not considered a substantial portion of the
 * software, therefore the author of pycrc will not claim any copyright on
 * the generated code.
 * ==========================
 *
 * CRC configuration:
 *    Width        = 64
 *    Poly         = 0xad93d23594c935a9
 *    XorIn        = 0xffffffffffffffff
 *    ReflectIn    = True
 *    XorOut       = 0x0000000000000000
 *    ReflectOut   = True
 *    Algorithm    = bit-by-bit-fast
 *
 * Modifications after generation (by matt):
 *   - included finalize step in-line with update for single-call generation
 *   - re-worked some inner variable architectures
 *   - adjusted function parameters to match expected prototypes.
 *****************************************************************************/

/**
 * Reflect all bits of a \a data word of \a data_len bytes.
 *
 * \param data         The data word to be reflected.
 * \param data_len     The width of \a data expressed in number of bits.
 * \return             The reflected data.
 *****************************************************************************/
static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) {
    uint_fast64_t ret = data & 0x01;

    for (size_t i = 1; i < data_len; i++) {
        data >>= 1;
        ret = (ret << 1) | (data & 0x01);
    }

    return ret;
}

/**
 *  Update the crc value with new data.
 *
 * \param crc      The current crc value.
 * \param data     Pointer to a buffer of \a data_len bytes.
 * \param data_len Number of bytes in the \a data buffer.
 * \return         The updated crc value.
 ******************************************************************************/
uint64_t _crc64(uint_fast64_t crc, const void *in_data, const uint64_t len) {
    const uint8_t *data = in_data;
    unsigned long long bit;

    for (uint64_t offset = 0; offset < len; offset++) {
        uint8_t c = data[offset];
        for (uint_fast8_t i = 0x01; i & 0xff; i <<= 1) {
            bit = crc & 0x8000000000000000;
            if (c & i) {
                bit = !bit;
            }

            crc <<= 1;
            if (bit) {
                crc ^= POLY;
            }
        }

        crc &= 0xffffffffffffffff;
    }

    crc = crc & 0xffffffffffffffff;
    return crc_reflect(crc, 64) ^ 0x0000000000000000;
}

/******************** END GENERATED PYCRC FUNCTIONS ********************/

/* Initializes the 16KB lookup tables. */
void crc64_init(void) {
    crcspeed64native_init(_crc64, crc64_table);
}

/* Compute crc64 */
uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
    return crcspeed64native(crc64_table, crc, (void *) s, l);
}

/* Test main */
#ifdef REDIS_TEST
#include <stdio.h>

#define UNUSED(x) (void)(x)
int crc64Test(int argc, char *argv[], int flags) {
    UNUSED(argc);
    UNUSED(argv);
    UNUSED(flags);
    crc64_init();
    printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
           (uint64_t)_crc64(0, "123456789", 9));
    printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n",
           (uint64_t)crc64(0, (unsigned char*)"123456789", 9));
    char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
                "do eiusmod tempor incididunt ut labore et dolore magna "
                "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
                "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
                "aute irure dolor in reprehenderit in voluptate velit esse "
                "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
                "occaecat cupidatat non proident, sunt in culpa qui officia "
                "deserunt mollit anim id est laborum.";
    printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n",
           (uint64_t)_crc64(0, li, sizeof(li)));
    printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n",
           (uint64_t)crc64(0, (unsigned char*)li, sizeof(li)));
    return 0;
}

#endif

#ifdef REDIS_TEST_MAIN
int main(int argc, char *argv[]) {
    return crc64Test(argc, argv);
}

#endif


================================================
FILE: src/redis/crc64.h
================================================
#ifndef CRC64_H
#define CRC64_H

#include <stdint.h>

void crc64_init(void);
uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l);

#ifdef REDIS_TEST
int crc64Test(int argc, char *argv[], int flags);
#endif

#endif


================================================
FILE: src/redis/crcspeed.c
================================================
/*
 * Copyright (C) 2013 Mark Adler
 * Originally by: crc64.c Version 1.4  16 Dec 2013  Mark Adler
 * Modifications by Matt Stancliff <matt@genges.com>:
 *   - removed CRC64-specific behavior
 *   - added generation of lookup tables by parameters
 *   - removed inversion of CRC input/result
 *   - removed automatic initialization in favor of explicit initialization

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
  madler@alumni.caltech.edu
 */

#include "crcspeed.h"

/* Fill in a CRC constants table. */
void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
    uint64_t crc;

    /* generate CRCs for all single byte sequences */
    for (int n = 0; n < 256; n++) {
        unsigned char v = n;
        table[0][n] = crcfn(0, &v, 1);
    }

    /* generate nested CRC table for future slice-by-8 lookup */
    for (int n = 0; n < 256; n++) {
        crc = table[0][n];
        for (int k = 1; k < 8; k++) {
            crc = table[0][crc & 0xff] ^ (crc >> 8);
            table[k][n] = crc;
        }
    }
}

void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
    uint16_t crc;

    /* generate CRCs for all single byte sequences */
    for (int n = 0; n < 256; n++) {
        table[0][n] = crcfn(0, &n, 1);
    }

    /* generate nested CRC table for future slice-by-8 lookup */
    for (int n = 0; n < 256; n++) {
        crc = table[0][n];
        for (int k = 1; k < 8; k++) {
            crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8);
            table[k][n] = crc;
        }
    }
}

/* Reverse the bytes in a 64-bit word. */
static inline uint64_t rev8(uint64_t a) {
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_bswap64(a);
#else
    uint64_t m;

    m = UINT64_C(0xff00ff00ff00ff);
    a = ((a >> 8) & m) | (a & m) << 8;
    m = UINT64_C(0xffff0000ffff);
    a = ((a >> 16) & m) | (a & m) << 16;
    return a >> 32 | a << 32;
#endif
}

/* This function is called once to initialize the CRC table for use on a
   big-endian architecture. */
void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) {
    /* Create the little endian table then reverse all the entries. */
    crcspeed64little_init(fn, big_table);
    for (int k = 0; k < 8; k++) {
        for (int n = 0; n < 256; n++) {
            big_table[k][n] = rev8(big_table[k][n]);
        }
    }
}

void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
    /* Create the little endian table then reverse all the entries. */
    crcspeed16little_init(fn, big_table);
    for (int k = 0; k < 8; k++) {
        for (int n = 0; n < 256; n++) {
            big_table[k][n] = rev8(big_table[k][n]);
        }
    }
}

/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
 * architecture. If you need inverted CRC, invert *before* calling and invert
 * *after* calling.
 * 64 bit crc = process 8 bytes at once;
 */
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
                          void *buf, size_t len) {
    unsigned char *next = buf;

    /* process individual bytes until we reach an 8-byte aligned pointer */
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }

    /* fast middle processing, 8 bytes (aligned!) per loop */
    while (len >= 8) {
        crc ^= *(uint64_t *)next;
        crc = little_table[7][crc & 0xff] ^
              little_table[6][(crc >> 8) & 0xff] ^
              little_table[5][(crc >> 16) & 0xff] ^
              little_table[4][(crc >> 24) & 0xff] ^
              little_table[3][(crc >> 32) & 0xff] ^
              little_table[2][(crc >> 40) & 0xff] ^
              little_table[1][(crc >> 48) & 0xff] ^
              little_table[0][crc >> 56];
        next += 8;
        len -= 8;
    }

    /* process remaining bytes (can't be larger than 8) */
    while (len) {
        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }

    return crc;
}

uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
                          void *buf, size_t len) {
    unsigned char *next = buf;

    /* process individual bytes until we reach an 8-byte aligned pointer */
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
        len--;
    }

    /* fast middle processing, 8 bytes (aligned!) per loop */
    while (len >= 8) {
        uint64_t n = *(uint64_t *)next;
        crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^
              little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
              little_table[5][(n >> 16) & 0xff] ^
              little_table[4][(n >> 24) & 0xff] ^
              little_table[3][(n >> 32) & 0xff] ^
              little_table[2][(n >> 40) & 0xff] ^
              little_table[1][(n >> 48) & 0xff] ^
              little_table[0][n >> 56];
        next += 8;
        len -= 8;
    }

    /* process remaining bytes (can't be larger than 8) */
    while (len) {
        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
        len--;
    }

    return crc;
}

/* Calculate a non-inverted CRC eight bytes at a time on a big-endian
 * architecture.
 */
uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
                       size_t len) {
    unsigned char *next = buf;

    crc = rev8(crc);
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
        len--;
    }

    while (len >= 8) {
        crc ^= *(uint64_t *)next;
        crc = big_table[0][crc & 0xff] ^
              big_table[1][(crc >> 8) & 0xff] ^
              big_table[2][(crc >> 16) & 0xff] ^
              big_table[3][(crc >> 24) & 0xff] ^
              big_table[4][(crc >> 32) & 0xff] ^
              big_table[5][(crc >> 40) & 0xff] ^
              big_table[6][(crc >> 48) & 0xff] ^
              big_table[7][crc >> 56];
        next += 8;
        len -= 8;
    }

    while (len) {
        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
        len--;
    }

    return rev8(crc);
}

/* WARNING: Completely untested on big endian architecture.  Possibly broken. */
uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf,
                       size_t len) {
    unsigned char *next = buf;
    uint64_t crc = crc_in;

    crc = rev8(crc);
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }

    while (len >= 8) {
        uint64_t n = *(uint64_t *)next;
        crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^
              big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
              big_table[2][(n >> 16) & 0xff] ^
              big_table[3][(n >> 24) & 0xff] ^
              big_table[4][(n >> 32) & 0xff] ^
              big_table[5][(n >> 40) & 0xff] ^
              big_table[6][(n >> 48) & 0xff] ^
              big_table[7][n >> 56];
        next += 8;
        len -= 8;
    }

    while (len) {
        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }

    return rev8(crc);
}

/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes
   at a time using passed-in lookup table.
   This selects one of two routines depending on the endianess of
   the architecture. */
uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
                          size_t len) {
    uint64_t n = 1;

    return *(char *)&n ? crcspeed64little(table, crc, buf, len)
                       : crcspeed64big(table, crc, buf, len);
}

uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
                          size_t len) {
    uint64_t n = 1;

    return *(char *)&n ? crcspeed16little(table, crc, buf, len)
                       : crcspeed16big(table, crc, buf, len);
}

/* Initialize CRC lookup table in architecture-dependent manner. */
void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) {
    uint64_t n = 1;

    *(char *)&n ? crcspeed64little_init(fn, table)
                : crcspeed64big_init(fn, table);
}

void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) {
    uint64_t n = 1;

    *(char *)&n ? crcspeed16little_init(fn, table)
                : crcspeed16big_init(fn, table);
}


================================================
FILE: src/redis/crcspeed.h
================================================
/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. */

#ifndef CRCSPEED_H
#define CRCSPEED_H

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);

/* CRC-64 */
void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]);

uint64_t crcspeed64little(uint64_t table[8][256], uint64_t crc, void *buf,
                          size_t len);
uint64_t crcspeed64big(uint64_t table[8][256], uint64_t crc, void *buf,
                       size_t len);
uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
                          size_t len);

/* CRC-16 */
void crcspeed16little_init(crcfn16 fn, uint16_t table[8][256]);
void crcspeed16big_init(crcfn16 fn, uint16_t table[8][256]);
void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]);

uint16_t crcspeed16little(uint16_t table[8][256], uint16_t crc, void *buf,
                          size_t len);
uint16_t crcspeed16big(uint16_t table[8][256], uint16_t crc, void *buf,
                       size_t len);
uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
                          size_t len);
#endif


================================================
FILE: src/redis/debug.c
================================================
/*
 * Copyright (c) 2009-2020, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2020, Redis Labs, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#include <stdarg.h>
#include <stdio.h>
#include <syslog.h>
#include <assert.h>

#include "util.h"

int verbosity = LL_NOTICE;

void serverLog(int level, const char *fmt, ...) {
    va_list ap;
    char msg[LOG_MAX_LEN];

    if ((level&0xff) < verbosity) return;

    va_start(ap, fmt);
    vsnprintf(msg, sizeof(msg), fmt, ap);
    va_end(ap);

    fprintf(stdout, "%s\n",msg);
}

void _serverPanic(const char *file, int line, const char *msg, ...) {
    va_list ap;
    va_start(ap,msg);
    char fmtmsg[256];
    vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap);
    va_end(ap);

    serverLog(LL_WARNING, "------------------------------------------------");
    serverLog(LL_WARNING, "!!! Software Failure. Press left mouse button to continue");
    serverLog(LL_WARNING, "Guru Meditation: %s #%s:%d", fmtmsg,file,line);
#ifndef NDEBUG
#if defined(__APPLE__)
    __assert_rtn(msg, file, line, "");
#elif defined(__FreeBSD__)
    __assert("", file, line, msg);
#else      
    __assert_fail(msg, file, line, "");
#endif    
#endif 
}

void _serverAssert(const char *estr, const char *file, int line) {
    serverLog(LL_WARNING,"=== ASSERTION FAILED ===");
    serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
}


================================================
FILE: src/redis/dict.c
================================================
/* Hash Tables Implementation.
 *
 * This file implements in memory hash tables with insert/del/replace/find/
 * get-random-element operations. Hash tables will auto resize if needed
 * tables of power of two in size are used, collisions are handled by
 * chaining. See the source code for more information... :)
 *
 * Copyright (c) 2006-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <sys/time.h>

#include "dict.h"
#include "zmalloc.h"

#if !defined(DICT_BENCHMARK_MAIN) && defined(ROMAN_REDIS_ASSERT_DISABLED)
#include "redisassert.h"
#else
#include <assert.h>
#endif

/* Using dictEnableResize() / dictDisableResize() we make possible to
 * enable/disable resizing of the hash table as needed. This is very important
 * for Redis, as we use copy-on-write and don't want to move too much memory
 * around when there is a child performing saving operations.
 *
 * Note that even when dict_can_resize is set to 0, not all resizes are
 * prevented: a hash table is still allowed to grow if the ratio between
 * the number of elements and the buckets > dict_force_resize_ratio. */
static int dict_can_resize = 1;
static unsigned int dict_force_resize_ratio = 5;

/* -------------------------- private prototypes ---------------------------- */

static int _dictExpandIfNeeded(dict *d);
static signed char _dictNextExp(unsigned long size);
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing);
static int _dictInit(dict *d, dictType *type);

/* -------------------------- hash functions -------------------------------- */

static uint8_t dict_hash_function_seed[16];

void dictSetHashFunctionSeed(uint8_t *seed) {
    memcpy(dict_hash_function_seed,seed,sizeof(dict_hash_function_seed));
}

uint8_t *dictGetHashFunctionSeed(void) {
    return dict_hash_function_seed;
}

/* The default hashing function uses SipHash implementation
 * in siphash.c. */

uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k);
uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k);

uint64_t dictGenHashFunction(const void *key, size_t len) {
    return siphash(key,len,dict_hash_function_seed);
}

uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len) {
    return siphash_nocase(buf,len,dict_hash_function_seed);
}

/* ----------------------------- API implementation ------------------------- */

/* Reset hash table parameters already initialized with _dictInit()*/
static void _dictReset(dict *d, int htidx)
{
    d->ht_table[htidx] = NULL;
    d->ht_size_exp[htidx] = -1;
    d->ht_used[htidx] = 0;
}

/* Create a new hash table */
dict *dictCreate(dictType *type)
{
    dict *d = zmalloc(sizeof(*d));

    _dictInit(d,type);
    return d;
}

/* Initialize the hash table */
int _dictInit(dict *d, dictType *type)
{
    _dictReset(d, 0);
    _dictReset(d, 1);
    d->type = type;
    d->rehashidx = -1;
    d->pauserehash = 0;
    return DICT_OK;
}

/* Resize the table to the minimal size that contains all the elements,
 * but with the invariant of a USED/BUCKETS ratio near to <= 1 */
int dictResize(dict *d)
{
    unsigned long minimal;

    if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;
    minimal = d->ht_used[0];
    if (minimal < DICT_HT_INITIAL_SIZE)
        minimal = DICT_HT_INITIAL_SIZE;
    return dictExpand(d, minimal);
}

/* Expand or create the hash table,
 * when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1).
 * Returns DICT_OK if expand was performed, and DICT_ERR if skipped. */
int _dictExpand(dict *d, unsigned long size, int* malloc_failed)
{
    if (malloc_failed) *malloc_failed = 0;

    /* the size is invalid if it is smaller than the number of
     * elements already inside the hash table */
    if (dictIsRehashing(d) || d->ht_used[0] > size)
        return DICT_ERR;

    /* the new hash table */
    dictEntry **new_ht_table;
    unsigned long new_ht_used;
    signed char new_ht_size_exp = _dictNextExp(size);

    /* Detect overflows */
    size_t newsize = 1ul<<new_ht_size_exp;
    if (newsize < size || newsize * sizeof(dictEntry*) < newsize)
        return DICT_ERR;

    /* Rehashing to the same table size is not useful. */
    if (new_ht_size_exp == d->ht_size_exp[0]) return DICT_ERR;

    /* Allocate the new hash table and initialize all pointers to NULL */
    if (malloc_failed) {
        new_ht_table = ztrycalloc(newsize*sizeof(dictEntry*));
        *malloc_failed = new_ht_table == NULL;
        if (*malloc_failed)
            return DICT_ERR;
    } else
        new_ht_table = zcalloc(newsize*sizeof(dictEntry*));

    new_ht_used = 0;

    /* Is this the first initialization? If so it's not really a rehashing
     * we just set the first hash table so that it can accept keys. */
    if (d->ht_table[0] == NULL) {
        d->ht_size_exp[0] = new_ht_size_exp;
        d->ht_used[0] = new_ht_used;
        d->ht_table[0] = new_ht_table;
        return DICT_OK;
    }

    /* Prepare a second hash table for incremental rehashing */
    d->ht_size_exp[1] = new_ht_size_exp;
    d->ht_used[1] = new_ht_used;
    d->ht_table[1] = new_ht_table;
    d->rehashidx = 0;
    return DICT_OK;
}

/* return DICT_ERR if expand was not performed */
int dictExpand(dict *d, unsigned long size) {
    return _dictExpand(d, size, NULL);
}

/* return DICT_ERR if expand failed due to memory allocation failure */
int dictTryExpand(dict *d, unsigned long size) {
    int malloc_failed;
    _dictExpand(d, size, &malloc_failed);
    return malloc_failed? DICT_ERR : DICT_OK;
}

/* Performs N steps of incremental rehashing. Returns 1 if there are still
 * keys to move from the old to the new hash table, otherwise 0 is returned.
 *
 * Note that a rehashing step consists in moving a bucket (that may have more
 * than one key as we use chaining) from the old to the new hash table, however
 * since part of the hash table may be composed of empty spaces, it is not
 * guaranteed that this function will rehash even a single bucket, since it
 * will visit at max N*10 empty buckets in total, otherwise the amount of
 * work it does would be unbound and the function may block for a long time. */
int dictRehash(dict *d, int n) {
    int empty_visits = n*10; /* Max number of empty buckets to visit. */
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht_used[0] != 0) {
        dictEntry *de, *nextde;

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(DICTHT_SIZE(d->ht_size_exp[0]) > (unsigned long)d->rehashidx);
        while(d->ht_table[0][d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht_table[0][d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {
            uint64_t h;

            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]);
            de->next = d->ht_table[1][h];
            d->ht_table[1][h] = de;
            d->ht_used[0]--;
            d->ht_used[1]++;
            de = nextde;
        }
        d->ht_table[0][d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Check if we already rehashed the whole table... */
    if (d->ht_used[0] == 0) {
        zfree(d->ht_table[0]);
        /* Copy the new ht onto the old one */
        d->ht_table[0] = d->ht_table[1];
        d->ht_used[0] = d->ht_used[1];
        d->ht_size_exp[0] = d->ht_size_exp[1];
        _dictReset(d, 1);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}

long long timeInMilliseconds(void) {
    struct timeval tv;

    gettimeofday(&tv,NULL);
    return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000);
}

/* Rehash in ms+"delta" milliseconds. The value of "delta" is larger 
 * than 0, and is smaller than 1 in most cases. The exact upper bound 
 * depends on the running time of dictRehash(d,100).*/
int dictRehashMilliseconds(dict *d, int ms) {
    if (d->pauserehash > 0) return 0;

    long long start = timeInMilliseconds();
    int rehashes = 0;

    while(dictRehash(d,100)) {
        rehashes += 100;
        if (timeInMilliseconds()-start > ms) break;
    }
    return rehashes;
}

/* This function performs just a step of rehashing, and only if hashing has
 * not been paused for our hash table. When we have iterators in the
 * middle of a rehashing we can't mess with the two hash tables otherwise
 * some element can be missed or duplicated.
 *
 * This function is called by common lookup or update operations in the
 * dictionary so that the hash table automatically migrates from H1 to H2
 * while it is actively used. */
static void _dictRehashStep(dict *d) {
    if (d->pauserehash == 0) dictRehash(d,1);
}

/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *entry = dictAddRaw(d,key,NULL);

    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}

/* Low level add or find:
 * This function adds the entry but instead of setting a value returns the
 * dictEntry structure to the user, that will make sure to fill the value
 * field as they wish.
 *
 * This function is also directly exposed to the user API to be called
 * mainly in order to store non-pointers inside the hash value, example:
 *
 * entry = dictAddRaw(dict,mykey,NULL);
 * if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
 *
 * Return values:
 *
 * If key already exists NULL is returned, and "*existing" is populated
 * with the existing entry if existing is not NULL.
 *
 * If key was added, the hash entry is returned to be manipulated by the caller.
 */
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    int htidx;

    if (dictIsRehashing(d)) _dictRehashStep(d);

    /* Get the index of the new element, or -1 if
     * the element already exists. */
    if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
        return NULL;

    /* Allocate the memory and store the new entry.
     * Insert the element in top, with the assumption that in a database
     * system it is more likely that recently added entries are accessed
     * more frequently. */
    htidx = dictIsRehashing(d) ? 1 : 0;
    size_t metasize = 0;
    entry = zmalloc(sizeof(*entry) + metasize);
    entry->next = d->ht_table[htidx][index];
    d->ht_table[htidx][index] = entry;
    d->ht_used[htidx]++;

    /* Set the hash entry fields. */
    dictSetKey(d, entry, key);
    return entry;
}

/* Add or Overwrite:
 * Add an element, discarding the old value if the key already exists.
 * Return 1 if the key was added from scratch, 0 if there was already an
 * element with such key and dictReplace() just performed a value update
 * operation. */
int dictReplace(dict *d, void *key, void *val)
{
    dictEntry *entry, *existing, auxentry;

    /* Try to add the element. If the key
     * does not exists dictAdd will succeed. */
    entry = dictAddRaw(d,key,&existing);
    if (entry) {
        dictSetVal(d, entry, val);
        return 1;
    }

    /* Set the new value and free the old one. Note that it is important
     * to do that in this order, as the value may just be exactly the same
     * as the previous one. In this context, think to reference counting,
     * you want to increment (set), and then decrement (free), and not the
     * reverse. */
    auxentry = *existing;
    dictSetVal(d, existing, val);
    dictFreeVal(d, &auxentry);
    return 0;
}

/* Add or Find:
 * dictAddOrFind() is simply a version of dictAddRaw() that always
 * returns the hash entry of the specified key, even if the key already
 * exists and can't be added (in that case the entry of the already
 * existing key is returned.)
 *
 * See dictAddRaw() for more information. */
dictEntry *dictAddOrFind(dict *d, void *key) {
    dictEntry *entry, *existing;
    entry = dictAddRaw(d,key,&existing);
    return entry ? entry : existing;
}

/* Search and remove an element. This is a helper function for
 * dictDelete() and dictUnlink(), please check the top comment
 * of those functions. */
static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
    uint64_t h, idx;
    dictEntry *he, *prevHe;
    int table;

    /* dict is empty */
    if (dictSize(d) == 0) return NULL;

    if (dictIsRehashing(d)) _dictRehashStep(d);
    h = dictHashKey(d, key);

    for (table = 0; table <= 1; table++) {
        idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        he = d->ht_table[table][idx];
        prevHe = NULL;
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                /* Unlink the element from the list */
                if (prevHe)
                    prevHe->next = he->next;
                else
                    d->ht_table[table][idx] = he->next;
                if (!nofree) {
                    dictFreeUnlinkedEntry(d, he);
                }
                d->ht_used[table]--;
                return he;
            }
            prevHe = he;
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return NULL; /* not found */
}

/* Remove an element, returning DICT_OK on success or DICT_ERR if the
 * element was not found. */
int dictDelete(dict *ht, const void *key) {
    return dictGenericDelete(ht,key,0) ? DICT_OK : DICT_ERR;
}

/* Remove an element from the table, but without actually releasing
 * the key, value and dictionary entry. The dictionary entry is returned
 * if the element was found (and unlinked from the table), and the user
 * should later call `dictFreeUnlinkedEntry()` with it in order to release it.
 * Otherwise if the key is not found, NULL is returned.
 *
 * This function is useful when we want to remove something from the hash
 * table but want to use its value before actually deleting the entry.
 * Without this function the pattern would require two lookups:
 *
 *  entry = dictFind(...);
 *  // Do something with entry
 *  dictDelete(dictionary,entry);
 *
 * Thanks to this function it is possible to avoid this, and use
 * instead:
 *
 * entry = dictUnlink(dictionary,entry);
 * // Do something with entry
 * dictFreeUnlinkedEntry(entry); // <- This does not need to lookup again.
 */
dictEntry *dictUnlink(dict *d, const void *key) {
    return dictGenericDelete(d,key,1);
}

/* You need to call this function to really free the entry after a call
 * to dictUnlink(). It's safe to call this function with 'he' = NULL. */
void dictFreeUnlinkedEntry(dict *d, dictEntry *he) {
    if (he == NULL) return;
    dictFreeKey(d, he);
    dictFreeVal(d, he);
    zfree(he);
}

/* Destroy an entire dictionary */
int _dictClear(dict *d, int htidx, void(callback)(dict*)) {
    unsigned long i;

    /* Free all the elements */
    for (i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]) && d->ht_used[htidx] > 0; i++) {
        dictEntry *he, *nextHe;

        if (callback && (i & 65535) == 0) callback(d);

        if ((he = d->ht_table[htidx][i]) == NULL) continue;
        while(he) {
            nextHe = he->next;
            dictFreeKey(d, he);
            dictFreeVal(d, he);
            zfree(he);
            d->ht_used[htidx]--;
            he = nextHe;
        }
    }
    /* Free the table and the allocated cache structure */
    zfree(d->ht_table[htidx]);
    /* Re-initialize the table */
    _dictReset(d, htidx);
    return DICT_OK; /* never fails */
}

/* Clear & Release the hash table */
void dictRelease(dict *d)
{
    _dictClear(d,0,NULL);
    _dictClear(d,1,NULL);
    zfree(d);
}

dictEntry *dictFind(dict *d, const void *key)
{
    dictEntry *he;
    uint64_t h, idx, table;

    if (dictSize(d) == 0) return NULL; /* dict is empty */
    if (dictIsRehashing(d)) _dictRehashStep(d);
    h = dictHashKey(d, key);
    for (table = 0; table <= 1; table++) {
        idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        he = d->ht_table[table][idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key))
                return he;
            he = he->next;
        }
        if (!dictIsRehashing(d)) return NULL;
    }
    return NULL;
}

void *dictFetchValue(dict *d, const void *key) {
    dictEntry *he;

    he = dictFind(d,key);
    return he ? dictGetVal(he) : NULL;
}

/* A fingerprint is a 64 bit number that represents the state of the dictionary
 * at a given time, it's just a few dict properties xored together.
 * When an unsafe iterator is initialized, we get the dict fingerprint, and check
 * the fingerprint again when the iterator is released.
 * If the two fingerprints are different it means that the user of the iterator
 * performed forbidden operations against the dictionary while iterating. */
unsigned long long dictFingerprint(dict *d) {
    unsigned long long integers[6], hash = 0;
    int j;

    integers[0] = (long) d->ht_table[0];
    integers[1] = d->ht_size_exp[0];
    integers[2] = d->ht_used[0];
    integers[3] = (long) d->ht_table[1];
    integers[4] = d->ht_size_exp[1];
    integers[5] = d->ht_used[1];

    /* We hash N integers by summing every successive integer with the integer
     * hashing of the previous sum. Basically:
     *
     * Result = hash(hash(hash(int1)+int2)+int3) ...
     *
     * This way the same set of integers in a different order will (likely) hash
     * to a different number. */
    for (j = 0; j < 6; j++) {
        hash += integers[j];
        /* For the hashing step we use Tomas Wang's 64 bit integer hash. */
        hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1;
        hash = hash ^ (hash >> 24);
        hash = (hash + (hash << 3)) + (hash << 8); // hash * 265
        hash = hash ^ (hash >> 14);
        hash = (hash + (hash << 2)) + (hash << 4); // hash * 21
        hash = hash ^ (hash >> 28);
        hash = hash + (hash << 31);
    }
    return hash;
}

dictIterator *dictGetIterator(dict *d)
{
    dictIterator *iter = zmalloc(sizeof(*iter));

    iter->d = d;
    iter->table = 0;
    iter->index = -1;
    iter->safe = 0;
    iter->entry = NULL;
    iter->nextEntry = NULL;
    return iter;
}

dictIterator *dictGetSafeIterator(dict *d) {
    dictIterator *i = dictGetIterator(d);

    i->safe = 1;
    return i;
}

dictEntry *dictNext(dictIterator *iter)
{
    while (1) {
        if (iter->entry == NULL) {
            if (iter->index == -1 && iter->table == 0) {
                if (iter->safe)
                    dictPauseRehashing(iter->d);
                else
                    iter->fingerprint = dictFingerprint(iter->d);
            }
            iter->index++;
            if (iter->index >= (long) DICTHT_SIZE(iter->d->ht_size_exp[iter->table])) {
                if (dictIsRehashing(iter->d) && iter->table == 0) {
                    iter->table++;
                    iter->index = 0;
                } else {
                    break;
                }
            }
            iter->entry = iter->d->ht_table[iter->table][iter->index];
        } else {
            iter->entry = iter->nextEntry;
        }
        if (iter->entry) {
            /* We need to save the 'next' here, the iterator user
             * may delete the entry we are returning. */
            iter->nextEntry = iter->entry->next;
            return iter->entry;
        }
    }
    return NULL;
}

void dictReleaseIterator(dictIterator *iter)
{
    if (!(iter->index == -1 && iter->table == 0)) {
        if (iter->safe)
            dictResumeRehashing(iter->d);
        else
            assert(iter->fingerprint == dictFingerprint(iter->d));
    }
    zfree(iter);
}

/* Function to reverse bits. Algorithm from:
 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
static unsigned long rev(unsigned long v) {
    unsigned long s = CHAR_BIT * sizeof(v); // bit size; must be power of 2
    unsigned long mask = ~0UL;
    while ((s >>= 1) > 0) {
        mask ^= (mask << s);
        v = ((v >> s) & mask) | ((v << s) & ~mask);
    }
    return v;
}

/* dictScan() is used to iterate over the elements of a dictionary.
 *
 * Iterating works the following way:
 *
 * 1) Initially you call the function using a cursor (v) value of 0.
 * 2) The function performs one step of the iteration, and returns the
 *    new cursor value you must use in the next call.
 * 3) When the returned cursor is 0, the iteration is complete.
 *
 * The function guarantees all elements present in the
 * dictionary get returned between the start and end of the iteration.
 * However it is possible some elements get returned multiple times.
 *
 * For every element returned, the callback argument 'fn' is
 * called with 'privdata' as first argument and the dictionary entry
 * 'de' as second argument.
 *
 * HOW IT WORKS.
 *
 * The iteration algorithm was designed by Pieter Noordhuis.
 * The main idea is to increment a cursor starting from the higher order
 * bits. That is, instead of incrementing the cursor normally, the bits
 * of the cursor are reversed, then the cursor is incremented, and finally
 * the bits are reversed again.
 *
 * This strategy is needed because the hash table may be resized between
 * iteration calls.
 *
 * dict.c hash tables are always power of two in size, and they
 * use chaining, so the position of an element in a given table is given
 * by computing the bitwise AND between Hash(key) and SIZE-1
 * (where SIZE-1 is always the mask that is equivalent to taking the rest
 *  of the division between the Hash of the key and SIZE).
 *
 * For example if the current hash table size is 16, the mask is
 * (in binary) 1111. The position of a key in the hash table will always be
 * the last four bits of the hash output, and so forth.
 *
 * WHAT HAPPENS IF THE TABLE CHANGES IN SIZE?
 *
 * If the hash table grows, elements can go anywhere in one multiple of
 * the old bucket: for example let's say we already iterated with
 * a 4 bit cursor 1100 (the mask is 1111 because hash table size = 16).
 *
 * If the hash table will be resized to 64 elements, then the new mask will
 * be 111111. The new buckets you obtain by substituting in ??1100
 * with either 0 or 1 can be targeted only by keys we already visited
 * when scanning the bucket 1100 in the smaller hash table.
 *
 * By iterating the higher bits first, because of the inverted counter, the
 * cursor does not need to restart if the table size gets bigger. It will
 * continue iterating using cursors without '1100' at the end, and also
 * without any other combination of the final 4 bits already explored.
 *
 * Similarly when the table size shrinks over time, for example going from
 * 16 to 8, if a combination of the lower three bits (the mask for size 8
 * is 111) were already completely explored, it would not be visited again
 * because we are sure we tried, for example, both 0111 and 1111 (all the
 * variations of the higher bit) so we don't need to test it again.
 *
 * WAIT... YOU HAVE *TWO* TABLES DURING REHASHING!
 *
 * Yes, this is true, but we always iterate the smaller table first, then
 * we test all the expansions of the current cursor into the larger
 * table. For example if the current cursor is 101 and we also have a
 * larger table of size 16, we also test (0)101 and (1)101 inside the larger
 * table. This reduces the problem back to having only one table, where
 * the larger one, if it exists, is just an expansion of the smaller one.
 *
 * LIMITATIONS
 *
 * This iterator is completely stateless, and this is a huge advantage,
 * including no additional memory used.
 *
 * The disadvantages resulting from this design are:
 *
 * 1) It is possible we return elements more than once. However this is usually
 *    easy to deal with in the application level.
 * 2) The iterator must return multiple elements per call, as it needs to always
 *    return all the keys chained in a given bucket, and all the expansions, so
 *    we are sure we don't miss keys moving during rehashing.
 * 3) The reverse cursor is somewhat hard to understand at first, but this
 *    comment is supposed to help.
 */
unsigned long dictScan(dict *d,
                       unsigned long v,
                       dictScanFunction *fn,
                       dictScanBucketFunction* bucketfn,
                       void *privdata)
{
    int htidx0, htidx1;
    const dictEntry *de, *next;
    unsigned long m0, m1;

    if (dictSize(d) == 0) return 0;

    /* This is needed in case the scan callback tries to do dictFind or alike. */
    dictPauseRehashing(d);

    if (!dictIsRehashing(d)) {
        htidx0 = 0;
        m0 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx0]);

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(d, &d->ht_table[htidx0][v & m0]);
        de = d->ht_table[htidx0][v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Set unmasked bits so incrementing the reversed cursor
         * operates on the masked bits */
        v |= ~m0;

        /* Increment the reverse cursor */
        v = rev(v);
        v++;
        v = rev(v);

    } else {
        htidx0 = 0;
        htidx1 = 1;

        /* Make sure t0 is the smaller and t1 is the bigger table */
        if (DICTHT_SIZE(d->ht_size_exp[htidx0]) > DICTHT_SIZE(d->ht_size_exp[htidx1])) {
            htidx0 = 1;
            htidx1 = 0;
        }

        m0 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx0]);
        m1 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx1]);

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(d, &d->ht_table[htidx0][v & m0]);
        de = d->ht_table[htidx0][v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Iterate over indices in larger table that are the expansion
         * of the index pointed to by the cursor in the smaller table */
        do {
            /* Emit entries at cursor */
            if (bucketfn) bucketfn(d, &d->ht_table[htidx1][v & m1]);
            de = d->ht_table[htidx1][v & m1];
            while (de) {
                next = de->next;
                fn(privdata, de);
                de = next;
            }

            /* Increment the reverse cursor not covered by the smaller mask.*/
            v |= ~m1;
            v = rev(v);
            v++;
            v = rev(v);

            /* Continue while bits covered by mask difference is non-zero */
        } while (v & (m0 ^ m1));
    }

    dictResumeRehashing(d);

    return v;
}

/* ------------------------- private functions ------------------------------ */

/* Because we may need to allocate huge memory chunk at once when dict
 * expands, we will check this allocation is allowed or not if the dict
 * type has expandAllowed member function. */
static int dictTypeExpandAllowed(dict *d) {
    if (d->type->expandAllowed == NULL) return 1;
    return d->type->expandAllowed(
                    DICTHT_SIZE(_dictNextExp(d->ht_used[0] + 1)) * sizeof(dictEntry*),
                    (double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]));
}

/* Expand the hash table if needed */
static int _dictExpandIfNeeded(dict *d)
{
    /* Incremental rehashing already in progress. Return. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* If the hash table is empty expand it to the initial size. */
    if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if (d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0]) &&
        (dict_can_resize ||
         d->ht_used[0]/ DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio) &&
        dictTypeExpandAllowed(d))
    {
        return dictExpand(d, d->ht_used[0] + 1);
    }
    return DICT_OK;
}

/* TODO: clz optimization */
/* Our hash table capability is a power of two */
static signed char _dictNextExp(unsigned long size)
{
    unsigned char e = DICT_HT_INITIAL_EXP;

    if (size >= LONG_MAX) return (8*sizeof(long)-1);
    while(1) {
        if (((unsigned long)1<<e) >= size)
            return e;
        e++;
    }
}

/* Returns the index of a free slot that can be populated with
 * a hash entry for the given 'key'.
 * If the key already exists, -1 is returned
 * and the optional output parameter may be filled.
 *
 * Note that if we are in the process of rehashing the hash table, the
 * index is always returned in the context of the second (new) hash table. */
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
    unsigned long idx, table;
    dictEntry *he;
    if (existing) *existing = NULL;

    /* Expand the hash table if needed */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;
    for (table = 0; table <= 1; table++) {
        idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        /* Search if this slot does not already contain the given key */
        he = d->ht_table[table][idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                if (existing) *existing = he;
                return -1;
            }
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return idx;
}

void dictEmpty(dict *d, void(callback)(dict*)) {
    _dictClear(d,0,callback);
    _dictClear(d,1,callback);
    d->rehashidx = -1;
    d->pauserehash = 0;
}

void dictEnableResize(void) {
    dict_can_resize = 1;
}

void dictDisableResize(void) {
    dict_can_resize = 0;
}

uint64_t dictGetHash(dict *d, const void *key) {
    return dictHashKey(d, key);
}

/* Finds the dictEntry reference by using pointer and pre-calculated hash.
 * oldkey is a dead pointer and should not be accessed.
 * the hash value should be provided using dictGetHash.
 * no string / key comparison is performed.
 * return value is the reference to the dictEntry if found, or NULL if not found. */
dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) {
    dictEntry *he, **heref;
    unsigned long idx, table;

    if (dictSize(d) == 0) return NULL; /* dict is empty */
    for (table = 0; table <= 1; table++) {
        idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        heref = &d->ht_table[table][idx];
        he = *heref;
        while(he) {
            if (oldptr==he->key)
                return heref;
            heref = &he->next;
            he = *heref;
        }
        if (!dictIsRehashing(d)) return NULL;
    }
    return NULL;
}

/* ------------------------------- Debugging ---------------------------------*/

#define DICT_STATS_VECTLEN 50
size_t _dictGetStatsHt(char *buf, size_t bufsize, dict *d, int htidx) {
    unsigned long i, slots = 0, chainlen, maxchainlen = 0;
    unsigned long totchainlen = 0;
    unsigned long clvector[DICT_STATS_VECTLEN];
    size_t l = 0;

    if (d->ht_used[htidx] == 0) {
        return snprintf(buf,bufsize,
            "No stats available for empty dictionaries\n");
    }

    /* Compute stats. */
    for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0;
    for (i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]); i++) {
        dictEntry *he;

        if (d->ht_table[htidx][i] == NULL) {
            clvector[0]++;
            continue;
        }
        slots++;
        /* For each hash entry on this slot... */
        chainlen = 0;
        he = d->ht_table[htidx][i];
        while(he) {
            chainlen++;
            he = he->next;
        }
        clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++;
        if (chainlen > maxchainlen) maxchainlen = chainlen;
        totchainlen += chainlen;
    }

    /* Generate human readable stats. */
    l += snprintf(buf+l,bufsize-l,
        "Hash table %d stats (%s):\n"
        " table size: %lu\n"
        " number of elements: %lu\n"
        " different slots: %lu\n"
        " max chain length: %lu\n"
        " avg chain length (counted): %.02f\n"
        " avg chain length (computed): %.02f\n"
        " Chain length distribution:\n",
        htidx, (htidx == 0) ? "main hash table" : "rehashing target",
        DICTHT_SIZE(d->ht_size_exp[htidx]), d->ht_used[htidx], slots, maxchainlen,
        (float)totchainlen/slots, (float)d->ht_used[htidx]/slots);

    for (i = 0; i < DICT_STATS_VECTLEN-1; i++) {
        if (clvector[i] == 0) continue;
        if (l >= bufsize) break;
        l += snprintf(buf+l,bufsize-l,
            "   %ld: %ld (%.02f%%)\n",
            i, clvector[i], ((float)clvector[i]/DICTHT_SIZE(d->ht_size_exp[htidx]))*100);
    }

    /* Unlike snprintf(), return the number of characters actually written. */
    if (bufsize) buf[bufsize-1] = '\0';
    return strlen(buf);
}

void dictGetStats(char *buf, size_t bufsize, dict *d) {
    size_t l;
    char *orig_buf = buf;
    size_t orig_bufsize = bufsize;

    l = _dictGetStatsHt(buf,bufsize,d,0);
    buf += l;
    bufsize -= l;
    if (dictIsRehashing(d) && bufsize > 0) {
        _dictGetStatsHt(buf,bufsize,d,1);
    }
    /* Make sure there is a NULL term at the end. */
    if (orig_bufsize) orig_buf[orig_bufsize-1] = '\0';
}

/* ------------------------------- Benchmark ---------------------------------*/

#ifdef REDIS_TEST
#include "testhelp.h"

#define UNUSED(V) ((void) V)

uint64_t hashCallback(const void *key) {
    return dictGenHashFunction((unsigned char*)key, strlen((char*)key));
}

int compareCallback(dict *d, const void *key1, const void *key2) {
    int l1,l2;
    UNUSED(d);

    l1 = strlen((char*)key1);
    l2 = strlen((char*)key2);
    if (l1 != l2) return 0;
    return memcmp(key1, key2, l1) == 0;
}

void freeCallback(dict *d, void *val) {
    UNUSED(d);

    zfree(val);
}

char *stringFromLongLong(long long value) {
    char buf[32];
    int len;
    char *s;

    len = sprintf(buf,"%lld",value);
    s = zmalloc(len+1);
    memcpy(s, buf, len);
    s[len] = '\0';
    return s;
}

dictType BenchmarkDictType = {
    hashCallback,
    NULL,
    NULL,
    compareCallback,
    freeCallback,
    NULL,
    NULL
};

#define start_benchmark() start = timeInMilliseconds()
#define end_benchmark(msg) do { \
    elapsed = timeInMilliseconds()-start; \
    printf(msg ": %ld items in %lld ms\n", count, elapsed); \
} while(0)

/* ./redis-server test dict [<count> | --accurate] */
int dictTest(int argc, char **argv, int flags) {
    long j;
    long long start, elapsed;
    dict *dict = dictCreate(&BenchmarkDictType);
    long count = 0;
    int accurate = (flags & REDIS_TEST_ACCURATE);

    if (argc == 4) {
        if (accurate) {
            count = 5000000;
        } else {
            count = strtol(argv[3],NULL,10);
        }
    } else {
        count = 5000;
    }

    start_benchmark();
    for (j = 0; j < count; j++) {
        int retval = dictAdd(dict,stringFromLongLong(j),(void*)j);
        assert(retval == DICT_OK);
    }
    end_benchmark("Inserting");
    assert((long)dictSize(dict) == count);

    /* Wait for rehashing. */
    while (dictIsRehashing(dict)) {
        dictRehashMilliseconds(dict,100);
    }

    start_benchmark();
    for (j = 0; j < count; j++) {
        char *key = stringFromLongLong(j);
        dictEntry *de = dictFind(dict,key);
        assert(de != NULL);
        zfree(key);
    }
    end_benchmark("Linear access of existing elements");

    start_benchmark();
    for (j = 0; j < count; j++) {
        char *key = stringFromLongLong(j);
        dictEntry *de = dictFind(dict,key);
        assert(de != NULL);
        zfree(key);
    }
    end_benchmark("Linear access of existing elements (2nd round)");

    start_benchmark();
    for (j = 0; j < count; j++) {
        char *key = stringFromLongLong(rand() % count);
        dictEntry *de = dictFind(dict,key);
        assert(de != NULL);
        zfree(key);
    }
    end_benchmark("Random access of existing elements");

    start_benchmark();
    for (j = 0; j < count; j++) {
        dictEntry *de = dictGetRandomKey(dict);
        assert(de != NULL);
    }
    end_benchmark("Accessing random keys");

    start_benchmark();
    for (j = 0; j < count; j++) {
        char *key = stringFromLongLong(rand() % count);
        key[0] = 'X';
        dictEntry *de = dictFind(dict,key);
        assert(de == NULL);
        zfree(key);
    }
    end_benchmark("Accessing missing");

    start_benchmark();
    for (j = 0; j < count; j++) {
        char *key = stringFromLongLong(j);
        int retval = dictDelete(dict,key);
        assert(retval == DICT_OK);
        key[0] += 17; /* Change first number to letter. */
        retval = dictAdd(dict,key,(void*)j);
        assert(retval == DICT_OK);
    }
    end_benchmark("Removing and adding");
    dictRelease(dict);
    return 0;
}
#endif


================================================
FILE: src/redis/dict.h
================================================
/* Hash Tables Implementation.
 *
 * This file implements in-memory hash tables with insert/del/replace/find/
 * get-random-element operations. Hash tables will auto-resize if needed
 * tables of power of two in size are used, collisions are handled by
 * chaining. See the source code for more information... :)
 *
 * Copyright (c) 2006-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __DICT_H
#define __DICT_H

#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

#define DICT_OK 0
#define DICT_ERR 1

/* Unused arguments generate annoying warnings... */
#define DICT_NOTUSED(V) ((void) V)

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;     /* Next entry in the same hash bucket. */
} dictEntry;

typedef struct dict dict;

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(dict *d, const void *key);
    void *(*valDup)(dict *d, const void *obj);
    int (*keyCompare)(dict *d, const void *key1, const void *key2);
    void (*keyDestructor)(dict *d, void *key);
    void (*valDestructor)(dict *d, void *obj);
    int (*expandAllowed)(size_t moreMem, double usedRatio);
} dictType;

#define DICTHT_SIZE(exp) ((exp) == -1 ? 0 : (unsigned long)1<<(exp))
#define DICTHT_SIZE_MASK(exp) ((exp) == -1 ? 0 : (DICTHT_SIZE(exp))-1)

struct dict {
    dictType *type;

    dictEntry **ht_table[2];
    unsigned long ht_used[2];

    long rehashidx; /* rehashing not in progress if rehashidx == -1 */

    /* Keep small vars at end for optimal (minimal) struct padding */
    int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
    signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */
};

/* If safe is set to 1 this is a safe iterator, that means, you can call
 * dictAdd, dictFind, and other functions against the dictionary even while
 * iterating. Otherwise it is a non safe iterator, and only dictNext()
 * should be called while iterating. */
typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    unsigned long long fingerprint;
} dictIterator;

typedef void (dictScanFunction)(void *privdata, const dictEntry *de);
typedef void (dictScanBucketFunction)(dict *d, dictEntry **bucketref);

/* This is the initial size of every hash table */
#define DICT_HT_INITIAL_EXP      2
#define DICT_HT_INITIAL_SIZE     (1<<(DICT_HT_INITIAL_EXP))

/* ------------------------------- Macros ------------------------------------*/
#define dictFreeVal(d, entry) \
    if ((d)->type->valDestructor) \
        (d)->type->valDestructor((d), (entry)->v.val)

#define dictSetVal(d, entry, _val_) do { \
    if ((d)->type->valDup) \
        (entry)->v.val = (d)->type->valDup((d), _val_); \
    else \
        (entry)->v.val = (_val_); \
} while(0)

#define dictSetSignedIntegerVal(entry, _val_) \
    do { (entry)->v.s64 = _val_; } while(0)

#define dictSetUnsignedIntegerVal(entry, _val_) \
    do { (entry)->v.u64 = _val_; } while(0)

#define dictSetDoubleVal(entry, _val_) \
    do { (entry)->v.d = _val_; } while(0)

#define dictFreeKey(d, entry) \
    if ((d)->type->keyDestructor) \
        (d)->type->keyDestructor((d), (entry)->key)

#define dictSetKey(d, entry, _key_) do { \
    if ((d)->type->keyDup) \
        (entry)->key = (d)->type->keyDup((d), _key_); \
    else \
        (entry)->key = (_key_); \
} while(0)

#define dictCompareKeys(d, key1, key2) \
    (((d)->type->keyCompare) ? \
        (d)->type->keyCompare((d), key1, key2) : \
        (key1) == (key2))

#define dictHashKey(d, key) (d)->type->hashFunction(key)
#define dictGetKey(he) ((he)->key)
#define dictGetVal(he) ((he)->v.val)
#define dictGetSignedIntegerVal(he) ((he)->v.s64)
#define dictGetUnsignedIntegerVal(he) ((he)->v.u64)
#define dictGetDoubleVal(he) ((he)->v.d)
#define dictSlots(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1]))
#define dictSize(d) ((d)->ht_used[0]+(d)->ht_used[1])
#define dictIsRehashing(d) ((d)->rehashidx != -1)
#define dictPauseRehashing(d) (d)->pauserehash++
#define dictResumeRehashing(d) (d)->pauserehash--

/* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG. */
#if ULONG_MAX >= 0xffffffffffffffff
#define randomULong() ((unsigned long) genrand64_int64())
#else
#define randomULong() random()
#endif

/* API */
dict *dictCreate(dictType *type);
int dictExpand(dict *d, unsigned long size);
int dictTryExpand(dict *d, unsigned long size);
int dictAdd(dict *d, void *key, void *val);
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
dictEntry *dictAddOrFind(dict *d, void *key);
int dictReplace(dict *d, void *key, void *val);
int dictDelete(dict *d, const void *key);
dictEntry *dictUnlink(dict *d, const void *key);
void dictFreeUnlinkedEntry(dict *d, dictEntry *he);
void dictRelease(dict *d);
dictEntry * dictFind(dict *d, const void *key);
void *dictFetchValue(dict *d, const void *key);
int dictResize(dict *d);
dictIterator *dictGetIterator(dict *d);
dictIterator *dictGetSafeIterator(dict *d);
dictEntry *dictNext(dictIterator *iter);
void dictReleaseIterator(dictIterator *iter);
dictEntry *dictGetRandomKey(dict *d);
dictEntry *dictGetFairRandomKey(dict *d);
unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count);
void dictGetStats(char *buf, size_t bufsize, dict *d);
uint64_t dictGenHashFunction(const void *key, size_t len);
uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len);
void dictEmpty(dict *d, void(callback)(dict*));
void dictEnableResize(void);
void dictDisableResize(void);
int dictRehash(dict *d, int n);
int dictRehashMilliseconds(dict *d, int ms);
void dictSetHashFunctionSeed(uint8_t *seed);
uint8_t *dictGetHashFunctionSeed(void);
unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, dictScanBucketFunction *bucketfn, void *privdata);
uint64_t dictGetHash(dict *d, const void *key);
dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);

#endif /* __DICT_H */


================================================
FILE: src/redis/endianconv.h
================================================
/* See endianconv.c top comments for more information
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2011-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __ENDIANCONV_H
#define __ENDIANCONV_H

#include "config.h"
#include <stdint.h>

void memrev16(void *p);
void memrev32(void *p);
void memrev64(void *p);
uint16_t intrev16(uint16_t v);
uint32_t intrev32(uint32_t v);
uint64_t intrev64(uint64_t v);

/* variants of the function doing the actual conversion only if the target
 * host is big endian */
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define memrev16ifbe(p) ((void)(0))
#define memrev32ifbe(p) ((void)(0))
#define memrev64ifbe(p) ((void)(0))
#define intrev16ifbe(v) (v)
#define intrev32ifbe(v) (v)
#define intrev64ifbe(v) (v)
#else
#define memrev16ifbe(p) memrev16(p)
#define memrev32ifbe(p) memrev32(p)
#define memrev64ifbe(p) memrev64(p)
#define intrev16ifbe(v) intrev16(v)
#define intrev32ifbe(v) intrev32(v)
#define intrev64ifbe(v) intrev64(v)
#endif

/* The functions htonu64() and ntohu64() convert the specified value to
 * network byte ordering and back. In big endian systems they are no-ops. */
#if (BYTE_ORDER == BIG_ENDIAN)
#define htonu64(v) (v)
#define ntohu64(v) (v)
#else
#define htonu64(v) intrev64(v)
#define ntohu64(v) intrev64(v)
#endif

#ifdef REDIS_TEST
int endianconvTest(int argc, char *argv[], int flags);
#endif

#endif


================================================
FILE: src/redis/geo.c
================================================
/*
 * Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
 * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>

#include "geo.h"
#include "geohash_helper.h"
#include "listpack.h"
#include "util.h"
#include "zmalloc.h"
#include "sds.h"


// D - noop
#define D(...) while (0)

/* ====================================================================
 * This file implements the following commands:
 *
 *   - geoadd - add coordinates for value to geoset
 *   - georadius - search radius by coordinates in geoset
 *   - georadiusbymember - search radius based on geoset member position
 * ==================================================================== */

/* ====================================================================
 * geoArray implementation
 * ==================================================================== */

/* Create a new array of geoPoints. */
geoArray *geoArrayCreate(void) {
    geoArray *ga = zmalloc(sizeof(*ga));
    /* It gets allocated on first geoArrayAppend() call. */
    ga->array = NULL;
    ga->buckets = 0;
    ga->used = 0;
    return ga;
}

/* Add and populate with data a new entry to the geoArray. */
geoPoint *geoArrayAppend(geoArray *ga, double *xy, double dist,
                         double score, char *member)
{
    if (ga->used == ga->buckets) {
        ga->buckets = (ga->buckets == 0) ? 8 : ga->buckets*2;
        ga->array = zrealloc(ga->array,sizeof(geoPoint)*ga->buckets);
    }
    geoPoint *gp = ga->array+ga->used;
    gp->longitude = xy[0];
    gp->latitude = xy[1];
    gp->dist = dist;
    gp->member = member;
    gp->score = score;
    ga->used++;
    return gp;
}

/* Destroy a geoArray created with geoArrayCreate(). */
void geoArrayFree(geoArray *ga) {
    size_t i;
    for (i = 0; i < ga->used; i++) sdsfree(ga->array[i].member);
    zfree(ga->array);
    zfree(ga);
}

/* ====================================================================
 * Helpers
 * ==================================================================== */
int decodeGeohash(double bits, double *xy) {
    GeoHashBits hash = { .bits = (uint64_t)bits, .step = GEO_STEP_MAX };
    return geohashDecodeToLongLatWGS84(hash, xy);
}


/* Helper function for geoGetPointsInRange(): given a sorted set score
 * representing a point, and a GeoShape, checks if the point is within the search area.
 *
 * shape: the rectangle
 * score: the encoded version of lat,long
 * xy: output variable, the decoded lat,long
 * distance: output variable, the distance between the center of the shape and the point
 *
 * Return values:
 *
 * The return value is C_OK if the point is within search area, or C_ERR if it is outside.
 * "*xy" is populated with the decoded lat,long.
 * "*distance" is populated with the distance between the center of the shape and the point.
 */
int geoWithinShape(GeoShape *shape, double score, double *xy, double *distance) {
    if (!decodeGeohash(score,xy)) return C_ERR; /* Can't decode. */
    /* Note that geohashGetDistanceIfInRadiusWGS84() takes arguments in
     * reverse order: longitude first, latitude later. */
    if (shape->type == CIRCULAR_TYPE) {
        if (!geohashGetDistanceIfInRadiusWGS84(shape->xy[0], shape->xy[1], xy[0], xy[1],
                                               shape->t.radius*shape->conversion, distance))
            return C_ERR;
    } else if (shape->type == RECTANGLE_TYPE) {
        if (!geohashGetDistanceIfInRectangle(shape->t.r.width * shape->conversion,
                                             shape->t.r.height * shape->conversion,
                                             shape->xy[0], shape->xy[1], xy[0], xy[1], distance))
            return C_ERR;
    }
    return C_OK;
}

/* Compute the sorted set scores min (inclusive), max (exclusive) we should
 * query in order to retrieve all the elements inside the specified area
 * 'hash'. The two scores are returned by reference in *min and *max. */
void scoresOfGeoHashBox(GeoHashBits hash, GeoHashFix52Bits *min, GeoHashFix52Bits *max) {
    /* We want to compute the sorted set scores that will include all the
     * elements inside the specified Geohash 'hash', which has as many
     * bits as specified by hash.step * 2.
     *
     * So if step is, for example, 3, and the hash value in binary
     * is 101010, since our score is 52 bits we want every element which
     * is in binary: 101010?????????????????????????????????????????????
     * Where ? can be 0 or 1.
     *
     * To get the min score we just use the initial hash value left
     * shifted enough to get the 52 bit value. Later we increment the
     * 6 bit prefix (see the hash.bits++ statement), and get the new
     * prefix: 101011, which we align again to 52 bits to get the maximum
     * value (which is excluded from the search). So we get everything
     * between the two following scores (represented in binary):
     *
     * 1010100000000000000000000000000000000000000000000000 (included)
     * and
     * 1010110000000000000000000000000000000000000000000000 (excluded).
     */
    *min = geohashAlign52Bits(hash);
    hash.bits++;
    *max = geohashAlign52Bits(hash);
}

================================================
FILE: src/redis/geo.h
================================================
#ifndef __GEO_H__
#define __GEO_H__

#include <stddef.h> /* for size_t */
#include "geohash_helper.h"

/* Structures used inside geo.c in order to represent points and array of
 * points on the earth. */
typedef struct geoPoint {
    double longitude;
    double latitude;
    double dist;
    double score;
    char *member;
} geoPoint;

typedef struct geoArray {
    struct geoPoint *array;
    size_t buckets;
    size_t used;
} geoArray;

int geoWithinShape(GeoShape *shape, double score, double *xy, double *distance);
void scoresOfGeoHashBox(GeoHashBits hash, GeoHashFix52Bits *min, GeoHashFix52Bits *max);

#endif


================================================
FILE: src/redis/geohash.c
================================================
/*
 * Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
 * Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
 * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Redis nor the names of its contributors may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>

#include "geohash.h"

/**
 * Hashing works like this:
 * Divide the world into 4 buckets.  Label each one as such:
 *  -----------------
 *  |       |       |
 *  |       |       |
 *  | 0,1   | 1,1   |
 *  -----------------
 *  |       |       |
 *  |       |       |
 *  | 0,0   | 1,0   |
 *  -----------------
 */

/* Interleave lower bits of x and y, so the bits of x
 * are in the even positions and bits from y in the odd;
 * x and y must initially be less than 2**32 (4294967296).
 * From:  https://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
 */
static inline uint64_t interleave64(uint32_t xlo, uint32_t ylo) {
    static const uint64_t B[] = {0x5555555555555555ULL, 0x3333333333333333ULL,
                                 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
                                 0x0000FFFF0000FFFFULL};
    static const unsigned int S[] = {1, 2, 4, 8, 16};

    uint64_t x = xlo;
    uint64_t y = ylo;

    x = (x | (x << S[4])) & B[4];
    y = (y | (y << S[4])) & B[4];

    x = (x | (x << S[3])) & B[3];
    y = (y | (y << S[3])) & B[3];

    x = (x | (x << S[2])) & B[2];
    y = (y | (y << S[2])) & B[2];

    x = (x | (x << S[1])) & B[1];
    y = (y | (y << S[1])) & B[1];

    x = (x | (x << S[0])) & B[0];
    y = (y | (y << S[0])) & B[0];

    return x | (y << 1);
}

/* reverse the interleave process
 * derived from http://stackoverflow.com/questions/4909263
 */
static inline uint64_t deinterleave64(uint64_t interleaved) {
    static const uint64_t B[] = {0x5555555555555555ULL, 0x3333333333333333ULL,
                                 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
                                 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
    static const unsigned int S[] = {0, 1, 2, 4, 8, 16};

    uint64_t x = interleaved;
    uint64_t y = interleaved >> 1;

    x = (x | (x >> S[0])) & B[0];
    y = (y | (y >> S[0])) & B[0];

    x = (x | (x >> S[1])) & B[1];
    y = (y | (y >> S[1])) & B[1];

    x = (x | (x >> S[2])) & B[2];
    y = (y | (y >> S[2])) & B[2];

    x = (x | (x >> S[3])) & B[3];
    y = (y | (y >> S[3])) & B[3];

    x = (x | (x >> S[4])) & B[4];
    y = (y | (y >> S[4])) & B[4];

    x = (x | (x >> S[5])) & B[5];
    y = (y | (y >> S[5])) & B[5];

    return x | (y << 32);
}

void geohashGetCoordRange(GeoHashRange *long_range, GeoHashRange *lat_range) {
    /* These are constraints from EPSG:900913 / EPSG:3785 / OSGEO:41001 */
    /* We can't geocode at the north/south pole. */
    long_range->max = GEO_LONG_MAX;
    long_range->min = GEO_LONG_MIN;
    lat_range->max = GEO_LAT_MAX;
    lat_range->min = GEO_LAT_MIN;
}

int geohashEncode(const GeoHashRange *long_range, const GeoHashRange *lat_range,
                  double longitude, double latitude, uint8_t step,
                  GeoHashBits *hash) {
    /* Check basic arguments sanity. */
    if (hash == NULL || step > 32 || step == 0 ||
        RANGEPISZERO(lat_range) || RANGEPISZERO(long_range)) return 0;

    /* Return an error when trying to index outside the supported
     * constraints. */
    if (longitude > GEO_LONG_MAX || longitude < GEO_LONG_MIN ||
        latitude > GEO_LAT_MAX || latitude < GEO_LAT_MIN) return 0;

    hash->bits = 0;
    hash->step = step;

    if (latitude < lat_range->min || latitude > lat_range->max ||
        longitude < long_range->min || longitude > long_range->max) {
        return 0;
    }

    double lat_offset =
        (latitude - lat_range->min) / (lat_range->max - lat_range->min);
    double long_offset =
        (longitude - long_range->min) / (long_range->max - long_range->min);

    /* convert to fixed point based on the step size */
    lat_offset *= (1ULL << step);
    long_offset *= (1ULL << step);
    hash->bits = interleave64(lat_offset, long_offset);
    return 1;
}

int geohashEncodeType(double longitude, double latitude, uint8_t step, GeoHashBits *hash) {
    GeoHashRange r[2] = {{0}};
    geohashGetCoordRange(&r[0], &r[1]);
    return geohashEncode(&r[0], &r[1], longitude, latitude, step, hash);
}

int geohashEncodeWGS84(double longitude, double latitude, uint8_t step,
                       GeoHashBits *hash) {
    return geohashEncodeType(longitude, latitude, step, hash);
}

int geohashDecode(const GeoHashRange long_range, const GeoHashRange lat_range,
                   const GeoHashBits hash, GeoHashArea *area) {
    if (HASHISZERO(hash) || NULL == area || RANGEISZERO(lat_range) ||
        RANGEISZERO(long_range)) {
        return 0;
    }

    area->hash = hash;
    uint8_t step = hash.step;
    uint64_t hash_sep = deinterleave64(hash.bits); /* hash = [LAT][LONG] */

    double lat_scale = lat_range.max - lat_range.min;
    double long_scale = long_range.max - long_range.min;

    uint32_t ilato = hash_sep;       /* get lat part of deinterleaved hash */
    uint32_t ilono = hash_sep >> 32; /* shift over to get long part of hash */

    /* divide by 2**step.
     * Then, for 0-1 coordinate, multiply times scale and add
       to the min to get the absolute coordinate. */
    area->latitude.min =
        lat_range.min + (ilato * 1.0 / (1ull << step)) * lat_scale;
    area->latitude.max =
        lat_range.min + ((ilato + 1) * 1.0 / (1ull << step)) * lat_scale;
    area->longitude.min =
        long_range.min + (ilono * 1.0 / (1ull << step)) * long_scale;
    area->longitude.max =
        long_range.min + ((ilono + 1) * 1.0 / (1ull << step)) * long_scale;

    return 1;
}

int geohashDecodeType(const GeoHashBits hash, GeoHashArea *area) {
    GeoHashRange r[2] = {{0}};
    geohashGetCoordRange(&r[0], &r[1]);
    return geohashDecode(r[0], r[1], hash, area);
}

int geohashDecodeWGS84(const GeoHashBits hash, GeoHashArea *area) {
    return geohashDecodeType(hash, area);
}

int geohashDecodeAreaToLongLat(const GeoHashArea *area, double *xy) {
    if (!xy) return 0;
    xy[0] = (area->longitude.min + area->longitude.max) / 2;
    if (xy[0] > GEO_LONG_MAX) xy[0] = GEO_LONG_MAX;
    if (xy[0] < GEO_LONG_MIN) xy[0] = GEO_LONG_MIN;
    xy[1] = (area->latitude.min + area->latitude.max) / 2;
    if (xy[1] > GEO_LAT_MAX) xy[1] = GEO_LAT_MAX;
    if (xy[1] < GEO_LAT_MIN) xy[1] = GEO_LAT_MIN;
    return 1;
}

int geohashDecodeToLongLatType(const GeoHashBits hash, double *xy) {
    GeoHashArea area;
    memset(&area, 0, sizeof(area));

    if (!xy || !geohashDecodeType(hash, &area))
        return 0;
    return geohashDecodeAreaToLongLat(&area, xy);
}

int geohashDecodeToLongLatWGS84(const GeoHashBits hash, double *xy) {
    return geohashDecodeToLongLatType(hash, xy);
}

static void geohash_move_x(GeoHashBits *hash, int8_t d) {
    if (d == 0)
        return;

    uint64_t x = hash->bits & 0xaaaaaaaaaaaaaaaaULL;
    uint64_t y = hash->bits & 0x5555555555555555ULL;

    uint64_t zz = 0x5555555555555555ULL >> (64 - hash->step * 2);

    if (d > 0) {
        x = x + (zz + 1);
    } else {
        x = x | zz;
        x = x - (zz + 1);
    }

    x &= (0xaaaaaaaaaaaaaaaaULL >> (64 - hash->step * 2));
    hash->bits = (x | y);
}

static void geohash_move_y(GeoHashBits *hash, int8_t d) {
    if (d == 0)
        return;

    uint64_t x = hash->bits & 0xaaaaaaaaaaaaaaaaULL;
    uint64_t y = hash->bits & 0x5555555555555555ULL;

    uint64_t zz = 0xaaaaaaaaaaaaaaaaULL >> (64 - hash->step * 2);
    if (d > 0) {
        y = y + (zz + 1);
    } else {
        y = y | zz;
        y = y - (zz + 1);
    }
    y &= (0x5555555555555555ULL >> (64 - hash->step * 2));
    hash->bits = (x | y);
}

void geohashNeighbors(const GeoHashBits *hash, GeoHashNeighbors *neighbors) {
    neighbors->east = *hash;
    neighbors->west = *hash;
    neighbors->north = *hash;
    neighbors->south = *hash;
    neighbors->south_east = *hash;
    neighbors->south_west = *hash;
    neighbors->north_east = *hash;
    neighbors->north_west = *hash;

    geohash_move_x(&neighbors->east, 1);
    geohash_move_y(&neighbors->east, 0);

    geohash_move_x(&neighbors->west, -1);
    geohash_move_y(&neighbors->west, 0);

    geohash_move_x(&neighbors->south, 0);
    geohash_move_y(&neighbors->south, -1);

    geohash_move_x(&neighbors->north, 0);
    geohash_move_y(&neighbors->north, 1);

    geohash_move_x(&neighbors->north_west, -1);
    geohash_move_y(&neighbors->north_west, 1);

    geohash_move_x(&neighbors->north_east, 1);
    geohash_move_y(&neighbors->north_east, 1);

    geohash_move_x(&neighbors->south_east, 1);
    geohash_move_y(&neighbors->south_east, -1);

    geohash_move_x(&neighbors->south_west, -1);
    geohash_move_y(&neighbors->south_west, -1);
}


================================================
FILE: src/redis/geohash.h
================================================
/*
 * Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
 * Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
 * Copyright (c) 2015, Salvatore Sanfilippo <antirez@gmail.com>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Redis nor the names of its contributors may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef GEOHASH_H_
#define GEOHASH_H_

#include <stddef.h>
#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

#define HASHISZERO(r) (!(r).bits && !(r).step)
#define RANGEISZERO(r) (!(r).max && !(r).min)
#define RANGEPISZERO(r) (r == NULL || RANGEISZERO(*r))

#define GEO_STEP_MAX 26 /* 26*2 = 52 bits. */

/* Limits from EPSG:900913 / EPSG:3785 / OSGEO:41001 */
#define GEO_LAT_MIN -85.05112878
#define GEO_LAT_MAX 85.05112878
#define GEO_LONG_MIN -180
#define GEO_LONG_MAX 180

typedef enum {
    GEOHASH_NORTH = 0,
    GEOHASH_EAST,
    GEOHASH_WEST,
    GEOHASH_SOUTH,
    GEOHASH_SOUTH_WEST,
    GEOHASH_SOUTH_EAST,
    GEOHASH_NORT_WEST,
    GEOHASH_NORT_EAST
} GeoDirection;

typedef struct {
    uint64_t bits;
    uint8_t step;
} GeoHashBits;

typedef struct {
    double min;
    double max;
} GeoHashRange;

typedef struct {
    GeoHashBits hash;
    GeoHashRange longitude;
    GeoHashRange latitude;
} GeoHashArea;

typedef struct {
    GeoHashBits north;
    GeoHashBits east;
    GeoHashBits west;
    GeoHashBits south;
    GeoHashBits north_east;
    GeoHashBits south_east;
    GeoHashBits north_west;
    GeoHashBits south_west;
} GeoHashNeighbors;

#define CIRCULAR_TYPE 1
#define RECTANGLE_TYPE 2
typedef struct {
    int type; /* search type */
    double xy[2]; /* search center point, xy[0]: lon, xy[1]: lat */
    double conversion; /* km: 1000 */
    double bounds[4]; /* bounds[0]: min_lon, bounds[1]: min_lat
                       * bounds[2]: max_lon, bounds[3]: max_lat */
    union {
        /* CIRCULAR_TYPE */
        double radius;
        /* RECTANGLE_TYPE */
        struct {
            double height;
            double width;
        } r;
    } t;
} GeoShape;

/*
 * 0:success
 * -1:failed
 */
void geohashGetCoordRange(GeoHashRange *long_range, GeoHashRange *lat_range);
int geohashEncode(const GeoHashRange *long_range, const GeoHashRange *lat_range,
                  double longitude, double latitude, uint8_t step,
                  GeoHashBits *hash);
int geohashEncodeType(double longitude, double latitude,
                      uint8_t step, GeoHashBits *hash);
int geohashEncodeWGS84(double longitude, double latitude, uint8_t step,
                       GeoHashBits *hash);
int geohashDecode(const GeoHashRange long_range, const GeoHashRange lat_range,
                  const GeoHashBits hash, GeoHashArea *area);
int geohashDecodeType(const GeoHashBits hash, GeoHashArea *area);
int geohashDecodeWGS84(const GeoHashBits hash, GeoHashArea *area);
int geohashDecodeAreaToLongLat(const GeoHashArea *area, double *xy);
int geohashDecodeToLongLatType(const GeoHashBits hash, double *xy);
int geohashDecodeToLongLatWGS84(const GeoHashBits hash, double *xy);
void geohashNeighbors(const GeoHashBits *hash, GeoHashNeighbors *neighbors);

#if defined(__cplusplus)
}
#endif
#endif /* GEOHASH_H_ */


================================================
FILE: src/redis/geohash_helper.c
================================================
/*
 * Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
 * Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
 * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Redis nor the names of its contributors may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This is a C++ to C conversion from the ardb project.
 * This file started out as:
 * https://github.com/yinqiwen/ardb/blob/d42503/src/geo/geohash_helper.cpp
 */

#define __USE_XOPEN

#include "geohash_helper.h"
#include <math.h>

#define D_R (M_PI / 180.0)
#define R_MAJOR 6378137.0
#define R_MINOR 6356752.3142
#define RATIO (R_MINOR / R_MAJOR)
#define ECCENT (sqrt(1.0 - (RATIO *RATIO)))
#define COM (0.5 * ECCENT)

/// @brief The usual PI/180 constant
const double DEG_TO_RAD = 0.017453292519943295769236907684886;
/// @brief Earth's quatratic mean radius for WGS-84
const double EARTH_RADIUS_IN_METERS = 6372797.560856;

const double MERCATOR_MAX = 20037726.37;
const double MERCATOR_MIN = -20037726.37;

static inline double deg_rad(double ang) { return ang * D_R; }
static inline double rad_deg(double ang) { return ang / D_R; }

/* This function is used in order to estimate the step (bits precision)
 * of the 9 search area boxes during radius queries. */
uint8_t geohashEstimateStepsByRadius(double range_meters, double lat) {
    if (range_meters == 0) return 26;
    int step = 1;
    while (range_meters < MERCATOR_MAX) {
        range_meters *= 2;
        step++;
    }
    step -= 2; /* Make sure range is included in most of the base cases. */

    /* Wider range towards the poles... Note: it is possible to do better
     * than this approximation by computing the distance between meridians
     * at this latitude, but this does the trick for now. */
    if (lat > 66 || lat < -66) {
        step--;
        if (lat > 80 || lat < -80) step--;
    }

    /* Frame to valid range. */
    if (step < 1) step = 1;
    if (step > 26) step = 26;
    return step;
}

/* Return the bounding box of the search area by shape (see geohash.h GeoShape)
 * bounds[0] - bounds[2] is the minimum and maximum longitude
 * while bounds[1] - bounds[3] is the minimum and maximum latitude.
 * since the higher the latitude, the shorter the arc length, the box shape is as follows
 * (left and right edges are actually bent), as shown in the following diagram:
 *
 *    \-----------------/          --------               \-----------------/
 *     \               /         /          \              \               /
 *      \  (long,lat) /         / (long,lat) \              \  (long,lat) /
 *       \           /         /              \             /             \
 *         ---------          /----------------\           /---------------\
 *  Northern Hemisphere       Southern Hemisphere         Around the equator
 */
int geohashBoundingBox(GeoShape *shape, double *bounds) {
    if (!bounds) return 0;
    double longitude = shape->xy[0];
    double latitude = shape->xy[1];
    double height = shape->conversion * (shape->type == CIRCULAR_TYPE ? shape->t.radius : shape->t.r.height/2);
    double width = shape->conversion * (shape->type == CIRCULAR_TYPE ? shape->t.radius : shape->t.r.width/2);

    const double lat_delta = rad_deg(height/EARTH_RADIUS_IN_METERS);
    const double long_delta_top = rad_deg(width/EARTH_RADIUS_IN_METERS/cos(deg_rad(latitude+lat_delta)));
    const double long_delta_bottom = rad_deg(width/EARTH_RADIUS_IN_METERS/cos(deg_rad(latitude-lat_delta)));
    /* The directions of the northern and southern hemispheres
     * are opposite, so we choice different points as min/max long/lat */
    int southern_hemisphere = latitude < 0 ? 1 : 0;
    bounds[0] = southern_hemisphere ? longitude-long_delta_bottom : longitude-long_delta_top;
    bounds[2] = southern_hemisphere ? longitude+long_delta_bottom : longitude+long_delta_top;
    bounds[1] = latitude - lat_delta;
    bounds[3] = latitude + lat_delta;
    return 1;
}

/* Calculate a set of areas (center + 8) that are able to cover a range query
 * for the specified position and shape (see geohash.h GeoShape).
 * the bounding box saved in shaple.bounds */
GeoHashRadius geohashCalculateAreasByShapeWGS84(GeoShape *shape) {
    GeoHashRange long_range, lat_range;
    GeoHashRadius radius;
    GeoHashBits hash;
    GeoHashNeighbors neighbors;
    GeoHashArea area;
    double min_lon, max_lon, min_lat, max_lat;
    int steps;

    geohashBoundingBox(shape, shape->bounds);
    min_lon = shape->bounds[0];
    min_lat = shape->bounds[1];
    max_lon = shape->bounds[2];
    max_lat = shape->bounds[3];

    double longitude = shape->xy[0];
    double latitude = shape->xy[1];
    /* radius_meters is calculated differently in different search types:
     * 1) CIRCULAR_TYPE, just use radius.
     * 2) RECTANGLE_TYPE, we use sqrt((width/2)^2 + (height/2)^2) to
     * calculate the distance from the center point to the corner */
    double radius_meters = shape->type == CIRCULAR_TYPE ? shape->t.radius :
            sqrt((shape->t.r.width/2)*(shape->t.r.width/2) + (shape->t.r.height/2)*(shape->t.r.height/2));
    radius_meters *= shape->conversion;

    steps = geohashEstimateStepsByRadius(radius_meters,latitude);

    geohashGetCoordRange(&long_range,&lat_range);
    geohashEncode(&long_range,&lat_range,longitude,latitude,steps,&hash);
    geohashNeighbors(&hash,&neighbors);
    geohashDecode(long_range,lat_range,hash,&area);

    /* Check if the step is enough at the limits of the covered area.
     * Sometimes when the search area is near an edge of the
     * area, the estimated step is not small enough, since one of the
     * north / south / west / east square is too near to the search area
     * to cover everything. */
    int decrease_step = 0;
    {
        GeoHashArea north, south, east, west;

        geohashDecode(long_range, lat_range, neighbors.north, &north);
        geohashDecode(long_range, lat_range, neighbors.south, &south);
        geohashDecode(long_range, lat_range, neighbors.east, &east);
        geohashDecode(long_range, lat_range, neighbors.west, &west);

        if (north.latitude.max < max_lat) 
            decrease_step = 1;
        if (south.latitude.min > min_lat) 
            decrease_step = 1;
        if (east.longitude.max < max_lon) 
            decrease_step = 1;
        if (west.longitude.min > min_lon)  
            decrease_step = 1;
    }

    if (steps > 1 && decrease_step) {
        steps--;
        geohashEncode(&long_range,&lat_range,longitude,latitude,steps,&hash);
        geohashNeighbors(&hash,&neighbors);
        geohashDecode(long_range,lat_range,hash,&area);
    }

    /* Exclude the search areas that are useless. */
    if (steps >= 2) {
        if (area.latitude.min < min_lat) {
            GZERO(neighbors.south);
            GZERO(neighbors.south_west);
            GZERO(neighbors.south_east);
        }
        if (area.latitude.max > max_lat) {
            GZERO(neighbors.north);
            GZERO(neighbors.north_east);
            GZERO(neighbors.north_west);
        }
        if (area.longitude.min < min_lon) {
            GZERO(neighbors.west);
            GZERO(neighbors.south_west);
            GZERO(neighbors.north_west);
        }
        if (area.longitude.max > max_lon) {
            GZERO(neighbors.east);
            GZERO(neighbors.south_east);
            GZERO(neighbors.north_east);
        }
    }
    radius.hash = hash;
    radius.neighbors = neighbors;
    radius.area = area;
    return radius;
}

GeoHashFix52Bits geohashAlign52Bits(const GeoHashBits hash) {
    uint64_t bits = hash.bits;
    bits <<= (52 - hash.step * 2);
    return bits;
}

/* Calculate distance using simplified haversine great circle distance formula.
 * Given longitude diff is 0 the asin(sqrt(a)) on the haversine is asin(sin(abs(u))).
 * arcsin(sin(x)) equal to x when x ∈[−𝜋/2,𝜋/2]. Given latitude is between [−𝜋/2,𝜋/2]
 * we can simplify arcsin(sin(x)) to x.
 */
double geohashGetLatDistance(double lat1d, double lat2d) {
    return EARTH_RADIUS_IN_METERS * fabs(deg_rad(lat2d) - deg_rad(lat1d));
}

/* Calculate distance using haversine great circle distance formula. */
double geohashGetDistance(double lon1d, double lat1d, double lon2d, double lat2d) {
    double lat1r, lon1r, lat2r, lon2r, u, v, a;
    lon1r = deg_rad(lon1d);
    lon2r = deg_rad(lon2d);
    v = sin((lon2r - lon1r) / 2);
    /* if v == 0 we can avoid doing expensive math when lons are practically the same */
    if (v == 0.0)
        return geohashGetLatDistance(lat1d, lat2d);
    lat1r = deg_rad(lat1d);
    lat2r = deg_rad(lat2d);
    u = sin((lat2r - lat1r) / 2);
    a = u * u + cos(lat1r) * cos(lat2r) * v * v;
    return 2.0 * EARTH_RADIUS_IN_METERS * asin(sqrt(a));
}

int geohashGetDistanceIfInRadius(double x1, double y1,
                                 double x2, double y2, double radius,
                                 double *distance) {
    *distance = geohashGetDistance(x1, y1, x2, y2);
    if (*distance > radius) return 0;
    return 1;
}

int geohashGetDistanceIfInRadiusWGS84(double x1, double y1, double x2,
                                      double y2, double radius,
                                      double *distance) {
    return geohashGetDistanceIfInRadius(x1, y1, x2, y2, radius, distance);
}

/* Judge whether a point is in the axis-aligned rectangle, when the distance
 * between a searched point and the center point is less than or equal to
 * height/2 or width/2 in height and width, the point is in the rectangle.
 *
 * width_m, height_m: the rectangle
 * x1, y1 : the center of the box
 * x2, y2 : the point to be searched
 */
int geohashGetDistanceIfInRectangle(double width_m, double height_m, double x1, double y1,
                                    double x2, double y2, double *distance) {
    /* latitude distance is less expensive to compute than longitude distance
     * so we check first for the latitude condition */
    double lat_distance = geohashGetLatDistance(y2, y1);
    if (lat_distance > height_m/2) {
        return 0;
    }
    double lon_distance = geohashGetDistance(x2, y2, x1, y2);
    if (lon_distance > width_m/2) {
        return 0;
    }
    *distance = geohashGetDistance(x1, y1, x2, y2);
    return 1;
}


================================================
FILE: src/redis/geohash_helper.h
================================================
/*
 * Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
 * Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
 * Copyright (c) 2015, Salvatore Sanfilippo <antirez@gmail.com>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Redis nor the names of its contributors may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef GEOHASH_HELPER_HPP_
#define GEOHASH_HELPER_HPP_

#include "geohash.h"

#define GZERO(s) s.bits = s.step = 0;
#define GISZERO(s) (!s.bits && !s.step)
#define GISNOTZERO(s) (s.bits || s.step)

typedef uint64_t GeoHashFix52Bits;
typedef uint64_t GeoHashVarBits;

typedef struct {
    GeoHashBits hash;
    GeoHashArea area;
    GeoHashNeighbors neighbors;
} GeoHashRadius;

uint8_t geohashEstimateStepsByRadius(double range_meters, double lat);
int geohashBoundingBox(GeoShape *shape, double *bounds);
GeoHashRadius geohashCalculateAreasByShapeWGS84(GeoShape *shape);
GeoHashFix52Bits geohashAlign52Bits(const GeoHashBits hash);
double geohashGetDistance(double lon1d, double lat1d,
                          double lon2d, double lat2d);
int geohashGetDistanceIfInRadius(double x1, double y1,
                                 double x2, double y2, double radius,
                                 double *distance);
int geohashGetDistanceIfInRadiusWGS84(double x1, double y1, double x2,
                                      double y2, double radius,
                                      double *distance);
int geohashGetDistanceIfInRectangle(double width_m, double height_m, double x1, double y1,
                                    double x2, double y2, double *distance);

#endif /* GEOHASH_HELPER_HPP_ */


================================================
FILE: src/redis/hiredis.c
================================================
/*
 * Copyright (c) 2009-2011, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2010-2014, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2015, Matt Stancliff <matt at genges dot com>,
 *                     Jan-Erik Rediger <janerik at fnordig dot com>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <ctype.h>

#include "hiredis.h"
#include "sds.h"


static redisReply *createReplyObject(int type);
static void *createStringObject(const redisReadTask *task, char *str, size_t len);
static void *createArrayObject(const redisReadTask *task, size_t elements);
static void *createIntegerObject(const redisReadTask *task, long long value);
static void *createDoubleObject(const redisReadTask *task, double value, char *str, size_t len);
static void *createNilObject(const redisReadTask *task);
static void *createBoolObject(const redisReadTask *task, int bval);

/* Default set of functions to build the reply. Keep in mind that such a
 * function returning NULL is interpreted as OOM. */
static redisReplyObjectFunctions defaultFunctions = {
    createStringObject,
    createArrayObject,
    createIntegerObject,
    createDoubleObject,
    createNilObject,
    createBoolObject,
    freeReplyObject
};

/* Create a reply object */
static redisReply *createReplyObject(int type) {
    redisReply *r = s_calloc(sizeof(*r));

    if (r == NULL)
        return NULL;

    r->type = type;
    return r;
}

/* Free a reply object */
void freeReplyObject(void *reply) {
    redisReply *r = reply;
    size_t j;

    if (r == NULL)
        return;

    switch(r->type) {
    case REDIS_REPLY_INTEGER:
    case REDIS_REPLY_NIL:
    case REDIS_REPLY_BOOL:
        break; /* Nothing to free */
    case REDIS_REPLY_ARRAY:
    case REDIS_REPLY_MAP:
    case REDIS_REPLY_ATTR:
    case REDIS_REPLY_SET:
    case REDIS_REPLY_PUSH:
        if (r->element != NULL) {
            for (j = 0; j < r->elements; j++)
                freeReplyObject(r->element[j]);
            s_free(r->element);
        }
        break;
    case REDIS_REPLY_ERROR:
    case REDIS_REPLY_STATUS:
    case REDIS_REPLY_STRING:
    case REDIS_REPLY_DOUBLE:
    case REDIS_REPLY_VERB:
    case REDIS_REPLY_BIGNUM:
        s_free(r->str);
        break;
    }
    s_free(r);
}

static void *createStringObject(const redisReadTask *task, char *str, size_t len) {
    redisReply *r, *parent;
    char *buf;

    r = createReplyObject(task->type);
    if (r == NULL)
        return NULL;

    assert(task->type == REDIS_REPLY_ERROR  ||
           task->type == REDIS_REPLY_STATUS ||
           task->type == REDIS_REPLY_STRING ||
           task->type == REDIS_REPLY_VERB   ||
           task->type == REDIS_REPLY_BIGNUM);

    /* Copy string value */
    if (task->type == REDIS_REPLY_VERB) {
        buf = s_malloc(len-4+1); /* Skip 4 bytes of verbatim type header. */
        if (buf == NULL) goto oom;

        memcpy(r->vtype,str,3);
        r->vtype[3] = '\0';
        memcpy(buf,str+4,len-4);
        buf[len-4] = '\0';
        r->len = len - 4;
    } else {
        buf = s_malloc(len+1);
        if (buf == NULL) goto oom;

        memcpy(buf,str,len);
        buf[len] = '\0';
        r->len = len;
    }
    r->str = buf;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;

oom:
    freeReplyObject(r);
    return NULL;
}

static void *createArrayObject(const redisReadTask *task, size_t elements) {
    redisReply *r, *parent;

    r = createReplyObject(task->type);
    if (r == NULL)
        return NULL;

    if (elements > 0) {
        r->element = s_calloc(elements * sizeof(redisReply*));
        if (r->element == NULL) {
            freeReplyObject(r);
            return NULL;
        }
    }

    r->elements = elements;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;
}

static void *createIntegerObject(const redisReadTask *task, long long value) {
    redisReply *r, *parent;

    r = createReplyObject(REDIS_REPLY_INTEGER);
    if (r == NULL)
        return NULL;

    r->integer = value;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;
}

static void *createDoubleObject(const redisReadTask *task, double value, char *str, size_t len) {
    redisReply *r, *parent;

    if (len == SIZE_MAX) // Prevents s_malloc(0) if len equals to SIZE_MAX
        return NULL;

    r = createReplyObject(REDIS_REPLY_DOUBLE);
    if (r == NULL)
        return NULL;

    r->dval = value;
    r->str = s_malloc(len+1);
    if (r->str == NULL) {
        freeReplyObject(r);
        return NULL;
    }

    /* The double reply also has the original protocol string representing a
     * double as a null terminated string. This way the caller does not need
     * to format back for string conversion, especially since Redis does efforts
     * to make the string more human readable avoiding the calssical double
     * decimal string conversion artifacts. */
    memcpy(r->str, str, len);
    r->str[len] = '\0';
    r->len = len;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;
}

static void *createNilObject(const redisReadTask *task) {
    int type = task->type;
    int is_aggregate = (type == REDIS_REPLY_ARRAY || type == REDIS_REPLY_MAP ||
                        type == REDIS_REPLY_SET || type == REDIS_REPLY_PUSH);

    /* For aggregate nils (*-1, etc.) preserve the original aggregate type
     * with SIZE_MAX elements as a sentinel, so callers can distinguish
     * null arrays from null bulk strings. */
    if (is_aggregate) {
        void *obj = createArrayObject(task, 0);
        if (obj == NULL)
            return NULL;
        ((redisReply*)obj)->elements = SIZE_MAX;
        return obj;
    }

    redisReply *r, *parent;

    r = createReplyObject(REDIS_REPLY_NIL);
    if (r == NULL)
        return NULL;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;
}

static void *createBoolObject(const redisReadTask *task, int bval) {
    redisReply *r, *parent;

    r = createReplyObject(REDIS_REPLY_BOOL);
    if (r == NULL)
        return NULL;

    r->integer = bval != 0;

    if (task->parent) {
        parent = task->parent->obj;
        assert(parent->type == REDIS_REPLY_ARRAY ||
               parent->type == REDIS_REPLY_MAP ||
               parent->type == REDIS_REPLY_ATTR ||
               parent->type == REDIS_REPLY_SET ||
               parent->type == REDIS_REPLY_PUSH);
        parent->element[task->idx] = r;
    }
    return r;
}

/* Return the number of digits of 'v' when converted to string in radix 10.
 * Implementation borrowed from link in redis/src/util.c:string2ll(). */
static uint32_t countDigits(uint64_t v) {
  uint32_t result = 1;
  for (;;) {
    if (v < 10) return result;
    if (v < 100) return result + 1;
    if (v < 1000) return result + 2;
    if (v < 10000) return result + 3;
    v /= 10000U;
    result += 4;
  }
}

/* Helper that calculates the bulk length given a certain string length. */
static size_t bulklen(size_t len) {
    return 1+countDigits(len)+2+len+2;
}

int redisvFormatCommand(char **target, const char *format, va_list ap) {
    const char *c = format;
    char *cmd = NULL; /* final command */
    int pos; /* position in final command */
    sds curarg, newarg; /* current argument */
    int touched = 0; /* was the current argument touched? */
    char **curargv = NULL, **newargv = NULL;
    int argc = 0;
    int totlen = 0;
    int error_type = 0; /* 0 = no error; -1 = memory error; -2 = format error */
    int j;

    /* Abort if there is not target to set */
    if (target == NULL)
        return -1;

    /* Build the command string accordingly to protocol */
    curarg = sdsempty();
    if (curarg == NULL)
        return -1;

    while(*c != '\0') {
        if (*c != '%' || c[1] == '\0') {
            if (*c == ' ') {
                if (touched) {
                    newargv = s_realloc(curargv,sizeof(char*)*(argc+1));
                    if (newargv == NULL) goto memory_err;
                    curargv = newargv;
                    curargv[argc++] = curarg;
                    totlen += bulklen(sdslen(curarg));

                    /* curarg is put in argv so it can be overwritten. */
                    curarg = sdsempty();
                    if (curarg == NULL) goto memory_err;
                    touched = 0;
                }
            } else {
                newarg = sdscatlen(curarg,c,1);
                if (newarg == NULL) goto memory_err;
                curarg = newarg;
                touched = 1;
            }
        } else {
            char *arg;
            size_t size;

            /* Set newarg so it can be checked even if it is not touched. */
            newarg = curarg;

            switch(c[1]) {
            case 's':
                arg = va_arg(ap,char*);
                size = strlen(arg);
                if (size > 0)
                    newarg = sdscatlen(curarg,arg,size);
                break;
            case 'b':
                arg = va_arg(ap,char*);
                size = va_arg(ap,size_t);
                if (size > 0)
                    newarg = sdscatlen(curarg,arg,size);
                break;
            case '%':
                newarg = sdscat(curarg,"%");
                break;
            default:
                /* Try to detect printf format */
                {
                    static const char intfmts[] = "diouxX";
                    static const char flags[] = "#0-+ ";
                    char _format[16];
                    const char *_p = c+1;
                    size_t _l = 0;
                    va_list _cpy;

                    /* Flags */
                    while (*_p != '\0' && strchr(flags,*_p) != NULL) _p++;

                    /* Field width */
                    while (*_p != '\0' && isdigit((int) *_p)) _p++;

                    /* Precision */
                    if (*_p == '.') {
                        _p++;
                        while (*_p != '\0' && isdigit((int) *_p)) _p++;
                    }

                    /* Copy va_list before consuming with va_arg */
                    va_copy(_cpy,ap);

                    /* Make sure we have more characters otherwise strchr() accepts
                     * '\0' as an integer specifier. This is checked after above
                     * va_copy() to avoid UB in fmt_invalid's call to va_end(). */
                    if (*_p == '\0') goto fmt_invalid;

                    /* Integer conversion (without modifiers) */
                    if (strchr(intfmts,*_p) != NULL) {
                        va_arg(ap,int);
                        goto fmt_valid;
                    }

                    /* Double conversion (without modifiers) */
                    if (strchr("eEfFgGaA",*_p) != NULL) {
                        va_arg(ap,double);
                        goto fmt_valid;
                    }

                    /* Size: char */
                    if (_p[0] == 'h' && _p[1] == 'h') {
                        _p += 2;
                        if (*_p != '\0' && strchr(intfmts,*_p) != NULL) {
                            va_arg(ap,int); /* char gets promoted to int */
                            goto fmt_valid;
                        }
                        goto fmt_invalid;
                    }

                    /* Size: short */
                    if (_p[0] == 'h') {
                        _p += 1;
                        if (*_p != '\0' && strchr(intfmts,*_p) != NULL) {
                            va_arg(ap,int); /* short gets promoted to int */
                            goto fmt_valid;
                        }
                        goto fmt_invalid;
                    }

                    /* Size: long long */
                    if (_p[0] == 'l' && _p[1] == 'l') {
                        _p += 2;
                        if (*_p != '\0' && strchr(intfmts,*_p) != NULL) {
                            va_arg(ap,long long);
                            goto fmt_valid;
                        }
                        goto fmt_invalid;
                    }

                    /* Size: long */
                    if (_p[0] == 'l') {
                        _p += 1;
                        if (*_p != '\0' && strchr(intfmts,*_p) != NULL) {
                            va_arg(ap,long);
                            goto fmt_valid;
                        }
                        goto fmt_invalid;
                    }

                fmt_invalid:
                    va_end(_cpy);
                    goto format_err;

                fmt_valid:
                    _l = (_p+1)-c;
                    if (_l < sizeof(_format)-2) {
                        memcpy(_format,c,_l);
                        _format[_l] = '\0';
                        newarg = sdscatvprintf(curarg,_format,_cpy);

                        /* Update current position (note: outer blocks
                         * increment c twice so compensate here) */
                        c = _p-1;
                    }

                    va_end(_cpy);
                    break;
                }
            }

            if (newarg == NULL) goto memory_err;
            curarg = newarg;

            touched = 1;
            c++;
            if (*c == '\0')
                break;
        }
        c++;
    }

    /* Add the last argument if needed */
    if (touched) {
        newargv = s_realloc(curargv,sizeof(char*)*(argc+1));
        if (newargv == NULL) goto memory_err;
        curargv = newargv;
        curargv[argc++] = curarg;
        totlen += bulklen(sdslen(curarg));
    } else {
        sdsfree(curarg);
    }

    /* Clear curarg because it was put in curargv or was free'd. */
    curarg = NULL;

    /* Add bytes needed to hold multi bulk count */
    totlen += 1+countDigits(argc)+2;

    /* Build the command at protocol level */
    cmd = s_malloc(totlen+1);
    if (cmd == NULL) goto memory_err;

    pos = sprintf(cmd,"*%d\r\n",argc);
    for (j = 0; j < argc; j++) {
        pos += sprintf(cmd+pos,"$%zu\r\n",sdslen(curargv[j]));
        memcpy(cmd+pos,curargv[j],sdslen(curargv[j]));
        pos += sdslen(curargv[j]);
        sdsfree(curargv[j]);
        cmd[pos++] = '\r';
        cmd[pos++] = '\n';
    }
    assert(pos == totlen);
    cmd[pos] = '\0';

    s_free(curargv);
    *target = cmd;
    return totlen;

format_err:
    error_type = -2;
    goto cleanup;

memory_err:
    error_type = -1;
    goto cleanup;

cleanup:
    if (curargv) {
        while(argc--)
            sdsfree(curargv[argc]);
        s_free(curargv);
    }

    sdsfree(curarg);
    s_free(cmd);

    return error_type;
}

/* Format a command according to the Redis protocol. This function
 * takes a format similar to printf:
 *
 * %s represents a C null terminated string you want to interpolate
 * %b represents a binary safe string
 *
 * When using %b you need to provide both the pointer to the string
 * and the length in bytes as a size_t. Examples:
 *
 * len = redisFormatCommand(target, "GET %s", mykey);
 * len = redisFormatCommand(target, "SET %s %b", mykey, myval, myvallen);
 */
int redisFormatCommand(char **target, const char *format, ...) {
    va_list ap;
    int len;
    va_start(ap,format);
    len = redisvFormatCommand(target,format,ap);
    va_end(ap);

    /* The API says "-1" means bad result, but we now also return "-2" in some
     * cases.  Force the return value to always be -1. */
    if (len < 0)
        len = -1;

    return len;
}

/* Format a command according to the Redis protocol using an sds string and
 * sdscatfmt for the processing of arguments. This function takes the
 * number of arguments, an array with arguments and an array with their
 * lengths. If the latter is set to NULL, strlen will be used to compute the
 * argument lengths.
 */
long long redisFormatSdsCommandArgv(sds *target, int argc, const char **argv,
                                    const size_t *argvlen)
{
    sds cmd, aux;
    unsigned long long totlen, len;
    int j;

    /* Abort on a NULL target */
    if (target == NULL)
        return -1;

    /* Calculate our total size */
    totlen = 1+countDigits(argc)+2;
    for (j = 0; j < argc; j++) {
        len = argvlen ? argvlen[j] : strlen(argv[j]);
        totlen += bulklen(len);
    }

    /* Use an SDS string for command construction */
    cmd = sdsempty();
    if (cmd == NULL)
        return -1;

    /* We already know how much storage we need */
    aux = sdsMakeRoomFor(cmd, totlen);
    if (aux == NULL) {
        sdsfree(cmd);
        return -1;
    }

    cmd = aux;

    /* Construct command */
    cmd = sdscatfmt(cmd, "*%i\r\n", argc);
    for (j=0; j < argc; j++) {
        len = argvlen ? argvlen[j] : strlen(argv[j]);
        cmd = sdscatfmt(cmd, "$%U\r\n", len);
        cmd = sdscatlen(cmd, argv[j], len);
        cmd = sdscatlen(cmd, "\r\n", sizeof("\r\n")-1);
    }

    assert(sdslen(cmd)==totlen);

    *target = cmd;
    return totlen;
}

void redisFreeSdsCommand(sds cmd) {
    sdsfree(cmd);
}

/* Format a command according to the Redis protocol. This function takes the
 * number of arguments, an array with arguments and an array with their
 * lengths. If the latter is set to NULL, strlen will be used to compute the
 * argument lengths.
 */
long long redisFormatCommandArgv(char **target, int argc, const char **argv, const size_t *argvlen) {
    char *cmd = NULL; /* final command */
    size_t pos; /* position in final command */
    size_t len, totlen;
    int j;

    /* Abort on a NULL target */
    if (target == NULL)
        return -1;

    /* Calculate number of bytes needed for the command */
    totlen = 1+countDigits(argc)+2;
    for (j = 0; j < argc; j++) {
        len = argvlen ? argvlen[j] : strlen(argv[j]);
        totlen += bulklen(len);
    }

    /* Build the command at protocol level */
    cmd = s_malloc(totlen+1);
    if (cmd == NULL)
        return -1;

    pos = sprintf(cmd,"*%d\r\n",argc);
    for (j = 0; j < argc; j++) {
        len = argvlen ? argvlen[j] : strlen(argv[j]);
        pos += sprintf(cmd+pos,"$%zu\r\n",len);
        memcpy(cmd+pos,argv[j],len);
        pos += len;
        cmd[pos++] = '\r';
        cmd[pos++] = '\n';
    }
    assert(pos == totlen);
    cmd[pos] = '\0';

    *target = cmd;
    return totlen;
}

void redisFreeCommand(char *cmd) {
    s_free(cmd);
}

redisReader *redisReaderCreate(void) {
    return redisReaderCreateWithFunctions(&defaultFunctions);
}


================================================
FILE: src/redis/hiredis.h
================================================
/*
 * Copyright (c) 2009-2011, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2010-2014, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2015, Matt Stancliff <matt at genges dot com>,
 *                     Jan-Erik Rediger <janerik at fnordig dot com>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __HIREDIS_H
#define __HIREDIS_H
#include "read.h"
#include <stdarg.h> /* for va_list */
#ifndef _MSC_VER
#include <sys/time.h> /* for struct timeval */
#else
struct timeval; /* forward declaration */
typedef long long ssize_t;
#endif
#include <stdint.h> /* uintXX_t, etc */
#include "sds.h" /* for sds */
#include "sdsalloc.h" /* for allocation wrappers */

#define HIREDIS_MAJOR 1
#define HIREDIS_MINOR 3
#define HIREDIS_PATCH 0
#define HIREDIS_SONAME 1.3.0

/* Connection type can be blocking or non-blocking and is set in the
 * least significant bit of the flags field in redisContext. */
#define REDIS_BLOCK 0x1

/* Connection may be disconnected before being free'd. The second bit
 * in the flags field is set when the context is connected. */
#define REDIS_CONNECTED 0x2

/* The async API might try to disconnect cleanly and flush the output
 * buffer and read all subsequent replies before disconnecting.
 * This flag means no new commands can come in and the connection
 * should be terminated once all replies have been read. */
#define REDIS_DISCONNECTING 0x4

/* Flag specific to the async API which means that the context should be clean
 * up as soon as possible. */
#define REDIS_FREEING 0x8

/* Flag that is set when an async callback is executed. */
#define REDIS_IN_CALLBACK 0x10

/* Flag that is set when the async context has one or more subscriptions. */
#define REDIS_SUBSCRIBED 0x20

/* Flag that is set when monitor mode is active */
#define REDIS_MONITORING 0x40

/* Flag that is set when we should set SO_REUSEADDR before calling bind() */
#define REDIS_REUSEADDR 0x80

/* Flag that is set when the async connection supports push replies. */
#define REDIS_SUPPORTS_PUSH 0x100

/**
 * Flag that indicates the user does not want the context to
 * be automatically freed upon error
 */
#define REDIS_NO_AUTO_FREE 0x200

/* Flag that indicates the user does not want replies to be automatically freed */
#define REDIS_NO_AUTO_FREE_REPLIES 0x400

/* Flags to prefer IPv6 or IPv4 when doing DNS lookup. (If both are set,
 * AF_UNSPEC is used.) */
#define REDIS_PREFER_IPV4 0x800
#define REDIS_PREFER_IPV6 0x1000

#define REDIS_KEEPALIVE_INTERVAL 15 /* seconds */

/* number of times we retry to connect in the case of EADDRNOTAVAIL and
 * SO_REUSEADDR is being used. */
#define REDIS_CONNECT_RETRIES  10

/* Forward declarations for structs defined elsewhere */
struct redisAsyncContext;
struct redisContext;

/* RESP3 push helpers and callback prototypes */
#define redisIsPushReply(r) (((redisReply*)(r))->type == REDIS_REPLY_PUSH)
typedef void (redisPushFn)(void *, void *);
typedef void (redisAsyncPushFn)(struct redisAsyncContext *, void *);

#ifdef __cplusplus
extern "C" {
#endif

/* This is the reply object returned by redisCommand() */
typedef struct redisReply {
    int type; /* REDIS_REPLY_* */
    long long integer; /* The integer when type is REDIS_REPLY_INTEGER */
    double dval; /* The double when type is REDIS_REPLY_DOUBLE */
    size_t len; /* Length of string */
    char *str; /* Used for REDIS_REPLY_ERROR, REDIS_REPLY_STRING
                  REDIS_REPLY_VERB, REDIS_REPLY_DOUBLE (in additional to dval),
                  and REDIS_REPLY_BIGNUM. */
    char vtype[4]; /* Used for REDIS_REPLY_VERB, contains the null
                      terminated 3 character content type, such as "txt". */
    size_t elements; /* number of elements, for REDIS_REPLY_ARRAY */
    struct redisReply **element; /* elements vector for REDIS_REPLY_ARRAY */
} redisReply;

redisReader *redisReaderCreate(void);

/* Function to free the reply objects hiredis returns by default. */
void freeReplyObject(void *reply);

/* Functions to format a command according to the protocol. */
int redisvFormatCommand(char **target, const char *format, va_list ap);
int redisFormatCommand(char **target, const char *format, ...);
long long redisFormatCommandArgv(char **target, int argc, const char **argv, const size_t *argvlen);
long long redisFormatSdsCommandArgv(sds *target, int argc, const char ** argv, const size_t *argvlen);
void redisFreeCommand(char *cmd);
void redisFreeSdsCommand(sds cmd);


#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/redis/hyperloglog.c
================================================
/* hyperloglog.c - Redis HyperLogLog probabilistic cardinality approximation.
 * This file implements the algorithm and the exported Redis commands.
 *
 * Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "redis/hyperloglog.h"

#include <math.h>
#include <string.h>

#include "redis/redis_aux.h"
#include "redis/util.h"

#define min(a, b) ((a) < (b) ? (a) : (b))

/* The Redis HyperLogLog implementation is based on the following ideas:
 *
 * * The use of a 64 bit hash function as proposed in [1], in order to estimate
 *   cardinalities larger than 10^9, at the cost of just 1 additional bit per
 *   register.
 * * The use of 16384 6-bit registers for a great level of accuracy, using
 *   a total of 12k per key.
 * * The use of the Redis string data type. No new type is introduced.
 * * No attempt is made to compress the data structure as in [1]. Also the
 *   algorithm used is the original HyperLogLog Algorithm as in [2], with
 *   the only difference that a 64 bit hash function is used, so no correction
 *   is performed for values near 2^32 as in [1].
 *
 * [1] Heule, Nunkesser, Hall: HyperLogLog in Practice: Algorithmic
 *     Engineering of a State of The Art Cardinality Estimation Algorithm.
 *
 * [2] P. Flajolet, Éric Fusy, O. Gandouet, and F. Meunier. Hyperloglog: The
 *     analysis of a near-optimal cardinality estimation algorithm.
 *
 * Redis uses two representations:
 *
 * 1) A "dense" representation where every entry is represented by
 *    a 6-bit integer.
 * 2) A "sparse" representation using run length compression suitable
 *    for representing HyperLogLogs with many registers set to 0 in
 *    a memory efficient way.
 *
 *
 * HLL header
 * ===
 *
 * Both the dense and sparse representation have a 16 byte header as follows:
 *
 * +------+---+-----+----------+
 * | HYLL | E | N/U | Cardin.  |
 * +------+---+-----+----------+
 *
 * The first 4 bytes are a magic string set to the bytes "HYLL".
 * "E" is one byte encoding, currently set to HLL_DENSE or
 * HLL_SPARSE. N/U are three not used bytes.
 *
 * The "Cardin." field is a 64 bit integer stored in little endian format
 * with the latest cardinality computed that can be reused if the data
 * structure was not modified since the last computation (this is useful
 * because there are high probabilities that HLLADD operations don't
 * modify the actual data structure and hence the approximated cardinality).
 *
 * When the most significant bit in the most significant byte of the cached
 * cardinality is set, it means that the data structure was modified and
 * we can't reuse the cached value that must be recomputed.
 *
 * Dense representation
 * ===
 *
 * The dense representation used by Redis is the following:
 *
 * +--------+--------+--------+------//      //--+
 * |11000000|22221111|33333322|55444444 ....     |
 * +--------+--------+--------+------//      //--+
 *
 * The 6 bits counters are encoded one after the other starting from the
 * LSB to the MSB, and using the next bytes as needed.
 *
 * Sparse representation
 * ===
 *
 * The sparse representation encodes registers using a run length
 * encoding composed of three opcodes, two using one byte, and one using
 * of two bytes. The opcodes are called ZERO, XZERO and VAL.
 *
 * ZERO opcode is represented as 00xxxxxx. The 6-bit integer represented
 * by the six bits 'xxxxxx', plus 1, means that there are N registers set
 * to 0. This opcode can represent from 1 to 64 contiguous registers set
 * to the value of 0.
 *
 * XZERO opcode is represented by two bytes 01xxxxxx yyyyyyyy. The 14-bit
 * integer represented by the bits 'xxxxxx' as most significant bits and
 * 'yyyyyyyy' as least significant bits, plus 1, means that there are N
 * registers set to 0. This opcode can represent from 0 to 16384 contiguous
 * registers set to the value of 0.
 *
 * VAL opcode is represented as 1vvvvvxx. It contains a 5-bit integer
 * representing the value of a register, and a 2-bit integer representing
 * the number of contiguous registers set to that value 'vvvvv'.
 * To obtain the value and run length, the integers vvvvv and xx must be
 * incremented by one. This opcode can represent values from 1 to 32,
 * repeated from 1 to 4 times.
 *
 * The sparse representation can't represent registers with a value greater
 * than 32, however it is very unlikely that we find such a register in an
 * HLL with a cardinality where the sparse representation is still more
 * memory efficient than the dense representation. When this happens the
 * HLL is converted to the dense representation.
 *
 * The sparse representation is purely positional. For example a sparse
 * representation of an empty HLL is just: XZERO:16384.
 *
 * An HLL having only 3 non-zero registers at position 1000, 1020, 1021
 * respectively set to 2, 3, 3, is represented by the following three
 * opcodes:
 *
 * XZERO:1000 (Registers 0-999 are set to 0)
 * VAL:2,1    (1 register set to value 2, that is register 1000)
 * ZERO:19    (Registers 1001-1019 set to 0)
 * VAL:3,2    (2 registers set to value 3, that is registers 1020,1021)
 * XZERO:15362 (Registers 1022-16383 set to 0)
 *
 * In the example the sparse representation used just 7 bytes instead
 * of 12k in order to represent the HLL registers. In general for low
 * cardinality there is a big win in terms of space efficiency, traded
 * with CPU time since the sparse representation is slower to access.
 *
 * The following table shows average cardinality vs bytes used, 100
 * samples per cardinality (when the set was not representable because
 * of registers with too big value, the dense representation size was used
 * as a sample).
 *
 * 100 267
 * 200 485
 * 300 678
 * 400 859
 * 500 1033
 * 600 1205
 * 700 1375
 * 800 1544
 * 900 1713
 * 1000 1882
 * 2000 3480
 * 3000 4879
 * 4000 6089
 * 5000 7138
 * 6000 8042
 * 7000 8823
 * 8000 9500
 * 9000 10088
 * 10000 10591
 *
 * The dense representation uses 12288 bytes, so there is a big win up to
 * a cardinality of ~2000-3000. For bigger cardinalities the constant times
 * involved in updating the sparse representation is not justified by the
 * memory savings. The exact maximum length of the sparse representation
 * when this implementation switches to the dense representation is
 * configured via the define HLL_SPARSE_MAX_BYTES.
 */
#define HLL_SPARSE_MAX_BYTES 3000

struct hllhdr {
  char magic[4];       /* "HYLL" */
  uint8_t encoding;    /* HLL_DENSE or HLL_SPARSE. */
  uint8_t notused[3];  /* Reserved for future use, must be zero. */
  uint8_t card[8];     /* Cached cardinality, little endian. */
  uint8_t registers[]; /* Data bytes. */
};

/* The cached cardinality MSB is used to signal validity of the cached value. */
#define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1 << 7)
#define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1 << 7)) == 0)

#define HLL_P 14 /* The greater is P, the smaller the error. */
#define HLL_Q                                                                           \
  (64 - HLL_P)                         /* The number of bits of the hash value used for \
                                          determining the number of leading zeros. */
#define HLL_REGISTERS (1 << HLL_P)     /* With P=14, 16384 registers. */
#define HLL_P_MASK (HLL_REGISTERS - 1) /* Mask to index register. */
#define HLL_BITS 6                     /* Enough to count up to 63 leading zeroes. */
#define HLL_REGISTER_MAX ((1 << HLL_BITS) - 1)
#define HLL_HDR_SIZE sizeof(struct hllhdr)
#define HLL_DENSE_SIZE (HLL_HDR_SIZE + ((HLL_REGISTERS * HLL_BITS + 7) / 8))
#define HLL_DENSE 0  /* Dense encoding. */
#define HLL_SPARSE 1 /* Sparse encoding. */
#define HLL_RAW 255  /* Only used internally, never exposed. */
#define HLL_MAX_ENCODING 1

/* =========================== Low level bit macros ========================= */

/* Macros to access the dense representation.
 *
 * We need to get and set 6 bit counters in an array of 8 bit bytes.
 * We use macros to make sure the code is inlined since speed is critical
 * especially in order to compute the approximated cardinality in
 * HLLCOUNT where we need to access all the registers at once.
 * For the same reason we also want to avoid conditionals in this code path.
 *
 * +--------+--------+--------+------//
 * |11000000|22221111|33333322|55444444
 * +--------+--------+--------+------//
 *
 * Note: in the above representation the most significant bit (MSB)
 * of every byte is on the left. We start using bits from the LSB to MSB,
 * and so forth passing to the next byte.
 *
 * Example, we want to access to counter at pos = 1 ("111111" in the
 * illustration above).
 *
 * The index of the first byte b0 containing our data is:
 *
 *  b0 = 6 * pos / 8 = 0
 *
 *   +--------+
 *   |11000000|  <- Our byte at b0
 *   +--------+
 *
 * The position of the first bit (counting from the LSB = 0) in the byte
 * is given by:
 *
 *  fb = 6 * pos % 8 -> 6
 *
 * Right shift b0 of 'fb' bits.
 *
 *   +--------+
 *   |11000000|  <- Initial value of b0
 *   |00000011|  <- After right shift of 6 pos.
 *   +--------+
 *
 * Left shift b1 of bits 8-fb bits (2 bits)
 *
 *   +--------+
 *   |22221111|  <- Initial value of b1
 *   |22111100|  <- After left shift of 2 bits.
 *   +--------+
 *
 * OR the two bits, and finally AND with 111111 (63 in decimal) to
 * clean the higher order bits we are not interested in:
 *
 *   +--------+
 *   |00000011|  <- b0 right shifted
 *   |22111100|  <- b1 left shifted
 *   |22111111|  <- b0 OR b1
 *   |  111111|  <- (b0 OR b1) AND 63, our value.
 *   +--------+
 *
 * We can try with a different example, like pos = 0. In this case
 * the 6-bit counter is actually contained in a single byte.
 *
 *  b0 = 6 * pos / 8 = 0
 *
 *   +--------+
 *   |11000000|  <- Our byte at b0
 *   +--------+
 *
 *  fb = 6 * pos % 8 = 0
 *
 *  So we right shift of 0 bits (no shift in practice) and
 *  left shift the next byte of 8 bits, even if we don't use it,
 *  but this has the effect of clearing the bits so the result
 *  will not be affected after the OR.
 *
 * -------------------------------------------------------------------------
 *
 * Setting the register is a bit more complex, let's assume that 'val'
 * is the value we want to set, already in the right range.
 *
 * We need two steps, in one we need to clear the bits, and in the other
 * we need to bitwise-OR the new bits.
 *
 * Let's try with 'pos' = 1, so our first byte at 'b' is 0,
 *
 * "fb" is 6 in this case.
 *
 *   +--------+
 *   |11000000|  <- Our byte at b0
 *   +--------+
 *
 * To create an AND-mask to clear the bits about this position, we just
 * initialize the mask with the value 63, left shift it of "fs" bits,
 * and finally invert the result.
 *
 *   +--------+
 *   |00111111|  <- "mask" starts at 63
 *   |11000000|  <- "mask" after left shift of "ls" bits.
 *   |00111111|  <- "mask" after invert.
 *   +--------+
 *
 * Now we can bitwise-AND the byte at "b" with the mask, and bitwise-OR
 * it with "val" left-shifted of "ls" bits to set the new bits.
 *
 * Now let's focus on the next byte b1:
 *
 *   +--------+
 *   |22221111|  <- Initial value of b1
 *   +--------+
 *
 * To build the AND mask we start again with the 63 value, right shift
 * it by 8-fb bits, and invert it.
 *
 *   +--------+
 *   |00111111|  <- "mask" set at 2&6-1
 *   |00001111|  <- "mask" after the right shift by 8-fb = 2 bits
 *   |11110000|  <- "mask" after bitwise not.
 *   +--------+
 *
 * Now we can mask it with b+1 to clear the old bits, and bitwise-OR
 * with "val" left-shifted by "rs" bits to set the new value.
 */

/* Note: if we access the last counter, we will also access the b+1 byte
 * that is out of the array, but sds strings always have an implicit null
 * term, so the byte exists, and we can skip the conditional (or the need
 * to allocate 1 byte more explicitly). */

/* Store the value of the register at position 'regnum' into variable 'target'.
 * 'p' is an array of unsigned bytes. */
#define HLL_DENSE_GET_REGISTER(target, p, regnum)             \
  do {                                                        \
    uint8_t* _p = (uint8_t*)p;                                \
    unsigned long _byte = regnum * HLL_BITS / 8;              \
    unsigned long _fb = regnum * HLL_BITS & 7;                \
    unsigned long _fb8 = 8 - _fb;                             \
    unsigned long b0 = _p[_byte];                             \
    unsigned long b1 = _p[_byte + 1];                         \
    target = ((b0 >> _fb) | (b1 << _fb8)) & HLL_REGISTER_MAX; \
  } while (0)

/* Set the value of the register at position 'regnum' to 'val'.
 * 'p' is an array of unsigned bytes. */
#define HLL_DENSE_SET_REGISTER(p, regnum, val)    \
  do {                                            \
    uint8_t* _p = (uint8_t*)p;                    \
    unsigned long _byte = (regnum)*HLL_BITS / 8;  \
    unsigned long _fb = (regnum)*HLL_BITS & 7;    \
    unsigned long _fb8 = 8 - _fb;                 \
    unsigned long _v = (val);                     \
    _p[_byte] &= ~(HLL_REGISTER_MAX << _fb);      \
    _p[_byte] |= _v << _fb;                       \
    _p[_byte + 1] &= ~(HLL_REGISTER_MAX >> _fb8); \
    _p[_byte + 1] |= _v >> _fb8;                  \
  } while (0)

/* Macros to access the sparse representation.
 * The macros parameter is expected to be an uint8_t pointer. */
#define HLL_SPARSE_XZERO_BIT 0x40                    /* 01xxxxxx */
#define HLL_SPARSE_VAL_BIT 0x80                      /* 1vvvvvxx */
#define HLL_SPARSE_IS_ZERO(p) (((*(p)) & 0xc0) == 0) /* 00xxxxxx */
#define HLL_SPARSE_IS_XZERO(p) (((*(p)) & 0xc0) == HLL_SPARSE_XZERO_BIT)
#define HLL_SPARSE_IS_VAL(p) ((*(p)) & HLL_SPARSE_VAL_BIT)
#define HLL_SPARSE_ZERO_LEN(p) (((*(p)) & 0x3f) + 1)
#define HLL_SPARSE_XZERO_LEN(p) (((((*(p)) & 0x3f) << 8) | (*((p) + 1))) + 1)
#define HLL_SPARSE_VAL_VALUE(p) ((((*(p)) >> 2) & 0x1f) + 1)
#define HLL_SPARSE_VAL_LEN(p) (((*(p)) & 0x3) + 1)
#define HLL_SPARSE_VAL_MAX_VALUE 32
#define HLL_SPARSE_VAL_MAX_LEN 4
#define HLL_SPARSE_ZERO_MAX_LEN 64
#define HLL_SPARSE_XZERO_MAX_LEN 16384
#define HLL_SPARSE_VAL_SET(p, val, len)                       \
  do {                                                        \
    *(p) = (((val)-1) << 2 | ((len)-1)) | HLL_SPARSE_VAL_BIT; \
  } while (0)
#define HLL_SPARSE_ZERO_SET(p, len) \
  do {                              \
    *(p) = (len)-1;                 \
  } while (0)
#define HLL_SPARSE_XZERO_SET(p, len)         \
  do {                                       \
    int _l = (len)-1;                        \
    *(p) = (_l >> 8) | HLL_SPARSE_XZERO_BIT; \
    *((p) + 1) = (_l & 0xff);                \
  } while (0)
#define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */

/* ========================= HyperLogLog algorithm  ========================= */

/* Our hash function is MurmurHash2, 64 bit version.
 * It was modified for Redis in order to provide the same result in
 * big and little endian archs (endian neutral). */
uint64_t MurmurHash64A(const void* key, int len, unsigned int seed) {
  const uint64_t m = 0xc6a4a7935bd1e995;
  const int r = 47;
  uint64_t h = seed ^ (len * m);
  const uint8_t* data = (const uint8_t*)key;
  const uint8_t* end = data + (len - (len & 7));

  while (data != end) {
    uint64_t k;

#if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef USE_ALIGNED_ACCESS
    memcpy(&k, data, sizeof(uint64_t));
#else
    k = *((uint64_t*)data);
#endif
#else
    k = (uint64_t)data[0];
    k |= (uint64_t)data[1] << 8;
    k |= (uint64_t)data[2] << 16;
    k |= (uint64_t)data[3] << 24;
    k |= (uint64_t)data[4] << 32;
    k |= (uint64_t)data[5] << 40;
    k |= (uint64_t)data[6] << 48;
    k |= (uint64_t)data[7] << 56;
#endif

    k *= m;
    k ^= k >> r;
    k *= m;
    h ^= k;
    h *= m;
    data += 8;
  }

  switch (len & 7) {
    case 7:
      h ^= (uint64_t)data[6] << 48; /* fall-thru */
    case 6:
      h ^= (uint64_t)data[5] << 40; /* fall-thru */
    case 5:
      h ^= (uint64_t)data[4] << 32; /* fall-thru */
    case 4:
      h ^= (uint64_t)data[3] << 24; /* fall-thru */
    case 3:
      h ^= (uint64_t)data[2] << 16; /* fall-thru */
    case 2:
      h ^= (uint64_t)data[1] << 8; /* fall-thru */
    case 1:
      h ^= (uint64_t)data[0];
      h *= m; /* fall-thru */
  };

  h ^= h >> r;
  h *= m;
  h ^= h >> r;
  return h;
}

/* Given a string element to add to the HyperLogLog, returns the length
 * of the pattern 000..1 of the element hash. As a side effect 'regp' is
 * set to the register index this element hashes to. */
int hllPatLen(unsigned char* ele, size_t elesize, long* regp) {
  uint64_t hash, bit, index;
  int count;

  /* Count the number of zeroes starting from bit HLL_REGISTERS
   * (that is a power of two corresponding to the first bit we don't use
   * as index). The max run can be 64-P+1 = Q+1 bits.
   *
   * Note that the final "1" ending the sequence of zeroes must be
   * included in the count, so if we find "001" the count is 3, and
   * the smallest count possible is no zeroes at all, just a 1 bit
   * at the first position, that is a count of 1.
   *
   * This may sound like inefficient, but actually in the average case
   * there are high probabilities to find a 1 after a few iterations. */
  hash = MurmurHash64A(ele, elesize, 0xadc83b19ULL);
  index = hash & HLL_P_MASK;      /* Register index. */
  hash >>= HLL_P;                 /* Remove bits used to address the register. */
  hash |= ((uint64_t)1 << HLL_Q); /* Make sure the loop terminates
                                     and count will be <= Q+1. */
  bit = 1;
  count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
  while ((hash & bit) == 0) {
    count++;
    bit <<= 1;
  }
  *regp = (int)index;
  return count;
}

/* ================== Dense representation implementation  ================== */

/* Low level function to set the dense HLL register at 'index' to the
 * specified value if the current value is smaller than 'count'.
 *
 * 'registers' is expected to have room for HLL_REGISTERS plus an
 * additional byte on the right. This requirement is met by sds strings
 * automatically since they are implicitly null terminated.
 *
 * The function always succeed, however if as a result of the operation
 * the approximated cardinality changed, 1 is returned. Otherwise 0
 * is returned. */
int hllDenseSet(uint8_t* registers, long index, uint8_t count) {
  uint8_t oldcount;

  HLL_DENSE_GET_REGISTER(oldcount, registers, index);
  if (count > oldcount) {
    HLL_DENSE_SET_REGISTER(registers, index, count);
    return 1;
  } else {
    return 0;
  }
}

/* "Add" the element in the dense hyperloglog data structure.
 * Actually nothing is added, but the max 0 pattern counter of the subset
 * the element belongs to is incremented if needed.
 *
 * This is just a wrapper to hllDenseSet(), performing the hashing of the
 * element in order to retrieve the index and zero-run count. */
int hllDenseAdd(uint8_t* registers, unsigned char* ele, size_t elesize) {
  long index;
  uint8_t count = hllPatLen(ele, elesize, &index);
  /* Update the register if this element produced a longer run of zeroes. */
  return hllDenseSet(registers, index, count);
}

/* Compute the register histogram in the dense representation. */
void hllDenseRegHisto(uint8_t* registers, int* reghisto) {
  int j;

  /* Redis default is to use 16384 registers 6 bits each. The code works
   * with other values by modifying the defines, but for our target value
   * we take a faster path with unrolled loops. */
  if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
    uint8_t* r = registers;
    unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
    for (j = 0; j < 1024; j++) {
      /* Handle 16 registers per iteration. */
      r0 = r[0] & 63;
      r1 = (r[0] >> 6 | r[1] << 2) & 63;
      r2 = (r[1] >> 4 | r[2] << 4) & 63;
      r3 = (r[2] >> 2) & 63;
      r4 = r[3] & 63;
      r5 = (r[3] >> 6 | r[4] << 2) & 63;
      r6 = (r[4] >> 4 | r[5] << 4) & 63;
      r7 = (r[5] >> 2) & 63;
      r8 = r[6] & 63;
      r9 = (r[6] >> 6 | r[7] << 2) & 63;
      r10 = (r[7] >> 4 | r[8] << 4) & 63;
      r11 = (r[8] >> 2) & 63;
      r12 = r[9] & 63;
      r13 = (r[9] >> 6 | r[10] << 2) & 63;
      r14 = (r[10] >> 4 | r[11] << 4) & 63;
      r15 = (r[11] >> 2) & 63;

      reghisto[r0]++;
      reghisto[r1]++;
      reghisto[r2]++;
      reghisto[r3]++;
      reghisto[r4]++;
      reghisto[r5]++;
      reghisto[r6]++;
      reghisto[r7]++;
      reghisto[r8]++;
      reghisto[r9]++;
      reghisto[r10]++;
      reghisto[r11]++;
      reghisto[r12]++;
      reghisto[r13]++;
      reghisto[r14]++;
      reghisto[r15]++;

      r += 12;
    }
  } else {
    for (j = 0; j < HLL_REGISTERS; j++) {
      unsigned long reg;
      HLL_DENSE_GET_REGISTER(reg, registers, j);
      reghisto[reg]++;
    }
  }
}

/* ================== Sparse representation implementation  ================= */


/* Convert the HLL with sparse representation given as input in its dense
 * representation. Both representations are represented by SDS strings, and
 * the input representation is freed as a side effect.
 *
 * The function returns C_OK if the sparse representation was valid,
 * otherwise C_ERR is returned if the representation was corrupted. */
int hllSparseToDense(sds* hll_ptr) {
    sds sparse = *hll_ptr, dense;
    struct hllhdr *hdr, *oldhdr = (struct hllhdr*)sparse;
    int idx = 0, runlen, regval;
    uint8_t *p = (uint8_t*)sparse, *end = p+sdslen(sparse);

    /* If the representation is already the right one return ASAP. */
    hdr = (struct hllhdr*) sparse;
    if (hdr->encoding == HLL_DENSE) return C_OK;

    /* Create a string of the right size filled with zero bytes.
     * Note that the cached cardinality is set to 0 as a side effect
     * that is exactly the cardinality of an empty HLL. */
    dense = sdsnewlen(NULL,HLL_DENSE_SIZE);
    hdr = (struct hllhdr*) dense;
    *hdr = *oldhdr; /* This will copy the magic and cached cardinality. */
    hdr->encoding = HLL_DENSE;

    /* Now read the sparse representation and set non-zero registers
     * accordingly. */
    p += HLL_HDR_SIZE;
    while(p < end) {
        if (HLL_SPARSE_IS_ZERO(p)) {
            runlen = HLL_SPARSE_ZERO_LEN(p);
            idx += runlen;
            p++;
        } else if (HLL_SPARSE_IS_XZERO(p)) {
            runlen = HLL_SPARSE_XZERO_LEN(p);
            idx += runlen;
            p += 2;
        } else {
            runlen = HLL_SPARSE_VAL_LEN(p);
            regval = HLL_SPARSE_VAL_VALUE(p);
            if ((runlen + idx) > HLL_REGISTERS) break; /* Overflow. */
            while(runlen--) {
                HLL_DENSE_SET_REGISTER(hdr->registers,idx,regval);
                idx++;
            }
            p++;
        }
    }

    /* If the sparse representation was valid, we expect to find idx
     * set to HLL_REGISTERS. */
    if (idx != HLL_REGISTERS) {
        sdsfree(dense);
        return C_ERR;
    }

    /* Free the old representation and set the new one. */
    sdsfree(*hll_ptr);
    *hll_ptr = dense;
    return C_OK;
}

/* Low level function to set the sparse HLL register at 'index' to the
 * specified value if the current value is smaller than 'count'.
 *
 * The object 'hll' is the SDS object holding the HLL. The function requires
 * a reference to the object in order to be able to enlarge the string if
 * needed.
 *
 * On success, the function returns 1 if the cardinality changed, or 0
 * if the register for this element was not updated.
 * On error (if the representation is invalid) -1 is returned.
 *
 * As a side effect the function may promote the HLL representation from
 * sparse to dense: this happens when a register requires to be set to a value
 * not representable with the sparse representation, or when the resulting
 * size would be greater than HLL_SPARSE_MAX_BYTES. */
int hllSparseSet(sds* hll_ptr, long index, uint8_t count, int* promoted) {
    struct hllhdr *hdr;
    uint8_t oldcount, *sparse, *end, *p, *prev, *next;
    long first, span;
    long is_zero = 0, is_xzero = 0, is_val = 0, runlen = 0;

    /* If the count is too big to be representable by the sparse representation
     * switch to dense representation. */
    if (count > HLL_SPARSE_VAL_MAX_VALUE) goto promote;

    /* When updating a sparse representation, sometimes we may need to enlarge the
     * buffer for up to 3 bytes in the worst case (XZERO split into XZERO-VAL-XZERO),
     * and the following code does the enlarge job.
     * Actually, we use a greedy strategy, enlarge more than 3 bytes to avoid the need
     * for future reallocates on incremental growth. But we do not allocate more than
     * 'HLL_SPARSE_MAX_BYTES' bytes for the sparse representation.
     * If the available size of hyperloglog sds string is not enough for the increment
     * we need, we promote the hypreloglog to dense representation in 'step 3'.
     */
    sds hll = *hll_ptr;
    if (sdsalloc(hll) < HLL_SPARSE_MAX_BYTES && sdsavail(hll) < 3) {
        size_t newlen = sdslen(hll) + 3;
        newlen += min(newlen, 300); /* Greediness: double 'newlen' if it is smaller than 300, or add 300 to it when it exceeds 300 */
        if (newlen > HLL_SPARSE_MAX_BYTES)
            newlen = HLL_SPARSE_MAX_BYTES;
        *hll_ptr = sdsResize(hll, newlen);
        hll = *hll_ptr;
    }

    /* Step 1: we need to locate the opcode we need to modify to check
     * if a value update is actually needed. */
    sparse = p = ((uint8_t*)hll) + HLL_HDR_SIZE;
    end = p + sdslen(hll) - HLL_HDR_SIZE;

    first = 0;
    prev = NULL; /* Points to previous opcode at the end of the loop. */
    next = NULL; /* Points to the next opcode at the end of the loop. */
    span = 0;
    while(p < end) {
        long oplen;

        /* Set span to the number of registers covered by this opcode.
         *
         * This is the most performance critical loop of the sparse
         * representation. Sorting the conditionals from the most to the
         * least frequent opcode in many-bytes sparse HLLs is faster. */
        oplen = 1;
        if (HLL_SPARSE_IS_ZERO(p)) {
            span = HLL_SPARSE_ZERO_LEN(p);
        } else if (HLL_SPARSE_IS_VAL(p)) {
            span = HLL_SPARSE_VAL_LEN(p);
        } else { /* XZERO. */
            span = HLL_SPARSE_XZERO_LEN(p);
            oplen = 2;
        }
        /* Break if this opcode covers the register as 'index'. */
        if (index <= first+span-1) break;
        prev = p;
        p += oplen;
        first += span;
    }
    if (span == 0 || p >= end) return -1; /* Invalid format. */

    next = HLL_SPARSE_IS_XZERO(p) ? p+2 : p+1;
    if (next >= end) next = NULL;

    /* Cache current opcode type to avoid using the macro again and
     * again for something that will not change.
     * Also cache the run-length of the opcode. */
    if (HLL_SPARSE_IS_ZERO(p)) {
        is_zero = 1;
        runlen = HLL_SPARSE_ZERO_LEN(p);
    } else if (HLL_SPARSE_IS_XZERO(p)) {
        is_xzero = 1;
        runlen = HLL_SPARSE_XZERO_LEN(p);
    } else {
        is_val = 1;
        runlen = HLL_SPARSE_VAL_LEN(p);
    }

    /* Step 2: After the loop:
     *
     * 'first' stores to the index of the first register covered
     *  by the current opcode, which is pointed by 'p'.
     *
     * 'next' ad 'prev' store respectively the next and previous opcode,
     *  or NULL if the opcode at 'p' is respectively the last or first.
     *
     * 'span' is set to the number of registers covered by the current
     *  opcode.
     *
     * There are different cases in order to update the data structure
     * in place without generating it from scratch:
     *
     * A) If it is a VAL opcode already set to a value >= our 'count'
     *    no update is needed, regardless of the VAL run-length field.
     *    In this case PFADD returns 0 since no changes are performed.
     *
     * B) If it is a VAL opcode with len = 1 (representing only our
     *    register) and the value is less than 'count', we just update it
     *    since this is a trivial case. */
    if (is_val) {
        oldcount = HLL_SPARSE_VAL_VALUE(p);
        /* Case A. */
        if (oldcount >= count) return 0;

        /* Case B. */
        if (runlen == 1) {
            HLL_SPARSE_VAL_SET(p,count,1);
            goto updated;
        }
    }

    /* C) Another trivial to handle case is a ZERO opcode with a len of 1.
     * We can just replace it with a VAL opcode with our value and len of 1. */
    if (is_zero && runlen == 1) {
        HLL_SPARSE_VAL_SET(p,count,1);
        goto updated;
    }

    /* D) General case.
     *
     * The other cases are more complex: our register requires to be updated
     * and is either currently represented by a VAL opcode with len > 1,
     * by a ZERO opcode with len > 1, or by an XZERO opcode.
     *
     * In those cases the original opcode must be split into multiple
     * opcodes. The worst case is an XZERO split in the middle resulting into
     * XZERO - VAL - XZERO, so the resulting sequence max length is
     * 5 bytes.
     *
     * We perform the split writing the new sequence into the 'new' buffer
     * with 'newlen' as length. Later the new sequence is inserted in place
     * of the old one, possibly moving what is on the right a few bytes
     * if the new sequence is longer than the older one. */
    uint8_t seq[5], *n = seq;
    int last = first+span-1; /* Last register covered by the sequence. */
    int len;

    if (is_zero || is_xzero) {
        /* Handle splitting of ZERO / XZERO. */
        if (index != first) {
            len = index-first;
            if (len > HLL_SPARSE_ZERO_MAX_LEN) {
                HLL_SPARSE_XZERO_SET(n,len);
                n += 2;
            } else {
                HLL_SPARSE_ZERO_SET(n,len);
                n++;
            }
        }
        HLL_SPARSE_VAL_SET(n,count,1);
        n++;
        if (index != last) {
            len = last-index;
            if (len > HLL_SPARSE_ZERO_MAX_LEN) {
                HLL_SPARSE_XZERO_SET(n,len);
                n += 2;
            } else {
                HLL_SPARSE_ZERO_SET(n,len);
                n++;
            }
        }
    } else {
        /* Handle splitting of VAL. */
        int curval = HLL_SPARSE_VAL_VALUE(p);

        if (index != first) {
            len = index-first;
            HLL_SPARSE_VAL_SET(n,curval,len);
            n++;
        }
        HLL_SPARSE_VAL_SET(n,count,1);
        n++;
        if (index != last) {
            len = last-index;
            HLL_SPARSE_VAL_SET(n,curval,len);
            n++;
        }
    }

    /* Step 3: substitute the new sequence with the old one.
     *
     * Note that we already allocated space on the sds string
     * calling sdsResize(). */
    int seqlen = n-seq;
    int oldlen = is_xzero ? 2 : 1;
    int deltalen = seqlen-oldlen;

    if (deltalen > 0 &&
        sdslen(hll) + deltalen > HLL_SPARSE_MAX_BYTES) goto promote;
    serverAssert(sdslen(hll) + deltalen <= sdsalloc(hll));
    if (deltalen && next) memmove(next+deltalen,next,end-next);
    sdsIncrLen(hll,deltalen);
    memcpy(p,seq,seqlen);
    end += deltalen;

updated:
    /* Step 4: Merge adjacent values if possible.
     *
     * The representation was updated, however the resulting representation
     * may not be optimal: adjacent VAL opcodes can sometimes be merged into
     * a single one. */
    p = prev ? prev : sparse;
    int scanlen = 5; /* Scan up to 5 upcodes starting from prev. */
    while (p < end && scanlen--) {
        if (HLL_SPARSE_IS_XZERO(p)) {
            p += 2;
            continue;
        } else if (HLL_SPARSE_IS_ZERO(p)) {
            p++;
            continue;
        }
        /* We need two adjacent VAL opcodes to try a merge, having
         * the same value, and a len that fits the VAL opcode max len. */
        if (p+1 < end && HLL_SPARSE_IS_VAL(p+1)) {
            int v1 = HLL_SPARSE_VAL_VALUE(p);
            int v2 = HLL_SPARSE_VAL_VALUE(p+1);
            if (v1 == v2) {
                int len = HLL_SPARSE_VAL_LEN(p)+HLL_SPARSE_VAL_LEN(p+1);
                if (len <= HLL_SPARSE_VAL_MAX_LEN) {
                    HLL_SPARSE_VAL_SET(p+1,v1,len);
                    memmove(p,p+1,end-p);
                    sdsIncrLen(hll,-1);
                    end--;
                    /* After a merge we reiterate without incrementing 'p'
                     * in order to try to merge the just merged value with
                     * a value on its right. */
                    continue;
                }
            }
        }
        p++;
    }

    /* Invalidate the cached cardinality. */
    hdr = (struct hllhdr *)hll;
    HLL_INVALIDATE_CACHE(hdr);
    return 1;

promote: /* Promote to dense representation. */
    if (hllSparseToDense(&hll) == C_ERR) return -1; /* Corrupted HLL. */
    *hll_ptr = hll;
    hdr = (struct hllhdr *)hll;

    /* We need to call hllDenseAdd() to perform the operation after the
     * conversion. However the result must be 1, since if we need to
     * convert from sparse to dense a register requires to be updated.
     *
     * Note that this in turn means that PFADD will make sure the command
     * is propagated to slaves / AOF, so if there is a sparse -> dense
     * conversion, it will be performed in all the slaves as well. */
    int dense_retval = hllDenseSet(hdr->registers,index,count);
    serverAssert(dense_retval == 1);
    *promoted = 1;
    return dense_retval;
}

/* "Add" the element in the sparse hyperloglog data structure.
 * Actually nothing is added, but the max 0 pattern counter of the subset
 * the element belongs to is incremented if needed.
 *
 * This function is actually a wrapper for hllSparseSet(), it only performs
 * the hashing of the element to obtain the index and zeros run length. */
int hllSparseAdd(sds* hll_ptr, unsigned char *ele, size_t elesize, int* promoted) {
    long index;
    uint8_t count = hllPatLen(ele,elesize,&index);
    /* Update the register if this element produced a longer run of zeroes. */
    return hllSparseSet(hll_ptr,index,count, promoted);
}
/* Compute the register histogram in the sparse representation. */
void hllSparseRegHisto(uint8_t* sparse, int sparselen, int* invalid, int* reghisto) {
  int idx = 0, runlen, regval;
  uint8_t *end = sparse + sparselen, *p = sparse;

  while (p < end) {
    if (HLL_SPARSE_IS_ZERO(p)) {
      runlen = HLL_SPARSE_ZERO_LEN(p);
      idx += runlen;
      reghisto[0] += runlen;
      p++;
    } else if (HLL_SPARSE_IS_XZERO(p)) {
      runlen = HLL_SPARSE_XZERO_LEN(p);
      idx += runlen;
      reghisto[0] += runlen;
      p += 2;
    } else {
      runlen = HLL_SPARSE_VAL_LEN(p);
      regval = HLL_SPARSE_VAL_VALUE(p);
      idx += runlen;
      reghisto[regval] += runlen;
      p++;
    }
  }
  if (idx != HLL_REGISTERS && invalid)
    *invalid = 1;
}

/* ========================= HyperLogLog Count ==============================
 * This is the core of the algorithm where the approximated count is computed.
 * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto()
 * functions as helpers to compute histogram of register values part of the
 * computation, which is representation-specific, while all the rest is common. */

/* Implements the register histogram calculation for uint8_t data type
 * which is only used internally as speedup for PFCOUNT with multiple keys. */
void hllRawRegHisto(uint8_t* registers, int* reghisto) {
  uint64_t* word = (uint64_t*)registers;
  uint8_t* bytes;
  int j;

  for (j = 0; j < HLL_REGISTERS / 8; j++) {
    if (*word == 0) {
      reghisto[0] += 8;
    } else {
      bytes = (uint8_t*)word;
      reghisto[bytes[0]]++;
      reghisto[bytes[1]]++;
      reghisto[bytes[2]]++;
      reghisto[bytes[3]]++;
      reghisto[bytes[4]]++;
      reghisto[bytes[5]]++;
      reghisto[bytes[6]]++;
      reghisto[bytes[7]]++;
    }
    word++;
  }
}

/* Helper function sigma as defined in
 * "New cardinality estimation algorithms for HyperLogLog sketches"
 * Otmar Ertl, arXiv:1702.01284 */
double hllSigma(double x) {
  if (x == 1.)
    return INFINITY;
  double zPrime;
  double y = 1;
  double z = x;
  do {
    x *= x;
    zPrime = z;
    z += x * y;
    y += y;
  } while (zPrime != z);
  return z;
}

/* Helper function tau as defined in
 * "New cardinality estimation algorithms for HyperLogLog sketches"
 * Otmar Ertl, arXiv:1702.01284 */
double hllTau(double x) {
  if (x == 0. || x == 1.)
    return 0.;
  double zPrime;
  double y = 1.0;
  double z = 1 - x;
  do {
    x = sqrt(x);
    zPrime = z;
    y *= 0.5;
    z -= pow(1 - x, 2) * y;
  } while (zPrime != z);
  return z / 3;
}

/* Return the approximated cardinality of the set based on the harmonic
 * mean of the registers values. 'hdr' points to the start of the SDS
 * representing the String object holding the HLL representation.
 *
 * If the sparse representation of the HLL object is not valid, the integer
 * pointed by 'invalid' is set to non-zero, otherwise it is left untouched.
 *
 * hllCount() supports a special internal-only encoding of HLL_RAW, that
 * is, hdr->registers will point to an uint8_t array of HLL_REGISTERS element.
 * This is useful in order to speedup PFCOUNT when called against multiple
 * keys (no need to work with 6-bit integers encoding). */
uint64_t hllCount(struct hllhdr* hdr, int* invalid) {
  double m = HLL_REGISTERS;
  double E;
  int j;
  /* Note that reghisto size could be just HLL_Q+2, because HLL_Q+1 is
   * the maximum frequency of the "000...1" sequence the hash function is
   * able to return. However it is slow to check for sanity of the
   * input: instead we history array at a safe size: overflows will
   * just write data to wrong, but correctly allocated, places. */
  int reghisto[64] = {0};

  /* Compute register histogram */
  if (hdr->encoding == HLL_DENSE) {
    hllDenseRegHisto(hdr->registers, reghisto);
  } else if (hdr->encoding == HLL_SPARSE) {
    hllSparseRegHisto(hdr->registers, sdslen((sds)hdr) - HLL_HDR_SIZE, invalid, reghisto);
  } else if (hdr->encoding == HLL_RAW) {
    hllRawRegHisto(hdr->registers, reghisto);
  } else {
    serverPanic("Unknown HyperLogLog encoding in hllCount()");
  }

  /* Estimate cardinality from register histogram. See:
   * "New cardinality estimation algorithms for HyperLogLog sketches"
   * Otmar Ertl, arXiv:1702.01284 */
  double z = m * hllTau((m - reghisto[HLL_Q + 1]) / (double)m);
  for (j = HLL_Q; j >= 1; --j) {
    z += reghisto[j];
    z *= 0.5;
  }
  z += m * hllSigma(reghisto[0] / (double)m);
  E = llroundl(HLL_ALPHA_INF * m * m / z);

  return (uint64_t)E;
}

#if 0
/* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll'
 * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'.
 *
 * The hll object must be already validated via isHLLObjectOrReply()
 * or in some other way.
 *
 * If the HyperLogLog is sparse and is found to be invalid, C_ERR
 * is returned, otherwise the function always succeeds. */
int hllMerge(uint8_t* max, robj* hll) {
  struct hllhdr* hdr = hll->ptr;
  int i;

  if (hdr->encoding == HLL_DENSE) {
    uint8_t val;

    for (i = 0; i < HLL_REGISTERS; i++) {
      HLL_DENSE_GET_REGISTER(val, hdr->registers, i);
      if (val > max[i])
        max[i] = val;
    }
  } else {
    uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr);
    long runlen, regval;

    p += HLL_HDR_SIZE;
    i = 0;
    while (p < end) {
      if (HLL_SPARSE_IS_ZERO(p)) {
        runlen = HLL_SPARSE_ZERO_LEN(p);
        i += runlen;
        p++;
      } else if (HLL_SPARSE_IS_XZERO(p)) {
        runlen = HLL_SPARSE_XZERO_LEN(p);
        i += runlen;
        p += 2;
      } else {
        runlen = HLL_SPARSE_VAL_LEN(p);
        regval = HLL_SPARSE_VAL_VALUE(p);
        if ((runlen + i) > HLL_REGISTERS)
          break; /* Overflow. */
        while (runlen--) {
          if (regval > max[i])
            max[i] = regval;
          i++;
        }
        p++;
      }
    }
    if (i != HLL_REGISTERS)
      return C_ERR;
  }
  return C_OK;
}

/* ========================== HyperLogLog commands ========================== */
robj* createHLLObject(void) {
  robj* o;
  struct hllhdr* hdr;
  sds s;
  uint8_t* p;
  int sparselen =
      HLL_HDR_SIZE +
      (((HLL_REGISTERS + (HLL_SPARSE_XZERO_MAX_LEN - 1)) / HLL_SPARSE_XZERO_MAX_LEN) * 2);
  int aux;

  /* Populate the sparse representation with as many XZERO opcodes as
   * needed to represent all the registers. */
  aux = HLL_REGISTERS;
  s = sdsnewlen(NULL, sparselen);
  p = (uint8_t*)s + HLL_HDR_SIZE;
  while (aux) {
    int xzero = HLL_SPARSE_XZERO_MAX_LEN;
    if (xzero > aux)
      xzero = aux;
    HLL_SPARSE_XZERO_SET(p, xzero);
    p += 2;
    aux -= xzero;
  }
  serverAssert((p - (uint8_t*)s) == sparselen);

  /* Create the actual object. */
  o = createObject(OBJ_STRING, s);
  hdr = o->ptr;
  memcpy(hdr->magic, "HYLL", 4);
  hdr->encoding = HLL_SPARSE;
  return o;
}
#endif

/* ========================== Dragonfly custom functions ===================== */

enum HllValidness isValidHLL(struct HllBufferPtr hll_buffer) {
  struct hllhdr* hdr;

  if (hll_buffer.size < sizeof(*hdr)) {
    return HLL_INVALID;
  }

  hdr = (struct hllhdr*)hll_buffer.hll;

  /* Magic should be "HYLL". */
  if (hdr->magic[0] != 'H' || hdr->magic[1] != 'Y' || hdr->magic[2] != 'L' ||
      hdr->magic[3] != 'L') {
    return HLL_INVALID;
  }

  if (hdr->encoding > HLL_MAX_ENCODING) {
    return HLL_INVALID;
  }

  switch (hdr->encoding) {
    case HLL_DENSE:
      /* Dense representation string length should match exactly. */
      return (hll_buffer.size == HLL_DENSE_SIZE) ? HLL_VALID_DENSE : HLL_INVALID;
    case HLL_SPARSE:
      return HLL_VALID_SPARSE;
    default:
      return HLL_INVALID;
  }
}

size_t getDenseHllSize() {
  return HLL_DENSE_SIZE;
}

size_t getSparseHllInitSize() {
  return HLL_HDR_SIZE + (((HLL_REGISTERS+(HLL_SPARSE_XZERO_MAX_LEN-1)) /
                     HLL_SPARSE_XZERO_MAX_LEN)*2);
}

int initSparseHll(struct HllBufferPtr hll_ptr) {
  if (hll_ptr.size != getSparseHllInitSize()) {
    return C_ERR;
  }

  memset(hll_ptr.hll, 0, hll_ptr.size);

  /* Populate the sparse representation with as many XZERO opcodes as
    * needed to represent all the registers. */
  int aux = HLL_REGISTERS;
  uint8_t* p = (uint8_t*)hll_ptr.hll + HLL_HDR_SIZE;
  while(aux) {
      int xzero = HLL_SPARSE_XZERO_MAX_LEN;
      if (xzero > aux) xzero = aux;
      HLL_SPARSE_XZERO_SET(p,xzero);
      p += 2;
      aux -= xzero;
  }

  struct hllhdr* hdr = (struct hllhdr*)hll_ptr.hll;

  memcpy(hdr->magic, "HYLL", 4);
  hdr->encoding = HLL_SPARSE;
  return C_OK;
}

int createDenseHll(struct HllBufferPtr hll_ptr) {
  if (hll_ptr.size != getDenseHllSize()) {
    return C_ERR;
  }

  memset(hll_ptr.hll, 0, hll_ptr.size);
  struct hllhdr* hdr = (struct hllhdr*)hll_ptr.hll;
  memcpy(hdr->magic, "HYLL", 4);
  hdr->encoding = HLL_DENSE;
  return C_OK;
}

/* This is a copied & modified version of hllSparseToDense() above that does not use robj */
int convertSparseToDenseHll(struct HllBufferPtr in_hll, struct HllBufferPtr out_hll) {
  struct hllhdr *hdr, *oldhdr = (struct hllhdr*)in_hll.hll;
  int idx = 0, runlen, regval;
  uint8_t *p = (uint8_t*)in_hll.hll, *end = p + in_hll.size;

  if (oldhdr->encoding != HLL_SPARSE)
    return C_ERR;
  if (out_hll.size != getDenseHllSize())
    return C_ERR;

  /* Create a string of the right size filled with zero bytes.
   * Note that the cached cardinality is set to 0 as a side effect
   * that is exactly the cardinality of an empty HLL. */
  hdr = (struct hllhdr*)out_hll.hll;
  *hdr = *oldhdr; /* This will copy the magic and cached cardinality. */
  hdr->encoding = HLL_DENSE;

  /* Now read the sparse representation and set non-zero registers
   * accordingly. */
  p += HLL_HDR_SIZE;
  while (p < end) {
    if (HLL_SPARSE_IS_ZERO(p)) {
      runlen = HLL_SPARSE_ZERO_LEN(p);
      idx += runlen;
      p++;
    } else if (HLL_SPARSE_IS_XZERO(p)) {
      runlen = HLL_SPARSE_XZERO_LEN(p);
      idx += runlen;
      p += 2;
    } else {
      runlen = HLL_SPARSE_VAL_LEN(p);
      regval = HLL_SPARSE_VAL_VALUE(p);
      if ((runlen + idx) > HLL_REGISTERS)
        break; /* Overflow. */
      while (runlen--) {
        HLL_DENSE_SET_REGISTER(hdr->registers, idx, regval);
        idx++;
      }
      p++;
    }
  }

  /* If the sparse representation was valid, we expect to find idx
   * set to HLL_REGISTERS. */
  if (idx != HLL_REGISTERS) {
    return C_ERR;
  }

  return C_OK;
}

int pfadd_sparse(sds* hll_ptr, const unsigned char* value,
                 size_t size, int* promoted) {
  struct hllhdr* hdr = (struct hllhdr*)(*hll_ptr);
  int retval = hllSparseAdd(hll_ptr, (unsigned char*)value, size, promoted);
  switch (retval) {
    case 1:
      HLL_INVALIDATE_CACHE(hdr);
      return 1;
    default:
      return retval;
  }
}

int pfadd_dense(struct HllBufferPtr hll_ptr, const unsigned char* value,
                size_t size) {
  if (isValidHLL(hll_ptr) != HLL_VALID_DENSE)
    return C_ERR;

  struct hllhdr* hdr = (struct hllhdr*)hll_ptr.hll;

  /* Perform the low level ADD operation for every element. */
  int retval = hllDenseAdd(hdr->registers, (unsigned char*)value, size);
  switch (retval) {
    case 1:
      HLL_INVALIDATE_CACHE(hdr);
      return 1;
    default:
      return retval;
  }
}

int64_t pfcountSingle(struct HllBufferPtr hll_ptr) {
  uint64_t card;

  if (isValidHLL(hll_ptr) != HLL_VALID_DENSE)
    return C_ERR;

  /* Check if the cached cardinality is valid. */
  struct hllhdr* hdr = (struct hllhdr*)hll_ptr.hll;
  if (HLL_VALID_CACHE(hdr)) {
    /* Just return the cached value. */
    card = (uint64_t)hdr->card[0];
    card |= (uint64_t)hdr->card[1] << 8;
    card |= (uint64_t)hdr->card[2] << 16;
    card |= (uint64_t)hdr->card[3] << 24;
    card |= (uint64_t)hdr->card[4] << 32;
    card |= (uint64_t)hdr->card[5] << 40;
    card |= (uint64_t)hdr->card[6] << 48;
    card |= (uint64_t)hdr->card[7] << 56;
  } else {
    int invalid = 0;
    /* Recompute it and update the cached value. */
    card = hllCount(hdr, &invalid);
    if (invalid) {
      return -1;
    }
    hdr->card[0] = card & 0xff;
    hdr->card[1] = (card >> 8) & 0xff;
    hdr->card[2] = (card >> 16) & 0xff;
    hdr->card[3] = (card >> 24) & 0xff;
    hdr->card[4] = (card >> 32) & 0xff;
    hdr->card[5] = (card >> 40) & 0xff;
    hdr->card[6] = (card >> 48) & 0xff;
    hdr->card[7] = (card >> 56) & 0xff;
  }
  return card;
}

/* Merge dense-encoded HLL */
static void hllMergeDense(uint8_t* registers, struct HllBufferPtr to) {
  uint8_t val;
  struct hllhdr* hll_hdr = (struct hllhdr*)to.hll;

  for (int i = 0; i < HLL_REGISTERS; i++) {
    HLL_DENSE_GET_REGISTER(val, hll_hdr->registers, i);
    if (val > registers[i]) {
      registers[i] = val;
    }
  }
}

int64_t pfcountMulti(struct HllBufferPtr* hlls, size_t hlls_count) {
  struct hllhdr* hdr;
  uint8_t max[HLL_HDR_SIZE + HLL_REGISTERS];

  /* Compute an HLL with M[i] = MAX(M[i]_j). */
  memset(max, 0, sizeof(max));
  hdr = (struct hllhdr*)max;
  hdr->encoding = HLL_RAW; /* Special internal-only encoding. */
  for (size_t j = 0; j < hlls_count; j++) {
    /* Check type and size. */
    struct HllBufferPtr hll = hlls[j];
    if (isValidHLL(hll) != HLL_VALID_DENSE) {
      return C_ERR;
    }

    hllMergeDense(max, hll);
  }

  /* Compute cardinality of the resulting set. */
  return hllCount(hdr, NULL);
}

int pfmerge(struct HllBufferPtr* in_hlls, size_t in_hlls_count, struct HllBufferPtr out_hll) {
  if (isValidHLL(out_hll) != HLL_VALID_DENSE) {
    return C_ERR;
  }

  uint8_t max[HLL_REGISTERS];

  /* Compute an HLL with M[i] = MAX(M[i]_j).
   * We store the maximum into the max array of registers. We'll write
   * it to the target variable later. */
  memset(max, 0, sizeof(max));

  for (size_t j = 0; j < in_hlls_count; j++) {
    struct HllBufferPtr hll = in_hlls[j];
    if (isValidHLL(hll) != HLL_VALID_DENSE) {
      return C_ERR;
    }

    hllMergeDense(max, hll);
  }

  struct hllhdr* hdr = (struct hllhdr*)out_hll.hll;
  for (size_t j = 0; j < HLL_REGISTERS; j++) {
    hllDenseSet(hdr->registers, j, max[j]);
  }
  HLL_INVALIDATE_CACHE(hdr);

  return C_OK;
}


================================================
FILE: src/redis/hyperloglog.h
================================================
#ifndef __REDIS_HYPERLOGLOG_H
#define __REDIS_HYPERLOGLOG_H

#include <stddef.h>
#include <stdint.h>

#include "redis/sds.h"

/* This version of hyperloglog, forked from Redis, only supports using the dense format of HLL.
 * The reason is that it is of a fixed size, which makes it easier to integrate into Dragonfly.
 * We do support converting of existing sprase-encoded HLL into dense-encoded, which can be useful
 * for replication, serialization, etc. */

enum HllValidness {
  HLL_INVALID,
  HLL_VALID_SPARSE,
  HLL_VALID_DENSE,
};

/* Convenience struct for pointing to an Hll buffer along with its size */
struct HllBufferPtr {
  unsigned char* hll;
  size_t size;
};

enum HllValidness isValidHLL(struct HllBufferPtr hll_ptr);

size_t getDenseHllSize();
size_t getSparseHllInitSize();


int initSparseHll(struct HllBufferPtr hll_ptr);
/* Writes into `hll_ptr` an empty dense-encoded HLL.
 * Returns 0 upon success, or a negative number when `hll_ptr.size` is different from
 * getDenseHllSize() */
int createDenseHll(struct HllBufferPtr hll_ptr);

/* Converts an existing sparse-encoded HLL pointed by `in_hll`, and writes the converted result into
 * `out_hll`.
 * Returns 0 upon success, otherwise a negative number.
 * Failures can occur when `out_hll.size` is different from getDenseHllSize() or when input is not a
 * valid sparse-encoded HLL. */
int convertSparseToDenseHll(struct HllBufferPtr in_hll, struct HllBufferPtr out_hll);

/* Adds `value` of size `size`, to `hll_ptr`.
 * If `obj` does not have an underlying type of HLL a negative number is returned. */
int pfadd_sparse(sds* hll_ptr, const unsigned char* value, size_t size, int* promoted);
int pfadd_dense(struct HllBufferPtr hll_ptr, const unsigned char* value, size_t size);

/* Returns the estimated count of elements for `hll_ptr`.
 * If `hll_ptr` is not a valid dense-encoded HLL, a negative number is returned. */
int64_t pfcountSingle(struct HllBufferPtr hll_ptr);

/* Returns the estimated count for all HLLs in `hlls` array of size `hlls_count`.
 * All `hlls` elements must be valid, dense-encoded HLLs. */
int64_t pfcountMulti(struct HllBufferPtr* hlls, size_t hlls_count);

/* Merges array of HLLs pointed to be `in_hlls` of size `in_hlls_count` into `out_hll`.
 * Returns 0 upon success, otherwise a negative number.
 * Failure can occur when any of `in_hlls` or `out_hll` is not a dense-encoded HLL.
 * `out_hll` *can* be one of the elements in `in_hlls`. */
int pfmerge(struct HllBufferPtr* in_hlls, size_t in_hlls_count, struct HllBufferPtr out_hll);

#endif


================================================
FILE: src/redis/intset.c
================================================
/*
 * Copyright (c) 2009-2012, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "intset.h"
#include "zmalloc.h"
#include "endianconv.h"

/* Note that these encodings are ordered, so:
 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))

/* Return the required encoding for the provided value. */
static uint8_t _intsetValueEncoding(int64_t v) {
    if (v < INT32_MIN || v > INT32_MAX)
        return INTSET_ENC_INT64;
    else if (v < INT16_MIN || v > INT16_MAX)
        return INTSET_ENC_INT32;
    else
        return INTSET_ENC_INT16;
}

/* Return the value at pos, given an encoding. */
static int64_t _intsetGetEncoded(intset *is, int pos, uint8_t enc) {
    int64_t v64;
    int32_t v32;
    int16_t v16;

    if (enc == INTSET_ENC_INT64) {
        memcpy(&v64,((int64_t*)is->contents)+pos,sizeof(v64));
        memrev64ifbe(&v64);
        return v64;
    } else if (enc == INTSET_ENC_INT32) {
        memcpy(&v32,((int32_t*)is->contents)+pos,sizeof(v32));
        memrev32ifbe(&v32);
        return v32;
    } else {
        memcpy(&v16,((int16_t*)is->contents)+pos,sizeof(v16));
        memrev16ifbe(&v16);
        return v16;
    }
}

/* Return the value at pos, using the configured encoding. */
static int64_t _intsetGet(intset *is, int pos) {
    return _intsetGetEncoded(is,pos,intrev32ifbe(is->encoding));
}

/* Set the value at pos, using the configured encoding. */
static void _intsetSet(intset *is, int pos, int64_t value) {
    uint32_t encoding = intrev32ifbe(is->encoding);

    if (encoding == INTSET_ENC_INT64) {
        ((int64_t*)is->contents)[pos] = value;
        memrev64ifbe(((int64_t*)is->contents)+pos);
    } else if (encoding == INTSET_ENC_INT32) {
        ((int32_t*)is->contents)[pos] = value;
        memrev32ifbe(((int32_t*)is->contents)+pos);
    } else {
        ((int16_t*)is->contents)[pos] = value;
        memrev16ifbe(((int16_t*)is->contents)+pos);
    }
}

/* Create an empty intset. */
intset *intsetNew(void) {
    intset *is = zmalloc(sizeof(intset));
    is->encoding = intrev32ifbe(INTSET_ENC_INT16);
    is->length = 0;
    return is;
}

/* Resize the intset */
static intset *intsetResize(intset *is, uint32_t len) {
    uint64_t size = (uint64_t)len*intrev32ifbe(is->encoding);
    assert(size <= SIZE_MAX - sizeof(intset));
    is = zrealloc(is,sizeof(intset)+size);
    return is;
}

/* Search for the position of "value". Return 1 when the value was found and
 * sets "pos" to the position of the value within the intset. Return 0 when
 * the value is not present in the intset and sets "pos" to the position
 * where "value" can be inserted. */
static uint8_t intsetSearch(intset *is, int64_t value, uint32_t *pos) {
    int min = 0, max = intrev32ifbe(is->length)-1, mid = -1;
    int64_t cur = -1;

    /* The value can never be found when the set is empty */
    if (intrev32ifbe(is->length) == 0) {
        if (pos) *pos = 0;
        return 0;
    } else {
        /* Check for the case where we know we cannot find the value,
         * but do know the insert position. */
        if (value > _intsetGet(is,max)) {
            if (pos) *pos = intrev32ifbe(is->length);
            return 0;
        } else if (value < _intsetGet(is,0)) {
            if (pos) *pos = 0;
            return 0;
        }
    }

    while(max >= min) {
        mid = ((unsigned int)min + (unsigned int)max) >> 1;
        cur = _intsetGet(is,mid);
        if (value > cur) {
            min = mid+1;
        } else if (value < cur) {
            max = mid-1;
        } else {
            break;
        }
    }

    if (value == cur) {
        if (pos) *pos = mid;
        return 1;
    } else {
        if (pos) *pos = min;
        return 0;
    }
}

/* Upgrades the intset to a larger encoding and inserts the given integer. */
static intset *intsetUpgradeAndAdd(intset *is, int64_t value) {
    uint8_t curenc = intrev32ifbe(is->encoding);
    uint8_t newenc = _intsetValueEncoding(value);
    int length = intrev32ifbe(is->length);
    int prepend = value < 0 ? 1 : 0;

    /* First set new encoding and resize */
    is->encoding = intrev32ifbe(newenc);
    is = intsetResize(is,intrev32ifbe(is->length)+1);

    /* Upgrade back-to-front so we don't overwrite values.
     * Note that the "prepend" variable is used to make sure we have an empty
     * space at either the beginning or the end of the intset. */
    while(length--)
        _intsetSet(is,length+prepend,_intsetGetEncoded(is,length,curenc));

    /* Set the value at the beginning or the end. */
    if (prepend)
        _intsetSet(is,0,value);
    else
        _intsetSet(is,intrev32ifbe(is->length),value);
    is->length = intrev32ifbe(intrev32ifbe(is->length)+1);
    return is;
}

static void intsetMoveTail(intset *is, uint32_t from, uint32_t to) {
    void *src, *dst;
    uint32_t bytes = intrev32ifbe(is->length)-from;
    uint32_t encoding = intrev32ifbe(is->encoding);

    if (encoding == INTSET_ENC_INT64) {
        src = (int64_t*)is->contents+from;
        dst = (int64_t*)is->contents+to;
        bytes *= sizeof(int64_t);
    } else if (encoding == INTSET_ENC_INT32) {
        src = (int32_t*)is->contents+from;
        dst = (int32_t*)is->contents+to;
        bytes *= sizeof(int32_t);
    } else {
        src = (int16_t*)is->contents+from;
        dst = (int16_t*)is->contents+to;
        bytes *= sizeof(int16_t);
    }
    memmove(dst,src,bytes);
}

/* Insert an integer in the intset */
intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
    uint8_t valenc = _intsetValueEncoding(value);
    uint32_t pos;
    if (success) *success = 1;

    /* Upgrade encoding if necessary. If we need to upgrade, we know that
     * this value should be either appended (if > 0) or prepended (if < 0),
     * because it lies outside the range of existing values. */
    if (valenc > intrev32ifbe(is->encoding)) {
        /* This always succeeds, so we don't need to curry *success. */
        return intsetUpgradeAndAdd(is,value);
    } else {
        /* Abort if the value is already present in the set.
         * This call will populate "pos" with the right position to insert
         * the value when it cannot be found. */
        if (intsetSearch(is,value,&pos)) {
            if (success) *success = 0;
            return is;
        }

        is = intsetResize(is,intrev32ifbe(is->length)+1);
        if (pos < intrev32ifbe(is->length)) intsetMoveTail(is,pos,pos+1);
    }

    _intsetSet(is,pos,value);
    is->length = intrev32ifbe(intrev32ifbe(is->length)+1);
    return is;
}

/* Delete integer from intset */
intset *intsetRemove(intset *is, int64_t value, int *success) {
    uint8_t valenc = _intsetValueEncoding(value);
    uint32_t pos;
    if (success) *success = 0;

    if (valenc <= intrev32ifbe(is->encoding) && intsetSearch(is,value,&pos)) {
        uint32_t len = intrev32ifbe(is->length);

        /* We know we can delete */
        if (success) *success = 1;

        /* Overwrite value with tail and update length */
        if (pos < (len-1)) intsetMoveTail(is,pos+1,pos);
        is = intsetResize(is,len-1);
        is->length = intrev32ifbe(len-1);
    }
    return is;
}

intset *intsetTrimTail(intset *is, uint32_t tail_len) {
    uint32_t len = intrev32ifbe(is->length);
    uint32_t new_len = tail_len >= len ? 0 : len - tail_len;
    is->length = intrev32ifbe(new_len);
    return intsetResize(is, new_len);
}

/* Determine whether a value belongs to this set */
uint8_t intsetFind(intset *is, int64_t value) {
    uint8_t valenc = _intsetValueEncoding(value);
    return valenc <= intrev32ifbe(is->encoding) && intsetSearch(is,value,NULL);
}

/* Return random member */
int64_t intsetRandom(intset *is) {
    uint32_t len = intrev32ifbe(is->length);
    assert(len); /* avoid division by zero on corrupt intset payload. */
    return _intsetGet(is,rand()%len);
}

/* Get the value at the given position. When this position is
 * out of range the function returns 0, when in range it returns 1. */
uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) {
    if (pos < intrev32ifbe(is->length)) {
        *value = _intsetGet(is,pos);
        return 1;
    }
    return 0;
}

/* Return intset length */
uint32_t intsetLen(const intset *is) {
    return intrev32ifbe(is->length);
}

/* Return intset blob size in bytes. */
size_t intsetBlobLen(intset *is) {
    return sizeof(intset)+(size_t)intrev32ifbe(is->length)*intrev32ifbe(is->encoding);
}

/* Validate the integrity of the data structure.
 * when `deep` is 0, only the integrity of the header is validated.
 * when `deep` is 1, we make sure there are no duplicate or out of order records. */
int intsetValidateIntegrity(const unsigned char *p, size_t size, int deep) {
    intset *is = (intset *)p;
    /* check that we can actually read the header. */
    if (size < sizeof(*is))
        return 0;

    uint32_t encoding = intrev32ifbe(is->encoding);

    size_t record_size;
    if (encoding == INTSET_ENC_INT64) {
        record_size = INTSET_ENC_INT64;
    } else if (encoding == INTSET_ENC_INT32) {
        record_size = INTSET_ENC_INT32;
    } else if (encoding == INTSET_ENC_INT16){
        record_size = INTSET_ENC_INT16;
    } else {
        return 0;
    }

    /* check that the size matches (all records are inside the buffer). */
    uint32_t count = intrev32ifbe(is->length);
    if (sizeof(*is) + count*record_size != size)
        return 0;

    /* check that the set is not empty. */
    if (count==0)
        return 0;

    if (!deep)
        return 1;

    /* check that there are no dup or out of order records. */
    int64_t prev = _intsetGet(is,0);
    for (uint32_t i=1; i<count; i++) {
        int64_t cur = _intsetGet(is,i);
        if (cur <= prev)
            return 0;
        prev = cur;
    }

    return 1;
}

#ifdef REDIS_TEST
#include <sys/time.h>
#include <time.h>

#if 0
static void intsetRepr(intset *is) {
    for (uint32_t i = 0; i < intrev32ifbe(is->length); i++) {
        printf("%lld\n", (uint64_t)_intsetGet(is,i));
    }
    printf("\n");
}

static void error(char *err) {
    printf("%s\n", err);
    exit(1);
}
#endif

static void ok(void) {
    printf("OK\n");
}

static long long usec(void) {
    struct timeval tv;
    gettimeofday(&tv,NULL);
    return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;
}

static intset *createSet(int bits, int size) {
    uint64_t mask = (1<<bits)-1;
    uint64_t value;
    intset *is = intsetNew();

    for (int i = 0; i < size; i++) {
        if (bits > 32) {
            value = (rand()*rand()) & mask;
        } else {
            value = rand() & mask;
        }
        is = intsetAdd(is,value,NULL);
    }
    return is;
}

static void checkConsistency(intset *is) {
    for (uint32_t i = 0; i < (intrev32ifbe(is->length)-1); i++) {
        uint32_t encoding = intrev32ifbe(is->encoding);

        if (encoding == INTSET_ENC_INT16) {
            int16_t *i16 = (int16_t*)is->contents;
            assert(i16[i] < i16[i+1]);
        } else if (encoding == INTSET_ENC_INT32) {
            int32_t *i32 = (int32_t*)is->contents;
            assert(i32[i] < i32[i+1]);
        } else {
            int64_t *i64 = (int64_t*)is->contents;
            assert(i64[i] < i64[i+1]);
        }
    }
}

#define UNUSED(x) (void)(x)
int intsetTest(int argc, char **argv, int flags) {
    uint8_t success;
    int i;
    intset *is;
    srand(time(NULL));

    UNUSED(argc);
    UNUSED(argv);
    UNUSED(flags);

    printf("Value encodings: "); {
        assert(_intsetValueEncoding(-32768) == INTSET_ENC_INT16);
        assert(_intsetValueEncoding(+32767) == INTSET_ENC_INT16);
        assert(_intsetValueEncoding(-32769) == INTSET_ENC_INT32);
        assert(_intsetValueEncoding(+32768) == INTSET_ENC_INT32);
        assert(_intsetValueEncoding(-2147483648) == INTSET_ENC_INT32);
        assert(_intsetValueEncoding(+2147483647) == INTSET_ENC_INT32);
        assert(_intsetValueEncoding(-2147483649) == INTSET_ENC_INT64);
        assert(_intsetValueEncoding(+2147483648) == INTSET_ENC_INT64);
        assert(_intsetValueEncoding(-9223372036854775808ull) ==
                    INTSET_ENC_INT64);
        assert(_intsetValueEncoding(+9223372036854775807ull) ==
                    INTSET_ENC_INT64);
        ok();
    }

    printf("Basic adding: "); {
        is = intsetNew();
        is = intsetAdd(is,5,&success); assert(success);
        is = intsetAdd(is,6,&success); assert(success);
        is = intsetAdd(is,4,&success); assert(success);
        is = intsetAdd(is,4,&success); assert(!success);
        ok();
        zfree(is);
    }

    printf("Large number of random adds: "); {
        uint32_t inserts = 0;
        is = intsetNew();
        for (i = 0; i < 1024; i++) {
            is = intsetAdd(is,rand()%0x800,&success);
            if (success) inserts++;
        }
        assert(intrev32ifbe(is->length) == inserts);
        checkConsistency(is);
        ok();
        zfree(is);
    }

    printf("Upgrade from int16 to int32: "); {
        is = intsetNew();
        is = intsetAdd(is,32,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT16);
        is = intsetAdd(is,65535,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT32);
        assert(intsetFind(is,32));
        assert(intsetFind(is,65535));
        checkConsistency(is);
        zfree(is);

        is = intsetNew();
        is = intsetAdd(is,32,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT16);
        is = intsetAdd(is,-65535,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT32);
        assert(intsetFind(is,32));
        assert(intsetFind(is,-65535));
        checkConsistency(is);
        ok();
        zfree(is);
    }

    printf("Upgrade from int16 to int64: "); {
        is = intsetNew();
        is = intsetAdd(is,32,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT16);
        is = intsetAdd(is,4294967295,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT64);
        assert(intsetFind(is,32));
        assert(intsetFind(is,4294967295));
        checkConsistency(is);
        zfree(is);

        is = intsetNew();
        is = intsetAdd(is,32,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT16);
        is = intsetAdd(is,-4294967295,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT64);
        assert(intsetFind(is,32));
        assert(intsetFind(is,-4294967295));
        checkConsistency(is);
        ok();
        zfree(is);
    }

    printf("Upgrade from int32 to int64: "); {
        is = intsetNew();
        is = intsetAdd(is,65535,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT32);
        is = intsetAdd(is,4294967295,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT64);
        assert(intsetFind(is,65535));
        assert(intsetFind(is,4294967295));
        checkConsistency(is);
        zfree(is);

        is = intsetNew();
        is = intsetAdd(is,65535,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT32);
        is = intsetAdd(is,-4294967295,NULL);
        assert(intrev32ifbe(is->encoding) == INTSET_ENC_INT64);
        assert(intsetFind(is,65535));
        assert(intsetFind(is,-4294967295));
        checkConsistency(is);
        ok();
        zfree(is);
    }

    printf("Stress lookups: "); {
        long num = 100000, size = 10000;
        int i, bits = 20;
        long long start;
        is = createSet(bits,size);
        checkConsistency(is);

        start = usec();
        for (i = 0; i < num; i++) intsetSearch(is,rand() % ((1<<bits)-1),NULL);
        printf("%ld lookups, %ld element set, %lldusec\n",
               num,size,usec()-start);
        zfree(is);
    }

    printf("Stress add+delete: "); {
        int i, v1, v2;
        is = intsetNew();
        for (i = 0; i < 0xffff; i++) {
            v1 = rand() % 0xfff;
            is = intsetAdd(is,v1,NULL);
            assert(intsetFind(is,v1));

            v2 = rand() % 0xfff;
            is = intsetRemove(is,v2,NULL);
            assert(!intsetFind(is,v2));
        }
        checkConsistency(is);
        ok();
        zfree(is);
    }

    return 0;
}
#endif


================================================
FILE: src/redis/intset.h
================================================
/*
 * Copyright (c) 2009-2012, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __INTSET_H
#define __INTSET_H
#include <stdint.h>

typedef struct intset {
    uint32_t encoding;
    uint32_t length;
    int8_t contents[];
} intset;

intset *intsetNew(void);
intset *intsetAdd(intset *is, int64_t value, uint8_t *success);
intset *intsetRemove(intset *is, int64_t value, int *success);
intset *intsetTrimTail(intset *is, uint32_t trim_len);  // Removes last trim_len elements.
uint8_t intsetFind(intset *is, int64_t value);
int64_t intsetRandom(intset *is);
uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value);
uint32_t intsetLen(const intset *is);
size_t intsetBlobLen(intset *is);

int intsetValidateIntegrity(const unsigned char *is, size_t size, int deep);

#ifdef REDIS_TEST
int intsetTest(int argc, char *argv[], int flags);
#endif

#endif // __INTSET_H


================================================
FILE: src/redis/listpack.c
================================================
/* Listpack -- A lists of strings serialization format
 *
 * This file implements the specification you can find at:
 *
 *  https://github.com/antirez/listpack
 *
 * Copyright (c) 2017,2020, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <limits.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>

#include "config.h"
#include "listpack.h"
#include "util.h"
#include "zmalloc.h"

#define LP_HDR_SIZE 6       /* 32 bit total len + 16 bit number of elements. */
#define LP_HDR_NUMELE_UNKNOWN UINT16_MAX
#define LP_MAX_INT_ENCODING_LEN 9
#define LP_MAX_BACKLEN_SIZE 5
#define LP_ENCODING_INT 0
#define LP_ENCODING_STRING 1

#define LP_ENCODING_7BIT_UINT 0
#define LP_ENCODING_7BIT_UINT_MASK 0x80
#define LP_ENCODING_IS_7BIT_UINT(byte) (((byte)&LP_ENCODING_7BIT_UINT_MASK)==LP_ENCODING_7BIT_UINT)
#define LP_ENCODING_7BIT_UINT_ENTRY_SIZE 2

#define LP_ENCODING_6BIT_STR 0x80
#define LP_ENCODING_6BIT_STR_MASK 0xC0
#define LP_ENCODING_IS_6BIT_STR(byte) (((byte)&LP_ENCODING_6BIT_STR_MASK)==LP_ENCODING_6BIT_STR)

#define LP_ENCODING_13BIT_INT 0xC0
#define LP_ENCODING_13BIT_INT_MASK 0xE0
#define LP_ENCODING_IS_13BIT_INT(byte) (((byte)&LP_ENCODING_13BIT_INT_MASK)==LP_ENCODING_13BIT_INT)
#define LP_ENCODING_13BIT_INT_ENTRY_SIZE 3

#define LP_ENCODING_12BIT_STR 0xE0
#define LP_ENCODING_12BIT_STR_MASK 0xF0
#define LP_ENCODING_IS_12BIT_STR(byte) (((byte)&LP_ENCODING_12BIT_STR_MASK)==LP_ENCODING_12BIT_STR)

#define LP_ENCODING_16BIT_INT 0xF1
#define LP_ENCODING_16BIT_INT_MASK 0xFF
#define LP_ENCODING_IS_16BIT_INT(byte) (((byte)&LP_ENCODING_16BIT_INT_MASK)==LP_ENCODING_16BIT_INT)
#define LP_ENCODING_16BIT_INT_ENTRY_SIZE 4

#define LP_ENCODING_24BIT_INT 0xF2
#define LP_ENCODING_24BIT_INT_MASK 0xFF
#define LP_ENCODING_IS_24BIT_INT(byte) (((byte)&LP_ENCODING_24BIT_INT_MASK)==LP_ENCODING_24BIT_INT)
#define LP_ENCODING_24BIT_INT_ENTRY_SIZE 5

#define LP_ENCODING_32BIT_INT 0xF3
#define LP_ENCODING_32BIT_INT_MASK 0xFF
#define LP_ENCODING_IS_32BIT_INT(byte) (((byte)&LP_ENCODING_32BIT_INT_MASK)==LP_ENCODING_32BIT_INT)
#define LP_ENCODING_32BIT_INT_ENTRY_SIZE 6

#define LP_ENCODING_64BIT_INT 0xF4
#define LP_ENCODING_64BIT_INT_MASK 0xFF
#define LP_ENCODING_IS_64BIT_INT(byte) (((byte)&LP_ENCODING_64BIT_INT_MASK)==LP_ENCODING_64BIT_INT)
#define LP_ENCODING_64BIT_INT_ENTRY_SIZE 10

#define LP_ENCODING_32BIT_STR 0xF0
#define LP_ENCODING_32BIT_STR_MASK 0xFF
#define LP_ENCODING_IS_32BIT_STR(byte) (((byte)&LP_ENCODING_32BIT_STR_MASK)==LP_ENCODING_32BIT_STR)

#define LP_EOF 0xFF

#define LP_ENCODING_6BIT_STR_LEN(p) ((p)[0] & 0x3F)
#define LP_ENCODING_12BIT_STR_LEN(p) ((((p)[0] & 0xF) << 8) | (p)[1])
#define LP_ENCODING_32BIT_STR_LEN(p)                                                                                   \
    (((uint32_t)(p)[1] << 0) | ((uint32_t)(p)[2] << 8) | ((uint32_t)(p)[3] << 16) | ((uint32_t)(p)[4] << 24))

#define lpGetTotalBytes(p)                                                                                             \
    (((uint32_t)(p)[0] << 0) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24))

#define lpGetNumElements(p) (((uint32_t)(p)[4] << 0) | ((uint32_t)(p)[5] << 8))
#define lpSetTotalBytes(p, v)                                                                                          \
    do {                                                                                                               \
    (p)[0] = (v)&0xff; \
    (p)[1] = ((v)>>8)&0xff; \
    (p)[2] = ((v)>>16)&0xff; \
    (p)[3] = ((v)>>24)&0xff; \
} while(0)

/* TODO: delete this function once corruption in the stream code is identified */
static void lpSetTotalBytesChecked(unsigned char *p, uint32_t v) {
    uint32_t current = lpGetTotalBytes(p);
    if (current == 0) {
      fprintf(stderr, "Error: corrupted listpack size.");
      abort();
    } else if (current > 4194304) { /* 4 MiB */
      /* suspicous size, lets check its validity*/  
      size_t block_size = zmalloc_size(p);
      if (block_size < current) {
        fprintf(stderr, "Error: listpack size (%u) is larger than allocated "
                "block size (%lu).", current, block_size);
        abort();
      }
    }

    lpSetTotalBytes(p, v);
}

#define lpSetNumElements(p, v)                                                                                         \
    do {                                                                                                               \
    (p)[4] = (v)&0xff; \
    (p)[5] = ((v)>>8)&0xff; \
} while(0)

/* Validates that 'p' is not outside the listpack.
 * All function that return a pointer to an element in the listpack will assert
 * that this element is valid, so it can be freely used.
 * Generally functions such lpNext and lpDelete assume the input pointer is
 * already validated (since it's the return value of another function). */
#define ASSERT_INTEGRITY(lp, p)                                                                                        \
    do {                                                                                                               \
    assert((p) >= (lp)+LP_HDR_SIZE && (p) < (lp)+lpGetTotalBytes((lp))); \
} while (0)

/* Similar to the above, but validates the entire element length rather than just
 * it's pointer. */
#define ASSERT_INTEGRITY_LEN(lp, p, len)                                                                               \
    do {                                                                                                               \
    assert((p) >= (lp)+LP_HDR_SIZE && (p)+(len) < (lp)+lpGetTotalBytes((lp))); \
} while (0)

static inline void lpAssertValidEntry(unsigned char* lp, size_t lpbytes, unsigned char *p);

/* Don't let listpacks grow over 1GB in any case, don't wanna risk overflow in
 * Total Bytes header field */
#define LISTPACK_MAX_SAFETY_SIZE (1<<30)
int lpSafeToAdd(unsigned char* lp, size_t add) {
    size_t len = lp? lpGetTotalBytes(lp): 0;
    if (len + add > LISTPACK_MAX_SAFETY_SIZE) return 0;
    return 1;
}

/* Convert a string into a signed 64 bit integer.
 * The function returns 1 if the string could be parsed into a (non-overflowing)
 * signed 64 bit int, 0 otherwise. The 'value' will be set to the parsed value
 * when the function returns success.
 *
 * Note that this function demands that the string strictly represents
 * a int64 value: no spaces or other characters before or after the string
 * representing the number are accepted, nor zeroes at the start if not
 * for the string "0" representing the zero number.
 *
 * Because of its strictness, it is safe to use this function to check if
 * you can convert a string into a long long, and obtain back the string
 * from the number without any loss in the string representation. *
 *
 * -----------------------------------------------------------------------------
 *
 * Credits: this function was adapted from the Redis OSS source code, file
 * "utils.c", function string2ll(), and is copyright:
 *
 * Copyright(C) 2011, Pieter Noordhuis
 * Copyright(C) 2011, Redis Ltd.
 *
 * The function is released under the BSD 3-clause license.
 */
int lpStringToInt64(const char *s, unsigned long slen, int64_t *value) {
    const char *p = s;
    unsigned long plen = 0;
    int negative = 0;
    uint64_t v;

    /* Abort if length indicates this cannot possibly be an int */
    if (slen == 0 || slen >= LONG_STR_SIZE) return 0;

    /* Special case: first and only digit is 0. */
    if (slen == 1 && p[0] == '0') {
        if (value != NULL) *value = 0;
        return 1;
    }

    if (p[0] == '-') {
        negative = 1;
        p++;
        plen++;

        /* Abort on only a negative sign. */
        if (plen == slen) return 0;
    }

    /* First digit should be 1-9, otherwise the string should just be 0. */
    if (p[0] >= '1' && p[0] <= '9') {
        v = p[0]-'0';
        p++;
        plen++;
    } else {
        return 0;
    }

    while (plen < slen && p[0] >= '0' && p[0] <= '9') {
        if (v > (UINT64_MAX / 10)) /* Overflow. */
            return 0;
        v *= 10;

        if (v > (UINT64_MAX - (p[0]-'0'))) /* Overflow. */
            return 0;
        v += p[0]-'0';

        p++;
        plen++;
    }

    /* Return if not all bytes were used. */
    if (plen < slen) return 0;

    if (negative) {
        if (v > ((uint64_t)(-(INT64_MIN+1))+1)) /* Overflow. */
            return 0;
        if (value != NULL) *value = -v;
    } else {
        if (v > INT64_MAX) /* Overflow. */
            return 0;
        if (value != NULL) *value = v;
    }
    return 1;
}

/* Create a new, empty listpack.
 * On success the new listpack is returned, otherwise an error is returned.
 * Pre-allocate at least `capacity` bytes of memory,
 * over-allocated memory can be shrunk by `lpShrinkToFit`.
 * */
unsigned char *lpNew(size_t capacity) {
    unsigned char *lp = zmalloc(capacity > LP_HDR_SIZE+1 ? capacity : LP_HDR_SIZE+1);
    if (lp == NULL) return NULL;
    lpSetTotalBytes(lp,LP_HDR_SIZE+1);
    lpSetNumElements(lp,0);
    lp[LP_HDR_SIZE] = LP_EOF;
    return lp;
}

/* Free the specified listpack. */
void lpFree(unsigned char *lp) {
    zfree(lp);
}

/* Shrink the memory to fit. */
unsigned char* lpShrinkToFit(unsigned char *lp) {
    size_t size = lpGetTotalBytes(lp);
    if (size < zmalloc_size(lp)) {
        return zrealloc(lp, size);
    } else {
        return lp;
    }
}

/* Stores the integer encoded representation of 'v' in the 'intenc' buffer. */
static inline void lpEncodeIntegerGetType(int64_t v, unsigned char *intenc, uint64_t *enclen) {
    if (v >= 0 && v <= 127) {
        /* Single byte 0-127 integer. */
        intenc[0] = v;
        *enclen = 1;
    } else if (v >= -4096 && v <= 4095) {
        /* 13 bit integer. */
        if (v < 0) v = ((int64_t)1<<13)+v;
        intenc[0] = (v>>8)|LP_ENCODING_13BIT_INT;
        intenc[1] = v&0xff;
        *enclen = 2;
    } else if (v >= -32768 && v <= 32767) {
        /* 16 bit integer. */
        if (v < 0) v = ((int64_t)1<<16)+v;
        intenc[0] = LP_ENCODING_16BIT_INT;
        intenc[1] = v&0xff;
        intenc[2] = v>>8;
        *enclen = 3;
    } else if (v >= -8388608 && v <= 8388607) {
        /* 24 bit integer. */
        if (v < 0) v = ((int64_t)1<<24)+v;
        intenc[0] = LP_ENCODING_24BIT_INT;
        intenc[1] = v&0xff;
        intenc[2] = (v>>8)&0xff;
        intenc[3] = v>>16;
        *enclen = 4;
    } else if (v >= -2147483648 && v <= 2147483647) {
        /* 32 bit integer. */
        if (v < 0) v = ((int64_t)1<<32)+v;
        intenc[0] = LP_ENCODING_32BIT_INT;
        intenc[1] = v&0xff;
        intenc[2] = (v>>8)&0xff;
        intenc[3] = (v>>16)&0xff;
        intenc[4] = v>>24;
        *enclen = 5;
    } else {
        /* 64 bit integer. */
        uint64_t uv = v;
        intenc[0] = LP_ENCODING_64BIT_INT;
        intenc[1] = uv&0xff;
        intenc[2] = (uv>>8)&0xff;
        intenc[3] = (uv>>16)&0xff;
        intenc[4] = (uv>>24)&0xff;
        intenc[5] = (uv>>32)&0xff;
        intenc[6] = (uv>>40)&0xff;
        intenc[7] = (uv>>48)&0xff;
        intenc[8] = uv>>56;
        *enclen = 9;
    }
}

/* Given an element 'ele' of size 'size', determine if the element can be
 * represented inside the listpack encoded as integer, and returns
 * LP_ENCODING_INT if so. Otherwise returns LP_ENCODING_STR if no integer
 * encoding is possible.
 *
 * If the LP_ENCODING_INT is returned, the function stores the integer encoded
 * representation of the element in the 'intenc' buffer.
 *
 * Regardless of the returned encoding, 'enclen' is populated by reference to
 * the number of bytes that the string or integer encoded element will require
 * in order to be represented. */
static inline int lpEncodeGetType(const unsigned char *ele, uint32_t size, unsigned char *intenc, uint64_t *enclen) {
    int64_t v;
    if (lpStringToInt64((const char*)ele, size, &v)) {
        lpEncodeIntegerGetType(v, intenc, enclen);
        return LP_ENCODING_INT;
    } else {
        if (size < 64)
            *enclen = 1 + size;
        else if (size < 4096)
            *enclen = 2 + size;
        else
            *enclen = 5 + (uint64_t)size;
        return LP_ENCODING_STRING;
    }
}

/* Store a reverse-encoded variable length field, representing the length
 * of the previous element of size 'l', in the target buffer 'buf'.
 * The function returns the number of bytes used to encode it, from
 * 1 to 5. If 'buf' is NULL the function just returns the number of bytes
 * needed in order to encode the backlen. */
static inline unsigned long lpEncodeBacklen(unsigned char *buf, uint64_t l) {
    if (l <= 127) {
        if (buf) buf[0] = l;
        return 1;
    } else if (l < 16383) {
        if (buf) {
            buf[0] = l>>7;
            buf[1] = (l&127)|128;
        }
        return 2;
    } else if (l < 2097151) {
        if (buf) {
            buf[0] = l>>14;
            buf[1] = ((l>>7)&127)|128;
            buf[2] = (l&127)|128;
        }
        return 3;
    } else if (l < 268435455) {
        if (buf) {
            buf[0] = l>>21;
            buf[1] = ((l>>14)&127)|128;
            buf[2] = ((l>>7)&127)|128;
            buf[3] = (l&127)|128;
        }
        return 4;
    } else {
        if (buf) {
            buf[0] = l>>28;
            buf[1] = ((l>>21)&127)|128;
            buf[2] = ((l>>14)&127)|128;
            buf[3] = ((l>>7)&127)|128;
            buf[4] = (l&127)|128;
        }
        return 5;
    }
}

/* Decode the backlen and returns it. If the encoding looks invalid (more than
 * 5 bytes are used), UINT64_MAX is returned to report the problem. */
static inline uint64_t lpDecodeBacklen(unsigned char *p) {
    uint64_t val = 0;
    uint64_t shift = 0;
    do {
        val |= (uint64_t)(p[0] & 127) << shift;
        if (!(p[0] & 128)) break;
        shift += 7;
        p--;
        if (shift > 28) return UINT64_MAX;
    } while(1);
    return val;
}

/* Encode the string element pointed by 's' of size 'len' in the target
 * buffer 's'. The function should be called with 'buf' having always enough
 * space for encoding the string. This is done by calling lpEncodeGetType()
 * before calling this function. */
static inline void lpEncodeString(unsigned char *buf, const unsigned char *s, uint32_t len) {
    if (len < 64) {
        buf[0] = len | LP_ENCODING_6BIT_STR;
        memcpy(buf+1,s,len);
    } else if (len < 4096) {
        buf[0] = (len >> 8) | LP_ENCODING_12BIT_STR;
        buf[1] = len & 0xff;
        memcpy(buf+2,s,len);
    } else {
        buf[0] = LP_ENCODING_32BIT_STR;
        buf[1] = len & 0xff;
        buf[2] = (len >> 8) & 0xff;
        buf[3] = (len >> 16) & 0xff;
        buf[4] = (len >> 24) & 0xff;
        memcpy(buf+5,s,len);
    }
}

/* Return the encoded length of the listpack element pointed by 'p'.
 * This includes the encoding byte, length bytes, and the element data itself.
 * If the element encoding is wrong then 0 is returned.
 * Note that this method may access additional bytes (in case of 12 and 32 bit
 * str), so should only be called when we know 'p' was already validated by
 * lpCurrentEncodedSizeBytes or ASSERT_INTEGRITY_LEN (possibly since 'p' is
 * a return value of another function that validated its return. */
static inline uint32_t lpCurrentEncodedSizeUnsafe(unsigned char *p) {
    if (LP_ENCODING_IS_7BIT_UINT(p[0])) return 1;
    if (LP_ENCODING_IS_6BIT_STR(p[0])) return 1+LP_ENCODING_6BIT_STR_LEN(p);
    if (LP_ENCODING_IS_13BIT_INT(p[0])) return 2;
    if (LP_ENCODING_IS_16BIT_INT(p[0])) return 3;
    if (LP_ENCODING_IS_24BIT_INT(p[0])) return 4;
    if (LP_ENCODING_IS_32BIT_INT(p[0])) return 5;
    if (LP_ENCODING_IS_64BIT_INT(p[0])) return 9;
    if (LP_ENCODING_IS_12BIT_STR(p[0])) return 2+LP_ENCODING_12BIT_STR_LEN(p);
    if (LP_ENCODING_IS_32BIT_STR(p[0])) return 5+LP_ENCODING_32BIT_STR_LEN(p);
    if (p[0] == LP_EOF) return 1;
    return 0;
}

/* Return bytes needed to encode the length of the listpack element pointed by 'p'.
 * This includes just the encoding byte, and the bytes needed to encode the length
 * of the element (excluding the element data itself)
 * If the element encoding is wrong then 0 is returned. */
static inline uint32_t lpCurrentEncodedSizeBytes(unsigned char *p) {
    if (LP_ENCODING_IS_7BIT_UINT(p[0])) return 1;
    if (LP_ENCODING_IS_6BIT_STR(p[0])) return 1;
    if (LP_ENCODING_IS_13BIT_INT(p[0])) return 1;
    if (LP_ENCODING_IS_16BIT_INT(p[0])) return 1;
    if (LP_ENCODING_IS_24BIT_INT(p[0])) return 1;
    if (LP_ENCODING_IS_32BIT_INT(p[0])) return 1;
    if (LP_ENCODING_IS_64BIT_INT(p[0])) return 1;
    if (LP_ENCODING_IS_12BIT_STR(p[0])) return 2;
    if (LP_ENCODING_IS_32BIT_STR(p[0])) return 5;
    if (p[0] == LP_EOF) return 1;
    return 0;
}

/* Skip the current entry returning the next. It is invalid to call this
 * function if the current element is the EOF element at the end of the
 * listpack, however, while this function is used to implement lpNext(),
 * it does not return NULL when the EOF element is encountered. */
unsigned char *lpSkip(unsigned char *p) {
    unsigned long entrylen = lpCurrentEncodedSizeUnsafe(p);
    entrylen += lpEncodeBacklen(NULL,entrylen);
    p += entrylen;
    return p;
}

/* If 'p' points to an element of the listpack, calling lpNext() will return
 * the pointer to the next element (the one on the right), or NULL if 'p'
 * already pointed to the last element of the listpack. */
unsigned char *lpNext(unsigned char *lp, unsigned char *p) {
    assert(p);
    p = lpSkip(p);
    if (p[0] == LP_EOF) return NULL;
    lpAssertValidEntry(lp, lpBytes(lp), p);
    return p;
}

/* If 'p' points to an element of the listpack, calling lpPrev() will return
 * the pointer to the previous element (the one on the left), or NULL if 'p'
 * already pointed to the first element of the listpack. */
unsigned char *lpPrev(unsigned char *lp, unsigned char *p) {
    assert(p);
    if (p-lp == LP_HDR_SIZE) return NULL;
    p--; /* Seek the first backlen byte of the last element. */
    uint64_t prevlen = lpDecodeBacklen(p);
    prevlen += lpEncodeBacklen(NULL,prevlen);
    p -= prevlen-1; /* Seek the first byte of the previous entry. */
    lpAssertValidEntry(lp, lpBytes(lp), p);
    return p;
}

/* Return a pointer to the first element of the listpack, or NULL if the
 * listpack has no elements. */
unsigned char *lpFirst(unsigned char *lp) {
    unsigned char *p = lp + LP_HDR_SIZE; /* Skip the header. */
    if (p[0] == LP_EOF) return NULL;
    lpAssertValidEntry(lp, lpBytes(lp), p);
    return p;
}

/* Return a pointer to the last element of the listpack, or NULL if the
 * listpack has no elements. */
unsigned char *lpLast(unsigned char *lp) {
    unsigned char *p = lp+lpGetTotalBytes(lp)-1; /* Seek EOF element. */
    return lpPrev(lp,p); /* Will return NULL if EOF is the only element. */
}

/* Return the number of elements inside the listpack. This function attempts
 * to use the cached value when within range, otherwise a full scan is
 * needed. As a side effect of calling this function, the listpack header
 * could be modified, because if the count is found to be already within
 * the 'numele' header field range, the new value is set. */
unsigned long lpLength(unsigned char *lp) {
    uint32_t numele = lpGetNumElements(lp);
    if (numele != LP_HDR_NUMELE_UNKNOWN) return numele;

    /* Too many elements inside the listpack. We need to scan in order
     * to get the total number. */
    uint32_t count = 0;
    unsigned char *p = lpFirst(lp);
    while(p) {
        count++;
        p = lpNext(lp,p);
    }

    /* If the count is again within range of the header numele field,
     * set it. */
    if (count < LP_HDR_NUMELE_UNKNOWN) lpSetNumElements(lp,count);
    return count;
}

/* Return the listpack element pointed by 'p'.
 *
 * The function changes behavior depending on the passed 'intbuf' value.
 * Specifically, if 'intbuf' is NULL:
 *
 * If the element is internally encoded as an integer, the function returns
 * NULL and populates the integer value by reference in 'count'. Otherwise if
 * the element is encoded as a string a pointer to the string (pointing inside
 * the listpack itself) is returned, and 'count' is set to the length of the
 * string.
 *
 * If instead 'intbuf' points to a buffer passed by the caller, that must be
 * at least LP_INTBUF_SIZE bytes, the function always returns the element as
 * it was a string (returning the pointer to the string and setting the
 * 'count' argument to the string length by reference). However if the element
 * is encoded as an integer, the 'intbuf' buffer is used in order to store
 * the string representation.
 *
 * The user should use one or the other form depending on what the value will
 * be used for. If there is immediate usage for an integer value returned
 * by the function, than to pass a buffer (and convert it back to a number)
 * is of course useless.
 *
 * If 'entry_size' is not NULL, *entry_size is set to the entry length of the
 * listpack element pointed by 'p'. This includes the encoding bytes, length
 * bytes, the element data itself, and the backlen bytes.
 *
 * If the function is called against a badly encoded ziplist, so that there
 * is no valid way to parse it, the function returns like if there was an
 * integer encoded with value 12345678900000000 + <unrecognized byte>, this may
 * be an hint to understand that something is wrong. To crash in this case is
 * not sensible because of the different requirements of the application using
 * this lib.
 *
 * Similarly, there is no error returned since the listpack normally can be
 * assumed to be valid, so that would be a very high API cost. */
static inline unsigned char *
lpGetWithSize(unsigned char *p, int64_t *count, unsigned char *intbuf, uint64_t *entry_size) {
    int64_t val;
    uint64_t uval, negstart, negmax;

    assert(p); /* assertion for valgrind (avoid NPD) */
    if (LP_ENCODING_IS_7BIT_UINT(p[0])) {
        negstart = UINT64_MAX; /* 7 bit ints are always positive. */
        negmax = 0;
        uval = p[0] & 0x7f;
        if (entry_size) *entry_size = LP_ENCODING_7BIT_UINT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_6BIT_STR(p[0])) {
        *count = LP_ENCODING_6BIT_STR_LEN(p);
        if (entry_size) *entry_size = 1 + *count + lpEncodeBacklen(NULL, *count + 1);
        return p+1;
    } else if (LP_ENCODING_IS_13BIT_INT(p[0])) {
        uval = ((p[0]&0x1f)<<8) | p[1];
        negstart = (uint64_t)1<<12;
        negmax = 8191;
        if (entry_size) *entry_size = LP_ENCODING_13BIT_INT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_16BIT_INT(p[0])) {
        uval = (uint64_t)p[1] | (uint64_t)p[2] << 8;
        negstart = (uint64_t)1<<15;
        negmax = UINT16_MAX;
        if (entry_size) *entry_size = LP_ENCODING_16BIT_INT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_24BIT_INT(p[0])) {
        uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16;
        negstart = (uint64_t)1<<23;
        negmax = UINT32_MAX>>8;
        if (entry_size) *entry_size = LP_ENCODING_24BIT_INT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_32BIT_INT(p[0])) {
        uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16 | (uint64_t)p[4] << 24;
        negstart = (uint64_t)1<<31;
        negmax = UINT32_MAX;
        if (entry_size) *entry_size = LP_ENCODING_32BIT_INT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_64BIT_INT(p[0])) {
        uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16 | (uint64_t)p[4] << 24 |
               (uint64_t)p[5] << 32 | (uint64_t)p[6] << 40 | (uint64_t)p[7] << 48 | (uint64_t)p[8] << 56;
        negstart = (uint64_t)1<<63;
        negmax = UINT64_MAX;
        if (entry_size) *entry_size = LP_ENCODING_64BIT_INT_ENTRY_SIZE;
    } else if (LP_ENCODING_IS_12BIT_STR(p[0])) {
        *count = LP_ENCODING_12BIT_STR_LEN(p);
        if (entry_size) *entry_size = 2 + *count + lpEncodeBacklen(NULL, *count + 2);
        return p+2;
    } else if (LP_ENCODING_IS_32BIT_STR(p[0])) {
        *count = LP_ENCODING_32BIT_STR_LEN(p);
        if (entry_size) *entry_size = 5 + *count + lpEncodeBacklen(NULL, *count + 5);
        return p+5;
    } else {
        uval = 12345678900000000ULL + p[0];
        negstart = UINT64_MAX;
        negmax = 0;
    }

    /* We reach this code path only for integer encodings.
     * Convert the unsigned value to the signed one using two's complement
     * rule. */
    if (uval >= negstart) {
        /* This three steps conversion should avoid undefined behaviors
         * in the unsigned -> signed conversion. */
        uval = negmax-uval;
        val = uval;
        val = -val-1;
    } else {
        val = uval;
    }

    /* Return the string representation of the integer or the value itself
     * depending on intbuf being NULL or not. */
    if (intbuf) {
        *count = ll2string((char*)intbuf,LP_INTBUF_SIZE,(long long)val);
        return intbuf;
    } else {
        *count = val;
        return NULL;
    }
}

int lpGetInteger(unsigned char *p, int64_t *ival) {
    int64_t val;
    uint64_t uval = 0, negstart = UINT64_MAX, negmax = 0;
    uint8_t encoding = p[0];
    
    // Prioritize checking for integers first.
    if (encoding < LP_ENCODING_7BIT_UINT_MASK) {        
        uval = encoding & 0x7f;    
    } else if (encoding > LP_ENCODING_32BIT_STR) {
        switch (encoding) {
            case LP_ENCODING_16BIT_INT:
                uval = (uint64_t)p[1] | (uint64_t)p[2] << 8;
                negstart = (uint64_t)1<<15;
                negmax = UINT16_MAX;
                break;
            case LP_ENCODING_24BIT_INT:
                uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16;
                negstart = (uint64_t)1<<23;
                negmax = UINT32_MAX>>8;
                break;
            case LP_ENCODING_32BIT_INT:
                uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16 | (uint64_t)p[4] << 24;
                negstart = (uint64_t)1<<31;
                negmax = UINT32_MAX;
                break;
            case LP_ENCODING_64BIT_INT:                
                uval = (uint64_t)p[1] | (uint64_t)p[2] << 8 | (uint64_t)p[3] << 16 | (uint64_t)p[4] << 24 |
               (uint64_t)p[5] << 32 | (uint64_t)p[6] << 40 | (uint64_t)p[7] << 48 | (uint64_t)p[8] << 56;
                negstart = (uint64_t)1<<63;
                negmax = UINT64_MAX;
            break;
            default:
                return 0;
        }
    } else if (encoding < LP_ENCODING_13BIT_INT_MASK && encoding >= LP_ENCODING_6BIT_STR_MASK) {
   	    uval = ((encoding & 0x1f) << 8) | p[1];
        negstart = (uint64_t)1 << 12;
        negmax = 8191;        
    } else {
        // string encodings.
        return 0;
    }

     /* We reach this code path only for integer encodings.
     * Convert the unsigned value to the signed one using two's complement
     * rule. */
    if (uval >= negstart) {
        /* This three steps conversion should avoid undefined behaviors
         * in the unsigned -> signed conversion. */
        uval = negmax-uval;
        val = uval;
        val = -val-1;
    } else {
        val = uval;
    }
    
    *ival = val;
    return 1;    
}

unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf) {
    return lpGetWithSize(p, count, intbuf, NULL);
}

/* This is just a wrapper to lpGet() that is able to get entry value directly.
 * When the function returns NULL, it populates the integer value by reference in 'lval'.
 * Otherwise if the element is encoded as a string a pointer to the string (pointing
 * inside the listpack itself) is returned, and 'slen' is set to the length of the
 * string. */
unsigned char *lpGetValue(unsigned char *p, unsigned int *slen, long long *lval) {
    unsigned char *vstr;
    int64_t ele_len;

    vstr = lpGet(p, &ele_len, NULL);
    if (vstr) {
        *slen = ele_len;
    } else {
        *lval = ele_len;
    }
    return vstr;
}

/* Find pointer to the entry equal to the specified entry. Skip 'skip' entries
 * between every comparison. Returns NULL when the field could not be found. */
unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, uint32_t slen, unsigned int skip) {
    int skipcnt = 0;
    unsigned char vencoding = 0;
    unsigned char *value;
    int64_t ll, vll;
    uint64_t entry_size = 123456789; /* initialized to avoid warning. */
    uint32_t lp_bytes = lpBytes(lp);

    assert(p);
    while (p) {
        if (skipcnt == 0) {
            value = lpGetWithSize(p, &ll, NULL, &entry_size);
            if (value) {
                /* check the value doesn't reach outside the listpack before accessing it */
                assert(p >= lp + LP_HDR_SIZE && p + entry_size < lp + lp_bytes);
                if (slen == ll && memcmp(value, s, slen) == 0) {
                    return p;
                }
            } else {
                /* Find out if the searched field can be encoded. Note that
                 * we do it only the first time, once done vencoding is set
                 * to non-zero and vll is set to the integer value. */
                if (vencoding == 0) {
                    /* If the entry can be encoded as integer we set it to
                     * 1, else set it to UCHAR_MAX, so that we don't retry
                     * again the next time. */
                    if (slen >= 32 || slen == 0 || !lpStringToInt64((const char*)s, slen, &vll)) {
                        vencoding = UCHAR_MAX;
                    } else {
                        vencoding = 1;
                    }
                }

                /* Compare current entry with specified entry, do it only
                 * if vencoding != UCHAR_MAX because if there is no encoding
                 * possible for the field it can't be a valid integer. */
                if (vencoding != UCHAR_MAX && ll == vll) {
                    return p;
                }
            }

            /* Reset skip count */
            skipcnt = skip;
            p += entry_size;
        } else {
            /* Skip entry */
            skipcnt--;

            /* Move to next entry, avoid use `lpNext` due to `lpAssertValidEntry` in
            * `lpNext` will call `lpBytes`, will cause performance degradation */
            p = lpSkip(p);
        }

        /* The next call to lpGetWithSize could read at most 8 bytes past `p`
         * We use the slower validation call only when necessary. */
        if (p + 8 >= lp + lp_bytes)
            lpAssertValidEntry(lp, lp_bytes, p);
        else
            assert(p >= lp + LP_HDR_SIZE && p < lp + lp_bytes);
        if (p[0] == LP_EOF) break;
    }

    return NULL;
}

/* Insert, delete or replace the specified string element 'elestr' of length
 * 'size' or integer element 'eleint' at the specified position 'p', with 'p'
 * being a listpack element pointer obtained with lpFirst(), lpLast(), lpNext(),
 * lpPrev() or lpSeek().
 *
 * The element is inserted before, after, or replaces the element pointed
 * by 'p' depending on the 'where' argument, that can be LP_BEFORE, LP_AFTER
 * or LP_REPLACE.
 * 
 * If both 'elestr' and `eleint` are NULL, the function removes the element
 * pointed by 'p' instead of inserting one.
 * If `eleint` is non-NULL, 'size' is the length of 'eleint', the function insert
 * or replace with a 64 bit integer, which is stored in the 'eleint' buffer.
 * If 'elestr` is non-NULL, 'size' is the length of 'elestr', the function insert
 * or replace with a string, which is stored in the 'elestr' buffer.
 * 
 * Returns NULL on out of memory or when the listpack total length would exceed
 * the max allowed size of 2^32-1, otherwise the new pointer to the listpack
 * holding the new element is returned (and the old pointer passed is no longer
 * considered valid)
 *
 * If 'newp' is not NULL, at the end of a successful call '*newp' will be set
 * to the address of the element just added, so that it will be possible to
 * continue an interaction with lpNext() and lpPrev().
 *
 * For deletion operations (both 'elestr' and 'eleint' set to NULL) 'newp' is
 * set to the next element, on the right of the deleted one, or to NULL if the
 * deleted element was the last one. */
unsigned char *lpInsert(unsigned char *lp, const unsigned char *elestr, unsigned char *eleint,
                        uint32_t size, unsigned char *p, int where, unsigned char **newp)
{
    unsigned char intenc[LP_MAX_INT_ENCODING_LEN];
    unsigned char backlen[LP_MAX_BACKLEN_SIZE];

    uint64_t enclen; /* The length of the encoded element. */
    int del_ele = (elestr == NULL && eleint == NULL);

    /* when deletion, it is conceptually replacing the element with a
     * zero-length element. So whatever we get passed as 'where', set
     * it to LP_REPLACE. */
    if (del_ele) where = LP_REPLACE;

    /* If we need to insert after the current element, we just jump to the
     * next element (that could be the EOF one) and handle the case of
     * inserting before. So the function will actually deal with just two
     * cases: LP_BEFORE and LP_REPLACE. */
    if (where == LP_AFTER) {
        p = lpSkip(p);
        where = LP_BEFORE;
        ASSERT_INTEGRITY(lp, p);
    }

    /* Store the offset of the element 'p', so that we can obtain its
     * address again after a reallocation. */
    unsigned long poff = p-lp;

    int enctype;
    if (elestr) {
        /* Calling lpEncodeGetType() results into the encoded version of the
        * element to be stored into 'intenc' in case it is representable as
        * an integer: in that case, the function returns LP_ENCODING_INT.
        * Otherwise if LP_ENCODING_STR is returned, we'll have to call
        * lpEncodeString() to actually write the encoded string on place later.
        *
        * Whatever the returned encoding is, 'enclen' is populated with the
        * length of the encoded element. */
        enctype = lpEncodeGetType(elestr,size,intenc,&enclen);
        if (enctype == LP_ENCODING_INT) eleint = intenc;
    } else if (eleint) {
        enctype = LP_ENCODING_INT;
        enclen = size; /* 'size' is the length of the encoded integer element. */
    } else {
        enctype = -1;
        enclen = 0;
    }

    /* We need to also encode the backward-parsable length of the element
     * and append it to the end: this allows to traverse the listpack from
     * the end to the start. */
    unsigned long backlen_size = (!del_ele) ? lpEncodeBacklen(backlen, enclen) : 0;
    uint64_t old_listpack_bytes = lpGetTotalBytes(lp);
    uint32_t replaced_len  = 0;
    if (where == LP_REPLACE) {
        replaced_len = lpCurrentEncodedSizeUnsafe(p);
        replaced_len += lpEncodeBacklen(NULL,replaced_len);
        ASSERT_INTEGRITY_LEN(lp, p, replaced_len);
    }

    uint64_t new_listpack_bytes = old_listpack_bytes + enclen + backlen_size - replaced_len;
    if (new_listpack_bytes > UINT32_MAX) return NULL;

    /* We now need to reallocate in order to make space or shrink the
     * allocation (in case 'when' value is LP_REPLACE and the new element is
     * smaller). However we do that before memmoving the memory to
     * make room for the new element if the final allocation will get
     * larger, or we do it after if the final allocation will get smaller. */

    unsigned char *dst = lp + poff; /* May be updated after reallocation. */

    /* Realloc before: we need more room. */
    if (new_listpack_bytes > old_listpack_bytes && new_listpack_bytes > zmalloc_size(lp)) {
        if ((lp = zrealloc(lp, new_listpack_bytes)) == NULL) return NULL;
        dst = lp + poff;
    }

    /* Setup the listpack relocating the elements to make the exact room
     * we need to store the new one. */
    if (where == LP_BEFORE) {
        memmove(dst+enclen+backlen_size,dst,old_listpack_bytes-poff);
    } else { /* LP_REPLACE. */
        memmove(dst + enclen + backlen_size, dst + replaced_len, old_listpack_bytes - poff - replaced_len);
    }

    /* Realloc after: we need to free space. */
    if (new_listpack_bytes < old_listpack_bytes) {
        if ((lp = zrealloc(lp,new_listpack_bytes)) == NULL) return NULL;
        dst = lp + poff;
    }

    /* Store the entry. */
    if (newp) {
        *newp = dst;
        /* In case of deletion, set 'newp' to NULL if the next element is
         * the EOF element. */
        if (del_ele && dst[0] == LP_EOF) *newp = NULL;
    }
    if (!del_ele) {
        if (enctype == LP_ENCODING_INT) {
            memcpy(dst,eleint,enclen);
        } else if (elestr) {
            lpEncodeString(dst,elestr,size);
        } else {
            valkey_unreachable();
        }
        dst += enclen;
        memcpy(dst,backlen,backlen_size);
        dst += backlen_size;
    }

    /* Update header. */
    if (where != LP_REPLACE || del_ele) {
        uint32_t num_elements = lpGetNumElements(lp);
        if (num_elements != LP_HDR_NUMELE_UNKNOWN) {
            if (!del_ele)
                lpSetNumElements(lp,num_elements+1);
            else
                lpSetNumElements(lp,num_elements-1);
        }
    }
    lpSetTotalBytesChecked(lp,new_listpack_bytes);

#if 0
    /* This code path is normally disabled: what it does is to force listpack
     * to return *always* a new pointer after performing some modification to
     * the listpack, even if the previous allocation was enough. This is useful
     * in order to spot bugs in code using listpacks: by doing so we can find
     * if the caller forgets to set the new pointer where the listpack reference
     * is stored, after an update. */
    unsigned char *oldlp = lp;
    lp = zmalloc(new_listpack_bytes);
    memcpy(lp,oldlp,new_listpack_bytes);
    if (newp) {
        unsigned long offset = (*newp)-oldlp;
        *newp = lp + offset;
    }
    /* Make sure the old allocation contains garbage. */
    memset(oldlp,'A',new_listpack_bytes);
    zfree(oldlp);
#endif

    return lp;
}

/* This is just a wrapper for lpInsert() to directly use a string. */
unsigned char *lpInsertString(unsigned char *lp, const unsigned char *s, uint32_t slen,
                              unsigned char *p, int where, unsigned char **newp)
{
    return lpInsert(lp, s, NULL, slen, p, where, newp);
}

/* This is just a wrapper for lpInsert() to directly use a 64 bit integer
 * instead of a string. */
unsigned char *lpInsertInteger(unsigned char *lp, long long lval, unsigned char *p, int where, unsigned char **newp) {
    uint64_t enclen; /* The length of the encoded element. */
    unsigned char intenc[LP_MAX_INT_ENCODING_LEN];

    lpEncodeIntegerGetType(lval, intenc, &enclen);
    return lpInsert(lp, NULL, intenc, enclen, p, where, newp);
}

/* Append the specified element 's' of length 'slen' at the head of the listpack. */
unsigned char *lpPrepend(unsigned char *lp, const unsigned char *s, uint32_t slen) {
    unsigned char *p = lpFirst(lp);
    if (!p) return lpAppend(lp, s, slen);
    return lpInsert(lp, s, NULL, slen, p, LP_BEFORE, NULL);
}

/* Append the specified integer element 'lval' at the head of the listpack. */
unsigned char *lpPrependInteger(unsigned char *lp, long long lval) {
    unsigned char *p = lpFirst(lp);
    if (!p) return lpAppendInteger(lp, lval);
    return lpInsertInteger(lp, lval, p, LP_BEFORE, NULL);
}

/* Append the specified element 'ele' of length 'size' at the end of the
 * listpack. It is implemented in terms of lpInsert(), so the return value is
 * the same as lpInsert(). */
unsigned char *lpAppend(unsigned char *lp, const unsigned char *ele, uint32_t size) {
    uint64_t listpack_bytes = lpGetTotalBytes(lp);
    unsigned char *eofptr = lp + listpack_bytes - 1;
    return lpInsert(lp,ele,NULL,size,eofptr,LP_BEFORE,NULL);
}

/* Append the specified integer element 'lval' at the end of the listpack. */
unsigned char *lpAppendInteger(unsigned char *lp, long long lval) {
    uint64_t listpack_bytes = lpGetTotalBytes(lp);
    unsigned char *eofptr = lp + listpack_bytes - 1;
    return lpInsertInteger(lp, lval, eofptr, LP_BEFORE, NULL);
}

/* This is just a wrapper for lpInsert() to directly use a string to replace
 * the current element. The function returns the new listpack as return
 * value, and also updates the current cursor by updating '*p'. */
unsigned char *lpReplace(unsigned char *lp, unsigned char **p, const unsigned char *s, uint32_t slen) {
    return lpInsert(lp, s, NULL, slen, *p, LP_REPLACE, p);
}

/* This is just a wrapper for lpInsertInteger() to directly use a 64 bit integer
 * instead of a string to replace the current element. The function returns
 * the new listpack as return value, and also updates the current cursor
 * by updating '*p'. */
unsigned char *lpReplaceInteger(unsigned char *lp, unsigned char **p, long long lval) {
    return lpInsertInteger(lp, lval, *p, LP_REPLACE, p);
}

/* Remove the element pointed by 'p', and return the resulting listpack.
 * If 'newp' is not NULL, the next element pointer (to the right of the
 * deleted one) is returned by reference. If the deleted element was the
 * last one, '*newp' is set to NULL. */
unsigned char *lpDelete(unsigned char *lp, unsigned char *p, unsigned char **newp) {
    return lpInsert(lp,NULL,NULL,0,p,LP_REPLACE,newp);
}

/* Delete a range of entries from the listpack start with the element pointed by 'p'. */
unsigned char *lpDeleteRangeWithEntry(unsigned char *lp, unsigned char **p, unsigned long num) {
    size_t bytes = lpBytes(lp);
    unsigned long deleted = 0;
    unsigned char *eofptr = lp + bytes - 1;
    unsigned char *first, *tail;
    first = tail = *p;

    if (num == 0) return lp;  /* Nothing to delete, return ASAP. */

    /* Find the next entry to the last entry that needs to be deleted.
     * lpLength may be unreliable due to corrupt data, so we cannot
     * treat 'num' as the number of elements to be deleted. */
    while (num--) {
        deleted++;
        tail = lpSkip(tail);
        if (tail[0] == LP_EOF) break;
        lpAssertValidEntry(lp, bytes, tail);
    }

    /* Store the offset of the element 'first', so that we can obtain its
     * address again after a reallocation. */
    unsigned long poff = first-lp;

    /* Move tail to the front of the listpack */
    memmove(first, tail, eofptr - tail + 1);
    lpSetTotalBytesChecked(lp, bytes - (tail - first));
    uint32_t numele = lpGetNumElements(lp);
    if (numele != LP_HDR_NUMELE_UNKNOWN) lpSetNumElements(lp, numele - deleted);
    lp = lpShrinkToFit(lp);

    /* Store the entry. */
    *p = lp+poff;
    if ((*p)[0] == LP_EOF) *p = NULL;

    return lp;
}

/* Delete a range of entries from the listpack. */
unsigned char *lpDeleteRange(unsigned char *lp, long index, unsigned long num) {
    unsigned char *p;
    uint32_t numele = lpGetNumElements(lp);

    if (num == 0) return lp; /* Nothing to delete, return ASAP. */
    if ((p = lpSeek(lp, index)) == NULL) return lp;

    /* If we know we're gonna delete beyond the end of the listpack, we can just move
     * the EOF marker, and there's no need to iterate through the entries,
     * but if we can't be sure how many entries there are, we rather avoid calling lpLength
     * since that means an additional iteration on all elements.
     *
     * Note that index could overflow, but we use the value after seek, so when we
     * use it no overflow happens. */
    if (numele != LP_HDR_NUMELE_UNKNOWN && index < 0) index = (long)numele + index;
    if (numele != LP_HDR_NUMELE_UNKNOWN && (numele - (unsigned long)index) <= num) {
        p[0] = LP_EOF;
        lpSetTotalBytesChecked(lp, p - lp + 1);
        lpSetNumElements(lp, index);
        lp = lpShrinkToFit(lp);
    } else {
        lp = lpDeleteRangeWithEntry(lp, &p, num);
    }

    return lp;
}

/* Merge listpacks 'first' and 'second' by appending 'second' to 'first'.
 *
 * NOTE: The larger listpack is reallocated to contain the new merged listpack.
 * Either 'first' or 'second' can be used for the result.  The parameter not
 * used will be free'd and set to NULL.
 *
 * After calling this function, the input parameters are no longer valid since
 * they are changed and free'd in-place.
 *
 * The result listpack is the contents of 'first' followed by 'second'.
 *
 * On failure: returns NULL if the merge is impossible.
 * On success: returns the merged listpack (which is expanded version of either
 * 'first' or 'second', also frees the other unused input listpack, and sets the
 * input listpack argument equal to newly reallocated listpack return value. */
unsigned char *lpMerge(unsigned char **first, unsigned char **second) {
    /* If any params are null, we can't merge, so NULL. */
    if (first == NULL || *first == NULL || second == NULL || *second == NULL) return NULL;

    /* Can't merge same list into itself. */
    if (*first == *second) return NULL;

    size_t first_bytes = lpBytes(*first);
    unsigned long first_len = lpLength(*first);

    size_t second_bytes = lpBytes(*second);
    unsigned long second_len = lpLength(*second);

    int append;
    unsigned char *source, *target;
    size_t target_bytes, source_bytes;
    /* Pick the largest listpack so we can resize easily in-place.
     * We must also track if we are now appending or prepending to
     * the target listpack. */
    if (first_bytes >= second_bytes) {
        /* retain first, append second to first. */
        target = *first;
        target_bytes = first_bytes;
        source = *second;
        source_bytes = second_bytes;
        append = 1;
    } else {
        /* else, retain second, prepend first to second. */
        target = *second;
        target_bytes = second_bytes;
        source = *first;
        source_bytes = first_bytes;
        append = 0;
    }

    /* Calculate final bytes (subtract one pair of metadata) */
    unsigned long long lpbytes = (unsigned long long)first_bytes + second_bytes - LP_HDR_SIZE - 1;
    assert(lpbytes < UINT32_MAX); /* larger values can't be stored */
    unsigned long lplength = first_len + second_len;

    /* Combined lp length should be limited within UINT16_MAX */
    lplength = lplength < UINT16_MAX ? lplength : UINT16_MAX;

    /* Extend target to new lpbytes then append or prepend source. */
    target = zrealloc(target, lpbytes);
    if (append) {
        /* append == appending to target */
        /* Copy source after target (copying over original [END]):
         *   [TARGET - END, SOURCE - HEADER] */
        memcpy(target + target_bytes - 1, source + LP_HDR_SIZE, source_bytes - LP_HDR_SIZE);
    } else {
        /* !append == prepending to target */
        /* Move target *contents* exactly size of (source - [END]),
         * then copy source into vacated space (source - [END]):
         *   [SOURCE - END, TARGET - HEADER] */
        memmove(target + source_bytes - 1, target + LP_HDR_SIZE, target_bytes - LP_HDR_SIZE);
        memcpy(target, source, source_bytes - 1);
    }

    lpSetNumElements(target, lplength);
    lpSetTotalBytesChecked(target, lpbytes);

    /* Now free and NULL out what we didn't realloc */
    if (append) {
        zfree(*second);
        *second = NULL;
        *first = target;
    } else {
        zfree(*first);
        *first = NULL;
        *second = target;
    }

    return target;
}

/* Return the total number of bytes the listpack is composed of. */
size_t lpBytes(unsigned char *lp) {
    return lpGetTotalBytes(lp);
}

/* Seek the specified element and returns the pointer to the seeked element.
 * Positive indexes specify the zero-based element to seek from the head to
 * the tail, negative indexes specify elements starting from the tail, where
 * -1 means the last element, -2 the penultimate and so forth. If the index
 * is out of range, NULL is returned. */
unsigned char *lpSeek(unsigned char *lp, long index) {
    int forward = 1; /* Seek forward by default. */

    /* We want to seek from left to right or the other way around
     * depending on the listpack length and the element position.
     * However if the listpack length cannot be obtained in constant time,
     * we always seek from left to right. */
    uint32_t numele = lpGetNumElements(lp);
    if (numele != LP_HDR_NUMELE_UNKNOWN) {
        if (index < 0) index = (long)numele+index;
        if (index < 0) return NULL; /* Index still < 0 means out of range. */
        if (index >= (long)numele) return NULL; /* Out of range the other side. */
        /* We want to scan right-to-left if the element we are looking for
         * is past the half of the listpack. */
        if (index > (long)numele/2) {
            forward = 0;
            /* Right to left scanning always expects a negative index. Convert
             * our index to negative form. */
            index -= numele;
        }
    } else {
        /* If the listpack length is unspecified, for negative indexes we
         * want to always scan right-to-left. */
        if (index < 0) forward = 0;
    }

    /* Forward and backward scanning is trivially based on lpNext()/lpPrev(). */
    if (forward) {
        unsigned char *ele = lpFirst(lp);
        while (index > 0 && ele) {
            ele = lpNext(lp,ele);
            index--;
        }
        return ele;
    } else {
        unsigned char *ele = lpLast(lp);
        while (index < -1 && ele) {
            ele = lpPrev(lp,ele);
            index++;
        }
        return ele;
    }
}

/* Same as lpFirst but without validation assert, to be used right before lpValidateNext. */
unsigned char *lpValidateFirst(unsigned char *lp) {
    unsigned char *p = lp + LP_HDR_SIZE; /* Skip the header. */
    if (p[0] == LP_EOF) return NULL;
    return p;
}

/* Validate the integrity of a single listpack entry and move to the next one.
 * The input argument 'pp' is a reference to the current record and is advanced on exit.
 * Returns 1 if valid, 0 if invalid. */
int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes) {
#define OUT_OF_RANGE(p) ((p) < lp + LP_HDR_SIZE || (p) > lp + lpbytes - 1)
    unsigned char *p = *pp;
    if (!p) return 0;

    /* Before accessing p, make sure it's valid. */
    if (OUT_OF_RANGE(p)) return 0;

    if (*p == LP_EOF) {
        *pp = NULL;
        return 1;
    }

    /* check that we can read the encoded size */
    uint32_t lenbytes = lpCurrentEncodedSizeBytes(p);
    if (!lenbytes) return 0;

    /* make sure the encoded entry length doesn't reach outside the edge of the listpack */
    if (OUT_OF_RANGE(p + lenbytes)) return 0;

    /* get the entry length and encoded backlen. */
    unsigned long entrylen = lpCurrentEncodedSizeUnsafe(p);
    unsigned long encodedBacklen = lpEncodeBacklen(NULL,entrylen);
    entrylen += encodedBacklen;

    /* make sure the entry doesn't reach outside the edge of the listpack */
    if (OUT_OF_RANGE(p + entrylen)) return 0;

    /* move to the next entry */
    p += entrylen;

    /* make sure the encoded length at the end patches the one at the beginning. */
    uint64_t prevlen = lpDecodeBacklen(p-1);
    if (prevlen + encodedBacklen != entrylen) return 0;

    *pp = p;
    return 1;
#undef OUT_OF_RANGE
}

/* Validate that the entry doesn't reach outside the listpack allocation. */
static inline void lpAssertValidEntry(unsigned char* lp, size_t lpbytes, unsigned char *p) {
    assert(lpValidateNext(lp, &p, lpbytes));
}

/* Validate the integrity of the data structure.
 * when `deep` is 0, only the integrity of the header is validated.
 * when `deep` is 1, we scan all the entries one by one. */
int lpValidateIntegrity(unsigned char *lp, size_t size, int deep, listpackValidateEntryCB entry_cb, void *cb_userdata) {
    /* Check that we can actually read the header. (and EOF) */
    if (size < LP_HDR_SIZE + 1) return 0;

    /* Check that the encoded size in the header must match the allocated size. */
    size_t bytes = lpGetTotalBytes(lp);
    if (bytes != size) return 0;

    /* The last byte must be the terminator. */
    if (lp[size - 1] != LP_EOF) return 0;

    if (!deep) return 1;

    /* Validate the individual entries. */
    uint32_t count = 0;
    uint32_t numele = lpGetNumElements(lp);
    unsigned char *p = lp + LP_HDR_SIZE;
    while(p && p[0] != LP_EOF) {
        unsigned char *prev = p;

        /* Validate this entry and move to the next entry in advance
         * to avoid callback crash due to corrupt listpack. */
        if (!lpValidateNext(lp, &p, bytes)) return 0;

        /* Optionally let the caller validate the entry too. */
        if (entry_cb && !entry_cb(prev, numele, cb_userdata)) return 0;

        count++;
    }

    /* Make sure 'p' really does point to the end of the listpack. */
    if (p != lp + size - 1) return 0;

    /* Check that the count in the header is correct */
    if (numele != LP_HDR_NUMELE_UNKNOWN && numele != count) return 0;

    return 1;
}

/* Compare entry pointer to by 'p' with string 's' of length 'slen'.
 * Return 1 if equal. */
unsigned int lpCompare(unsigned char *p, const unsigned char *s, uint32_t slen) {
    unsigned char *value;
    int64_t sz;
    if (p[0] == LP_EOF) return 0;

    value = lpGet(p, &sz, NULL);
    if (value) {
        return (slen == sz) && memcmp(value,s,slen) == 0;
    } else {
        /* We use lpStringToInt64() to get an integer representation of the
         * string 's' and compare it to 'sval', it's much faster than convert
         * integer to string and comparing. */
        int64_t sval;
        if (lpStringToInt64((const char *)s, slen, &sval)) return sz == sval;
    }

    return 0;
}

/* uint compare for qsort */
static int uintCompare(const void *a, const void *b) {
    return (*(unsigned int *) a - *(unsigned int *) b);
}

/* Helper method to store a string into from val or lval into dest */
static inline void lpSaveValue(unsigned char *val, unsigned int len, int64_t lval, listpackEntry *dest) {
    dest->sval = val;
    dest->slen = len;
    dest->lval = lval;
}

/* Randomly select a pair of key and value.
 * total_count is a pre-computed length/2 of the listpack (to avoid calls to lpLength)
 * 'key' and 'val' are used to store the result key value pair.
 * 'val' can be NULL if the value is not needed. */
void lpRandomPair(unsigned char *lp, unsigned long total_count, listpackEntry *key, listpackEntry *val) {
    unsigned char *p;

    /* Avoid div by zero on corrupt listpack */
    assert(total_count);

    /* Generate even numbers, because listpack saved K-V pair */
    int r = (rand() % total_count) * 2;
    p = lpSeek(lp, r);
    assert(p);
    key->sval = lpGetValue(p, &(key->slen), &(key->lval));

    if (!val)
        return;
    p = lpNext(lp, p);
    assert(p);
    val->sval = lpGetValue(p, &(val->slen), &(val->lval));
}

/* Randomly select count of key value pairs and store into 'keys' and
 * 'vals' args. The order of the picked entries is random, and the selections
 * are non-unique (repetitions are possible).
 * The 'vals' arg can be NULL in which case we skip these. */
void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals) {
    unsigned char *p, *key, *value;
    unsigned int klen = 0, vlen = 0;
    long long klval = 0, vlval = 0;

    /* Notice: the index member must be first due to the use in uintCompare */
    typedef struct {
        unsigned int index;
        unsigned int order;
    } rand_pick;
    rand_pick *picks = zmalloc(sizeof(rand_pick)*count);
    unsigned int total_size = lpLength(lp)/2;

    /* Avoid div by zero on corrupt listpack */
    assert(total_size);

    /* create a pool of random indexes (some may be duplicate). */
    for (unsigned int i = 0; i < count; i++) {
        picks[i].index = (rand() % total_size) * 2; /* Generate even indexes */
        /* keep track of the order we picked them */
        picks[i].order = i;
    }

    /* sort by indexes. */
    qsort(picks, count, sizeof(rand_pick), uintCompare);

    /* fetch the elements form the listpack into a output array respecting the original order. */
    unsigned int lpindex = picks[0].index, pickindex = 0;
    p = lpSeek(lp, lpindex);
    while (p && pickindex < count) {
        key = lpGetValue(p, &klen, &klval);
        p = lpNext(lp, p);
        assert(p);
        value = lpGetValue(p, &vlen, &vlval);
        while (pickindex < count && lpindex == picks[pickindex].index) {
            int storeorder = picks[pickindex].order;
            lpSaveValue(key, klen, klval, &keys[storeorder]);
            if (vals) lpSaveValue(value, vlen, vlval, &vals[storeorder]);
             pickindex++;
        }
        lpindex += 2;
        p = lpNext(lp, p);
    }

    zfree(picks);
}

/* Randomly select count of key value pairs and store into 'keys' and
 * 'vals' args. The selections are unique (no repetitions), and the order of
 * the picked entries is NOT-random.
 * The 'vals' arg can be NULL in which case we skip these.
 * The return value is the number of items picked which can be lower than the
 * requested count if the listpack doesn't hold enough pairs. */
unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals) {
    unsigned char *p, *key;
    unsigned int klen = 0;
    long long klval = 0;
    unsigned int total_size = lpLength(lp)/2;
    unsigned int index = 0;
    if (count > total_size) count = total_size;

    /* To only iterate once, every time we try to pick a member, the probability
     * we pick it is the quotient of the count left we want to pick and the
     * count still we haven't visited in the dict, this way, we could make every
     * member be equally picked.*/
    p = lpFirst(lp);
    unsigned int picked = 0, remaining = count;
    while (picked < count && p) {
        double randomDouble = ((double)rand()) / RAND_MAX;
        double threshold = ((double)remaining) / (total_size - index);
        if (randomDouble <= threshold) {
            key = lpGetValue(p, &klen, &klval);
            lpSaveValue(key, klen, klval, &keys[picked]);
            p = lpNext(lp, p);
            assert(p);
            if (vals) {
                key = lpGetValue(p, &klen, &klval);
                lpSaveValue(key, klen, klval, &vals[picked]);
            }
            remaining--;
            picked++;
        } else {
            p = lpNext(lp, p);
            assert(p);
        }
        p = lpNext(lp, p);
        index++;
    }
    return picked;
}

/* Print info of listpack which is used in debugCommand */
void lpRepr(unsigned char *lp) {
    unsigned char *p, *vstr;
    int64_t vlen;
    unsigned char intbuf[LP_INTBUF_SIZE];
    int index = 0;

    printf("{total bytes %zu} {num entries %lu}\n", lpBytes(lp), lpLength(lp));
        
    p = lpFirst(lp);
    while(p) {
        uint32_t encoded_size_bytes = lpCurrentEncodedSizeBytes(p);
        uint32_t encoded_size = lpCurrentEncodedSizeUnsafe(p);
        unsigned long back_len = lpEncodeBacklen(NULL, encoded_size);
        printf("{\n"
                "\taddr: 0x%08lx,\n"
                "\tindex: %2d,\n"
                "\toffset: %1lu,\n"
                "\thdr+entrylen+backlen: %2lu,\n"
                "\thdrlen: %3u,\n"
                "\tbacklen: %2lu,\n"
                "\tpayload: %1u\n",
               (long unsigned)p, index, (unsigned long)(p - lp), encoded_size + back_len, encoded_size_bytes, back_len,
            encoded_size - encoded_size_bytes);
        printf("\tbytes: ");
        for (unsigned int i = 0; i < (encoded_size + back_len); i++) {
            printf("%02x|",p[i]);
        }
        printf("\n");

        vstr = lpGet(p, &vlen, intbuf);
        printf("\t[str]");
        if (vlen > 40) {
            if (fwrite(vstr, 40, 1, stdout) == 0) perror("fwrite");
            printf("...");
        } else {
            if (fwrite(vstr, vlen, 1, stdout) == 0) perror("fwrite");
        }
        printf("\n}\n");
        index++;
        p = lpNext(lp, p);
    }
    printf("{end}\n\n");
}

#ifdef REDIS_TEST

#include <sys/time.h>
#include "adlist.h"
#include "sds.h"
#include "testhelp.h"

#define UNUSED(x) (void)(x)
#define TEST(name) printf("test — %s\n", name);

char *mixlist[] = {"hello", "foo", "quux", "1024"};
char *intlist[] = {"4294967296", "-100", "100", "128000", 
                   "non integer", "much much longer non integer"};

static unsigned char *createList() {
    unsigned char *lp = lpNew(0);
    lp = lpAppend(lp, (unsigned char*)mixlist[1], strlen(mixlist[1]));
    lp = lpAppend(lp, (unsigned char*)mixlist[2], strlen(mixlist[2]));
    lp = lpPrepend(lp, (unsigned char*)mixlist[0], strlen(mixlist[0]));
    lp = lpAppend(lp, (unsigned char*)mixlist[3], strlen(mixlist[3]));
    return lp;
}

static unsigned char *createIntList() {
    unsigned char *lp = lpNew(0);
    lp = lpAppend(lp, (unsigned char*)intlist[2], strlen(intlist[2]));
    lp = lpAppend(lp, (unsigned char*)intlist[3], strlen(intlist[3]));
    lp = lpPrepend(lp, (unsigned char*)intlist[1], strlen(intlist[1]));
    lp = lpPrepend(lp, (unsigned char*)intlist[0], strlen(intlist[0]));
    lp = lpAppend(lp, (unsigned char*)intlist[4], strlen(intlist[4]));
    lp = lpAppend(lp, (unsigned char*)intlist[5], strlen(intlist[5]));
    return lp;
}

static long long usec(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;
}

static void stress(int pos, int num, int maxsize, int dnum) {
    int i, j, k;
    unsigned char *lp;
    char posstr[2][5] = { "HEAD", "TAIL" };
    long long start;
    for (i = 0; i < maxsize; i+=dnum) {
        lp = lpNew(0);
        for (j = 0; j < i; j++) {
            lp = lpAppend(lp, (unsigned char*)"quux", 4);
        }

        /* Do num times a push+pop from pos */
        start = usec();
        for (k = 0; k < num; k++) {
            if (pos == 0) {
                lp = lpPrepend(lp, (unsigned char*)"quux", 4);
            } else {
                lp = lpAppend(lp, (unsigned char*)"quux", 4);

            }
            lp = lpDelete(lp, lpFirst(lp), NULL);
        }
        printf("List size: %8d, bytes: %8zu, %dx push+pop (%s): %6lld usec\n",
               i, lpBytes(lp), num, posstr[pos], usec()-start);
        lpFree(lp);
    }
}

static unsigned char *pop(unsigned char *lp, int where) {
    unsigned char *p, *vstr;
    int64_t vlen;

    p = lpSeek(lp, where == 0 ? 0 : -1);
    vstr = lpGet(p, &vlen, NULL);
    if (where == 0)
        printf("Pop head: ");
    else
        printf("Pop tail: ");

    if (vstr) {
        if (vlen && fwrite(vstr, vlen, 1, stdout) == 0) perror("fwrite");
    } else {
        printf("%lld", (long long)vlen);
    }

    printf("\n");
    return lpDelete(lp, p, &p);
}

static int randstring(char *target, unsigned int min, unsigned int max) {
    int p = 0;
    int len = min+rand()%(max-min+1);
    int minval, maxval;
    switch(rand() % 3) {
    case 0:
        minval = 0;
        maxval = 255;
    break;
    case 1:
        minval = 48;
        maxval = 122;
    break;
    case 2:
        minval = 48;
        maxval = 52;
    break;
    default:
        assert(NULL);
    }

    while(p < len)
        target[p++] = minval+rand()%(maxval-minval+1);
    return len;
}

static void verifyEntry(unsigned char *p, unsigned char *s, size_t slen) {
    assert(lpCompare(p, s, slen));
}

static int lpValidation(unsigned char *p, unsigned int head_count, void *userdata) {
    UNUSED(p);
    UNUSED(head_count);

    int ret;
    long *count = userdata;
    ret = lpCompare(p, (unsigned char *)mixlist[*count], strlen(mixlist[*count]));
    (*count)++;
    return ret;
}

int listpackTest(int argc, char *argv[], int flags) {
    UNUSED(argc);
    UNUSED(argv);

    int i;
    unsigned char *lp, *p, *vstr;
    int64_t vlen;
    unsigned char intbuf[LP_INTBUF_SIZE];
    int accurate = (flags & REDIS_TEST_ACCURATE);

    TEST("Create int list") {
        lp = createIntList();
        assert(lpLength(lp) == 6);
        lpFree(lp);
    }

    TEST("Create list") {
        lp = createList();
        assert(lpLength(lp) == 4);
        lpFree(lp);
    }

    TEST("Test lpPrepend") {
        lp = lpNew(0);
        lp = lpPrepend(lp, (unsigned char*)"abc", 3);
        lp = lpPrepend(lp, (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, 0), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, 1), (unsigned char*)"abc", 3);
        lpFree(lp);
    }

    TEST("Test lpPrependInteger") {
        lp = lpNew(0);
        lp = lpPrependInteger(lp, 127);
        lp = lpPrependInteger(lp, 4095);
        lp = lpPrependInteger(lp, 32767);
        lp = lpPrependInteger(lp, 8388607);
        lp = lpPrependInteger(lp, 2147483647);
        lp = lpPrependInteger(lp, 9223372036854775807);
        verifyEntry(lpSeek(lp, 0), (unsigned char*)"9223372036854775807", 19);
        verifyEntry(lpSeek(lp, -1), (unsigned char*)"127", 3);
        lpFree(lp);
    }

    TEST("Get element at index") {
        lp = createList();
        verifyEntry(lpSeek(lp, 0), (unsigned char*)"hello", 5);
        verifyEntry(lpSeek(lp, 3), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, -1), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, -4), (unsigned char*)"hello", 5);
        assert(lpSeek(lp, 4) == NULL);
        assert(lpSeek(lp, -5) == NULL);
        lpFree(lp);
    }
    
    TEST("Pop list") {
        lp = createList();
        lp = pop(lp, 1);
        lp = pop(lp, 0);
        lp = pop(lp, 1);
        lp = pop(lp, 1);
        lpFree(lp);
    }

    TEST("Get element at index") {
        lp = createList();
        verifyEntry(lpSeek(lp, 0), (unsigned char*)"hello", 5);
        verifyEntry(lpSeek(lp, 3), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, -1), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp, -4), (unsigned char*)"hello", 5);
        assert(lpSeek(lp, 4) == NULL);
        assert(lpSeek(lp, -5) == NULL);
        lpFree(lp);
    }

    TEST("Iterate list from 0 to end") {
        lp = createList();
        p = lpFirst(lp);
        i = 0;
        while (p) {
            verifyEntry(p, (unsigned char*)mixlist[i], strlen(mixlist[i]));
            p = lpNext(lp, p);
            i++;
        }
        lpFree(lp);
    }
    
    TEST("Iterate list from 1 to end") {
        lp = createList();
        i = 1;
        p = lpSeek(lp, i);
        while (p) {
            verifyEntry(p, (unsigned char*)mixlist[i], strlen(mixlist[i]));
            p = lpNext(lp, p);
            i++;
        }
        lpFree(lp);
    }
    
    TEST("Iterate list from 2 to end") {
        lp = createList();
        i = 2;
        p = lpSeek(lp, i);
        while (p) {
            verifyEntry(p, (unsigned char*)mixlist[i], strlen(mixlist[i]));
            p = lpNext(lp, p);
            i++;
        }
        lpFree(lp);
    }
    
    TEST("Iterate from back to front") {
        lp = createList();
        p = lpLast(lp);
        i = 3;
        while (p) {
            verifyEntry(p, (unsigned char*)mixlist[i], strlen(mixlist[i]));
            p = lpPrev(lp, p);
            i--;
        }
        lpFree(lp);
    }
    
    TEST("Iterate from back to front, deleting all items") {
        lp = createList();
        p = lpLast(lp);
        i = 3;
        while ((p = lpLast(lp))) {
            verifyEntry(p, (unsigned char*)mixlist[i], strlen(mixlist[i]));
            lp = lpDelete(lp, p, &p);
            assert(p == NULL);
            i--;
        }
        lpFree(lp);
    }

    TEST("Delete whole listpack when num == -1");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 0, -1);
        assert(lpLength(lp) == 0);
        assert(lp[LP_HDR_SIZE] == LP_EOF);
        assert(lpBytes(lp) == (LP_HDR_SIZE + 1));
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpFirst(lp);
        lp = lpDeleteRangeWithEntry(lp, &ptr, -1);
        assert(lpLength(lp) == 0);
        assert(lp[LP_HDR_SIZE] == LP_EOF);
        assert(lpBytes(lp) == (LP_HDR_SIZE + 1));
        zfree(lp);
    }

    TEST("Delete whole listpack with negative index");
    {
        lp = createList();
        lp = lpDeleteRange(lp, -4, 4);
        assert(lpLength(lp) == 0);
        assert(lp[LP_HDR_SIZE] == LP_EOF);
        assert(lpBytes(lp) == (LP_HDR_SIZE + 1));
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpSeek(lp, -4);
        lp = lpDeleteRangeWithEntry(lp, &ptr, 4);
        assert(lpLength(lp) == 0);
        assert(lp[LP_HDR_SIZE] == LP_EOF);
        assert(lpBytes(lp) == (LP_HDR_SIZE + 1));
        zfree(lp);
    }

    TEST("Delete inclusive range 0,0");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 0, 1);
        assert(lpLength(lp) == 3);
        assert(lpSkip(lpLast(lp))[0] == LP_EOF); /* check set LP_EOF correctly */
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpFirst(lp);
        lp = lpDeleteRangeWithEntry(lp, &ptr, 1);
        assert(lpLength(lp) == 3);
        assert(lpSkip(lpLast(lp))[0] == LP_EOF); /* check set LP_EOF correctly */
        zfree(lp);
    }

    TEST("Delete inclusive range 0,1");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 0, 2);
        assert(lpLength(lp) == 2);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[2], strlen(mixlist[2]));
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpFirst(lp);
        lp = lpDeleteRangeWithEntry(lp, &ptr, 2);
        assert(lpLength(lp) == 2);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[2], strlen(mixlist[2]));
        zfree(lp);
    }

    TEST("Delete inclusive range 1,2");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 1, 2);
        assert(lpLength(lp) == 2);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[0], strlen(mixlist[0]));
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpSeek(lp, 1);
        lp = lpDeleteRangeWithEntry(lp, &ptr, 2);
        assert(lpLength(lp) == 2);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[0], strlen(mixlist[0]));
        zfree(lp);
    }
    
    TEST("Delete with start index out of range");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 5, 1);
        assert(lpLength(lp) == 4);
        zfree(lp);
    }

    TEST("Delete with num overflow");
    {
        lp = createList();
        lp = lpDeleteRange(lp, 1, 5);
        assert(lpLength(lp) == 1);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[0], strlen(mixlist[0]));
        zfree(lp);

        lp = createList();
        unsigned char *ptr = lpSeek(lp, 1);
        lp = lpDeleteRangeWithEntry(lp, &ptr, 5);
        assert(lpLength(lp) == 1);
        verifyEntry(lpFirst(lp), (unsigned char*)mixlist[0], strlen(mixlist[0]));
        zfree(lp);
    }

    TEST("Delete foo while iterating") {
        lp = createList();
        p = lpFirst(lp);
        while (p) {
            if (lpCompare(p, (unsigned char*)"foo", 3)) {
                lp = lpDelete(lp, p, &p);
            } else {
                p = lpNext(lp, p);
            }
        }
        lpFree(lp);
    }

    TEST("Replace with same size") {
        lp = createList(); /* "hello", "foo", "quux", "1024" */
        unsigned char *orig_lp = lp;
        p = lpSeek(lp, 0);
        lp = lpReplace(lp, &p, (unsigned char*)"zoink", 5);
        p = lpSeek(lp, 3);
        lp = lpReplace(lp, &p, (unsigned char*)"y", 1);
        p = lpSeek(lp, 1);
        lp = lpReplace(lp, &p, (unsigned char*)"65536", 5);
        p = lpSeek(lp, 0);
        assert(!memcmp((char*)p,
                       "\x85zoink\x06"
                       "\xf2\x00\x00\x01\x04" /* 65536 as int24 */
                       "\x84quux\05" "\x81y\x02" "\xff",
                       22));
        assert(lp == orig_lp); /* no reallocations have happened */
        lpFree(lp);
    }

    TEST("Replace with different size") {
        lp = createList(); /* "hello", "foo", "quux", "1024" */
        p = lpSeek(lp, 1);
        lp = lpReplace(lp, &p, (unsigned char*)"squirrel", 8);
        p = lpSeek(lp, 0);
        assert(!strncmp((char*)p,
                        "\x85hello\x06" "\x88squirrel\x09" "\x84quux\x05"
                        "\xc4\x00\x02" "\xff",
                        27));
        lpFree(lp);
    }

    TEST("Regression test for >255 byte strings") {
        char v1[257] = {0}, v2[257] = {0};
        memset(v1,'x',256);
        memset(v2,'y',256);
        lp = lpNew(0);
        lp = lpAppend(lp, (unsigned char*)v1 ,strlen(v1));
        lp = lpAppend(lp, (unsigned char*)v2 ,strlen(v2));

        /* Pop values again and compare their value. */
        p = lpFirst(lp);
        vstr = lpGet(p, &vlen, NULL);
        assert(strncmp(v1, (char*)vstr, vlen) == 0);
        p = lpSeek(lp, 1);
        vstr = lpGet(p, &vlen, NULL);
        assert(strncmp(v2, (char*)vstr, vlen) == 0);
        lpFree(lp);
    }

    TEST("Create long list and check indices") {
        lp = lpNew(0);
        char buf[32];
        int i,len;
        for (i = 0; i < 1000; i++) {
            len = sprintf(buf, "%d", i);
            lp = lpAppend(lp, (unsigned char*)buf, len);
        }
        for (i = 0; i < 1000; i++) {
            p = lpSeek(lp, i);
            vstr = lpGet(p, &vlen, NULL);
            assert(i == vlen);

            p = lpSeek(lp, -i-1);
            vstr = lpGet(p, &vlen, NULL);
            assert(999-i == vlen);
        }
        lpFree(lp);
    }

    TEST("Compare strings with listpack entries") {
        lp = createList();
        p = lpSeek(lp,0);
        assert(lpCompare(p,(unsigned char*)"hello",5));
        assert(!lpCompare(p,(unsigned char*)"hella",5));

        p = lpSeek(lp,3);
        assert(lpCompare(p,(unsigned char*)"1024",4));
        assert(!lpCompare(p,(unsigned char*)"1025",4));
        lpFree(lp);
    }

    TEST("lpMerge two empty listpacks") {
        unsigned char *lp1 = lpNew(0);
        unsigned char *lp2 = lpNew(0);

        /* Merge two empty listpacks, get empty result back. */
        lp1 = lpMerge(&lp1, &lp2);
        assert(lpLength(lp1) == 0);
        zfree(lp1);
    }

    TEST("lpMerge two listpacks - first larger than second") {
        unsigned char *lp1 = createIntList();
        unsigned char *lp2 = createList();

        size_t lp1_bytes = lpBytes(lp1);
        size_t lp2_bytes = lpBytes(lp2);
        unsigned long lp1_len = lpLength(lp1);
        unsigned long lp2_len = lpLength(lp2);

        unsigned char *lp3 = lpMerge(&lp1, &lp2);
        assert(lp3 == lp1);
        assert(lp2 == NULL);
        assert(lpLength(lp3) == (lp1_len + lp2_len));
        assert(lpBytes(lp3) == (lp1_bytes + lp2_bytes - LP_HDR_SIZE - 1));
        verifyEntry(lpSeek(lp3, 0), (unsigned char*)"4294967296", 10);
        verifyEntry(lpSeek(lp3, 5), (unsigned char*)"much much longer non integer", 28);
        verifyEntry(lpSeek(lp3, 6), (unsigned char*)"hello", 5);
        verifyEntry(lpSeek(lp3, -1), (unsigned char*)"1024", 4);
        zfree(lp3);
    }

    TEST("lpMerge two listpacks - second larger than first") {
        unsigned char *lp1 = createList();
        unsigned char *lp2 = createIntList();

        size_t lp1_bytes = lpBytes(lp1);
        size_t lp2_bytes = lpBytes(lp2);
        unsigned long lp1_len = lpLength(lp1);
        unsigned long lp2_len = lpLength(lp2);

        unsigned char *lp3 = lpMerge(&lp1, &lp2);
        assert(lp3 == lp2);
        assert(lp1 == NULL);
        assert(lpLength(lp3) == (lp1_len + lp2_len));
        assert(lpBytes(lp3) == (lp1_bytes + lp2_bytes - LP_HDR_SIZE - 1));
        verifyEntry(lpSeek(lp3, 0), (unsigned char*)"hello", 5);
        verifyEntry(lpSeek(lp3, 3), (unsigned char*)"1024", 4);
        verifyEntry(lpSeek(lp3, 4), (unsigned char*)"4294967296", 10);
        verifyEntry(lpSeek(lp3, -1), (unsigned char*)"much much longer non integer", 28);
        zfree(lp3);
    }

    TEST("Random pair with one element") {
        listpackEntry key, val;
        unsigned char *lp = lpNew(0);
        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        lpRandomPair(lp, 1, &key, &val);
        assert(memcmp(key.sval, "abc", key.slen) == 0);
        assert(val.lval == 123);
        lpFree(lp);
    }

    TEST("Random pair with many elements") {
        listpackEntry key, val;
        unsigned char *lp = lpNew(0);
        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        lp = lpAppend(lp, (unsigned char*)"456", 3);
        lp = lpAppend(lp, (unsigned char*)"def", 3);
        lpRandomPair(lp, 2, &key, &val);
        if (key.sval) {
            assert(!memcmp(key.sval, "abc", key.slen));
            assert(key.slen == 3);
            assert(val.lval == 123);
        }
        if (!key.sval) {
            assert(key.lval == 456);
            assert(!memcmp(val.sval, "def", val.slen));
        }
        lpFree(lp);
    }

    TEST("Random pairs with one element") {
        int count = 5;
        unsigned char *lp = lpNew(0);
        listpackEntry *keys = zmalloc(sizeof(listpackEntry) * count);
        listpackEntry *vals = zmalloc(sizeof(listpackEntry) * count);

        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        lpRandomPairs(lp, count, keys, vals);
        assert(memcmp(keys[4].sval, "abc", keys[4].slen) == 0);
        assert(vals[4].lval == 123);
        zfree(keys);
        zfree(vals);
        lpFree(lp);
    }

    TEST("Random pairs with many elements") {
        int count = 5;
        lp = lpNew(0);
        listpackEntry *keys = zmalloc(sizeof(listpackEntry) * count);
        listpackEntry *vals = zmalloc(sizeof(listpackEntry) * count);

        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        lp = lpAppend(lp, (unsigned char*)"456", 3);
        lp = lpAppend(lp, (unsigned char*)"def", 3);
        lpRandomPairs(lp, count, keys, vals);
        for (int i = 0; i < count; i++) {
            if (keys[i].sval) {
                assert(!memcmp(keys[i].sval, "abc", keys[i].slen));
                assert(keys[i].slen == 3);
                assert(vals[i].lval == 123);
            }
            if (!keys[i].sval) {
                assert(keys[i].lval == 456);
                assert(!memcmp(vals[i].sval, "def", vals[i].slen));
            }
        }
        zfree(keys);
        zfree(vals);
        lpFree(lp);
    }

    TEST("Random pairs unique with one element") {
        unsigned picked;
        int count = 5;
        lp = lpNew(0);
        listpackEntry *keys = zmalloc(sizeof(listpackEntry) * count);
        listpackEntry *vals = zmalloc(sizeof(listpackEntry) * count);

        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        picked = lpRandomPairsUnique(lp, count, keys, vals);
        assert(picked == 1);
        assert(memcmp(keys[0].sval, "abc", keys[0].slen) == 0);
        assert(vals[0].lval == 123);
        zfree(keys);
        zfree(vals);
        lpFree(lp);
    }

    TEST("Random pairs unique with many elements") {
        unsigned picked;
        int count = 5;
        lp = lpNew(0);
        listpackEntry *keys = zmalloc(sizeof(listpackEntry) * count);
        listpackEntry *vals = zmalloc(sizeof(listpackEntry) * count);

        lp = lpAppend(lp, (unsigned char*)"abc", 3);
        lp = lpAppend(lp, (unsigned char*)"123", 3);
        lp = lpAppend(lp, (unsigned char*)"456", 3);
        lp = lpAppend(lp, (unsigned char*)"def", 3);
        picked = lpRandomPairsUnique(lp, count, keys, vals);
        assert(picked == 2);
        for (int i = 0; i < 2; i++) {
            if (keys[i].sval) {
                assert(!memcmp(keys[i].sval, "abc", keys[i].slen));
                assert(keys[i].slen == 3);
                assert(vals[i].lval == 123);
            }
            if (!keys[i].sval) {
                assert(keys[i].lval == 456);
                assert(!memcmp(vals[i].sval, "def", vals[i].slen));
            }
        }
        zfree(keys);
        zfree(vals);
        lpFree(lp);
    }

    TEST("push various encodings") {
        lp = lpNew(0);

        /* Push integer encode element using lpAppend */
        lp = lpAppend(lp, (unsigned char*)"127", 3);
        assert(LP_ENCODING_IS_7BIT_UINT(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)"4095", 4);
        assert(LP_ENCODING_IS_13BIT_INT(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)"32767", 5);
        assert(LP_ENCODING_IS_16BIT_INT(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)"8388607", 7);
        assert(LP_ENCODING_IS_24BIT_INT(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)"2147483647", 10);
        assert(LP_ENCODING_IS_32BIT_INT(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)"9223372036854775807", 19);
        assert(LP_ENCODING_IS_64BIT_INT(lpLast(lp)[0]));

        /* Push integer encode element using lpAppendInteger */
        lp = lpAppendInteger(lp, 127);
        assert(LP_ENCODING_IS_7BIT_UINT(lpLast(lp)[0]));
        verifyEntry(lpLast(lp), (unsigned char*)"127", 3);
        lp = lpAppendInteger(lp, 4095);
        verifyEntry(lpLast(lp), (unsigned char*)"4095", 4);
        assert(LP_ENCODING_IS_13BIT_INT(lpLast(lp)[0]));
        lp = lpAppendInteger(lp, 32767);
        verifyEntry(lpLast(lp), (unsigned char*)"32767", 5);
        assert(LP_ENCODING_IS_16BIT_INT(lpLast(lp)[0]));
        lp = lpAppendInteger(lp, 8388607);
        verifyEntry(lpLast(lp), (unsigned char*)"8388607", 7);
        assert(LP_ENCODING_IS_24BIT_INT(lpLast(lp)[0]));
        lp = lpAppendInteger(lp, 2147483647);
        verifyEntry(lpLast(lp), (unsigned char*)"2147483647", 10);
        assert(LP_ENCODING_IS_32BIT_INT(lpLast(lp)[0]));
        lp = lpAppendInteger(lp, 9223372036854775807);
        verifyEntry(lpLast(lp), (unsigned char*)"9223372036854775807", 19);
        assert(LP_ENCODING_IS_64BIT_INT(lpLast(lp)[0]));

        /* string encode */
        unsigned char *str = zmalloc(65535);
        memset(str, 0, 65535);
        lp = lpAppend(lp, (unsigned char*)str, 63);
        assert(LP_ENCODING_IS_6BIT_STR(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)str, 4095);
        assert(LP_ENCODING_IS_12BIT_STR(lpLast(lp)[0]));
        lp = lpAppend(lp, (unsigned char*)str, 65535);
        assert(LP_ENCODING_IS_32BIT_STR(lpLast(lp)[0]));
        zfree(str);
        lpFree(lp);
    }

    TEST("Test lpFind") {
        lp = createList();
        assert(lpFind(lp, lpFirst(lp), (unsigned char*)"abc", 3, 0) == NULL);
        verifyEntry(lpFind(lp, lpFirst(lp), (unsigned char*)"hello", 5, 0), (unsigned char*)"hello", 5);
        verifyEntry(lpFind(lp, lpFirst(lp), (unsigned char*)"1024", 4, 0), (unsigned char*)"1024", 4);
        lpFree(lp);
    }

    TEST("Test lpValidateIntegrity") {
        lp = createList();
        long count = 0;
        assert(lpValidateIntegrity(lp, lpBytes(lp), 1, lpValidation, &count) == 1);
        lpFree(lp);
    }

    TEST("Test number of elements exceeds LP_HDR_NUMELE_UNKNOWN") {
        lp = lpNew(0);
        for (int i = 0; i < LP_HDR_NUMELE_UNKNOWN + 1; i++)
            lp = lpAppend(lp, (unsigned char*)"1", 1);

        assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN);
        assert(lpLength(lp) == LP_HDR_NUMELE_UNKNOWN+1);

        lp = lpDeleteRange(lp, -2, 2);
        assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN);
        assert(lpLength(lp) == LP_HDR_NUMELE_UNKNOWN-1);
        assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN-1); /* update length after lpLength */
        lpFree(lp);
    }

    TEST("Stress with random payloads of different encoding") {
        unsigned long long start = usec();
        int i,j,len,where;
        unsigned char *p;
        char buf[1024];
        int buflen;
        list *ref;
        listNode *refnode;

        int iteration = accurate ? 20000 : 20;
        for (i = 0; i < iteration; i++) {
            lp = lpNew(0);
            ref = listCreate();
            listSetFreeMethod(ref,(void (*)(void*))sdsfree);
            len = rand() % 256;

            /* Create lists */
            for (j = 0; j < len; j++) {
                where = (rand() & 1) ? 0 : 1;
                if (rand() % 2) {
                    buflen = randstring(buf,1,sizeof(buf)-1);
                } else {
                    switch(rand() % 3) {
                    case 0:
                        buflen = sprintf(buf,"%lld",(0LL + rand()) >> 20);
                        break;
                    case 1:
                        buflen = sprintf(buf,"%lld",(0LL + rand()));
                        break;
                    case 2:
                        buflen = sprintf(buf,"%lld",(0LL + rand()) << 20);
                        break;
                    default:
                        assert(NULL);
                    }
                }

                /* Add to listpack */
                if (where == 0) {
                    lp = lpPrepend(lp, (unsigned char*)buf, buflen);
                } else {
                    lp = lpAppend(lp, (unsigned char*)buf, buflen);
                }

                /* Add to reference list */
                if (where == 0) {
                    listAddNodeHead(ref,sdsnewlen(buf, buflen));
                } else if (where == 1) {
                    listAddNodeTail(ref,sdsnewlen(buf, buflen));
                } else {
                    assert(NULL);
                }
            }

            assert(listLength(ref) == lpLength(lp));
            for (j = 0; j < len; j++) {
                /* Naive way to get elements, but similar to the stresser
                 * executed from the Tcl test suite. */
                p = lpSeek(lp,j);
                refnode = listIndex(ref,j);

                vstr = lpGet(p, &vlen, intbuf);
                assert(memcmp(vstr,listNodeValue(refnode),vlen) == 0);
            }
            lpFree(lp);
            listRelease(ref);
        }
        printf("Done. usec=%lld\n\n", usec()-start);
    }

    TEST("Stress with variable listpack size") {
        unsigned long long start = usec();
        int maxsize = accurate ? 16384 : 16;
        stress(0,100000,maxsize,256);
        stress(1,100000,maxsize,256);
        printf("Done. usec=%lld\n\n", usec()-start);
    }

    /* Benchmarks */
    {
        int iteration = accurate ? 100000 : 100;
        lp = lpNew(0);
        TEST("Benchmark lpAppend") {
            unsigned long long start = usec();
            for (int i=0; i<iteration; i++) {
                char buf[4096] = "asdf";
                lp = lpAppend(lp, (unsigned char*)buf, 4);
                lp = lpAppend(lp, (unsigned char*)buf, 40);
                lp = lpAppend(lp, (unsigned char*)buf, 400);
                lp = lpAppend(lp, (unsigned char*)buf, 4000);
                lp = lpAppend(lp, (unsigned char*)"1", 1);
                lp = lpAppend(lp, (unsigned char*)"10", 2);
                lp = lpAppend(lp, (unsigned char*)"100", 3);
                lp = lpAppend(lp, (unsigned char*)"1000", 4);
                lp = lpAppend(lp, (unsigned char*)"10000", 5);
                lp = lpAppend(lp, (unsigned char*)"100000", 6);
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpFind string") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                unsigned char *fptr = lpFirst(lp);
                fptr = lpFind(lp, fptr, (unsigned char*)"nothing", 7, 1);
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpFind number") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                unsigned char *fptr = lpFirst(lp);
                fptr = lpFind(lp, fptr, (unsigned char*)"99999", 5, 1);
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpSeek") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                lpSeek(lp, 99999);
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpValidateIntegrity") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                lpValidateIntegrity(lp, lpBytes(lp), 1, NULL, NULL);
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpCompare with string") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                unsigned char *eptr = lpSeek(lp,0);
                while (eptr != NULL) {
                    lpCompare(eptr,(unsigned char*)"nothing",7);
                    eptr = lpNext(lp,eptr);
                }
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        TEST("Benchmark lpCompare with number") {
            unsigned long long start = usec();
            for (int i = 0; i < 2000; i++) {
                unsigned char *eptr = lpSeek(lp,0);
                while (eptr != NULL) {
                    lpCompare(lp, (unsigned char*)"99999", 5);
                    eptr = lpNext(lp,eptr);
                }
            }
            printf("Done. usec=%lld\n", usec()-start);
        }

        lpFree(lp);
    }

    return 0;
}

#endif


================================================
FILE: src/redis/listpack.h
================================================
/* Listpack -- A lists of strings serialization format
 *
 * This file implements the specification you can find at:
 *
 *  https://github.com/antirez/listpack
 *
 * Copyright (c) 2017, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __LISTPACK_H
#define __LISTPACK_H

#include <stdlib.h>
#include <stdint.h>

#define LP_INTBUF_SIZE 21 /* 20 digits of -2^63 + 1 null term = 21. */

/* lpInsert() where argument possible values: */
#define LP_BEFORE 0
#define LP_AFTER 1
#define LP_REPLACE 2

/* Each entry in the listpack is either a string or an integer. */
typedef struct {
    /* When string is used, it is provided with the length (slen). */
    unsigned char *sval;
    uint32_t slen;
    /* When integer is used, 'sval' is NULL, and lval holds the value. */
    long long lval;
} listpackEntry;

unsigned char *lpNew(size_t capacity);
void lpFree(unsigned char *lp);
unsigned char* lpShrinkToFit(unsigned char *lp);
unsigned char *lpInsertString(unsigned char *lp, const unsigned char *s, uint32_t slen,
                              unsigned char *p, int where, unsigned char **newp);
unsigned char *lpPrepend(unsigned char *lp, const unsigned char *s, uint32_t slen);
unsigned char *lpPrependInteger(unsigned char *lp, long long lval);
unsigned char *lpAppend(unsigned char *lp, const unsigned char *s, uint32_t slen);
unsigned char *lpAppendInteger(unsigned char *lp, long long lval);
unsigned char *lpInsertInteger(unsigned char *lp, long long lval, unsigned char *p, int where,
                               unsigned char **newp);
unsigned char *lpReplace(unsigned char *lp, unsigned char **p, const unsigned char *s, uint32_t slen);
unsigned char *lpReplaceInteger(unsigned char *lp, unsigned char **p, long long lval);
unsigned char *lpDelete(unsigned char *lp, unsigned char *p, unsigned char **newp);
unsigned char *lpDeleteRangeWithEntry(unsigned char *lp, unsigned char **p, unsigned long num);
unsigned char *lpDeleteRange(unsigned char *lp, long index, unsigned long num);
unsigned char *lpMerge(unsigned char **first, unsigned char **second);
unsigned long lpLength(unsigned char *lp);
unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf);

// Fills count and returns 1 if the item is an integer, 0 otherwise.
int lpGetInteger(unsigned char *p, int64_t *ival);
int lpStringToInt64(const char *s, unsigned long slen, int64_t *value);

unsigned char *lpGetValue(unsigned char *p, unsigned int *slen, long long *lval);
unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, uint32_t slen, unsigned int skip);
unsigned char *lpFirst(unsigned char *lp);
unsigned char *lpLast(unsigned char *lp);
unsigned char *lpNext(unsigned char *lp, unsigned char *p);
unsigned char *lpPrev(unsigned char *lp, unsigned char *p);
size_t lpBytes(unsigned char *lp);
unsigned char *lpSeek(unsigned char *lp, long index);
typedef int (*listpackValidateEntryCB)(unsigned char *p, unsigned int head_count, void *userdata);
int lpValidateIntegrity(unsigned char *lp, size_t size, int deep,
                        listpackValidateEntryCB entry_cb, void *cb_userdata);
unsigned char *lpValidateFirst(unsigned char *lp);
int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes);
unsigned int lpCompare(unsigned char *p, const unsigned char *s, uint32_t slen);
void lpRandomPair(unsigned char *lp, unsigned long total_count, listpackEntry *key, listpackEntry *val);
void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals);
unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals);
int lpSafeToAdd(unsigned char* lp, size_t add);
void lpRepr(unsigned char *lp);

#ifdef REDIS_TEST
int listpackTest(int argc, char *argv[], int flags);
#endif

#endif


================================================
FILE: src/redis/lua/CMakeLists.txt
================================================
add_library(lua_modules STATIC
    cjson/fpconv.c cjson/strbuf.c cjson/lua_cjson.c
    cmsgpack/lua_cmsgpack.c
    struct/lua_struct.c
    bit/bit.c
)

target_compile_options(lua_modules PRIVATE
    -Wno-sign-compare -Wno-misleading-indentation -Wno-implicit-fallthrough -Wno-undefined-inline
    -Wno-stringop-overflow)

target_link_libraries(lua_modules TRDP::lua)


================================================
FILE: src/redis/lua/README.md
================================================
Since version 5.2 `luaL_register` is deprecated and removed. The new `luaL_newlib` function doesn't make the module globally available upon registration and is ment to be used with the `require` function.

To provide the modules globally, `luaL_newlib` is followed by a `lua_setglobal` for bit and struct.


================================================
FILE: src/redis/lua/bit/bit.c
================================================
/*
** Lua BitOp -- a bit operations library for Lua 5.1/5.2.
** http://bitop.luajit.org/
**
** Copyright (C) 2008-2012 Mike Pall. All rights reserved.
**
** Permission is hereby granted, free of charge, to any person obtaining
** a copy of this software and associated documentation files (the
** "Software"), to deal in the Software without restriction, including
** without limitation the rights to use, copy, modify, merge, publish,
** distribute, sublicense, and/or sell copies of the Software, and to
** permit persons to whom the Software is furnished to do so, subject to
** the following conditions:
**
** The above copyright notice and this permission notice shall be
** included in all copies or substantial portions of the Software.
**
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
*/

#define LUA_BITOP_VERSION	"1.0.3"

#define LUA_LIB
#include "lua.h"
#include "lauxlib.h"

#ifdef _MSC_VER
/* MSVC is stuck in the last century and doesn't have C99's stdint.h. */
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

typedef int32_t SBits;
typedef uint32_t UBits;

typedef union {
  lua_Number n;
#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_FLOAT_DOUBLE)
  uint64_t b;
#else
  UBits b;
#endif
} BitNum;

/* Convert argument to bit type. */
static UBits barg(lua_State *L, int idx)
{
  BitNum bn;
  UBits b;
#if LUA_VERSION_NUM < 502
  bn.n = lua_tonumber(L, idx);
#else
  bn.n = luaL_checknumber(L, idx);
#endif
#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_FLOAT_DOUBLE)
  bn.n += 6755399441055744.0;  /* 2^52+2^51 */
#ifdef SWAPPED_DOUBLE
  b = (UBits)(bn.b >> 32);
#else
  b = (UBits)bn.b;
#endif
#elif defined(LUA_NUMBER_INT)       || defined(LUA_INT_INT) || \
      defined(LUA_NUMBER_LONG)      || defined(LUA_INT_LONG) || \
      defined(LUA_NUMBER_LONGLONG)  || defined(LUA_INT_LONGLONG) || \
      defined(LUA_NUMBER_LONG_LONG) || defined(LUA_NUMBER_LLONG)
  if (sizeof(UBits) == sizeof(lua_Number))
    b = bn.b;
  else
    b = (UBits)(SBits)bn.n;
#elif defined(LUA_NUMBER_FLOAT) || defined(LUA_FLOAT_FLOAT)
#error "A 'float' lua_Number type is incompatible with this library"
#else
#error "Unknown number type, check LUA_NUMBER_*, LUA_FLOAT_*, LUA_INT_* in luaconf.h"
#endif
#if LUA_VERSION_NUM < 502
  if (b == 0 && !lua_isnumber(L, idx)) {
    luaL_typerror(L, idx, "number");
  }
#endif
  return b;
}

/* Return bit type. */
#if LUA_VERSION_NUM < 503
#define BRET(b)  lua_pushnumber(L, (lua_Number)(SBits)(b)); return 1;
#else
#define BRET(b)  lua_pushinteger(L, (lua_Integer)(SBits)(b)); return 1;
#endif

static int bit_tobit(lua_State *L) { BRET(barg(L, 1)) }
static int bit_bnot(lua_State *L) { BRET(~barg(L, 1)) }

#define BIT_OP(func, opr) \
  static int func(lua_State *L) { int i; UBits b = barg(L, 1); \
    for (i = lua_gettop(L); i > 1; i--) b opr barg(L, i); BRET(b) }
BIT_OP(bit_band, &=)
BIT_OP(bit_bor, |=)
BIT_OP(bit_bxor, ^=)

#define bshl(b, n)  (b << n)
#define bshr(b, n)  (b >> n)
#define bsar(b, n)  ((SBits)b >> n)
#define brol(b, n)  ((b << n) | (b >> (32-n)))
#define bror(b, n)  ((b << (32-n)) | (b >> n))
#define BIT_SH(func, fn) \
  static int func(lua_State *L) { \
    UBits b = barg(L, 1); UBits n = barg(L, 2) & 31; BRET(fn(b, n)) }
BIT_SH(bit_lshift, bshl)
BIT_SH(bit_rshift, bshr)
BIT_SH(bit_arshift, bsar)
BIT_SH(bit_rol, brol)
BIT_SH(bit_ror, bror)

static int bit_bswap(lua_State *L)
{
  UBits b = barg(L, 1);
  b = (b >> 24) | ((b >> 8) & 0xff00) | ((b & 0xff00) << 8) | (b << 24);
  BRET(b)
}

static int bit_tohex(lua_State *L)
{
  UBits b = barg(L, 1);
  SBits n = lua_isnone(L, 2) ? 8 : (SBits)barg(L, 2);
  const char *hexdigits = "0123456789abcdef";
  char buf[8];
  int i;
  if (n == INT32_MIN) n = INT32_MIN+1;
  if (n < 0) { n = -n; hexdigits = "0123456789ABCDEF"; }
  if (n > 8) n = 8;
  for (i = (int)n; --i >= 0; ) { buf[i] = hexdigits[b & 15]; b >>= 4; }
  lua_pushlstring(L, buf, (size_t)n);
  return 1;
}

static const struct luaL_Reg bit_funcs[] = {
  { "tobit",	bit_tobit },
  { "bnot",	bit_bnot },
  { "band",	bit_band },
  { "bor",	bit_bor },
  { "bxor",	bit_bxor },
  { "lshift",	bit_lshift },
  { "rshift",	bit_rshift },
  { "arshift",	bit_arshift },
  { "rol",	bit_rol },
  { "ror",	bit_ror },
  { "bswap",	bit_bswap },
  { "tohex",	bit_tohex },
  { NULL, NULL }
};

/* Signed right-shifts are implementation-defined per C89/C99.
** But the de facto standard are arithmetic right-shifts on two's
** complement CPUs. This behaviour is required here, so test for it.
*/
#define BAD_SAR		(bsar(-8, 2) != (SBits)-2)

LUALIB_API int luaopen_bit(lua_State *L)
{
  UBits b;
#if LUA_VERSION_NUM < 503
  lua_pushnumber(L, (lua_Number)1437217655L);
#else
	lua_pushinteger(L, (lua_Integer)1437217655L);
#endif
  b = barg(L, -1);
  if (b != (UBits)1437217655L || BAD_SAR) {  /* Perform a simple self-test. */
    const char *msg = "compiled with incompatible luaconf.h";
#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_FLOAT_DOUBLE)
#ifdef _WIN32
    if (b == (UBits)1610612736L)
      msg = "use D3DCREATE_FPU_PRESERVE with DirectX";
#endif
    if (b == (UBits)1127743488L)
      msg = "not compiled with SWAPPED_DOUBLE";
#endif
    if (BAD_SAR)
      msg = "arithmetic right-shift broken";
    luaL_error(L, "bit library self-test failed (%s)", msg);
  }

  luaL_newlib(L, bit_funcs);
  lua_setglobal(L, "bit");

  return 1;
}


================================================
FILE: src/redis/lua/cjson/fpconv.c
================================================
/* fpconv - Floating point conversion routines
 *
 * Copyright (c) 2011-2012  Mark Pulford <mark@kyne.com.au>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/* JSON uses a '.' decimal separator. strtod() / sprintf() under C libraries
 * with locale support will break when the decimal separator is a comma.
 *
 * fpconv_* will around these issues with a translation buffer if required.
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#include "fpconv.h"

/* Lua CJSON assumes the locale is the same for all threads within a
 * process and doesn't change after initialisation.
 *
 * This avoids the need for per thread storage or expensive checks
 * for call. */
static char locale_decimal_point = '.';

/* In theory multibyte decimal_points are possible, but
 * Lua CJSON only supports UTF-8 and known locales only have
 * single byte decimal points ([.,]).
 *
 * localconv() may not be thread safe (=>crash), and nl_langinfo() is
 * not supported on some platforms. Use sprintf() instead - if the
 * locale does change, at least Lua CJSON won't crash. */
static void fpconv_update_locale()
{
    char buf[8];

    snprintf(buf, sizeof(buf), "%g", 0.5);

    /* Failing this test might imply the platform has a buggy dtoa
     * implementation or wide characters */
    if (buf[0] != '0' || buf[2] != '5' || buf[3] != 0) {
        fprintf(stderr, "Error: wide characters found or printf() bug.");
        abort();
    }

    locale_decimal_point = buf[1];
}

/* Check for a valid number character: [-+0-9a-yA-Y.]
 * Eg: -0.6e+5, infinity, 0xF0.F0pF0
 *
 * Used to find the probable end of a number. It doesn't matter if
 * invalid characters are counted - strtod() will find the valid
 * number if it exists.  The risk is that slightly more memory might
 * be allocated before a parse error occurs. */
static inline int valid_number_character(char ch)
{
    char lower_ch;

    if ('0' <= ch && ch <= '9')
        return 1;
    if (ch == '-' || ch == '+' || ch == '.')
        return 1;

    /* Hex digits, exponent (e), base (p), "infinity",.. */
    lower_ch = ch | 0x20;
    if ('a' <= lower_ch && lower_ch <= 'y')
        return 1;

    return 0;
}

/* Calculate the size of the buffer required for a strtod locale
 * conversion. */
static int strtod_buffer_size(const char *s)
{
    const char *p = s;

    while (valid_number_character(*p))
        p++;

    return p - s;
}

/* Similar to strtod(), but must be passed the current locale's decimal point
 * character. Guaranteed to be called at the start of any valid number in a string */
double fpconv_strtod(const char *nptr, char **endptr)
{
    char localbuf[FPCONV_G_FMT_BUFSIZE];
    char *buf, *endbuf, *dp;
    int buflen;
    double value;

    /* System strtod() is fine when decimal point is '.' */
    if (locale_decimal_point == '.')
        return strtod(nptr, endptr);

    buflen = strtod_buffer_size(nptr);
    if (!buflen) {
        /* No valid characters found, standard strtod() return */
        *endptr = (char *)nptr;
        return 0;
    }

    /* Duplicate number into buffer */
    if (buflen >= FPCONV_G_FMT_BUFSIZE) {
        /* Handle unusually large numbers */
        buf = malloc(buflen + 1);
        if (!buf) {
            fprintf(stderr, "Out of memory");
            abort();
        }
    } else {
        /* This is the common case.. */
        buf = localbuf;
    }
    memcpy(buf, nptr, buflen);
    buf[buflen] = 0;

    /* Update decimal point character if found */
    dp = strchr(buf, '.');
    if (dp)
        *dp = locale_decimal_point;

    value = strtod(buf, &endbuf);
    *endptr = (char *)&nptr[endbuf - buf];
    if (buflen >= FPCONV_G_FMT_BUFSIZE)
        free(buf);

    return value;
}

/* "fmt" must point to a buffer of at least 6 characters */
static void set_number_format(char *fmt, int precision)
{
    int d1, d2, i;

    assert(1 <= precision && precision <= 14);

    /* Create printf format (%.14g) from precision */
    d1 = precision / 10;
    d2 = precision % 10;
    fmt[0] = '%';
    fmt[1] = '.';
    i = 2;
    if (d1) {
        fmt[i++] = '0' + d1;
    }
    fmt[i++] = '0' + d2;
    fmt[i++] = 'g';
    fmt[i] = 0;
}

/* Assumes there is always at least 32 characters available in the target buffer */
int fpconv_g_fmt(char *str, double num, int precision)
{
    char buf[FPCONV_G_FMT_BUFSIZE];
    char fmt[6];
    int len;
    char *b;

    set_number_format(fmt, precision);

    /* Pass through when decimal point character is dot. */
    if (locale_decimal_point == '.')
        return snprintf(str, FPCONV_G_FMT_BUFSIZE, fmt, num);

    /* snprintf() to a buffer then translate for other decimal point characters */
    len = snprintf(buf, FPCONV_G_FMT_BUFSIZE, fmt, num);

    /* Copy into target location. Translate decimal point if required */
    b = buf;
    do {
        *str++ = (*b == locale_decimal_point ? '.' : *b);
    } while(*b++);

    return len;
}

void fpconv_init()
{
    fpconv_update_locale();
}

/* vi:ai et sw=4 ts=4:
 */


================================================
FILE: src/redis/lua/cjson/fpconv.h
================================================
/* Lua CJSON floating point conversion routines */

/* Buffer required to store the largest string representation of a double.
 *
 * Longest double printed with %.14g is 21 characters long:
 * -1.7976931348623e+308 */
# define FPCONV_G_FMT_BUFSIZE   32

#ifdef USE_INTERNAL_FPCONV
static inline void fpconv_init()
{
    /* Do nothing - not required */
}
#else
extern void fpconv_init();
#endif

extern int fpconv_g_fmt(char*, double, int);
extern double fpconv_strtod(const char*, char**);

/* vi:ai et sw=4 ts=4:
 */


================================================
FILE: src/redis/lua/cjson/lua_cjson.c
================================================
/* Lua CJSON - JSON support for Lua
 *
 * Copyright (c) 2010-2012  Mark Pulford <mark@kyne.com.au>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/* Caveats:
 * - JSON "null" values are represented as lightuserdata since Lua
 *   tables cannot contain "nil". Compare with cjson.null.
 * - Invalid UTF-8 characters are not detected and will be passed
 *   untouched. If required, UTF-8 error checking should be done
 *   outside this library.
 * - Javascript comments are not part of the JSON spec, and are not
 *   currently supported.
 *
 * Note: Decoding is slower than encoding. Lua spends significant
 *       time (30%) managing tables when parsing JSON since it is
 *       difficult to know object/array sizes ahead of time.
 */

#include <assert.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <lua.h>
#include <lauxlib.h>

#include "strbuf.h"
#include "fpconv.h"

#ifndef CJSON_MODNAME
#define CJSON_MODNAME   "cjson"
#endif

#ifndef CJSON_VERSION
#define CJSON_VERSION   "2.1devel"
#endif

/* Workaround for Solaris platforms missing isinf() */
#if !defined(isinf) && (defined(USE_INTERNAL_ISINF) || defined(MISSING_ISINF))
#define isinf(x) (!isnan(x) && isnan((x) - (x)))
#endif

#define DEFAULT_SPARSE_CONVERT 0
#define DEFAULT_SPARSE_RATIO 2
#define DEFAULT_SPARSE_SAFE 10
#define DEFAULT_ENCODE_MAX_DEPTH 1000
#define DEFAULT_DECODE_MAX_DEPTH 1000
#define DEFAULT_ENCODE_INVALID_NUMBERS 0
#define DEFAULT_DECODE_INVALID_NUMBERS 1
#define DEFAULT_ENCODE_KEEP_BUFFER 1
#define DEFAULT_ENCODE_NUMBER_PRECISION 14

#ifdef DISABLE_INVALID_NUMBERS
#undef DEFAULT_DECODE_INVALID_NUMBERS
#define DEFAULT_DECODE_INVALID_NUMBERS 0
#endif

typedef enum {
    T_OBJ_BEGIN,
    T_OBJ_END,
    T_ARR_BEGIN,
    T_ARR_END,
    T_STRING,
    T_NUMBER,
    T_BOOLEAN,
    T_NULL,
    T_COLON,
    T_COMMA,
    T_END,
    T_WHITESPACE,
    T_ERROR,
    T_UNKNOWN
} json_token_type_t;

static const char *json_token_type_name[] = {
    "T_OBJ_BEGIN",
    "T_OBJ_END",
    "T_ARR_BEGIN",
    "T_ARR_END",
    "T_STRING",
    "T_NUMBER",
    "T_BOOLEAN",
    "T_NULL",
    "T_COLON",
    "T_COMMA",
    "T_END",
    "T_WHITESPACE",
    "T_ERROR",
    "T_UNKNOWN",
    NULL
};

typedef struct {
    json_token_type_t ch2token[256];
    char escape2char[256];  /* Decoding */

    /* encode_buf is only allocated and used when
     * encode_keep_buffer is set */
    strbuf_t encode_buf;

    int encode_sparse_convert;
    int encode_sparse_ratio;
    int encode_sparse_safe;
    int encode_max_depth;
    int encode_invalid_numbers;     /* 2 => Encode as "null" */
    int encode_number_precision;
    int encode_keep_buffer;

    int decode_invalid_numbers;
    int decode_max_depth;
} json_config_t;

typedef struct {
    const char *data;
    const char *ptr;
    strbuf_t *tmp;    /* Temporary storage for strings */
    json_config_t *cfg;
    int current_depth;
} json_parse_t;

typedef struct {
    json_token_type_t type;
    int index;
    union {
        const char *string;
        double number;
        int boolean;
    } value;
    int string_len;
} json_token_t;

static const char *char2escape[256] = {
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",
    "\\b", "\\t", "\\n", "\\u000b",
    "\\f", "\\r", "\\u000e", "\\u000f",
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",
    NULL, NULL, "\\\"", NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, "\\/",
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, "\\\\", NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, "\\u007f",
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};

/* ===== CONFIGURATION ===== */

static json_config_t *json_fetch_config(lua_State *l)
{
    json_config_t *cfg;

    cfg = lua_touserdata(l, lua_upvalueindex(1));
    if (!cfg)
        luaL_error(l, "BUG: Unable to fetch CJSON configuration");

    return cfg;
}

/* Ensure the correct number of arguments have been provided.
 * Pad with nil to allow other functions to simply check arg[i]
 * to find whether an argument was provided */
static json_config_t *json_arg_init(lua_State *l, int args)
{
    luaL_argcheck(l, lua_gettop(l) <= args, args + 1,
                  "found too many arguments");

    while (lua_gettop(l) < args)
        lua_pushnil(l);

    return json_fetch_config(l);
}

/* Process integer options for configuration functions */
static int json_integer_option(lua_State *l, int optindex, int *setting,
                               int min, int max)
{
    char errmsg[64];
    int value;

    if (!lua_isnil(l, optindex)) {
        value = luaL_checkinteger(l, optindex);
        snprintf(errmsg, sizeof(errmsg), "expected integer between %d and %d", min, max);
        luaL_argcheck(l, min <= value && value <= max, 1, errmsg);
        *setting = value;
    }

    lua_pushinteger(l, *setting);

    return 1;
}

/* Process enumerated arguments for a configuration function */
static int json_enum_option(lua_State *l, int optindex, int *setting,
                            const char **options, int bool_true)
{
    static const char *bool_options[] = { "off", "on", NULL };

    if (!options) {
        options = bool_options;
        bool_true = 1;
    }

    if (!lua_isnil(l, optindex)) {
        if (bool_true && lua_isboolean(l, optindex))
            *setting = lua_toboolean(l, optindex) * bool_true;
        else
            *setting = luaL_checkoption(l, optindex, NULL, options);
    }

    if (bool_true && (*setting == 0 || *setting == bool_true))
        lua_pushboolean(l, *setting);
    else
        lua_pushstring(l, options[*setting]);

    return 1;
}

/* Configures handling of extremely sparse arrays:
 * convert: Convert extremely sparse arrays into objects? Otherwise error.
 * ratio: 0: always allow sparse; 1: never allow sparse; >1: use ratio
 * safe: Always use an array when the max index <= safe */
static int json_cfg_encode_sparse_array(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 3);

    json_enum_option(l, 1, &cfg->encode_sparse_convert, NULL, 1);
    json_integer_option(l, 2, &cfg->encode_sparse_ratio, 0, INT_MAX);
    json_integer_option(l, 3, &cfg->encode_sparse_safe, 0, INT_MAX);

    return 3;
}

/* Configures the maximum number of nested arrays/objects allowed when
 * encoding */
static int json_cfg_encode_max_depth(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);

    return json_integer_option(l, 1, &cfg->encode_max_depth, 1, INT_MAX);
}

/* Configures the maximum number of nested arrays/objects allowed when
 * encoding */
static int json_cfg_decode_max_depth(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);

    return json_integer_option(l, 1, &cfg->decode_max_depth, 1, INT_MAX);
}

/* Configures number precision when converting doubles to text */
static int json_cfg_encode_number_precision(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);

    return json_integer_option(l, 1, &cfg->encode_number_precision, 1, 14);
}

/* Configures JSON encoding buffer persistence */
static int json_cfg_encode_keep_buffer(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);
    int old_value;

    old_value = cfg->encode_keep_buffer;

    json_enum_option(l, 1, &cfg->encode_keep_buffer, NULL, 1);

    /* Init / free the buffer if the setting has changed */
    if (old_value ^ cfg->encode_keep_buffer) {
        if (cfg->encode_keep_buffer)
            strbuf_init(&cfg->encode_buf, 0);
        else
            strbuf_free(&cfg->encode_buf);
    }

    return 1;
}

#if defined(DISABLE_INVALID_NUMBERS) && !defined(USE_INTERNAL_FPCONV)
void json_verify_invalid_number_setting(lua_State *l, int *setting)
{
    if (*setting == 1) {
        *setting = 0;
        luaL_error(l, "Infinity, NaN, and/or hexadecimal numbers are not supported.");
    }
}
#else
#define json_verify_invalid_number_setting(l, s)    do { } while(0)
#endif

static int json_cfg_encode_invalid_numbers(lua_State *l)
{
    static const char *options[] = { "off", "on", "null", NULL };
    json_config_t *cfg = json_arg_init(l, 1);

    json_enum_option(l, 1, &cfg->encode_invalid_numbers, options, 1);

    json_verify_invalid_number_setting(l, &cfg->encode_invalid_numbers);

    return 1;
}

static int json_cfg_decode_invalid_numbers(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);

    json_enum_option(l, 1, &cfg->decode_invalid_numbers, NULL, 1);

    json_verify_invalid_number_setting(l, &cfg->encode_invalid_numbers);

    return 1;
}

static int json_destroy_config(lua_State *l)
{
    json_config_t *cfg;

    cfg = lua_touserdata(l, 1);
    if (cfg)
        strbuf_free(&cfg->encode_buf);
    cfg = NULL;

    return 0;
}

static void json_create_config(lua_State *l)
{
    json_config_t *cfg;
    int i;

    cfg = lua_newuserdata(l, sizeof(*cfg));

    /* Create GC method to clean up strbuf */
    lua_newtable(l);
    lua_pushcfunction(l, json_destroy_config);
    lua_setfield(l, -2, "__gc");
    lua_setmetatable(l, -2);

    cfg->encode_sparse_convert = DEFAULT_SPARSE_CONVERT;
    cfg->encode_sparse_ratio = DEFAULT_SPARSE_RATIO;
    cfg->encode_sparse_safe = DEFAULT_SPARSE_SAFE;
    cfg->encode_max_depth = DEFAULT_ENCODE_MAX_DEPTH;
    cfg->decode_max_depth = DEFAULT_DECODE_MAX_DEPTH;
    cfg->encode_invalid_numbers = DEFAULT_ENCODE_INVALID_NUMBERS;
    cfg->decode_invalid_numbers = DEFAULT_DECODE_INVALID_NUMBERS;
    cfg->encode_keep_buffer = DEFAULT_ENCODE_KEEP_BUFFER;
    cfg->encode_number_precision = DEFAULT_ENCODE_NUMBER_PRECISION;

#if DEFAULT_ENCODE_KEEP_BUFFER > 0
    strbuf_init(&cfg->encode_buf, 0);
#endif

    /* Decoding init */

    /* Tag all characters as an error */
    for (i = 0; i < 256; i++)
        cfg->ch2token[i] = T_ERROR;

    /* Set tokens that require no further processing */
    cfg->ch2token['{'] = T_OBJ_BEGIN;
    cfg->ch2token['}'] = T_OBJ_END;
    cfg->ch2token['['] = T_ARR_BEGIN;
    cfg->ch2token[']'] = T_ARR_END;
    cfg->ch2token[','] = T_COMMA;
    cfg->ch2token[':'] = T_COLON;
    cfg->ch2token['\0'] = T_END;
    cfg->ch2token[' '] = T_WHITESPACE;
    cfg->ch2token['\t'] = T_WHITESPACE;
    cfg->ch2token['\n'] = T_WHITESPACE;
    cfg->ch2token['\r'] = T_WHITESPACE;

    /* Update characters that require further processing */
    cfg->ch2token['f'] = T_UNKNOWN;     /* false? */
    cfg->ch2token['i'] = T_UNKNOWN;     /* inf, ininity? */
    cfg->ch2token['I'] = T_UNKNOWN;
    cfg->ch2token['n'] = T_UNKNOWN;     /* null, nan? */
    cfg->ch2token['N'] = T_UNKNOWN;
    cfg->ch2token['t'] = T_UNKNOWN;     /* true? */
    cfg->ch2token['"'] = T_UNKNOWN;     /* string? */
    cfg->ch2token['+'] = T_UNKNOWN;     /* number? */
    cfg->ch2token['-'] = T_UNKNOWN;
    for (i = 0; i < 10; i++)
        cfg->ch2token['0' + i] = T_UNKNOWN;

    /* Lookup table for parsing escape characters */
    for (i = 0; i < 256; i++)
        cfg->escape2char[i] = 0;          /* String error */
    cfg->escape2char['"'] = '"';
    cfg->escape2char['\\'] = '\\';
    cfg->escape2char['/'] = '/';
    cfg->escape2char['b'] = '\b';
    cfg->escape2char['t'] = '\t';
    cfg->escape2char['n'] = '\n';
    cfg->escape2char['f'] = '\f';
    cfg->escape2char['r'] = '\r';
    cfg->escape2char['u'] = 'u';          /* Unicode parsing required */
}

/* ===== ENCODING ===== */

static void json_encode_exception(lua_State *l, json_config_t *cfg, strbuf_t *json, int lindex,
                                  const char *reason)
{
    if (!cfg->encode_keep_buffer)
        strbuf_free(json);
    luaL_error(l, "Cannot serialise %s: %s",
                  lua_typename(l, lua_type(l, lindex)), reason);
}

/* json_append_string args:
 * - lua_State
 * - JSON strbuf
 * - String (Lua stack index)
 *
 * Returns nothing. Doesn't remove string from Lua stack */
static void json_append_string(lua_State *l, strbuf_t *json, int lindex)
{
    const char *escstr;
    int i;
    const char *str;
    size_t len;

    str = lua_tolstring(l, lindex, &len);

    /* Worst case is len * 6 (all unicode escapes).
     * This buffer is reused constantly for small strings
     * If there are any excess pages, they won't be hit anyway.
     * This gains ~5% speedup. */
    strbuf_ensure_empty_length(json, len * 6 + 2);

    strbuf_append_char_unsafe(json, '\"');
    for (i = 0; i < len; i++) {
        escstr = char2escape[(unsigned char)str[i]];
        if (escstr)
            strbuf_append_string(json, escstr);
        else
            strbuf_append_char_unsafe(json, str[i]);
    }
    strbuf_append_char_unsafe(json, '\"');
}

/* Find the size of the array on the top of the Lua stack
 * -1   object (not a pure array)
 * >=0  elements in array
 */
static int lua_array_length(lua_State *l, json_config_t *cfg, strbuf_t *json)
{
    double k;
    int max;
    int items;

    max = 0;
    items = 0;

    lua_pushnil(l);
    /* table, startkey */
    while (lua_next(l, -2) != 0) {
        /* table, key, value */
        if (lua_type(l, -2) == LUA_TNUMBER &&
            (k = lua_tonumber(l, -2))) {
            /* Integer >= 1 ? */
            if (floor(k) == k && k >= 1) {
                if (k > max)
                    max = k;
                items++;
                lua_pop(l, 1);
                continue;
            }
        }

        /* Must not be an array (non integer key) */
        lua_pop(l, 2);
        return -1;
    }

    /* Encode excessively sparse arrays as objects (if enabled) */
    if (cfg->encode_sparse_ratio > 0 &&
        max > items * cfg->encode_sparse_ratio &&
        max > cfg->encode_sparse_safe) {
        if (!cfg->encode_sparse_convert)
            json_encode_exception(l, cfg, json, -1, "excessively sparse array");

        return -1;
    }

    return max;
}

static void json_check_encode_depth(lua_State *l, json_config_t *cfg,
                                    int current_depth, strbuf_t *json)
{
    /* Ensure there are enough slots free to traverse a table (key,
     * value) and push a string for a potential error message.
     *
     * Unlike "decode", the key and value are still on the stack when
     * lua_checkstack() is called.  Hence an extra slot for luaL_error()
     * below is required just in case the next check to lua_checkstack()
     * fails.
     *
     * While this won't cause a crash due to the EXTRA_STACK reserve
     * slots, it would still be an improper use of the API. */
    if (current_depth <= cfg->encode_max_depth && lua_checkstack(l, 3))
        return;

    if (!cfg->encode_keep_buffer)
        strbuf_free(json);

    luaL_error(l, "Cannot serialise, excessive nesting (%d)",
               current_depth);
}

static void json_append_data(lua_State *l, json_config_t *cfg,
                             int current_depth, strbuf_t *json);

/* json_append_array args:
 * - lua_State
 * - JSON strbuf
 * - Size of passwd Lua array (top of stack) */
static void json_append_array(lua_State *l, json_config_t *cfg, int current_depth,
                              strbuf_t *json, int array_length)
{
    int comma, i;

    strbuf_append_char(json, '[');

    comma = 0;
    for (i = 1; i <= array_length; i++) {
        if (comma)
            strbuf_append_char(json, ',');
        else
            comma = 1;

        lua_rawgeti(l, -1, i);
        json_append_data(l, cfg, current_depth, json);
        lua_pop(l, 1);
    }

    strbuf_append_char(json, ']');
}

static void json_append_number(lua_State *l, json_config_t *cfg,
                               strbuf_t *json, int lindex)
{
    double num = lua_tonumber(l, lindex);
    int len;

    if (cfg->encode_invalid_numbers == 0) {
        /* Prevent encoding invalid numbers */
        if (isinf(num) || isnan(num))
            json_encode_exception(l, cfg, json, lindex, "must not be NaN or Inf");
    } else if (cfg->encode_invalid_numbers == 1) {
        /* Encode invalid numbers, but handle "nan" separately
         * since some platforms may encode as "-nan". */
        if (isnan(num)) {
            strbuf_append_mem(json, "nan", 3);
            return;
        }
    } else {
        /* Encode invalid numbers as "null" */
        if (isinf(num) || isnan(num)) {
            strbuf_append_mem(json, "null", 4);
            return;
        }
    }

    strbuf_ensure_empty_length(json, FPCONV_G_FMT_BUFSIZE);
    len = fpconv_g_fmt(strbuf_empty_ptr(json), num, cfg->encode_number_precision);
    strbuf_extend_length(json, len);
}

static void json_append_object(lua_State *l, json_config_t *cfg,
                               int current_depth, strbuf_t *json)
{
    int comma, keytype;

    /* Object */
    strbuf_append_char(json, '{');

    lua_pushnil(l);
    /* table, startkey */
    comma = 0;
    while (lua_next(l, -2) != 0) {
        if (comma)
            strbuf_append_char(json, ',');
        else
            comma = 1;

        /* table, key, value */
        keytype = lua_type(l, -2);
        if (keytype == LUA_TNUMBER) {
            strbuf_append_char(json, '"');
            json_append_number(l, cfg, json, -2);
            strbuf_append_mem(json, "\":", 2);
        } else if (keytype == LUA_TSTRING) {
            json_append_string(l, json, -2);
            strbuf_append_char(json, ':');
        } else {
            json_encode_exception(l, cfg, json, -2,
                                  "table key must be a number or string");
            /* never returns */
        }

        /* table, key, value */
        json_append_data(l, cfg, current_depth, json);
        lua_pop(l, 1);
        /* table, key */
    }

    strbuf_append_char(json, '}');
}

/* Serialise Lua data into JSON string. */
static void json_append_data(lua_State *l, json_config_t *cfg,
                             int current_depth, strbuf_t *json)
{
    int len;

    switch (lua_type(l, -1)) {
    case LUA_TSTRING:
        json_append_string(l, json, -1);
        break;
    case LUA_TNUMBER:
        json_append_number(l, cfg, json, -1);
        break;
    case LUA_TBOOLEAN:
        if (lua_toboolean(l, -1))
            strbuf_append_mem(json, "true", 4);
        else
            strbuf_append_mem(json, "false", 5);
        break;
    case LUA_TTABLE:
        current_depth++;
        json_check_encode_depth(l, cfg, current_depth, json);
        len = lua_array_length(l, cfg, json);
        if (len > 0)
            json_append_array(l, cfg, current_depth, json, len);
        else
            json_append_object(l, cfg, current_depth, json);
        break;
    case LUA_TNIL:
        strbuf_append_mem(json, "null", 4);
        break;
    case LUA_TLIGHTUSERDATA:
        if (lua_touserdata(l, -1) == NULL) {
            strbuf_append_mem(json, "null", 4);
            break;
        }
    default:
        /* Remaining types (LUA_TFUNCTION, LUA_TUSERDATA, LUA_TTHREAD,
         * and LUA_TLIGHTUSERDATA) cannot be serialised */
        json_encode_exception(l, cfg, json, -1, "type not supported");
        /* never returns */
    }
}

static int json_encode(lua_State *l)
{
    json_config_t *cfg = json_fetch_config(l);
    strbuf_t local_encode_buf;
    strbuf_t *encode_buf;
    char *json;
    int len;

    luaL_argcheck(l, lua_gettop(l) == 1, 1, "expected 1 argument");

    if (!cfg->encode_keep_buffer) {
        /* Use private buffer */
        encode_buf = &local_encode_buf;
        strbuf_init(encode_buf, 0);
    } else {
        /* Reuse existing buffer */
        encode_buf = &cfg->encode_buf;
        strbuf_reset(encode_buf);
    }

    json_append_data(l, cfg, 0, encode_buf);
    json = strbuf_string(encode_buf, &len);

    lua_pushlstring(l, json, len);

    if (!cfg->encode_keep_buffer)
        strbuf_free(encode_buf);

    return 1;
}

/* ===== DECODING ===== */

static void json_process_value(lua_State *l, json_parse_t *json,
                               json_token_t *token);

static int hexdigit2int(char hex)
{
    if ('0' <= hex  && hex <= '9')
        return hex - '0';

    /* Force lowercase */
    hex |= 0x20;
    if ('a' <= hex && hex <= 'f')
        return 10 + hex - 'a';

    return -1;
}

static int decode_hex4(const char *hex)
{
    int digit[4];
    int i;

    /* Convert ASCII hex digit to numeric digit
     * Note: this returns an error for invalid hex digits, including
     *       NULL */
    for (i = 0; i < 4; i++) {
        digit[i] = hexdigit2int(hex[i]);
        if (digit[i] < 0) {
            return -1;
        }
    }

    return (digit[0] << 12) +
           (digit[1] << 8) +
           (digit[2] << 4) +
            digit[3];
}

/* Converts a Unicode codepoint to UTF-8.
 * Returns UTF-8 string length, and up to 4 bytes in *utf8 */
static int codepoint_to_utf8(char *utf8, int codepoint)
{
    /* 0xxxxxxx */
    if (codepoint <= 0x7F) {
        utf8[0] = codepoint;
        return 1;
    }

    /* 110xxxxx 10xxxxxx */
    if (codepoint <= 0x7FF) {
        utf8[0] = (codepoint >> 6) | 0xC0;
        utf8[1] = (codepoint & 0x3F) | 0x80;
        return 2;
    }

    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if (codepoint <= 0xFFFF) {
        utf8[0] = (codepoint >> 12) | 0xE0;
        utf8[1] = ((codepoint >> 6) & 0x3F) | 0x80;
        utf8[2] = (codepoint & 0x3F) | 0x80;
        return 3;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (codepoint <= 0x1FFFFF) {
        utf8[0] = (codepoint >> 18) | 0xF0;
        utf8[1] = ((codepoint >> 12) & 0x3F) | 0x80;
        utf8[2] = ((codepoint >> 6) & 0x3F) | 0x80;
        utf8[3] = (codepoint & 0x3F) | 0x80;
        return 4;
    }

    return 0;
}


/* Called when index pointing to beginning of UTF-16 code escape: \uXXXX
 * \u is guaranteed to exist, but the remaining hex characters may be
 * missing.
 * Translate to UTF-8 and append to temporary token string.
 * Must advance index to the next character to be processed.
 * Returns: 0   success
 *          -1  error
 */
static int json_append_unicode_escape(json_parse_t *json)
{
    char utf8[4];       /* Surrogate pairs require 4 UTF-8 bytes */
    int codepoint;
    int surrogate_low;
    int len;
    int escape_len = 6;

    /* Fetch UTF-16 code unit */
    codepoint = decode_hex4(json->ptr + 2);
    if (codepoint < 0)
        return -1;

    /* UTF-16 surrogate pairs take the following 2 byte form:
     *      11011 x yyyyyyyyyy
     * When x = 0: y is the high 10 bits of the codepoint
     *      x = 1: y is the low 10 bits of the codepoint
     *
     * Check for a surrogate pair (high or low) */
    if ((codepoint & 0xF800) == 0xD800) {
        /* Error if the 1st surrogate is not high */
        if (codepoint & 0x400)
            return -1;

        /* Ensure the next code is a unicode escape */
        if (*(json->ptr + escape_len) != '\\' ||
            *(json->ptr + escape_len + 1) != 'u') {
            return -1;
        }

        /* Fetch the next codepoint */
        surrogate_low = decode_hex4(json->ptr + 2 + escape_len);
        if (surrogate_low < 0)
            return -1;

        /* Error if the 2nd code is not a low surrogate */
        if ((surrogate_low & 0xFC00) != 0xDC00)
            return -1;

        /* Calculate Unicode codepoint */
        codepoint = (codepoint & 0x3FF) << 10;
        surrogate_low &= 0x3FF;
        codepoint = (codepoint | surrogate_low) + 0x10000;
        escape_len = 12;
    }

    /* Convert codepoint to UTF-8 */
    len = codepoint_to_utf8(utf8, codepoint);
    if (!len)
        return -1;

    /* Append bytes and advance parse index */
    strbuf_append_mem_unsafe(json->tmp, utf8, len);
    json->ptr += escape_len;

    return 0;
}

static void json_set_token_error(json_token_t *token, json_parse_t *json,
                                 const char *errtype)
{
    token->type = T_ERROR;
    token->index = json->ptr - json->data;
    token->value.string = errtype;
}

static void json_next_string_token(json_parse_t *json, json_token_t *token)
{
    char *escape2char = json->cfg->escape2char;
    char ch;

    /* Caller must ensure a string is next */
    assert(*json->ptr == '"');

    /* Skip " */
    json->ptr++;

    /* json->tmp is the temporary strbuf used to accumulate the
     * decoded string value.
     * json->tmp is sized to handle JSON containing only a string value.
     */
    strbuf_reset(json->tmp);

    while ((ch = *json->ptr) != '"') {
        if (!ch) {
            /* Premature end of the string */
            json_set_token_error(token, json, "unexpected end of string");
            return;
        }

        /* Handle escapes */
        if (ch == '\\') {
            /* Fetch escape character */
            ch = *(json->ptr + 1);

            /* Translate escape code and append to tmp string */
            ch = escape2char[(unsigned char)ch];
            if (ch == 'u') {
                if (json_append_unicode_escape(json) == 0)
                    continue;

                json_set_token_error(token, json,
                                     "invalid unicode escape code");
                return;
            }
            if (!ch) {
                json_set_token_error(token, json, "invalid escape code");
                return;
            }

            /* Skip '\' */
            json->ptr++;
        }
        /* Append normal character or translated single character
         * Unicode escapes are handled above */
        strbuf_append_char_unsafe(json->tmp, ch);
        json->ptr++;
    }
    json->ptr++;    /* Eat final quote (") */

    strbuf_ensure_null(json->tmp);

    token->type = T_STRING;
    token->value.string = strbuf_string(json->tmp, &token->string_len);
}

/* JSON numbers should take the following form:
 *      -?(0|[1-9]|[1-9][0-9]+)(.[0-9]+)?([eE][-+]?[0-9]+)?
 *
 * json_next_number_token() uses strtod() which allows other forms:
 * - numbers starting with '+'
 * - NaN, -NaN, infinity, -infinity
 * - hexadecimal numbers
 * - numbers with leading zeros
 *
 * json_is_invalid_number() detects "numbers" which may pass strtod()'s
 * error checking, but should not be allowed with strict JSON.
 *
 * json_is_invalid_number() may pass numbers which cause strtod()
 * to generate an error.
 */
static int json_is_invalid_number(json_parse_t *json)
{
    const char *p = json->ptr;

    /* Reject numbers starting with + */
    if (*p == '+')
        return 1;

    /* Skip minus sign if it exists */
    if (*p == '-')
        p++;

    /* Reject numbers starting with 0x, or leading zeros */
    if (*p == '0') {
        int ch2 = *(p + 1);

        if ((ch2 | 0x20) == 'x' ||          /* Hex */
            ('0' <= ch2 && ch2 <= '9'))     /* Leading zero */
            return 1;

        return 0;
    } else if (*p <= '9') {
        return 0;                           /* Ordinary number */
    }

    /* Reject inf/nan */
    if (!strncasecmp(p, "inf", 3))
        return 1;
    if (!strncasecmp(p, "nan", 3))
        return 1;

    /* Pass all other numbers which may still be invalid, but
     * strtod() will catch them. */
    return 0;
}

static void json_next_number_token(json_parse_t *json, json_token_t *token)
{
    char *endptr;

    token->type = T_NUMBER;
    token->value.number = fpconv_strtod(json->ptr, &endptr);
    if (json->ptr == endptr)
        json_set_token_error(token, json, "invalid number");
    else
        json->ptr = endptr;     /* Skip the processed number */

    return;
}

/* Fills in the token struct.
 * T_STRING will return a pointer to the json_parse_t temporary string
 * T_ERROR will leave the json->ptr pointer at the error.
 */
static void json_next_token(json_parse_t *json, json_token_t *token)
{
    const json_token_type_t *ch2token = json->cfg->ch2token;
    int ch;

    /* Eat whitespace. */
    while (1) {
        ch = (unsigned char)*(json->ptr);
        token->type = ch2token[ch];
        if (token->type != T_WHITESPACE)
            break;
        json->ptr++;
    }

    /* Store location of new token. Required when throwing errors
     * for unexpected tokens (syntax errors). */
    token->index = json->ptr - json->data;

    /* Don't advance the pointer for an error or the end */
    if (token->type == T_ERROR) {
        json_set_token_error(token, json, "invalid token");
        return;
    }

    if (token->type == T_END) {
        return;
    }

    /* Found a known single character token, advance index and return */
    if (token->type != T_UNKNOWN) {
        json->ptr++;
        return;
    }

    /* Process characters which triggered T_UNKNOWN
     *
     * Must use strncmp() to match the front of the JSON string.
     * JSON identifier must be lowercase.
     * When strict_numbers if disabled, either case is allowed for
     * Infinity/NaN (since we are no longer following the spec..) */
    if (ch == '"') {
        json_next_string_token(json, token);
        return;
    } else if (ch == '-' || ('0' <= ch && ch <= '9')) {
        if (!json->cfg->decode_invalid_numbers && json_is_invalid_number(json)) {
            json_set_token_error(token, json, "invalid number");
            return;
        }
        json_next_number_token(json, token);
        return;
    } else if (!strncmp(json->ptr, "true", 4)) {
        token->type = T_BOOLEAN;
        token->value.boolean = 1;
        json->ptr += 4;
        return;
    } else if (!strncmp(json->ptr, "false", 5)) {
        token->type = T_BOOLEAN;
        token->value.boolean = 0;
        json->ptr += 5;
        return;
    } else if (!strncmp(json->ptr, "null", 4)) {
        token->type = T_NULL;
        json->ptr += 4;
        return;
    } else if (json->cfg->decode_invalid_numbers &&
               json_is_invalid_number(json)) {
        /* When decode_invalid_numbers is enabled, only attempt to process
         * numbers we know are invalid JSON (Inf, NaN, hex)
         * This is required to generate an appropriate token error,
         * otherwise all bad tokens will register as "invalid number"
         */
        json_next_number_token(json, token);
        return;
    }

    /* Token starts with t/f/n but isn't recognised above. */
    json_set_token_error(token, json, "invalid token");
}

/* This function does not return.
 * DO NOT CALL WITH DYNAMIC MEMORY ALLOCATED.
 * The only supported exception is the temporary parser string
 * json->tmp struct.
 * json and token should exist on the stack somewhere.
 * luaL_error() will long_jmp and release the stack */
static void json_throw_parse_error(lua_State *l, json_parse_t *json,
                                   const char *exp, json_token_t *token)
{
    const char *found;

    strbuf_free(json->tmp);

    if (token->type == T_ERROR)
        found = token->value.string;
    else
        found = json_token_type_name[token->type];

    /* Note: token->index is 0 based, display starting from 1 */
    luaL_error(l, "Expected %s but found %s at character %d",
               exp, found, token->index + 1);
}

static inline void json_decode_ascend(json_parse_t *json)
{
    json->current_depth--;
}

static void json_decode_descend(lua_State *l, json_parse_t *json, int slots)
{
    json->current_depth++;

    if (json->current_depth <= json->cfg->decode_max_depth &&
        lua_checkstack(l, slots)) {
        return;
    }

    strbuf_free(json->tmp);
    luaL_error(l, "Found too many nested data structures (%d) at character %d",
        json->current_depth, json->ptr - json->data);
}

static void json_parse_object_context(lua_State *l, json_parse_t *json)
{
    json_token_t token;

    /* 3 slots required:
     * .., table, key, value */
    json_decode_descend(l, json, 3);

    lua_newtable(l);

    json_next_token(json, &token);

    /* Handle empty objects */
    if (token.type == T_OBJ_END) {
        json_decode_ascend(json);
        return;
    }

    while (1) {
        if (token.type != T_STRING)
            json_throw_parse_error(l, json, "object key string", &token);

        /* Push key */
        lua_pushlstring(l, token.value.string, token.string_len);

        json_next_token(json, &token);
        if (token.type != T_COLON)
            json_throw_parse_error(l, json, "colon", &token);

        /* Fetch value */
        json_next_token(json, &token);
        json_process_value(l, json, &token);

        /* Set key = value */
        lua_rawset(l, -3);

        json_next_token(json, &token);

        if (token.type == T_OBJ_END) {
            json_decode_ascend(json);
            return;
        }

        if (token.type != T_COMMA)
            json_throw_parse_error(l, json, "comma or object end", &token);

        json_next_token(json, &token);
    }
}

/* Handle the array context */
static void json_parse_array_context(lua_State *l, json_parse_t *json)
{
    json_token_t token;
    int i;

    /* 2 slots required:
     * .., table, value */
    json_decode_descend(l, json, 2);

    lua_newtable(l);

    json_next_token(json, &token);

    /* Handle empty arrays */
    if (token.type == T_ARR_END) {
        json_decode_ascend(json);
        return;
    }

    for (i = 1; ; i++) {
        json_process_value(l, json, &token);
        lua_rawseti(l, -2, i);            /* arr[i] = value */

        json_next_token(json, &token);

        if (token.type == T_ARR_END) {
            json_decode_ascend(json);
            return;
        }

        if (token.type != T_COMMA)
            json_throw_parse_error(l, json, "comma or array end", &token);

        json_next_token(json, &token);
    }
}

/* Handle the "value" context */
static void json_process_value(lua_State *l, json_parse_t *json,
                               json_token_t *token)
{
    switch (token->type) {
    case T_STRING:
        lua_pushlstring(l, token->value.string, token->string_len);
        break;;
    case T_NUMBER: {
        double num = token->value.number;
        double intpart;
        /* Convert to integer when possible for Lua 5.1 compatibility.
         * This ensures tostring(cjson.decode('{"id":42}').id) returns "42" not "42.0" */
        if (modf(num, &intpart) == 0.0 &&
            intpart >= LUA_MININTEGER && intpart <= LUA_MAXINTEGER) {
            lua_pushinteger(l, (lua_Integer)intpart);
        } else {
            lua_pushnumber(l, num);
        }
        break;
    }
    case T_BOOLEAN:
        lua_pushboolean(l, token->value.boolean);
        break;;
    case T_OBJ_BEGIN:
        json_parse_object_context(l, json);
        break;;
    case T_ARR_BEGIN:
        json_parse_array_context(l, json);
        break;;
    case T_NULL:
        /* In Lua, setting "t[k] = nil" will delete k from the table.
         * Hence a NULL pointer lightuserdata object is used instead */
        lua_pushlightuserdata(l, NULL);
        break;;
    default:
        json_throw_parse_error(l, json, "value", token);
    }
}

static int json_decode(lua_State *l)
{
    json_parse_t json;
    json_token_t token;
    size_t json_len;

    luaL_argcheck(l, lua_gettop(l) == 1, 1, "expected 1 argument");

    json.cfg = json_fetch_config(l);
    json.data = luaL_checklstring(l, 1, &json_len);
    json.current_depth = 0;
    json.ptr = json.data;

    /* Detect Unicode other than UTF-8 (see RFC 4627, Sec 3)
     *
     * CJSON can support any simple data type, hence only the first
     * character is guaranteed to be ASCII (at worst: '"'). This is
     * still enough to detect whether the wrong encoding is in use. */
    if (json_len >= 2 && (!json.data[0] || !json.data[1]))
        luaL_error(l, "JSON parser does not support UTF-16 or UTF-32");

    /* Ensure the temporary buffer can hold the entire string.
     * This means we no longer need to do length checks since the decoded
     * string must be smaller than the entire json string */
    json.tmp = strbuf_new(json_len);

    json_next_token(&json, &token);
    json_process_value(l, &json, &token);

    /* Ensure there is no more input left */
    json_next_token(&json, &token);

    if (token.type != T_END)
        json_throw_parse_error(l, &json, "the end", &token);

    strbuf_free(json.tmp);

    return 1;
}

/* ===== INITIALISATION ===== */

#if !defined(LUA_VERSION_NUM) || LUA_VERSION_NUM < 502
/* Compatibility for Lua 5.1.
 *
 * luaL_setfuncs() is used to create a module table where the functions have
 * json_config_t as their first upvalue. Code borrowed from Lua 5.2 source. */
static void luaL_setfuncs (lua_State *l, const luaL_Reg *reg, int nup)
{
    int i;

    luaL_checkstack(l, nup, "too many upvalues");
    for (; reg->name != NULL; reg++) {  /* fill the table with given functions */
        for (i = 0; i < nup; i++)  /* copy upvalues to the top */
            lua_pushvalue(l, -nup);
        lua_pushcclosure(l, reg->func, nup);  /* closure with those upvalues */
        lua_setfield(l, -(nup + 2), reg->name);
    }
    lua_pop(l, nup);  /* remove upvalues */
}
#endif

/* Call target function in protected mode with all supplied args.
 * Assumes target function only returns a single non-nil value.
 * Convert and return thrown errors as: nil, "error message" */
static int json_protect_conversion(lua_State *l)
{
    int err;

    /* Deliberately throw an error for invalid arguments */
    luaL_argcheck(l, lua_gettop(l) == 1, 1, "expected 1 argument");

    /* pcall() the function stored as upvalue(1) */
    lua_pushvalue(l, lua_upvalueindex(1));
    lua_insert(l, 1);
    err = lua_pcall(l, 1, 1, 0);
    if (!err)
        return 1;

    if (err == LUA_ERRRUN) {
        lua_pushnil(l);
        lua_insert(l, -2);
        return 2;
    }

    /* Since we are not using a custom error handler, the only remaining
     * errors are memory related */
    return luaL_error(l, "Memory allocation error in CJSON protected call");
}

/* Return cjson module table */
static int lua_cjson_new(lua_State *l)
{
    luaL_Reg reg[] = {
        { "encode", json_encode },
        { "decode", json_decode },
        { "encode_sparse_array", json_cfg_encode_sparse_array },
        { "encode_max_depth", json_cfg_encode_max_depth },
        { "decode_max_depth", json_cfg_decode_max_depth },
        { "encode_number_precision", json_cfg_encode_number_precision },
        { "encode_keep_buffer", json_cfg_encode_keep_buffer },
        { "encode_invalid_numbers", json_cfg_encode_invalid_numbers },
        { "decode_invalid_numbers", json_cfg_decode_invalid_numbers },
        { "new", lua_cjson_new },
        { NULL, NULL }
    };

    /* Initialise number conversions */
    fpconv_init();

    /* cjson module table */
    lua_newtable(l);

    /* Register functions with config data as upvalue */
    json_create_config(l);
    luaL_setfuncs(l, reg, 1);

    /* Set cjson.null */
    lua_pushlightuserdata(l, NULL);
    lua_setfield(l, -2, "null");

    /* Set module name / version fields */
    lua_pushliteral(l, CJSON_MODNAME);
    lua_setfield(l, -2, "_NAME");
    lua_pushliteral(l, CJSON_VERSION);
    lua_setfield(l, -2, "_VERSION");

    return 1;
}

/* Return cjson.safe module table */
static int lua_cjson_safe_new(lua_State *l)
{
    const char *func[] = { "decode", "encode", NULL };
    int i;

    lua_cjson_new(l);

    /* Fix new() method */
    lua_pushcfunction(l, lua_cjson_safe_new);
    lua_setfield(l, -2, "new");

    for (i = 0; func[i]; i++) {
        lua_getfield(l, -1, func[i]);
        lua_pushcclosure(l, json_protect_conversion, 1);
        lua_setfield(l, -2, func[i]);
    }

    return 1;
}

int luaopen_cjson(lua_State *l)
{
    lua_cjson_new(l);

    lua_pushvalue(l, -1);
    lua_setglobal(l, CJSON_MODNAME);

    /* Return cjson table */
    return 1;
}

int luaopen_cjson_safe(lua_State *l)
{
    lua_cjson_safe_new(l);

    /* Return cjson.safe table */
    return 1;
}

/* vi:ai et sw=4 ts=4:
 */


================================================
FILE: src/redis/lua/cjson/strbuf.c
================================================
/* strbuf - String buffer routines
 *
 * Copyright (c) 2010-2012  Mark Pulford <mark@kyne.com.au>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

#include "strbuf.h"

static void die(const char *fmt, ...)
{
    va_list arg;

    va_start(arg, fmt);
    vfprintf(stderr, fmt, arg);
    va_end(arg);
    fprintf(stderr, "\n");

    exit(-1);
}

void strbuf_init(strbuf_t *s, int len)
{
    int size;

    if (len <= 0)
        size = STRBUF_DEFAULT_SIZE;
    else
        size = len + 1;         /* \0 terminator */

    s->buf = NULL;
    s->size = size;
    s->length = 0;
    s->increment = STRBUF_DEFAULT_INCREMENT;
    s->dynamic = 0;
    s->reallocs = 0;
    s->debug = 0;

    s->buf = malloc(size);
    if (!s->buf)
        die("Out of memory");

    strbuf_ensure_null(s);
}

strbuf_t *strbuf_new(int len)
{
    strbuf_t *s;

    s = malloc(sizeof(strbuf_t));
    if (!s)
        die("Out of memory");

    strbuf_init(s, len);

    /* Dynamic strbuf allocation / deallocation */
    s->dynamic = 1;

    return s;
}

void strbuf_set_increment(strbuf_t *s, int increment)
{
    /* Increment > 0:  Linear buffer growth rate
     * Increment < -1: Exponential buffer growth rate */
    if (increment == 0 || increment == -1)
        die("BUG: Invalid string increment");

    s->increment = increment;
}

static inline void debug_stats(strbuf_t *s)
{
    if (s->debug) {
        fprintf(stderr, "strbuf(%lx) reallocs: %d, length: %d, size: %d\n",
                (long)s, s->reallocs, s->length, s->size);
    }
}

/* If strbuf_t has not been dynamically allocated, strbuf_free() can
 * be called any number of times strbuf_init() */
void strbuf_free(strbuf_t *s)
{
    debug_stats(s);

    if (s->buf) {
        free(s->buf);
        s->buf = NULL;
    }
    if (s->dynamic)
        free(s);
}

char *strbuf_free_to_string(strbuf_t *s, int *len)
{
    char *buf;

    debug_stats(s);

    strbuf_ensure_null(s);

    buf = s->buf;
    if (len)
        *len = s->length;

    if (s->dynamic)
        free(s);

    return buf;
}

static int calculate_new_size(strbuf_t *s, int len)
{
    int reqsize, newsize;

    if (len <= 0)
        die("BUG: Invalid strbuf length requested");

    /* Ensure there is room for optional NULL termination */
    reqsize = len + 1;

    /* If the user has requested to shrink the buffer, do it exactly */
    if (s->size > reqsize)
        return reqsize;

    newsize = s->size;
    if (s->increment < 0) {
        /* Exponential sizing */
        while (newsize < reqsize)
            newsize *= -s->increment;
    } else {
        /* Linear sizing */
        newsize = ((newsize + s->increment - 1) / s->increment) * s->increment;
    }

    return newsize;
}


/* Ensure strbuf can handle a string length bytes long (ignoring NULL
 * optional termination). */
void strbuf_resize(strbuf_t *s, int len)
{
    int newsize;

    newsize = calculate_new_size(s, len);

    if (s->debug > 1) {
        fprintf(stderr, "strbuf(%lx) resize: %d => %d\n",
                (long)s, s->size, newsize);
    }

    s->size = newsize;
    s->buf = realloc(s->buf, s->size);
    if (!s->buf)
        die("Out of memory");
    s->reallocs++;
}

void strbuf_append_string(strbuf_t *s, const char *str)
{
    int space, i;

    space = strbuf_empty_length(s);

    for (i = 0; str[i]; i++) {
        if (space < 1) {
            strbuf_resize(s, s->length + 1);
            space = strbuf_empty_length(s);
        }

        s->buf[s->length] = str[i];
        s->length++;
        space--;
    }
}

/* strbuf_append_fmt() should only be used when an upper bound
 * is known for the output string. */
void strbuf_append_fmt(strbuf_t *s, int len, const char *fmt, ...)
{
    va_list arg;
    int fmt_len;

    strbuf_ensure_empty_length(s, len);

    va_start(arg, fmt);
    fmt_len = vsnprintf(s->buf + s->length, len, fmt, arg);
    va_end(arg);

    if (fmt_len < 0)
        die("BUG: Unable to convert number");  /* This should never happen.. */

    s->length += fmt_len;
}

/* strbuf_append_fmt_retry() can be used when the there is no known
 * upper bound for the output string. */
void strbuf_append_fmt_retry(strbuf_t *s, const char *fmt, ...)
{
    va_list arg;
    int fmt_len, try;
    int empty_len;

    /* If the first attempt to append fails, resize the buffer appropriately
     * and try again */
    for (try = 0; ; try++) {
        va_start(arg, fmt);
        /* Append the new formatted string */
        /* fmt_len is the length of the string required, excluding the
         * trailing NULL */
        empty_len = strbuf_empty_length(s);
        /* Add 1 since there is also space to store the terminating NULL. */
        fmt_len = vsnprintf(s->buf + s->length, empty_len + 1, fmt, arg);
        va_end(arg);

        if (fmt_len <= empty_len)
            break;  /* SUCCESS */
        if (try > 0)
            die("BUG: length of formatted string changed");

        strbuf_resize(s, s->length + fmt_len);
    }

    s->length += fmt_len;
}

/* vi:ai et sw=4 ts=4:
 */


================================================
FILE: src/redis/lua/cjson/strbuf.h
================================================
/* strbuf - String buffer routines
 *
 * Copyright (c) 2010-2012  Mark Pulford <mark@kyne.com.au>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <stdarg.h>

/* Size: Total bytes allocated to *buf
 * Length: String length, excluding optional NULL terminator.
 * Increment: Allocation increments when resizing the string buffer.
 * Dynamic: True if created via strbuf_new()
 */

typedef struct {
    char *buf;
    int size;
    int length;
    int increment;
    int dynamic;
    int reallocs;
    int debug;
} strbuf_t;

#ifndef STRBUF_DEFAULT_SIZE
#define STRBUF_DEFAULT_SIZE 1023
#endif
#ifndef STRBUF_DEFAULT_INCREMENT
#define STRBUF_DEFAULT_INCREMENT -2
#endif

/* Initialise */
extern strbuf_t *strbuf_new(int len);
extern void strbuf_init(strbuf_t *s, int len);
extern void strbuf_set_increment(strbuf_t *s, int increment);

/* Release */
extern void strbuf_free(strbuf_t *s);
extern char *strbuf_free_to_string(strbuf_t *s, int *len);

/* Management */
extern void strbuf_resize(strbuf_t *s, int len);
static int strbuf_empty_length(strbuf_t *s);
static int strbuf_length(strbuf_t *s);
static char *strbuf_string(strbuf_t *s, int *len);
static void strbuf_ensure_empty_length(strbuf_t *s, int len);
static char *strbuf_empty_ptr(strbuf_t *s);
static void strbuf_extend_length(strbuf_t *s, int len);

/* Update */
extern void strbuf_append_fmt(strbuf_t *s, int len, const char *fmt, ...);
extern void strbuf_append_fmt_retry(strbuf_t *s, const char *format, ...);
static void strbuf_append_mem(strbuf_t *s, const char *c, int len);
extern void strbuf_append_string(strbuf_t *s, const char *str);
static void strbuf_append_char(strbuf_t *s, const char c);
static void strbuf_ensure_null(strbuf_t *s);

/* Reset string for before use */
static inline void strbuf_reset(strbuf_t *s)
{
    s->length = 0;
}

static inline int strbuf_allocated(strbuf_t *s)
{
    return s->buf != NULL;
}

/* Return bytes remaining in the string buffer
 * Ensure there is space for a NULL terminator. */
static inline int strbuf_empty_length(strbuf_t *s)
{
    return s->size - s->length - 1;
}

static inline void strbuf_ensure_empty_length(strbuf_t *s, int len)
{
    if (len > strbuf_empty_length(s))
        strbuf_resize(s, s->length + len);
}

static inline char *strbuf_empty_ptr(strbuf_t *s)
{
    return s->buf + s->length;
}

static inline void strbuf_extend_length(strbuf_t *s, int len)
{
    s->length += len;
}

static inline int strbuf_length(strbuf_t *s)
{
    return s->length;
}

static inline void strbuf_append_char(strbuf_t *s, const char c)
{
    strbuf_ensure_empty_length(s, 1);
    s->buf[s->length++] = c;
}

static inline void strbuf_append_char_unsafe(strbuf_t *s, const char c)
{
    s->buf[s->length++] = c;
}

static inline void strbuf_append_mem(strbuf_t *s, const char *c, int len)
{
    strbuf_ensure_empty_length(s, len);
    memcpy(s->buf + s->length, c, len);
    s->length += len;
}

static inline void strbuf_append_mem_unsafe(strbuf_t *s, const char *c, int len)
{
    memcpy(s->buf + s->length, c, len);
    s->length += len;
}

static inline void strbuf_ensure_null(strbuf_t *s)
{
    s->buf[s->length] = 0;
}

static inline char *strbuf_string(strbuf_t *s, int *len)
{
    if (len)
        *len = s->length;

    return s->buf;
}

/* vi:ai et sw=4 ts=4:
 */


================================================
FILE: src/redis/lua/cmsgpack/lua_cmsgpack.c
================================================
#include <math.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

#include "lua.h"
#include "lauxlib.h"

#define LUACMSGPACK_NAME        "cmsgpack"
#define LUACMSGPACK_SAFE_NAME   "cmsgpack_safe"
#define LUACMSGPACK_VERSION     "lua-cmsgpack 0.4.0"
#define LUACMSGPACK_COPYRIGHT   "Copyright (C) 2012, Salvatore Sanfilippo"
#define LUACMSGPACK_DESCRIPTION "MessagePack C implementation for Lua"

/* Allows a preprocessor directive to override MAX_NESTING */
#ifndef LUACMSGPACK_MAX_NESTING
    #define LUACMSGPACK_MAX_NESTING  16 /* Max tables nesting. */
#endif

/* Check if float or double can be an integer without loss of precision */
#define IS_INT_TYPE_EQUIVALENT(x, T) (!isinf(x) && (T)(x) == (x))

#define IS_INT64_EQUIVALENT(x) IS_INT_TYPE_EQUIVALENT(x, int64_t)
#define IS_INT_EQUIVALENT(x) IS_INT_TYPE_EQUIVALENT(x, int)

/* If size of pointer is equal to a 4 byte integer, we're on 32 bits. */
#if UINTPTR_MAX == UINT_MAX
    #define BITS_32 1
#else
    #define BITS_32 0
#endif

#if BITS_32
    #define lua_pushunsigned(L, n) lua_pushnumber(L, n)
#else
    #define lua_pushunsigned(L, n) lua_pushinteger(L, n)
#endif

/* =============================================================================
 * MessagePack implementation and bindings for Lua 5.1/5.2.
 * Copyright(C) 2012 Salvatore Sanfilippo <antirez@gmail.com>
 *
 * http://github.com/antirez/lua-cmsgpack
 *
 * For MessagePack specification check the following web site:
 * http://wiki.msgpack.org/display/MSGPACK/Format+specification
 *
 * See Copyright Notice at the end of this file.
 *
 * CHANGELOG:
 * 19-Feb-2012 (ver 0.1.0): Initial release.
 * 20-Feb-2012 (ver 0.2.0): Tables encoding improved.
 * 20-Feb-2012 (ver 0.2.1): Minor bug fixing.
 * 20-Feb-2012 (ver 0.3.0): Module renamed lua-cmsgpack (was lua-msgpack).
 * 04-Apr-2014 (ver 0.3.1): Lua 5.2 support and minor bug fix.
 * 07-Apr-2014 (ver 0.4.0): Multiple pack/unpack, lua allocator, efficiency.
 * ========================================================================== */

/* -------------------------- Endian conversion --------------------------------
 * We use it only for floats and doubles, all the other conversions performed
 * in an endian independent fashion. So the only thing we need is a function
 * that swaps a binary string if arch is little endian (and left it untouched
 * otherwise). */

/* Reverse memory bytes if arch is little endian. Given the conceptual
 * simplicity of the Lua build system we prefer check for endianess at runtime.
 * The performance difference should be acceptable. */
void memrevifle(void *ptr, size_t len) {
    unsigned char   *p = (unsigned char *)ptr,
                    *e = (unsigned char *)p+len-1,
                    aux;
    int test = 1;
    unsigned char *testp = (unsigned char*) &test;

    if (testp[0] == 0) return; /* Big endian, nothing to do. */
    len /= 2;
    while(len--) {
        aux = *p;
        *p = *e;
        *e = aux;
        p++;
        e--;
    }
}

/* ---------------------------- String buffer ----------------------------------
 * This is a simple implementation of string buffers. The only operation
 * supported is creating empty buffers and appending bytes to it.
 * The string buffer uses 2x preallocation on every realloc for O(N) append
 * behavior.  */

typedef struct mp_buf {
    unsigned char *b;
    size_t len, free;
} mp_buf;

void *mp_realloc(lua_State *L, void *target, size_t osize,size_t nsize) {
    void *(*local_realloc) (void *, void *, size_t osize, size_t nsize) = NULL;
    void *ud;

    local_realloc = lua_getallocf(L, &ud);

    return local_realloc(ud, target, osize, nsize);
}

mp_buf *mp_buf_new(lua_State *L) {
    mp_buf *buf = NULL;

    /* Old size = 0; new size = sizeof(*buf) */
    buf = (mp_buf*)mp_realloc(L, NULL, 0, sizeof(*buf));

    buf->b = NULL;
    buf->len = buf->free = 0;
    return buf;
}

void mp_buf_append(lua_State *L, mp_buf *buf, const unsigned char *s, size_t len) {
    if (buf->free < len) {
        size_t newsize = (buf->len+len)*2;

        buf->b = (unsigned char*)mp_realloc(L, buf->b, buf->len + buf->free, newsize);
        buf->free = newsize - buf->len;
    }
    memcpy(buf->b+buf->len,s,len);
    buf->len += len;
    buf->free -= len;
}

void mp_buf_free(lua_State *L, mp_buf *buf) {
    mp_realloc(L, buf->b, buf->len + buf->free, 0); /* realloc to 0 = free */
    mp_realloc(L, buf, sizeof(*buf), 0);
}

/* ---------------------------- String cursor ----------------------------------
 * This simple data structure is used for parsing. Basically you create a cursor
 * using a string pointer and a length, then it is possible to access the
 * current string position with cursor->p, check the remaining length
 * in cursor->left, and finally consume more string using
 * mp_cur_consume(cursor,len), to advance 'p' and subtract 'left'.
 * An additional field cursor->error is set to zero on initialization and can
 * be used to report errors. */

#define MP_CUR_ERROR_NONE   0
#define MP_CUR_ERROR_EOF    1   /* Not enough data to complete operation. */
#define MP_CUR_ERROR_BADFMT 2   /* Bad data format */

typedef struct mp_cur {
    const unsigned char *p;
    size_t left;
    int err;
} mp_cur;

void mp_cur_init(mp_cur *cursor, const unsigned char *s, size_t len) {
    cursor->p = s;
    cursor->left = len;
    cursor->err = MP_CUR_ERROR_NONE;
}

#define mp_cur_consume(_c,_len) do { _c->p += _len; _c->left -= _len; } while(0)

/* When there is not enough room we set an error in the cursor and return. This
 * is very common across the code so we have a macro to make the code look
 * a bit simpler. */
#define mp_cur_need(_c,_len) do { \
    if (_c->left < _len) { \
        _c->err = MP_CUR_ERROR_EOF; \
        return; \
    } \
} while(0)

/* ------------------------- Low level MP encoding -------------------------- */

void mp_encode_bytes(lua_State *L, mp_buf *buf, const unsigned char *s, size_t len) {
    unsigned char hdr[5];
    int hdrlen;

    if (len < 32) {
        hdr[0] = 0xa0 | (len&0xff); /* fix raw */
        hdrlen = 1;
    } else if (len <= 0xff) {
        hdr[0] = 0xd9;
        hdr[1] = len;
        hdrlen = 2;
    } else if (len <= 0xffff) {
        hdr[0] = 0xda;
        hdr[1] = (len&0xff00)>>8;
        hdr[2] = len&0xff;
        hdrlen = 3;
    } else {
        hdr[0] = 0xdb;
        hdr[1] = (len&0xff000000)>>24;
        hdr[2] = (len&0xff0000)>>16;
        hdr[3] = (len&0xff00)>>8;
        hdr[4] = len&0xff;
        hdrlen = 5;
    }
    mp_buf_append(L,buf,hdr,hdrlen);
    mp_buf_append(L,buf,s,len);
}

/* we assume IEEE 754 internal format for single and double precision floats. */
void mp_encode_double(lua_State *L, mp_buf *buf, double d) {
    unsigned char b[9];
    float f = d;

    assert(sizeof(f) == 4 && sizeof(d) == 8);
    if (d == (double)f) {
        b[0] = 0xca;    /* float IEEE 754 */
        memcpy(b+1,&f,4);
        memrevifle(b+1,4);
        mp_buf_append(L,buf,b,5);
    } else if (sizeof(d) == 8) {
        b[0] = 0xcb;    /* double IEEE 754 */
        memcpy(b+1,&d,8);
        memrevifle(b+1,8);
        mp_buf_append(L,buf,b,9);
    }
}

void mp_encode_int(lua_State *L, mp_buf *buf, int64_t n) {
    unsigned char b[9];
    int enclen;

    if (n >= 0) {
        if (n <= 127) {
            b[0] = n & 0x7f;    /* positive fixnum */
            enclen = 1;
        } else if (n <= 0xff) {
            b[0] = 0xcc;        /* uint 8 */
            b[1] = n & 0xff;
            enclen = 2;
        } else if (n <= 0xffff) {
            b[0] = 0xcd;        /* uint 16 */
            b[1] = (n & 0xff00) >> 8;
            b[2] = n & 0xff;
            enclen = 3;
        } else if (n <= 0xffffffffLL) {
            b[0] = 0xce;        /* uint 32 */
            b[1] = (n & 0xff000000) >> 24;
            b[2] = (n & 0xff0000) >> 16;
            b[3] = (n & 0xff00) >> 8;
            b[4] = n & 0xff;
            enclen = 5;
        } else {
            b[0] = 0xcf;        /* uint 64 */
            b[1] = (n & 0xff00000000000000LL) >> 56;
            b[2] = (n & 0xff000000000000LL) >> 48;
            b[3] = (n & 0xff0000000000LL) >> 40;
            b[4] = (n & 0xff00000000LL) >> 32;
            b[5] = (n & 0xff000000) >> 24;
            b[6] = (n & 0xff0000) >> 16;
            b[7] = (n & 0xff00) >> 8;
            b[8] = n & 0xff;
            enclen = 9;
        }
    } else {
        if (n >= -32) {
            b[0] = ((signed char)n);   /* negative fixnum */
            enclen = 1;
        } else if (n >= -128) {
            b[0] = 0xd0;        /* int 8 */
            b[1] = n & 0xff;
            enclen = 2;
        } else if (n >= -32768) {
            b[0] = 0xd1;        /* int 16 */
            b[1] = (n & 0xff00) >> 8;
            b[2] = n & 0xff;
            enclen = 3;
        } else if (n >= -2147483648LL) {
            b[0] = 0xd2;        /* int 32 */
            b[1] = (n & 0xff000000) >> 24;
            b[2] = (n & 0xff0000) >> 16;
            b[3] = (n & 0xff00) >> 8;
            b[4] = n & 0xff;
            enclen = 5;
        } else {
            b[0] = 0xd3;        /* int 64 */
            b[1] = (n & 0xff00000000000000LL) >> 56;
            b[2] = (n & 0xff000000000000LL) >> 48;
            b[3] = (n & 0xff0000000000LL) >> 40;
            b[4] = (n & 0xff00000000LL) >> 32;
            b[5] = (n & 0xff000000) >> 24;
            b[6] = (n & 0xff0000) >> 16;
            b[7] = (n & 0xff00) >> 8;
            b[8] = n & 0xff;
            enclen = 9;
        }
    }
    mp_buf_append(L,buf,b,enclen);
}

void mp_encode_array(lua_State *L, mp_buf *buf, int64_t n) {
    unsigned char b[5];
    int enclen;

    if (n <= 15) {
        b[0] = 0x90 | (n & 0xf);    /* fix array */
        enclen = 1;
    } else if (n <= 65535) {
        b[0] = 0xdc;                /* array 16 */
        b[1] = (n & 0xff00) >> 8;
        b[2] = n & 0xff;
        enclen = 3;
    } else {
        b[0] = 0xdd;                /* array 32 */
        b[1] = (n & 0xff000000) >> 24;
        b[2] = (n & 0xff0000) >> 16;
        b[3] = (n & 0xff00) >> 8;
        b[4] = n & 0xff;
        enclen = 5;
    }
    mp_buf_append(L,buf,b,enclen);
}

void mp_encode_map(lua_State *L, mp_buf *buf, int64_t n) {
    unsigned char b[5];
    int enclen;

    if (n <= 15) {
        b[0] = 0x80 | (n & 0xf);    /* fix map */
        enclen = 1;
    } else if (n <= 65535) {
        b[0] = 0xde;                /* map 16 */
        b[1] = (n & 0xff00) >> 8;
        b[2] = n & 0xff;
        enclen = 3;
    } else {
        b[0] = 0xdf;                /* map 32 */
        b[1] = (n & 0xff000000) >> 24;
        b[2] = (n & 0xff0000) >> 16;
        b[3] = (n & 0xff00) >> 8;
        b[4] = n & 0xff;
        enclen = 5;
    }
    mp_buf_append(L,buf,b,enclen);
}

/* --------------------------- Lua types encoding --------------------------- */

void mp_encode_lua_string(lua_State *L, mp_buf *buf) {
    size_t len;
    const char *s;

    s = lua_tolstring(L,-1,&len);
    mp_encode_bytes(L,buf,(const unsigned char*)s,len);
}

void mp_encode_lua_bool(lua_State *L, mp_buf *buf) {
    unsigned char b = lua_toboolean(L,-1) ? 0xc3 : 0xc2;
    mp_buf_append(L,buf,&b,1);
}

/* Lua 5.3 has a built in 64-bit integer type */
void mp_encode_lua_integer(lua_State *L, mp_buf *buf) {
#if (LUA_VERSION_NUM < 503) && BITS_32
    lua_Number i = lua_tonumber(L,-1);
#else
    lua_Integer i = lua_tointeger(L,-1);
#endif
    mp_encode_int(L, buf, (int64_t)i);
}

/* Lua 5.2 and lower only has 64-bit doubles, so we need to
 * detect if the double may be representable as an int
 * for Lua < 5.3 */
void mp_encode_lua_number(lua_State *L, mp_buf *buf) {
    lua_Number n = lua_tonumber(L,-1);

    if (IS_INT64_EQUIVALENT(n)) {
        mp_encode_lua_integer(L, buf);
    } else {
        mp_encode_double(L,buf,(double)n);
    }
}

void mp_encode_lua_type(lua_State *L, mp_buf *buf, int level);

/* Convert a lua table into a message pack list. */
void mp_encode_lua_table_as_array(lua_State *L, mp_buf *buf, int level) {
#if LUA_VERSION_NUM < 502
    size_t len = lua_objlen(L,-1), j;
#else
    size_t len = lua_rawlen(L,-1), j;
#endif

    mp_encode_array(L,buf,len);
    luaL_checkstack(L, 1, "in function mp_encode_lua_table_as_array");
    for (j = 1; j <= len; j++) {
        lua_pushnumber(L,j);
        lua_gettable(L,-2);
        mp_encode_lua_type(L,buf,level+1);
    }
}

/* Convert a lua table into a message pack key-value map. */
void mp_encode_lua_table_as_map(lua_State *L, mp_buf *buf, int level) {
    size_t len = 0;

    /* First step: count keys into table. No other way to do it with the
     * Lua API, we need to iterate a first time. Note that an alternative
     * would be to do a single run, and then hack the buffer to insert the
     * map opcodes for message pack. Too hackish for this lib. */
    luaL_checkstack(L, 3, "in function mp_encode_lua_table_as_map");
    lua_pushnil(L);
    while(lua_next(L,-2)) {
        lua_pop(L,1); /* remove value, keep key for next iteration. */
        len++;
    }

    /* Step two: actually encoding of the map. */
    mp_encode_map(L,buf,len);
    lua_pushnil(L);
    while(lua_next(L,-2)) {
        /* Stack: ... key value */
        lua_pushvalue(L,-2); /* Stack: ... key value key */
        mp_encode_lua_type(L,buf,level+1); /* encode key */
        mp_encode_lua_type(L,buf,level+1); /* encode val */
    }
}

/* Returns true if the Lua table on top of the stack is exclusively composed
 * of keys from numerical keys from 1 up to N, with N being the total number
 * of elements, without any hole in the middle. */
int table_is_an_array(lua_State *L) {
    int count = 0, max = 0;
#if LUA_VERSION_NUM < 503
    lua_Number n;
#else
    lua_Integer n;
#endif

    /* Stack top on function entry */
    int stacktop;

    stacktop = lua_gettop(L);

    lua_pushnil(L);
    while(lua_next(L,-2)) {
        /* Stack: ... key value */
        lua_pop(L,1); /* Stack: ... key */
        /* The <= 0 check is valid here because we're comparing indexes. */
#if LUA_VERSION_NUM < 503
        if ((LUA_TNUMBER != lua_type(L,-1)) || (n = lua_tonumber(L, -1)) <= 0 ||
            !IS_INT_EQUIVALENT(n))
#else
        if (!lua_isinteger(L,-1) || (n = lua_tointeger(L, -1)) <= 0)
#endif
        {
            lua_settop(L, stacktop);
            return 0;
        }
        max = (n > max ? n : max);
        count++;
    }
    /* We have the total number of elements in "count". Also we have
     * the max index encountered in "max". We can't reach this code
     * if there are indexes <= 0. If you also note that there can not be
     * repeated keys into a table, you have that if max==count you are sure
     * that there are all the keys form 1 to count (both included). */
    lua_settop(L, stacktop);
    return max == count;
}

/* If the length operator returns non-zero, that is, there is at least
 * an object at key '1', we serialize to message pack list. Otherwise
 * we use a map. */
void mp_encode_lua_table(lua_State *L, mp_buf *buf, int level) {
    if (table_is_an_array(L))
        mp_encode_lua_table_as_array(L,buf,level);
    else
        mp_encode_lua_table_as_map(L,buf,level);
}

void mp_encode_lua_null(lua_State *L, mp_buf *buf) {
    unsigned char b[1];

    b[0] = 0xc0;
    mp_buf_append(L,buf,b,1);
}

void mp_encode_lua_type(lua_State *L, mp_buf *buf, int level) {
    int t = lua_type(L,-1);

    /* Limit the encoding of nested tables to a specified maximum depth, so that
     * we survive when called against circular references in tables. */
    if (t == LUA_TTABLE && level == LUACMSGPACK_MAX_NESTING) t = LUA_TNIL;
    switch(t) {
    case LUA_TSTRING: mp_encode_lua_string(L,buf); break;
    case LUA_TBOOLEAN: mp_encode_lua_bool(L,buf); break;
    case LUA_TNUMBER:
    #if LUA_VERSION_NUM < 503
        mp_encode_lua_number(L,buf); break;
    #else
        if (lua_isinteger(L, -1)) {
            mp_encode_lua_integer(L, buf);
        } else {
            mp_encode_lua_number(L, buf);
        }
        break;
    #endif
    case LUA_TTABLE: mp_encode_lua_table(L,buf,level); break;
    default: mp_encode_lua_null(L,buf); break;
    }
    lua_pop(L,1);
}

/*
 * Packs all arguments as a stream for multiple upacking later.
 * Returns error if no arguments provided.
 */
int mp_pack(lua_State *L) {
    int nargs = lua_gettop(L);
    int i;
    mp_buf *buf;

    if (nargs == 0)
        return luaL_argerror(L, 0, "MessagePack pack needs input.");

    if (!lua_checkstack(L, nargs))
        return luaL_argerror(L, 0, "Too many arguments for MessagePack pack.");

    buf = mp_buf_new(L);
    for(i = 1; i <= nargs; i++) {
        /* Copy argument i to top of stack for _encode processing;
         * the encode function pops it from the stack when complete. */
        luaL_checkstack(L, 1, "in function mp_check");
        lua_pushvalue(L, i);

        mp_encode_lua_type(L,buf,0);

        lua_pushlstring(L,(char*)buf->b,buf->len);

        /* Reuse the buffer for the next operation by
         * setting its free count to the total buffer size
         * and the current position to zero. */
        buf->free += buf->len;
        buf->len = 0;
    }
    mp_buf_free(L, buf);

    /* Concatenate all nargs buffers together */
    lua_concat(L, nargs);
    return 1;
}

/* ------------------------------- Decoding --------------------------------- */

void mp_decode_to_lua_type(lua_State *L, mp_cur *c);

void mp_decode_to_lua_array(lua_State *L, mp_cur *c, size_t len) {
    assert(len <= UINT_MAX);
    int index = 1;

    lua_newtable(L);
    luaL_checkstack(L, 1, "in function mp_decode_to_lua_array");
    while(len--) {
        lua_pushnumber(L,index++);
        mp_decode_to_lua_type(L,c);
        if (c->err) return;
        lua_settable(L,-3);
    }
}

void mp_decode_to_lua_hash(lua_State *L, mp_cur *c, size_t len) {
    assert(len <= UINT_MAX);
    lua_newtable(L);
    while(len--) {
        mp_decode_to_lua_type(L,c); /* key */
        if (c->err) return;
        mp_decode_to_lua_type(L,c); /* value */
        if (c->err) return;
        lua_settable(L,-3);
    }
}

/* Decode a Message Pack raw object pointed by the string cursor 'c' to
 * a Lua type, that is left as the only result on the stack. */
void mp_decode_to_lua_type(lua_State *L, mp_cur *c) {
    mp_cur_need(c,1);

    /* If we return more than 18 elements, we must resize the stack to
     * fit all our return values.  But, there is no way to
     * determine how many objects a msgpack will unpack to up front, so
     * we request a +1 larger stack on each iteration (noop if stack is
     * big enough, and when stack does require resize it doubles in size) */
    luaL_checkstack(L, 1,
        "too many return values at once; "
        "use unpack_one or unpack_limit instead.");

    switch(c->p[0]) {
    case 0xcc:  /* uint 8 */
        mp_cur_need(c,2);
        lua_pushunsigned(L,c->p[1]);
        mp_cur_consume(c,2);
        break;
    case 0xd0:  /* int 8 */
        mp_cur_need(c,2);
        lua_pushinteger(L,(signed char)c->p[1]);
        mp_cur_consume(c,2);
        break;
    case 0xcd:  /* uint 16 */
        mp_cur_need(c,3);
        lua_pushunsigned(L,
            (c->p[1] << 8) |
             c->p[2]);
        mp_cur_consume(c,3);
        break;
    case 0xd1:  /* int 16 */
        mp_cur_need(c,3);
        lua_pushinteger(L,(int16_t)
            (c->p[1] << 8) |
             c->p[2]);
        mp_cur_consume(c,3);
        break;
    case 0xce:  /* uint 32 */
        mp_cur_need(c,5);
        lua_pushunsigned(L,
            ((uint32_t)c->p[1] << 24) |
            ((uint32_t)c->p[2] << 16) |
            ((uint32_t)c->p[3] << 8) |
             (uint32_t)c->p[4]);
        mp_cur_consume(c,5);
        break;
    case 0xd2:  /* int 32 */
        mp_cur_need(c,5);
        lua_pushinteger(L,
            ((int32_t)c->p[1] << 24) |
            ((int32_t)c->p[2] << 16) |
            ((int32_t)c->p[3] << 8) |
             (int32_t)c->p[4]);
        mp_cur_consume(c,5);
        break;
    case 0xcf:  /* uint 64 */
        mp_cur_need(c,9);
        lua_pushunsigned(L,
            ((uint64_t)c->p[1] << 56) |
            ((uint64_t)c->p[2] << 48) |
            ((uint64_t)c->p[3] << 40) |
            ((uint64_t)c->p[4] << 32) |
            ((uint64_t)c->p[5] << 24) |
            ((uint64_t)c->p[6] << 16) |
            ((uint64_t)c->p[7] << 8) |
             (uint64_t)c->p[8]);
        mp_cur_consume(c,9);
        break;
    case 0xd3:  /* int 64 */
        mp_cur_need(c,9);
#if LUA_VERSION_NUM < 503
        lua_pushnumber(L,
#else
        lua_pushinteger(L,
#endif
            ((int64_t)c->p[1] << 56) |
            ((int64_t)c->p[2] << 48) |
            ((int64_t)c->p[3] << 40) |
            ((int64_t)c->p[4] << 32) |
            ((int64_t)c->p[5] << 24) |
            ((int64_t)c->p[6] << 16) |
            ((int64_t)c->p[7] << 8) |
             (int64_t)c->p[8]);
        mp_cur_consume(c,9);
        break;
    case 0xc0:  /* nil */
        lua_pushnil(L);
        mp_cur_consume(c,1);
        break;
    case 0xc3:  /* true */
        lua_pushboolean(L,1);
        mp_cur_consume(c,1);
        break;
    case 0xc2:  /* false */
        lua_pushboolean(L,0);
        mp_cur_consume(c,1);
        break;
    case 0xca:  /* float */
        mp_cur_need(c,5);
        assert(sizeof(float) == 4);
        {
            float f;
            memcpy(&f,c->p+1,4);
            memrevifle(&f,4);
            lua_pushnumber(L,f);
            mp_cur_consume(c,5);
        }
        break;
    case 0xcb:  /* double */
        mp_cur_need(c,9);
        assert(sizeof(double) == 8);
        {
            double d;
            memcpy(&d,c->p+1,8);
            memrevifle(&d,8);
            lua_pushnumber(L,d);
            mp_cur_consume(c,9);
        }
        break;
    case 0xd9:  /* raw 8 */
        mp_cur_need(c,2);
        {
            size_t l = c->p[1];
            mp_cur_need(c,2+l);
            lua_pushlstring(L,(char*)c->p+2,l);
            mp_cur_consume(c,2+l);
        }
        break;
    case 0xda:  /* raw 16 */
        mp_cur_need(c,3);
        {
            size_t l = (c->p[1] << 8) | c->p[2];
            mp_cur_need(c,3+l);
            lua_pushlstring(L,(char*)c->p+3,l);
            mp_cur_consume(c,3+l);
        }
        break;
    case 0xdb:  /* raw 32 */
        mp_cur_need(c,5);
        {
            size_t l = ((size_t)c->p[1] << 24) |
                       ((size_t)c->p[2] << 16) |
                       ((size_t)c->p[3] << 8) |
                       (size_t)c->p[4];
            mp_cur_consume(c,5);
            mp_cur_need(c,l);
            lua_pushlstring(L,(char*)c->p,l);
            mp_cur_consume(c,l);
        }
        break;
    case 0xdc:  /* array 16 */
        mp_cur_need(c,3);
        {
            size_t l = (c->p[1] << 8) | c->p[2];
            mp_cur_consume(c,3);
            mp_decode_to_lua_array(L,c,l);
        }
        break;
    case 0xdd:  /* array 32 */
        mp_cur_need(c,5);
        {
            size_t l = ((size_t)c->p[1] << 24) |
                       ((size_t)c->p[2] << 16) |
                       ((size_t)c->p[3] << 8) |
                       (size_t)c->p[4];
            mp_cur_consume(c,5);
            mp_decode_to_lua_array(L,c,l);
        }
        break;
    case 0xde:  /* map 16 */
        mp_cur_need(c,3);
        {
            size_t l = (c->p[1] << 8) | c->p[2];
            mp_cur_consume(c,3);
            mp_decode_to_lua_hash(L,c,l);
        }
        break;
    case 0xdf:  /* map 32 */
        mp_cur_need(c,5);
        {
            size_t l = ((size_t)c->p[1] << 24) |
                       ((size_t)c->p[2] << 16) |
                       ((size_t)c->p[3] << 8) |
                       (size_t)c->p[4];
            mp_cur_consume(c,5);
            mp_decode_to_lua_hash(L,c,l);
        }
        break;
    default:    /* types that can't be idenitified by first byte value. */
        if ((c->p[0] & 0x80) == 0) {   /* positive fixnum */
            lua_pushunsigned(L,c->p[0]);
            mp_cur_consume(c,1);
        } else if ((c->p[0] & 0xe0) == 0xe0) {  /* negative fixnum */
            lua_pushinteger(L,(signed char)c->p[0]);
            mp_cur_consume(c,1);
        } else if ((c->p[0] & 0xe0) == 0xa0) {  /* fix raw */
            size_t l = c->p[0] & 0x1f;
            mp_cur_need(c,1+l);
            lua_pushlstring(L,(char*)c->p+1,l);
            mp_cur_consume(c,1+l);
        } else if ((c->p[0] & 0xf0) == 0x90) {  /* fix map */
            size_t l = c->p[0] & 0xf;
            mp_cur_consume(c,1);
            mp_decode_to_lua_array(L,c,l);
        } else if ((c->p[0] & 0xf0) == 0x80) {  /* fix map */
            size_t l = c->p[0] & 0xf;
            mp_cur_consume(c,1);
            mp_decode_to_lua_hash(L,c,l);
        } else {
            c->err = MP_CUR_ERROR_BADFMT;
        }
    }
}

int mp_unpack_full(lua_State *L, int limit, int offset) {
    size_t len;
    const char *s;
    mp_cur c;
    int cnt; /* Number of objects unpacked */
    int decode_all = (!limit && !offset);

    s = luaL_checklstring(L,1,&len); /* if no match, exits */

    if (offset < 0 || limit < 0) /* requesting negative off or lim is invalid */
        return luaL_error(L,
            "Invalid request to unpack with offset of %d and limit of %d.",
            offset, len);
    else if (offset > len)
        return luaL_error(L,
            "Start offset %d greater than input length %d.", offset, len);

    if (decode_all) limit = INT_MAX;

    mp_cur_init(&c,(const unsigned char *)s+offset,len-offset);

    /* We loop over the decode because this could be a stream
     * of multiple top-level values serialized together */
    for(cnt = 0; c.left > 0 && cnt < limit; cnt++) {
        mp_decode_to_lua_type(L,&c);

        if (c.err == MP_CUR_ERROR_EOF) {
            return luaL_error(L,"Missing bytes in input.");
        } else if (c.err == MP_CUR_ERROR_BADFMT) {
            return luaL_error(L,"Bad data format in input.");
        }
    }

    if (!decode_all) {
        /* c->left is the remaining size of the input buffer.
         * subtract the entire buffer size from the unprocessed size
         * to get our next start offset */
        int offset = len - c.left;

        luaL_checkstack(L, 1, "in function mp_unpack_full");

        /* Return offset -1 when we have have processed the entire buffer. */
        lua_pushinteger(L, c.left == 0 ? -1 : offset);
        /* Results are returned with the arg elements still
         * in place. Lua takes care of only returning
         * elements above the args for us.
         * In this case, we have one arg on the stack
         * for this function, so we insert our first return
         * value at position 2. */
        lua_insert(L, 2);
        cnt += 1; /* increase return count by one to make room for offset */
    }

    return cnt;
}

int mp_unpack(lua_State *L) {
    return mp_unpack_full(L, 0, 0);
}

int mp_unpack_one(lua_State *L) {
    int offset = luaL_optinteger(L, 2, 0);
    /* Variable pop because offset may not exist */
    lua_pop(L, lua_gettop(L)-1);
    return mp_unpack_full(L, 1, offset);
}

int mp_unpack_limit(lua_State *L) {
    int limit = luaL_checkinteger(L, 2);
    int offset = luaL_optinteger(L, 3, 0);
    /* Variable pop because offset may not exist */
    lua_pop(L, lua_gettop(L)-1);

    return mp_unpack_full(L, limit, offset);
}

int mp_safe(lua_State *L) {
    int argc, err, total_results;

    argc = lua_gettop(L);

    /* This adds our function to the bottom of the stack
     * (the "call this function" position) */
    lua_pushvalue(L, lua_upvalueindex(1));
    lua_insert(L, 1);

    err = lua_pcall(L, argc, LUA_MULTRET, 0);
    total_results = lua_gettop(L);

    if (!err) {
        return total_results;
    } else {
        lua_pushnil(L);
        lua_insert(L,-2);
        return 2;
    }
}

/* -------------------------------------------------------------------------- */
const struct luaL_Reg cmds[] = {
    {"pack", mp_pack},
    {"unpack", mp_unpack},
    {"unpack_one", mp_unpack_one},
    {"unpack_limit", mp_unpack_limit},
    {0}
};

int luaopen_create(lua_State *L) {
    int i;
    /* Manually construct our module table instead of
     * relying on _register or _newlib */
    lua_newtable(L);

    for (i = 0; i < (sizeof(cmds)/sizeof(*cmds) - 1); i++) {
        lua_pushcfunction(L, cmds[i].func);
        lua_setfield(L, -2, cmds[i].name);
    }

    /* Add metadata */
    lua_pushliteral(L, LUACMSGPACK_NAME);
    lua_setfield(L, -2, "_NAME");
    lua_pushliteral(L, LUACMSGPACK_VERSION);
    lua_setfield(L, -2, "_VERSION");
    lua_pushliteral(L, LUACMSGPACK_COPYRIGHT);
    lua_setfield(L, -2, "_COPYRIGHT");
    lua_pushliteral(L, LUACMSGPACK_DESCRIPTION);
    lua_setfield(L, -2, "_DESCRIPTION");
    return 1;
}

LUALIB_API int luaopen_cmsgpack(lua_State *L) {
    luaopen_create(L);

    lua_pushvalue(L, -1);
    lua_setglobal(L, LUACMSGPACK_NAME);

    return 1;
}

LUALIB_API int luaopen_cmsgpack_safe(lua_State *L) {
    int i;

    luaopen_cmsgpack(L);

    /* Wrap all functions in the safe handler */
    for (i = 0; i < (sizeof(cmds)/sizeof(*cmds) - 1); i++) {
        lua_getfield(L, -1, cmds[i].name);
        lua_pushcclosure(L, mp_safe, 1);
        lua_setfield(L, -2, cmds[i].name);
    }

#if LUA_VERSION_NUM < 502
    /* Register name globally for 5.1 */
    lua_pushvalue(L, -1);
    lua_setglobal(L, LUACMSGPACK_SAFE_NAME);
#endif

    return 1;
}

/******************************************************************************
* Copyright (C) 2012 Salvatore Sanfilippo.  All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/

================================================
FILE: src/redis/lua/struct/lua_struct.c
================================================
/*
** {======================================================
** Library for packing/unpacking structures.
** $Id: struct.c,v 1.7 2018/05/11 22:04:31 roberto Exp $
** See Copyright Notice at the end of this file
** =======================================================
*/
/*
** Valid formats:
** > - big endian
** < - little endian
** ![num] - alignment
** x - pading
** b/B - signed/unsigned byte
** h/H - signed/unsigned short
** l/L - signed/unsigned long
** T   - size_t
** i/In - signed/unsigned integer with size 'n' (default is size of int)
** cn - sequence of 'n' chars (from/to a string); when packing, n==0 means
        the whole string; when unpacking, n==0 means use the previous
        read number as the string length
** s - zero-terminated string
** f - float
** d - double
** ' ' - ignored
*/


#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <string.h>


#include "lua.h"
#include "lauxlib.h"


/* basic integer type */
#if !defined(STRUCT_INT)
#define STRUCT_INT	long
#endif

typedef STRUCT_INT Inttype;

/* corresponding unsigned version */
typedef unsigned STRUCT_INT Uinttype;


/* maximum size (in bytes) for integral types */
#define MAXINTSIZE	32

/* is 'x' a power of 2? */
#define isp2(x)		((x) > 0 && ((x) & ((x) - 1)) == 0)

/* dummy structure to get alignment requirements */
struct cD {
  char c;
  double d;
};


#define PADDING		(sizeof(struct cD) - sizeof(double))
#define MAXALIGN  	(PADDING > sizeof(int) ? PADDING : sizeof(int))


/* endian options */
#define BIG	0
#define LITTLE	1


static union {
  int dummy;
  char endian;
} const native = {1};


typedef struct Header {
  int endian;
  int align;
} Header;


static int getnum (lua_State *L, const char **fmt, int df) {
  if (!isdigit(**fmt))  /* no number? */
    return df;  /* return default value */
  else {
    int a = 0;
    do {
      if (a > (INT_MAX / 10) || a * 10 > (INT_MAX - (**fmt - '0')))
        luaL_error(L, "integral size overflow");
      a = a*10 + *((*fmt)++) - '0';
    } while (isdigit(**fmt));
    return a;
  }
}

#define defaultoptions(h)	((h)->endian = native.endian, (h)->align = 1)


static size_t optsize (lua_State *L, char opt, const char **fmt) {
  switch (opt) {
    case 'B': case 'b': return sizeof(char);
    case 'H': case 'h': return sizeof(short);
    case 'L': case 'l': return sizeof(long);
    case 'T': return sizeof(size_t);
    case 'f':  return sizeof(float);
    case 'd':  return sizeof(double);
    case 'x': return 1;
    case 'c': return  getnum(L, fmt, 1);
    case 'i': case 'I': {
      int sz = getnum(L, fmt, sizeof(int));
      if (sz > MAXINTSIZE)
        luaL_error(L, "integral size %d is larger than limit of %d",
                       sz, MAXINTSIZE);
      return sz;
    }
    default: return 0;  /* other cases do not need alignment */
  }
}


/*
** return number of bytes needed to align an element of size 'size'
** at current position 'len'
*/
static int gettoalign (size_t len, Header *h, int opt, size_t size) {
  if (size == 0 || opt == 'c') return 0;
  if (size > (size_t)h->align)
    size = h->align;  /* respect max. alignment */
  return (size - (len & (size - 1))) & (size - 1);
}


/*
** options to control endianess and alignment
*/
static void controloptions (lua_State *L, int opt, const char **fmt,
                            Header *h) {
  switch (opt) {
    case  ' ': return;  /* ignore white spaces */
    case '>': h->endian = BIG; return;
    case '<': h->endian = LITTLE; return;
    case '!': {
      int a = getnum(L, fmt, MAXALIGN);
      if (!isp2(a))
        luaL_error(L, "alignment %d is not a power of 2", a);
      h->align = a;
      return;
    }
    default: {
      const char *msg = lua_pushfstring(L, "invalid format option '%c'", opt);
      luaL_argerror(L, 1, msg);
    }
  }
}


static void putinteger (lua_State *L, luaL_Buffer *b, int arg, int endian,
                        int size) {
  lua_Number n = luaL_checknumber(L, arg);
  Uinttype value;
  char buff[MAXINTSIZE];
  if (n < 0)
    value = (Uinttype)(Inttype)n;
  else
    value = (Uinttype)n;
  if (endian == LITTLE) {
    int i;
    for (i = 0; i < size; i++) {
      buff[i] = (value & 0xff);
      value >>= 8;
    }
  }
  else {
    int i;
    for (i = size - 1; i >= 0; i--) {
      buff[i] = (value & 0xff);
      value >>= 8;
    }
  }
  luaL_addlstring(b, buff, size);
}


static void correctbytes (char *b, int size, int endian) {
  if (endian != native.endian) {
    int i = 0;
    while (i < --size) {
      char temp = b[i];
      b[i++] = b[size];
      b[size] = temp;
    }
  }
}


static int b_pack (lua_State *L) {
  luaL_Buffer b;
  const char *fmt = luaL_checkstring(L, 1);
  Header h;
  int arg = 2;
  size_t totalsize = 0;
  defaultoptions(&h);
  lua_pushnil(L);  /* mark to separate arguments from string buffer */
  luaL_buffinit(L, &b);
  while (*fmt != '\0') {
    int opt = *fmt++;
    size_t size = optsize(L, opt, &fmt);
    int toalign = gettoalign(totalsize, &h, opt, size);
    totalsize += toalign;
    while (toalign-- > 0) luaL_addchar(&b, '\0');
    switch (opt) {
      case 'b': case 'B': case 'h': case 'H':
      case 'l': case 'L': case 'T': case 'i': case 'I': {  /* integer types */
        putinteger(L, &b, arg++, h.endian, size);
        break;
      }
      case 'x': {
        luaL_addchar(&b, '\0');
        break;
      }
      case 'f': {
        float f = (float)luaL_checknumber(L, arg++);
        correctbytes((char *)&f, size, h.endian);
        luaL_addlstring(&b, (char *)&f, size);
        break;
      }
      case 'd': {
        double d = luaL_checknumber(L, arg++);
        correctbytes((char *)&d, size, h.endian);
        luaL_addlstring(&b, (char *)&d, size);
        break;
      }
      case 'c': case 's': {
        size_t l;
        const char *s = luaL_checklstring(L, arg++, &l);
        if (size == 0) size = l;
        luaL_argcheck(L, l >= (size_t)size, arg, "string too short");
        luaL_addlstring(&b, s, size);
        if (opt == 's') {
          luaL_addchar(&b, '\0');  /* add zero at the end */
          size++;
        }
        break;
      }
      default: controloptions(L, opt, &fmt, &h);
    }
    totalsize += size;
  }
  luaL_pushresult(&b);
  return 1;
}


static lua_Number getinteger (const char *buff, int endian,
                        int issigned, int size) {
  Uinttype l = 0;
  int i;
  if (endian == BIG) {
    for (i = 0; i < size; i++) {
      l <<= 8;
      l |= (Uinttype)(unsigned char)buff[i];
    }
  }
  else {
    for (i = size - 1; i >= 0; i--) {
      l <<= 8;
      l |= (Uinttype)(unsigned char)buff[i];
    }
  }
  if (!issigned)
    return (lua_Number)l;
  else {  /* signed format */
    Uinttype mask = (Uinttype)(~((Uinttype)0)) << (size*8 - 1);
    if (l & mask)  /* negative value? */
      l |= mask;  /* signal extension */
    return (lua_Number)(Inttype)l;
  }
}


static int b_unpack (lua_State *L) {
  Header h;
  const char *fmt = luaL_checkstring(L, 1);
  size_t ld;
  const char *data = luaL_checklstring(L, 2, &ld);
  size_t pos = luaL_optinteger(L, 3, 1);
  luaL_argcheck(L, pos > 0, 3, "offset must be 1 or greater");
  pos--; /* Lua indexes are 1-based, but here we want 0-based for C
          * pointer math. */
  int n = 0;  /* number of results */
  defaultoptions(&h);
  while (*fmt) {
    int opt = *fmt++;
    size_t size = optsize(L, opt, &fmt);
    pos += gettoalign(pos, &h, opt, size);
    luaL_argcheck(L, size <= ld && pos <= ld - size,
                   2, "data string too short");
    /* stack space for item + next position */
    luaL_checkstack(L, 2, "too many results");
    switch (opt) {
      case 'b': case 'B': case 'h': case 'H':
      case 'l': case 'L': case 'T': case 'i':  case 'I': {  /* integer types */
        int issigned = islower(opt);
        lua_Number res = getinteger(data+pos, h.endian, issigned, size);
        lua_pushnumber(L, res); n++;
        break;
      }
      case 'x': {
        break;
      }
      case 'f': {
        float f;
        memcpy(&f, data+pos, size);
        correctbytes((char *)&f, sizeof(f), h.endian);
        lua_pushnumber(L, f); n++;
        break;
      }
      case 'd': {
        double d;
        memcpy(&d, data+pos, size);
        correctbytes((char *)&d, sizeof(d), h.endian);
        lua_pushnumber(L, d); n++;
        break;
      }
      case 'c': {
        if (size == 0) {
          if (n == 0 || !lua_isnumber(L, -1))
            luaL_error(L, "format 'c0' needs a previous size");
          size = lua_tonumber(L, -1);
          lua_pop(L, 1); n--;
          luaL_argcheck(L, size <= ld && pos <= ld - size,
                           2, "data string too short");
        }
        lua_pushlstring(L, data+pos, size); n++;
        break;
      }
      case 's': {
        const char *e = (const char *)memchr(data+pos, '\0', ld - pos);
        if (e == NULL)
          luaL_error(L, "unfinished string in data");
        size = (e - (data+pos)) + 1;
        lua_pushlstring(L, data+pos, size - 1); n++;
        break;
      }
      default: controloptions(L, opt, &fmt, &h);
    }
    pos += size;
  }
  lua_pushinteger(L, pos + 1);  /* next position */
  return n + 1;
}


static int b_size (lua_State *L) {
  Header h;
  const char *fmt = luaL_checkstring(L, 1);
  size_t pos = 0;
  defaultoptions(&h);
  while (*fmt) {
    int opt = *fmt++;
    size_t size = optsize(L, opt, &fmt);
    pos += gettoalign(pos, &h, opt, size);
    if (opt == 's')
      luaL_argerror(L, 1, "option 's' has no fixed size");
    else if (opt == 'c' && size == 0)
      luaL_argerror(L, 1, "option 'c0' has no fixed size");
    if (!isalnum(opt))
      controloptions(L, opt, &fmt, &h);
    pos += size;
  }
  lua_pushinteger(L, pos);
  return 1;
}

/* }====================================================== */


static const struct luaL_Reg thislib[] = {
  {"pack", b_pack},
  {"unpack", b_unpack},
  {"size", b_size},
  {NULL, NULL}
};


LUALIB_API int luaopen_struct (lua_State *L);

LUALIB_API int luaopen_struct (lua_State *L) {
  luaL_newlib(L, thislib);
  lua_setglobal(L, "struct");
  return 1;
}


/******************************************************************************
* Copyright (C) 2010-2018 Lua.org, PUC-Rio.  All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/


================================================
FILE: src/redis/lzf.h
================================================
/*
 * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
 *
 * Redistribution and use in source and binary forms, with or without modifica-
 * tion, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
 * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
 * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * the GNU General Public License ("GPL") version 2 or any later version,
 * in which case the provisions of the GPL are applicable instead of
 * the above. If you wish to allow the use of your version of this file
 * only under the terms of the GPL and not to allow others to use your
 * version of this file under the BSD license, indicate your decision
 * by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL. If you do not delete the
 * provisions above, a recipient may use your version of this file under
 * either the BSD or the GPL.
 */

#ifndef LZF_H
#define LZF_H

/***********************************************************************
**
**	lzf -- an extremely fast/free compression/decompression-method
**	http://liblzf.plan9.de/
**
**	This algorithm is believed to be patent-free.
**
***********************************************************************/

#define LZF_VERSION 0x0105 /* 1.5, API version */

/*
 * Compress in_len bytes stored at the memory block starting at
 * in_data and write the result to out_data, up to a maximum length
 * of out_len bytes.
 *
 * If the output buffer is not large enough or any error occurs return 0,
 * otherwise return the number of bytes used, which might be considerably
 * more than in_len (but less than 104% of the original size), so it
 * makes sense to always use out_len == in_len - 1), to ensure _some_
 * compression, and store the data uncompressed otherwise (with a flag, of
 * course.
 *
 * lzf_compress might use different algorithms on different systems and
 * even different runs, thus might result in different compressed strings
 * depending on the phase of the moon or similar factors. However, all
 * these strings are architecture-independent and will result in the
 * original data when decompressed using lzf_decompress.
 *
 * The buffers must not be overlapping.
 *
 * If the option LZF_STATE_ARG is enabled, an extra argument must be
 * supplied which is not reflected in this header file. Refer to lzfP.h
 * and lzf_c.c.
 *
 */
size_t
lzf_compress (const void *const in_data,  size_t in_len,
              void             *out_data, size_t out_len
#if LZF_STATE_ARG
      , LZF_STATE htab
#endif
              );

/*
 * Decompress data compressed with some version of the lzf_compress
 * function and stored at location in_data and length in_len. The result
 * will be stored at out_data up to a maximum of out_len characters.
 *
 * If the output buffer is not large enough to hold the decompressed
 * data, a 0 is returned and errno is set to E2BIG. Otherwise the number
 * of decompressed bytes (i.e. the original length of the data) is
 * returned.
 *
 * If an error in the compressed data is detected, a zero is returned and
 * errno is set to EINVAL.
 *
 * This function is very fast, about as fast as a copying loop.
 */
size_t
lzf_decompress (const void *const in_data,  size_t in_len,
                void             *out_data, size_t out_len);

#endif


================================================
FILE: src/redis/lzfP.h
================================================
/*
 * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de>
 *
 * Redistribution and use in source and binary forms, with or without modifica-
 * tion, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
 * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
 * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * the GNU General Public License ("GPL") version 2 or any later version,
 * in which case the provisions of the GPL are applicable instead of
 * the above. If you wish to allow the use of your version of this file
 * only under the terms of the GPL and not to allow others to use your
 * version of this file under the BSD license, indicate your decision
 * by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL. If you do not delete the
 * provisions above, a recipient may use your version of this file under
 * either the BSD or the GPL.
 */

#ifndef LZFP_h
#define LZFP_h

// ROMAN: #define STANDALONE 1 /* at the moment, this is ok. */

/*  ROMAN: Moved below since it depends on LZF_STATE
#ifndef STANDALONE
# include "lzf.h"
#endif

*/

/*
 * Size of hashtable is (1 << HLOG) * sizeof (char *)
 * decompression is independent of the hash table size
 * the difference between 15 and 14 is very small
 * for small blocks (and 14 is usually a bit faster).
 * For a low-memory/faster configuration, use HLOG == 13;
 * For best compression, use 15 or 16 (or more, up to 22).
 */
#ifndef HLOG
# define HLOG 16
#endif

/*
 * Sacrifice very little compression quality in favour of compression speed.
 * This gives almost the same compression as the default code, and is
 * (very roughly) 15% faster. This is the preferred mode of operation.
 */
#ifndef VERY_FAST
# define VERY_FAST 1
#endif

/*
 * Sacrifice some more compression quality in favour of compression speed.
 * (roughly 1-2% worse compression for large blocks and
 * 9-10% for small, redundant, blocks and >>20% better speed in both cases)
 * In short: when in need for speed, enable this for binary data,
 * possibly disable this for text data.
 */
#ifndef ULTRA_FAST
# define ULTRA_FAST 0
#endif

/*
 * Unconditionally aligning does not cost very much, so do it if unsure
 */
#ifndef STRICT_ALIGN
# if !(defined(__i386) || defined (__amd64))
#  define STRICT_ALIGN 1
# else
#  define STRICT_ALIGN 0
# endif
#endif

/*
 * You may choose to pre-set the hash table (might be faster on some
 * modern cpus and large (>>64k) blocks, and also makes compression
 * deterministic/repeatable when the configuration otherwise is the same).
 */
#ifndef INIT_HTAB
# define INIT_HTAB 0
#endif

/*
 * Avoid assigning values to errno variable? for some embedding purposes
 * (linux kernel for example), this is necessary. NOTE: this breaks
 * the documentation in lzf.h. Avoiding errno has no speed impact.
 */
#ifndef AVOID_ERRNO
# define AVOID_ERRNO 0
#endif

/*
 * Whether to pass the LZF_STATE variable as argument, or allocate it
 * on the stack. For small-stack environments, define this to 1.
 * NOTE: this breaks the prototype in lzf.h.
 */
#ifndef LZF_STATE_ARG
# define LZF_STATE_ARG 1   // ROMAN
#endif

/*
 * Whether to add extra checks for input validity in lzf_decompress
 * and return EINVAL if the input stream has been corrupted. This
 * only shields against overflowing the input buffer and will not
 * detect most corrupted streams.
 * This check is not normally noticeable on modern hardware
 * (<1% slowdown), but might slow down older cpus considerably.
 */
#ifndef CHECK_INPUT
# define CHECK_INPUT 1
#endif

/*
 * Whether to store pointers or offsets inside the hash table. On
 * 64 bit architectures, pointers take up twice as much space,
 * and might also be slower. Default is to autodetect.
 * Notice: Don't set this value to 1, it will result in 'LZF_HSLOT'
 * not being able to store offset above UINT32_MAX in 64bit. */
#define LZF_USE_OFFSETS 0

/*****************************************************************************/
/* nothing should be changed below */

#ifdef __cplusplus
# include <cstring>
# include <climits>
using namespace std;
#else
# include <string.h>
# include <limits.h>
#endif

#ifndef LZF_USE_OFFSETS
# if defined (WIN32)
#  define LZF_USE_OFFSETS defined(_M_X64)
# else
#  if __cplusplus > 199711L
#   include <cstdint>
#  else
#   include <stdint.h>
#  endif
#  define LZF_USE_OFFSETS (UINTPTR_MAX > 0xffffffffU)
# endif
#endif

typedef unsigned char u8;

#if LZF_USE_OFFSETS
# define LZF_HSLOT_BIAS ((const u8 *)in_data)
  typedef unsigned int LZF_HSLOT;
#else
# define LZF_HSLOT_BIAS 0
  typedef const u8 *LZF_HSLOT;
#endif

typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];

// ROMAN: moved here deliberately because we depend on LZF_STATE.
#ifndef STANDALONE
# include "lzf.h"
#endif

#if !STRICT_ALIGN
/* for unaligned accesses we need a 16 bit datatype. */
# if USHRT_MAX == 65535
    typedef unsigned short u16;
# elif UINT_MAX == 65535
    typedef unsigned int u16;
# else
#  undef STRICT_ALIGN
#  define STRICT_ALIGN 1
# endif
#endif

#if ULTRA_FAST
# undef VERY_FAST
#endif

#endif


================================================
FILE: src/redis/lzf_c.c
================================================
/*
 * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
 *
 * Redistribution and use in source and binary forms, with or without modifica-
 * tion, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
 * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
 * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * the GNU General Public License ("GPL") version 2 or any later version,
 * in which case the provisions of the GPL are applicable instead of
 * the above. If you wish to allow the use of your version of this file
 * only under the terms of the GPL and not to allow others to use your
 * version of this file under the BSD license, indicate your decision
 * by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL. If you do not delete the
 * provisions above, a recipient may use your version of this file under
 * either the BSD or the GPL.
 */

#include "lzfP.h"

#define HSIZE (1 << (HLOG))

/*
 * don't play with this unless you benchmark!
 * the data format is not dependent on the hash function.
 * the hash function might seem strange, just believe me,
 * it works ;)
 */
#ifndef FRST
# define FRST(p) (((p[0]) << 8) | p[1])
# define NEXT(v,p) (((v) << 8) | p[2])
# if ULTRA_FAST
#  define IDX(h) ((( h             >> (3*8 - HLOG)) - h  ) & (HSIZE - 1))
# elif VERY_FAST
#  define IDX(h) ((( h             >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
# else
#  define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
# endif
#endif
/*
 * IDX works because it is very similar to a multiplicative hash, e.g.
 * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1))
 * the latter is also quite fast on newer CPUs, and compresses similarly.
 *
 * the next one is also quite good, albeit slow ;)
 * (int)(cos(h & 0xffffff) * 1e6)
 */

#if 0
/* original lzv-like hash function, much worse and thus slower */
# define FRST(p) (p[0] << 5) ^ p[1]
# define NEXT(v,p) ((v) << 5) ^ p[2]
# define IDX(h) ((h) & (HSIZE - 1))
#endif

#define        MAX_LIT        (1 <<  5)
#define        MAX_OFF        (1 << 13)
#define        MAX_REF        ((1 << 8) + (1 << 3))

#if __GNUC__ >= 3
# define expect(expr,value)         __builtin_expect ((expr),(value))
# define inline                     inline
#else
# define expect(expr,value)         (expr)
# define inline                     static
#endif

#define expect_false(expr) expect ((expr) != 0, 0)
#define expect_true(expr)  expect ((expr) != 0, 1)

#if defined(__has_attribute)
# if __has_attribute(no_sanitize)
#  define NO_SANITIZE(sanitizer) __attribute__((no_sanitize(sanitizer)))
# endif
#endif

#if !defined(NO_SANITIZE)
# define NO_SANITIZE(sanitizer)
#endif

/*
 * compressed format
 *
 * 000LLLLL <L+1>    ; literal, L+1=1..33 octets
 * LLLooooo oooooooo ; backref L+1=1..7 octets, o+1=1..4096 offset
 * 111ooooo LLLLLLLL oooooooo ; backref L+8 octets, o+1=1..4096 offset
 *
 */
NO_SANITIZE("alignment")
size_t
lzf_compress (const void *const in_data, size_t in_len,
	      void *out_data, size_t out_len
#if LZF_STATE_ARG
              , LZF_STATE htab
#endif
              )
{
#if !LZF_STATE_ARG
  LZF_STATE htab;
#endif
  const u8 *ip = (const u8 *)in_data;
        u8 *op = (u8 *)out_data;
  const u8 *in_end  = ip + in_len;
        u8 *out_end = op + out_len;
  const u8 *ref;

  /* off requires a type wide enough to hold a general pointer difference.
   * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only
   * works for differences within a single object). We also assume that no
   * no bit pattern traps. Since the only platform that is both non-POSIX
   * and fails to support both assumptions is windows 64 bit, we make a
   * special workaround for it.
   */
#if defined (WIN32) && defined (_M_X64)
  unsigned _int64 off; /* workaround for missing POSIX compliance */
#else
  size_t off;
#endif
  unsigned int hval;
  int lit;

  if (!in_len || !out_len)
    return 0;

#if INIT_HTAB
  memset (htab, 0, sizeof (htab));
#endif

  lit = 0; op++; /* start run */

  hval = FRST (ip);
  while (ip < in_end - 2)
    {
      LZF_HSLOT *hslot;

      hval = NEXT (hval, ip);
      hslot = htab + IDX (hval);
      ref = *hslot ? (*hslot + LZF_HSLOT_BIAS) : NULL; /* avoid applying zero offset to null pointer */
      *hslot = ip - LZF_HSLOT_BIAS;

      if (1
#if INIT_HTAB
          && ref < ip /* the next test will actually take care of this, but this is faster */
#endif
          && (off = ip - ref - 1) < MAX_OFF
          && ref > (u8 *)in_data
          && ref[2] == ip[2]
#if STRICT_ALIGN
          && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0])
#else
          && *(u16 *)ref == *(u16 *)ip
#endif
        )
        {
          /* match found at *ref++ */
          unsigned int len = 2;
          size_t maxlen = in_end - ip - len;
          maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;

          if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */
            if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */
              return 0;

          op [- lit - 1] = lit - 1; /* stop run */
          op -= !lit; /* undo run if length is zero */

          for (;;)
            {
              if (expect_true (maxlen > 16))
                {
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;

                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;

                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;

                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                  len++; if (ref [len] != ip [len]) break;
                }

              do
                len++;
              while (len < maxlen && ref[len] == ip[len]);

              break;
            }

          len -= 2; /* len is now #octets - 1 */
          ip++;

          if (len < 7)
            {
              *op++ = (off >> 8) + (len << 5);
            }
          else
            {
              *op++ = (off >> 8) + (  7 << 5);
              *op++ = len - 7;
            }

          *op++ = off;

          lit = 0; op++; /* start run */

          ip += len + 1;

          if (expect_false (ip >= in_end - 2))
            break;

#if ULTRA_FAST || VERY_FAST
          --ip;
# if VERY_FAST && !ULTRA_FAST
          --ip;
# endif
          hval = FRST (ip);

          hval = NEXT (hval, ip);
          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
          ip++;

# if VERY_FAST && !ULTRA_FAST
          hval = NEXT (hval, ip);
          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
          ip++;
# endif
#else
          ip -= len + 1;

          do
            {
              hval = NEXT (hval, ip);
              htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
              ip++;
            }
          while (len--);
#endif
        }
      else
        {
          /* one more literal byte we must copy */
          if (expect_false (op >= out_end))
            return 0;

          lit++; *op++ = *ip++;

          if (expect_false (lit == MAX_LIT))
            {
              op [- lit - 1] = lit - 1; /* stop run */
              lit = 0; op++; /* start run */
            }
        }
    }

  if (op + 3 > out_end) /* at most 3 bytes can be missing here */
    return 0;

  while (ip < in_end)
    {
      lit++; *op++ = *ip++;

      if (expect_false (lit == MAX_LIT))
        {
          op [- lit - 1] = lit - 1; /* stop run */
          lit = 0; op++; /* start run */
        }
    }

  op [- lit - 1] = lit - 1; /* end run */
  op -= !lit; /* undo run if length is zero */

  return op - (u8 *)out_data;
}


================================================
FILE: src/redis/lzf_d.c
================================================
/*
 * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
 *
 * Redistribution and use in source and binary forms, with or without modifica-
 * tion, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
 * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
 * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * the GNU General Public License ("GPL") version 2 or any later version,
 * in which case the provisions of the GPL are applicable instead of
 * the above. If you wish to allow the use of your version of this file
 * only under the terms of the GPL and not to allow others to use your
 * version of this file under the BSD license, indicate your decision
 * by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL. If you do not delete the
 * provisions above, a recipient may use your version of this file under
 * either the BSD or the GPL.
 */

#include "lzfP.h"

#if AVOID_ERRNO
# define SET_ERRNO(n)
#else
# include <errno.h>
# define SET_ERRNO(n) errno = (n)
#endif

#if USE_REP_MOVSB /* small win on amd, big loss on intel */
#if (__i386 || __amd64) && __GNUC__ >= 3
# define lzf_movsb(dst, src, len)                \
   asm ("rep movsb"                              \
        : "=D" (dst), "=S" (src), "=c" (len)     \
        :  "0" (dst),  "1" (src),  "2" (len));
#endif
#endif

#if defined(__GNUC__) && __GNUC__ >= 7
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#endif
size_t
lzf_decompress (const void *const in_data,  size_t in_len,
                void             *out_data, size_t out_len)
{
  u8 const *ip = (const u8 *)in_data;
  u8       *op = (u8 *)out_data;
  u8 const *const in_end  = ip + in_len;
  u8       *const out_end = op + out_len;

  while (ip < in_end)
    {
      unsigned int ctrl;
      ctrl = *ip++;

      if (ctrl < (1 << 5)) /* literal run */
        {
          ctrl++;

          if (op + ctrl > out_end)
            {
              SET_ERRNO (E2BIG);
              return 0;
            }

#if CHECK_INPUT
          if (ip + ctrl > in_end)
            {
              SET_ERRNO (EINVAL);
              return 0;
            }
#endif

#ifdef lzf_movsb
          lzf_movsb (op, ip, ctrl);
#else
          switch (ctrl)
            {
              case 32: *op++ = *ip++; case 31: *op++ = *ip++; case 30: *op++ = *ip++; case 29: *op++ = *ip++;
              case 28: *op++ = *ip++; case 27: *op++ = *ip++; case 26: *op++ = *ip++; case 25: *op++ = *ip++;
              case 24: *op++ = *ip++; case 23: *op++ = *ip++; case 22: *op++ = *ip++; case 21: *op++ = *ip++;
              case 20: *op++ = *ip++; case 19: *op++ = *ip++; case 18: *op++ = *ip++; case 17: *op++ = *ip++;
              case 16: *op++ = *ip++; case 15: *op++ = *ip++; case 14: *op++ = *ip++; case 13: *op++ = *ip++;
              case 12: *op++ = *ip++; case 11: *op++ = *ip++; case 10: *op++ = *ip++; case  9: *op++ = *ip++;
              case  8: *op++ = *ip++; case  7: *op++ = *ip++; case  6: *op++ = *ip++; case  5: *op++ = *ip++;
              case  4: *op++ = *ip++; case  3: *op++ = *ip++; case  2: *op++ = *ip++; case  1: *op++ = *ip++;
            }
#endif
        }
      else /* back reference */
        {
          unsigned int len = ctrl >> 5;

          u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;

#if CHECK_INPUT
          if (ip >= in_end)
            {
              SET_ERRNO (EINVAL);
              return 0;
            }
#endif
          if (len == 7)
            {
              len += *ip++;
#if CHECK_INPUT
              if (ip >= in_end)
                {
                  SET_ERRNO (EINVAL);
                  return 0;
                }
#endif
            }

          ref -= *ip++;

          if (op + len + 2 > out_end)
            {
              SET_ERRNO (E2BIG);
              return 0;
            }

          if (ref < (u8 *)out_data)
            {
              SET_ERRNO (EINVAL);
              return 0;
            }

#ifdef lzf_movsb
          len += 2;
          lzf_movsb (op, ref, len);
#else
          switch (len)
            {
              default:
                len += 2;

                if (op >= ref + len)
                  {
                    /* disjunct areas */
                    memcpy (op, ref, len);
                    op += len;
                  }
                else
                  {
                    /* overlapping, use octte by octte copying */
                    do
                      *op++ = *ref++;
                    while (--len);
                  }

                break;

              case 9: *op++ = *ref++; /* fall-thru */
              case 8: *op++ = *ref++; /* fall-thru */
              case 7: *op++ = *ref++; /* fall-thru */
              case 6: *op++ = *ref++; /* fall-thru */
              case 5: *op++ = *ref++; /* fall-thru */
              case 4: *op++ = *ref++; /* fall-thru */
              case 3: *op++ = *ref++; /* fall-thru */
              case 2: *op++ = *ref++; /* fall-thru */
              case 1: *op++ = *ref++; /* fall-thru */
              case 0: *op++ = *ref++; /* two octets more */
                      *op++ = *ref++; /* fall-thru */
            }
#endif
        }
    }

  return op - (u8 *)out_data;
}
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic pop
#endif


================================================
FILE: src/redis/rax.c
================================================
/* Rax -- A radix tree implementation.
 *
 * Version 1.2 -- 7 February 2019
 *
 * Copyright (c) 2017-2019, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>
#include "rax.h"


#ifndef RAX_MALLOC_INCLUDE
#define RAX_MALLOC_INCLUDE "rax_malloc.h"
#endif

#include RAX_MALLOC_INCLUDE

/* -------------------------------- Debugging ------------------------------ */

void raxDebugShowNode(const char *msg, raxNode *n);

/* Turn debugging messages on/off by compiling with RAX_DEBUG_MSG macro on.
 * When RAX_DEBUG_MSG is defined by default Rax operations will emit a lot
 * of debugging info to the standard output, however you can still turn
 * debugging on/off in order to enable it only when you suspect there is an
 * operation causing a bug using the function raxSetDebugMsg(). */
#ifdef RAX_DEBUG_MSG
#define debugf(...)                                          \
    if (raxDebugMsg) {                                       \
        printf("%s:%s:%d:\t", __FILE__, __func__, __LINE__); \
        printf(__VA_ARGS__);                                 \
        fflush(stdout);                                      \
    }

#define debugnode(msg, n) raxDebugShowNode(msg, n)
#else
#define debugf(...)
#define debugnode(msg, n)
#endif

/* By default log debug info if RAX_DEBUG_MSG is defined. */
static int raxDebugMsg = 1;

/* When debug messages are enabled, turn them on/off dynamically. By
 * default they are enabled. Set the state to 0 to disable, and 1 to
 * re-enable. */
void raxSetDebugMsg(int onoff) {
    raxDebugMsg = onoff;
}

/* ------------------------- raxStack functions --------------------------
 * The raxStack is a simple stack of pointers that is capable of switching
 * from using a stack-allocated array to dynamic heap once a given number of
 * items are reached. It is used in order to retain the list of parent nodes
 * while walking the radix tree in order to implement certain operations that
 * need to navigate the tree upward.
 * ------------------------------------------------------------------------- */

/* Initialize the stack. */
static inline void raxStackInit(raxStack *ts) {
    ts->stack = ts->static_items;
    ts->items = 0;
    ts->maxitems = RAX_STACK_STATIC_ITEMS;
    ts->oom = 0;
}

/* Push an item into the stack, returns 1 on success, 0 on out of memory. */
static inline int raxStackPush(raxStack *ts, void *ptr) {
    if (ts->items == ts->maxitems) {
        if (ts->stack == ts->static_items) {
            ts->stack = rax_malloc(sizeof(void *) * ts->maxitems * 2);
            if (ts->stack == NULL) {
                ts->stack = ts->static_items;
                ts->oom = 1;
                errno = ENOMEM;
                return 0;
            }
            memcpy(ts->stack, ts->static_items, sizeof(void *) * ts->maxitems);
        } else {
            void **newalloc = rax_realloc(ts->stack, sizeof(void *) * ts->maxitems * 2);
            if (newalloc == NULL) {
                ts->oom = 1;
                errno = ENOMEM;
                return 0;
            }
            ts->stack = newalloc;
        }
        ts->maxitems *= 2;
    }
    ts->stack[ts->items] = ptr;
    ts->items++;
    return 1;
}

/* Pop an item from the stack, the function returns NULL if there are no
 * items to pop. */
static inline void *raxStackPop(raxStack *ts) {
    if (ts->items == 0) return NULL;
    ts->items--;
    return ts->stack[ts->items];
}

/* Return the stack item at the top of the stack without actually consuming
 * it. */
static inline void *raxStackPeek(raxStack *ts) {
    if (ts->items == 0) return NULL;
    return ts->stack[ts->items - 1];
}

/* Free the stack in case we used heap allocation. */
static inline void raxStackFree(raxStack *ts) {
    if (ts->stack != ts->static_items) rax_free(ts->stack);
}

/* ----------------------------------------------------------------------------
 * Radix tree implementation
 * --------------------------------------------------------------------------*/

/* Return the padding needed in the characters section of a node having size
 * 'nodesize'. The padding is needed to store the child pointers to aligned
 * addresses. Note that we add 4 to the node size because the node has a four
 * bytes header. */
#define raxPadding(nodesize) ((sizeof(void *) - (((nodesize) + 4) % sizeof(void *))) & (sizeof(void *) - 1))

/* Return the pointer to the last child pointer in a node. For the compressed
 * nodes this is the only child pointer. */
#define raxNodeLastChildPtr(n)                                                  \
    ((raxNode **)(((char *)(n)) + raxNodeCurrentLength(n) - sizeof(raxNode *) - \
                  (((n)->iskey && !(n)->isnull) ? sizeof(void *) : 0)))

/* Return the pointer to the first child pointer. */
#define raxNodeFirstChildPtr(n) ((raxNode **)((n)->data + (n)->size + raxPadding((n)->size)))

/* Return the current total size of the node. Note that the second line
 * computes the padding after the string of characters, needed in order to
 * save pointers to aligned addresses. */
#define raxNodeCurrentLength(n)                                           \
    (sizeof(raxNode) + (n)->size + raxPadding((n)->size) +                \
     ((n)->iscompr ? sizeof(raxNode *) : sizeof(raxNode *) * (n)->size) + \
     (((n)->iskey && !(n)->isnull) * sizeof(void *)))

/* Allocate a new non compressed node with the specified number of children.
 * If datafield is true, the allocation is made large enough to hold the
 * associated data pointer.
 * Returns the new node pointer. On out of memory NULL is returned. */
raxNode *raxNewNode(size_t children, int datafield) {
    size_t nodesize = sizeof(raxNode) + children + raxPadding(children) + sizeof(raxNode *) * children;
    if (datafield) nodesize += sizeof(void *);
    raxNode *node = rax_malloc(nodesize);
    if (node == NULL) return NULL;
    node->iskey = 0;
    node->isnull = 0;
    node->iscompr = 0;
    node->size = children;
    return node;
}

/* Allocate a new rax and return its pointer. On out of memory the function
 * returns NULL. */
rax *raxNew(void) {
    rax *rax = rax_malloc(sizeof(*rax));
    if (rax == NULL) return NULL;
    rax->numele = 0;
    rax->numnodes = 1;
    rax->head = raxNewNode(0, 0);
    if (rax->head == NULL) {
        rax_free(rax);
        return NULL;
    } else {
        rax->alloc_size = rax_ptr_alloc_size(rax) + rax_ptr_alloc_size(rax->head);
        return rax;
    }
}

/* realloc the node to make room for auxiliary data in order
 * to store an item in that node. On out of memory NULL is returned. */
raxNode *raxReallocForData(raxNode *n, void *data) {
    if (data == NULL) return n; /* No reallocation needed, setting isnull=1 */
    size_t curlen = raxNodeCurrentLength(n);
    return rax_realloc(n, curlen + sizeof(void *));
}

/* Set the node auxiliary data to the specified pointer. */
void raxSetData(raxNode *n, void *data) {
    n->iskey = 1;
    if (data != NULL) {
        n->isnull = 0;
        void **ndata = (void **)((char *)n + raxNodeCurrentLength(n) - sizeof(void *));
        memcpy(ndata, &data, sizeof(data));
    } else {
        n->isnull = 1;
    }
}

/* Get the node auxiliary data. */
void *raxGetData(raxNode *n) {
    if (n->isnull) return NULL;
    void **ndata = (void **)((char *)n + raxNodeCurrentLength(n) - sizeof(void *));
    void *data;
    memcpy(&data, ndata, sizeof(data));
    return data;
}

/* Add a new child to the node 'n' representing the character 'c' and return
 * its new pointer, as well as the child pointer by reference. Additionally
 * '***parentlink' is populated with the raxNode pointer-to-pointer of where
 * the new child was stored, which is useful for the caller to replace the
 * child pointer if it gets reallocated.
 *
 * On success the new parent node pointer is returned (it may change because
 * of the realloc, so the caller should discard 'n' and use the new value).
 * On out of memory NULL is returned, and the old node is still valid. */
raxNode *raxAddChild(raxNode *n, unsigned char c, raxNode **childptr, raxNode ***parentlink) {
    assert(n->iscompr == 0);

    size_t curlen = raxNodeCurrentLength(n);
    n->size++;
    size_t newlen = raxNodeCurrentLength(n);
    n->size--; /* For now restore the original size. We'll update it only on
                  success at the end. */

    /* Alloc the new child we will link to 'n'. */
    raxNode *child = raxNewNode(0, 0);
    if (child == NULL) return NULL;

    /* Make space in the original node. */
    raxNode *newn = rax_realloc(n, newlen);
    if (newn == NULL) {
        rax_free(child);
        return NULL;
    }
    n = newn;

    /* After the reallocation, we have up to 8/16 (depending on the system
     * pointer size, and the required node padding) bytes at the end, that is,
     * the additional char in the 'data' section, plus one pointer to the new
     * child, plus the padding needed in order to store addresses into aligned
     * locations.
     *
     * So if we start with the following node, having "abde" edges.
     *
     * Note:
     * - We assume 4 bytes pointer for simplicity.
     * - Each space below corresponds to one byte
     *
     * [HDR*][abde][Aptr][Bptr][Dptr][Eptr]|AUXP|
     *
     * After the reallocation we need: 1 byte for the new edge character
     * plus 4 bytes for a new child pointer (assuming 32 bit machine).
     * However after adding 1 byte to the edge char, the header + the edge
     * characters are no longer aligned, so we also need 3 bytes of padding.
     * In total the reallocation will add 1+4+3 bytes = 8 bytes:
     *
     * (Blank bytes are represented by ".")
     *
     * [HDR*][abde][Aptr][Bptr][Dptr][Eptr]|AUXP|[....][....]
     *
     * Let's find where to insert the new child in order to make sure
     * it is inserted in-place lexicographically. Assuming we are adding
     * a child "c" in our case pos will be = 2 after the end of the following
     * loop. */
    int pos;
    for (pos = 0; pos < n->size; pos++) {
        if (n->data[pos] > c) break;
    }

    /* Now, if present, move auxiliary data pointer at the end
     * so that we can mess with the other data without overwriting it.
     * We will obtain something like that:
     *
     * [HDR*][abde][Aptr][Bptr][Dptr][Eptr][....][....]|AUXP|
     */
    unsigned char *src, *dst;
    if (n->iskey && !n->isnull) {
        src = ((unsigned char *)n + curlen - sizeof(void *));
        dst = ((unsigned char *)n + newlen - sizeof(void *));
        memmove(dst, src, sizeof(void *));
    }

    /* Compute the "shift", that is, how many bytes we need to move the
     * pointers section forward because of the addition of the new child
     * byte in the string section. Note that if we had no padding, that
     * would be always "1", since we are adding a single byte in the string
     * section of the node (where now there is "abde" basically).
     *
     * However we have padding, so it could be zero, or up to 8.
     *
     * Another way to think at the shift is, how many bytes we need to
     * move child pointers forward *other than* the obvious sizeof(void*)
     * needed for the additional pointer itself. */
    size_t shift = newlen - curlen - sizeof(void *);

    /* We said we are adding a node with edge 'c'. The insertion
     * point is between 'b' and 'd', so the 'pos' variable value is
     * the index of the first child pointer that we need to move forward
     * to make space for our new pointer.
     *
     * To start, move all the child pointers after the insertion point
     * of shift+sizeof(pointer) bytes on the right, to obtain:
     *
     * [HDR*][abde][Aptr][Bptr][....][....][Dptr][Eptr]|AUXP|
     */
    src = n->data + n->size + raxPadding(n->size) + sizeof(raxNode *) * pos;
    memmove(src + shift + sizeof(raxNode *), src, sizeof(raxNode *) * (n->size - pos));

    /* Move the pointers to the left of the insertion position as well. Often
     * we don't need to do anything if there was already some padding to use. In
     * that case the final destination of the pointers will be the same, however
     * in our example there was no pre-existing padding, so we added one byte
     * plus three bytes of padding. After the next memmove() things will look
     * like that:
     *
     * [HDR*][abde][....][Aptr][Bptr][....][Dptr][Eptr]|AUXP|
     */
    if (shift) {
        src = (unsigned char *)raxNodeFirstChildPtr(n);
        memmove(src + shift, src, sizeof(raxNode *) * pos);
    }

    /* Now make the space for the additional char in the data section,
     * but also move the pointers before the insertion point to the right
     * by shift bytes, in order to obtain the following:
     *
     * [HDR*][ab.d][e...][Aptr][Bptr][....][Dptr][Eptr]|AUXP|
     */
    src = n->data + pos;
    memmove(src + 1, src, n->size - pos);

    /* We can now set the character and its child node pointer to get:
     *
     * [HDR*][abcd][e...][Aptr][Bptr][....][Dptr][Eptr]|AUXP|
     * [HDR*][abcd][e...][Aptr][Bptr][Cptr][Dptr][Eptr]|AUXP|
     */
    n->data[pos] = c;
    n->size++;
    src = (unsigned char *)raxNodeFirstChildPtr(n);
    raxNode **childfield = (raxNode **)(src + sizeof(raxNode *) * pos);
    memcpy(childfield, &child, sizeof(child));
    *childptr = child;
    *parentlink = childfield;
    return n;
}

/* Turn the node 'n', that must be a node without any children, into a
 * compressed node representing a set of nodes linked one after the other
 * and having exactly one child each. The node can be a key or not: this
 * property and the associated value if any will be preserved.
 *
 * The function also returns a child node, since the last node of the
 * compressed chain cannot be part of the chain: it has zero children while
 * we can only compress inner nodes with exactly one child each. */
raxNode *raxCompressNode(raxNode *n, unsigned char *s, size_t len, raxNode **child) {
    assert(n->size == 0 && n->iscompr == 0);
    void *data = NULL; /* Initialized only to avoid warnings. */
    size_t newsize;

    debugf("Compress node: %.*s\n", (int)len, s);

    /* Allocate the child to link to this node. */
    *child = raxNewNode(0, 0);
    if (*child == NULL) return NULL;

    /* Make space in the parent node. */
    newsize = sizeof(raxNode) + len + raxPadding(len) + sizeof(raxNode *);
    if (n->iskey) {
        data = raxGetData(n); /* To restore it later. */
        if (!n->isnull) newsize += sizeof(void *);
    }
    raxNode *newn = rax_realloc(n, newsize);
    if (newn == NULL) {
        rax_free(*child);
        return NULL;
    }
    n = newn;

    n->iscompr = 1;
    n->size = len;
    memcpy(n->data, s, len);
    if (n->iskey) raxSetData(n, data);
    raxNode **childfield = raxNodeLastChildPtr(n);
    memcpy(childfield, child, sizeof(*child));
    return n;
}

/* Low level function that walks the tree looking for the string
 * 's' of 'len' bytes. The function returns the number of characters
 * of the key that was possible to process: if the returned integer
 * is the same as 'len', then it means that the node corresponding to the
 * string was found (however it may not be a key in case the node->iskey is
 * zero or if simply we stopped in the middle of a compressed node, so that
 * 'splitpos' is non zero).
 *
 * Otherwise if the returned integer is not the same as 'len', there was an
 * early stop during the tree walk because of a character mismatch.
 *
 * The node where the search ended (because the full string was processed
 * or because there was an early stop) is returned by reference as
 * '*stopnode' if the passed pointer is not NULL. This node link in the
 * parent's node is returned as '*plink' if not NULL. Finally, if the
 * search stopped in a compressed node, '*splitpos' returns the index
 * inside the compressed node where the search ended. This is useful to
 * know where to split the node for insertion.
 *
 * Note that when we stop in the middle of a compressed node with
 * a perfect match, this function will return a length equal to the
 * 'len' argument (all the key matched), and will return a *splitpos which is
 * always positive (that will represent the index of the character immediately
 * *after* the last match in the current compressed node).
 *
 * When instead we stop at a compressed node and *splitpos is zero, it
 * means that the current node represents the key (that is, none of the
 * compressed node characters are needed to represent the key, just all
 * its parents nodes). */
static inline size_t
raxLowWalk(rax *rax, unsigned char *s, size_t len, raxNode **stopnode, raxNode ***plink, int *splitpos, raxStack *ts) {
    raxNode *h = rax->head;
    raxNode **parentlink = &rax->head;

    size_t i = 0; /* Position in the string. */
    size_t j = 0; /* Position in the node children (or bytes if compressed).*/
    while (h->size && i < len) {
        debugnode("Lookup current node", h);
        unsigned char *v = h->data;

        if (h->iscompr) {
            for (j = 0; j < h->size && i < len; j++, i++) {
                if (v[j] != s[i]) break;
            }
            if (j != h->size) break;
        } else {
            /* Even when h->size is large, linear scan provides good
             * performances compared to other approaches that are in theory
             * more sounding, like performing a binary search. */
            for (j = 0; j < h->size; j++) {
                if (v[j] == s[i]) break;
            }
            if (j == h->size) break;
            i++;
        }

        if (ts) raxStackPush(ts, h); /* Save stack of parent nodes. */
        raxNode **children = raxNodeFirstChildPtr(h);
        if (h->iscompr) j = 0; /* Compressed node only child is at index 0. */
        memcpy(&h, children + j, sizeof(h));
        parentlink = children + j;
        j = 0; /* If the new node is non compressed and we do not
                  iterate again (since i == len) set the split
                  position to 0 to signal this node represents
                  the searched key. */
    }
    debugnode("Lookup stop node is", h);
    if (stopnode) *stopnode = h;
    if (plink) *plink = parentlink;
    if (splitpos && h->iscompr) *splitpos = j;
    return i;
}

/* Insert the element 's' of size 'len', setting as auxiliary data
 * the pointer 'data'. If the element is already present, the associated
 * data is updated (only if 'overwrite' is set to 1), and 0 is returned,
 * otherwise the element is inserted and 1 is returned. On out of memory the
 * function returns 0 as well but sets errno to ENOMEM, otherwise errno will
 * be set to 0.
 */
int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old, int overwrite) {
    size_t i;
    int j = 0; /* Split position. If raxLowWalk() stops in a compressed
                  node, the index 'j' represents the char we stopped within the
                  compressed node, that is, the position where to split the
                  node for insertion. */
    raxNode *h, **parentlink;

    debugf("### Insert %.*s with value %p\n", (int)len, s, data);
    i = raxLowWalk(rax, s, len, &h, &parentlink, &j, NULL);

    /* If i == len we walked following the whole string. If we are not
     * in the middle of a compressed node, the string is either already
     * inserted or this middle node is currently not a key, but can represent
     * our key. We have just to reallocate the node and make space for the
     * data pointer. */
    if (i == len && (!h->iscompr || j == 0 /* not in the middle if j is 0 */)) {
        debugf("### Insert: node representing key exists\n");
        /* Make space for the value pointer if needed. */
        if (!h->iskey || (h->isnull && overwrite)) {
            size_t oldalloc = rax_ptr_alloc_size(h);
            h = raxReallocForData(h, data);
            if (h) {
                memcpy(parentlink, &h, sizeof(h));
                rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h);
            }
        }
        if (h == NULL) {
            errno = ENOMEM;
            return 0;
        }

        /* Update the existing key if there is already one. */
        if (h->iskey) {
            if (old) *old = raxGetData(h);
            if (overwrite) raxSetData(h, data);
            errno = 0;
            return 0; /* Element already exists. */
        }

        /* Otherwise set the node as a key. Note that raxSetData()
         * will set h->iskey. */
        raxSetData(h, data);
        rax->numele++;
        return 1; /* Element inserted. */
    }

    /* If the node we stopped at is a compressed node, we need to
     * split it before to continue.
     *
     * Splitting a compressed node have a few possible cases.
     * Imagine that the node 'h' we are currently at is a compressed
     * node containing the string "ANNIBALE" (it means that it represents
     * nodes A -> N -> N -> I -> B -> A -> L -> E with the only child
     * pointer of this node pointing at the 'E' node, because remember that
     * we have characters at the edges of the graph, not inside the nodes
     * themselves.
     *
     * In order to show a real case imagine our node to also point to
     * another compressed node, that finally points at the node without
     * children, representing 'O':
     *
     *     "ANNIBALE" -> "SCO" -> []
     *
     * When inserting we may face the following cases. Note that all the cases
     * require the insertion of a non compressed node with exactly two
     * children, except for the last case which just requires splitting a
     * compressed node.
     *
     * 1) Inserting "ANNIENTARE"
     *
     *               |B| -> "ALE" -> "SCO" -> []
     *     "ANNI" -> |-|
     *               |E| -> (... continue algo ...) "NTARE" -> []
     *
     * 2) Inserting "ANNIBALI"
     *
     *                  |E| -> "SCO" -> []
     *     "ANNIBAL" -> |-|
     *                  |I| -> (... continue algo ...) []
     *
     * 3) Inserting "AGO" (Like case 1, but set iscompr = 0 into original node)
     *
     *            |N| -> "NIBALE" -> "SCO" -> []
     *     |A| -> |-|
     *            |G| -> (... continue algo ...) |O| -> []
     *
     * 4) Inserting "CIAO"
     *
     *     |A| -> "NNIBALE" -> "SCO" -> []
     *     |-|
     *     |C| -> (... continue algo ...) "IAO" -> []
     *
     * 5) Inserting "ANNI"
     *
     *     "ANNI" -> "BALE" -> "SCO" -> []
     *
     * The final algorithm for insertion covering all the above cases is as
     * follows.
     *
     * ============================= ALGO 1 =============================
     *
     * For the above cases 1 to 4, that is, all cases where we stopped in
     * the middle of a compressed node for a character mismatch, do:
     *
     * Let $SPLITPOS be the zero-based index at which, in the
     * compressed node array of characters, we found the mismatching
     * character. For example if the node contains "ANNIBALE" and we add
     * "ANNIENTARE" the $SPLITPOS is 4, that is, the index at which the
     * mismatching character is found.
     *
     * 1. Save the current compressed node $NEXT pointer (the pointer to the
     *    child element, that is always present in compressed nodes).
     *
     * 2. Create "split node" having as child the non common letter
     *    at the compressed node. The other non common letter (at the key)
     *    will be added later as we continue the normal insertion algorithm
     *    at step "6".
     *
     * 3a. IF $SPLITPOS == 0:
     *     Replace the old node with the split node, by copying the auxiliary
     *     data if any. Fix parent's reference. Free old node eventually
     *     (we still need its data for the next steps of the algorithm).
     *
     * 3b. IF $SPLITPOS != 0:
     *     Trim the compressed node (reallocating it as well) in order to
     *     contain $splitpos characters. Change child pointer in order to link
     *     to the split node. If new compressed node len is just 1, set
     *     iscompr to 0 (layout is the same). Fix parent's reference.
     *
     * 4a. IF the postfix len (the length of the remaining string of the
     *     original compressed node after the split character) is non zero,
     *     create a "postfix node". If the postfix node has just one character
     *     set iscompr to 0, otherwise iscompr to 1. Set the postfix node
     *     child pointer to $NEXT.
     *
     * 4b. IF the postfix len is zero, just use $NEXT as postfix pointer.
     *
     * 5. Set child[0] of split node to postfix node.
     *
     * 6. Set the split node as the current node, set current index at child[1]
     *    and continue insertion algorithm as usually.
     *
     * ============================= ALGO 2 =============================
     *
     * For case 5, that is, if we stopped in the middle of a compressed
     * node but no mismatch was found, do:
     *
     * Let $SPLITPOS be the zero-based index at which, in the
     * compressed node array of characters, we stopped iterating because
     * there were no more keys character to match. So in the example of
     * the node "ANNIBALE", adding the string "ANNI", the $SPLITPOS is 4.
     *
     * 1. Save the current compressed node $NEXT pointer (the pointer to the
     *    child element, that is always present in compressed nodes).
     *
     * 2. Create a "postfix node" containing all the characters from $SPLITPOS
     *    to the end. Use $NEXT as the postfix node child pointer.
     *    If the postfix node length is 1, set iscompr to 0.
     *    Set the node as a key with the associated value of the new
     *    inserted key.
     *
     * 3. Trim the current node to contain the first $SPLITPOS characters.
     *    As usually if the new node length is just 1, set iscompr to 0.
     *    Take the iskey / associated value as it was in the original node.
     *    Fix the parent's reference.
     *
     * 4. Set the postfix node as the only child pointer of the trimmed
     *    node created at step 1.
     */

    /* ------------------------- ALGORITHM 1 --------------------------- */
    if (h->iscompr && i != len) {
        debugf("ALGO 1: Stopped at compressed node %.*s (%p)\n", h->size, h->data, (void *)h);
        debugf("Still to insert: %.*s\n", (int)(len - i), s + i);
        debugf("Splitting at %d: '%c'\n", j, ((char *)h->data)[j]);
        debugf("Other (key) letter is '%c'\n", s[i]);

        /* 1: Save next pointer. */
        raxNode **childfield = raxNodeLastChildPtr(h);
        raxNode *next;
        memcpy(&next, childfield, sizeof(next));
        debugf("Next is %p\n", (void *)next);
        debugf("iskey %d\n", h->iskey);
        if (h->iskey) {
            debugf("key value is %p\n", raxGetData(h));
        }

        /* Set the length of the additional nodes we will need. */
        size_t trimmedlen = j;
        size_t postfixlen = h->size - j - 1;
        int split_node_is_key = !trimmedlen && h->iskey && !h->isnull;
        size_t nodesize;

        /* 2: Create the split node. Also allocate the other nodes we'll need
         *    ASAP, so that it will be simpler to handle OOM. */
        raxNode *splitnode = raxNewNode(1, split_node_is_key);
        raxNode *trimmed = NULL;
        raxNode *postfix = NULL;

        if (trimmedlen) {
            nodesize = sizeof(raxNode) + trimmedlen + raxPadding(trimmedlen) + sizeof(raxNode *);
            if (h->iskey && !h->isnull) nodesize += sizeof(void *);
            trimmed = rax_malloc(nodesize);
        }

        if (postfixlen) {
            nodesize = sizeof(raxNode) + postfixlen + raxPadding(postfixlen) + sizeof(raxNode *);
            postfix = rax_malloc(nodesize);
        }

        /* OOM? Abort now that the tree is untouched. */
        if (splitnode == NULL || (trimmedlen && trimmed == NULL) || (postfixlen && postfix == NULL)) {
            rax_free(splitnode);
            rax_free(trimmed);
            rax_free(postfix);
            errno = ENOMEM;
            return 0;
        }
        splitnode->data[0] = h->data[j];
        rax->alloc_size += rax_ptr_alloc_size(splitnode);

        if (j == 0) {
            /* 3a: Replace the old node with the split node. */
            if (h->iskey) {
                void *ndata = raxGetData(h);
                raxSetData(splitnode, ndata);
            }
            memcpy(parentlink, &splitnode, sizeof(splitnode));
        } else {
            /* 3b: Trim the compressed node. */
            trimmed->size = j;
            memcpy(trimmed->data, h->data, j);
            trimmed->iscompr = j > 1 ? 1 : 0;
            trimmed->iskey = h->iskey;
            trimmed->isnull = h->isnull;
            if (h->iskey && !h->isnull) {
                void *ndata = raxGetData(h);
                raxSetData(trimmed, ndata);
            }
            raxNode **cp = raxNodeLastChildPtr(trimmed);
            memcpy(cp, &splitnode, sizeof(splitnode));
            memcpy(parentlink, &trimmed, sizeof(trimmed));
            parentlink = cp; /* Set parentlink to splitnode parent. */
            rax->numnodes++;
            rax->alloc_size += rax_ptr_alloc_size(trimmed);
        }

        /* 4: Create the postfix node: what remains of the original
         * compressed node after the split. */
        if (postfixlen) {
            /* 4a: create a postfix node. */
            postfix->iskey = 0;
            postfix->isnull = 0;
            postfix->size = postfixlen;
            postfix->iscompr = postfixlen > 1;
            memcpy(postfix->data, h->data + j + 1, postfixlen);
            raxNode **cp = raxNodeLastChildPtr(postfix);
            memcpy(cp, &next, sizeof(next));
            rax->numnodes++;
            rax->alloc_size += rax_ptr_alloc_size(postfix);
        } else {
            /* 4b: just use next as postfix node. */
            postfix = next;
        }

        /* 5: Set splitnode first child as the postfix node. */
        raxNode **splitchild = raxNodeLastChildPtr(splitnode);
        memcpy(splitchild, &postfix, sizeof(postfix));

        /* 6. Continue insertion: this will cause the splitnode to
         * get a new child (the non common character at the currently
         * inserted key). */
        rax->alloc_size -= rax_ptr_alloc_size(h);
        rax_free(h);
        h = splitnode;
    } else if (h->iscompr && i == len) {
        /* ------------------------- ALGORITHM 2 --------------------------- */
        debugf("ALGO 2: Stopped at compressed node %.*s (%p) j = %d\n", h->size, h->data, (void *)h, j);

        /* Allocate postfix & trimmed nodes ASAP to fail for OOM gracefully. */
        size_t postfixlen = h->size - j;
        size_t nodesize = sizeof(raxNode) + postfixlen + raxPadding(postfixlen) + sizeof(raxNode *);
        if (data != NULL) nodesize += sizeof(void *);
        raxNode *postfix = rax_malloc(nodesize);

        nodesize = sizeof(raxNode) + j + raxPadding(j) + sizeof(raxNode *);
        if (h->iskey && !h->isnull) nodesize += sizeof(void *);
        raxNode *trimmed = rax_malloc(nodesize);

        if (postfix == NULL || trimmed == NULL) {
            rax_free(postfix);
            rax_free(trimmed);
            errno = ENOMEM;
            return 0;
        }

        /* 1: Save next pointer. */
        raxNode **childfield = raxNodeLastChildPtr(h);
        raxNode *next;
        memcpy(&next, childfield, sizeof(next));

        /* 2: Create the postfix node. */
        postfix->size = postfixlen;
        postfix->iscompr = postfixlen > 1;
        postfix->iskey = 1;
        postfix->isnull = 0;
        memcpy(postfix->data, h->data + j, postfixlen);
        raxSetData(postfix, data);
        raxNode **cp = raxNodeLastChildPtr(postfix);
        memcpy(cp, &next, sizeof(next));
        rax->numnodes++;
        rax->alloc_size += rax_ptr_alloc_size(postfix);

        /* 3: Trim the compressed node. */
        trimmed->size = j;
        trimmed->iscompr = j > 1;
        trimmed->iskey = 0;
        trimmed->isnull = 0;
        memcpy(trimmed->data, h->data, j);
        memcpy(parentlink, &trimmed, sizeof(trimmed));
        if (h->iskey) {
            void *aux = raxGetData(h);
            raxSetData(trimmed, aux);
        }
        rax->alloc_size += rax_ptr_alloc_size(trimmed);

        /* Fix the trimmed node child pointer to point to
         * the postfix node. */
        cp = raxNodeLastChildPtr(trimmed);
        memcpy(cp, &postfix, sizeof(postfix));

        /* Finish! We don't need to continue with the insertion
         * algorithm for ALGO 2. The key is already inserted. */
        rax->numele++;
        rax->alloc_size -= rax_ptr_alloc_size(h);
        rax_free(h);
        return 1; /* Key inserted. */
    }

    /* We walked the radix tree as far as we could, but still there are left
     * chars in our string. We need to insert the missing nodes. */
    while (i < len) {
        raxNode *child;
        size_t oldalloc = rax_ptr_alloc_size(h);

        /* If this node is going to have a single child, and there
         * are other characters, so that that would result in a chain
         * of single-childed nodes, turn it into a compressed node. */
        if (h->size == 0 && len - i > 1) {
            debugf("Inserting compressed node\n");
            size_t comprsize = len - i;
            if (comprsize > RAX_NODE_MAX_SIZE) comprsize = RAX_NODE_MAX_SIZE;
            raxNode *newh = raxCompressNode(h, s + i, comprsize, &child);
            if (newh == NULL) goto oom;
            h = newh;
            memcpy(parentlink, &h, sizeof(h));
            parentlink = raxNodeLastChildPtr(h);
            i += comprsize;
        } else {
            debugf("Inserting normal node\n");
            raxNode **new_parentlink;
            raxNode *newh = raxAddChild(h, s[i], &child, &new_parentlink);
            if (newh == NULL) goto oom;
            h = newh;
            memcpy(parentlink, &h, sizeof(h));
            parentlink = new_parentlink;
            i++;
        }
        rax->numnodes++;
        rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h) + rax_ptr_alloc_size(child);
        h = child;
    }
    size_t oldalloc = rax_ptr_alloc_size(h);
    raxNode *newh = raxReallocForData(h, data);
    if (newh == NULL) goto oom;
    h = newh;
    if (!h->iskey) rax->numele++;
    raxSetData(h, data);
    memcpy(parentlink, &h, sizeof(h));
    rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h);
    return 1; /* Element inserted. */

oom:
    /* This code path handles out of memory after part of the sub-tree was
     * already modified. Set the node as a key, and then remove it. However we
     * do that only if the node is a terminal node, otherwise if the OOM
     * happened reallocating a node in the middle, we don't need to free
     * anything. */
    if (h->size == 0) {
        h->isnull = 1;
        h->iskey = 1;
        rax->numele++; /* Compensate the next remove. */
        checkedRaxRemove(rax, s, i, NULL);
    }
    errno = ENOMEM;
    return 0;
}

/* Overwriting insert. Just a wrapper for raxGenericInsert() that will
 * update the element if there is already one for the same key. */
int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old) {
    return raxGenericInsert(rax, s, len, data, old, 1);
}

/* Non overwriting insert function: if an element with the same key
 * exists, the value is not updated and the function returns 0.
 * This is just a wrapper for raxGenericInsert(). */
int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old) {
    return raxGenericInsert(rax, s, len, data, old, 0);
}

/* Find a key in the rax: return 1 if the item is found, 0 otherwise.
 * If there is an item and 'value' is passed in a non-NULL pointer,
 * the value associated with the item is set at that address. */
int raxFind(rax *rax, unsigned char *s, size_t len, void **value) {
    raxNode *h;

    debugf("### Lookup: %.*s\n", (int)len, s);
    int splitpos = 0;
    size_t i = raxLowWalk(rax, s, len, &h, NULL, &splitpos, NULL);
    if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) return 0;
    if (value != NULL) *value = raxGetData(h);
    return 1;
}

/* Return the memory address where the 'parent' node stores the specified
 * 'child' pointer, so that the caller can update the pointer with another
 * one if needed. The function assumes it will find a match, otherwise the
 * operation is an undefined behavior (it will continue scanning the
 * memory without any bound checking). */
raxNode **raxFindParentLink(raxNode *parent, raxNode *child) {
    raxNode **cp = raxNodeFirstChildPtr(parent);
    raxNode *c;
    while (1) {
        memcpy(&c, cp, sizeof(c));
        if (c == child) break;
        cp++;
    }
    return cp;
}

/* Low level child removal from node. The new node pointer (after the child
 * removal) is returned. Note that this function does not fix the pointer
 * of the parent node in its parent, so this task is up to the caller.
 * The function never fails for out of memory. */
raxNode *raxRemoveChild(raxNode *parent, raxNode *child) {
    debugnode("raxRemoveChild before", parent);
    /* If parent is a compressed node (having a single child, as for definition
     * of the data structure), the removal of the child consists into turning
     * it into a normal node without children. */
    if (parent->iscompr) {
        void *data = NULL;
        if (parent->iskey) data = raxGetData(parent);
        parent->isnull = 0;
        parent->iscompr = 0;
        parent->size = 0;
        if (parent->iskey) raxSetData(parent, data);
        debugnode("raxRemoveChild after", parent);
        return parent;
    }

    /* Otherwise we need to scan for the child pointer and memmove()
     * accordingly.
     *
     * 1. To start we seek the first element in both the children
     *    pointers and edge bytes in the node. */
    raxNode **cp = raxNodeFirstChildPtr(parent);
    raxNode **c = cp;
    unsigned char *e = parent->data;

    /* 2. Search the child pointer to remove inside the array of children
     *    pointers. */
    while (1) {
        raxNode *aux;
        memcpy(&aux, c, sizeof(aux));
        if (aux == child) break;
        c++;
        e++;
    }

    /* 3. Remove the edge and the pointer by memmoving the remaining children
     *    pointer and edge bytes one position before. */
    int taillen = parent->size - (e - parent->data) - 1;
    debugf("raxRemoveChild tail len: %d\n", taillen);
    memmove(e, e + 1, taillen);

    /* Compute the shift, that is the amount of bytes we should move our
     * child pointers to the left, since the removal of one edge character
     * and the corresponding padding change, may change the layout.
     * We just check if in the old version of the node there was at the
     * end just a single byte and all padding: in that case removing one char
     * will remove a whole sizeof(void*) word. */
    size_t shift = ((parent->size + 4) % sizeof(void *)) == 1 ? sizeof(void *) : 0;

    /* Move the children pointers before the deletion point. */
    if (shift) memmove(((char *)cp) - shift, cp, (parent->size - taillen - 1) * sizeof(raxNode **));

    /* Move the remaining "tail" pointers at the right position as well. */
    size_t valuelen = (parent->iskey && !parent->isnull) ? sizeof(void *) : 0;
    memmove(((char *)c) - shift, c + 1, taillen * sizeof(raxNode **) + valuelen);

    /* 4. Update size. */
    parent->size--;

    /* realloc the node according to the theoretical memory usage, to free
     * data if we are over-allocating right now. */
    raxNode *newnode = rax_realloc(parent, raxNodeCurrentLength(parent));
    if (newnode) {
        debugnode("raxRemoveChild after", newnode);
    }
    /* Note: if rax_realloc() fails we just return the old address, which
     * is valid. */
    return newnode ? newnode : parent;
}

/* Remove the specified item. Returns 1 if the item was found and
 * deleted, 0 otherwise. */
int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) {
    raxNode *h;
    raxStack ts;

    debugf("### Delete: %.*s\n", (int)len, s);
    raxStackInit(&ts);
    int splitpos = 0;
    size_t i = raxLowWalk(rax, s, len, &h, NULL, &splitpos, &ts);
    if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) {
        raxStackFree(&ts);
        return 0;
    }
    if (old) *old = raxGetData(h);
    h->iskey = 0;
    rax->numele--;

    /* If this node has no children, the deletion needs to reclaim the
     * no longer used nodes. This is an iterative process that needs to
     * walk the three upward, deleting all the nodes with just one child
     * that are not keys, until the head of the rax is reached or the first
     * node with more than one child is found. */

    int trycompress = 0; /* Will be set to 1 if we should try to optimize the
                            tree resulting from the deletion. */

    if (h->size == 0) {
        debugf("Key deleted in node without children. Cleanup needed.\n");
        raxNode *child = NULL;
        while (h != rax->head) {
            child = h;
            debugf("Freeing child %p [%.*s] key:%d\n", (void *)child, (int)child->size, (char *)child->data,
                   child->iskey);
            rax->alloc_size -= rax_ptr_alloc_size(child);
            rax_free(child);
            rax->numnodes--;
            h = raxStackPop(&ts);
            /* If this node has more then one child, or actually holds
             * a key, stop here. */
            if (h->iskey || (!h->iscompr && h->size != 1)) break;
        }
        if (child) {
            debugf("Unlinking child %p from parent %p\n", (void *)child, (void *)h);
            size_t oldalloc = rax_ptr_alloc_size(h);
            raxNode *new = raxRemoveChild(h, child);
            rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(new);
            if (new != h) {
                raxNode *parent = raxStackPeek(&ts);
                raxNode **parentlink;
                if (parent == NULL) {
                    parentlink = &rax->head;
                } else {
                    parentlink = raxFindParentLink(parent, h);
                }
                memcpy(parentlink, &new, sizeof(new));
            }

            /* If after the removal the node has just a single child
             * and is not a key, we need to try to compress it. */
            if (new->size == 1 && new->iskey == 0) {
                trycompress = 1;
                h = new;
            }
        }
    } else if (h->size == 1) {
        /* If the node had just one child, after the removal of the key
         * further compression with adjacent nodes is potentially possible. */
        trycompress = 1;
    }

    /* Don't try node compression if our nodes pointers stack is not
     * complete because of OOM while executing raxLowWalk() */
    if (trycompress && ts.oom) trycompress = 0;

    /* Recompression: if trycompress is true, 'h' points to a radix tree node
     * that changed in a way that could allow to compress nodes in this
     * sub-branch. Compressed nodes represent chains of nodes that are not
     * keys and have a single child, so there are two deletion events that
     * may alter the tree so that further compression is needed:
     *
     * 1) A node with a single child was a key and now no longer is a key.
     * 2) A node with two children now has just one child.
     *
     * We try to navigate upward till there are other nodes that can be
     * compressed, when we reach the upper node which is not a key and has
     * a single child, we scan the chain of children to collect the
     * compressible part of the tree, and replace the current node with the
     * new one, fixing the child pointer to reference the first non
     * compressible node.
     *
     * Example of case "1". A tree stores the keys "FOO" = 1 and
     * "FOOBAR" = 2:
     *
     *
     * "FOO" -> "BAR" -> [] (2)
     *           (1)
     *
     * After the removal of "FOO" the tree can be compressed as:
     *
     * "FOOBAR" -> [] (2)
     *
     *
     * Example of case "2". A tree stores the keys "FOOBAR" = 1 and
     * "FOOTER" = 2:
     *
     *          |B| -> "AR" -> [] (1)
     * "FOO" -> |-|
     *          |T| -> "ER" -> [] (2)
     *
     * After the removal of "FOOTER" the resulting tree is:
     *
     * "FOO" -> |B| -> "AR" -> [] (1)
     *
     * That can be compressed into:
     *
     * "FOOBAR" -> [] (1)
     */
    if (trycompress) {
        debugf("After removing %.*s:\n", (int)len, s);
        debugnode("Compression may be needed", h);
        debugf("Seek start node\n");

        /* Try to reach the upper node that is compressible.
         * At the end of the loop 'h' will point to the first node we
         * can try to compress and 'parent' to its parent. */
        raxNode *parent;
        while (1) {
            parent = raxStackPop(&ts);
            if (!parent || parent->iskey || (!parent->iscompr && parent->size != 1)) break;
            h = parent;
            debugnode("Going up to", h);
        }
        raxNode *start = h; /* Compression starting node. */

        /* Scan chain of nodes we can compress. */
        size_t comprsize = h->size;
        int nodes = 1;
        while (h->size != 0) {
            raxNode **cp = raxNodeLastChildPtr(h);
            memcpy(&h, cp, sizeof(h));
            if (h->iskey || (!h->iscompr && h->size != 1)) break;
            /* Stop here if going to the next node would result into
             * a compressed node larger than h->size can hold. */
            if (comprsize + h->size > RAX_NODE_MAX_SIZE) break;
            nodes++;
            comprsize += h->size;
        }
        if (nodes > 1) {
            /* If we can compress, create the new node and populate it. */
            size_t nodesize = sizeof(raxNode) + comprsize + raxPadding(comprsize) + sizeof(raxNode *);
            raxNode *new = rax_malloc(nodesize);
            /* An out of memory here just means we cannot optimize this
             * node, but the tree is left in a consistent state. */
            if (new == NULL) {
                raxStackFree(&ts);
                return 1;
            }
            new->iskey = 0;
            new->isnull = 0;
            new->iscompr = 1;
            new->size = comprsize;
            rax->numnodes++;
            rax->alloc_size += rax_ptr_alloc_size(new);

            /* Scan again, this time to populate the new node content and
             * to fix the new node child pointer. At the same time we free
             * all the nodes that we'll no longer use. */
            comprsize = 0;
            h = start;
            while (h->size != 0) {
                memcpy(new->data + comprsize, h->data, h->size);
                comprsize += h->size;
                raxNode **cp = raxNodeLastChildPtr(h);
                raxNode *tofree = h;
                memcpy(&h, cp, sizeof(h));
                rax->alloc_size -= rax_ptr_alloc_size(tofree);
                rax_free(tofree);
                rax->numnodes--;
                if (h->iskey || (!h->iscompr && h->size != 1)) break;
                if (comprsize + h->size > RAX_NODE_MAX_SIZE) break;
            }
            debugnode("New node", new);

            /* Now 'h' points to the first node that we still need to use,
             * so our new node child pointer will point to it. */
            raxNode **cp = raxNodeLastChildPtr(new);
            memcpy(cp, &h, sizeof(h));

            /* Fix parent link. */
            if (parent) {
                raxNode **parentlink = raxFindParentLink(parent, start);
                memcpy(parentlink, &new, sizeof(new));
            } else {
                rax->head = new;
            }

            debugf("Compressed %d nodes, %d total bytes\n", nodes, (int)comprsize);
        }
    }
    raxStackFree(&ts);
    return 1;
}

/* This is the core of raxFree(): performs a depth-first scan of the
 * tree and releases all the nodes found. */
void raxRecursiveFree(rax *rax, raxNode *n, void (*free_callback)(void*, void*), void* argument) {
    debugnode("free traversing",n);
    int numchildren = n->iscompr ? 1 : n->size;
    raxNode **cp = raxNodeLastChildPtr(n);
    while (numchildren--) {
        raxNode *child;
        memcpy(&child, cp, sizeof(child));
        raxRecursiveFree(rax,child,free_callback,argument);
        cp--;
    }
    debugnode("free depth-first", n);
    if (free_callback && n->iskey && !n->isnull) free_callback(raxGetData(n), argument);
    rax_free(n);
    rax->numnodes--;
}

/* Free the entire radix tree, invoking a free_callback function for each key's data. 
 * An additional argument is passed to the free_callback function.*/
 void raxFreeWithCallbackAndArgument(rax *rax, void (*free_callback)(void*, void*), void* argument) {
    raxRecursiveFree(rax,rax->head,free_callback, argument);
    assert(rax->numnodes == 0);
    rax_free(rax);
}

/* Wrapper for the callback to adapt it for the context */
void freeCallbackWrapper(void* data, void* argument) {
    if (!argument) {
        return;
    }
    void (*free_callback)(void*) = (void (*)(void*))argument;
    free_callback(data);
}

/* Free a whole radix tree, calling the specified callback in order to
 * free the auxiliary data. */
void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) {
    raxFreeWithCallbackAndArgument(rax, freeCallbackWrapper, (void*)free_callback);
}

/* Free a whole radix tree. */
void raxFree(rax *rax) {
    raxFreeWithCallback(rax, NULL);
}

/* ------------------------------- Iterator --------------------------------- */

/* Initialize a Rax iterator. This call should be performed a single time
 * to initialize the iterator, and must be followed by a raxSeek() call,
 * otherwise the raxPrev()/raxNext() functions will just return EOF. */
void raxStart(raxIterator *it, rax *rt) {
    it->flags = RAX_ITER_EOF; /* No crash if the iterator is not seeked. */
    it->rt = rt;
    it->key_len = 0;
    it->key = it->key_static_string;
    it->key_max = RAX_ITER_STATIC_LEN;
    it->data = NULL;
    it->node_cb = NULL;
    raxStackInit(&it->stack);
}

/* Append characters at the current key string of the iterator 'it'. This
 * is a low level function used to implement the iterator, not callable by
 * the user. Returns 0 on out of memory, otherwise 1 is returned. */
int raxIteratorAddChars(raxIterator *it, unsigned char *s, size_t len) {
    if (len == 0) return 1;
    if (it->key_max < it->key_len + len) {
        unsigned char *old = (it->key == it->key_static_string) ? NULL : it->key;
        size_t new_max = (it->key_len + len) * 2;
        it->key = rax_realloc(old, new_max);
        if (it->key == NULL) {
            it->key = (!old) ? it->key_static_string : old;
            errno = ENOMEM;
            return 0;
        }
        if (old == NULL) memcpy(it->key, it->key_static_string, it->key_len);
        it->key_max = new_max;
    }
    /* Use memmove since there could be an overlap between 's' and
     * it->key when we use the current key in order to re-seek. */
    memmove(it->key + it->key_len, s, len);
    it->key_len += len;
    return 1;
}

/* Remove the specified number of chars from the right of the current
 * iterator key. */
void raxIteratorDelChars(raxIterator *it, size_t count) {
    it->key_len -= count;
}

/* Do an iteration step towards the next element. At the end of the step the
 * iterator key will represent the (new) current key. If it is not possible
 * to step in the specified direction since there are no longer elements, the
 * iterator is flagged with RAX_ITER_EOF.
 *
 * If 'noup' is true the function starts directly scanning for the next
 * lexicographically smaller children, and the current node is already assumed
 * to be the parent of the last key node, so the first operation to go back to
 * the parent will be skipped. This option is used by raxSeek() when
 * implementing seeking a non existing element with the ">" or "<" options:
 * the starting node is not a key in that particular case, so we start the scan
 * from a node that does not represent the key set.
 *
 * The function returns 1 on success or 0 on out of memory. */
int raxIteratorNextStep(raxIterator *it, int noup) {
    if (it->flags & RAX_ITER_EOF) {
        return 1;
    } else if (it->flags & RAX_ITER_JUST_SEEKED) {
        it->flags &= ~RAX_ITER_JUST_SEEKED;
        return 1;
    }

    /* Save key len, stack items and the node where we are currently
     * so that on iterator EOF we can restore the current key and state. */
    size_t orig_key_len = it->key_len;
    size_t orig_stack_items = it->stack.items;
    raxNode *orig_node = it->node;

    while (1) {
        int children = it->node->iscompr ? 1 : it->node->size;
        if (!noup && children) {
            debugf("GO DEEPER\n");
            /* Seek the lexicographically smaller key in this subtree, which
             * is the first one found always going towards the first child
             * of every successive node. */
            if (!raxStackPush(&it->stack, it->node)) return 0;
            raxNode **cp = raxNodeFirstChildPtr(it->node);
            if (!raxIteratorAddChars(it, it->node->data, it->node->iscompr ? it->node->size : 1)) return 0;
            memcpy(&it->node, cp, sizeof(it->node));
            /* Call the node callback if any, and replace the node pointer
             * if the callback returns true. */
            if (it->node_cb && it->node_cb(&it->node)) memcpy(cp, &it->node, sizeof(it->node));
            /* For "next" step, stop every time we find a key along the
             * way, since the key is lexicographically smaller compared to
             * what follows in the sub-children. */
            if (it->node->iskey) {
                it->data = raxGetData(it->node);
                return 1;
            }
        } else {
            /* If we finished exploring the previous sub-tree, switch to the
             * new one: go upper until a node is found where there are
             * children representing keys lexicographically greater than the
             * current key. */
            while (1) {
                int old_noup = noup;

                /* Already on head? Can't go up, iteration finished. */
                if (!noup && it->node == it->rt->head) {
                    it->flags |= RAX_ITER_EOF;
                    it->stack.items = orig_stack_items;
                    it->key_len = orig_key_len;
                    it->node = orig_node;
                    return 1;
                }
                /* If there are no children at the current node, try parent's
                 * next child. */
                unsigned char prevchild = it->key[it->key_len - 1];
                if (!noup) {
                    it->node = raxStackPop(&it->stack);
                } else {
                    noup = 0;
                }
                /* Adjust the current key to represent the node we are
                 * at. */
                int todel = it->node->iscompr ? it->node->size : 1;
                raxIteratorDelChars(it, todel);

                /* Try visiting the next child if there was at least one
                 * additional child. */
                if (!it->node->iscompr && it->node->size > (old_noup ? 0 : 1)) {
                    raxNode **cp = raxNodeFirstChildPtr(it->node);
                    int i = 0;
                    while (i < it->node->size) {
                        debugf("SCAN NEXT %c\n", it->node->data[i]);
                        if (it->node->data[i] > prevchild) break;
                        i++;
                        cp++;
                    }
                    if (i != it->node->size) {
                        debugf("SCAN found a new node\n");
                        raxIteratorAddChars(it, it->node->data + i, 1);
                        if (!raxStackPush(&it->stack, it->node)) return 0;
                        memcpy(&it->node, cp, sizeof(it->node));
                        /* Call the node callback if any, and replace the node
                         * pointer if the callback returns true. */
                        if (it->node_cb && it->node_cb(&it->node)) memcpy(cp, &it->node, sizeof(it->node));
                        if (it->node->iskey) {
                            it->data = raxGetData(it->node);
                            return 1;
                        }
                        break;
                    }
                }
            }
        }
    }
}

/* Seek the greatest key in the subtree at the current node. Return 0 on
 * out of memory, otherwise 1. This is a helper function for different
 * iteration functions below. */
int raxSeekGreatest(raxIterator *it) {
    while (it->node->size) {
        if (it->node->iscompr) {
            if (!raxIteratorAddChars(it, it->node->data, it->node->size)) return 0;
        } else {
            if (!raxIteratorAddChars(it, it->node->data + it->node->size - 1, 1)) return 0;
        }
        raxNode **cp = raxNodeLastChildPtr(it->node);
        if (!raxStackPush(&it->stack, it->node)) return 0;
        memcpy(&it->node, cp, sizeof(it->node));
    }
    return 1;
}

/* Like raxIteratorNextStep() but implements an iteration step moving
 * to the lexicographically previous element. The 'noup' option has a similar
 * effect to the one of raxIteratorNextStep(). */
int raxIteratorPrevStep(raxIterator *it, int noup) {
    if (it->flags & RAX_ITER_EOF) {
        return 1;
    } else if (it->flags & RAX_ITER_JUST_SEEKED) {
        it->flags &= ~RAX_ITER_JUST_SEEKED;
        return 1;
    }

    /* Save key len, stack items and the node where we are currently
     * so that on iterator EOF we can restore the current key and state. */
    size_t orig_key_len = it->key_len;
    size_t orig_stack_items = it->stack.items;
    raxNode *orig_node = it->node;

    while (1) {
        int old_noup = noup;

        /* Already on head? Can't go up, iteration finished. */
        if (!noup && it->node == it->rt->head) {
            it->flags |= RAX_ITER_EOF;
            it->stack.items = orig_stack_items;
            it->key_len = orig_key_len;
            it->node = orig_node;
            return 1;
        }

        unsigned char prevchild = it->key[it->key_len - 1];
        if (!noup) {
            it->node = raxStackPop(&it->stack);
        } else {
            noup = 0;
        }

        /* Adjust the current key to represent the node we are
         * at. */
        int todel = it->node->iscompr ? it->node->size : 1;
        raxIteratorDelChars(it, todel);

        /* Try visiting the prev child if there is at least one
         * child. */
        if (!it->node->iscompr && it->node->size > (old_noup ? 0 : 1)) {
            raxNode **cp = raxNodeLastChildPtr(it->node);
            int i = it->node->size - 1;
            while (i >= 0) {
                debugf("SCAN PREV %c\n", it->node->data[i]);
                if (it->node->data[i] < prevchild) break;
                i--;
                cp--;
            }
            /* If we found a new subtree to explore in this node,
             * go deeper following all the last children in order to
             * find the key lexicographically greater. */
            if (i != -1) {
                debugf("SCAN found a new node\n");
                /* Enter the node we just found. */
                if (!raxIteratorAddChars(it, it->node->data + i, 1)) return 0;
                if (!raxStackPush(&it->stack, it->node)) return 0;
                memcpy(&it->node, cp, sizeof(it->node));
                /* Seek sub-tree max. */
                if (!raxSeekGreatest(it)) return 0;
            }
        }

        /* Return the key: this could be the key we found scanning a new
         * subtree, or if we did not find a new subtree to explore here,
         * before giving up with this node, check if it's a key itself. */
        if (it->node->iskey) {
            it->data = raxGetData(it->node);
            return 1;
        }
    }
}

/* Seek an iterator at the specified element.
 * Return 0 if the seek failed for syntax error or out of memory. Otherwise
 * 1 is returned. When 0 is returned for out of memory, errno is set to
 * the ENOMEM value. */
int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len) {
    int eq = 0, lt = 0, gt = 0, first = 0, last = 0;

    it->stack.items = 0; /* Just resetting. Initialized by raxStart(). */
    it->flags |= RAX_ITER_JUST_SEEKED;
    it->flags &= ~RAX_ITER_EOF;
    it->key_len = 0;
    it->node = NULL;

    /* Set flags according to the operator used to perform the seek. */
    if (op[0] == '>') {
        gt = 1;
        if (op[1] == '=') eq = 1;
    } else if (op[0] == '<') {
        lt = 1;
        if (op[1] == '=') eq = 1;
    } else if (op[0] == '=') {
        eq = 1;
    } else if (op[0] == '^') {
        first = 1;
    } else if (op[0] == '$') {
        last = 1;
    } else {
        errno = 0;
        return 0; /* Error. */
    }

    /* If there are no elements, set the EOF condition immediately and
     * return. */
    if (it->rt->numele == 0) {
        it->flags |= RAX_ITER_EOF;
        return 1;
    }

    if (first) {
        /* Seeking the first key greater or equal to the empty string
         * is equivalent to seeking the smaller key available. */
        return raxSeek(it, ">=", NULL, 0);
    }

    if (last) {
        /* Find the greatest key taking always the last child till a
         * final node is found. */
        it->node = it->rt->head;
        if (!raxSeekGreatest(it)) return 0;
        assert(it->node->iskey);
        it->data = raxGetData(it->node);
        return 1;
    }

    /* We need to seek the specified key. What we do here is to actually
     * perform a lookup, and later invoke the prev/next key code that
     * we already use for iteration. */
    int splitpos = 0;
    size_t i = raxLowWalk(it->rt, ele, len, &it->node, NULL, &splitpos, &it->stack);

    /* Return OOM on incomplete stack info. */
    if (it->stack.oom) return 0;

    if (eq && i == len && (!it->node->iscompr || splitpos == 0) && it->node->iskey) {
        /* We found our node, since the key matches and we have an
         * "equal" condition. */
        if (!raxIteratorAddChars(it, ele, len)) return 0; /* OOM. */
        it->data = raxGetData(it->node);
    } else if (lt || gt) {
        /* Exact key not found or eq flag not set. We have to set as current
         * key the one represented by the node we stopped at, and perform
         * a next/prev operation to seek. */
        raxIteratorAddChars(it, ele, i - splitpos);

        /* We need to set the iterator in the correct state to call next/prev
         * step in order to seek the desired element. */
        debugf("After initial seek: i=%d len=%d key=%.*s\n", (int)i, (int)len, (int)it->key_len, it->key);
        if (i != len && !it->node->iscompr) {
            /* If we stopped in the middle of a normal node because of a
             * mismatch, add the mismatching character to the current key
             * and call the iterator with the 'noup' flag so that it will try
             * to seek the next/prev child in the current node directly based
             * on the mismatching character. */
            if (!raxIteratorAddChars(it, ele + i, 1)) return 0;
            debugf("Seek normal node on mismatch: %.*s\n", (int)it->key_len, (char *)it->key);

            it->flags &= ~RAX_ITER_JUST_SEEKED;
            if (lt && !raxIteratorPrevStep(it, 1)) return 0;
            if (gt && !raxIteratorNextStep(it, 1)) return 0;
            it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */
        } else if (i != len && it->node->iscompr) {
            debugf("Compressed mismatch: %.*s\n", (int)it->key_len, (char *)it->key);
            /* In case of a mismatch within a compressed node. */
            int nodechar = it->node->data[splitpos];
            int keychar = ele[i];
            it->flags &= ~RAX_ITER_JUST_SEEKED;
            if (gt) {
                /* If the key the compressed node represents is greater
                 * than our seek element, continue forward, otherwise set the
                 * state in order to go back to the next sub-tree. */
                if (nodechar > keychar) {
                    if (!raxIteratorNextStep(it, 0)) return 0;
                } else {
                    if (!raxIteratorAddChars(it, it->node->data, it->node->size)) return 0;
                    if (!raxIteratorNextStep(it, 1)) return 0;
                }
            }
            if (lt) {
                /* If the key the compressed node represents is smaller
                 * than our seek element, seek the greater key in this
                 * subtree, otherwise set the state in order to go back to
                 * the previous sub-tree. */
                if (nodechar < keychar) {
                    if (!raxSeekGreatest(it)) return 0;
                    it->data = raxGetData(it->node);
                } else {
                    if (!raxIteratorAddChars(it, it->node->data, it->node->size)) return 0;
                    if (!raxIteratorPrevStep(it, 1)) return 0;
                }
            }
            it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */
        } else {
            debugf("No mismatch: %.*s\n", (int)it->key_len, (char *)it->key);
            /* If there was no mismatch we are into a node representing the
             * key, (but which is not a key or the seek operator does not
             * include 'eq'), or we stopped in the middle of a compressed node
             * after processing all the key. Continue iterating as this was
             * a legitimate key we stopped at. */
            it->flags &= ~RAX_ITER_JUST_SEEKED;
            if (it->node->iscompr && it->node->iskey && splitpos && lt) {
                /* If we stopped in the middle of a compressed node with
                 * perfect match, and the condition is to seek a key "<" than
                 * the specified one, then if this node is a key it already
                 * represents our match. For instance we may have nodes:
                 *
                 * "f" -> "oobar" = 1 -> "" = 2
                 *
                 * Representing keys "f" = 1, "foobar" = 2. A seek for
                 * the key < "foo" will stop in the middle of the "oobar"
                 * node, but will be our match, representing the key "f".
                 *
                 * So in that case, we don't seek backward. */
                it->data = raxGetData(it->node);
            } else {
                if (gt && !raxIteratorNextStep(it, 0)) return 0;
                if (lt && !raxIteratorPrevStep(it, 0)) return 0;
            }
            it->flags |= RAX_ITER_JUST_SEEKED; /* Ignore next call. */
        }
    } else {
        /* If we are here just eq was set but no match was found. */
        it->flags |= RAX_ITER_EOF;
        return 1;
    }
    return 1;
}

/* Go to the next element in the scope of the iterator 'it'.
 * If EOF (or out of memory) is reached, 0 is returned, otherwise 1 is
 * returned. In case 0 is returned because of OOM, errno is set to ENOMEM. */
int raxNext(raxIterator *it) {
    if (!raxIteratorNextStep(it, 0)) {
        errno = ENOMEM;
        return 0;
    }
    if (it->flags & RAX_ITER_EOF) {
        errno = 0;
        return 0;
    }
    return 1;
}

/* Go to the previous element in the scope of the iterator 'it'.
 * If EOF (or out of memory) is reached, 0 is returned, otherwise 1 is
 * returned. In case 0 is returned because of OOM, errno is set to ENOMEM. */
int raxPrev(raxIterator *it) {
    if (!raxIteratorPrevStep(it, 0)) {
        errno = ENOMEM;
        return 0;
    }
    if (it->flags & RAX_ITER_EOF) {
        errno = 0;
        return 0;
    }
    return 1;
}

/* Perform a random walk starting in the current position of the iterator.
 * Return 0 if the tree is empty or on out of memory. Otherwise 1 is returned
 * and the iterator is set to the node reached after doing a random walk
 * of 'steps' steps. If the 'steps' argument is 0, the random walk is performed
 * using a random number of steps between 1 and two times the logarithm of
 * the number of elements.
 *
 * NOTE: if you use this function to generate random elements from the radix
 * tree, expect a disappointing distribution. A random walk produces good
 * random elements if the tree is not sparse, however in the case of a radix
 * tree certain keys will be reported much more often than others. At least
 * this function should be able to explore every possible element eventually. */
int raxRandomWalk(raxIterator *it, size_t steps) {
    if (it->rt->numele == 0) {
        it->flags |= RAX_ITER_EOF;
        return 0;
    }

    if (steps == 0) {
        size_t fle = 1 + floor(log(it->rt->numele));
        fle *= 2;
        steps = 1 + rand() % fle;
    }

    raxNode *n = it->node;
    while (steps > 0 || !n->iskey) {
        int numchildren = n->iscompr ? 1 : n->size;
        int r = rand() % (numchildren + (n != it->rt->head));

        if (r == numchildren) {
            /* Go up to parent. */
            n = raxStackPop(&it->stack);
            int todel = n->iscompr ? n->size : 1;
            raxIteratorDelChars(it, todel);
        } else {
            /* Select a random child. */
            if (n->iscompr) {
                if (!raxIteratorAddChars(it, n->data, n->size)) return 0;
            } else {
                if (!raxIteratorAddChars(it, n->data + r, 1)) return 0;
            }
            raxNode **cp = raxNodeFirstChildPtr(n) + r;
            if (!raxStackPush(&it->stack, n)) return 0;
            memcpy(&n, cp, sizeof(n));
        }
        if (n->iskey) steps--;
    }
    it->node = n;
    it->data = raxGetData(it->node);
    return 1;
}

/* Compare the key currently pointed by the iterator to the specified
 * key according to the specified operator. Returns 1 if the comparison is
 * true, otherwise 0 is returned. */
int raxCompare(raxIterator *iter, const char *op, unsigned char *key, size_t key_len) {
    int eq = 0, lt = 0, gt = 0;

    if (op[0] == '=' || op[1] == '=') eq = 1;
    if (op[0] == '>')
        gt = 1;
    else if (op[0] == '<')
        lt = 1;
    else if (op[1] != '=')
        return 0; /* Syntax error. */

    size_t minlen = key_len < iter->key_len ? key_len : iter->key_len;
    int cmp = memcmp(iter->key, key, minlen);

    /* Handle == */
    if (lt == 0 && gt == 0) return cmp == 0 && key_len == iter->key_len;

    /* Handle >, >=, <, <= */
    if (cmp == 0) {
        /* Same prefix: longer wins. */
        if (eq && key_len == iter->key_len)
            return 1;
        else if (lt)
            return iter->key_len < key_len;
        else if (gt)
            return iter->key_len > key_len;
        else
            return 0; /* Avoid warning, just 'eq' is handled before. */
    } else if (cmp > 0) {
        return gt ? 1 : 0;
    } else /* (cmp < 0) */ {
        return lt ? 1 : 0;
    }
}

/* Free the iterator. */
void raxStop(raxIterator *it) {
    if (it->key != it->key_static_string) rax_free(it->key);
    raxStackFree(&it->stack);
}

/* Return if the iterator is in an EOF state. This happens when raxSeek()
 * failed to seek an appropriate element, so that raxNext() or raxPrev()
 * will return zero, or when an EOF condition was reached while iterating
 * with raxNext() and raxPrev(). */
int raxEOF(raxIterator *it) {
    return it->flags & RAX_ITER_EOF;
}

/* Return the number of elements inside the radix tree. */
uint64_t raxSize(rax *rax) {
    return rax->numele;
}

/* Return the rax tree allocation size in bytes */
size_t raxAllocSize(rax *rax) {
    return rax->alloc_size;
}

/* ----------------------------- Introspection ------------------------------ */

/* This function is mostly used for debugging and learning purposes.
 * It shows an ASCII representation of a tree on standard output, outline
 * all the nodes and the contained keys.
 *
 * The representation is as follow:
 *
 *  "foobar" (compressed node)
 *  [abc] (normal node with three children)
 *  [abc]=0x12345678 (node is a key, pointing to value 0x12345678)
 *  [] (a normal empty node)
 *
 *  Children are represented in new indented lines, each children prefixed by
 *  the "`-(x)" string, where "x" is the edge byte.
 *
 *  [abc]
 *   `-(a) "ladin"
 *   `-(b) [kj]
 *   `-(c) []
 *
 *  However when a node has a single child the following representation
 *  is used instead:
 *
 *  [abc] -> "ladin" -> []
 */

/* The actual implementation of raxShow(). */
void raxRecursiveShow(int level, int lpad, raxNode *n) {
    char s = n->iscompr ? '"' : '[';
    char e = n->iscompr ? '"' : ']';

    int numchars = printf("%c%.*s%c", s, n->size, n->data, e);
    if (n->iskey) {
        numchars += printf("=%p", raxGetData(n));
    }

    int numchildren = n->iscompr ? 1 : n->size;
    /* Note that 7 and 4 magic constants are the string length
     * of " `-(x) " and " -> " respectively. */
    if (level) {
        lpad += (numchildren > 1) ? 7 : 4;
        if (numchildren == 1) lpad += numchars;
    }
    raxNode **cp = raxNodeFirstChildPtr(n);
    for (int i = 0; i < numchildren; i++) {
        char *branch = " `-(%c) ";
        if (numchildren > 1) {
            printf("\n");
            for (int j = 0; j < lpad; j++) putchar(' ');
            printf(branch, n->data[i]);
        } else {
            printf(" -> ");
        }
        raxNode *child;
        memcpy(&child, cp, sizeof(child));
        raxRecursiveShow(level + 1, lpad, child);
        cp++;
    }
}

/* Show a tree, as outlined in the comment above. */
void raxShow(rax *rax) {
    raxRecursiveShow(0, 0, rax->head);
    putchar('\n');
}

/* Used by debugnode() macro to show info about a given node. */
void raxDebugShowNode(const char *msg, raxNode *n) {
    if (raxDebugMsg == 0) return;
    printf("%s: %p [%.*s] key:%u size:%u children:", msg, (void *)n, (int)n->size, (char *)n->data, n->iskey, n->size);
    int numcld = n->iscompr ? 1 : n->size;
    raxNode **cldptr = raxNodeLastChildPtr(n) - (numcld - 1);
    while (numcld--) {
        raxNode *child;
        memcpy(&child, cldptr, sizeof(child));
        cldptr++;
        printf("%p ", (void *)child);
    }
    printf("\n");
    fflush(stdout);
}

/* Touch all the nodes of a tree returning a check sum. This is useful
 * in order to make Valgrind detect if there is something wrong while
 * reading the data structure.
 *
 * This function was used in order to identify Rax bugs after a big refactoring
 * using this technique:
 *
 * 1. The rax-test is executed using Valgrind, adding a printf() so that for
 *    the fuzz tester we see what iteration in the loop we are in.
 * 2. After every modification of the radix tree made by the fuzz tester
 *    in rax-test.c, we add a call to raxTouch().
 * 3. Now as soon as an operation will corrupt the tree, raxTouch() will
 *    detect it (via Valgrind) immediately. We can add more calls to narrow
 *    the state.
 * 4. At this point a good idea is to enable Rax debugging messages immediately
 *    before the moment the tree is corrupted, to see what happens.
 */
unsigned long raxTouch(raxNode *n) {
    debugf("Touching %p\n", (void *)n);
    unsigned long sum = 0;
    if (n->iskey) {
        sum += (unsigned long)raxGetData(n);
    }

    int numchildren = n->iscompr ? 1 : n->size;
    raxNode **cp = raxNodeFirstChildPtr(n);
    int count = 0;
    for (int i = 0; i < numchildren; i++) {
        if (numchildren > 1) {
            sum += (long)n->data[i];
        }
        raxNode *child;
        memcpy(&child, cp, sizeof(child));
        if (child == (void *)0x65d1760) count++;
        if (count > 1) exit(1);
        sum += raxTouch(child);
        cp++;
    }
    return sum;
}

int checkedRaxRemove(rax *rax, unsigned char *s, size_t len, void **old) {
  int res = raxRemove(rax, s, len, old);
  if(res == 0) {
    // lp freed but node not removed!
    fprintf(stderr, "Error: corrupted listpack found.");
    abort();
  }
  return res;
}


================================================
FILE: src/redis/rax.h
================================================
/* Rax -- A radix tree implementation.
 *
 * Copyright (c) 2017-2018, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef RAX_H
#define RAX_H

#include <stdint.h>

/* Representation of a radix tree as implemented in this file, that contains
 * the strings "foo", "foobar" and "footer" after the insertion of each
 * word. When the node represents a key inside the radix tree, we write it
 * between [], otherwise it is written between ().
 *
 * This is the vanilla representation:
 *
 *              (f) ""
 *                \
 *                (o) "f"
 *                  \
 *                  (o) "fo"
 *                    \
 *                  [t   b] "foo"
 *                  /     \
 *         "foot" (e)     (a) "foob"
 *                /         \
 *      "foote" (r)         (r) "fooba"
 *              /             \
 *    "footer" []             [] "foobar"
 *
 * However, this implementation implements a very common optimization where
 * successive nodes having a single child are "compressed" into the node
 * itself as a string of characters, each representing a next-level child,
 * and only the link to the node representing the last character node is
 * provided inside the representation. So the above representation is turned
 * into:
 *
 *                  ["foo"] ""
 *                     |
 *                  [t   b] "foo"
 *                  /     \
 *        "foot" ("er")    ("ar") "foob"
 *                 /          \
 *       "footer" []          [] "foobar"
 *
 * However this optimization makes the implementation a bit more complex.
 * For instance if a key "first" is added in the above radix tree, a
 * "node splitting" operation is needed, since the "foo" prefix is no longer
 * composed of nodes having a single child one after the other. This is the
 * above tree and the resulting node splitting after this event happens:
 *
 *
 *                    (f) ""
 *                    /
 *                 (i o) "f"
 *                 /   \
 *    "firs"  ("rst")  (o) "fo"
 *              /        \
 *    "first" []       [t   b] "foo"
 *                     /     \
 *           "foot" ("er")    ("ar") "foob"
 *                    /          \
 *          "footer" []          [] "foobar"
 *
 * Similarly after deletion, if a new chain of nodes having a single child
 * is created (the chain must also not include nodes that represent keys),
 * it must be compressed back into a single node.
 *
 */

#define RAX_NODE_MAX_SIZE ((1 << 29) - 1)
typedef struct raxNode {
    uint32_t iskey : 1;   /* Does this node contain a key? */
    uint32_t isnull : 1;  /* Associated value is NULL (don't store it). */
    uint32_t iscompr : 1; /* Node is compressed. */
    uint32_t size : 29;   /* Number of children, or compressed string len. */
    /* Data layout is as follows:
     *
     * If node is not compressed we have 'size' bytes, one for each children
     * character, and 'size' raxNode pointers, point to each child node.
     * Note how the character is not stored in the children but in the
     * edge of the parents:
     *
     * [header iscompr=0][abc][a-ptr][b-ptr][c-ptr](value-ptr?)
     *
     * if node is compressed (iscompr bit is 1) the node has 1 children.
     * In that case the 'size' bytes of the string stored immediately at
     * the start of the data section, represent a sequence of successive
     * nodes linked one after the other, for which only the last one in
     * the sequence is actually represented as a node, and pointed to by
     * the current compressed node.
     *
     * [header iscompr=1][xyz][z-ptr](value-ptr?)
     *
     * Both compressed and not compressed nodes can represent a key
     * with associated data in the radix tree at any level (not just terminal
     * nodes).
     *
     * If the node has an associated key (iskey=1) and is not NULL
     * (isnull=0), then after the raxNode pointers pointing to the
     * children, an additional value pointer is present (as you can see
     * in the representation above as "value-ptr" field).
     */
    unsigned char data[];
} raxNode;

typedef struct rax {
    raxNode *head;     /* Pointer to root node of tree */
    uint64_t numele;   /* Number of keys in the tree */
    uint64_t numnodes; /* Number of rax nodes in the tree */
    size_t alloc_size; /* Total allocation size of the tree in bytes */
} rax;

/* Stack data structure used by raxLowWalk() in order to, optionally, return
 * a list of parent nodes to the caller. The nodes do not have a "parent"
 * field for space concerns, so we use the auxiliary stack when needed. */
#define RAX_STACK_STATIC_ITEMS 32
typedef struct raxStack {
    void **stack;           /* Points to static_items or an heap allocated array. */
    size_t items, maxitems; /* Number of items contained and total space. */
    /* Up to RAXSTACK_STACK_ITEMS items we avoid to allocate on the heap
     * and use this static array of pointers instead. */
    void *static_items[RAX_STACK_STATIC_ITEMS];
    int oom; /* True if pushing into this stack failed for OOM at some point. */
} raxStack;

/* Optional callback used for iterators and be notified on each rax node,
 * including nodes not representing keys. If the callback returns true
 * the callback changed the node pointer in the iterator structure, and the
 * iterator implementation will have to replace the pointer in the radix tree
 * internals. This allows the callback to reallocate the node to perform
 * very special operations, normally not needed by normal applications.
 *
 * This callback is used to perform very low level analysis of the radix tree
 * structure, scanning each possible node (but the root node), or in order to
 * reallocate the nodes to reduce the allocation fragmentation (this is the
 * server's application for this callback).
 *
 * This is currently only supported in forward iterations (raxNext) */
typedef int (*raxNodeCallback)(raxNode **noderef);

/* Radix tree iterator state is encapsulated into this data structure. */
#define RAX_ITER_STATIC_LEN 128
#define RAX_ITER_JUST_SEEKED (1 << 0) /* Iterator was just seeked. Return current \
                                         element for the first iteration and      \
                                         clear the flag. */
#define RAX_ITER_EOF (1 << 1)         /* End of iteration reached. */
#define RAX_ITER_SAFE (1 << 2)        /* Safe iterator, allows operations while \
                                         iterating. But it is slower. */
typedef struct raxIterator {
    int flags;
    rax *rt;            /* Radix tree we are iterating. */
    unsigned char *key; /* The current string. */
    void *data;         /* Data associated to this key. */
    size_t key_len;     /* Current key length. */
    size_t key_max;     /* Max key len the current key buffer can hold. */
    unsigned char key_static_string[RAX_ITER_STATIC_LEN];
    raxNode *node;           /* Current node. Only for unsafe iteration. */
    raxStack stack;          /* Stack used for unsafe iteration. */
    raxNodeCallback node_cb; /* Optional node callback. Normally set to NULL. */
} raxIterator;

/* Exported API. */
rax *raxNew(void);
int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old);
int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old);
int raxRemove(rax *rax, unsigned char *s, size_t len, void **old);
int raxFind(rax *rax, unsigned char *s, size_t len, void **value);
void raxFree(rax *rax);
void raxFreeWithCallback(rax *rax, void (*free_callback)(void*));
void raxFreeWithCallbackAndArgument(rax *rax, void (*free_callback)(void*, void*), void* argument);
void raxStart(raxIterator *it, rax *rt);
int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len);
int raxNext(raxIterator *it);
int raxPrev(raxIterator *it);
int raxRandomWalk(raxIterator *it, size_t steps);
int raxCompare(raxIterator *iter, const char *op, unsigned char *key, size_t key_len);
void raxStop(raxIterator *it);
int raxEOF(raxIterator *it);
void raxShow(rax *rax);
uint64_t raxSize(rax *rax);
size_t raxAllocSize(rax *rax);
unsigned long raxTouch(raxNode *n);
void raxSetDebugMsg(int onoff);

int checkedRaxRemove(rax *rax, unsigned char *s, size_t len, void **old);

/* Internal API. May be used by the node callback in order to access rax nodes
 * in a low level way, so this function is exported as well. */
void raxSetData(raxNode *n, void *data);

#endif


================================================
FILE: src/redis/rax_malloc.h
================================================
/* Rax -- A radix tree implementation.
 *
 * Copyright (c) 2017, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* Allocator selection.
 *
 * This file is used in order to change the Rax allocator at compile time.
 * Just define the following defines to what you want to use. Also add
 * the include of your alternate allocator if needed (not needed in order
 * to use the default libc allocator). */

#ifndef RAX_ALLOC_H
#define RAX_ALLOC_H
#include "zmalloc.h"
#define rax_malloc zmalloc
#define rax_realloc zrealloc
#define rax_free zfree
#define rax_ptr_alloc_size zmalloc_size
#endif


================================================
FILE: src/redis/rdb.h
================================================
/*
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __RDB_H
#define __RDB_H

#include <limits.h>
#include <stdio.h>
#include <time.h>

#include "redis_aux.h"

/* The current RDB version. When the format changes in a way that is no longer
 * backward compatible this number gets incremented. */
#define RDB_VERSION 12

/* We would like to serialize to version 9 such that our rdb files
 * can be loaded by redis version 6 (RDB_VERSION 9) */
#define RDB_SER_VERSION 9

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpreter the length:
 *
 * 00|XXXXXX => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|XXXXXX XXXXXXXX =>  01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => A full 32 bit len in net byte order will follow
 * 10|000001 [64 bit integer] => A full 64 bit len in net byte order will follow
 * 11|OBKIND this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and may
 * values, will fit inside. */
#define RDB_6BITLEN 0
#define RDB_14BITLEN 1
#define RDB_32BITLEN 0x80
#define RDB_64BITLEN 0x81
#define RDB_ENCVAL 3
#define RDB_LENERR UINT64_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define RDB_ENC_INT8 0        /* 8 bit signed integer */
#define RDB_ENC_INT16 1       /* 16 bit signed integer */
#define RDB_ENC_INT32 2       /* 32 bit signed integer */
#define RDB_ENC_LZF 3         /* string compressed with FASTLZ */

/* Map object types to RDB object types. Macros starting with OBJ_ are for
 * memory storage and may change. Instead RDB types must be fixed because
 * we store them on disk. */
#define RDB_TYPE_STRING 0
#define RDB_TYPE_LIST   1
#define RDB_TYPE_SET    2
#define RDB_TYPE_ZSET   3
#define RDB_TYPE_HASH   4
#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */
#define RDB_TYPE_MODULE 6
#define RDB_TYPE_MODULE_PRE_GA 6 /* Used in 4.0 release candidates */
#define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without
                               the generating module being loaded. */
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */

/* Object types for encoded objects. */
#define RDB_TYPE_HASH_ZIPMAP    9
#define RDB_TYPE_LIST_ZIPLIST  10
#define RDB_TYPE_SET_INTSET    11
#define RDB_TYPE_ZSET_ZIPLIST  12
#define RDB_TYPE_HASH_ZIPLIST  13
#define RDB_TYPE_LIST_QUICKLIST 14
#define RDB_TYPE_STREAM_LISTPACKS 15
#define RDB_TYPE_HASH_LISTPACK 16
#define RDB_TYPE_ZSET_LISTPACK 17
#define RDB_TYPE_LIST_QUICKLIST_2   18
#define RDB_TYPE_STREAM_LISTPACKS_2 19
#define RDB_TYPE_SET_LISTPACK  20
#define RDB_TYPE_STREAM_LISTPACKS_3 21
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */

/* Test if a type is an object type. */
#define __rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 21))

/* Range 200-240 is used by Dragonfly specific opcodes */

/* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */
#define RDB_OPCODE_SLOT_INFO  244   /* Individual slot info, such as slot id and size (cluster mode only). */
#define RDB_OPCODE_FUNCTION   246   /* engine data */
#define RDB_OPCODE_FUNCTION2  245   /* function library data */
#define RDB_OPCODE_FUNCTION_PRE_GA   246   /* old function library data for 7.0 rc1 and rc2 */
#define RDB_OPCODE_MODULE_AUX 247   /* Module auxiliary data. */
#define RDB_OPCODE_IDLE       248   /* LRU idle time. */
#define RDB_OPCODE_FREQ       249   /* LFU frequency. */
#define RDB_OPCODE_AUX        250   /* RDB aux field. */
#define RDB_OPCODE_RESIZEDB   251   /* Hash table resize hint. */
#define RDB_OPCODE_EXPIRETIME_MS 252    /* Expire time in milliseconds. */
#define RDB_OPCODE_EXPIRETIME 253       /* Old expire time in seconds. */
#define RDB_OPCODE_SELECTDB   254   /* DB number of the following keys. */
#define RDB_OPCODE_EOF        255   /* End of the RDB file. */

/* Module serialized values sub opcodes */
#define RDB_MODULE_OPCODE_EOF   0   /* End of module value. */
#define RDB_MODULE_OPCODE_SINT  1   /* Signed integer. */
#define RDB_MODULE_OPCODE_UINT  2   /* Unsigned integer. */
#define RDB_MODULE_OPCODE_FLOAT 3   /* Float. */
#define RDB_MODULE_OPCODE_DOUBLE 4  /* Double. */
#define RDB_MODULE_OPCODE_STRING 5  /* String. */

/* rdbLoad...() functions flags. */
#define RDB_LOAD_NONE   0
#define RDB_LOAD_ENC    (1<<0)
#define RDB_LOAD_PLAIN  (1<<1)
#define RDB_LOAD_SDS    (1<<2)

/* flags on the purpose of rdb save or load */
#define RDBFLAGS_NONE 0                 /* No special RDB loading. */
#define RDBFLAGS_AOF_PREAMBLE (1<<0)    /* Load/save the RDB as AOF preamble. */
#define RDBFLAGS_REPLICATION (1<<1)     /* Load/save for SYNC. */
#define RDBFLAGS_ALLOW_DUP (1<<2)       /* Allow duplicated keys when loading.*/
#define RDBFLAGS_FEED_REPL (1<<3)       /* Feed replication stream when loading.*/
#define RDBFLAGS_KEEP_CACHE (1<<4)      /* Don't reclaim cache after rdb file is generated */

/* When rdbLoadObject() returns NULL, the err flag is
 * set to hold the type of error that occurred */
#define RDB_LOAD_ERR_EMPTY_KEY  1   /* Error of empty key */
#define RDB_LOAD_ERR_OTHER      2   /* Any other errors */

// ROMAN: those constants should be factored out to redis_base.h or something.
// Currently moved here from server.h
#define LONG_STR_SIZE      21          /* Bytes needed for long -> str + '\0' */

#define REDIS_VERSION "6.2.11"

#endif


================================================
FILE: src/redis/read.c
================================================
/*
 * Copyright (c) 2009-2011, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2010-2011, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include <stdlib.h>

#include <unistd.h>
#include <strings.h>

#include <assert.h>
#include <errno.h>
#include <ctype.h>
#include <limits.h>
#include <math.h>

#include "sdsalloc.h"
#include "read.h"
#include "sds.h"


/* Initial size of our nested reply stack and how much we grow it when needd */
#define REDIS_READER_STACK_SIZE 9

static void __redisReaderSetError(redisReader *r, int type, const char *str) {
    size_t len;

    if (r->reply != NULL && r->fn && r->fn->freeObject) {
        r->fn->freeObject(r->reply);
        r->reply = NULL;
    }

    /* Clear input buffer on errors. */
    sdsfree(r->buf);
    r->buf = NULL;
    r->pos = r->len = 0;

    /* Reset task stack. */
    r->ridx = -1;

    /* Set error. */
    r->err = type;
    len = strlen(str);
    len = len < (sizeof(r->errstr)-1) ? len : (sizeof(r->errstr)-1);
    memcpy(r->errstr,str,len);
    r->errstr[len] = '\0';
}

static size_t chrtos(char *buf, size_t size, char byte) {
    size_t len = 0;

    switch(byte) {
    case '\\':
    case '"':
        len = snprintf(buf,size,"\"\\%c\"",byte);
        break;
    case '\n': len = snprintf(buf,size,"\"\\n\""); break;
    case '\r': len = snprintf(buf,size,"\"\\r\""); break;
    case '\t': len = snprintf(buf,size,"\"\\t\""); break;
    case '\a': len = snprintf(buf,size,"\"\\a\""); break;
    case '\b': len = snprintf(buf,size,"\"\\b\""); break;
    default:
        if (isprint(byte))
            len = snprintf(buf,size,"\"%c\"",byte);
        else
            len = snprintf(buf,size,"\"\\x%02x\"",(unsigned char)byte);
        break;
    }

    return len;
}

static void __redisReaderSetErrorProtocolByte(redisReader *r, char byte) {
    char cbuf[8], sbuf[128];

    chrtos(cbuf,sizeof(cbuf),byte);
    snprintf(sbuf,sizeof(sbuf),
        "Protocol error, got %s as reply type byte", cbuf);
    __redisReaderSetError(r,REDIS_ERR_PROTOCOL,sbuf);
}

static void __redisReaderSetErrorOOM(redisReader *r) {
    __redisReaderSetError(r,REDIS_ERR_OOM,"Out of memory");
}

static char *readBytes(redisReader *r, unsigned int bytes) {
    char *p;
    if (r->len-r->pos >= bytes) {
        p = r->buf+r->pos;
        r->pos += bytes;
        return p;
    }
    return NULL;
}

/* Find pointer to \r\n. */
static char *seekNewline(char *s, size_t len) {
    char *ret;

    /* We cannot match with fewer than 2 bytes */
    if (len < 2)
        return NULL;

    /* Search up to len - 1 characters */
    len--;

    /* Look for the \r */
    while ((ret = memchr(s, '\r', len)) != NULL) {
        if (ret[1] == '\n') {
            /* Found. */
            break;
        }
        /* Continue searching. */
        ret++;
        len -= ret - s;
        s = ret;
    }

    return ret;
}

/* Convert a string into a long long. Returns REDIS_OK if the string could be
 * parsed into a (non-overflowing) long long, REDIS_ERR otherwise. The value
 * will be set to the parsed value when appropriate.
 *
 * Note that this function demands that the string strictly represents
 * a long long: no spaces or other characters before or after the string
 * representing the number are accepted, nor zeroes at the start if not
 * for the string "0" representing the zero number.
 *
 * Because of its strictness, it is safe to use this function to check if
 * you can convert a string into a long long, and obtain back the string
 * from the number without any loss in the string representation. */
static int string2ll(const char *s, size_t slen, long long *value) {
    const char *p = s;
    size_t plen = 0;
    int negative = 0;
    unsigned long long v;

    if (plen == slen)
        return REDIS_ERR;

    /* Special case: first and only digit is 0. */
    if (slen == 1 && p[0] == '0') {
        if (value != NULL) *value = 0;
        return REDIS_OK;
    }

    if (p[0] == '-') {
        negative = 1;
        p++; plen++;

        /* Abort on only a negative sign. */
        if (plen == slen)
            return REDIS_ERR;
    }

    /* First digit should be 1-9, otherwise the string should just be 0. */
    if (p[0] >= '1' && p[0] <= '9') {
        v = p[0]-'0';
        p++; plen++;
    } else if (p[0] == '0' && slen == 1) {
        *value = 0;
        return REDIS_OK;
    } else {
        return REDIS_ERR;
    }

    while (plen < slen && p[0] >= '0' && p[0] <= '9') {
        if (v > (ULLONG_MAX / 10)) /* Overflow. */
            return REDIS_ERR;
        v *= 10;

        if (v > (ULLONG_MAX - (p[0]-'0'))) /* Overflow. */
            return REDIS_ERR;
        v += p[0]-'0';

        p++; plen++;
    }

    /* Return if not all bytes were used. */
    if (plen < slen)
        return REDIS_ERR;

    if (negative) {
        if (v > ((unsigned long long)(-(LLONG_MIN+1))+1)) /* Overflow. */
            return REDIS_ERR;
        if (value != NULL) *value = -v;
    } else {
        if (v > LLONG_MAX) /* Overflow. */
            return REDIS_ERR;
        if (value != NULL) *value = v;
    }
    return REDIS_OK;
}

static char *readLine(redisReader *r, int *_len) {
    char *p, *s;
    int len;

    p = r->buf+r->pos;
    s = seekNewline(p,(r->len-r->pos));
    if (s != NULL) {
        len = s-(r->buf+r->pos);
        r->pos += len+2; /* skip \r\n */
        if (_len) *_len = len;
        return p;
    }
    return NULL;
}

static void moveToNextTask(redisReader *r) {
    redisReadTask *cur, *prv;
    while (r->ridx >= 0) {
        /* Return a.s.a.p. when the stack is now empty. */
        if (r->ridx == 0) {
            r->ridx--;
            return;
        }

        cur = r->task[r->ridx];
        prv = r->task[r->ridx-1];
        assert(prv->type == REDIS_REPLY_ARRAY ||
               prv->type == REDIS_REPLY_MAP ||
               prv->type == REDIS_REPLY_ATTR ||
               prv->type == REDIS_REPLY_SET ||
               prv->type == REDIS_REPLY_PUSH);
        if (cur->idx == prv->elements-1) {
            r->ridx--;
        } else {
            /* Reset the type because the next item can be anything */
            assert(cur->idx < prv->elements);
            cur->type = -1;
            cur->elements = -1;
            cur->idx++;
            return;
        }
    }
}

static int processLineItem(redisReader *r) {
    redisReadTask *cur = r->task[r->ridx];
    void *obj;
    char *p;
    int len;

    if ((p = readLine(r,&len)) != NULL) {
        if (cur->type == REDIS_REPLY_INTEGER) {
            long long v;

            if (string2ll(p, len, &v) == REDIS_ERR) {
                __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                        "Bad integer value");
                return REDIS_ERR;
            }

            if (r->fn && r->fn->createInteger) {
                obj = r->fn->createInteger(cur,v);
            } else {
                obj = (void*)REDIS_REPLY_INTEGER;
            }
        } else if (cur->type == REDIS_REPLY_DOUBLE) {
            char buf[326], *eptr;
            double d;

            if ((size_t)len >= sizeof(buf)) {
                __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                        "Double value is too large");
                return REDIS_ERR;
            }

            memcpy(buf,p,len);
            buf[len] = '\0';

            if (len == 3 && strcasecmp(buf,"inf") == 0) {
                d = INFINITY; /* Positive infinite. */
            } else if (len == 4 && strcasecmp(buf,"-inf") == 0) {
                d = -INFINITY; /* Negative infinite. */
            } else if ((len == 3 && strcasecmp(buf,"nan") == 0) ||
                       (len == 4 && strcasecmp(buf, "-nan") == 0)) {
                d = NAN; /* nan. */
            } else {
                d = strtod((char*)buf,&eptr);
                /* RESP3 only allows "inf", "-inf", and finite values, while
                 * strtod() allows other variations on infinity,
                 * etc. We explicity handle our two allowed infinite cases and NaN
                 * above, so strtod() should only result in finite values. */
                if (buf[0] == '\0' || eptr != &buf[len] || !isfinite(d)) {
                    __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                            "Bad double value");
                    return REDIS_ERR;
                }
            }

            if (r->fn && r->fn->createDouble) {
                obj = r->fn->createDouble(cur,d,buf,len);
            } else {
                obj = (void*)REDIS_REPLY_DOUBLE;
            }
        } else if (cur->type == REDIS_REPLY_NIL) {
            if (len != 0) {
                __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                        "Bad nil value");
                return REDIS_ERR;
            }

            if (r->fn && r->fn->createNil)
                obj = r->fn->createNil(cur);
            else
                obj = (void*)REDIS_REPLY_NIL;
        } else if (cur->type == REDIS_REPLY_BOOL) {
            int bval;

            if (len != 1 || !strchr("tTfF", p[0])) {
                __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                        "Bad bool value");
                return REDIS_ERR;
            }

            bval = p[0] == 't' || p[0] == 'T';
            if (r->fn && r->fn->createBool)
                obj = r->fn->createBool(cur,bval);
            else
                obj = (void*)REDIS_REPLY_BOOL;
        } else if (cur->type == REDIS_REPLY_BIGNUM) {
            /* Ensure all characters are decimal digits (with possible leading
             * minus sign). */
            for (int i = 0; i < len; i++) {
                /* XXX Consider: Allow leading '+'? Error on leading '0's? */
                if (i == 0 && p[0] == '-') continue;
                if (p[i] < '0' || p[i] > '9') {
                    __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                            "Bad bignum value");
                    return REDIS_ERR;
                }
            }
            if (r->fn && r->fn->createString)
                obj = r->fn->createString(cur,p,len);
            else
                obj = (void*)REDIS_REPLY_BIGNUM;
        } else {
            /* Type will be error or status. */
            for (int i = 0; i < len; i++) {
                if (p[i] == '\r' || p[i] == '\n') {
                    __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                            "Bad simple string value");
                    return REDIS_ERR;
                }
            }
            if (r->fn && r->fn->createString)
                obj = r->fn->createString(cur,p,len);
            else
                obj = (void*)(uintptr_t)(cur->type);
        }

        if (obj == NULL) {
            __redisReaderSetErrorOOM(r);
            return REDIS_ERR;
        }

        /* Set reply if this is the root object. */
        if (r->ridx == 0) r->reply = obj;
        moveToNextTask(r);
        return REDIS_OK;
    }

    return REDIS_ERR;
}

static int processBulkItem(redisReader *r) {
    redisReadTask *cur = r->task[r->ridx];
    void *obj = NULL;
    char *p, *s;
    long long len;
    unsigned long bytelen;
    int success = 0;

    p = r->buf+r->pos;
    s = seekNewline(p,r->len-r->pos);
    if (s != NULL) {
        p = r->buf+r->pos;
        bytelen = s-(r->buf+r->pos)+2; /* include \r\n */

        if (string2ll(p, bytelen - 2, &len) == REDIS_ERR) {
            __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                    "Bad bulk string length");
            return REDIS_ERR;
        }

        if (len < -1 || (LLONG_MAX > SIZE_MAX && len > (long long)SIZE_MAX)) {
            __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                    "Bulk string length out of range");
            return REDIS_ERR;
        }

        if (len == -1) {
            /* The nil object can always be created. */
            if (r->fn && r->fn->createNil)
                obj = r->fn->createNil(cur);
            else
                obj = (void*)REDIS_REPLY_NIL;
            success = 1;
        } else {
            /* Only continue when the buffer contains the entire bulk item. */
            bytelen += len+2; /* include \r\n */
            if (r->pos+bytelen <= r->len) {
                if ((cur->type == REDIS_REPLY_VERB && len < 4) ||
                    (cur->type == REDIS_REPLY_VERB && s[5] != ':'))
                {
                    __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                            "Verbatim string 4 bytes of content type are "
                            "missing or incorrectly encoded.");
                    return REDIS_ERR;
                }
                if (r->fn && r->fn->createString)
                    obj = r->fn->createString(cur,s+2,len);
                else
                    obj = (void*)(uintptr_t)cur->type;
                success = 1;
            }
        }

        /* Proceed when obj was created. */
        if (success) {
            if (obj == NULL) {
                __redisReaderSetErrorOOM(r);
                return REDIS_ERR;
            }

            r->pos += bytelen;

            /* Set reply if this is the root object. */
            if (r->ridx == 0) r->reply = obj;
            moveToNextTask(r);
            return REDIS_OK;
        }
    }

    return REDIS_ERR;
}

static int redisReaderGrow(redisReader *r) {
    redisReadTask **aux;
    int newlen;

    /* Grow our stack size */
    newlen = r->tasks + REDIS_READER_STACK_SIZE;
    aux = s_realloc(r->task, sizeof(*r->task) * newlen);
    if (aux == NULL)
        goto oom;

    r->task = aux;

    /* Allocate new tasks */
    for (; r->tasks < newlen; r->tasks++) {
        r->task[r->tasks] = s_calloc(sizeof(**r->task));
        if (r->task[r->tasks] == NULL)
            goto oom;
    }

    return REDIS_OK;
oom:
    __redisReaderSetErrorOOM(r);
    return REDIS_ERR;
}

/* Process the array, map and set types. */
static int processAggregateItem(redisReader *r) {
    redisReadTask *cur = r->task[r->ridx];
    void *obj;
    char *p;
    long long elements;
    int root = 0, len;

    if (r->ridx == r->tasks - 1) {
        if (redisReaderGrow(r) == REDIS_ERR)
            return REDIS_ERR;
    }

    if ((p = readLine(r,&len)) != NULL) {
        if (string2ll(p, len, &elements) == REDIS_ERR) {
            __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                    "Bad multi-bulk length");
            return REDIS_ERR;
        }

        root = (r->ridx == 0);

        if (elements < -1 || (LLONG_MAX > SIZE_MAX && elements > SIZE_MAX) ||
            (r->maxelements > 0 && elements > r->maxelements))
        {
            __redisReaderSetError(r,REDIS_ERR_PROTOCOL,
                    "Multi-bulk length out of range");
            return REDIS_ERR;
        }

        if (elements == -1) {
            if (r->fn && r->fn->createNil)
                obj = r->fn->createNil(cur);
            else
                obj = (void*)REDIS_REPLY_NIL;

            if (obj == NULL) {
                __redisReaderSetErrorOOM(r);
                return REDIS_ERR;
            }

            moveToNextTask(r);
        } else {
            if (cur->type == REDIS_REPLY_MAP || cur->type == REDIS_REPLY_ATTR) elements *= 2;

            if (r->fn && r->fn->createArray)
                obj = r->fn->createArray(cur,elements);
            else
                obj = (void*)(uintptr_t)cur->type;

            if (obj == NULL) {
                __redisReaderSetErrorOOM(r);
                return REDIS_ERR;
            }

            /* Modify task stack when there are more than 0 elements. */
            if (elements > 0) {
                cur->elements = elements;
                cur->obj = obj;
                r->ridx++;
                r->task[r->ridx]->type = -1;
                r->task[r->ridx]->elements = -1;
                r->task[r->ridx]->idx = 0;
                r->task[r->ridx]->obj = NULL;
                r->task[r->ridx]->parent = cur;
                r->task[r->ridx]->privdata = r->privdata;
            } else {
                moveToNextTask(r);
            }
        }

        /* Set reply if this is the root object. */
        if (root) r->reply = obj;
        return REDIS_OK;
    }

    return REDIS_ERR;
}

static int processItem(redisReader *r) {
    redisReadTask *cur = r->task[r->ridx];
    char *p;

    /* check if we need to read type */
    if (cur->type < 0) {
        if ((p = readBytes(r,1)) != NULL) {
            switch (p[0]) {
            case '-':
                cur->type = REDIS_REPLY_ERROR;
                break;
            case '+':
                cur->type = REDIS_REPLY_STATUS;
                break;
            case ':':
                cur->type = REDIS_REPLY_INTEGER;
                break;
            case ',':
                cur->type = REDIS_REPLY_DOUBLE;
                break;
            case '_':
                cur->type = REDIS_REPLY_NIL;
                break;
            case '$':
                cur->type = REDIS_REPLY_STRING;
                break;
            case '*':
                cur->type = REDIS_REPLY_ARRAY;
                break;
            case '%':
                cur->type = REDIS_REPLY_MAP;
                break;
            case '|':
                cur->type = REDIS_REPLY_ATTR;
                break;
            case '~':
                cur->type = REDIS_REPLY_SET;
                break;
            case '#':
                cur->type = REDIS_REPLY_BOOL;
                break;
            case '=':
                cur->type = REDIS_REPLY_VERB;
                break;
            case '>':
                cur->type = REDIS_REPLY_PUSH;
                break;
            case '(':
                cur->type = REDIS_REPLY_BIGNUM;
                break;
            default:
                __redisReaderSetErrorProtocolByte(r,*p);
                return REDIS_ERR;
            }
        } else {
            /* could not consume 1 byte */
            return REDIS_ERR;
        }
    }

    /* process typed item */
    switch(cur->type) {
    case REDIS_REPLY_ERROR:
    case REDIS_REPLY_STATUS:
    case REDIS_REPLY_INTEGER:
    case REDIS_REPLY_DOUBLE:
    case REDIS_REPLY_NIL:
    case REDIS_REPLY_BOOL:
    case REDIS_REPLY_BIGNUM:
        return processLineItem(r);
    case REDIS_REPLY_STRING:
    case REDIS_REPLY_VERB:
        return processBulkItem(r);
    case REDIS_REPLY_ARRAY:
    case REDIS_REPLY_MAP:
    case REDIS_REPLY_ATTR:
    case REDIS_REPLY_SET:
    case REDIS_REPLY_PUSH:
        return processAggregateItem(r);
    default:
        assert(NULL);
        return REDIS_ERR; /* Avoid warning. */
    }
}

redisReader *redisReaderCreateWithFunctions(redisReplyObjectFunctions *fn) {
    redisReader *r;

    r = s_calloc(sizeof(redisReader));
    if (r == NULL)
        return NULL;

    r->buf = sdsempty();
    if (r->buf == NULL)
        goto oom;

    r->task = s_calloc(REDIS_READER_STACK_SIZE * sizeof(*r->task));
    if (r->task == NULL)
        goto oom;

    for (; r->tasks < REDIS_READER_STACK_SIZE; r->tasks++) {
        r->task[r->tasks] = s_calloc(sizeof(**r->task));
        if (r->task[r->tasks] == NULL)
            goto oom;
    }

    r->fn = fn;
    r->maxbuf = REDIS_READER_MAX_BUF;
    r->maxelements = REDIS_READER_MAX_ARRAY_ELEMENTS;
    r->ridx = -1;

    return r;
oom:
    redisReaderFree(r);
    return NULL;
}

void redisReaderFree(redisReader *r) {
    if (r == NULL)
        return;

    if (r->reply != NULL && r->fn && r->fn->freeObject)
        r->fn->freeObject(r->reply);

    if (r->task) {
        /* We know r->task[i] is allocated if i < r->tasks */
        for (int i = 0; i < r->tasks; i++) {
            s_free(r->task[i]);
        }

        s_free(r->task);
    }

    sdsfree(r->buf);
    s_free(r);
}

int redisReaderFeed(redisReader *r, const char *buf, size_t len) {
    sds newbuf;

    /* Return early when this reader is in an erroneous state. */
    if (r->err)
        return REDIS_ERR;

    /* Copy the provided buffer. */
    if (buf != NULL && len >= 1) {
        /* Destroy internal buffer when it is empty and is quite large. */
        if (r->len == 0 && r->maxbuf != 0 && sdsavail(r->buf) > r->maxbuf) {
            sdsfree(r->buf);
            r->buf = sdsempty();
            if (r->buf == 0) goto oom;

            r->pos = 0;
        }

        newbuf = sdscatlen(r->buf,buf,len);
        if (newbuf == NULL) goto oom;

        r->buf = newbuf;
        r->len = sdslen(r->buf);
    }

    return REDIS_OK;
oom:
    __redisReaderSetErrorOOM(r);
    return REDIS_ERR;
}

int redisReaderGetReply(redisReader *r, void **reply) {
    /* Default target pointer to NULL. */
    if (reply != NULL)
        *reply = NULL;

    /* Return early when this reader is in an erroneous state. */
    if (r->err)
        return REDIS_ERR;

    /* When the buffer is empty, there will never be a reply. */
    if (r->len == 0)
        return REDIS_OK;

    /* Set first item to process when the stack is empty. */
    if (r->ridx == -1) {
        r->task[0]->type = -1;
        r->task[0]->elements = -1;
        r->task[0]->idx = -1;
        r->task[0]->obj = NULL;
        r->task[0]->parent = NULL;
        r->task[0]->privdata = r->privdata;
        r->ridx = 0;
    }

    /* Process items in reply. */
    while (r->ridx >= 0)
        if (processItem(r) != REDIS_OK)
            break;

    /* Return ASAP when an error occurred. */
    if (r->err)
        return REDIS_ERR;

    /* Discard part of the buffer when we've consumed at least 1k, to avoid
     * doing unnecessary calls to memmove() in sds.c. */
    if (r->pos >= 1024) {
        if (sdsrange(r->buf,r->pos,-1) < 0) return REDIS_ERR;
        r->pos = 0;
        r->len = sdslen(r->buf);
    }

    /* Emit a reply when there is one. */
    if (r->ridx == -1) {
        if (reply != NULL) {
            *reply = r->reply;
        } else if (r->reply != NULL && r->fn && r->fn->freeObject) {
            r->fn->freeObject(r->reply);
        }
        r->reply = NULL;
    }
    return REDIS_OK;
}


================================================
FILE: src/redis/read.h
================================================
/*
 * Copyright (c) 2009-2011, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2010-2011, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#ifndef __HIREDIS_READ_H
#define __HIREDIS_READ_H
#include <stdio.h> /* for size_t */

#define REDIS_ERR -1
#define REDIS_OK 0

/* When an error occurs, the err flag in a context is set to hold the type of
 * error that occurred. REDIS_ERR_IO means there was an I/O error and you
 * should use the "errno" variable to find out what is wrong.
 * For other values, the "errstr" field will hold a description. */
#define REDIS_ERR_IO 1 /* Error in read or write */
#define REDIS_ERR_EOF 3 /* End of file */
#define REDIS_ERR_PROTOCOL 4 /* Protocol error */
#define REDIS_ERR_OOM 5 /* Out of memory */
#define REDIS_ERR_TIMEOUT 6 /* Timed out */
#define REDIS_ERR_OTHER 2 /* Everything else... */

#define REDIS_REPLY_STRING 1
#define REDIS_REPLY_ARRAY 2
#define REDIS_REPLY_INTEGER 3
#define REDIS_REPLY_NIL 4
#define REDIS_REPLY_STATUS 5
#define REDIS_REPLY_ERROR 6
#define REDIS_REPLY_DOUBLE 7
#define REDIS_REPLY_BOOL 8
#define REDIS_REPLY_MAP 9
#define REDIS_REPLY_SET 10
#define REDIS_REPLY_ATTR 11
#define REDIS_REPLY_PUSH 12
#define REDIS_REPLY_BIGNUM 13
#define REDIS_REPLY_VERB 14

/* Default max unused reader buffer. */
#define REDIS_READER_MAX_BUF (1024*16)

/* Default multi-bulk element limit */
#define REDIS_READER_MAX_ARRAY_ELEMENTS ((1LL<<32) - 1)

#ifdef __cplusplus
extern "C" {
#endif

typedef struct redisReadTask {
    int type;
    long long elements; /* number of elements in multibulk container */
    int idx; /* index in parent (array) object */
    void *obj; /* holds user-generated value for a read task */
    struct redisReadTask *parent; /* parent task */
    void *privdata; /* user-settable arbitrary field */
} redisReadTask;

typedef struct redisReplyObjectFunctions {
    void *(*createString)(const redisReadTask*, char*, size_t);
    void *(*createArray)(const redisReadTask*, size_t);
    void *(*createInteger)(const redisReadTask*, long long);
    void *(*createDouble)(const redisReadTask*, double, char*, size_t);
    void *(*createNil)(const redisReadTask*);
    void *(*createBool)(const redisReadTask*, int);
    void (*freeObject)(void*);
} redisReplyObjectFunctions;

typedef struct redisReader {
    int err; /* Error flags, 0 when there is no error */
    char errstr[128]; /* String representation of error when applicable */

    char *buf; /* Read buffer */
    size_t pos; /* Buffer cursor */
    size_t len; /* Buffer length */
    size_t maxbuf; /* Max length of unused buffer */
    long long maxelements; /* Max multi-bulk elements */

    redisReadTask **task;
    int tasks;

    int ridx; /* Index of current read task */
    void *reply; /* Temporary reply pointer */

    redisReplyObjectFunctions *fn;
    void *privdata;
} redisReader;

/* Public API for the protocol parser. */
redisReader *redisReaderCreateWithFunctions(redisReplyObjectFunctions *fn);
void redisReaderFree(redisReader *r);
int redisReaderFeed(redisReader *r, const char *buf, size_t len);
int redisReaderGetReply(redisReader *r, void **reply);

#define redisReaderSetPrivdata(_r, _p) (int)(((redisReader*)(_r))->privdata = (_p))
#define redisReaderGetObject(_r) (((redisReader*)(_r))->reply)
#define redisReaderGetError(_r) (((redisReader*)(_r))->errstr)

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/redis/redis_aux.c
================================================
#include "redis_aux.h"

#include <string.h>
#include <unistd.h>

#include "crc64.h"
#include "endianconv.h"
#include "zmalloc.h"

Server server;

void InitRedisTables() {
  crc64_init();
  memset(&server, 0, sizeof(server));

  server.max_map_field_len = 64;
  server.max_listpack_map_bytes = 1024;

  server.stream_node_max_entries = 100;
}

/* Toggle the 64 bit unsigned integer pointed by *p from little endian to
 * big endian */
void memrev64(void* p) {
  unsigned char *x = p, t;

  t = x[0];
  x[0] = x[7];
  x[7] = t;
  t = x[1];
  x[1] = x[6];
  x[6] = t;
  t = x[2];
  x[2] = x[5];
  x[5] = t;
  t = x[3];
  x[3] = x[4];
  x[4] = t;
}

// used by t_stream.c
uint64_t intrev64(uint64_t v) {
  memrev64(&v);
  return v;
}


================================================
FILE: src/redis/redis_aux.h
================================================
#ifndef __REDIS_AUX_H
#define __REDIS_AUX_H

#include "sds.h"

/* redis.h auxiliary definitions */
/* the last one in object.h is OBJ_STREAM and it is 6,
 * this will add enough place for Redis types to grow */
#define OBJ_JSON 15U
#define OBJ_SBF  16U
#define OBJ_CMS  17U
#define OBJ_TOPK 18U

// A pseudo type for keys stored in the db, same as OBJ_MODULE which is not used in Dragonfly.
#define OBJ_KEY  5U

/* How many types of objects exist */
#define OBJ_TYPE_MAX 19U

#define CONFIG_RUN_ID_SIZE 40U

typedef struct ServerStub {
  size_t max_map_field_len, max_listpack_map_bytes;

  long long stream_node_max_entries;
} Server;

extern Server server;

#define ZSET_MAX_LISTPACK_ENTRIES 128
#define ZSET_MAX_LISTPACK_VALUE 32

void InitRedisTables();

/* The actual Redis Object */
#define OBJ_STRING 0U    /* String object. */
#define OBJ_LIST 1U      /* List object. */
#define OBJ_SET 2U       /* Set object. */
#define OBJ_ZSET 3U      /* Sorted set object. */
#define OBJ_HASH 4U      /* Hash object. */
#define OBJ_MODULE 5U    /* Module object. */
#define OBJ_STREAM 6U    /* Stream object. */

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of this fields for this object. */
#define OBJ_ENCODING_RAW 0U     /* Raw representation */
#define OBJ_ENCODING_INT 1U     /* Encoded as integer */
#define OBJ_ENCODING_HT 2U      /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3U  /* Encoded as zipmap */
#define OBJ_ENCODING_LINKEDLIST 4U /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5U /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6U  /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7U  /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8U  /* Embedded sds string encoding */
// #define OBJ_ENCODING_QUICKLIST 9U /* Encoded as linked list of ziplists */
#define OBJ_ENCODING_STREAM 10U /* Encoded as a radix tree of listpacks */
#define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */
#define OBJ_ENCODING_COMPRESS_INTERNAL 15U  /* Kept as lzf compressed, to pass compressed blob to another thread */


#endif /* __REDIS_AUX_H */


================================================
FILE: src/redis/sds.c
================================================
/* SDSLib 2.0 -- A C dynamic strings library
 *
 * Copyright (c) 2006-2015, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2015, Oran Agra
 * Copyright (c) 2015, Redis Labs, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <limits.h>
#include "sds.h"
#include "sdsalloc.h"

const char *SDS_NOINIT = "SDS_NOINIT";

static inline int sdsHdrSize(char type) {
    switch(type&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            return sizeof(struct sdshdr5);
        case SDS_TYPE_8:
            return sizeof(struct sdshdr8);
        case SDS_TYPE_16:
            return sizeof(struct sdshdr16);
        case SDS_TYPE_32:
            return sizeof(struct sdshdr32);
        case SDS_TYPE_64:
            return sizeof(struct sdshdr64);
    }
    return 0;
}

static inline char sdsReqType(size_t string_size) {
    if (string_size < 1<<5)
        return SDS_TYPE_5;
    if (string_size < 1<<8)
        return SDS_TYPE_8;
    if (string_size < 1<<16)
        return SDS_TYPE_16;
#if (LONG_MAX == LLONG_MAX)
    if (string_size < 1ll<<32)
        return SDS_TYPE_32;
    return SDS_TYPE_64;
#else
    return SDS_TYPE_32;
#endif
}

static inline size_t sdsTypeMaxSize(char type) {
    if (type == SDS_TYPE_5)
        return (1<<5) - 1;
    if (type == SDS_TYPE_8)
        return (1<<8) - 1;
    if (type == SDS_TYPE_16)
        return (1<<16) - 1;
#if (LONG_MAX == LLONG_MAX)
    if (type == SDS_TYPE_32)
        return (1ll<<32) - 1;
#endif
    return -1; /* this is equivalent to the max SDS_TYPE_64 or SDS_TYPE_32 */
}

/* Create a new sds string with the content specified by the 'init' pointer
 * and 'initlen'.
 * If NULL is used for 'init' the string is initialized with zero bytes.
 * If SDS_NOINIT is used, the buffer is left uninitialized;
 *
 * The string is always null-terminated (all the sds strings are, always) so
 * even if you create an sds string with:
 *
 * mystring = sdsnewlen("abc",3);
 *
 * You can print the string with printf() as there is an implicit \0 at the
 * end of the string. However the string is binary safe and can contain
 * \0 characters in the middle, as the length is stored in the sds header. */
sds _sdsnewlen(const void *init, size_t initlen, int trymalloc) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */
    size_t usable;

    assert(initlen + hdrlen + 1 > initlen); /* Catch size_t overflow */
    sh = trymalloc?
        s_trymalloc_usable(hdrlen+initlen+1, &usable) :
        s_malloc_usable(hdrlen+initlen+1, &usable);
    if (sh == NULL) return NULL;
    if (init==SDS_NOINIT)
        init = NULL;
    else if (!init)
        memset(sh, 0, hdrlen+initlen+1);
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1;
    usable = usable-hdrlen-1;
    if (usable > sdsTypeMaxSize(type))
        usable = sdsTypeMaxSize(type);
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = usable;
            *fp = type;
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            sh->len = initlen;
            sh->alloc = usable;
            *fp = type;
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            sh->len = initlen;
            sh->alloc = usable;
            *fp = type;
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            sh->len = initlen;
            sh->alloc = usable;
            *fp = type;
            break;
        }
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}

sds sdsnewlen(const void *init, size_t initlen) {
    return _sdsnewlen(init, initlen, 0);
}

/* Create an empty (zero length) sds string. Even in this case the string
 * always has an implicit null term. */
sds sdsempty(void) {
    return sdsnewlen("",0);
}

/* Create a new sds string starting from a null terminated C string. */
sds sdsnew(const char *init) {
    size_t initlen = (init == NULL) ? 0 : strlen(init);
    return sdsnewlen(init, initlen);
}

/* Duplicate an sds string. */
sds sdsdup(const sds s) {
    return sdsnewlen(s, sdslen(s));
}

/* Free an sds string. No operation is performed if 's' is NULL. */
void sdsfree(sds s) {
    if (s == NULL) return;
    s_free((char*)s-sdsHdrSize(s[-1]));
}

/* Set the sds string length to the length as obtained with strlen(), so
 * considering as content only up to the first null term character.
 *
 * This function is useful when the sds string is hacked manually in some
 * way, like in the following example:
 *
 * s = sdsnew("foobar");
 * s[2] = '\0';
 * sdsupdatelen(s);
 * printf("%d\n", sdslen(s));
 *
 * The output will be "2", but if we comment out the call to sdsupdatelen()
 * the output will be "6" as the string was modified but the logical length
 * remains 6 bytes. */
void sdsupdatelen(sds s) {
    size_t reallen = strlen(s);
    sdssetlen(s, reallen);
}

/* Modify an sds string in-place to make it empty (zero length).
 * However all the existing buffer is not discarded but set as free space
 * so that next append operations will not require allocations up to the
 * number of bytes previously available. */
void sdsclear(sds s) {
    sdssetlen(s, 0);
    s[0] = '\0';
}

/* Enlarge the free space at the end of the sds string so that the caller
 * is sure that after calling this function can overwrite up to addlen
 * bytes after the end of the string, plus one more byte for nul term.
 * If there's already sufficient free space, this function returns without any
 * action, if there isn't sufficient free space, it'll allocate what's missing,
 * and possibly more:
 * When greedy is 1, enlarge more than needed, to avoid need for future reallocs
 * on incremental growth.
 * When greedy is 0, enlarge just enough so that there's free space for 'addlen'.
 *
 * Note: this does not change the *length* of the sds string as returned
 * by sdslen(), but only the free buffer space we have. */
sds _sdsMakeRoomFor(sds s, size_t addlen, int greedy) {
    void *sh, *newsh;
    size_t avail = sdsavail(s);
    size_t len, newlen, reqlen;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen;
    size_t usable;

    /* Return ASAP if there is enough space left. */
    if (avail >= addlen) return s;

    len = sdslen(s);
    sh = (char*)s-sdsHdrSize(oldtype);
    reqlen = newlen = (len+addlen);
    (void)reqlen;
    assert(newlen > len);   /* Catch size_t overflow */
    if (greedy == 1) {
    if (newlen < SDS_MAX_PREALLOC)
        newlen *= 2;
    else
        newlen += SDS_MAX_PREALLOC;
    }

    type = sdsReqType(newlen);

    /* Don't use type 5: the user is appending to the string and type 5 is
     * not able to remember empty space, so sdsMakeRoomFor() must be called
     * at every appending operation. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

    hdrlen = sdsHdrSize(type);
    assert(hdrlen + newlen + 1 > reqlen);  /* Catch size_t overflow */
    if (oldtype==type) {
        newsh = s_realloc_usable(sh, hdrlen+newlen+1, &usable);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+hdrlen;
    } else {
        /* Since the header size changes, need to move the string forward,
         * and can't use realloc */
        newsh = s_malloc_usable(hdrlen+newlen+1, &usable);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    usable = usable-hdrlen-1;
    if (usable > sdsTypeMaxSize(type))
        usable = sdsTypeMaxSize(type);
    sdssetalloc(s, usable);
    return s;
}

/* Enlarge the free space at the end of the sds string more than needed,
 * This is useful to avoid repeated re-allocations when repeatedly appending to the sds. */
sds sdsMakeRoomFor(sds s, size_t addlen) {
    return _sdsMakeRoomFor(s, addlen, 1);
}

/* Unlike sdsMakeRoomFor(), this one just grows to the necessary size. */
sds sdsMakeRoomForNonGreedy(sds s, size_t addlen) {
    return _sdsMakeRoomFor(s, addlen, 0);
}

/* Reallocate the sds string so that it has no free space at the end. The
 * contained string remains not altered, but next concatenation operations
 * will require a reallocation.
 *
 * After the call, the passed sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdsRemoveFreeSpace(sds s) {
    void *sh, *newsh;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen, oldhdrlen = sdsHdrSize(oldtype);
    size_t len = sdslen(s);
    size_t avail = sdsavail(s);
    sh = (char*)s-oldhdrlen;

    /* Return ASAP if there is no space left. */
    if (avail == 0) return s;

    /* Check what would be the minimum SDS header that is just good enough to
     * fit this string. */
    type = sdsReqType(len);
    hdrlen = sdsHdrSize(type);

    /* If the type is the same, or at least a large enough type is still
     * required, we just realloc(), letting the allocator to do the copy
     * only if really needed. Otherwise if the change is huge, we manually
     * reallocate the string to use the different header type. */
    if (oldtype==type || type > SDS_TYPE_8) {
        newsh = s_realloc(sh, oldhdrlen+len+1);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+oldhdrlen;
    } else {
        newsh = s_malloc(hdrlen+len+1);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    sdssetalloc(s, len);
    return s;
}

/* Resize the allocation, this can make the allocation bigger or smaller,
 * if the size is smaller than currently used len, the data will be truncated */
sds sdsResize(sds s, size_t size) {
    void *sh, *newsh;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen, oldhdrlen = sdsHdrSize(oldtype);
    size_t len = sdslen(s);
    sh = (char*)s-oldhdrlen;

    /* Return ASAP if the size is already good. */
    if (sdsalloc(s) == size) return s;

    /* Truncate len if needed. */
    if (size < len) len = size;

    /* Check what would be the minimum SDS header that is just good enough to
     * fit this string. */
    type = sdsReqType(size);
    /* Don't use type 5, it is not good for strings that are resized. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;
    hdrlen = sdsHdrSize(type);

    /* If the type is the same, or can hold the size in it with low overhead
     * (larger than SDS_TYPE_8), we just realloc(), letting the allocator
     * to do the copy only if really needed. Otherwise if the change is
     * huge, we manually reallocate the string to use the different header
     * type. */
    if (oldtype==type || (type < oldtype && type > SDS_TYPE_8)) {
        newsh = s_realloc(sh, oldhdrlen+size+1);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+oldhdrlen;
    } else {
        newsh = s_malloc(hdrlen+size+1);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
    }
    s[len] = 0;
    sdssetlen(s, len);
    sdssetalloc(s, size);
    return s;
}

/* Return the total size of the allocation of the specified sds string,
 * including:
 * 1) The sds header before the pointer.
 * 2) The string.
 * 3) The free buffer at the end if any.
 * 4) The implicit null term.
 */
size_t sdsAllocSize(sds s) {
    size_t alloc = sdsalloc(s);
    return sdsHdrSize(s[-1])+alloc+1;
}

/* Return the pointer of the actual SDS allocation (normally SDS strings
 * are referenced by the start of the string buffer). */
void *sdsAllocPtr(sds s) {
    return (void*) (s-sdsHdrSize(s[-1]));
}

/* Increment the sds length and decrements the left free space at the
 * end of the string according to 'incr'. Also set the null term
 * in the new end of the string.
 *
 * This function is used in order to fix the string length after the
 * user calls sdsMakeRoomFor(), writes something after the end of
 * the current string, and finally needs to set the new length.
 *
 * Note: it is possible to use a negative increment in order to
 * right-trim the string.
 *
 * Usage example:
 *
 * Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the
 * following schema, to cat bytes coming from the kernel to the end of an
 * sds string without copying into an intermediate buffer:
 *
 * oldlen = sdslen(s);
 * s = sdsMakeRoomFor(s, BUFFER_SIZE);
 * nread = read(fd, s+oldlen, BUFFER_SIZE);
 * ... check for nread <= 0 and handle it ...
 * sdsIncrLen(s, nread);
 */
void sdsIncrLen(sds s, ssize_t incr) {
    unsigned char flags = s[-1];
    size_t len;
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5: {
            unsigned char *fp = ((unsigned char*)s)-1;
            unsigned char oldlen = SDS_TYPE_5_LEN(flags);
            assert((incr > 0 && oldlen+incr < 32) || (incr < 0 && oldlen >= (unsigned int)(-incr)));
            *fp = SDS_TYPE_5 | ((oldlen+incr) << SDS_TYPE_BITS);
            len = oldlen+incr;
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            assert((incr >= 0 && sh->alloc-sh->len >= incr) || (incr < 0 && sh->len >= (unsigned int)(-incr)));
            len = (sh->len += incr);
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            assert((incr >= 0 && sh->alloc-sh->len >= incr) || (incr < 0 && sh->len >= (unsigned int)(-incr)));
            len = (sh->len += incr);
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            assert((incr >= 0 && sh->alloc-sh->len >= (unsigned int)incr) || (incr < 0 && sh->len >= (unsigned int)(-incr)));
            len = (sh->len += incr);
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            assert((incr >= 0 && sh->alloc-sh->len >= (uint64_t)incr) || (incr < 0 && sh->len >= (uint64_t)(-incr)));
            len = (sh->len += incr);
            break;
        }
        default: len = 0; /* Just to avoid compilation warnings. */
    }
    s[len] = '\0';
}

/* Grow the sds to have the specified length. Bytes that were not part of
 * the original length of the sds will be set to zero.
 *
 * if the specified length is smaller than the current length, no operation
 * is performed. */
sds sdsgrowzero(sds s, size_t len) {
    size_t curlen = sdslen(s);

    if (len <= curlen) return s;
    s = sdsMakeRoomFor(s,len-curlen);
    if (s == NULL) return NULL;

    /* Make sure added region doesn't contain garbage */
    memset(s+curlen,0,(len-curlen+1)); /* also set trailing \0 byte */
    sdssetlen(s, len);
    return s;
}

/* Append the specified binary-safe string pointed by 't' of 'len' bytes to the
 * end of the specified sds string 's'.
 *
 * After the call, the passed sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscatlen(sds s, const void *t, size_t len) {
    size_t curlen = sdslen(s);

    s = sdsMakeRoomFor(s,len);
    if (s == NULL) return NULL;
    memcpy(s+curlen, t, len);
    sdssetlen(s, curlen+len);
    s[curlen+len] = '\0';
    return s;
}

/* Append the specified null terminated C string to the sds string 's'.
 *
 * After the call, the passed sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscat(sds s, const char *t) {
    return sdscatlen(s, t, strlen(t));
}

/* Append the specified sds 't' to the existing sds 's'.
 *
 * After the call, the modified sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscatsds(sds s, const sds t) {
    return sdscatlen(s, t, sdslen(t));
}

/* Destructively modify the sds string 's' to hold the specified binary
 * safe string pointed by 't' of length 'len' bytes. */
sds sdscpylen(sds s, const char *t, size_t len) {
    if (sdsalloc(s) < len) {
        s = sdsMakeRoomFor(s,len-sdslen(s));
        if (s == NULL) return NULL;
    }
    memcpy(s, t, len);
    s[len] = '\0';
    sdssetlen(s, len);
    return s;
}

/* Like sdscpylen() but 't' must be a null-terminated string so that the length
 * of the string is obtained with strlen(). */
sds sdscpy(sds s, const char *t) {
    return sdscpylen(s, t, strlen(t));
}

/* Helper for sdscatlonglong() doing the actual number -> string
 * conversion. 's' must point to a string with room for at least
 * SDS_LLSTR_SIZE bytes.
 *
 * The function returns the length of the null-terminated string
 * representation stored at 's'. */
#define SDS_LLSTR_SIZE 21
int sdsll2str(char *s, long long value) {
    char *p, aux;
    unsigned long long v;
    size_t l;

    /* Generate the string representation, this method produces
     * a reversed string. */
    if (value < 0) {
        /* Since v is unsigned, if value==LLONG_MIN, -LLONG_MIN will overflow. */
        if (value != LLONG_MIN) {
            v = -value;
        } else {
            v = ((unsigned long long)LLONG_MAX) + 1;
        }
    } else {
        v = value;
    }

    p = s;
    do {
        *p++ = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p++ = '-';

    /* Compute length and add null term. */
    l = p-s;
    *p = '\0';

    /* Reverse the string. */
    p--;
    while(s < p) {
        aux = *s;
        *s = *p;
        *p = aux;
        s++;
        p--;
    }
    return l;
}

/* Identical sdsll2str(), but for unsigned long long type. */
int sdsull2str(char *s, unsigned long long v) {
    char *p, aux;
    size_t l;

    /* Generate the string representation, this method produces
     * a reversed string. */
    p = s;
    do {
        *p++ = '0'+(v%10);
        v /= 10;
    } while(v);

    /* Compute length and add null term. */
    l = p-s;
    *p = '\0';

    /* Reverse the string. */
    p--;
    while(s < p) {
        aux = *s;
        *s = *p;
        *p = aux;
        s++;
        p--;
    }
    return l;
}

/* Create an sds string from a long long value. It is much faster than:
 *
 * sdscatprintf(sdsempty(),"%lld\n", value);
 */
sds sdsfromlonglong(long long value) {
    char buf[SDS_LLSTR_SIZE + 10];
    int len = sdsll2str(buf,value);

    return sdsnewlen(buf,len);
}

/* Like sdscatprintf() but gets va_list instead of being variadic. */
sds sdscatvprintf(sds s, const char *fmt, va_list ap) {
    va_list cpy;
    char staticbuf[1024], *buf = staticbuf, *t;
    size_t buflen = strlen(fmt)*2;
    int bufstrlen;

    /* We try to start using a static buffer for speed.
     * If not possible we revert to heap allocation. */
    if (buflen > sizeof(staticbuf)) {
        buf = s_malloc(buflen);
        if (buf == NULL) return NULL;
    } else {
        buflen = sizeof(staticbuf);
    }

    /* Alloc enough space for buffer and \0 after failing to
     * fit the string in the current buffer size. */
    while(1) {
        va_copy(cpy,ap);
        bufstrlen = vsnprintf(buf, buflen, fmt, cpy);
        va_end(cpy);
        if (bufstrlen < 0) {
            if (buf != staticbuf) s_free(buf);
            return NULL;
        }
        if (((size_t)bufstrlen) >= buflen) {
            if (buf != staticbuf) s_free(buf);
            buflen = ((size_t)bufstrlen) + 1;
            buf = s_malloc(buflen);
            if (buf == NULL) return NULL;
            continue;
        }
        break;
    }

    /* Finally concat the obtained string to the SDS string and return it. */
    t = sdscatlen(s, buf, bufstrlen);
    if (buf != staticbuf) s_free(buf);
    return t;
}

/* Append to the sds string 's' a string obtained using printf-alike format
 * specifier.
 *
 * After the call, the modified sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call.
 *
 * Example:
 *
 * s = sdsnew("Sum is: ");
 * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b).
 *
 * Often you need to create a string from scratch with the printf-alike
 * format. When this is the need, just use sdsempty() as the target string:
 *
 * s = sdscatprintf(sdsempty(), "... your format ...", args);
 */
sds sdscatprintf(sds s, const char *fmt, ...) {
    va_list ap;
    char *t;
    va_start(ap, fmt);
    t = sdscatvprintf(s,fmt,ap);
    va_end(ap);
    return t;
}

/* This function is similar to sdscatprintf, but much faster as it does
 * not rely on sprintf() family functions implemented by the libc that
 * are often very slow. Moreover directly handling the sds string as
 * new data is concatenated provides a performance improvement.
 *
 * However this function only handles an incompatible subset of printf-alike
 * format specifiers:
 *
 * %s - C String
 * %S - SDS string
 * %i - signed int
 * %I - 64 bit signed integer (long long, int64_t)
 * %u - unsigned int
 * %U - 64 bit unsigned integer (unsigned long long, uint64_t)
 * %% - Verbatim "%" character.
 */
sds sdscatfmt(sds s, char const *fmt, ...) {
    size_t initlen = sdslen(s);
    const char *f = fmt;
    long i;
    va_list ap;

    /* To avoid continuous reallocations, let's start with a buffer that
     * can hold at least two times the format string itself. It's not the
     * best heuristic but seems to work in practice. */
    s = sdsMakeRoomFor(s, strlen(fmt)*2);
    va_start(ap,fmt);
    f = fmt;    /* Next format specifier byte to process. */
    i = initlen; /* Position of the next byte to write to dest str. */
    while(*f) {
        char next, *str;
        size_t l;
        long long num;
        unsigned long long unum;

        /* Make sure there is always space for at least 1 char. */
        if (sdsavail(s)==0) {
            s = sdsMakeRoomFor(s,1);
        }

        switch(*f) {
        case '%':
            next = *(f+1);
            if (next == '\0') break;
            f++;
            switch(next) {
            case 's':
            case 'S':
                str = va_arg(ap,char*);
                l = (next == 's') ? strlen(str) : sdslen(str);
                if (sdsavail(s) < l) {
                    s = sdsMakeRoomFor(s,l);
                }
                memcpy(s+i,str,l);
                sdsinclen(s,l);
                i += l;
                break;
            case 'i':
            case 'I':
                if (next == 'i')
                    num = va_arg(ap,int);
                else
                    num = va_arg(ap,long long);
                {
                    char buf[SDS_LLSTR_SIZE];
                    l = sdsll2str(buf,num);
                    if (sdsavail(s) < l) {
                        s = sdsMakeRoomFor(s,l);
                    }
                    memcpy(s+i,buf,l);
                    sdsinclen(s,l);
                    i += l;
                }
                break;
            case 'u':
            case 'U':
                if (next == 'u')
                    unum = va_arg(ap,unsigned int);
                else
                    unum = va_arg(ap,unsigned long long);
                {
                    char buf[SDS_LLSTR_SIZE];
                    l = sdsull2str(buf,unum);
                    if (sdsavail(s) < l) {
                        s = sdsMakeRoomFor(s,l);
                    }
                    memcpy(s+i,buf,l);
                    sdsinclen(s,l);
                    i += l;
                }
                break;
            default: /* Handle %% and generally %<unknown>. */
                s[i++] = next;
                sdsinclen(s,1);
                break;
            }
            break;
        default:
            s[i++] = *f;
            sdsinclen(s,1);
            break;
        }
        f++;
    }
    va_end(ap);

    /* Add null-term */
    s[i] = '\0';
    return s;
}

/* Remove the part of the string from left and from right composed just of
 * contiguous characters found in 'cset', that is a null terminated C string.
 *
 * After the call, the modified sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call.
 *
 * Example:
 *
 * s = sdsnew("AA...AA.a.aa.aHelloWorld     :::");
 * s = sdstrim(s,"Aa. :");
 * printf("%s\n", s);
 *
 * Output will be just "HelloWorld".
 */
sds sdstrim(sds s, const char *cset) {
    char *end, *sp, *ep;
    size_t len;

    sp = s;
    ep = end = s+sdslen(s)-1;
    while(sp <= end && strchr(cset, *sp)) sp++;
    while(ep > sp && strchr(cset, *ep)) ep--;
    len = (ep-sp)+1;
    if (s != sp) memmove(s, sp, len);
    s[len] = '\0';
    sdssetlen(s,len);
    return s;
}

/* Changes the input string to be a subset of the original.
 * It does not release the free space in the string, so a call to
 * sdsRemoveFreeSpace may be wise after. */
void sdssubstr(sds s, size_t start, size_t len) {
    /* Clamp out of range input */
    size_t oldlen = sdslen(s);
    if (start >= oldlen) start = len = 0;
    if (len > oldlen-start) len = oldlen-start;

    /* Move the data */
    if (len) memmove(s, s+start, len);
    s[len] = 0;
    sdssetlen(s,len);
}

/* Turn the string into a smaller (or equal) string containing only the
 * substring specified by the 'start' and 'end' indexes.
 *
 * start and end can be negative, where -1 means the last character of the
 * string, -2 the penultimate character, and so forth.
 *
 * The interval is inclusive, so the start and end characters will be part
 * of the resulting string.
 *
 * The string is modified in-place.
 *
 * Return value:
 * -1 (error) if sdslen(s) is larger than maximum positive ssize_t value.
 *  0 on success.
 *
 * Example:
 *
 * s = sdsnew("Hello World");
 * sdsrange(s,1,-1); => "ello World"
 */
int sdsrange(sds s, ssize_t start, ssize_t end) {
    size_t newlen, len = sdslen(s);
    if (len > SSIZE_MAX) return -1;

    if (len == 0) return 0;
    if (start < 0) {
        start = len+start;
        if (start < 0) start = 0;
    }
    if (end < 0) {
        end = len+end;
        if (end < 0) end = 0;
    }
    newlen = (start > end) ? 0 : (end-start)+1;
    if (newlen != 0) {
        if (start >= (ssize_t)len) {
            newlen = 0;
        } else if (end >= (ssize_t)len) {
            end = len-1;
            newlen = (start > end) ? 0 : (end-start)+1;
        }
    } else {
        start = 0;
    }
    if (start && newlen) memmove(s, s+start, newlen);
    s[newlen] = 0;
    sdssetlen(s,newlen);
    return 0;
}

/* Apply tolower() to every character of the sds string 's'. */
void sdstolower(sds s) {
    size_t len = sdslen(s), j;

    for (j = 0; j < len; j++) s[j] = tolower(s[j]);
}

/* Apply toupper() to every character of the sds string 's'. */
void sdstoupper(sds s) {
    size_t len = sdslen(s), j;

    for (j = 0; j < len; j++) s[j] = toupper(s[j]);
}

/* Compare two sds strings s1 and s2 with memcmp().
 *
 * Return value:
 *
 *     positive if s1 > s2.
 *     negative if s1 < s2.
 *     0 if s1 and s2 are exactly the same binary string.
 *
 * If two strings share exactly the same prefix, but one of the two has
 * additional characters, the longer string is considered to be greater than
 * the smaller one. */
int sdscmp(const sds s1, const sds s2) {
    size_t l1, l2, minlen;
    int cmp;

    l1 = sdslen(s1);
    l2 = sdslen(s2);
    minlen = (l1 < l2) ? l1 : l2;
    cmp = memcmp(s1,s2,minlen);
    if (cmp == 0) return l1>l2? 1: (l1<l2? -1: 0);
    return cmp;
}

/* Split 's' with separator in 'sep'. An array
 * of sds strings is returned. *count will be set
 * by reference to the number of tokens returned.
 *
 * On out of memory, zero length string, zero length
 * separator, NULL is returned.
 *
 * Note that 'sep' is able to split a string using
 * a multi-character separator. For example
 * sdssplit("foo_-_bar","_-_"); will return two
 * elements "foo" and "bar".
 *
 * This version of the function is binary-safe but
 * requires length arguments. sdssplit() is just the
 * same function but for zero-terminated strings.
 */
sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count) {
    int elements = 0, slots = 5;
    long start = 0, j;
    sds *tokens;

    if (seplen < 1 || len <= 0) {
        *count = 0;
        return NULL;
    }
    tokens = s_malloc(sizeof(sds)*slots);
    if (tokens == NULL) return NULL;

    for (j = 0; j < (len-(seplen-1)); j++) {
        /* make sure there is room for the next element and the final one */
        if (slots < elements+2) {
            sds *newtokens;

            slots *= 2;
            newtokens = s_realloc(tokens,sizeof(sds)*slots);
            if (newtokens == NULL) goto cleanup;
            tokens = newtokens;
        }
        /* search the separator */
        if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) {
            tokens[elements] = sdsnewlen(s+start,j-start);
            if (tokens[elements] == NULL) goto cleanup;
            elements++;
            start = j+seplen;
            j = j+seplen-1; /* skip the separator */
        }
    }
    /* Add the final element. We are sure there is room in the tokens array. */
    tokens[elements] = sdsnewlen(s+start,len-start);
    if (tokens[elements] == NULL) goto cleanup;
    elements++;
    *count = elements;
    return tokens;

cleanup:
    {
        int i;
        for (i = 0; i < elements; i++) sdsfree(tokens[i]);
        s_free(tokens);
        *count = 0;
        return NULL;
    }
}

/* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */
void sdsfreesplitres(sds *tokens, int count) {
    if (!tokens) return;
    while(count--)
        sdsfree(tokens[count]);
    s_free(tokens);
}

/* Append to the sds string "s" an escaped string representation where
 * all the non-printable characters (tested with isprint()) are turned into
 * escapes in the form "\n\r\a...." or "\x<hex-number>".
 *
 * After the call, the modified sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscatrepr(sds s, const char *p, size_t len) {
    s = sdscatlen(s,"\"",1);
    while(len--) {
        switch(*p) {
        case '\\':
        case '"':
            s = sdscatprintf(s,"\\%c",*p);
            break;
        case '\n': s = sdscatlen(s,"\\n",2); break;
        case '\r': s = sdscatlen(s,"\\r",2); break;
        case '\t': s = sdscatlen(s,"\\t",2); break;
        case '\a': s = sdscatlen(s,"\\a",2); break;
        case '\b': s = sdscatlen(s,"\\b",2); break;
        default:
            if (isprint(*p))
                s = sdscatprintf(s,"%c",*p);
            else
                s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
            break;
        }
        p++;
    }
    return sdscatlen(s,"\"",1);
}

/* Helper function for sdssplitargs() that returns non zero if 'c'
 * is a valid hex digit. */
int is_hex_digit(char c) {
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
}

/* Helper function for sdssplitargs() that converts a hex digit into an
 * integer from 0 to 15 */
int hex_digit_to_int(char c) {
    switch(c) {
    case '0': return 0;
    case '1': return 1;
    case '2': return 2;
    case '3': return 3;
    case '4': return 4;
    case '5': return 5;
    case '6': return 6;
    case '7': return 7;
    case '8': return 8;
    case '9': return 9;
    case 'a': case 'A': return 10;
    case 'b': case 'B': return 11;
    case 'c': case 'C': return 12;
    case 'd': case 'D': return 13;
    case 'e': case 'E': return 14;
    case 'f': case 'F': return 15;
    default: return 0;
    }
}

/* Split a line into arguments, where every argument can be in the
 * following programming-language REPL-alike form:
 *
 * foo bar "newline are supported\n" and "\xff\x00otherstuff"
 *
 * The number of arguments is stored into *argc, and an array
 * of sds is returned.
 *
 * The caller should free the resulting array of sds strings with
 * sdsfreesplitres().
 *
 * Note that sdscatrepr() is able to convert back a string into
 * a quoted string in the same format sdssplitargs() is able to parse.
 *
 * The function returns the allocated tokens on success, even when the
 * input string is empty, or NULL if the input contains unbalanced
 * quotes or closed quotes followed by non space characters
 * as in: "foo"bar or "foo'
 */
sds *sdssplitargs(const char *line, int *argc) {
    const char *p = line;
    char *current = NULL;
    char **vector = NULL;

    *argc = 0;
    while(1) {
        /* skip blanks */
        while(*p && isspace(*p)) p++;
        if (*p) {
            /* get a token */
            int inq=0;  /* set to 1 if we are in "quotes" */
            int insq=0; /* set to 1 if we are in 'single quotes' */
            int done=0;

            if (current == NULL) current = sdsempty();
            while(!done) {
                if (inq) {
                    if (*p == '\\' && *(p+1) == 'x' &&
                                             is_hex_digit(*(p+2)) &&
                                             is_hex_digit(*(p+3)))
                    {
                        unsigned char byte;

                        byte = (hex_digit_to_int(*(p+2))*16)+
                                hex_digit_to_int(*(p+3));
                        current = sdscatlen(current,(char*)&byte,1);
                        p += 3;
                    } else if (*p == '\\' && *(p+1)) {
                        char c;

                        p++;
                        switch(*p) {
                        case 'n': c = '\n'; break;
                        case 'r': c = '\r'; break;
                        case 't': c = '\t'; break;
                        case 'b': c = '\b'; break;
                        case 'a': c = '\a'; break;
                        default: c = *p; break;
                        }
                        current = sdscatlen(current,&c,1);
                    } else if (*p == '"') {
                        /* closing quote must be followed by a space or
                         * nothing at all. */
                        if (*(p+1) && !isspace(*(p+1))) goto err;
                        done=1;
                    } else if (!*p) {
                        /* unterminated quotes */
                        goto err;
                    } else {
                        current = sdscatlen(current,p,1);
                    }
                } else if (insq) {
                    if (*p == '\\' && *(p+1) == '\'') {
                        p++;
                        current = sdscatlen(current,"'",1);
                    } else if (*p == '\'') {
                        /* closing quote must be followed by a space or
                         * nothing at all. */
                        if (*(p+1) && !isspace(*(p+1))) goto err;
                        done=1;
                    } else if (!*p) {
                        /* unterminated quotes */
                        goto err;
                    } else {
                        current = sdscatlen(current,p,1);
                    }
                } else {
                    switch(*p) {
                    case ' ':
                    case '\n':
                    case '\r':
                    case '\t':
                    case '\0':
                        done=1;
                        break;
                    case '"':
                        inq=1;
                        break;
                    case '\'':
                        insq=1;
                        break;
                    default:
                        current = sdscatlen(current,p,1);
                        break;
                    }
                }
                if (*p) p++;
            }
            /* add the token to the vector */
            vector = s_realloc(vector,((*argc)+1)*sizeof(char*));
            vector[*argc] = current;
            (*argc)++;
            current = NULL;
        } else {
            /* Even on empty input string return something not NULL. */
            if (vector == NULL) vector = s_malloc(sizeof(void*));
            return vector;
        }
    }

err:
    while((*argc)--)
        sdsfree(vector[*argc]);
    s_free(vector);
    if (current) sdsfree(current);
    *argc = 0;
    return NULL;
}

/* Modify the string substituting all the occurrences of the set of
 * characters specified in the 'from' string to the corresponding character
 * in the 'to' array.
 *
 * For instance: sdsmapchars(mystring, "ho", "01", 2)
 * will have the effect of turning the string "hello" into "0ell1".
 *
 * The function returns the sds string pointer, that is always the same
 * as the input pointer since no resize is needed. */
sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) {
    size_t j, i, l = sdslen(s);

    for (j = 0; j < l; j++) {
        for (i = 0; i < setlen; i++) {
            if (s[j] == from[i]) {
                s[j] = to[i];
                break;
            }
        }
    }
    return s;
}

/* Join an array of C strings using the specified separator (also a C string).
 * Returns the result as an sds string. */
sds sdsjoin(char **argv, int argc, char *sep) {
    sds join = sdsempty();
    int j;

    for (j = 0; j < argc; j++) {
        join = sdscat(join, argv[j]);
        if (j != argc-1) join = sdscat(join,sep);
    }
    return join;
}

/* Like sdsjoin, but joins an array of SDS strings. */
sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen) {
    sds join = sdsempty();
    int j;

    for (j = 0; j < argc; j++) {
        join = sdscatsds(join, argv[j]);
        if (j != argc-1) join = sdscatlen(join,sep,seplen);
    }
    return join;
}

/* Wrappers to the allocators used by SDS. Note that SDS will actually
 * just use the macros defined into sdsalloc.h in order to avoid to pay
 * the overhead of function calls. Here we define these wrappers only for
 * the programs SDS is linked to, if they want to touch the SDS internals
 * even if they use a different allocator. */
void *sds_malloc(size_t size) { return s_malloc(size); }
void *sds_realloc(void *ptr, size_t size) { return s_realloc(ptr,size); }
void sds_free(void *ptr) { s_free(ptr); }

/* Perform expansion of a template string and return the result as a newly
 * allocated sds.
 *
 * Template variables are specified using curly brackets, e.g. {variable}.
 * An opening bracket can be quoted by repeating it twice.
 */
sds sdstemplate(const char *template, sdstemplate_callback_t cb_func, void *cb_arg)
{
    sds res = sdsempty();
    const char *p = template;

    while (*p) {
        /* Find next variable, copy everything until there */
        const char *sv = strchr(p, '{');
        if (!sv) {
            /* Not found: copy till rest of template and stop */
            res = sdscat(res, p);
            break;
        } else if (sv > p) {
            /* Found: copy anything up to the beginning of the variable */
            res = sdscatlen(res, p, sv - p);
        }

        /* Skip into variable name, handle premature end or quoting */
        sv++;
        if (!*sv) goto error;       /* Premature end of template */
        if (*sv == '{') {
            /* Quoted '{' */
            p = sv + 1;
            res = sdscat(res, "{");
            continue;
        }

        /* Find end of variable name, handle premature end of template */
        const char *ev = strchr(sv, '}');
        if (!ev) goto error;

        /* Pass variable name to callback and obtain value. If callback failed,
         * abort. */
        sds varname = sdsnewlen(sv, ev - sv);
        sds value = cb_func(varname, cb_arg);
        sdsfree(varname);
        if (!value) goto error;

        /* Append value to result and continue */
        res = sdscat(res, value);
        sdsfree(value);
        p = ev + 1;
    }

    return res;

error:
    sdsfree(res);
    return NULL;
}

#ifdef REDIS_TEST
#include <stdio.h>
#include <limits.h>
#include "testhelp.h"

#define UNUSED(x) (void)(x)

static sds sdsTestTemplateCallback(sds varname, void *arg) {
    UNUSED(arg);
    static const char *_var1 = "variable1";
    static const char *_var2 = "variable2";

    if (!strcmp(varname, _var1)) return sdsnew("value1");
    else if (!strcmp(varname, _var2)) return sdsnew("value2");
    else return NULL;
}

int sdsTest(int argc, char **argv, int flags) {
    UNUSED(argc);
    UNUSED(argv);
    UNUSED(flags);

    {
        sds x = sdsnew("foo"), y;

        test_cond("Create a string and obtain the length",
            sdslen(x) == 3 && memcmp(x,"foo\0",4) == 0);

        sdsfree(x);
        x = sdsnewlen("foo",2);
        test_cond("Create a string with specified length",
            sdslen(x) == 2 && memcmp(x,"fo\0",3) == 0);

        x = sdscat(x,"bar");
        test_cond("Strings concatenation",
            sdslen(x) == 5 && memcmp(x,"fobar\0",6) == 0);

        x = sdscpy(x,"a");
        test_cond("sdscpy() against an originally longer string",
            sdslen(x) == 1 && memcmp(x,"a\0",2) == 0);

        x = sdscpy(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk");
        test_cond("sdscpy() against an originally shorter string",
            sdslen(x) == 33 &&
            memcmp(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk\0",33) == 0);

        sdsfree(x);
        x = sdscatprintf(sdsempty(),"%d",123);
        test_cond("sdscatprintf() seems working in the base case",
            sdslen(x) == 3 && memcmp(x,"123\0",4) == 0);

        sdsfree(x);
        x = sdscatprintf(sdsempty(),"a%cb",0);
        test_cond("sdscatprintf() seems working with \\0 inside of result",
            sdslen(x) == 3 && memcmp(x,"a\0""b\0",4) == 0);

        {
            sdsfree(x);
            char etalon[1024*1024];
            for (size_t i = 0; i < sizeof(etalon); i++) {
                etalon[i] = '0';
            }
            x = sdscatprintf(sdsempty(),"%0*d",(int)sizeof(etalon),0);
            test_cond("sdscatprintf() can print 1MB",
                sdslen(x) == sizeof(etalon) && memcmp(x,etalon,sizeof(etalon)) == 0);
        }

        sdsfree(x);
        x = sdsnew("--");
        x = sdscatfmt(x, "Hello %s World %I,%I--", "Hi!", LLONG_MIN,LLONG_MAX);
        test_cond("sdscatfmt() seems working in the base case",
            sdslen(x) == 60 &&
            memcmp(x,"--Hello Hi! World -9223372036854775808,"
                     "9223372036854775807--",60) == 0);
        printf("[%s]\n",x);

        sdsfree(x);
        x = sdsnew("--");
        x = sdscatfmt(x, "%u,%U--", UINT_MAX, ULLONG_MAX);
        test_cond("sdscatfmt() seems working with unsigned numbers",
            sdslen(x) == 35 &&
            memcmp(x,"--4294967295,18446744073709551615--",35) == 0);

        sdsfree(x);
        x = sdsnew(" x ");
        sdstrim(x," x");
        test_cond("sdstrim() works when all chars match",
            sdslen(x) == 0);

        sdsfree(x);
        x = sdsnew(" x ");
        sdstrim(x," ");
        test_cond("sdstrim() works when a single char remains",
            sdslen(x) == 1 && x[0] == 'x');

        sdsfree(x);
        x = sdsnew("xxciaoyyy");
        sdstrim(x,"xy");
        test_cond("sdstrim() correctly trims characters",
            sdslen(x) == 4 && memcmp(x,"ciao\0",5) == 0);

        y = sdsdup(x);
        sdsrange(y,1,1);
        test_cond("sdsrange(...,1,1)",
            sdslen(y) == 1 && memcmp(y,"i\0",2) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,1,-1);
        test_cond("sdsrange(...,1,-1)",
            sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,-2,-1);
        test_cond("sdsrange(...,-2,-1)",
            sdslen(y) == 2 && memcmp(y,"ao\0",3) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,2,1);
        test_cond("sdsrange(...,2,1)",
            sdslen(y) == 0 && memcmp(y,"\0",1) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,1,100);
        test_cond("sdsrange(...,1,100)",
            sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,100,100);
        test_cond("sdsrange(...,100,100)",
            sdslen(y) == 0 && memcmp(y,"\0",1) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,4,6);
        test_cond("sdsrange(...,4,6)",
            sdslen(y) == 0 && memcmp(y,"\0",1) == 0);

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,3,6);
        test_cond("sdsrange(...,3,6)",
            sdslen(y) == 1 && memcmp(y,"o\0",2) == 0);

        sdsfree(y);
        sdsfree(x);
        x = sdsnew("foo");
        y = sdsnew("foa");
        test_cond("sdscmp(foo,foa)", sdscmp(x,y) > 0);

        sdsfree(y);
        sdsfree(x);
        x = sdsnew("bar");
        y = sdsnew("bar");
        test_cond("sdscmp(bar,bar)", sdscmp(x,y) == 0);

        sdsfree(y);
        sdsfree(x);
        x = sdsnew("aar");
        y = sdsnew("bar");
        test_cond("sdscmp(bar,bar)", sdscmp(x,y) < 0);

        sdsfree(y);
        sdsfree(x);
        x = sdsnewlen("\a\n\0foo\r",7);
        y = sdscatrepr(sdsempty(),x,sdslen(x));
        test_cond("sdscatrepr(...data...)",
            memcmp(y,"\"\\a\\n\\x00foo\\r\"",15) == 0);

        {
            unsigned int oldfree;
            char *p;
            int i;
            size_t step = 10, j;

            sdsfree(x);
            sdsfree(y);
            x = sdsnew("0");
            test_cond("sdsnew() free/len buffers", sdslen(x) == 1 && sdsavail(x) == 0);

            /* Run the test a few times in order to hit the first two
             * SDS header types. */
            for (i = 0; i < 10; i++) {
                size_t oldlen = sdslen(x);
                x = sdsMakeRoomFor(x,step);
                int type = x[-1]&SDS_TYPE_MASK;

                test_cond("sdsMakeRoomFor() len", sdslen(x) == oldlen);
                if (type != SDS_TYPE_5) {
                    test_cond("sdsMakeRoomFor() free", sdsavail(x) >= step);
                    oldfree = sdsavail(x);
                    UNUSED(oldfree);
                }
                p = x+oldlen;
                for (j = 0; j < step; j++) {
                    p[j] = 'A'+j;
                }
                sdsIncrLen(x,step);
            }
            test_cond("sdsMakeRoomFor() content",
                memcmp("0ABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJ",x,101) == 0);
            test_cond("sdsMakeRoomFor() final length",sdslen(x)==101);

            sdsfree(x);
        }

        /* Simple template */
        x = sdstemplate("v1={variable1} v2={variable2}", sdsTestTemplateCallback, NULL);
        test_cond("sdstemplate() normal flow",
                  memcmp(x,"v1=value1 v2=value2",19) == 0);
        sdsfree(x);

        /* Template with callback error */
        x = sdstemplate("v1={variable1} v3={doesnotexist}", sdsTestTemplateCallback, NULL);
        test_cond("sdstemplate() with callback error", x == NULL);

        /* Template with empty var name */
        x = sdstemplate("v1={", sdsTestTemplateCallback, NULL);
        test_cond("sdstemplate() with empty var name", x == NULL);

        /* Template with truncated var name */
        x = sdstemplate("v1={start", sdsTestTemplateCallback, NULL);
        test_cond("sdstemplate() with truncated var name", x == NULL);

        /* Template with quoting */
        x = sdstemplate("v1={{{variable1}} {{} v2={variable2}", sdsTestTemplateCallback, NULL);
        test_cond("sdstemplate() with quoting",
                  memcmp(x,"v1={value1} {} v2=value2",24) == 0);
        sdsfree(x);

        /* Test sdsresize - extend */
        x = sdsnew("1234567890123456789012345678901234567890");
        x = sdsResize(x, 200);
        test_cond("sdsrezie() expand len", sdslen(x) == 40);
        test_cond("sdsrezie() expand strlen", strlen(x) == 40);
        test_cond("sdsrezie() expand alloc", sdsalloc(x) == 200);
        /* Test sdsresize - trim free space */
        x = sdsResize(x, 80);
        test_cond("sdsrezie() shrink len", sdslen(x) == 40);
        test_cond("sdsrezie() shrink strlen", strlen(x) == 40);
        test_cond("sdsrezie() shrink alloc", sdsalloc(x) == 80);
        /* Test sdsresize - crop used space */
        x = sdsResize(x, 30);
        test_cond("sdsrezie() crop len", sdslen(x) == 30);
        test_cond("sdsrezie() crop strlen", strlen(x) == 30);
        test_cond("sdsrezie() crop alloc", sdsalloc(x) == 30);
        /* Test sdsresize - extend to different class */
        x = sdsResize(x, 400);
        test_cond("sdsrezie() expand len", sdslen(x) == 30);
        test_cond("sdsrezie() expand strlen", strlen(x) == 30);
        test_cond("sdsrezie() expand alloc", sdsalloc(x) == 400);
        /* Test sdsresize - shrink to different class */
        x = sdsResize(x, 4);
        test_cond("sdsrezie() crop len", sdslen(x) == 4);
        test_cond("sdsrezie() crop strlen", strlen(x) == 4);
        test_cond("sdsrezie() crop alloc", sdsalloc(x) == 4);
        sdsfree(x);
    }
    return 0;
}
#endif


================================================
FILE: src/redis/sds.h
================================================
/* SDSLib 2.0 -- A C dynamic strings library
 *
 * Copyright (c) 2006-2015, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2015, Oran Agra
 * Copyright (c) 2015, Redis Labs, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __SDS_H
#define __SDS_H

#define SDS_MAX_PREALLOC (1024*1024)
extern const char *SDS_NOINIT;

#include <sys/types.h>
#include <stdarg.h>
#include <stdint.h>

typedef char *sds;

/* Note: sdshdr5 is never used, we just access the flags byte directly.
 * However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

#define SDS_TYPE_5  0
#define SDS_TYPE_8  1
#define SDS_TYPE_16 2
#define SDS_TYPE_32 3
#define SDS_TYPE_64 4
#define SDS_TYPE_MASK 7
#define SDS_TYPE_BITS 3
#define SDS_HDR(T,s) ((struct sdshdr##T *)((s)-(sizeof(struct sdshdr##T))))
#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = SDS_HDR(T,s);
#define SDS_TYPE_5_LEN(f) ((f)>>SDS_TYPE_BITS)

static inline size_t sdslen(const sds s) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            return SDS_TYPE_5_LEN(flags);
        case SDS_TYPE_8:
            return SDS_HDR(8,s)->len;
        case SDS_TYPE_16:
            return SDS_HDR(16,s)->len;
        case SDS_TYPE_32:
            return SDS_HDR(32,s)->len;
        case SDS_TYPE_64:
            return SDS_HDR(64,s)->len;
    }
    return 0;
}

static inline size_t sdsavail(const sds s) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5: {
            return 0;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            return sh->alloc - sh->len;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            return sh->alloc - sh->len;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            return sh->alloc - sh->len;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            return sh->alloc - sh->len;
        }
    }
    return 0;
}

static inline void sdssetlen(sds s, size_t newlen) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            {
                unsigned char *fp = ((unsigned char*)s)-1;
                *fp = SDS_TYPE_5 | (newlen << SDS_TYPE_BITS);
            }
            break;
        case SDS_TYPE_8:
            SDS_HDR(8,s)->len = newlen;
            break;
        case SDS_TYPE_16:
            SDS_HDR(16,s)->len = newlen;
            break;
        case SDS_TYPE_32:
            SDS_HDR(32,s)->len = newlen;
            break;
        case SDS_TYPE_64:
            SDS_HDR(64,s)->len = newlen;
            break;
    }
}

static inline void sdsinclen(sds s, size_t inc) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            {
                unsigned char *fp = ((unsigned char*)s)-1;
                unsigned char newlen = SDS_TYPE_5_LEN(flags)+inc;
                *fp = SDS_TYPE_5 | (newlen << SDS_TYPE_BITS);
            }
            break;
        case SDS_TYPE_8:
            SDS_HDR(8,s)->len += inc;
            break;
        case SDS_TYPE_16:
            SDS_HDR(16,s)->len += inc;
            break;
        case SDS_TYPE_32:
            SDS_HDR(32,s)->len += inc;
            break;
        case SDS_TYPE_64:
            SDS_HDR(64,s)->len += inc;
            break;
    }
}

/* sdsalloc() = sdsavail() + sdslen() */
static inline size_t sdsalloc(const sds s) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            return SDS_TYPE_5_LEN(flags);
        case SDS_TYPE_8:
            return SDS_HDR(8,s)->alloc;
        case SDS_TYPE_16:
            return SDS_HDR(16,s)->alloc;
        case SDS_TYPE_32:
            return SDS_HDR(32,s)->alloc;
        case SDS_TYPE_64:
            return SDS_HDR(64,s)->alloc;
    }
    return 0;
}

static inline void sdssetalloc(sds s, size_t newlen) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            /* Nothing to do, this type has no total allocation info. */
            break;
        case SDS_TYPE_8:
            SDS_HDR(8,s)->alloc = newlen;
            break;
        case SDS_TYPE_16:
            SDS_HDR(16,s)->alloc = newlen;
            break;
        case SDS_TYPE_32:
            SDS_HDR(32,s)->alloc = newlen;
            break;
        case SDS_TYPE_64:
            SDS_HDR(64,s)->alloc = newlen;
            break;
    }
}

sds sdsnewlen(const void *init, size_t initlen);
sds sdsnew(const char *init);
sds sdsempty(void);
sds sdsdup(const sds s);
void sdsfree(sds s);
sds sdsgrowzero(sds s, size_t len);
sds sdscatlen(sds s, const void *t, size_t len);
sds sdscat(sds s, const char *t);
sds sdscatsds(sds s, const sds t);
sds sdscpylen(sds s, const char *t, size_t len);
sds sdscpy(sds s, const char *t);

sds sdscatvprintf(sds s, const char *fmt, va_list ap);
#ifdef __GNUC__
sds sdscatprintf(sds s, const char *fmt, ...)
    __attribute__((format(printf, 2, 3)));
#else
sds sdscatprintf(sds s, const char *fmt, ...);
#endif

sds sdscatfmt(sds s, char const *fmt, ...);
sds sdstrim(sds s, const char *cset);
void sdssubstr(sds s, size_t start, size_t len);
int sdsrange(sds s, ssize_t start, ssize_t end);
void sdsupdatelen(sds s);
void sdsclear(sds s);
int sdscmp(const sds s1, const sds s2);
sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count);
void sdsfreesplitres(sds *tokens, int count);
void sdstolower(sds s);
void sdstoupper(sds s);
sds sdsfromlonglong(long long value);
sds sdscatrepr(sds s, const char *p, size_t len);
sds *sdssplitargs(const char *line, int *argc);
sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen);
sds sdsjoin(char **argv, int argc, char *sep);
sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen);

/* Callback for sdstemplate. The function gets called by sdstemplate
 * every time a variable needs to be expanded. The variable name is
 * provided as variable, and the callback is expected to return a
 * substitution value. Returning a NULL indicates an error.
 */
typedef sds (*sdstemplate_callback_t)(const sds variable, void *arg);
sds sdstemplate(const char *templ, sdstemplate_callback_t cb_func, void *cb_arg);

/* Low level functions exposed to the user API */
sds sdsMakeRoomFor(sds s, size_t addlen);
sds sdsMakeRoomForNonGreedy(sds s, size_t addlen);
void sdsIncrLen(sds s, ssize_t incr);
sds sdsRemoveFreeSpace(sds s);
sds sdsResize(sds s, size_t size);
size_t sdsAllocSize(sds s);
void *sdsAllocPtr(sds s);

/* Export the allocator used by SDS to the program using SDS.
 * Sometimes the program SDS is linked to, may use a different set of
 * allocators, but may want to allocate or free things that SDS will
 * respectively free or allocate. */
void *sds_malloc(size_t size);
void *sds_realloc(void *ptr, size_t size);
void sds_free(void *ptr);

#ifdef REDIS_TEST
int sdsTest(int argc, char *argv[], int flags);
#endif

#endif


================================================
FILE: src/redis/sdsalloc.h
================================================
/* SDSLib 2.0 -- A C dynamic strings library
 *
 * Copyright (c) 2006-2015, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2015, Redis Labs, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* SDS allocator selection.
 *
 * This file is used in order to change the SDS allocator at compile time.
 * Just define the following defines to what you want to use. Also add
 * the include of your alternate allocator if needed (not needed in order
 * to use the default libc allocator). */

#ifndef __SDS_ALLOC_H__
#define __SDS_ALLOC_H__

#include "zmalloc.h"
#define s_malloc zmalloc
#define s_realloc zrealloc
#define s_calloc zcalloc
#define s_trymalloc ztrymalloc
#define s_tryrealloc ztryrealloc
#define s_free zfree
#define s_malloc_usable zmalloc_usable
#define s_realloc_usable zrealloc_usable
#define s_trymalloc_usable ztrymalloc_usable
#define s_tryrealloc_usable ztryrealloc_usable
#define s_free_usable zfree_usable

#endif


================================================
FILE: src/redis/siphash.c
================================================
/*
   SipHash reference C implementation

   Copyright (c) 2012-2016 Jean-Philippe Aumasson
   <jeanphilippe.aumasson@gmail.com>
   Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>
   Copyright (c) 2017 Salvatore Sanfilippo <antirez@gmail.com>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along
   with this software. If not, see
   <http://creativecommons.org/publicdomain/zero/1.0/>.

   ----------------------------------------------------------------------------

   This version was modified by Salvatore Sanfilippo <antirez@gmail.com>
   in the following ways:

   1. We use SipHash 1-2. This is not believed to be as strong as the
      suggested 2-4 variant, but AFAIK there are not trivial attacks
      against this reduced-rounds version, and it runs at the same speed
      as Murmurhash2 that we used previously, while the 2-4 variant slowed
      down Redis by a 4% figure more or less.
   2. Hard-code rounds in the hope the compiler can optimize it more
      in this raw from. Anyway we always want the standard 2-4 variant.
   3. Modify the prototype and implementation so that the function directly
      returns an uint64_t value, the hash itself, instead of receiving an
      output buffer. This also means that the output size is set to 8 bytes
      and the 16 bytes output code handling was removed.
   4. Provide a case insensitive variant to be used when hashing strings that
      must be considered identical by the hash table regardless of the case.
      If we don't have directly a case insensitive hash function, we need to
      perform a text transformation in some temporary buffer, which is costly.
   5. Remove debugging code.
   6. Modified the original test.c file to be a stand-alone function testing
      the function in the new form (returning an uint64_t) using just the
      relevant test vector.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* Fast tolower() alike function that does not care about locale
 * but just returns a-z instead of A-Z. */
int siptlw(int c) {
    if (c >= 'A' && c <= 'Z') {
        return c+('a'-'A');
    } else {
        return c;
    }
}

#if defined(__has_attribute)
#if __has_attribute(no_sanitize)
#define NO_SANITIZE(sanitizer) __attribute__((no_sanitize(sanitizer)))
#endif
#endif

#if !defined(NO_SANITIZE)
#define NO_SANITIZE(sanitizer)
#endif

/* Test of the CPU is Little Endian and supports not aligned accesses.
 * Two interesting conditions to speedup the function that happen to be
 * in most of x86 servers. */
#if defined(__X86_64__) || defined(__x86_64__) || defined (__i386__) \
	|| defined (__aarch64__) || defined (__arm64__)
#define UNALIGNED_LE_CPU
#endif

#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define U32TO8_LE(p, v)                                                        \
    (p)[0] = (uint8_t)((v));                                                   \
    (p)[1] = (uint8_t)((v) >> 8);                                              \
    (p)[2] = (uint8_t)((v) >> 16);                                             \
    (p)[3] = (uint8_t)((v) >> 24);

#define U64TO8_LE(p, v)                                                        \
    U32TO8_LE((p), (uint32_t)((v)));                                           \
    U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

#ifdef UNALIGNED_LE_CPU
#define U8TO64_LE(p) (*((uint64_t*)(p)))
#else
#define U8TO64_LE(p)                                                           \
    (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) |                        \
     ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) |                 \
     ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) |                 \
     ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
#endif

#define U8TO64_LE_NOCASE(p)                                                    \
    (((uint64_t)(siptlw((p)[0]))) |                                           \
     ((uint64_t)(siptlw((p)[1])) << 8) |                                      \
     ((uint64_t)(siptlw((p)[2])) << 16) |                                     \
     ((uint64_t)(siptlw((p)[3])) << 24) |                                     \
     ((uint64_t)(siptlw((p)[4])) << 32) |                                              \
     ((uint64_t)(siptlw((p)[5])) << 40) |                                              \
     ((uint64_t)(siptlw((p)[6])) << 48) |                                              \
     ((uint64_t)(siptlw((p)[7])) << 56))

#define SIPROUND                                                               \
    do {                                                                       \
        v0 += v1;                                                              \
        v1 = ROTL(v1, 13);                                                     \
        v1 ^= v0;                                                              \
        v0 = ROTL(v0, 32);                                                     \
        v2 += v3;                                                              \
        v3 = ROTL(v3, 16);                                                     \
        v3 ^= v2;                                                              \
        v0 += v3;                                                              \
        v3 = ROTL(v3, 21);                                                     \
        v3 ^= v0;                                                              \
        v2 += v1;                                                              \
        v1 = ROTL(v1, 17);                                                     \
        v1 ^= v2;                                                              \
        v2 = ROTL(v2, 32);                                                     \
    } while (0)

NO_SANITIZE("alignment")
uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k) {
#ifndef UNALIGNED_LE_CPU
    uint64_t hash;
    uint8_t *out = (uint8_t*) &hash;
#endif
    uint64_t v0 = 0x736f6d6570736575ULL;
    uint64_t v1 = 0x646f72616e646f6dULL;
    uint64_t v2 = 0x6c7967656e657261ULL;
    uint64_t v3 = 0x7465646279746573ULL;
    uint64_t k0 = U8TO64_LE(k);
    uint64_t k1 = U8TO64_LE(k + 8);
    uint64_t m;
    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
    const int left = inlen & 7;
    uint64_t b = ((uint64_t)inlen) << 56;
    v3 ^= k1;
    v2 ^= k0;
    v1 ^= k1;
    v0 ^= k0;

    for (; in != end; in += 8) {
        m = U8TO64_LE(in);
        v3 ^= m;

        SIPROUND;

        v0 ^= m;
    }

    switch (left) {
    case 7: b |= ((uint64_t)in[6]) << 48; /* fall-thru */
    case 6: b |= ((uint64_t)in[5]) << 40; /* fall-thru */
    case 5: b |= ((uint64_t)in[4]) << 32; /* fall-thru */
    case 4: b |= ((uint64_t)in[3]) << 24; /* fall-thru */
    case 3: b |= ((uint64_t)in[2]) << 16; /* fall-thru */
    case 2: b |= ((uint64_t)in[1]) << 8; /* fall-thru */
    case 1: b |= ((uint64_t)in[0]); break;
    case 0: break;
    }

    v3 ^= b;

    SIPROUND;

    v0 ^= b;
    v2 ^= 0xff;

    SIPROUND;
    SIPROUND;

    b = v0 ^ v1 ^ v2 ^ v3;
#ifndef UNALIGNED_LE_CPU
    U64TO8_LE(out, b);
    return hash;
#else
    return b;
#endif
}

NO_SANITIZE("alignment")
uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k)
{
#ifndef UNALIGNED_LE_CPU
    uint64_t hash;
    uint8_t *out = (uint8_t*) &hash;
#endif
    uint64_t v0 = 0x736f6d6570736575ULL;
    uint64_t v1 = 0x646f72616e646f6dULL;
    uint64_t v2 = 0x6c7967656e657261ULL;
    uint64_t v3 = 0x7465646279746573ULL;
    uint64_t k0 = U8TO64_LE(k);
    uint64_t k1 = U8TO64_LE(k + 8);
    uint64_t m;
    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
    const int left = inlen & 7;
    uint64_t b = ((uint64_t)inlen) << 56;
    v3 ^= k1;
    v2 ^= k0;
    v1 ^= k1;
    v0 ^= k0;

    for (; in != end; in += 8) {
        m = U8TO64_LE_NOCASE(in);
        v3 ^= m;

        SIPROUND;

        v0 ^= m;
    }

    switch (left) {
    case 7: b |= ((uint64_t)siptlw(in[6])) << 48; /* fall-thru */
    case 6: b |= ((uint64_t)siptlw(in[5])) << 40; /* fall-thru */
    case 5: b |= ((uint64_t)siptlw(in[4])) << 32; /* fall-thru */
    case 4: b |= ((uint64_t)siptlw(in[3])) << 24; /* fall-thru */
    case 3: b |= ((uint64_t)siptlw(in[2])) << 16; /* fall-thru */
    case 2: b |= ((uint64_t)siptlw(in[1])) << 8; /* fall-thru */
    case 1: b |= ((uint64_t)siptlw(in[0])); break;
    case 0: break;
    }

    v3 ^= b;

    SIPROUND;

    v0 ^= b;
    v2 ^= 0xff;

    SIPROUND;
    SIPROUND;

    b = v0 ^ v1 ^ v2 ^ v3;
#ifndef UNALIGNED_LE_CPU
    U64TO8_LE(out, b);
    return hash;
#else
    return b;
#endif
}


/* --------------------------------- TEST ------------------------------------ */

#ifdef SIPHASH_TEST

const uint8_t vectors_sip64[64][8] = {
    { 0x31, 0x0e, 0x0e, 0xdd, 0x47, 0xdb, 0x6f, 0x72, },
    { 0xfd, 0x67, 0xdc, 0x93, 0xc5, 0x39, 0xf8, 0x74, },
    { 0x5a, 0x4f, 0xa9, 0xd9, 0x09, 0x80, 0x6c, 0x0d, },
    { 0x2d, 0x7e, 0xfb, 0xd7, 0x96, 0x66, 0x67, 0x85, },
    { 0xb7, 0x87, 0x71, 0x27, 0xe0, 0x94, 0x27, 0xcf, },
    { 0x8d, 0xa6, 0x99, 0xcd, 0x64, 0x55, 0x76, 0x18, },
    { 0xce, 0xe3, 0xfe, 0x58, 0x6e, 0x46, 0xc9, 0xcb, },
    { 0x37, 0xd1, 0x01, 0x8b, 0xf5, 0x00, 0x02, 0xab, },
    { 0x62, 0x24, 0x93, 0x9a, 0x79, 0xf5, 0xf5, 0x93, },
    { 0xb0, 0xe4, 0xa9, 0x0b, 0xdf, 0x82, 0x00, 0x9e, },
    { 0xf3, 0xb9, 0xdd, 0x94, 0xc5, 0xbb, 0x5d, 0x7a, },
    { 0xa7, 0xad, 0x6b, 0x22, 0x46, 0x2f, 0xb3, 0xf4, },
    { 0xfb, 0xe5, 0x0e, 0x86, 0xbc, 0x8f, 0x1e, 0x75, },
    { 0x90, 0x3d, 0x84, 0xc0, 0x27, 0x56, 0xea, 0x14, },
    { 0xee, 0xf2, 0x7a, 0x8e, 0x90, 0xca, 0x23, 0xf7, },
    { 0xe5, 0x45, 0xbe, 0x49, 0x61, 0xca, 0x29, 0xa1, },
    { 0xdb, 0x9b, 0xc2, 0x57, 0x7f, 0xcc, 0x2a, 0x3f, },
    { 0x94, 0x47, 0xbe, 0x2c, 0xf5, 0xe9, 0x9a, 0x69, },
    { 0x9c, 0xd3, 0x8d, 0x96, 0xf0, 0xb3, 0xc1, 0x4b, },
    { 0xbd, 0x61, 0x79, 0xa7, 0x1d, 0xc9, 0x6d, 0xbb, },
    { 0x98, 0xee, 0xa2, 0x1a, 0xf2, 0x5c, 0xd6, 0xbe, },
    { 0xc7, 0x67, 0x3b, 0x2e, 0xb0, 0xcb, 0xf2, 0xd0, },
    { 0x88, 0x3e, 0xa3, 0xe3, 0x95, 0x67, 0x53, 0x93, },
    { 0xc8, 0xce, 0x5c, 0xcd, 0x8c, 0x03, 0x0c, 0xa8, },
    { 0x94, 0xaf, 0x49, 0xf6, 0xc6, 0x50, 0xad, 0xb8, },
    { 0xea, 0xb8, 0x85, 0x8a, 0xde, 0x92, 0xe1, 0xbc, },
    { 0xf3, 0x15, 0xbb, 0x5b, 0xb8, 0x35, 0xd8, 0x17, },
    { 0xad, 0xcf, 0x6b, 0x07, 0x63, 0x61, 0x2e, 0x2f, },
    { 0xa5, 0xc9, 0x1d, 0xa7, 0xac, 0xaa, 0x4d, 0xde, },
    { 0x71, 0x65, 0x95, 0x87, 0x66, 0x50, 0xa2, 0xa6, },
    { 0x28, 0xef, 0x49, 0x5c, 0x53, 0xa3, 0x87, 0xad, },
    { 0x42, 0xc3, 0x41, 0xd8, 0xfa, 0x92, 0xd8, 0x32, },
    { 0xce, 0x7c, 0xf2, 0x72, 0x2f, 0x51, 0x27, 0x71, },
    { 0xe3, 0x78, 0x59, 0xf9, 0x46, 0x23, 0xf3, 0xa7, },
    { 0x38, 0x12, 0x05, 0xbb, 0x1a, 0xb0, 0xe0, 0x12, },
    { 0xae, 0x97, 0xa1, 0x0f, 0xd4, 0x34, 0xe0, 0x15, },
    { 0xb4, 0xa3, 0x15, 0x08, 0xbe, 0xff, 0x4d, 0x31, },
    { 0x81, 0x39, 0x62, 0x29, 0xf0, 0x90, 0x79, 0x02, },
    { 0x4d, 0x0c, 0xf4, 0x9e, 0xe5, 0xd4, 0xdc, 0xca, },
    { 0x5c, 0x73, 0x33, 0x6a, 0x76, 0xd8, 0xbf, 0x9a, },
    { 0xd0, 0xa7, 0x04, 0x53, 0x6b, 0xa9, 0x3e, 0x0e, },
    { 0x92, 0x59, 0x58, 0xfc, 0xd6, 0x42, 0x0c, 0xad, },
    { 0xa9, 0x15, 0xc2, 0x9b, 0xc8, 0x06, 0x73, 0x18, },
    { 0x95, 0x2b, 0x79, 0xf3, 0xbc, 0x0a, 0xa6, 0xd4, },
    { 0xf2, 0x1d, 0xf2, 0xe4, 0x1d, 0x45, 0x35, 0xf9, },
    { 0x87, 0x57, 0x75, 0x19, 0x04, 0x8f, 0x53, 0xa9, },
    { 0x10, 0xa5, 0x6c, 0xf5, 0xdf, 0xcd, 0x9a, 0xdb, },
    { 0xeb, 0x75, 0x09, 0x5c, 0xcd, 0x98, 0x6c, 0xd0, },
    { 0x51, 0xa9, 0xcb, 0x9e, 0xcb, 0xa3, 0x12, 0xe6, },
    { 0x96, 0xaf, 0xad, 0xfc, 0x2c, 0xe6, 0x66, 0xc7, },
    { 0x72, 0xfe, 0x52, 0x97, 0x5a, 0x43, 0x64, 0xee, },
    { 0x5a, 0x16, 0x45, 0xb2, 0x76, 0xd5, 0x92, 0xa1, },
    { 0xb2, 0x74, 0xcb, 0x8e, 0xbf, 0x87, 0x87, 0x0a, },
    { 0x6f, 0x9b, 0xb4, 0x20, 0x3d, 0xe7, 0xb3, 0x81, },
    { 0xea, 0xec, 0xb2, 0xa3, 0x0b, 0x22, 0xa8, 0x7f, },
    { 0x99, 0x24, 0xa4, 0x3c, 0xc1, 0x31, 0x57, 0x24, },
    { 0xbd, 0x83, 0x8d, 0x3a, 0xaf, 0xbf, 0x8d, 0xb7, },
    { 0x0b, 0x1a, 0x2a, 0x32, 0x65, 0xd5, 0x1a, 0xea, },
    { 0x13, 0x50, 0x79, 0xa3, 0x23, 0x1c, 0xe6, 0x60, },
    { 0x93, 0x2b, 0x28, 0x46, 0xe4, 0xd7, 0x06, 0x66, },
    { 0xe1, 0x91, 0x5f, 0x5c, 0xb1, 0xec, 0xa4, 0x6c, },
    { 0xf3, 0x25, 0x96, 0x5c, 0xa1, 0x6d, 0x62, 0x9f, },
    { 0x57, 0x5f, 0xf2, 0x8e, 0x60, 0x38, 0x1b, 0xe5, },
    { 0x72, 0x45, 0x06, 0xeb, 0x4c, 0x32, 0x8a, 0x95, },
};


/* Test siphash using a test vector. Returns 0 if the function passed
 * all the tests, otherwise 1 is returned.
 *
 * IMPORTANT: The test vector is for SipHash 2-4. Before running
 * the test revert back the siphash() function to 2-4 rounds since
 * now it uses 1-2 rounds. */
int siphash_test(void) {
    uint8_t in[64], k[16];
    int i;
    int fails = 0;

    for (i = 0; i < 16; ++i)
        k[i] = i;

    for (i = 0; i < 64; ++i) {
        in[i] = i;
        uint64_t hash = siphash(in, i, k);
        const uint8_t *v = NULL;
        v = (uint8_t *)vectors_sip64;
        if (memcmp(&hash, v + (i * 8), 8)) {
            /* printf("fail for %d bytes\n", i); */
            fails++;
        }
    }

    /* Run a few basic tests with the case insensitive version. */
    uint64_t h1, h2;
    h1 = siphash((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678");
    h2 = siphash_nocase((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678");
    if (h1 != h2) fails++;

    h1 = siphash((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678");
    h2 = siphash_nocase((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678");
    if (h1 != h2) fails++;

    h1 = siphash((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678");
    h2 = siphash_nocase((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678");
    if (h1 == h2) fails++;

    if (!fails) return 0;
    return 1;
}

int main(void) {
    if (siphash_test() == 0) {
        printf("SipHash test: OK\n");
        return 0;
    } else {
        printf("SipHash test: FAILED\n");
        return 1;
    }
}

#endif


================================================
FILE: src/redis/stream.h
================================================
#ifndef STREAM_H
#define STREAM_H

#include "util.h"
#include "rax.h"
#include "sds.h"
#include "listpack.h"


typedef struct redisObject robj;

/* Stream item ID: a 128 bit number composed of a milliseconds time and
 * a sequence counter. IDs generated in the same millisecond (or in a past
 * millisecond if the clock jumped backward) will use the millisecond time
 * of the latest generated ID and an incremented sequence. */
typedef struct streamID {
    uint64_t ms;  /* Unix time in milliseconds. */
    uint64_t seq; /* Sequence number. */
} streamID;

typedef struct stream {
    struct rax *rax;                      /* The radix tree holding the stream. */
    uint64_t length;               /* Current number of elements inside this stream. */
    streamID last_id;              /* Zero if there are yet no items. */
    streamID first_id;             /* The first non-tombstone entry, zero if empty. */
    streamID max_deleted_entry_id; /* The maximal ID that was deleted. */
    uint64_t entries_added;        /* All time count of elements added. */
    struct rax *cgroups;                  /* Consumer groups dictionary: name -> streamCG */
} stream;

/* We define an iterator to iterate stream items in an abstract way, without
 * caring about the radix tree + listpack representation. Technically speaking
 * the iterator is only used inside streamReplyWithRange(), so could just
 * be implemented inside the function, but practically there is the AOF
 * rewriting code that also needs to iterate the stream to emit the XADD
 * commands. */
typedef struct streamIterator {
    stream *stream;         /* The stream we are iterating. */
    streamID master_id;     /* ID of the master entry at listpack head. */
    uint64_t master_fields_count;       /* Master entries # of fields. */
    unsigned char *master_fields_start; /* Master entries start in listpack. */
    unsigned char *master_fields_ptr;   /* Master field to emit next. */
    int entry_flags;                    /* Flags of entry we are emitting. */
    int rev;                /* True if iterating end to start (reverse). */
    int skip_tombstones;    /* True if not emitting tombstone entries. */
    uint64_t start_key[2];  /* Start key as 128 bit big endian. */
    uint64_t end_key[2];    /* End key as 128 bit big endian. */
    raxIterator ri;         /* Rax iterator. */
    unsigned char *lp;      /* Current listpack. */
    unsigned char *lp_ele;  /* Current listpack cursor. */
    unsigned char *lp_flags; /* Current entry flags pointer. */
    /* Buffers used to hold the string of lpGet() when the element is
     * integer encoded, so that there is no string representation of the
     * element inside the listpack itself. */
    unsigned char field_buf[LP_INTBUF_SIZE];
    unsigned char value_buf[LP_INTBUF_SIZE];
} streamIterator;

/* Consumer group. */
typedef struct streamCG {
    streamID last_id;       /* Last delivered (not acknowledged) ID for this
                               group. Consumers that will just ask for more
                               messages will served with IDs > than this. */
    long long entries_read; /* In a perfect world (CG starts at 0-0, no dels, no
                               XGROUP SETID, ...), this is the total number of
                               group reads. In the real world, the reasoning behind
                               this value is detailed at the top comment of
                               streamEstimateDistanceFromFirstEverEntry(). */
    rax *pel;               /* Pending entries list. This is a radix tree that
                               has every message delivered to consumers (without
                               the NOACK option) that was yet not acknowledged
                               as processed. The key of the radix tree is the
                               ID as a 64 bit big endian number, while the
                               associated value is a streamNACK structure.*/
    rax *consumers;         /* A radix tree representing the consumers by name
                               and their associated representation in the form
                               of streamConsumer structures. */
} streamCG;

/* A specific consumer in a consumer group.  */
typedef struct streamConsumer {
    mstime_t seen_time;   /* Last time this consumer tried to perform an action (attempted reading/claiming). */
    mstime_t active_time; /* Last time this consumer was active (successful reading/claiming). */
    sds name;             /* Consumer name. This is how the consumer
                             will be identified in the consumer group
                             protocol. Case sensitive. */
    rax *pel;             /* Consumer specific pending entries list: all
                             the pending messages delivered to this
                             consumer not yet acknowledged. Keys are
                             big endian message IDs, while values are
                             the same streamNACK structure referenced
                             in the "pel" of the consumer group structure
                             itself, so the value is shared. */
} streamConsumer;

/* Pending (yet not acknowledged) message in a consumer group. */
typedef struct streamNACK {
    mstime_t delivery_time;   /* Last time this message was delivered. */
    uint64_t delivery_count;  /* Number of times this message was delivered.*/
    streamConsumer *consumer; /* The consumer this message was delivered to
                                 in the last delivery. */
} streamNACK;


typedef struct {
  /* XADD options */
  streamID id;     /* User-provided ID, for XADD only. */
  int id_given;    /* Was an ID different than "*" specified? for XADD only. */
  int seq_given;   /* Was an ID different than "ms-*" specified? for XADD only. */
  int no_mkstream; /* if set to 1 do not create new stream */

  /* XADD + XTRIM common options */
  int trim_strategy;         /* TRIM_STRATEGY_* */
  int trim_strategy_arg_idx; /* Index of the count in MAXLEN/MINID, for rewriting. */
  int approx_trim;           /* If 1 only delete whole radix tree nodes, so
                              * the trim argument is not applied verbatim. */
  long long limit;           /* Maximum amount of entries to trim. If 0, no limitation
                              * on the amount of trimming work is enforced. */
  /* TRIM_STRATEGY_MAXLEN options */
  long long maxlen; /* After trimming, leave stream at this length . */
  /* TRIM_STRATEGY_MINID options */
  streamID minid; /* Trim by ID (No stream entries with ID < 'minid' will remain) */
} streamAddTrimArgs;

/* Prototypes of exported APIs. */
// struct client;

/* Flags for streamCreateConsumer */
#define SCC_DEFAULT 0
#define SCC_NO_NOTIFY (1 << 0)  /* Do not notify key space if consumer created */
#define SCC_NO_DIRTIFY (1 << 1) /* Do not dirty++ if consumer created */

#define SCG_INVALID_ENTRIES_READ -1
#define SCG_INVALID_LAG -1

#define TRIM_STRATEGY_NONE 0
#define TRIM_STRATEGY_MAXLEN 1
#define TRIM_STRATEGY_MINID 2

/* Every stream item inside the listpack, has a flags field that is used to
 * mark the entry as deleted, or having the same field as the "master"
 * entry at the start of the listpack. */
#define STREAM_ITEM_FLAG_NONE 0              /* No special flags. */
#define STREAM_ITEM_FLAG_DELETED (1 << 0)    /* Entry is deleted. Skip it. */
#define STREAM_ITEM_FLAG_SAMEFIELDS (1 << 1) /* Same fields as primary entry. */

void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end, int rev);
int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields);
void streamIteratorGetField(streamIterator *si,
                            unsigned char **fieldptr,
                            unsigned char **valueptr,
                            int64_t *fieldlen,
                            int64_t *valuelen);
void streamIteratorStop(streamIterator *si);
streamCG *streamCreateCG(stream *s, const char *name, size_t namelen, streamID *id, long long entries_read);
void streamDecodeID(void *buf, streamID *id);
int streamCompareID(streamID *a, streamID *b);
void streamFreeNACK(streamNACK *na);

void streamGetEdgeID(stream *s, int first, int skip_tombstones, streamID *edge_id);
long long streamEstimateDistanceFromFirstEverEntry(stream *s, streamID *id);

#endif


================================================
FILE: src/redis/t_stream.c
================================================
/*
 * Copyright (c) 2017, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>

#include "endianconv.h"
#include "stream.h"
#include "redis_aux.h"
#include "zmalloc.h"


/* For stream commands that require multiple IDs
 * when the number of IDs is less than 'STREAMID_STATIC_VECTOR_LEN',
 * avoid malloc allocation.*/
#define STREAMID_STATIC_VECTOR_LEN 8

/* Max pre-allocation for listpack. This is done to avoid abuse of a user
 * setting stream_node_max_bytes to a huge number. */
#define STREAM_LISTPACK_MAX_PRE_ALLOCATE 4096

/* Don't let listpacks grow too big, even if the user config allows it.
 * doing so can lead to an overflow (trying to store more than 32bit length
 * into the listpack header), or actually an assertion since lpInsert
 * will return NULL. */
#define STREAM_LISTPACK_MAX_SIZE (1 << 30)

/* -----------------------------------------------------------------------
 * Low level stream encoding: a radix tree of listpacks.
 * ----------------------------------------------------------------------- */
static inline int64_t lpGetIntegerIfValid(unsigned char *ele, int *valid) {
    int64_t v;
    unsigned char *e = lpGet(ele, &v, NULL);
    if (e == NULL) {
        if (valid) *valid = 1;
        return v;
    }
    long long ll;
    int ret = string2ll((char *)e, v, &ll);
    if (valid)
        *valid = ret;
    else
        serverAssert(ret != 0);
    v = ll;
    return v;
}

#define lpGetInteger(ele) lpGetIntegerIfValid(ele, NULL)

/* Get an edge streamID of a given listpack.
 * 'master_id' is an input param, used to build the 'edge_id' output param */
/* Convert the specified stream entry ID as a 128 bit big endian number, so
 * that the IDs can be sorted lexicographically. */
static void streamEncodeID(void *buf, streamID *id) {
    uint64_t e[2];
    e[0] = htonu64(id->ms);
    e[1] = htonu64(id->seq);
    memcpy(buf, e, sizeof(e));
}

/* This is the reverse of streamEncodeID(): the decoded ID will be stored
 * in the 'id' structure passed by reference. The buffer 'buf' must point
 * to a 128 bit big-endian encoded ID. */
void streamDecodeID(void *buf, streamID *id) {
    uint64_t e[2];
    memcpy(e, buf, sizeof(e));
    id->ms = ntohu64(e[0]);
    id->seq = ntohu64(e[1]);
}

/* Compare two stream IDs. Return -1 if a < b, 0 if a == b, 1 if a > b. */
int streamCompareID(streamID *a, streamID *b) {
    if (a->ms > b->ms)
        return 1;
    else if (a->ms < b->ms)
        return -1;
    /* The ms part is the same. Check the sequence part. */
    else if (a->seq > b->seq)
        return 1;
    else if (a->seq < b->seq)
        return -1;
    /* Everything is the same: IDs are equal. */
    return 0;
}

/* Retrieves the ID of the stream edge entry. An edge is either the first or
 * the last ID in the stream, and may be a tombstone. To filter out tombstones,
 * set the'skip_tombstones' argument to 1. */
void streamGetEdgeID(stream *s, int first, int skip_tombstones, streamID *edge_id) {
    streamIterator si;
    int64_t numfields;
    streamIteratorStart(&si, s, NULL, NULL, !first);
    si.skip_tombstones = skip_tombstones;
    int found = streamIteratorGetID(&si, edge_id, &numfields);
    if (!found) {
        streamID min_id = {0, 0}, max_id = {UINT64_MAX, UINT64_MAX};
        *edge_id = first ? max_id : min_id;
    }
    streamIteratorStop(&si);
}

/* Initialize the stream iterator, so that we can call iterating functions
 * to get the next items. This requires a corresponding streamIteratorStop()
 * at the end. The 'rev' parameter controls the direction. If it's zero the
 * iteration is from the start to the end element (inclusive), otherwise
 * if rev is non-zero, the iteration is reversed.
 *
 * Once the iterator is initialized, we iterate like this:
 *
 *  streamIterator myiterator;
 *  streamIteratorStart(&myiterator,...);
 *  int64_t numfields;
 *  while(streamIteratorGetID(&myiterator,&ID,&numfields)) {
 *      while(numfields--) {
 *          unsigned char *key, *value;
 *          size_t key_len, value_len;
 *          streamIteratorGetField(&myiterator,&key,&value,&key_len,&value_len);
 *
 *          ... do what you want with key and value ...
 *      }
 *  }
 *  streamIteratorStop(&myiterator); */
void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end, int rev) {
    /* Initialize the iterator and translates the iteration start/stop
     * elements into a 128 big big-endian number. */
    if (start) {
        streamEncodeID(si->start_key, start);
    } else {
        si->start_key[0] = 0;
        si->start_key[1] = 0;
    }

    if (end) {
        streamEncodeID(si->end_key, end);
    } else {
        si->end_key[0] = UINT64_MAX;
        si->end_key[1] = UINT64_MAX;
    }

    /* Seek the correct node in the radix tree. */
    raxStart(&si->ri, s->rax);
    if (!rev) {
        if (start && (start->ms || start->seq)) {
            raxSeek(&si->ri, "<=", (unsigned char *)si->start_key, sizeof(si->start_key));
            if (raxEOF(&si->ri)) raxSeek(&si->ri, "^", NULL, 0);
        } else {
            raxSeek(&si->ri, "^", NULL, 0);
        }
    } else {
        if (end && (end->ms || end->seq)) {
            raxSeek(&si->ri, "<=", (unsigned char *)si->end_key, sizeof(si->end_key));
            if (raxEOF(&si->ri)) raxSeek(&si->ri, "$", NULL, 0);
        } else {
            raxSeek(&si->ri, "$", NULL, 0);
        }
    }
    si->stream = s;
    si->lp = NULL;           /* There is no current listpack right now. */
    si->lp_ele = NULL;       /* Current listpack cursor. */
    si->rev = rev;           /* Direction, if non-zero reversed, from end to start. */
    si->skip_tombstones = 1; /* By default tombstones aren't emitted. */
}

/* Return 1 and store the current item ID at 'id' if there are still
 * elements within the iteration range, otherwise return 0 in order to
 * signal the iteration terminated. */
int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) {
    while (1) { /* Will stop when element > stop_key or end of radix tree. */
        /* If the current listpack is set to NULL, this is the start of the
         * iteration or the previous listpack was completely iterated.
         * Go to the next node. */
        if (si->lp == NULL || si->lp_ele == NULL) {
            if (!si->rev && !raxNext(&si->ri))
                return 0;
            else if (si->rev && !raxPrev(&si->ri))
                return 0;
            serverAssert(si->ri.key_len == sizeof(streamID));
            /* Get the master ID. */
            streamDecodeID(si->ri.key,&si->master_id);
            /* Get the master fields count. */
            si->lp = si->ri.data;
            si->lp_ele = lpFirst(si->lp);           /* Seek items count */
            si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek deleted count. */
            si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek num fields. */
            si->master_fields_count = lpGetInteger(si->lp_ele);
            si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek first field. */
            si->master_fields_start = si->lp_ele;
            /* We are now pointing to the first field of the master entry.
             * We need to seek either the first or the last entry depending
             * on the direction of the iteration. */
            if (!si->rev) {
                /* If we are iterating in normal order, skip the master fields
                 * to seek the first actual entry. */
                for (uint64_t i = 0; i < si->master_fields_count; i++)
                    si->lp_ele = lpNext(si->lp,si->lp_ele);
            } else {
                /* If we are iterating in reverse direction, just seek the
                 * last part of the last entry in the listpack (that is, the
                 * fields count). */
                si->lp_ele = lpLast(si->lp);
            }
        } else if (si->rev) {
            /* If we are iterating in the reverse order, and this is not
             * the first entry emitted for this listpack, then we already
             * emitted the current entry, and have to go back to the previous
             * one. */
            int64_t lp_count = lpGetInteger(si->lp_ele);
            while (lp_count--) si->lp_ele = lpPrev(si->lp, si->lp_ele);
            /* Seek lp-count of prev entry. */
            si->lp_ele = lpPrev(si->lp, si->lp_ele);
        }

        /* For every radix tree node, iterate the corresponding listpack,
         * returning elements when they are within range. */
        while (1) {
            if (!si->rev) {
                /* If we are going forward, skip the previous entry
                 * lp-count field (or in case of the master entry, the zero
                 * term field) */
                si->lp_ele = lpNext(si->lp,si->lp_ele);
                if (si->lp_ele == NULL) break;
            } else {
                /* If we are going backward, read the number of elements this
                 * entry is composed of, and jump backward N times to seek
                 * its start. */
                int64_t lp_count = lpGetInteger(si->lp_ele);
                if (lp_count == 0) { /* We reached the master entry. */
                    si->lp = NULL;
                    si->lp_ele = NULL;
                    break;
                }
                while(lp_count--) si->lp_ele = lpPrev(si->lp,si->lp_ele);
            }

            /* Get the flags entry. */
            si->lp_flags = si->lp_ele;
            int64_t flags = lpGetInteger(si->lp_ele);
            si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek ID. */

            /* Get the ID: it is encoded as difference between the master
             * ID and this entry ID. */
            *id = si->master_id;
            id->ms += lpGetInteger(si->lp_ele);
            si->lp_ele = lpNext(si->lp, si->lp_ele);
            id->seq += lpGetInteger(si->lp_ele);
            si->lp_ele = lpNext(si->lp, si->lp_ele);
            unsigned char buf[sizeof(streamID)];
            streamEncodeID(buf, id);

            /* The number of entries is here or not depending on the
             * flags. */
            if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) {
                *numfields = si->master_fields_count;
            } else {
                *numfields = lpGetInteger(si->lp_ele);
                si->lp_ele = lpNext(si->lp, si->lp_ele);
            }
            serverAssert(*numfields >= 0);

            /* If current >= start, and the entry is not marked as
             * deleted or tombstones are included, emit it. */
            if (!si->rev) {
                if (memcmp(buf,si->start_key,sizeof(streamID)) >= 0 &&
                    (!si->skip_tombstones || !(flags & STREAM_ITEM_FLAG_DELETED)))
                {
                    if (memcmp(buf,si->end_key,sizeof(streamID)) > 0)
                        return 0; /* We are already out of range. */
                    si->entry_flags = flags;
                    if (flags & STREAM_ITEM_FLAG_SAMEFIELDS)
                        si->master_fields_ptr = si->master_fields_start;
                    return 1; /* Valid item returned. */
                }
            } else {
                if (memcmp(buf, si->end_key, sizeof(streamID)) <= 0 &&
                    (!si->skip_tombstones || !(flags & STREAM_ITEM_FLAG_DELETED))) {
                    if (memcmp(buf, si->start_key, sizeof(streamID)) < 0) return 0; /* We are already out of range. */
                    si->entry_flags = flags;
                    if (flags & STREAM_ITEM_FLAG_SAMEFIELDS)
                        si->master_fields_ptr = si->master_fields_start;
                    return 1; /* Valid item returned. */
                }
            }

            /* If we do not emit, we have to discard if we are going
             * forward, or seek the previous entry if we are going
             * backward. */
            if (!si->rev) {
                int64_t to_discard = (flags & STREAM_ITEM_FLAG_SAMEFIELDS) ? *numfields : *numfields * 2;
                for (int64_t i = 0; i < to_discard; i++) si->lp_ele = lpNext(si->lp, si->lp_ele);
            } else {
                int64_t prev_times = 4; /* flag + id ms + id seq + one more to
                                           go back to the previous entry "count"
                                           field. */
                /* If the entry was not flagged SAMEFIELD we also read the
                 * number of fields, so go back one more. */
                if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) prev_times++;
                while (prev_times--) si->lp_ele = lpPrev(si->lp, si->lp_ele);
            }
        }

        /* End of listpack reached. Try the next/prev radix tree node. */
    }
}

/* Get the field and value of the current item we are iterating. This should
 * be called immediately after streamIteratorGetID(), and for each field
 * according to the number of fields returned by streamIteratorGetID().
 * The function populates the field and value pointers and the corresponding
 * lengths by reference, that are valid until the next iterator call, assuming
 * no one touches the stream meanwhile. */
void streamIteratorGetField(streamIterator *si, unsigned char **fieldptr, unsigned char **valueptr, int64_t *fieldlen, int64_t *valuelen) {
    if (si->entry_flags & STREAM_ITEM_FLAG_SAMEFIELDS) {
        *fieldptr = lpGet(si->master_fields_ptr,fieldlen,si->field_buf);
        si->master_fields_ptr = lpNext(si->lp,si->master_fields_ptr);
    } else {
        *fieldptr = lpGet(si->lp_ele, fieldlen, si->field_buf);
        si->lp_ele = lpNext(si->lp, si->lp_ele);
    }
    *valueptr = lpGet(si->lp_ele, valuelen, si->value_buf);
    si->lp_ele = lpNext(si->lp, si->lp_ele);
}

/* Remove the current entry from the stream: can be called after the
 * GetID() API or after any GetField() call, however we need to iterate
 * a valid entry while calling this function. Moreover the function
 * requires the entry ID we are currently iterating, that was previously
 * returned by GetID().
 *
 * Note that after calling this function, next calls to GetField() can't
 * be performed: the entry is now deleted. Instead the iterator will
 * automatically re-seek to the next entry, so the caller should continue
 * with GetID(). */

/* Stop the stream iterator. The only cleanup we need is to free the rax
 * iterator, since the stream iterator itself is supposed to be stack
 * allocated. */
void streamIteratorStop(streamIterator *si) {
    raxStop(&si->ri);
}

static int streamIDEqZero(streamID *id) {
    return !(id->ms || id->seq);
}

/* This function returns a value that is the ID's logical read counter, or its
 * distance (the number of entries) from the first entry ever to have been added
 * to the stream.
 *
 * A counter is returned only in one of the following cases:
 * 1. The ID is the same as the stream's last ID. In this case, the returned
 *    is the same as the stream's entries_added counter.
 * 2. The ID equals that of the currently first entry in the stream, and the
 *    stream has no tombstones. The returned value, in this case, is the result
 *    of subtracting the stream's length from its added_entries, incremented by
 *    one.
 * 3. The ID less than the stream's first current entry's ID, and there are no
 *    tombstones. Here the estimated counter is the result of subtracting the
 *    stream's length from its added_entries.
 * 4. The stream's added_entries is zero, meaning that no entries were ever
 *    added.
 *
 * The special return value of ULLONG_MAX signals that the counter's value isn't
 * obtainable. It is returned in these cases:
 * 1. The provided ID, if it even exists, is somewhere between the stream's
 *    current first and last entries' IDs, or in the future.
 * 2. The stream contains one or more tombstones. */
long long streamEstimateDistanceFromFirstEverEntry(stream *s, streamID *id) {
    /* The counter of any ID in an empty, never-before-used stream is 0. */
    if (!s->entries_added) {
        return 0;
    }

    /* In the empty stream, if the ID is smaller or equal to the last ID,
     * it can set to the current added_entries value. */
    if (!s->length && streamCompareID(id, &s->last_id) < 1) {
        return s->entries_added;
    }

    if (!streamIDEqZero(id) && streamCompareID(id, &s->max_deleted_entry_id) < 0) {
        /* The ID is before the last tombstone, so the counter is unknown. */
        return SCG_INVALID_ENTRIES_READ;
    }

    int cmp_last = streamCompareID(id, &s->last_id);
    if (cmp_last == 0) {
        /* Return the exact counter of the last entry in the stream. */
        return s->entries_added;
    } else if (cmp_last > 0) {
        /* The counter of a future ID is unknown. */
        return SCG_INVALID_ENTRIES_READ;
    }

    int cmp_id_first = streamCompareID(id, &s->first_id);
    int cmp_xdel_first = streamCompareID(&s->max_deleted_entry_id, &s->first_id);
    if (streamIDEqZero(&s->max_deleted_entry_id) || cmp_xdel_first < 0) {
        /* There's definitely no fragmentation ahead. */
        if (cmp_id_first < 0) {
            /* Return the estimated counter. */
            return s->entries_added - s->length;
        } else if (cmp_id_first == 0) {
            /* Return the exact counter of the first entry in the stream. */
            return s->entries_added - s->length + 1;
        }
    }

    /* The ID is either before an XDEL that fragments the stream or an arbitrary
     * ID. Either case, so we can't make a prediction. */
    return SCG_INVALID_ENTRIES_READ;
}

/* Send the stream items in the specified range to the client 'c'. The range
 * the client will receive is between start and end inclusive, if 'count' is
 * non zero, no more than 'count' elements are sent.
 *
 * The 'end' pointer can be NULL to mean that we want all the elements from
 * 'start' till the end of the stream. If 'rev' is non zero, elements are
 * produced in reversed order from end to start.
 *
 * The function returns the number of entries emitted.
 *
 * If group and consumer are not NULL, the function performs additional work:
 * 1. It updates the last delivered ID in the group in case we are
 *    sending IDs greater than the current last ID.
 * 2. If the requested IDs are already assigned to some other consumer, the
 *    function will not return it to the client.
 * 3. An entry in the pending list will be created for every entry delivered
 *    for the first time to this consumer.
 * 4. The group's read counter is incremented if it is already valid and there
 *    are no future tombstones, or is invalidated (set to 0) otherwise. If the
 *    counter is invalid to begin with, we try to obtain it for the last
 *    delivered ID.
 *
 * The behavior may be modified passing non-zero flags:
 *
 * STREAM_RWR_NOACK: Do not create PEL entries, that is, the point "3" above
 *                   is not performed.
 * STREAM_RWR_RAWENTRIES: Do not emit array boundaries, but just the entries,
 *                        and return the number of entries emitted as usually.
 *                        This is used when the function is just used in order
 *                        to emit data and there is some higher level logic.
 *
 * The final argument 'spi' (stream propagation info pointer) is a structure
 * filled with information needed to propagate the command execution to AOF
 * and replicas, in the case a consumer group was passed: we need to generate
 * XCLAIM commands to create the pending list into AOF/replicas in that case.
 *
 * If 'spi' is set to NULL no propagation will happen even if the group was
 * given, but currently such a feature is never used by the code base that
 * will always pass 'spi' and propagate when a group is passed.
 *
 * Note that this function is recursive in certain cases. When it's called
 * with a non NULL group and consumer argument, it may call
 * streamReplyWithRangeFromConsumerPEL() in order to get entries from the
 * consumer pending entries list. However such a function will then call
 * streamReplyWithRange() in order to emit single entries (found in the
 * PEL by ID) to the client. This is the use case for the STREAM_RWR_RAWENTRIES
 * flag.
 */
#define STREAM_RWR_NOACK (1 << 0) /* Do not create entries in the PEL. */
#define STREAM_RWR_RAWENTRIES                                         \
    (1 << 1)                        /* Do not emit protocol for array \
                                       boundaries, just the entries. */
#define STREAM_RWR_HISTORY (1 << 2) /* Only serve consumer local PEL. */


/* -----------------------------------------------------------------------
 * Low level implementation of consumer groups
 * ----------------------------------------------------------------------- */


/* Free a NACK entry. */
void streamFreeNACK(streamNACK *na) {
    zfree(na);
}

/* Create a new consumer group in the context of the stream 's', having the
 * specified name, last server ID and reads counter. If a consumer group with
 * the same name already exists NULL is returned, otherwise the pointer to the
 * consumer group is returned. */
streamCG *streamCreateCG(stream *s, const char *name, size_t namelen, streamID *id, long long entries_read) {
    if (s->cgroups == NULL) s->cgroups = raxNew();
    if (raxFind(s->cgroups, (unsigned char *)name, namelen, NULL)) return NULL;

    streamCG *cg = zmalloc(sizeof(*cg));
    cg->pel = raxNew();
    cg->consumers = raxNew();
    cg->last_id = *id;
    cg->entries_read = entries_read;
    raxInsert(s->cgroups, (unsigned char *)name, namelen, cg, NULL);
    return cg;
}


================================================
FILE: src/redis/util.c
================================================
/*
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <unistd.h>
#include <sys/time.h>
#include <float.h>
#include <stdint.h>
#include <errno.h>
#include <time.h>
#include "util.h"


/* Return the number of digits of 'v' when converted to string in radix 10.
 * See ll2string() for more information. */
static uint32_t digits10(uint64_t v) {
    if (v < 10) return 1;
    if (v < 100) return 2;
    if (v < 1000) return 3;
    if (v < 1000000000000UL) {
        if (v < 100000000UL) {
            if (v < 1000000) {
                if (v < 10000) return 4;
                return 5 + (v >= 100000);
            }
            return 7 + (v >= 10000000UL);
        }
        if (v < 10000000000UL) {
            return 9 + (v >= 1000000000UL);
        }
        return 11 + (v >= 100000000000UL);
    }
    return 12 + digits10(v / 1000000000000UL);
}

/* Convert a long long into a string. Returns the number of
 * characters needed to represent the number.
 * If the buffer is not big enough to store the string, 0 is returned.
 *
 * Based on the following article (that apparently does not provide a
 * novel approach but only publicizes an already used technique):
 *
 * https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920
 *
 * Modified in order to handle signed integers since the original code was
 * designed for unsigned integers. */
int ll2string(char *dst, size_t dstlen, long long svalue) {
    static const char digits[201] =
        "0001020304050607080910111213141516171819"
        "2021222324252627282930313233343536373839"
        "4041424344454647484950515253545556575859"
        "6061626364656667686970717273747576777879"
        "8081828384858687888990919293949596979899";
    int negative;
    unsigned long long value;

    /* The main loop works with 64bit unsigned integers for simplicity, so
     * we convert the number here and remember if it is negative. */
    if (svalue < 0) {
        if (svalue != LLONG_MIN) {
            value = -svalue;
        } else {
            value = ((unsigned long long) LLONG_MAX)+1;
        }
        negative = 1;
    } else {
        value = svalue;
        negative = 0;
    }

    /* Check length. */
    uint32_t const length = digits10(value)+negative;
    if (length >= dstlen) return 0;

    /* Null term. */
    uint32_t next = length;
    dst[next] = '\0';
    next--;
    while (value >= 100) {
        int const i = (value % 100) * 2;
        value /= 100;
        dst[next] = digits[i + 1];
        dst[next - 1] = digits[i];
        next -= 2;
    }

    /* Handle last 1-2 digits. */
    if (value < 10) {
        dst[next] = '0' + (uint32_t) value;
    } else {
        int i = (uint32_t) value * 2;
        dst[next] = digits[i + 1];
        dst[next - 1] = digits[i];
    }

    /* Add sign. */
    if (negative) dst[0] = '-';
    return length;
}

/* Convert a string into a long long. Returns 1 if the string could be parsed
 * into a (non-overflowing) long long, 0 otherwise. The value will be set to
 * the parsed value when appropriate.
 *
 * Note that this function demands that the string strictly represents
 * a long long: no spaces or other characters before or after the string
 * representing the number are accepted, nor zeroes at the start if not
 * for the string "0" representing the zero number.
 *
 * Because of its strictness, it is safe to use this function to check if
 * you can convert a string into a long long, and obtain back the string
 * from the number without any loss in the string representation. */
int string2ll(const char *s, size_t slen, long long *value) {
    const char *p = s;
    size_t plen = 0;
    int negative = 0;
    unsigned long long v;

    /* A zero length string is not a valid number. */
    if (plen == slen)
        return 0;

    /* Special case: first and only digit is 0. */
    if (slen == 1 && p[0] == '0') {
        if (value != NULL) *value = 0;
        return 1;
    }

    /* Handle negative numbers: just set a flag and continue like if it
     * was a positive number. Later convert into negative. */
    if (p[0] == '-') {
        negative = 1;
        p++; plen++;

        /* Abort on only a negative sign. */
        if (plen == slen)
            return 0;
    }

    /* First digit should be 1-9, otherwise the string should just be 0. */
    if (p[0] >= '1' && p[0] <= '9') {
        v = p[0]-'0';
        p++; plen++;
    } else {
        return 0;
    }

    /* Parse all the other digits, checking for overflow at every step. */
    while (plen < slen && p[0] >= '0' && p[0] <= '9') {
        if (v > (ULLONG_MAX / 10)) /* Overflow. */
            return 0;
        v *= 10;

        if (v > (ULLONG_MAX - (p[0]-'0'))) /* Overflow. */
            return 0;
        v += p[0]-'0';

        p++; plen++;
    }

    /* Return if not all bytes were used. */
    if (plen < slen)
        return 0;

    /* Convert to negative if needed, and do the final overflow check when
     * converting from unsigned long long to long long. */
    if (negative) {
        if (v > ((unsigned long long)(-(LLONG_MIN+1))+1)) /* Overflow. */
            return 0;
        if (value != NULL) *value = -v;
    } else {
        if (v > LLONG_MAX) /* Overflow. */
            return 0;
        if (value != NULL) *value = v;
    }
    return 1;
}


================================================
FILE: src/redis/util.h
================================================
/*
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __REDIS_UTIL_H
#define __REDIS_UTIL_H

#include <stdint.h>
#include <time.h>
#include <unistd.h>


/* The maximum number of characters needed to represent a long double
 * as a string (long double has a huge range).
 * This should be the size of the buffer given to ld2string */
#define MAX_LONG_DOUBLE_CHARS 5*1024

/* Error codes */
#define C_OK                    0
#define C_ERR                   -1


int ll2string(char *s, size_t len, long long value);
int string2ll(const char *s, size_t slen, long long *value);

#define LOG_MAX_LEN    1024 /* Default maximum length of syslog messages.*/

/* Log levels */
#define LL_DEBUG 0
#define LL_VERBOSE 1
#define LL_NOTICE 2
#define LL_WARNING 3
#define LL_RAW (1<<10) /* Modifier to log without timestamp */


/* Bytes needed for long -> str + '\0' */
#define LONG_STR_SIZE 21

void serverLog(int level, const char *fmt, ...);
void _serverPanic(const char *file, int line, const char *msg, ...);
void _serverAssert(const char *estr, const char *file, int line);

#define serverPanic(...) _serverPanic(__FILE__,__LINE__,__VA_ARGS__),_exit(1)
#define serverAssert(_e) ((_e)?(void)0 : (_serverAssert(#_e,__FILE__,__LINE__),_exit(1)))

typedef long long mstime_t; /* millisecond time type. */


#endif


================================================
FILE: src/redis/ziplist.c
================================================
/* The ziplist is a specially encoded dually linked list that is designed
 * to be very memory efficient. It stores both strings and integer values,
 * where integers are encoded as actual integers instead of a series of
 * characters. It allows push and pop operations on either side of the list
 * in O(1) time. However, because every operation requires a reallocation of
 * the memory used by the ziplist, the actual complexity is related to the
 * amount of memory used by the ziplist.
 *
 * ----------------------------------------------------------------------------
 *
 * ZIPLIST OVERALL LAYOUT
 * ======================
 *
 * The general layout of the ziplist is as follows:
 *
 * <zlbytes> <zltail> <zllen> <entry> <entry> ... <entry> <zlend>
 *
 * NOTE: all fields are stored in little endian, if not specified otherwise.
 *
 * <uint32_t zlbytes> is an unsigned integer to hold the number of bytes that
 * the ziplist occupies, including the four bytes of the zlbytes field itself.
 * This value needs to be stored to be able to resize the entire structure
 * without the need to traverse it first.
 *
 * <uint32_t zltail> is the offset to the last entry in the list. This allows
 * a pop operation on the far side of the list without the need for full
 * traversal.
 *
 * <uint16_t zllen> is the number of entries. When there are more than
 * 2^16-2 entries, this value is set to 2^16-1 and we need to traverse the
 * entire list to know how many items it holds.
 *
 * <uint8_t zlend> is a special entry representing the end of the ziplist.
 * Is encoded as a single byte equal to 255. No other normal entry starts
 * with a byte set to the value of 255.
 *
 * ZIPLIST ENTRIES
 * ===============
 *
 * Every entry in the ziplist is prefixed by metadata that contains two pieces
 * of information. First, the length of the previous entry is stored to be
 * able to traverse the list from back to front. Second, the entry encoding is
 * provided. It represents the entry type, integer or string, and in the case
 * of strings it also represents the length of the string payload.
 * So a complete entry is stored like this:
 *
 * <prevlen> <encoding> <entry-data>
 *
 * Sometimes the encoding represents the entry itself, like for small integers
 * as we'll see later. In such a case the <entry-data> part is missing, and we
 * could have just:
 *
 * <prevlen> <encoding>
 *
 * The length of the previous entry, <prevlen>, is encoded in the following way:
 * If this length is smaller than 254 bytes, it will only consume a single
 * byte representing the length as an unsigned 8 bit integer. When the length
 * is greater than or equal to 254, it will consume 5 bytes. The first byte is
 * set to 254 (FE) to indicate a larger value is following. The remaining 4
 * bytes take the length of the previous entry as value.
 *
 * So practically an entry is encoded in the following way:
 *
 * <prevlen from 0 to 253> <encoding> <entry>
 *
 * Or alternatively if the previous entry length is greater than 253 bytes
 * the following encoding is used:
 *
 * 0xFE <4 bytes unsigned little endian prevlen> <encoding> <entry>
 *
 * The encoding field of the entry depends on the content of the
 * entry. When the entry is a string, the first 2 bits of the encoding first
 * byte will hold the type of encoding used to store the length of the string,
 * followed by the actual length of the string. When the entry is an integer
 * the first 2 bits are both set to 1. The following 2 bits are used to specify
 * what kind of integer will be stored after this header. An overview of the
 * different types and encodings is as follows. The first byte is always enough
 * to determine the kind of entry.
 *
 * |00pppppp| - 1 byte
 *      String value with length less than or equal to 63 bytes (6 bits).
 *      "pppppp" represents the unsigned 6 bit length.
 * |01pppppp|qqqqqqqq| - 2 bytes
 *      String value with length less than or equal to 16383 bytes (14 bits).
 *      IMPORTANT: The 14 bit number is stored in big endian.
 * |10000000|qqqqqqqq|rrrrrrrr|ssssssss|tttttttt| - 5 bytes
 *      String value with length greater than or equal to 16384 bytes.
 *      Only the 4 bytes following the first byte represents the length
 *      up to 2^32-1. The 6 lower bits of the first byte are not used and
 *      are set to zero.
 *      IMPORTANT: The 32 bit number is stored in big endian.
 * |11000000| - 3 bytes
 *      Integer encoded as int16_t (2 bytes).
 * |11010000| - 5 bytes
 *      Integer encoded as int32_t (4 bytes).
 * |11100000| - 9 bytes
 *      Integer encoded as int64_t (8 bytes).
 * |11110000| - 4 bytes
 *      Integer encoded as 24 bit signed (3 bytes).
 * |11111110| - 2 bytes
 *      Integer encoded as 8 bit signed (1 byte).
 * |1111xxxx| - (with xxxx between 0001 and 1101) immediate 4 bit integer.
 *      Unsigned integer from 0 to 12. The encoded value is actually from
 *      1 to 13 because 0000 and 1111 can not be used, so 1 should be
 *      subtracted from the encoded 4 bit value to obtain the right value.
 * |11111111| - End of ziplist special entry.
 *
 * Like for the ziplist header, all the integers are represented in little
 * endian byte order, even when this code is compiled in big endian systems.
 *
 * EXAMPLES OF ACTUAL ZIPLISTS
 * ===========================
 *
 * The following is a ziplist containing the two elements representing
 * the strings "2" and "5". It is composed of 15 bytes, that we visually
 * split into sections:
 *
 *  [0f 00 00 00] [0c 00 00 00] [02 00] [00 f3] [02 f6] [ff]
 *        |             |          |       |       |     |
 *     zlbytes        zltail     zllen    "2"     "5"   end
 *
 * The first 4 bytes represent the number 15, that is the number of bytes
 * the whole ziplist is composed of. The second 4 bytes are the offset
 * at which the last ziplist entry is found, that is 12, in fact the
 * last entry, that is "5", is at offset 12 inside the ziplist.
 * The next 16 bit integer represents the number of elements inside the
 * ziplist, its value is 2 since there are just two elements inside.
 * Finally "00 f3" is the first entry representing the number 2. It is
 * composed of the previous entry length, which is zero because this is
 * our first entry, and the byte F3 which corresponds to the encoding
 * |1111xxxx| with xxxx between 0001 and 1101. We need to remove the "F"
 * higher order bits 1111, and subtract 1 from the "3", so the entry value
 * is "2". The next entry has a prevlen of 02, since the first entry is
 * composed of exactly two bytes. The entry itself, F6, is encoded exactly
 * like the first entry, and 6-1 = 5, so the value of the entry is 5.
 * Finally the special entry FF signals the end of the ziplist.
 *
 * Adding another element to the above string with the value "Hello World"
 * allows us to show how the ziplist encodes small strings. We'll just show
 * the hex dump of the entry itself. Imagine the bytes as following the
 * entry that stores "5" in the ziplist above:
 *
 * [02] [0b] [48 65 6c 6c 6f 20 57 6f 72 6c 64]
 *
 * The first byte, 02, is the length of the previous entry. The next
 * byte represents the encoding in the pattern |00pppppp| that means
 * that the entry is a string of length <pppppp>, so 0B means that
 * an 11 bytes string follows. From the third byte (48) to the last (64)
 * there are just the ASCII characters for "Hello World".
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2009-2012, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2009-2017, 2020, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <limits.h>
#include "zmalloc.h"
#include "util.h"
#include "ziplist.h"
#include "config.h"
#include "endianconv.h"

#define ZIP_END 255         /* Special "end of ziplist" entry. */
#define ZIP_BIG_PREVLEN                                                                                                \
    254 /* ZIP_BIG_PREVLEN - 1 is the max number of bytes of                                                           \
           the previous entry, for the "prevlen" field prefixing                                                       \
           each entry, to be represented with just a single byte.                                                      \
           Otherwise it is represented as FE AA BB CC DD, where                                                        \
           AA BB CC DD are a 4 bytes unsigned integer                                                                  \
                               representing the previous entry len. */

/* Different encoding/length possibilities */
#define ZIP_STR_MASK 0xc0
#define ZIP_INT_MASK 0x30
#define ZIP_STR_06B (0 << 6)
#define ZIP_STR_14B (1 << 6)
#define ZIP_STR_32B (2 << 6)
#define ZIP_INT_16B (0xc0 | 0<<4)
#define ZIP_INT_32B (0xc0 | 1<<4)
#define ZIP_INT_64B (0xc0 | 2<<4)
#define ZIP_INT_24B (0xc0 | 3<<4)
#define ZIP_INT_8B 0xfe

/* 4 bit integer immediate encoding |1111xxxx| with xxxx between
 * 0001 and 1101. */
#define ZIP_INT_IMM_MASK                                                                                               \
    0x0f                     /* Mask to extract the 4 bits value. To add                                               \
                                   one is needed to reconstruct the value. */
#define ZIP_INT_IMM_MIN 0xf1    /* 11110001 */
#define ZIP_INT_IMM_MAX 0xfd    /* 11111101 */

#define INT24_MAX 0x7fffff
#define INT24_MIN (-INT24_MAX - 1)

/* Macro to determine if the entry is a string. String entries never start
 * with "11" as most significant bits of the first byte. */
#define ZIP_IS_STR(enc) (((enc) & ZIP_STR_MASK) < ZIP_STR_MASK)

/* Utility macros.*/

/* Return total bytes a ziplist is composed of. */
#define ZIPLIST_BYTES(zl)       (*((uint32_t*)(zl)))

/* Return the offset of the last item inside the ziplist. */
#define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t))))

/* Return the length of a ziplist, or UINT16_MAX if the length cannot be
 * determined without scanning the whole ziplist. */
#define ZIPLIST_LENGTH(zl)      (*((uint16_t*)((zl)+sizeof(uint32_t)*2)))

/* The size of a ziplist header: two 32 bit integers for the total
 * bytes count and last item offset. One 16 bit integer for the number
 * of items field. */
#define ZIPLIST_HEADER_SIZE     (sizeof(uint32_t)*2+sizeof(uint16_t))

/* Size of the "end of ziplist" entry. Just one byte. */
#define ZIPLIST_END_SIZE        (sizeof(uint8_t))

/* Return the pointer to the first entry of a ziplist. */
#define ZIPLIST_ENTRY_HEAD(zl)  ((zl)+ZIPLIST_HEADER_SIZE)

/* Return the pointer to the last entry of a ziplist, using the
 * last entry offset inside the ziplist header. */
#define ZIPLIST_ENTRY_TAIL(zl)  ((zl)+intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)))

/* Return the pointer to the last byte of a ziplist, which is, the
 * end of ziplist FF entry. */
#define ZIPLIST_ENTRY_END(zl)   ((zl)+intrev32ifbe(ZIPLIST_BYTES(zl))-ZIPLIST_END_SIZE)

/* Increment the number of items field in the ziplist header. Note that this
 * macro should never overflow the unsigned 16 bit integer, since entries are
 * always pushed one at a time. When UINT16_MAX is reached we want the count
 * to stay there to signal that a full scan is needed to get the number of
 * items inside the ziplist. */
#define ZIPLIST_INCR_LENGTH(zl, incr)                                                                                  \
    {                                                                                                                  \
    if (intrev16ifbe(ZIPLIST_LENGTH(zl)) < UINT16_MAX) \
        ZIPLIST_LENGTH(zl) = intrev16ifbe(intrev16ifbe(ZIPLIST_LENGTH(zl))+incr); \
}

/* Don't let ziplists grow over 1GB in any case, don't wanna risk overflow in
 * zlbytes*/
#define ZIPLIST_MAX_SAFETY_SIZE (1<<30)
int ziplistSafeToAdd(unsigned char* zl, size_t add) {
    size_t len = zl? ziplistBlobLen(zl): 0;
    if (len + add > ZIPLIST_MAX_SAFETY_SIZE) return 0;
    return 1;
}


/* We use this function to receive information about a ziplist entry.
 * Note that this is not how the data is actually encoded, is just what we
 * get filled by a function in order to operate more easily. */
typedef struct zlentry {
    unsigned int prevrawlensize; /* Bytes used to encode the previous entry len*/
    unsigned int prevrawlen;     /* Previous entry len. */
    unsigned int lensize;        /* Bytes used to encode this entry type/len.
                                    For example strings have a 1, 2 or 5 bytes
                                    header. Integers always use a single byte.*/
    unsigned int len;            /* Bytes used to represent the actual entry.
                                    For strings this is just the string length
                                    while for integers it is 1, 2, 3, 4, 8 or
                                    0 (for 4 bit immediate) depending on the
                                    number range. */
    unsigned int headersize;     /* prevrawlensize + lensize. */
    unsigned char encoding;      /* Set to ZIP_STR_* or ZIP_INT_* depending on
                                    the entry encoding. However for 4 bits
                                    immediate integers this can assume a range
                                    of values and must be range-checked. */
    unsigned char *p;            /* Pointer to the very start of the entry, that
                                    is, this points to prev-entry-len field. */
} zlentry;

#define ZIPLIST_ENTRY_ZERO(zle)                                                                                        \
    {                                                                                                                  \
    (zle)->prevrawlensize = (zle)->prevrawlen = 0; \
    (zle)->lensize = (zle)->len = (zle)->headersize = 0; \
    (zle)->encoding = 0; \
    (zle)->p = NULL; \
}

/* Extract the encoding from the byte pointed by 'ptr' and set it into
 * 'encoding' field of the zlentry structure. */
#define ZIP_ENTRY_ENCODING(ptr, encoding)                                                                              \
    do {                                                                                                               \
    (encoding) = ((ptr)[0]); \
    if ((encoding) < ZIP_STR_MASK) (encoding) &= ZIP_STR_MASK; \
} while(0)

#define ZIP_ENCODING_SIZE_INVALID 0xff
/* Return the number of bytes required to encode the entry type + length.
 * On error, return ZIP_ENCODING_SIZE_INVALID */
static inline unsigned int zipEncodingLenSize(unsigned char encoding) {
    if (encoding == ZIP_INT_16B || encoding == ZIP_INT_32B || encoding == ZIP_INT_24B || encoding == ZIP_INT_64B ||
        encoding == ZIP_INT_8B)
        return 1;
    if (encoding >= ZIP_INT_IMM_MIN && encoding <= ZIP_INT_IMM_MAX) return 1;
    if (encoding == ZIP_STR_06B) return 1;
    if (encoding == ZIP_STR_14B) return 2;
    if (encoding == ZIP_STR_32B) return 5;
    return ZIP_ENCODING_SIZE_INVALID;
}

#define ZIP_ASSERT_ENCODING(encoding)                                                                                  \
    do {                                                                                                               \
    assert(zipEncodingLenSize(encoding) != ZIP_ENCODING_SIZE_INVALID);         \
} while (0)

/* Return bytes needed to store integer encoded by 'encoding' */
static inline unsigned int zipIntSize(unsigned char encoding) {
    switch(encoding) {
    case ZIP_INT_8B:  return 1;
    case ZIP_INT_16B: return 2;
    case ZIP_INT_24B: return 3;
    case ZIP_INT_32B: return 4;
    case ZIP_INT_64B: return 8;
    }
    if (encoding >= ZIP_INT_IMM_MIN && encoding <= ZIP_INT_IMM_MAX) return 0; /* 4 bit immediate */
    /* bad encoding, covered by a previous call to ZIP_ASSERT_ENCODING */
    valkey_unreachable();
    return 0;
}

/* Write the encoding header of the entry in 'p'. If p is NULL it just returns
 * the amount of bytes required to encode such a length. Arguments:
 *
 * 'encoding' is the encoding we are using for the entry. It could be
 * ZIP_INT_* or ZIP_STR_* or between ZIP_INT_IMM_MIN and ZIP_INT_IMM_MAX
 * for single-byte small immediate integers.
 *
 * 'rawlen' is only used for ZIP_STR_* encodings and is the length of the
 * string that this entry represents.
 *
 * The function returns the number of bytes used by the encoding/length
 * header stored in 'p'. */
unsigned int zipStoreEntryEncoding(unsigned char *p, unsigned char encoding, unsigned int rawlen) {
    unsigned char len = 1, buf[5];

    if (ZIP_IS_STR(encoding)) {
        /* Although encoding is given it may not be set for strings,
         * so we determine it here using the raw length. */
        if (rawlen <= 0x3f) {
            if (!p) return len;
            buf[0] = ZIP_STR_06B | rawlen;
        } else if (rawlen <= 0x3fff) {
            len += 1;
            if (!p) return len;
            buf[0] = ZIP_STR_14B | ((rawlen >> 8) & 0x3f);
            buf[1] = rawlen & 0xff;
        } else {
            len += 4;
            if (!p) return len;
            buf[0] = ZIP_STR_32B;
            buf[1] = (rawlen >> 24) & 0xff;
            buf[2] = (rawlen >> 16) & 0xff;
            buf[3] = (rawlen >> 8) & 0xff;
            buf[4] = rawlen & 0xff;
        }
    } else {
        /* Implies integer encoding, so length is always 1. */
        if (!p) return len;
        buf[0] = encoding;
    }

    /* Store this length at p. */
    memcpy(p,buf,len);
    return len;
}

/* Decode the entry encoding type and data length (string length for strings,
 * number of bytes used for the integer for integer entries) encoded in 'ptr'.
 * The 'encoding' variable is input, extracted by the caller, the 'lensize'
 * variable will hold the number of bytes required to encode the entry
 * length, and the 'len' variable will hold the entry length.
 * On invalid encoding error, lensize is set to 0. */
#define ZIP_DECODE_LENGTH(ptr, encoding, lensize, len)                                                                 \
    do {                                                                                                               \
    if ((encoding) < ZIP_STR_MASK) {                                           \
        if ((encoding) == ZIP_STR_06B) {                                       \
            (lensize) = 1;                                                     \
            (len) = (ptr)[0] & 0x3f;                                           \
        } else if ((encoding) == ZIP_STR_14B) {                                \
            (lensize) = 2;                                                     \
            (len) = (((ptr)[0] & 0x3f) << 8) | (ptr)[1];                       \
        } else if ((encoding) == ZIP_STR_32B) {                                \
            (lensize) = 5;                                                     \
                (len) = ((uint32_t)(ptr)[1] << 24) | ((uint32_t)(ptr)[2] << 16) | ((uint32_t)(ptr)[3] << 8) |          \
                    ((uint32_t)(ptr)[4]);                                      \
        } else {                                                               \
            (lensize) = 0; /* bad encoding, should be covered by a previous */ \
            (len) = 0;     /* ZIP_ASSERT_ENCODING / zipEncodingLenSize, or  */ \
                           /* match the lensize after this macro with 0.    */ \
        }                                                                      \
    } else {                                                                   \
        (lensize) = 1;                                                         \
            if ((encoding) == ZIP_INT_8B)                                                                              \
                (len) = 1;                                                                                             \
            else if ((encoding) == ZIP_INT_16B)                                                                        \
                (len) = 2;                                                                                             \
            else if ((encoding) == ZIP_INT_24B)                                                                        \
                (len) = 3;                                                                                             \
            else if ((encoding) == ZIP_INT_32B)                                                                        \
                (len) = 4;                                                                                             \
            else if ((encoding) == ZIP_INT_64B)                                                                        \
                (len) = 8;                                                                                             \
        else if (encoding >= ZIP_INT_IMM_MIN && encoding <= ZIP_INT_IMM_MAX)   \
            (len) = 0; /* 4 bit immediate */                                   \
        else                                                                   \
            (lensize) = (len) = 0; /* bad encoding */                          \
    }                                                                          \
} while(0)

/* Encode the length of the previous entry and write it to "p". This only
 * uses the larger encoding (required in __ziplistCascadeUpdate). */
int zipStorePrevEntryLengthLarge(unsigned char *p, unsigned int len) {
    uint32_t u32;
    if (p != NULL) {
        p[0] = ZIP_BIG_PREVLEN;
        u32 = len;
        memcpy(p+1,&u32,sizeof(u32));
        memrev32ifbe(p+1);
    }
    return 1 + sizeof(uint32_t);
}

/* Encode the length of the previous entry and write it to "p". Return the
 * number of bytes needed to encode this length if "p" is NULL. */
unsigned int zipStorePrevEntryLength(unsigned char *p, unsigned int len) {
    if (p == NULL) {
        return (len < ZIP_BIG_PREVLEN) ? 1 : sizeof(uint32_t) + 1;
    } else {
        if (len < ZIP_BIG_PREVLEN) {
            p[0] = len;
            return 1;
        } else {
            return zipStorePrevEntryLengthLarge(p,len);
        }
    }
}

/* Return the number of bytes used to encode the length of the previous
 * entry. The length is returned by setting the var 'prevlensize'. */
#define ZIP_DECODE_PREVLENSIZE(ptr, prevlensize)                                                                       \
    do {                                                                                                               \
    if ((ptr)[0] < ZIP_BIG_PREVLEN) {                                          \
        (prevlensize) = 1;                                                     \
    } else {                                                                   \
        (prevlensize) = 5;                                                     \
    }                                                                          \
} while(0)

/* Return the length of the previous element, and the number of bytes that
 * are used in order to encode the previous element length.
 * 'ptr' must point to the prevlen prefix of an entry (that encodes the
 * length of the previous entry in order to navigate the elements backward).
 * The length of the previous entry is stored in 'prevlen', the number of
 * bytes needed to encode the previous entry length are stored in
 * 'prevlensize'. */
#define ZIP_DECODE_PREVLEN(ptr, prevlensize, prevlen)                                                                  \
    do {                                                                                                               \
    ZIP_DECODE_PREVLENSIZE(ptr, prevlensize);                                  \
    if ((prevlensize) == 1) {                                                  \
        (prevlen) = (ptr)[0];                                                  \
    } else { /* prevlensize == 5 */                                            \
            (prevlen) = ((ptr)[4] << 24) | ((ptr)[3] << 16) | ((ptr)[2] << 8) | ((ptr)[1]);                            \
    }                                                                          \
} while(0)

/* Given a pointer 'p' to the prevlen info that prefixes an entry, this
 * function returns the difference in number of bytes needed to encode
 * the prevlen if the previous entry changes of size.
 *
 * So if A is the number of bytes used right now to encode the 'prevlen'
 * field.
 *
 * And B is the number of bytes that are needed in order to encode the
 * 'prevlen' if the previous element will be updated to one of size 'len'.
 *
 * Then the function returns B - A
 *
 * So the function returns a positive number if more space is needed,
 * a negative number if less space is needed, or zero if the same space
 * is needed. */
int zipPrevLenByteDiff(unsigned char *p, unsigned int len) {
    unsigned int prevlensize;
    ZIP_DECODE_PREVLENSIZE(p, prevlensize);
    return zipStorePrevEntryLength(NULL, len) - prevlensize;
}

/* Check if string pointed to by 'entry' can be encoded as an integer.
 * Stores the integer value in 'v' and its encoding in 'encoding'. */
int zipTryEncoding(unsigned char *entry, unsigned int entrylen, long long *v, unsigned char *encoding) {
    long long value;

    if (entrylen >= 32 || entrylen == 0) return 0;
    if (string2ll((char*)entry,entrylen,&value)) {
        /* Great, the string can be encoded. Check what's the smallest
         * of our encoding types that can hold this value. */
        if (value >= 0 && value <= 12) {
            *encoding = ZIP_INT_IMM_MIN+value;
        } else if (value >= INT8_MIN && value <= INT8_MAX) {
            *encoding = ZIP_INT_8B;
        } else if (value >= INT16_MIN && value <= INT16_MAX) {
            *encoding = ZIP_INT_16B;
        } else if (value >= INT24_MIN && value <= INT24_MAX) {
            *encoding = ZIP_INT_24B;
        } else if (value >= INT32_MIN && value <= INT32_MAX) {
            *encoding = ZIP_INT_32B;
        } else {
            *encoding = ZIP_INT_64B;
        }
        *v = value;
        return 1;
    }
    return 0;
}

/* Store integer 'value' at 'p', encoded as 'encoding' */
void zipSaveInteger(unsigned char *p, int64_t value, unsigned char encoding) {
    int16_t i16;
    int32_t i32;
    int64_t i64;
    if (encoding == ZIP_INT_8B) {
        ((int8_t*)p)[0] = (int8_t)value;
    } else if (encoding == ZIP_INT_16B) {
        i16 = value;
        memcpy(p,&i16,sizeof(i16));
        memrev16ifbe(p);
    } else if (encoding == ZIP_INT_24B) {
        i32 = ((uint64_t)value)<<8;
        memrev32ifbe(&i32);
        memcpy(p,((uint8_t*)&i32)+1,sizeof(i32)-sizeof(uint8_t));
    } else if (encoding == ZIP_INT_32B) {
        i32 = value;
        memcpy(p,&i32,sizeof(i32));
        memrev32ifbe(p);
    } else if (encoding == ZIP_INT_64B) {
        i64 = value;
        memcpy(p,&i64,sizeof(i64));
        memrev64ifbe(p);
    } else if (encoding >= ZIP_INT_IMM_MIN && encoding <= ZIP_INT_IMM_MAX) {
        /* Nothing to do, the value is stored in the encoding itself. */
    } else {
        assert(NULL);
    }
}

/* Read integer encoded as 'encoding' from 'p' */
int64_t zipLoadInteger(unsigned char *p, unsigned char encoding) {
    int16_t i16;
    int32_t i32;
    int64_t i64, ret = 0;
    if (encoding == ZIP_INT_8B) {
        ret = ((int8_t*)p)[0];
    } else if (encoding == ZIP_INT_16B) {
        memcpy(&i16,p,sizeof(i16));
        memrev16ifbe(&i16);
        ret = i16;
    } else if (encoding == ZIP_INT_32B) {
        memcpy(&i32,p,sizeof(i32));
        memrev32ifbe(&i32);
        ret = i32;
    } else if (encoding == ZIP_INT_24B) {
        i32 = 0;
        memcpy(((uint8_t*)&i32)+1,p,sizeof(i32)-sizeof(uint8_t));
        memrev32ifbe(&i32);
        ret = i32>>8;
    } else if (encoding == ZIP_INT_64B) {
        memcpy(&i64,p,sizeof(i64));
        memrev64ifbe(&i64);
        ret = i64;
    } else if (encoding >= ZIP_INT_IMM_MIN && encoding <= ZIP_INT_IMM_MAX) {
        ret = (encoding & ZIP_INT_IMM_MASK)-1;
    } else {
        assert(NULL);
    }
    return ret;
}

/* Fills a struct with all information about an entry.
 * This function is the "unsafe" alternative to the one below.
 * Generally, all function that return a pointer to an element in the ziplist
 * will assert that this element is valid, so it can be freely used.
 * Generally functions such ziplistGet assume the input pointer is already
 * validated (since it's the return value of another function). */
static inline void zipEntry(unsigned char *p, zlentry *e) {
    ZIP_DECODE_PREVLEN(p, e->prevrawlensize, e->prevrawlen);
    ZIP_ENTRY_ENCODING(p + e->prevrawlensize, e->encoding);
    ZIP_DECODE_LENGTH(p + e->prevrawlensize, e->encoding, e->lensize, e->len);
    assert(e->lensize != 0); /* check that encoding was valid. */
    e->headersize = e->prevrawlensize + e->lensize;
    e->p = p;
}

/* Fills a struct with all information about an entry.
 * This function is safe to use on untrusted pointers, it'll make sure not to
 * try to access memory outside the ziplist payload.
 * Returns 1 if the entry is valid, and 0 otherwise. */
static inline int zipEntrySafe(unsigned char* zl, size_t zlbytes, unsigned char *p, zlentry *e, int validate_prevlen) {
    unsigned char *zlfirst = zl + ZIPLIST_HEADER_SIZE;
    unsigned char *zllast = zl + zlbytes - ZIPLIST_END_SIZE;
#define OUT_OF_RANGE(p) (unlikely((p) < zlfirst || (p) > zllast))

    /* If there's no possibility for the header to reach outside the ziplist,
     * take the fast path. (max lensize and prevrawlensize are both 5 bytes) */
    if (p >= zlfirst && p + 10 < zllast) {
        ZIP_DECODE_PREVLEN(p, e->prevrawlensize, e->prevrawlen);
        ZIP_ENTRY_ENCODING(p + e->prevrawlensize, e->encoding);
        ZIP_DECODE_LENGTH(p + e->prevrawlensize, e->encoding, e->lensize, e->len);
        e->headersize = e->prevrawlensize + e->lensize;
        e->p = p;
        /* We didn't call ZIP_ASSERT_ENCODING, so we check lensize was set to 0. */
        if (unlikely(e->lensize == 0)) return 0;
        /* Make sure the entry doesn't reach outside the edge of the ziplist */
        if (OUT_OF_RANGE(p + e->headersize + e->len)) return 0;
        /* Make sure prevlen doesn't reach outside the edge of the ziplist */
        if (validate_prevlen && OUT_OF_RANGE(p - e->prevrawlen)) return 0;
        return 1;
    }

    /* Make sure the pointer doesn't reach outside the edge of the ziplist */
    if (OUT_OF_RANGE(p)) return 0;

    /* Make sure the encoded prevlen header doesn't reach outside the allocation */
    ZIP_DECODE_PREVLENSIZE(p, e->prevrawlensize);
    if (OUT_OF_RANGE(p + e->prevrawlensize)) return 0;

    /* Make sure encoded entry header is valid. */
    ZIP_ENTRY_ENCODING(p + e->prevrawlensize, e->encoding);
    e->lensize = zipEncodingLenSize(e->encoding);
    if (unlikely(e->lensize == ZIP_ENCODING_SIZE_INVALID)) return 0;

    /* Make sure the encoded entry header doesn't reach outside the allocation */
    if (OUT_OF_RANGE(p + e->prevrawlensize + e->lensize)) return 0;

    /* Decode the prevlen and entry len headers. */
    ZIP_DECODE_PREVLEN(p, e->prevrawlensize, e->prevrawlen);
    ZIP_DECODE_LENGTH(p + e->prevrawlensize, e->encoding, e->lensize, e->len);
    e->headersize = e->prevrawlensize + e->lensize;

    /* Make sure the entry doesn't reach outside the edge of the ziplist */
    if (OUT_OF_RANGE(p + e->headersize + e->len)) return 0;

    /* Make sure prevlen doesn't reach outside the edge of the ziplist */
    if (validate_prevlen && OUT_OF_RANGE(p - e->prevrawlen)) return 0;

    e->p = p;
    return 1;
#undef OUT_OF_RANGE
}

/* Return the total number of bytes used by the entry pointed to by 'p'. */
static inline unsigned int zipRawEntryLengthSafe(unsigned char* zl, size_t zlbytes, unsigned char *p) {
    zlentry e;
    zipEntrySafe(zl, zlbytes, p, &e, 0);
    return e.headersize + e.len;
}

/* Return the total number of bytes used by the entry pointed to by 'p'. */
static inline unsigned int zipRawEntryLength(unsigned char *p) {
    zlentry e;
    zipEntry(p, &e);
    return e.headersize + e.len;
}

/* Validate that the entry doesn't reach outside the ziplist allocation. */
static inline void zipAssertValidEntry(unsigned char* zl, size_t zlbytes, unsigned char *p) {
    zlentry e;
    int res = zipEntrySafe(zl, zlbytes, p, &e, 1);
    assert(res);
    (void)res;
}

/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
    unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
    unsigned char *zl = zmalloc(bytes);
    ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
    ZIPLIST_LENGTH(zl) = 0;
    zl[bytes-1] = ZIP_END;
    return zl;
}

/* Resize the ziplist. */
unsigned char *ziplistResize(unsigned char *zl, size_t len) {
    assert(len < UINT32_MAX);
    zl = zrealloc(zl,len);
    ZIPLIST_BYTES(zl) = intrev32ifbe(len);
    zl[len-1] = ZIP_END;
    return zl;
}

/* When an entry is inserted, we need to set the prevlen field of the next
 * entry to equal the length of the inserted entry. It can occur that this
 * length cannot be encoded in 1 byte and the next entry needs to be grow
 * a bit larger to hold the 5-byte encoded prevlen. This can be done for free,
 * because this only happens when an entry is already being inserted (which
 * causes a realloc and memmove). However, encoding the prevlen may require
 * that this entry is grown as well. This effect may cascade throughout
 * the ziplist when there are consecutive entries with a size close to
 * ZIP_BIG_PREVLEN, so we need to check that the prevlen can be encoded in
 * every consecutive entry.
 *
 * Note that this effect can also happen in reverse, where the bytes required
 * to encode the prevlen field can shrink. This effect is deliberately ignored,
 * because it can cause a "flapping" effect where a chain prevlen fields is
 * first grown and then shrunk again after consecutive inserts. Rather, the
 * field is allowed to stay larger than necessary, because a large prevlen
 * field implies the ziplist is holding large entries anyway.
 *
 * The pointer "p" points to the first entry that does NOT need to be
 * updated, i.e. consecutive fields MAY need an update. */
unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {
    zlentry cur;
    size_t prevlen, prevlensize, prevoffset; /* Informat of the last changed entry. */
    size_t firstentrylen; /* Used to handle insert at head. */
    size_t rawlen, curlen = intrev32ifbe(ZIPLIST_BYTES(zl));
    size_t extra = 0, cnt = 0, offset;
    size_t delta = 4; /* Extra bytes needed to update a entry's prevlen (5-1). */
    unsigned char *tail = zl + intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl));

    /* Empty ziplist */
    if (p[0] == ZIP_END) return zl;

    zipEntry(
        p,
        &cur); /* no need for "safe" variant since the input pointer was validated by the function that returned it. */
    firstentrylen = prevlen = cur.headersize + cur.len;
    prevlensize = zipStorePrevEntryLength(NULL, prevlen);
    prevoffset = p - zl;
    p += prevlen;

    /* Iterate ziplist to find out how many extra bytes do we need to update it. */
    while (p[0] != ZIP_END) {
        assert(zipEntrySafe(zl, curlen, p, &cur, 0));

        /* Abort when "prevlen" has not changed. */
        if (cur.prevrawlen == prevlen) break;

        /* Abort when entry's "prevlensize" is big enough. */
        if (cur.prevrawlensize >= prevlensize) {
            if (cur.prevrawlensize == prevlensize) {
                zipStorePrevEntryLength(p, prevlen);
            } else {
                /* This would result in shrinking, which we want to avoid.
                 * So, set "prevlen" in the available bytes. */
                zipStorePrevEntryLengthLarge(p, prevlen);
            }
            break;
        }

        /* cur.prevrawlen means cur is the former head entry. */
        assert(cur.prevrawlen == 0 || cur.prevrawlen + delta == prevlen);

        /* Update prev entry's info and advance the cursor. */
        rawlen = cur.headersize + cur.len;
        prevlen = rawlen + delta; 
        prevlensize = zipStorePrevEntryLength(NULL, prevlen);
        prevoffset = p - zl;
        p += rawlen;
        extra += delta;
        cnt++;
    }

    /* Extra bytes is zero all update has been done(or no need to update). */
    if (extra == 0) return zl;

    /* Update tail offset after loop. */
    if (tail == zl + prevoffset) {
        /* When the last entry we need to update is also the tail, update tail offset
         * unless this is the only entry that was updated (so the tail offset didn't change). */
        if (extra - delta != 0) {
            ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)) + extra - delta);
        }
    } else {
        /* Update the tail offset in cases where the last entry we updated is not the tail. */
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)) + extra);
    }

    /* Now "p" points at the first unchanged byte in original ziplist,
     * move data after that to new ziplist. */
    offset = p - zl;
    zl = ziplistResize(zl, curlen + extra);
    p = zl + offset;
    memmove(p + extra, p, curlen - offset - 1);
    p += extra;

    /* Iterate all entries that need to be updated tail to head. */
    while (cnt) {
        zipEntry(zl + prevoffset,
                 &cur); /* no need for "safe" variant since we already iterated on all these entries above. */
        rawlen = cur.headersize + cur.len;
        /* Move entry to tail and reset prevlen. */
        memmove(p - (rawlen - cur.prevrawlensize), zl + prevoffset + cur.prevrawlensize, rawlen - cur.prevrawlensize);
        p -= (rawlen + delta);
        if (cur.prevrawlen == 0) {
            /* "cur" is the previous head entry, update its prevlen with firstentrylen. */
            zipStorePrevEntryLength(p, firstentrylen);
        } else {
            /* An entry's prevlen can only increment 4 bytes. */
            zipStorePrevEntryLength(p, cur.prevrawlen+delta);
        }
        /* Forward to previous entry. */
        prevoffset -= cur.prevrawlen;
        cnt--;
    }
    return zl;
}

/* Delete "num" entries, starting at "p". Returns pointer to the ziplist. */
unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) {
    unsigned int i, totlen, deleted = 0;
    size_t offset;
    int nextdiff = 0;
    zlentry first, tail;
    size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));

    zipEntry(p, &first); /* no need for "safe" variant since the input pointer was validated by the function that
                            returned it. */
    for (i = 0; p[0] != ZIP_END && i < num; i++) {
        p += zipRawEntryLengthSafe(zl, zlbytes, p);
        deleted++;
    }

    assert(p >= first.p);
    totlen = p-first.p; /* Bytes taken by the element(s) to delete. */
    if (totlen > 0) {
        uint32_t set_tail;
        if (p[0] != ZIP_END) {
            /* Storing `prevrawlen` in this entry may increase or decrease the
             * number of bytes required compare to the current `prevrawlen`.
             * There always is room to store this, because it was previously
             * stored by an entry that is now being deleted. */
            nextdiff = zipPrevLenByteDiff(p,first.prevrawlen);

            /* Note that there is always space when p jumps backward: if
             * the new previous entry is large, one of the deleted elements
             * had a 5 bytes prevlen header, so there is for sure at least
             * 5 bytes free and we need just 4. */
            p -= nextdiff;
            assert(p >= first.p && p<zl+zlbytes-1);
            zipStorePrevEntryLength(p,first.prevrawlen);

            /* Update offset for tail */
            set_tail = intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))-totlen;

            /* When the tail contains more than one entry, we need to take
             * "nextdiff" in account as well. Otherwise, a change in the
             * size of prevlen doesn't have an effect on the *tail* offset. */
            assert(zipEntrySafe(zl, zlbytes, p, &tail, 1));
            if (p[tail.headersize+tail.len] != ZIP_END) {
                set_tail = set_tail + nextdiff;
            }

            /* Move tail to the front of the ziplist */
            /* since we asserted that p >= first.p. we know totlen >= 0,
             * so we know that p > first.p and this is guaranteed not to reach
             * beyond the allocation, even if the entries lens are corrupted. */
            size_t bytes_to_move = zlbytes-(p-zl)-1;
            memmove(first.p,p,bytes_to_move);
        } else {
            /* The entire tail was deleted. No need to move memory. */
            set_tail = (first.p-zl)-first.prevrawlen;
        }

        /* Resize the ziplist */
        offset = first.p-zl;
        zlbytes -= totlen - nextdiff;
        zl = ziplistResize(zl, zlbytes);
        p = zl+offset;

        /* Update record count */
        ZIPLIST_INCR_LENGTH(zl,-deleted);

        /* Set the tail offset computed above */
        assert(set_tail <= zlbytes - ZIPLIST_END_SIZE);
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(set_tail);

        /* When nextdiff != 0, the raw length of the next entry has changed, so
         * we need to cascade the update throughout the ziplist */
        if (nextdiff != 0) zl = __ziplistCascadeUpdate(zl, p);
    }
    return zl;
}

/* Insert item at "p". */
unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
    size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen, newlen;
    unsigned int prevlensize, prevlen = 0;
    size_t offset;
    int nextdiff = 0;
    unsigned char encoding = 0;
    long long value = 123456789; /* initialized to avoid warning. Using a value
                                    that is easy to see if for some reason
                                    we use it uninitialized. */
    zlentry tail;

    /* Find out prevlen for the entry that is inserted. */
    if (p[0] != ZIP_END) {
        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
    } else {
        unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
        if (ptail[0] != ZIP_END) {
            prevlen = zipRawEntryLengthSafe(zl, curlen, ptail);
        }
    }

    /* See if the entry can be encoded */
    if (zipTryEncoding(s,slen,&value,&encoding)) {
        /* 'encoding' is set to the appropriate integer encoding */
        reqlen = zipIntSize(encoding);
    } else {
        /* 'encoding' is untouched, however zipStoreEntryEncoding will use the
         * string length to figure out how to encode it. */
        reqlen = slen;
    }
    /* We need space for both the length of the previous entry and
     * the length of the payload. */
    reqlen += zipStorePrevEntryLength(NULL,prevlen);
    reqlen += zipStoreEntryEncoding(NULL,encoding,slen);

    /* When the insert position is not equal to the tail, we need to
     * make sure that the next entry can hold this entry's length in
     * its prevlen field. */
    int forcelarge = 0;
    nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
    if (nextdiff == -4 && reqlen < 4) {
        nextdiff = 0;
        forcelarge = 1;
    }

    /* Store offset because a realloc may change the address of zl. */
    offset = p-zl;
    newlen = curlen+reqlen+nextdiff;
    zl = ziplistResize(zl,newlen);
    p = zl+offset;

    /* Apply memory move when necessary and update tail offset. */
    if (p[0] != ZIP_END) {
        /* Subtract one because of the ZIP_END bytes */
        memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);

        /* Encode this entry's raw length in the next entry. */
        if (forcelarge)
            zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
        else
            zipStorePrevEntryLength(p+reqlen,reqlen);

        /* Update offset for tail */
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)) + reqlen);

        /* When the tail contains more than one entry, we need to take
         * "nextdiff" in account as well. Otherwise, a change in the
         * size of prevlen doesn't have an effect on the *tail* offset. */
        zipEntrySafe(zl, newlen, p + reqlen, &tail, 1);
        if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
            ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)) + nextdiff);
        }
    } else {
        /* This element will be the new tail. */
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
    }

    /* When nextdiff != 0, the raw length of the next entry has changed, so
     * we need to cascade the update throughout the ziplist */
    if (nextdiff != 0) {
        offset = p-zl;
        zl = __ziplistCascadeUpdate(zl,p+reqlen);
        p = zl+offset;
    }

    /* Write the entry */
    p += zipStorePrevEntryLength(p,prevlen);
    p += zipStoreEntryEncoding(p,encoding,slen);
    if (ZIP_IS_STR(encoding)) {
        memcpy(p,s,slen);
    } else {
        zipSaveInteger(p,value,encoding);
    }
    ZIPLIST_INCR_LENGTH(zl,1);
    return zl;
}

/* Merge ziplists 'first' and 'second' by appending 'second' to 'first'.
 *
 * NOTE: The larger ziplist is reallocated to contain the new merged ziplist.
 * Either 'first' or 'second' can be used for the result.  The parameter not
 * used will be free'd and set to NULL.
 *
 * After calling this function, the input parameters are no longer valid since
 * they are changed and free'd in-place.
 *
 * The result ziplist is the contents of 'first' followed by 'second'.
 *
 * On failure: returns NULL if the merge is impossible.
 * On success: returns the merged ziplist (which is expanded version of either
 * 'first' or 'second', also frees the other unused input ziplist, and sets the
 * input ziplist argument equal to newly reallocated ziplist return value. */
unsigned char *ziplistMerge(unsigned char **first, unsigned char **second) {
    /* If any params are null, we can't merge, so NULL. */
    if (first == NULL || *first == NULL || second == NULL || *second == NULL) return NULL;

    /* Can't merge same list into itself. */
    if (*first == *second) return NULL;

    size_t first_bytes = intrev32ifbe(ZIPLIST_BYTES(*first));
    size_t first_len = intrev16ifbe(ZIPLIST_LENGTH(*first));

    size_t second_bytes = intrev32ifbe(ZIPLIST_BYTES(*second));
    size_t second_len = intrev16ifbe(ZIPLIST_LENGTH(*second));

    int append;
    unsigned char *source, *target;
    size_t target_bytes, source_bytes;
    /* Pick the largest ziplist so we can resize easily in-place.
     * We must also track if we are now appending or prepending to
     * the target ziplist. */
    if (first_len >= second_len) {
        /* retain first, append second to first. */
        target = *first;
        target_bytes = first_bytes;
        source = *second;
        source_bytes = second_bytes;
        append = 1;
    } else {
        /* else, retain second, prepend first to second. */
        target = *second;
        target_bytes = second_bytes;
        source = *first;
        source_bytes = first_bytes;
        append = 0;
    }

    /* Calculate final bytes (subtract one pair of metadata) */
    size_t zlbytes = first_bytes + second_bytes - ZIPLIST_HEADER_SIZE - ZIPLIST_END_SIZE;
    size_t zllength = first_len + second_len;

    /* Combined zl length should be limited within UINT16_MAX */
    zllength = zllength < UINT16_MAX ? zllength : UINT16_MAX;

    /* larger values can't be stored into ZIPLIST_BYTES */
    assert(zlbytes < UINT32_MAX);

    /* Save offset positions before we start ripping memory apart. */
    size_t first_offset = intrev32ifbe(ZIPLIST_TAIL_OFFSET(*first));
    size_t second_offset = intrev32ifbe(ZIPLIST_TAIL_OFFSET(*second));

    /* Extend target to new zlbytes then append or prepend source. */
    target = zrealloc(target, zlbytes);
    if (append) {
        /* append == appending to target */
        /* Copy source after target (copying over original [END]):
         *   [TARGET - END, SOURCE - HEADER] */
        memcpy(target + target_bytes - ZIPLIST_END_SIZE, source + ZIPLIST_HEADER_SIZE,
               source_bytes - ZIPLIST_HEADER_SIZE);
    } else {
        /* !append == prepending to target */
        /* Move target *contents* exactly size of (source - [END]),
         * then copy source into vacated space (source - [END]):
         *   [SOURCE - END, TARGET - HEADER] */
        memmove(target + source_bytes - ZIPLIST_END_SIZE, target + ZIPLIST_HEADER_SIZE,
                target_bytes - ZIPLIST_HEADER_SIZE);
        memcpy(target, source, source_bytes - ZIPLIST_END_SIZE);
    }

    /* Update header metadata. */
    ZIPLIST_BYTES(target) = intrev32ifbe(zlbytes);
    ZIPLIST_LENGTH(target) = intrev16ifbe(zllength);
    /* New tail offset is:
     *   + N bytes of first ziplist
     *   - 1 byte for [END] of first ziplist
     *   + M bytes for the offset of the original tail of the second ziplist
     *   - J bytes for HEADER because second_offset keeps no header. */
    ZIPLIST_TAIL_OFFSET(target) =
        intrev32ifbe((first_bytes - ZIPLIST_END_SIZE) + (second_offset - ZIPLIST_HEADER_SIZE));

    /* __ziplistCascadeUpdate just fixes the prev length values until it finds a
     * correct prev length value (then it assumes the rest of the list is okay).
     * We tell CascadeUpdate to start at the first ziplist's tail element to fix
     * the merge seam. */
    target = __ziplistCascadeUpdate(target, target+first_offset);

    /* Now free and NULL out what we didn't realloc */
    if (append) {
        zfree(*second);
        *second = NULL;
        *first = target;
    } else {
        zfree(*first);
        *first = NULL;
        *second = target;
    }
    return target;
}

unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) {
    unsigned char *p;
    p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl);
    return __ziplistInsert(zl,p,s,slen);
}

/* Returns an offset to use for iterating with ziplistNext. When the given
 * index is negative, the list is traversed back to front. When the list
 * doesn't contain an element at the provided index, NULL is returned. */
unsigned char *ziplistIndex(unsigned char *zl, int index) {
    unsigned char *p;
    unsigned int prevlensize, prevlen = 0;
    size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
    if (index < 0) {
        index = (-index)-1;
        p = ZIPLIST_ENTRY_TAIL(zl);
        if (p[0] != ZIP_END) {
            /* No need for "safe" check: when going backwards, we know the header
             * we're parsing is in the range, we just need to assert (below) that
             * the size we take doesn't cause p to go outside the allocation. */
            ZIP_DECODE_PREVLENSIZE(p, prevlensize);
            assert(p + prevlensize < zl + zlbytes - ZIPLIST_END_SIZE);
            ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
            while (prevlen > 0 && index--) {
                p -= prevlen;
                assert(p >= zl + ZIPLIST_HEADER_SIZE && p < zl + zlbytes - ZIPLIST_END_SIZE);
                ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
            }
        }
    } else {
        p = ZIPLIST_ENTRY_HEAD(zl);
        while (index--) {
            /* Use the "safe" length: When we go forward, we need to be careful
             * not to decode an entry header if it's past the ziplist allocation. */
            p += zipRawEntryLengthSafe(zl, zlbytes, p);
            if (p[0] == ZIP_END) break;
        }
    }
    if (p[0] == ZIP_END || index > 0) return NULL;
    zipAssertValidEntry(zl, zlbytes, p);
    return p;
}

/* Return pointer to next entry in ziplist.
 *
 * zl is the pointer to the ziplist
 * p is the pointer to the current element
 *
 * The element after 'p' is returned, otherwise NULL if we are at the end. */
unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) {
    ((void) zl);
    size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));

    /* "p" could be equal to ZIP_END, caused by ziplistDelete,
     * and we should return NULL. Otherwise, we should return NULL
     * when the *next* element is ZIP_END (there is no next entry). */
    if (p[0] == ZIP_END) {
        return NULL;
    }

    p += zipRawEntryLength(p);
    if (p[0] == ZIP_END) {
        return NULL;
    }

    zipAssertValidEntry(zl, zlbytes, p);
    return p;
}

/* Return pointer to previous entry in ziplist. */
unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) {
    unsigned int prevlensize, prevlen = 0;

    /* Iterating backwards from ZIP_END should return the tail. When "p" is
     * equal to the first element of the list, we're already at the head,
     * and should return NULL. */
    if (p[0] == ZIP_END) {
        p = ZIPLIST_ENTRY_TAIL(zl);
        return (p[0] == ZIP_END) ? NULL : p;
    } else if (p == ZIPLIST_ENTRY_HEAD(zl)) {
        return NULL;
    } else {
        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
        assert(prevlen > 0);
        p-=prevlen;
        size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
        zipAssertValidEntry(zl, zlbytes, p);
        return p;
    }
}

/* Get entry pointed to by 'p' and store in either '*sstr' or 'sval' depending
 * on the encoding of the entry. '*sstr' is always set to NULL to be able
 * to find out whether the string pointer or the integer value was set.
 * Return 0 if 'p' points to the end of the ziplist, 1 otherwise. */
unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *slen, long long *sval) {
    zlentry entry;
    if (p == NULL || p[0] == ZIP_END) return 0;
    if (sstr) *sstr = NULL;

    zipEntry(p, &entry); /* no need for "safe" variant since the input pointer was validated by the function that
                            returned it. */
    if (ZIP_IS_STR(entry.encoding)) {
        if (sstr) {
            *slen = entry.len;
            *sstr = p+entry.headersize;
        }
    } else {
        if (sval) {
            *sval = zipLoadInteger(p+entry.headersize,entry.encoding);
        }
    }
    return 1;
}

/* Insert an entry at "p". */
unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
    return __ziplistInsert(zl,p,s,slen);
}

/* Delete a single entry from the ziplist, pointed to by *p.
 * Also update *p in place, to be able to iterate over the
 * ziplist, while deleting entries. */
unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) {
    size_t offset = *p-zl;
    zl = __ziplistDelete(zl,*p,1);

    /* Store pointer to current element in p, because ziplistDelete will
     * do a realloc which might result in a different "zl"-pointer.
     * When the delete direction is back to front, we might delete the last
     * entry and end up with "p" pointing to ZIP_END, so check this. */
    *p = zl+offset;
    return zl;
}

/* Delete a range of entries from the ziplist. */
unsigned char *ziplistDeleteRange(unsigned char *zl, int index, unsigned int num) {
    unsigned char *p = ziplistIndex(zl,index);
    return (p == NULL) ? zl : __ziplistDelete(zl,p,num);
}

/* Replaces the entry at p. This is equivalent to a delete and an insert,
 * but avoids some overhead when replacing a value of the same size. */
unsigned char *ziplistReplace(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
    /* get metadata of the current entry */
    zlentry entry;
    zipEntry(p, &entry);

    /* compute length of entry to store, excluding prevlen */
    unsigned int reqlen;
    unsigned char encoding = 0;
    long long value = 123456789; /* initialized to avoid warning. */
    if (zipTryEncoding(s,slen,&value,&encoding)) {
        reqlen = zipIntSize(encoding); /* encoding is set */
    } else {
        reqlen = slen; /* encoding == 0 */
    }
    reqlen += zipStoreEntryEncoding(NULL,encoding,slen);

    if (reqlen == entry.lensize + entry.len) {
        /* Simply overwrite the element. */
        p += entry.prevrawlensize;
        p += zipStoreEntryEncoding(p,encoding,slen);
        if (ZIP_IS_STR(encoding)) {
            memcpy(p,s,slen);
        } else {
            zipSaveInteger(p,value,encoding);
        }
    } else {
        /* Fallback. */
        zl = ziplistDelete(zl,&p);
        zl = ziplistInsert(zl,p,s,slen);
    }
    return zl;
}

/* Compare entry pointer to by 'p' with 'sstr' of length 'slen'. */
/* Return 1 if equal. */
unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int slen) {
    zlentry entry;
    unsigned char sencoding;
    long long zval, sval;
    if (p[0] == ZIP_END) return 0;

    zipEntry(p, &entry); /* no need for "safe" variant since the input pointer was validated by the function that
                            returned it. */
    if (ZIP_IS_STR(entry.encoding)) {
        /* Raw compare */
        if (entry.len == slen) {
            return memcmp(p+entry.headersize,sstr,slen) == 0;
        } else {
            return 0;
        }
    } else {
        /* Try to compare encoded values. Don't compare encoding because
         * different implementations may encoded integers differently. */
        if (zipTryEncoding(sstr,slen,&sval,&sencoding)) {
          zval = zipLoadInteger(p+entry.headersize,entry.encoding);
          return zval == sval;
        }
    }
    return 0;
}

/* Find pointer to the entry equal to the specified entry. Skip 'skip' entries
 * between every comparison. Returns NULL when the field could not be found. */
unsigned char *
ziplistFind(unsigned char *zl, unsigned char *p, unsigned char *vstr, unsigned int vlen, unsigned int skip) {
    int skipcnt = 0;
    unsigned char vencoding = 0;
    long long vll = 0;
    size_t zlbytes = ziplistBlobLen(zl);

    while (p[0] != ZIP_END) {
        struct zlentry e;
        unsigned char *q;
        int res = zipEntrySafe(zl, zlbytes, p, &e, 1);
        assert(res);
        (void)res;

        q = p + e.prevrawlensize + e.lensize;

        if (skipcnt == 0) {
            /* Compare current entry with specified entry */
            if (ZIP_IS_STR(e.encoding)) {
                if (e.len == vlen && memcmp(q, vstr, vlen) == 0) {
                    return p;
                }
            } else {
                /* Find out if the searched field can be encoded. Note that
                 * we do it only the first time, once done vencoding is set
                 * to non-zero and vll is set to the integer value. */
                if (vencoding == 0) {
                    if (!zipTryEncoding(vstr, vlen, &vll, &vencoding)) {
                        /* If the entry can't be encoded we set it to
                         * UCHAR_MAX so that we don't retry again the next
                         * time. */
                        vencoding = UCHAR_MAX;
                    }
                    /* Must be non-zero by now */
                    assert(vencoding);
                }

                /* Compare current entry with specified entry, do it only
                 * if vencoding != UCHAR_MAX because if there is no encoding
                 * possible for the field it can't be a valid integer. */
                if (vencoding != UCHAR_MAX) {
                    long long ll = zipLoadInteger(q, e.encoding);
                    if (ll == vll) {
                        return p;
                    }
                }
            }

            /* Reset skip count */
            skipcnt = skip;
        } else {
            /* Skip entry */
            skipcnt--;
        }

        /* Move to next entry */
        p = q + e.len;
    }

    return NULL;
}

/* Return length of ziplist. */
unsigned int ziplistLen(unsigned char *zl) {
    unsigned int len = 0;
    if (intrev16ifbe(ZIPLIST_LENGTH(zl)) < UINT16_MAX) {
        len = intrev16ifbe(ZIPLIST_LENGTH(zl));
    } else {
        unsigned char *p = zl+ZIPLIST_HEADER_SIZE;
        size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
        while (*p != ZIP_END) {
            p += zipRawEntryLengthSafe(zl, zlbytes, p);
            len++;
        }

        /* Re-store length if small enough */
        if (len < UINT16_MAX) ZIPLIST_LENGTH(zl) = intrev16ifbe(len);
    }
    return len;
}

/* Return ziplist blob size in bytes. */
size_t ziplistBlobLen(unsigned char *zl) {
    return intrev32ifbe(ZIPLIST_BYTES(zl));
}

void ziplistRepr(unsigned char *zl) {
    unsigned char *p;
    int index = 0;
    zlentry entry;
    size_t zlbytes = ziplistBlobLen(zl);

    printf("{total bytes %u} "
        "{num entries %u}\n"
        "{tail offset %u}\n",
           intrev32ifbe(ZIPLIST_BYTES(zl)), intrev16ifbe(ZIPLIST_LENGTH(zl)), intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)));
    p = ZIPLIST_ENTRY_HEAD(zl);
    while(*p != ZIP_END) {
        zipEntrySafe(zl, zlbytes, p, &entry, 1);
        printf(
            "{\n"
                "\taddr 0x%08lx,\n"
                "\tindex %2d,\n"
                "\toffset %5lu,\n"
                "\thdr+entry len: %5u,\n"
                "\thdr len%2u,\n"
                "\tprevrawlen: %5u,\n"
                "\tprevrawlensize: %2u,\n"
                "\tpayload %5u\n",
               (long unsigned)p, index, (unsigned long)(p - zl), entry.headersize + entry.len, entry.headersize,
               entry.prevrawlen, entry.prevrawlensize, entry.len);
        printf("\tbytes: ");
        for (unsigned int i = 0; i < entry.headersize+entry.len; i++) {
            printf("%02x|",p[i]);
        }
        printf("\n");
        p += entry.headersize;
        if (ZIP_IS_STR(entry.encoding)) {
            printf("\t[str]");
            if (entry.len > 40) {
                if (fwrite(p,40,1,stdout) == 0) perror("fwrite");
                printf("...");
            } else {
                if (entry.len && fwrite(p, entry.len, 1, stdout) == 0) perror("fwrite");
            }
        } else {
            printf("\t[int]%lld", (long long) zipLoadInteger(p,entry.encoding));
        }
        printf("\n}\n");
        p += entry.len;
        index++;
    }
    printf("{end}\n\n");
}

/* Validate the integrity of the data structure.
 * when `deep` is 0, only the integrity of the header is validated.
 * when `deep` is 1, we scan all the entries one by one. */
int ziplistValidateIntegrity(unsigned char *zl,
                             size_t size,
                             int deep,
                             ziplistValidateEntryCB entry_cb,
                             void *cb_userdata) {
    /* check that we can actually read the header. (and ZIP_END) */
    if (size < ZIPLIST_HEADER_SIZE + ZIPLIST_END_SIZE) return 0;

    /* check that the encoded size in the header must match the allocated size. */
    size_t bytes = intrev32ifbe(ZIPLIST_BYTES(zl));
    if (bytes != size) return 0;

    /* the last byte must be the terminator. */
    if (zl[size - ZIPLIST_END_SIZE] != ZIP_END) return 0;

    /* make sure the tail offset isn't reaching outside the allocation. */
    if (intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)) > size - ZIPLIST_END_SIZE) return 0;

    if (!deep) return 1;

    unsigned int count = 0;
    unsigned int header_count = intrev16ifbe(ZIPLIST_LENGTH(zl));
    unsigned char *p = ZIPLIST_ENTRY_HEAD(zl);
    unsigned char *prev = NULL;
    size_t prev_raw_size = 0;
    while(*p != ZIP_END) {
        struct zlentry e;
        /* Decode the entry headers and fail if invalid or reaches outside the allocation */
        if (!zipEntrySafe(zl, size, p, &e, 1)) return 0;

        /* Make sure the record stating the prev entry size is correct. */
        if (e.prevrawlen != prev_raw_size) return 0;

        /* Optionally let the caller validate the entry too. */
        if (entry_cb && !entry_cb(p, header_count, cb_userdata)) return 0;

        /* Move to the next entry */
        prev_raw_size = e.headersize + e.len;
        prev = p;
        p += e.headersize + e.len;
        count++;
    }

    /* Make sure 'p' really does point to the end of the ziplist. */
    if (p != zl + bytes - ZIPLIST_END_SIZE) return 0;

    /* Make sure the <zltail> entry really do point to the start of the last entry. */
    if (prev != NULL && prev != ZIPLIST_ENTRY_TAIL(zl)) return 0;

    /* Check that the count in the header is correct */
    if (header_count != UINT16_MAX && count != header_count) return 0;

    return 1;
}

/* Randomly select a pair of key and value.
 * total_count is a pre-computed length/2 of the ziplist (to avoid calls to ziplistLen)
 * 'key' and 'val' are used to store the result key value pair.
 * 'val' can be NULL if the value is not needed. */
void ziplistRandomPair(unsigned char *zl, unsigned long total_count, ziplistEntry *key, ziplistEntry *val) {
    int ret;
    unsigned char *p;

    /* Avoid div by zero on corrupt ziplist */
    assert(total_count);

    /* Generate even numbers, because ziplist saved K-V pair */
    int r = (rand() % total_count) * 2;
    p = ziplistIndex(zl, r);
    ret = ziplistGet(p, &key->sval, &key->slen, &key->lval);
    assert(ret != 0);
    (void)ret;
    if (!val)
        return;
    p = ziplistNext(zl, p);
    ret = ziplistGet(p, &val->sval, &val->slen, &val->lval);
    assert(ret != 0);
}

/* int compare for qsort */
int uintCompare(const void *a, const void *b) {
    return (*(unsigned int *) a - *(unsigned int *) b);
}

/* Helper method to store a string into from val or lval into dest */
static inline void ziplistSaveValue(unsigned char *val, unsigned int len, long long lval, ziplistEntry *dest) {
    dest->sval = val;
    dest->slen = len;
    dest->lval = lval;
}

/* Randomly select count of key value pairs and store into 'keys' and
 * 'vals' args. The order of the picked entries is random, and the selections
 * are non-unique (repetitions are possible).
 * The 'vals' arg can be NULL in which case we skip these. */
void ziplistRandomPairs(unsigned char *zl, unsigned int count, ziplistEntry *keys, ziplistEntry *vals) {
    unsigned char *p, *key, *value;
    unsigned int klen = 0, vlen = 0;
    long long klval = 0, vlval = 0;

    /* Notice: the index member must be first due to the use in uintCompare */
    typedef struct {
        unsigned int index;
        unsigned int order;
    } rand_pick;
    rand_pick *picks = zmalloc(sizeof(rand_pick)*count);
    unsigned int total_size = ziplistLen(zl)/2;

    /* Avoid div by zero on corrupt ziplist */
    assert(total_size);

    /* create a pool of random indexes (some may be duplicate). */
    for (unsigned int i = 0; i < count; i++) {
        picks[i].index = (rand() % total_size) * 2; /* Generate even indexes */
        /* keep track of the order we picked them */
        picks[i].order = i;
    }

    /* sort by indexes. */
    qsort(picks, count, sizeof(rand_pick), uintCompare);

    /* fetch the elements form the ziplist into a output array respecting the original order. */
    unsigned int zipindex = picks[0].index, pickindex = 0;
    p = ziplistIndex(zl, zipindex);
    while (ziplistGet(p, &key, &klen, &klval) && pickindex < count) {
        p = ziplistNext(zl, p);
        assert(ziplistGet(p, &value, &vlen, &vlval));
        while (pickindex < count && zipindex == picks[pickindex].index) {
            int storeorder = picks[pickindex].order;
            ziplistSaveValue(key, klen, klval, &keys[storeorder]);
            if (vals) ziplistSaveValue(value, vlen, vlval, &vals[storeorder]);
             pickindex++;
        }
        zipindex += 2;
        p = ziplistNext(zl, p);
    }

    zfree(picks);
}

/* Randomly select count of key value pairs and store into 'keys' and
 * 'vals' args. The selections are unique (no repetitions), and the order of
 * the picked entries is NOT-random.
 * The 'vals' arg can be NULL in which case we skip these.
 * The return value is the number of items picked which can be lower than the
 * requested count if the ziplist doesn't hold enough pairs. */
unsigned int ziplistRandomPairsUnique(unsigned char *zl, unsigned int count, ziplistEntry *keys, ziplistEntry *vals) {
    unsigned char *p, *key;
    unsigned int klen = 0;
    long long klval = 0;
    unsigned int total_size = ziplistLen(zl)/2;
    unsigned int index = 0;
    if (count > total_size) count = total_size;

    /* To only iterate once, every time we try to pick a member, the probability
     * we pick it is the quotient of the count left we want to pick and the
     * count still we haven't visited in the dict, this way, we could make every
     * member be equally picked.*/
    p = ziplistIndex(zl, 0);
    unsigned int picked = 0, remaining = count;
    while (picked < count && p) {
        double randomDouble = ((double)rand()) / RAND_MAX;
        double threshold = ((double)remaining) / (total_size - index);
        if (randomDouble <= threshold) {
            assert(ziplistGet(p, &key, &klen, &klval));
            ziplistSaveValue(key, klen, klval, &keys[picked]);
            p = ziplistNext(zl, p);
            assert(p);
            if (vals) {
                assert(ziplistGet(p, &key, &klen, &klval));
                ziplistSaveValue(key, klen, klval, &vals[picked]);
            }
            remaining--;
            picked++;
        } else {
            p = ziplistNext(zl, p);
            assert(p);
        }
        p = ziplistNext(zl, p);
        index++;
    }
    return picked;
}


================================================
FILE: src/redis/ziplist.h
================================================
/*
 * Copyright (c) 2009-2012, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _ZIPLIST_H
#define _ZIPLIST_H

#define ZIPLIST_HEAD 0
#define ZIPLIST_TAIL 1

/* Each entry in the ziplist is either a string or an integer. */
typedef struct {
    /* When string is used, it is provided with the length (slen). */
    unsigned char *sval;
    unsigned int slen;
    /* When integer is used, 'sval' is NULL, and lval holds the value. */
    long long lval;
} ziplistEntry;

unsigned char *ziplistNew(void);
unsigned char *ziplistMerge(unsigned char **first, unsigned char **second);
unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where);
unsigned char *ziplistIndex(unsigned char *zl, int index);
unsigned char *ziplistNext(unsigned char *zl, unsigned char *p);
unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p);
unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval);
unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen);
unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p);
unsigned char *ziplistDeleteRange(unsigned char *zl, int index, unsigned int num);
unsigned char *ziplistReplace(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen);
unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen);
unsigned char *ziplistFind(unsigned char *zl, unsigned char *p, unsigned char *vstr, unsigned int vlen, unsigned int skip);
unsigned int ziplistLen(unsigned char *zl);
size_t ziplistBlobLen(unsigned char *zl);
void ziplistRepr(unsigned char *zl);
typedef int (*ziplistValidateEntryCB)(unsigned char* p, unsigned int head_count, void* userdata);
int ziplistValidateIntegrity(unsigned char *zl, size_t size, int deep,
                             ziplistValidateEntryCB entry_cb, void *cb_userdata);
void ziplistRandomPair(unsigned char *zl, unsigned long total_count, ziplistEntry *key, ziplistEntry *val);
void ziplistRandomPairs(unsigned char *zl, unsigned int count, ziplistEntry *keys, ziplistEntry *vals);
unsigned int ziplistRandomPairsUnique(unsigned char *zl, unsigned int count, ziplistEntry *keys, ziplistEntry *vals);
int ziplistSafeToAdd(unsigned char* zl, size_t add);

#ifdef REDIS_TEST
int ziplistTest(int argc, char *argv[], int accurate);
#endif

#endif /* _ZIPLIST_H */


================================================
FILE: src/redis/zmalloc.c
================================================
/* zmalloc - total amount of allocated memory aware version of malloc()
 *
 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>

/* This function provide us access to the original libc free(). This is useful
 * for instance to free results obtained by backtrace_symbols(). We need
 * to define this function before including zmalloc.h that may shadow the
 * free implementation if we use jemalloc or another non standard allocator. */
/*void zlibc_free(void *ptr) {
    free(ptr);
}*/

#include <string.h>
#include <pthread.h>
#include "config.h"
#include "zmalloc.h"
#include "atomicvar.h"

#ifdef HAVE_MALLOC_SIZE
#define PREFIX_SIZE (0)
#define ASSERT_NO_SIZE_OVERFLOW(sz)
#else
#if defined(__sun) || defined(__sparc) || defined(__sparc__)
#define PREFIX_SIZE (sizeof(long long))
#else
#define PREFIX_SIZE (sizeof(size_t))
#endif
#define ASSERT_NO_SIZE_OVERFLOW(sz) assert((sz) + PREFIX_SIZE > (sz))
#endif

/* When using the libc allocator, use a minimum allocation size to match the
 * jemalloc behavior that doesn't return NULL in this case.
 */
#define MALLOC_MIN_SIZE(x) ((x) > 0 ? (x) : sizeof(long))
/* Explicitly override malloc/free etc when using tcmalloc. */
#if defined(USE_TCMALLOC)
#define malloc(size) tc_malloc(size)
#define calloc(count,size) tc_calloc(count,size)
#define realloc(ptr,size) tc_realloc(ptr,size)
#define free(ptr) tc_free(ptr)
#elif defined(USE_JEMALLOC)
#define malloc(size) je_malloc(size)
#define calloc(count,size) je_calloc(count,size)
#define realloc(ptr,size) je_realloc(ptr,size)
#define free(ptr) je_free(ptr)
#define mallocx(size,flags) je_mallocx(size,flags)
#define dallocx(ptr,flags) je_dallocx(ptr,flags)
#endif

#define update_zmalloc_stat_alloc(__n) used_memory_tl += (__n)
#define update_zmalloc_stat_free(__n)  used_memory_tl -= (__n)

__thread ssize_t used_memory_tl = 0;


static void zmalloc_default_oom(size_t size) {
    fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n",
        size);
    fflush(stderr);
    abort();
}

static void (*zmalloc_oom_handler)(size_t) = zmalloc_default_oom;

void init_zmalloc_threadlocal() {
}

/* Try allocating memory, and return NULL if failed.
 * '*usable' is set to the usable size if non NULL. */
void *ztrymalloc_usable(size_t size, size_t *usable) {
    ASSERT_NO_SIZE_OVERFLOW(size);
    void *ptr = malloc(MALLOC_MIN_SIZE(size)+PREFIX_SIZE);

    if (!ptr) return NULL;
#ifdef HAVE_MALLOC_SIZE
    size = zmalloc_size(ptr);
    update_zmalloc_stat_alloc(size);
    if (usable) *usable = size;
    return ptr;
#else
    *((size_t*)ptr) = size;
    update_zmalloc_stat_alloc(size+PREFIX_SIZE);
    if (usable) *usable = size;
    return (char*)ptr+PREFIX_SIZE;
#endif
}

/* Allocate memory or panic */
void *zmalloc(size_t size) {
    void *ptr = ztrymalloc_usable(size, NULL);
    if (!ptr) zmalloc_oom_handler(size);
    return ptr;
}

/* Try allocating memory, and return NULL if failed. */
void *ztrymalloc(size_t size) {
    void *ptr = ztrymalloc_usable(size, NULL);
    return ptr;
}

/* Allocate memory or panic.
 * '*usable' is set to the usable size if non NULL. */
void *zmalloc_usable(size_t size, size_t *usable) {
    void *ptr = ztrymalloc_usable(size, usable);
    if (!ptr) zmalloc_oom_handler(size);
    return ptr;
}

size_t znallocx(size_t size) {
#if defined(USE_JEMALLOC)
    return je_ncallocx(size, 0);
#else
    return size;
#endif
}

void zfree_size(void* ptr, size_t size) {
#if defined(USE_JEMALLOC)
    je_sdallocx(ptr, size, 0);
#else
    free(ptr);
    (void)size;
#endif

}

/* Allocation and free functions that bypass the thread cache
 * and go straight to the allocator arena bins.
 * Currently implemented only for jemalloc. Used for online defragmentation. */
#ifdef HAVE_DEFRAG
void *zmalloc_no_tcache(size_t size) {
    ASSERT_NO_SIZE_OVERFLOW(size);
    void *ptr = mallocx(size+PREFIX_SIZE, MALLOCX_TCACHE_NONE);
    if (!ptr) zmalloc_oom_handler(size);
    update_zmalloc_stat_alloc(zmalloc_size(ptr));
    return ptr;
}

void zfree_no_tcache(void *ptr) {
    if (ptr == NULL) return;
    update_zmalloc_stat_free(zmalloc_size(ptr));
    dallocx(ptr, MALLOCX_TCACHE_NONE);
}
#endif

/* Try allocating memory and zero it, and return NULL if failed.
 * '*usable' is set to the usable size if non NULL. */
void *ztrycalloc_usable(size_t size, size_t *usable) {
    ASSERT_NO_SIZE_OVERFLOW(size);
    void *ptr = calloc(1, MALLOC_MIN_SIZE(size)+PREFIX_SIZE);
    if (ptr == NULL) return NULL;

#ifdef HAVE_MALLOC_SIZE
    size = zmalloc_size(ptr);
    update_zmalloc_stat_alloc(size);
    if (usable) *usable = size;
    return ptr;
#else
    *((size_t*)ptr) = size;
    update_zmalloc_stat_alloc(size+PREFIX_SIZE);
    if (usable) *usable = size;
    return (char*)ptr+PREFIX_SIZE;
#endif
}

/* Allocate memory and zero it or panic */
void *zcalloc(size_t size) {
    void *ptr = ztrycalloc_usable(size, NULL);

    if (!ptr) zmalloc_oom_handler(size);
    return ptr;
}

/* Try allocating memory, and return NULL if failed. */
void *ztrycalloc(size_t size) {
    void *ptr = ztrycalloc_usable(size, NULL);
    return ptr;
}

/* Allocate memory or panic.
 * '*usable' is set to the usable size if non NULL. */
void *zcalloc_usable(size_t size, size_t *usable) {
    void *ptr = ztrycalloc_usable(size, usable);
    if (!ptr) zmalloc_oom_handler(size);
    return ptr;
}

/* Try reallocating memory, and return NULL if failed.
 * '*usable' is set to the usable size if non NULL. */
void *ztryrealloc_usable(void *ptr, size_t size, size_t *usable) {
    ASSERT_NO_SIZE_OVERFLOW(size);
#ifndef HAVE_MALLOC_SIZE
    void *realptr;
#endif
    size_t oldsize;
    void *newptr;

    /* not allocating anything, just redirect to free. */
    if (size == 0 && ptr != NULL) {
        zfree(ptr);
        if (usable) *usable = 0;
        return NULL;
    }
    /* Not freeing anything, just redirect to malloc. */
    if (ptr == NULL)
        return ztrymalloc_usable(size, usable);
#ifdef HAVE_MALLOC_SIZE
    oldsize = zmalloc_size(ptr);
    newptr = realloc(ptr,size);
    if (newptr == NULL) {
        if (usable) *usable = 0;
        return NULL;
    }

    update_zmalloc_stat_free(oldsize);
    size = zmalloc_size(newptr);
    update_zmalloc_stat_alloc(size);
    if (usable) *usable = size;
    return newptr;
#else
    realptr = (char*)ptr-PREFIX_SIZE;
    oldsize = *((size_t*)realptr);
    newptr = realloc(realptr,size+PREFIX_SIZE);
    if (newptr == NULL) {
        if (usable) *usable = 0;
        return NULL;
    }

    *((size_t*)newptr) = size;
    update_zmalloc_stat_free(oldsize);
    update_zmalloc_stat_alloc(size);
    if (usable) *usable = size;
    return (char*)newptr+PREFIX_SIZE;
#endif
}

/* Reallocate memory and zero it or panic */
void *zrealloc(void *ptr, size_t size) {
    ptr = ztryrealloc_usable(ptr, size, NULL);
    if (!ptr && size != 0) zmalloc_oom_handler(size);
    return ptr;
}

/* Try Reallocating memory, and return NULL if failed. */
void *ztryrealloc(void *ptr, size_t size) {
    ptr = ztryrealloc_usable(ptr, size, NULL);
    return ptr;
}

/* Reallocate memory or panic.
 * '*usable' is set to the usable size if non NULL. */
void *zrealloc_usable(void *ptr, size_t size, size_t *usable) {
    ptr = ztryrealloc_usable(ptr, size, usable);
    if (!ptr && size != 0) zmalloc_oom_handler(size);
    return ptr;
}
/* Provide zmalloc_size() for systems where this function is not provided by
 * malloc itself, given that in that case we store a header with this
 * information as the first bytes of every allocation. */
#ifndef HAVE_MALLOC_SIZE
size_t zmalloc_size(void *ptr) {
    void *realptr = (char*)ptr-PREFIX_SIZE;
    size_t size = *((size_t*)realptr);
    return size+PREFIX_SIZE;
}
size_t zmalloc_usable_size(void *ptr) {
    return zmalloc_size(ptr)-PREFIX_SIZE;
}
#endif

void zfree(void *ptr) {
#ifndef HAVE_MALLOC_SIZE
    void *realptr;
    size_t oldsize;
#endif

    if (ptr == NULL) return;
#ifdef HAVE_MALLOC_SIZE
    update_zmalloc_stat_free(zmalloc_size(ptr));
    free(ptr);
#else
    realptr = (char*)ptr-PREFIX_SIZE;
    oldsize = *((size_t*)realptr);
    update_zmalloc_stat_free(oldsize+PREFIX_SIZE);
    free(realptr);
#endif
}

void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
    zmalloc_oom_handler = oom_handler;
}

/* Get the RSS information in an OS-specific way.
 *
 * WARNING: the function zmalloc_get_rss() is not designed to be fast
 * and may not be called in the busy loops where Redis tries to release
 * memory expiring or swapping out objects.
 *
 * For this kind of "fast RSS reporting" usages use instead the
 * function RedisEstimateRSS() that is a much faster (and less precise)
 * version of the function. */

#if defined(HAVE_PROC_STAT)
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

size_t zmalloc_get_rss(void) {
    int page = sysconf(_SC_PAGESIZE);
    size_t rss;
    char buf[4096];
    char filename[256];
    int fd, count;
    char *p, *x;

    snprintf(filename,256,"/proc/%ld/stat",(long) getpid());
    if ((fd = open(filename,O_RDONLY)) == -1) return 0;
    if (read(fd,buf,4096) <= 0) {
        close(fd);
        return 0;
    }
    close(fd);

    p = buf;
    count = 23; /* RSS is the 24th field in /proc/<pid>/stat */
    while(p && count--) {
        p = strchr(p,' ');
        if (p) p++;
    }
    if (!p) return 0;
    x = strchr(p,' ');
    if (!x) return 0;
    *x = '\0';

    rss = strtoll(p,NULL,10);
    rss *= page;
    return rss;
}
#elif defined(HAVE_TASKINFO)
#include <sys/types.h>
#include <sys/sysctl.h>
#include <mach/task.h>
#include <mach/mach_init.h>

size_t zmalloc_get_rss(void) {
    task_t task = MACH_PORT_NULL;
    struct task_basic_info t_info;
    mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;

    if (task_for_pid(current_task(), getpid(), &task) != KERN_SUCCESS)
        return 0;
    task_info(task, TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count);

    return t_info.resident_size;
}
#elif defined(__FreeBSD__) || defined(__DragonFly__)
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/user.h>

size_t zmalloc_get_rss(void) {
    struct kinfo_proc info;
    size_t infolen = sizeof(info);
    int mib[4];
    mib[0] = CTL_KERN;
    mib[1] = KERN_PROC;
    mib[2] = KERN_PROC_PID;
    mib[3] = getpid();

    if (sysctl(mib, 4, &info, &infolen, NULL, 0) == 0)
#if defined(__FreeBSD__)
        return (size_t)info.ki_rssize * getpagesize();
#else
        return (size_t)info.kp_vm_rssize * getpagesize();
#endif

    return 0L;
}
#elif defined(__NetBSD__)
#include <sys/types.h>
#include <sys/sysctl.h>

size_t zmalloc_get_rss(void) {
    struct kinfo_proc2 info;
    size_t infolen = sizeof(info);
    int mib[6];
    mib[0] = CTL_KERN;
    mib[1] = KERN_PROC;
    mib[2] = KERN_PROC_PID;
    mib[3] = getpid();
    mib[4] = sizeof(info);
    mib[5] = 1;
    if (sysctl(mib, 4, &info, &infolen, NULL, 0) == 0)
        return (size_t)info.p_vm_rssize * getpagesize();

    return 0L;
}
#elif defined(HAVE_PSINFO)
#include <unistd.h>
#include <sys/procfs.h>
#include <fcntl.h>

size_t zmalloc_get_rss(void) {
    struct prpsinfo info;
    char filename[256];
    int fd;

    snprintf(filename,256,"/proc/%ld/psinfo",(long) getpid());

    if ((fd = open(filename,O_RDONLY)) == -1) return 0;
    if (ioctl(fd, PIOCPSINFO, &info) == -1) {
        close(fd);
	return 0;
    }

    close(fd);
    return info.pr_rssize;
}
#else
size_t zmalloc_get_rss(void) {
    /* If we can't get the RSS in an OS-specific way for this system just
     * return the memory usage we estimated in zmalloc()..
     *
     * Fragmentation will appear to be always 1 (no fragmentation)
     * of course... */
    return zmalloc_used_memory();
}
#endif

#if defined(USE_JEMALLOC)

int zmalloc_get_allocator_info(size_t *allocated,
                               size_t *active,
                               size_t *resident) {
    uint64_t epoch = 1;
    size_t sz;
    *allocated = *resident = *active = 0;
    /* Update the statistics cached by mallctl. */
    sz = sizeof(epoch);
    je_mallctl("epoch", &epoch, &sz, &epoch, sz);
    sz = sizeof(size_t);
    /* Unlike RSS, this does not include RSS from shared libraries and other non
     * heap mappings. */
    je_mallctl("stats.resident", resident, &sz, NULL, 0);
    /* Unlike resident, this doesn't not include the pages jemalloc reserves
     * for re-use (purge will clean that). */
    je_mallctl("stats.active", active, &sz, NULL, 0);
    /* Unlike zmalloc_used_memory, this matches the stats.resident by taking
     * into account all allocations done by this process (not only zmalloc). */
    je_mallctl("stats.allocated", allocated, &sz, NULL, 0);
    return 1;
}

void set_jemalloc_bg_thread(int enable) {
    /* let jemalloc do purging asynchronously, required when there's no traffic
     * after flushdb */
    char val = !!enable;
    je_mallctl("background_thread", NULL, 0, &val, 1);
}

int jemalloc_purge() {
    /* return all unused (reserved) pages to the OS */
    char tmp[32];
    unsigned narenas = 0;
    size_t sz = sizeof(unsigned);
    if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) {
        sprintf(tmp, "arena.%d.purge", narenas);
        if (!je_mallctl(tmp, NULL, 0, NULL, 0))
            return 0;
    }
    return -1;
}

#else

int zmalloc_get_allocator_info(size_t *allocated,
                               size_t *active,
                               size_t *resident) {
    *allocated = *resident = *active = 0;
    return 1;
}

void set_jemalloc_bg_thread(int enable) {
    ((void)(enable));
}

int jemalloc_purge() {
    return 0;
}

#endif

#if defined(__APPLE__)
/* For proc_pidinfo() used later in zmalloc_get_smap_bytes_by_field().
 * Note that this file cannot be included in zmalloc.h because it includes
 * a Darwin queue.h file where there is a "LIST_HEAD" macro (!) defined
 * conficting with Redis user code. */
#include <libproc.h>
#endif

/* Get the sum of the specified field (converted form kb to bytes) in
 * /proc/self/smaps. The field must be specified with trailing ":" as it
 * apperas in the smaps output.
 *
 * If a pid is specified, the information is extracted for such a pid,
 * otherwise if pid is -1 the information is reported is about the
 * current process.
 *
 * Example: zmalloc_get_smap_bytes_by_field("Rss:",-1);
 */
#if defined(HAVE_PROC_SMAPS)
size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) {
    char line[1024];
    size_t bytes = 0;
    int flen = strlen(field);
    FILE *fp;

    if (pid == -1) {
        fp = fopen("/proc/self/smaps","r");
    } else {
        char filename[128];
        snprintf(filename,sizeof(filename),"/proc/%ld/smaps",pid);
        fp = fopen(filename,"r");
    }

    if (!fp) return 0;
    while(fgets(line,sizeof(line),fp) != NULL) {
        if (strncmp(line,field,flen) == 0) {
            char *p = strchr(line,'k');
            if (p) {
                *p = '\0';
                bytes += strtol(line+flen,NULL,10) * 1024;
            }
        }
    }
    fclose(fp);
    return bytes;
}
#else
/* Get sum of the specified field from libproc api call.
 * As there are per page value basis we need to convert
 * them accordingly.
 *
 * Note that AnonHugePages is a no-op as THP feature
 * is not supported in this platform
 */
size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) {
#if defined(__APPLE__)
    struct proc_regioninfo pri;
    if (pid == -1) pid = getpid();
    if (proc_pidinfo(pid, PROC_PIDREGIONINFO, 0, &pri,
                     PROC_PIDREGIONINFO_SIZE) == PROC_PIDREGIONINFO_SIZE)
    {
        int pagesize = getpagesize();
        if (!strcmp(field, "Private_Dirty:")) {
            return (size_t)pri.pri_pages_dirtied * pagesize;
        } else if (!strcmp(field, "Rss:")) {
            return (size_t)pri.pri_pages_resident * pagesize;
        } else if (!strcmp(field, "AnonHugePages:")) {
            return 0;
        }
    }
    return 0;
#endif
    ((void) field);
    ((void) pid);
    return 0;
}
#endif

/* Return the total number bytes in pages marked as Private Dirty.
 *
 * Note: depending on the platform and memory footprint of the process, this
 * call can be slow, exceeding 1000ms!
 */
size_t zmalloc_get_private_dirty(long pid) {
    return zmalloc_get_smap_bytes_by_field("Private_Dirty:",pid);
}

/* Returns the size of physical memory (RAM) in bytes.
 * It looks ugly, but this is the cleanest way to achieve cross platform results.
 * Cleaned up from:
 *
 * http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
 *
 * Note that this function:
 * 1) Was released under the following CC attribution license:
 *    http://creativecommons.org/licenses/by/3.0/deed.en_US.
 * 2) Was originally implemented by David Robert Nadeau.
 * 3) Was modified for Redis by Matt Stancliff.
 * 4) This note exists in order to comply with the original license.
 */
size_t zmalloc_get_memory_size(void) {
#if defined(__unix__) || defined(__unix) || defined(unix) || \
    (defined(__APPLE__) && defined(__MACH__))
#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
    int mib[2];
    mib[0] = CTL_HW;
#if defined(HW_MEMSIZE)
    mib[1] = HW_MEMSIZE;            /* OSX. --------------------- */
#elif defined(HW_PHYSMEM64)
    mib[1] = HW_PHYSMEM64;          /* NetBSD, OpenBSD. --------- */
#endif
    int64_t size = 0;               /* 64-bit */
    size_t len = sizeof(size);
    if (sysctl( mib, 2, &size, &len, NULL, 0) == 0)
        return (size_t)size;
    return 0L;          /* Failed? */

#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
    /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */
    return (size_t)sysconf(_SC_PHYS_PAGES) * (size_t)sysconf(_SC_PAGESIZE);

#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))
    /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */
    int mib[2];
    mib[0] = CTL_HW;
#if defined(HW_REALMEM)
    mib[1] = HW_REALMEM;        /* FreeBSD. ----------------- */
#elif defined(HW_PHYSMEM)
    mib[1] = HW_PHYSMEM;        /* Others. ------------------ */
#endif
    unsigned int size = 0;      /* 32-bit */
    size_t len = sizeof(size);
    if (sysctl(mib, 2, &size, &len, NULL, 0) == 0)
        return (size_t)size;
    return 0L;          /* Failed? */
#else
    return 0L;          /* Unknown method to get the data. */
#endif
#else
    return 0L;          /* Unknown OS. */
#endif
}

#ifdef REDIS_TEST
#define UNUSED(x) ((void)(x))
int zmalloc_test(int argc, char **argv, int accurate) {
    void *ptr;

    UNUSED(argc);
    UNUSED(argv);
    UNUSED(accurate);
    printf("Malloc prefix size: %d\n", (int) PREFIX_SIZE);
    printf("Initial used memory: %zu\n", zmalloc_used_memory());
    ptr = zmalloc(123);
    printf("Allocated 123 bytes; used: %zu\n", zmalloc_used_memory());
    ptr = zrealloc(ptr, 456);
    printf("Reallocated to 456 bytes; used: %zu\n", zmalloc_used_memory());
    zfree(ptr);
    printf("Freed pointer; used: %zu\n", zmalloc_used_memory());
    return 0;
}
#endif


================================================
FILE: src/redis/zmalloc.h
================================================
/* zmalloc - total amount of allocated memory aware version of malloc()
 *
 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __ZMALLOC_H
#define __ZMALLOC_H

#include <stdint.h>

/* Double expansion needed for stringification of macro values. */
#define __xstr(s) __zm_str(s)
#define __zm_str(s) #s

#if defined(USE_JEMALLOC)
#define ZMALLOC_LIB ("jemalloc-" __xstr(JEMALLOC_VERSION_MAJOR) "." __xstr(JEMALLOC_VERSION_MINOR) "." __xstr(JEMALLOC_VERSION_BUGFIX))
#include <jemalloc/jemalloc.h>
#if (JEMALLOC_VERSION_MAJOR == 2 && JEMALLOC_VERSION_MINOR >= 1) || (JEMALLOC_VERSION_MAJOR > 2)
#define HAVE_MALLOC_SIZE 1
#define zmalloc_size(p) je_malloc_usable_size(p)
#else
#error "Newer version of jemalloc required"
#endif

#elif defined(__APPLE__)
#include <malloc/malloc.h>
#define HAVE_MALLOC_SIZE 1
#ifdef USE_ZMALLOC_MI
#define zmalloc_size(p) zmalloc_usable_size(p)
#else
#define zmalloc_size(p) malloc_size(p)
#endif
#define ZMALLOC_LIB "macos"
#endif

/* On native libc implementations, we should still do our best to provide a
 * HAVE_MALLOC_SIZE capability. This can be set explicitly as well:
 *
 * NO_MALLOC_USABLE_SIZE disables it on all platforms, even if they are
 *      known to support it.
 * USE_MALLOC_USABLE_SIZE forces use of malloc_usable_size() regardless
 *      of platform.
 */
#ifndef ZMALLOC_LIB
#define ZMALLOC_LIB "libc"

#include <malloc.h>

#define HAVE_MALLOC_SIZE 1
#ifdef USE_ZMALLOC_MI
#define zmalloc_size(p) zmalloc_usable_size(p)
#else
#define zmalloc_size(p) malloc_usable_size(p)
#endif

#endif  // ZMALLOC_LIB

/* We can enable the Redis defrag capabilities only if we are using Jemalloc
 * and the version used is our special version modified for Redis having
 * the ability to return per-allocation fragmentation hints. */
#if defined(USE_JEMALLOC) && defined(JEMALLOC_FRAG_HINT)
#define HAVE_DEFRAG
#endif

void *zmalloc(size_t size);
void *zcalloc(size_t size);
void *zrealloc(void *ptr, size_t size);
void *ztrymalloc(size_t size);
void *ztrycalloc(size_t size);
void *ztryrealloc(void *ptr, size_t size);
void zfree(void *ptr);

size_t znallocx(size_t size); // Equivalent to nallocx for jemalloc or mi_good_size for mimalloc.
void zfree_size(void* ptr, size_t size);  // equivalent to sdallocx or mi_free_size

void *zmalloc_usable(size_t size, size_t *usable);
void *zcalloc_usable(size_t size, size_t *usable);
void *zrealloc_usable(void *ptr, size_t size, size_t *usable);
void *ztrymalloc_usable(size_t size, size_t *usable);
void *ztrycalloc_usable(size_t size, size_t *usable);
void *ztryrealloc_usable(void *ptr, size_t size, size_t *usable);

// size_t zmalloc_used_memory(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
size_t zmalloc_get_rss(void);
int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident);
void set_jemalloc_bg_thread(int enable);
int jemalloc_purge();
size_t zmalloc_get_private_dirty(long pid);
size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
size_t zmalloc_get_memory_size(void);
size_t zmalloc_usable_size(const void* p);

/* get the memory usage + the number of wasted locations of memory
Based on a given threshold (ratio < 1).
Note that if a block is not used, it would not counted as wasted
*/
int zmalloc_get_allocator_wasted_blocks(float ratio, size_t* allocated, size_t* commited,
                                        size_t* wasted);
struct fragmentation_info {
  size_t committed;

  // a temporary metric to compare against "committed" in production.
  // TODO: delete it once we are confident committed is computed correctly.
  size_t committed_golden;
  size_t wasted;
  unsigned bin;
};

// Like zmalloc_get_allocator_wasted_blocks but incremental.
// struct fragmentation_info must be passed first set to zero. Returns -1 needs to continue,
// 0 if done.
int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_info* info);

/*
 * checks whether a page that the pointer ptr located at is underutilized.
 * This uses the current local thread heap.
 * return 0 if not, 1 if underutilized
 */
struct mi_page_usage_stats_s;
void zmalloc_page_is_underutilized(void* ptr, float ratio, int collect_stats, struct mi_page_usage_stats_s* result);
char* zstrdup(const char* s);

void init_zmalloc_threadlocal(void* heap);
extern __thread ssize_t zmalloc_used_memory_tl;

#undef __zm_str
#undef __xstr

#endif /* __ZMALLOC_H */


================================================
FILE: src/redis/zmalloc_mi.c
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <assert.h>
#include <mimalloc.h>

#define MI_BUILD_RELEASE 1
#include <mimalloc/types.h>
#include <string.h>
#include <unistd.h>

#include "zmalloc.h"

__thread ssize_t zmalloc_used_memory_tl = 0;
__thread mi_heap_t* zmalloc_heap = NULL;

mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
                                                    bool collect_stats);

/* Allocate memory or panic */
void* zmalloc(size_t size) {
  assert(zmalloc_heap);
  void* res = mi_heap_malloc(zmalloc_heap, size);
  size_t usable = mi_usable_size(res);

  // assertion does not hold. Basically mi_good_size is not a good function for
  // doing accounting.
  // assert(usable == mi_good_size(size));
  zmalloc_used_memory_tl += usable;

  return res;
}

void* ztrymalloc_usable(size_t size, size_t* usable) {
  return zmalloc_usable(size, usable);
}

size_t zmalloc_usable_size(const void* p) {
  return mi_usable_size(p);
}

void zfree(void* ptr) {
  size_t usable = mi_usable_size(ptr);

  // assert(zmalloc_used_memory_tl >= (ssize_t)usable);
  zmalloc_used_memory_tl -= usable;

  mi_free_size(ptr, usable);
}

void* zrealloc(void* ptr, size_t size) {
  size_t usable;
  return zrealloc_usable(ptr, size, &usable);
}

void* zcalloc(size_t size) {
  // mi_good_size(size) is not working. try for example, size=690557.

  void* res = mi_heap_calloc(zmalloc_heap, 1, size);
  size_t usable = mi_usable_size(res);
  zmalloc_used_memory_tl += usable;

  return res;
}

void* zmalloc_usable(size_t size, size_t* usable) {
  assert(zmalloc_heap);
  void* res = mi_heap_malloc(zmalloc_heap, size);
  size_t uss = mi_usable_size(res);
  *usable = uss;

  zmalloc_used_memory_tl += uss;

  return res;
}

void* zrealloc_usable(void* ptr, size_t size, size_t* usable) {
  ssize_t prev = mi_usable_size(ptr);

  void* res = mi_heap_realloc(zmalloc_heap, ptr, size);
  ssize_t uss = mi_usable_size(res);
  *usable = uss;
  zmalloc_used_memory_tl += (uss - prev);

  return res;
}

size_t znallocx(size_t size) {
  return mi_good_size(size);
}

void zfree_size(void* ptr, size_t size) {
  ssize_t uss = mi_usable_size(ptr);
  zmalloc_used_memory_tl -= uss;
  mi_free_size(ptr, uss);
}

void* ztrymalloc(size_t size) {
  size_t usable;
  return zmalloc_usable(size, &usable);
}

void* ztrycalloc(size_t size) {
  size_t g = mi_good_size(size);
  zmalloc_used_memory_tl += g;
  void* ptr = mi_heap_calloc(zmalloc_heap, 1, size);
  assert(mi_usable_size(ptr) == g);
  return ptr;
}

typedef struct Sum_s {
  size_t allocated;
  size_t comitted;
} Sum_t;

typedef struct {
  size_t allocated;
  size_t comitted;
  size_t wasted;
  float ratio;
} MemUtilized_t;

bool heap_visit_cb(const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                   size_t block_size, void* arg) {
  assert(area->used < (1u << 31));

  Sum_t* sum = (Sum_t*)arg;

  // mimalloc mistakenly exports used in blocks instead of bytes.
  sum->allocated += block_size * area->used;
  sum->comitted += area->committed;
  return true;  // continue iteration
};

bool heap_count_wasted_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block,
                              size_t block_size, void* arg) {
  assert(area->used < (1u << 31));

  MemUtilized_t* sum = (MemUtilized_t*)arg;

  // mimalloc mistakenly exports used in blocks instead of bytes.
  size_t used = block_size * area->used;
  sum->allocated += used;
  sum->comitted += area->committed;

  if (used < area->committed * sum->ratio) {
    sum->wasted += (area->committed - used);
  }
  return true;  // continue iteration
};

int zmalloc_get_allocator_info(size_t* allocated, size_t* active, size_t* resident) {
  Sum_t sum = {0};

  mi_heap_visit_blocks(zmalloc_heap, false /* visit all blocks*/, heap_visit_cb, &sum);
  *allocated = sum.allocated;
  *resident = sum.comitted;
  *active = 0;

  return 1;
}

int zmalloc_get_allocator_wasted_blocks(float ratio, size_t* allocated, size_t* commited,
                                        size_t* wasted) {
  MemUtilized_t sum = {.allocated = 0, .comitted = 0, .wasted = 0, .ratio = ratio};

  mi_heap_visit_blocks(zmalloc_heap, false /* visit all blocks*/, heap_count_wasted_blocks, &sum);
  *allocated = sum.allocated;
  *commited = sum.comitted;
  *wasted = sum.wasted;
  return 1;
}

// Implemented based on this mimalloc code:
// https://github.com/microsoft/mimalloc/blob/main/src/heap.c#L27
int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_info* info) {
  if (zmalloc_heap->page_count == 0 || info->bin >= MI_BIN_FULL) {
    // We avoid iterating over full pages since they are fully utilized.
    return 0;
  }

  mi_page_queue_t* pq = &zmalloc_heap->pages[info->bin];
  const mi_page_t* page = pq->first;
  while (page != NULL) {
    const mi_page_t* next = page->next;

    const size_t bsize = page->block_size;

    size_t committed = page->capacity * bsize;
    info->committed += committed;
    if (page->used < page->capacity) {
      size_t used = page->used * bsize;

      size_t threshold = (double)committed * ratio;
      if (used < threshold) {
        info->wasted += (committed - used);
      }
    }
    page = next;
  }

  info->bin++;
  if (info->bin == MI_BIN_FULL) {  // reached end of bins, reset state
    info->committed_golden = info->committed;
    // Add total comitted size of MI_BIN_FULL that we do not traverse
    // as its tracked by zmalloc_heap->full_page_size variable.
    info->committed += zmalloc_heap->full_page_size;

    // TODO: it's a test code that makes sure `full_page_size` is correct.
    // Remove it once we are confident with the implementation.
    mi_page_queue_t* pq = &zmalloc_heap->pages[MI_BIN_FULL];
    const mi_page_t* page = pq->first;
    while (page != NULL) {
      info->committed_golden += page->capacity * page->block_size;
      page = page->next;
    }
    info->bin = 0;
    return 0;
  }

  return -1;
}

void init_zmalloc_threadlocal(void* heap) {
  if (zmalloc_heap)
    return;
  zmalloc_heap = heap;
}

void zmalloc_page_is_underutilized(void* ptr, float ratio, int collect_stats,
                                   mi_page_usage_stats_t* result) {
  *result = mi_heap_page_is_underutilized(zmalloc_heap, ptr, ratio, collect_stats);
}

char* zstrdup(const char* s) {
  size_t l = strlen(s) + 1;
  char* p = zmalloc(l);

  memcpy(p, s, l);
  return p;
}


================================================
FILE: src/server/CMakeLists.txt
================================================
option(DF_ENABLE_MEMORY_TRACKING "Adds memory tracking debugging via MEMORY TRACK command" ON)
option(PRINT_STACKTRACES_ON_SIGNAL "Enables DF to print all fiber stacktraces on SIGUSR1" OFF)

option(WITH_COLLECTION_CMDS "Compile SET/HASH/ZSET/STREAM commands" ON)
option(WITH_EXTENSION_CMDS "Compile BLOOM/BITOPS/GEO/HLL/JSON commands" ON)

option(WITH_TIERING "Compile for macos" ON)
if(APPLE)
    message(STATUS "Macos detected. Set WITH_TIERING=off")
    set(WITH_TIERING OFF CACHE BOOL "Compile for macos" FORCE)
endif()

add_executable(dragonfly dfly_main.cc version_monitor.cc)
add_custom_target(check_dfly WORKING_DIRECTORY .. COMMAND ctest -L DFLY)
cxx_link(dragonfly base dragonfly_lib)

if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_BUILD_TYPE STREQUAL "Release")
  # Add core2 only to this file, thus avoiding instructions in this object file that
  # can cause SIGILL.
  set_source_files_properties(dfly_main.cc PROPERTIES COMPILE_FLAGS "-march=core2")
endif()

set_property(SOURCE dfly_main.cc APPEND PROPERTY COMPILE_DEFINITIONS
             SOURCE_PATH_FROM_BUILD_ENV=${CMAKE_SOURCE_DIR})

add_executable(dfly_bench dfly_bench.cc)
cxx_link(dfly_bench dfly_parser_lib fibers2 absl::random_random redis_lib)

# Include journal sources (not separate target for now)
add_subdirectory(journal)
if(NOT DEFINED DF_JOURNAL_SRCS)
  message(FATAL_ERROR "Journal source files not exported via DF_JOURNAL_SRCS")
endif()

# Define transaction library
add_library(dfly_transaction db_slice.cc blocking_controller.cc
            cluster_support.cc common.cc command_registry.cc
            execution_state.cc stats.cc synchronization.cc
            ${DF_JOURNAL_SRCS}
            server_state.cc table.cc  transaction.cc tx_base.cc
            serializer_commons.cc
            acl/acl_log.cc slowlog.cc channel_store.cc)
cxx_link(dfly_transaction dfly_core strings_lib TRDP::fast_float TRDP::hdr_histogram)

# Include search module
add_subdirectory(search)
if(NOT DEFINED DF_SEARCH_SRCS)
  message(FATAL_ERROR "Search source files not exported via DF_SEARCH_SRCS")
endif()

if (WITH_SEARCH)
  add_definitions(-DWITH_SEARCH)
endif()

# Include tiering module
add_subdirectory(tiering)
if (WITH_TIERING)
  add_definitions(-DWITH_TIERING)
  SET(DF_TIERING_SRCS tiered_storage.cc)
  helio_cxx_test(tiered_storage_test dfly_test_lib LABELS DFLY)
endif()

# Include cluster sources definitons (not separate target for now)
add_subdirectory(cluster)
if (NOT DEFINED DF_CLUSTER_SRCS)
  message(FATAL_ERROR "Cluster source files not exported via DF_CLUSTER_SRCS")
endif()

# Optionally compile collection commands
if (WITH_COLLECTION_CMDS)
  set(DF_FAMILY_SRCS set_family.cc hset_family.cc zset_family.cc stream_family.cc)
  add_definitions(-DWITH_COLLECTION_CMDS)
else()
  set(DF_FAMILY_SRCS collection_family_fallback.cc)
endif()

# Optionally compile extension commands
if (WITH_EXTENSION_CMDS)
  list(APPEND DF_FAMILY_SRCS geo_family.cc hll_family.cc bitops_family.cc bloom_family.cc cms_family.cc json_family.cc)
  add_definitions(-DWITH_EXTENSION_CMDS)
endif()

# Optionally include tiered_storage which interfaces with tiering_module
add_library(dragonfly_lib
            engine_shard.cc engine_shard_set.cc
            config_registry.cc conn_context.cc
            debugcmd.cc dflycmd.cc error.cc family_utils.cc string_stats.cc ${DF_SEARCH_SRCS}
            server_family.cc string_family.cc list_family.cc generic_family.cc
            ${DF_FAMILY_SRCS}
            main_service.cc memory_cmd.cc rdb_load.cc rdb_load_context.cc rdb_save.cc replica.cc http_api.cc
            protocol_client.cc serializer_base.cc snapshot.cc script_mgr.cc
            detail/compressor.cc detail/decompress.cc detail/save_stages_controller.cc detail/snapshot_storage.cc
            version.cc container_utils.cc
            multi_command_squasher.cc
            ${DF_TIERING_SRCS}
            ${DF_CLUSTER_SRCS}
            acl/user.cc acl/user_registry.cc acl/acl_family.cc
            acl/validator.cc
            sharding.cc cmd_support.cc)

if (DF_ENABLE_MEMORY_TRACKING)
  target_compile_definitions(dragonfly_lib PRIVATE DFLY_ENABLE_MEMORY_TRACKING)
  target_compile_definitions(dragonfly PRIVATE DFLY_ENABLE_MEMORY_TRACKING)
endif()

if (PRINT_STACKTRACES_ON_SIGNAL)
  target_compile_definitions(dragonfly_lib PRIVATE PRINT_STACKTRACES_ON_SIGNAL)
endif()

if (WITH_AWS)
  SET(AWS_LIB awsv2_lib)
  add_definitions(-DWITH_AWS)
endif()

if (WITH_GCP)
  SET(GCP_LIB gcp_lib)
  add_definitions(-DWITH_GCP)
endif()

cxx_link(dragonfly_lib dfly_transaction dfly_facade dfly_tiering
         redis_lib ${AWS_LIB} ${GCP_LIB} azure_lib jsonpath
         strings_lib html_lib
         http_client_lib absl::random_random TRDP::jsoncons TRDP::zstd TRDP::lz4
         TRDP::croncpp TRDP::flatbuffers)

if (DF_USE_SSL)
  set(TLS_LIB tls_lib)
  target_compile_definitions(dragonfly_lib PRIVATE DFLY_USE_SSL)
endif()

add_library(dfly_test_lib test_utils.cc)
cxx_link(dfly_test_lib dragonfly_lib facade_test gtest_main_ext)

helio_cxx_test(dragonfly_test dfly_test_lib LABELS DFLY)
helio_cxx_test(multi_test dfly_test_lib LABELS DFLY)
helio_cxx_test(generic_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(hset_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(list_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(server_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(set_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(stream_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(string_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(bitops_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(rdb_test dfly_test_lib DATA testdata/empty.rdb testdata/redis6_small.rdb
         testdata/redis6_stream.rdb testdata/hll.rdb testdata/redis7_small.rdb
         testdata/redis_json.rdb testdata/RDB_TYPE_STREAM_LISTPACKS_2.rdb
         testdata/RDB_TYPE_STREAM_LISTPACKS_3.rdb testdata/ignore_expiry.rdb LABELS DFLY)
helio_cxx_test(zset_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(geo_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(blocking_controller_test dfly_test_lib LABELS DFLY)
helio_cxx_test(json_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(json_family_memory_test dfly_test_lib LABELS DFLY)
helio_cxx_test(journal/journal_test dfly_test_lib LABELS DFLY)
helio_cxx_test(hll_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(string_stats_test dfly_test_lib LABELS DFLY)
helio_cxx_test(bloom_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(cms_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(cluster/cluster_config_test dfly_test_lib LABELS DFLY)
helio_cxx_test(cluster/cluster_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(acl/acl_family_test dfly_test_lib LABELS DFLY)
helio_cxx_test(engine_shard_set_test dfly_test_lib LABELS DFLY)
helio_cxx_test(serializer_base_test dfly_test_lib LABELS DFLY)

add_dependencies(check_dfly dragonfly_test json_family_test list_family_test
                 generic_family_test memcache_parser_test rdb_test journal_test
                 redis_parser_test stream_family_test string_family_test
                 bitops_family_test set_family_test zset_family_test geo_family_test
                 hll_family_test cluster_config_test cluster_family_test acl_family_test
                 json_family_memory_test)

if (WITH_SEARCH)
  helio_cxx_test(search/search_family_test dfly_test_lib LABELS DFLY)
  helio_cxx_test(search/aggregator_test dfly_test_lib LABELS DFLY)
  helio_cxx_test(search/index_join_test dfly_test_lib LABELS DFLY)

  add_dependencies(check_dfly search_family_test aggregator_test index_join_test)
endif()


================================================
FILE: src/server/acl/acl_commands_def.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "base/logging.h"

namespace dfly::acl {

/* There are 21 ACL categories as of redis 7
 *
 */

enum AclCat {
  KEYSPACE = 1ULL << 0,
  READ = 1ULL << 1,
  WRITE = 1ULL << 2,
  SET = 1ULL << 3,
  SORTEDSET = 1ULL << 4,
  LIST = 1ULL << 5,
  HASH = 1ULL << 6,
  STRING = 1ULL << 7,
  BITMAP = 1ULL << 8,
  HYPERLOGLOG = 1ULL << 9,
  GEO = 1ULL << 10,
  STREAM = 1ULL << 11,
  PUBSUB = 1ULL << 12,
  ADMIN = 1ULL << 13,
  FAST = 1ULL << 14,
  SLOW = 1ULL << 15,
  BLOCKING = 1ULL << 16,
  DANGEROUS = 1ULL << 17,
  CONNECTION = 1ULL << 18,
  TRANSACTION = 1ULL << 19,
  SCRIPTING = 1ULL << 20,

  // Extensions
  CMS = 1ULL << 27,
  BLOOM = 1ULL << 28,
  FT_SEARCH = 1ULL << 29,
  THROTTLE = 1ULL << 30,
  JSON = 1ULL << 31
};

constexpr uint64_t ALL_COMMANDS = std::numeric_limits<uint64_t>::max();
constexpr uint64_t NONE_COMMANDS = std::numeric_limits<uint64_t>::min();

inline size_t NumberOfFamilies(size_t number = 0) {
  static size_t number_of_families = number;
  return number_of_families;
}

using CategoryIndexTable = absl::flat_hash_map<std::string_view, uint32_t>;
using ReverseCategoryIndexTable = std::vector<std::string>;
// bit index to index in the REVERSE_CATEGORY_INDEX_TABLE
using CategoryToIdxStore = absl::flat_hash_map<uint32_t, uint32_t>;

using RevCommandField = std::vector<std::string>;
using RevCommandsIndexStore = std::vector<RevCommandField>;
using CategoryToCommandsIndexStore = absl::flat_hash_map<std::string, std::vector<uint64_t>>;

// Special flag/mask for all
constexpr uint32_t NONE = 0;
constexpr uint32_t ALL = std::numeric_limits<uint32_t>::max();

enum class KeyOp : int8_t { READ, WRITE, READ_WRITE };

using GlobType = std::pair<std::string, KeyOp>;

struct AclKeys {
  std::vector<GlobType> key_globs;
  // The user is allowed to "touch" any key. No glob matching required.
  // Alias for ~*
  bool all_keys = false;
};

// The second bool denotes if the pattern contains an asterisk and it's
// used to pattern match PSUBSCRIBE that requires exact literals
using GlobTypePubSub = std::pair<std::string, bool>;

struct AclPubSub {
  std::vector<GlobTypePubSub> globs;
  // The user can execute any variant of pub/sub/psub. No glob matching required.
  // Alias for &* just like all_keys for AclKeys above.
  bool all_channels = false;
};

struct UserCredentials {
  uint32_t acl_categories{0};
  std::vector<uint64_t> acl_commands;
  AclKeys keys;
  AclPubSub pub_sub;
  std::string ns;
  size_t db{0};
};

}  // namespace dfly::acl


================================================
FILE: src/server/acl/acl_family.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#include "server/acl/acl_family.h"

#include <algorithm>
#include <cctype>
#include <chrono>
#include <deque>
#include <memory>
#include <numeric>
#include <optional>
#include <random>
#include <string>
#include <string_view>
#include <utility>
#include <variant>

#include "absl/container/flat_hash_set.h"
#include "absl/flags/commandlineflag.h"
#include "absl/strings/escaping.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/overloaded.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/facade_types.h"
#include "facade/reply_builder.h"
#include "io/file.h"
#include "io/file_util.h"
#include "server/acl/acl_commands_def.h"
#include "server/acl/acl_log.h"
#include "server/acl/validator.h"
#include "server/command_registry.h"
#include "server/common.h"
#include "server/config_registry.h"
#include "server/conn_context.h"
#include "server/error.h"
#include "server/server_state.h"
#include "util/proactor_pool.h"

using namespace std;

ABSL_FLAG(string, aclfile, "", "Path and name to aclfile");
ABSL_DECLARE_FLAG(uint32_t, dbnum);

namespace dfly::acl {

namespace {

string PasswordsToString(const absl::flat_hash_set<string>& passwords, bool nopass, bool full_sha);
using MaterializedContents = optional<vector<vector<string_view>>>;

MaterializedContents MaterializeFileContents(vector<string>* usernames, string_view file_contents);

string AclKeysToString(const AclKeys& keys);

string AclPubSubToString(const AclPubSub& pub_sub);

void SendAclSecurityEvents(const AclLog::LogEntry& entry, facade::RedisReplyBuilder* rb);

string AclDbToString(size_t db);

template <typename P>
void TraverseEvictImpl(P predicate, facade::Listener* main_listener, util::ProactorPool* pool);
}  // namespace

AclFamily::AclFamily(UserRegistry* registry, util::ProactorPool* pool)
    : registry_(registry), pool_(pool) {
  dbnum_ = absl::GetFlag(FLAGS_dbnum);
}

void AclFamily::Acl(CmdArgList args, CommandContext* cmd_cntx) {
  cmd_cntx->SendError("Wrong number of arguments for acl command");
}

void AclFamily::List(CmdArgList args, CommandContext* cmd_cntx) {
  const auto registry_with_lock = registry_->GetRegistryWithLock();
  const auto& registry = registry_with_lock.registry;
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  rb->StartArray(registry.size());

  for (const auto& [username, user] : registry) {
    string buffer = "user ";
    const string password = PasswordsToString(user.Passwords(), user.HasNopass(), false);

    const string acl_keys = AclKeysToString(user.Keys());

    const string acl_pub_sub = AclPubSubToString(user.PubSub());

    const string maybe_space_com = acl_keys.empty() ? "" : " ";

    const string acl_cat_and_commands =
        AclCatAndCommandToString(user.CatChanges(), user.CmdChanges());

    const string db_index = AclDbToString(user.Db());

    using namespace string_view_literals;

    absl::StrAppend(&buffer, username, " ", user.IsActive() ? "on "sv : "off "sv, password,
                    acl_keys, maybe_space_com, acl_pub_sub, " ", acl_cat_and_commands, " $",
                    db_index);

    rb->SendSimpleString(buffer);
  }
}

void AclFamily::StreamUpdatesToAllProactorConnections(const std::string& user,
                                                      const Commands& update_commands,
                                                      const AclKeys& update_keys,
                                                      const AclPubSub& update_pub_sub, size_t db) {
  auto update_cb = [&]([[maybe_unused]] size_t id, util::Connection* conn) {
    DCHECK(conn);
    auto connection = static_cast<facade::Connection*>(conn);
    if (!connection->IsHttp() && connection->cntx()) {
      auto* cntx = static_cast<dfly::ConnectionContext*>(connection->cntx());
      if (user == cntx->authed_username) {
        cntx->acl_commands = update_commands;
        cntx->keys = update_keys;
        cntx->pub_sub = update_pub_sub;
        cntx->acl_db_idx = db;
      }
    }
  };

  if (main_listener_ && main_listener_->protocol() == facade::Protocol::REDIS) {
    main_listener_->TraverseConnections(update_cb);
  }
}

using facade::ErrorReply;

void AclFamily::SetUser(CmdArgList args, CommandContext* cmd_cntx) {
  string_view username = facade::ToSV(args[0]);
  auto reg = registry_->GetRegistryWithWriteLock();
  const bool exists = reg.registry.contains(username);
  const bool has_all_keys = exists ? reg.registry.find(username)->second.Keys().all_keys : false;
  auto req = ParseAclSetUser(args.subspan(1), false, has_all_keys);

  auto error_case = [cmd_cntx](ErrorReply&& error) { cmd_cntx->SendError(error); };

  auto update_case = [username, &reg, cmd_cntx, this, exists](User::UpdateRequest&& req) {
    auto& user = reg.registry[username];
    if (!exists) {
      User::UpdateRequest default_req;
      default_req.updates = {User::UpdateRequest::CategoryValueType{User::Sign::MINUS, acl::ALL}};
      user.Update(std::move(default_req), CategoryToIdx(), reverse_cat_table_,
                  CategoryToCommandsIndex());
    }
    const bool reset_channels = req.reset_channels;
    user.Update(std::move(req), CategoryToIdx(), reverse_cat_table_, CategoryToCommandsIndex());
    // Send ok first because the connection might get evicted
    cmd_cntx->SendOk();
    if (exists) {
      if (!reset_channels) {
        StreamUpdatesToAllProactorConnections(string(username), user.AclCommands(), user.Keys(),
                                              user.PubSub(), user.Db());
      }
      // We evict connections that had their channels reseted
      else {
        EvictOpenConnectionsOnAllProactors({username});
      }
    }
  };

  std::visit(Overloaded{error_case, update_case}, std::move(req));
}

void AclFamily::EvictOpenConnectionsOnAllProactors(const absl::flat_hash_set<string_view>& users) {
  return TraverseEvictImpl(
      [&](auto* ctx) {
        auto* dfly_ctx = static_cast<dfly::ConnectionContext*>(ctx);
        return ctx && users.contains(dfly_ctx->authed_username);
      },
      main_listener_, pool_);
}

void AclFamily::EvictOpenConnectionsOnAllProactorsWithRegistry(
    const UserRegistry::RegistryType& registry) {
  return TraverseEvictImpl(
      [&](auto* ctx) {
        auto* dfly_ctx = static_cast<dfly::ConnectionContext*>(ctx);
        return ctx && dfly_ctx->authed_username != "default" &&
               registry.contains(dfly_ctx->authed_username);
      },
      main_listener_, pool_);
}

void AclFamily::DelUser(CmdArgList args, CommandContext* cmd_cntx) {
  auto& registry = *registry_;
  absl::flat_hash_set<string_view> users;

  for (auto arg : args) {
    string_view username = facade::ToSV(arg);
    if (username == "default") {
      continue;
    }
    if (registry.RemoveUser(username)) {
      users.insert(username);
    }
  }

  if (users.empty()) {
    cmd_cntx->rb()->SendLong(0);
    return;
  }
  VLOG(1) << "Evicting open acl connections";
  EvictOpenConnectionsOnAllProactors(users);
  VLOG(1) << "Done evicting open acl connections";
  cmd_cntx->rb()->SendLong(users.size());
}

void AclFamily::WhoAmI(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendBulkString(absl::StrCat("User is ", cmd_cntx->server_conn_cntx()->authed_username));
}

string AclFamily::RegistryToString() const {
  auto registry_with_read_lock = registry_->GetRegistryWithLock();
  auto& registry = registry_with_read_lock.registry;
  string result;
  for (auto& [username, user] : registry) {
    string command = "USER ";
    const string password = PasswordsToString(user.Passwords(), user.HasNopass(), true);

    const string acl_keys = AclKeysToString(user.Keys());

    const string maybe_space = acl_keys.empty() ? "" : " ";

    const string acl_pub_sub = AclPubSubToString(user.PubSub());

    const string acl_cat_and_commands =
        AclCatAndCommandToString(user.CatChanges(), user.CmdChanges());

    const string db_index = AclDbToString(user.Db());

    using namespace string_view_literals;

    absl::StrAppend(&result, command, username, " ", user.IsActive() ? "ON "sv : "OFF "sv, password,
                    acl_keys, maybe_space, acl_pub_sub, " ", acl_cat_and_commands, " $", db_index,
                    "\n");
  }

  return result;
}

void AclFamily::Save(CmdArgList args, CommandContext* cmd_cntx) {
  auto acl_file_path = absl::GetFlag(FLAGS_aclfile);
  auto* builder = cmd_cntx->rb();
  if (acl_file_path.empty()) {
    builder->SendError("Dragonfly is not configured to use an ACL file.");
    return;
  }

  auto res = io::OpenWrite(acl_file_path);
  if (!res) {
    std::string error = absl::StrCat("Failed to open the aclfile: ", res.error().message());
    LOG(ERROR) << error;
    builder->SendError(error);
    return;
  }

  std::unique_ptr<io::WriteFile> file(res.value());
  std::string output = RegistryToString();
  auto ec = file->Write(output);

  if (ec) {
    std::string error = absl::StrCat("Failed to write to the aclfile: ", ec.message());
    LOG(ERROR) << error;
    builder->SendError(error);
    return;
  }

  ec = file->Close();
  if (ec) {
    std::string error = absl::StrCat("Failed to close the aclfile ", ec.message());
    LOG(WARNING) << error;
    builder->SendError(error);
    return;
  }

  builder->SendOk();
}

GenericError AclFamily::LoadToRegistryFromFile(std::string_view full_path,
                                               SinkReplyBuilder* builder) {
  auto is_file_read = io::ReadFileToString(full_path);
  if (!is_file_read) {
    auto error = absl::StrCat("Dragonfly could not load ACL file ", full_path, " with error ",
                              is_file_read.error().message());

    LOG(WARNING) << error;
    return {std::move(error)};
  }

  auto file_contents = std::move(is_file_read.value());

  if (file_contents.empty()) {
    return {"Empty file"};
  }

  std::vector<std::string> usernames;
  auto materialized = MaterializeFileContents(&usernames, file_contents);

  if (!materialized) {
    std::string error = "Error materializing acl file";
    LOG(WARNING) << error;
    return {std::move(error)};
  }

  std::vector<User::UpdateRequest> requests;

  for (auto& cmds : *materialized) {
    auto req = ParseAclSetUser(cmds, true);
    if (std::holds_alternative<ErrorReply>(req)) {
      auto error = std::move(std::get<ErrorReply>(req));
      LOG(WARNING) << "Error while parsing aclfile: " << error.ToSv();
      return {std::string(error.ToSv())};
    }
    requests.push_back(std::move(std::get<User::UpdateRequest>(req)));
  }

  auto registry_with_wlock = registry_->GetRegistryWithWriteLock();
  auto& registry = registry_with_wlock.registry;
  if (builder) {
    builder->SendOk();
    // Evict open connections for old users
    EvictOpenConnectionsOnAllProactorsWithRegistry(registry);
    registry.clear();
  }

  for (size_t i = 0; i < usernames.size(); ++i) {
    User::UpdateRequest default_req;
    default_req.updates = {User::UpdateRequest::CategoryValueType{User::Sign::MINUS, acl::ALL}};
    auto& user = registry[usernames[i]];
    user.Update(std::move(default_req), CategoryToIdx(), reverse_cat_table_,
                CategoryToCommandsIndex());
    user.Update(std::move(requests[i]), CategoryToIdx(), reverse_cat_table_,
                CategoryToCommandsIndex());
  }

  if (!registry.contains("default")) {
    auto& user = registry["default"];
    user.Update(registry_->DefaultUserUpdateRequest(), CategoryToIdx(), reverse_cat_table_,
                CategoryToCommandsIndex());
  }

  return {};
}

bool AclFamily::Load() {
  auto acl_file = absl::GetFlag(FLAGS_aclfile);
  return !LoadToRegistryFromFile(acl_file, nullptr);
}

void AclFamily::Load(CmdArgList args, CommandContext* cmd_cntx) {
  auto acl_file = absl::GetFlag(FLAGS_aclfile);
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  if (acl_file.empty()) {
    rb->SendError("Dragonfly is not configured to use an ACL file.");
    return;
  }

  const auto load_error = LoadToRegistryFromFile(acl_file, rb);

  if (load_error) {
    rb->SendError(absl::StrCat("Error loading: ", acl_file, " ", load_error.Format()));
  }
}

void AclFamily::Log(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  if (args.size() > 1) {
    return rb->SendError(facade::OpStatus::OUT_OF_RANGE);
  }

  size_t max_output = 10;
  if (!args.empty()) {
    auto option = facade::ToSV(args[0]);
    if (absl::EqualsIgnoreCase(option, "RESET")) {
      pool_->AwaitFiberOnAll(
          [](auto index, auto* context) { ServerState::tlocal()->acl_log.Reset(); });
      rb->SendOk();
      return;
    }

    if (!absl::SimpleAtoi(facade::ToSV(args[0]), &max_output)) {
      rb->SendError("Invalid count");
      return;
    }
  }

  std::vector<AclLog::LogType> logs(pool_->size());
  pool_->AwaitFiberOnAll([&logs, max_output](auto index, auto* context) {
    logs[index] = ServerState::tlocal()->acl_log.GetLog(max_output);
  });

  size_t total_entries = 0;
  for (auto& log : logs) {
    total_entries += log.size();
  }

  if (total_entries == 0) {
    rb->SendEmptyArray();
    return;
  }

  auto n_way_minimum = [](const auto& logs) {
    size_t id = 0;
    AclLog::LogEntry limit;
    const AclLog::LogEntry* max = &limit;
    for (size_t i = 0; i < logs.size(); ++i) {
      if (!logs[i].empty() && logs[i].front() < *max) {
        id = i;
        max = &logs[i].front();
      }
    }

    return id;
  };

  rb->StartArray(total_entries);

  for (size_t i = 0; i < total_entries; ++i) {
    const auto min = n_way_minimum(logs);
    SendAclSecurityEvents(logs[min].front(), rb);
    logs[min].pop_front();
  }
}

void AclFamily::Users(CmdArgList args, CommandContext* cmd_cntx) {
  const auto registry_with_lock = registry_->GetRegistryWithLock();
  const auto& registry = registry_with_lock.registry;
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());

  rb->StartArray(registry.size());
  for (const auto& [username, _] : registry) {
    rb->SendSimpleString(username);
  }
}

void AclFamily::Cat(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());

  if (args.size() > 1) {
    rb->SendError(facade::OpStatus::SYNTAX_ERR);
    return;
  }

  if (args.size() == 1) {
    string category = absl::AsciiStrToUpper(ArgS(args, 0));

    if (!cat_table_.contains(category)) {
      auto error = absl::StrCat("Unknown category: ", category);
      rb->SendError(error);
      return;
    }

    const uint32_t cid_mask = cat_table_.find(category)->second;
    std::vector<std::string_view> results;
    // TODO replace this with indexer
    auto cb = [cid_mask, &results](auto name, auto& cid) {
      if (cid_mask & cid.acl_categories()) {
        results.push_back(name);
      }
    };

    cmd_registry_->Traverse(cb);
    rb->StartArray(results.size());
    for (const auto& command : results) {
      rb->SendSimpleString(command);
    }

    return;
  }

  size_t total_categories = 0;
  for (auto& elem : reverse_cat_table_) {
    if (elem != "_RESERVED") {
      ++total_categories;
    }
  }

  rb->StartArray(total_categories);
  for (auto& elem : reverse_cat_table_) {
    if (elem != "_RESERVED") {
      rb->SendSimpleString(elem);
    }
  }
}

void AclFamily::GetUser(CmdArgList args, CommandContext* cmd_cntx) {
  auto username = facade::ToSV(args[0]);
  const auto registry_with_lock = registry_->GetRegistryWithLock();
  const auto& registry = registry_with_lock.registry;
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());

  if (!registry.contains(username)) {
    rb->SendNull();
    return;
  }
  auto& user = registry.find(username)->second;
  std::string status = user.IsActive() ? "on" : "off";
  auto pass = PasswordsToString(user.Passwords(), user.HasNopass(), false);
  if (!pass.empty()) {
    pass.pop_back();
  }

  rb->StartArray(10);

  rb->SendSimpleString("flags");
  const size_t total_elements = (pass != "nopass") ? 1 : 2;
  rb->StartArray(total_elements);
  rb->SendSimpleString(status);
  if (total_elements == 2) {
    rb->SendSimpleString(pass);
  }

  rb->SendSimpleString("passwords");
  if (pass != "nopass" && !pass.empty()) {
    rb->SendSimpleString(pass);
  } else {
    rb->SendEmptyArray();
  }
  rb->SendSimpleString("commands");

  const std::string acl_cat_and_commands =
      AclCatAndCommandToString(user.CatChanges(), user.CmdChanges());

  rb->SendSimpleString(acl_cat_and_commands);

  rb->SendSimpleString("keys");
  std::string keys = AclKeysToString(user.Keys());
  if (!keys.empty()) {
    rb->SendSimpleString(keys);
  } else {
    rb->SendEmptyArray();
  }

  rb->SendSimpleString("channels");
  std::string pub_sub = AclPubSubToString(user.PubSub());
  rb->SendSimpleString(pub_sub);
}

void AclFamily::GenPass(CmdArgList args, CommandContext* cmd_cntx) {
  auto* builder = cmd_cntx->rb();
  if (args.length() > 1) {
    builder->SendError(facade::UnknownSubCmd("GENPASS", "ACL"));
    return;
  }
  uint32_t random_bits = 256;
  if (args.length() == 1) {
    auto requested_bits = facade::ArgS(args, 0);

    if (!absl::SimpleAtoi(requested_bits, &random_bits) || random_bits == 0 || random_bits > 4096) {
      return builder->SendError(
          "ACL GENPASS argument must be the number of bits for the output password, a positive "
          "number up to 4096");
    }
  }
  std::random_device urandom("/dev/urandom");
  const size_t result_length = (random_bits + 3) / 4;
  constexpr size_t step_size = sizeof(decltype(std::random_device::max()));
  std::string response;
  for (size_t bytes_written = 0; bytes_written < result_length; bytes_written += step_size) {
    absl::StrAppendFormat(&response, "%08x", urandom());
  }

  response.resize(result_length);

  builder->SendSimpleString(response);
}

void AclFamily::DryRun(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  auto username = facade::ArgS(args, 0);
  const auto registry_with_lock = registry_->GetRegistryWithLock();
  const auto& registry = registry_with_lock.registry;
  if (!registry.contains(username)) {
    auto error = absl::StrCat("User '", username, "' not found");
    rb->SendError(error);
    return;
  }

  string command = absl::AsciiStrToUpper(ArgS(args, 1));
  auto* cid = cmd_registry_->Find(command);
  if (!cid || cid->IsAlias()) {
    auto error = absl::StrCat("Command '", command, "' not found");
    rb->SendError(error);
    return;
  }

  const auto& user = registry.find(username)->second;
  // Stub, used to mimic connection context for a user.
  ConnectionContext stub(nullptr, acl::UserCredentials{});
  stub.acl_commands = user.AclCommandsRef();
  // "mock" without an actual connection we can't know which db is active so we skip this check
  // for DryRun.
  stub.acl_db_idx = {};
  stub.keys = {{}, true};
  const auto [is_allowed, reason] = IsUserAllowedToInvokeCommandGeneric(stub, *cid, {});
  if (is_allowed) {
    rb->SendOk();
    return;
  }

  auto msg = absl::StrCat("This user has no permissions to run the '", command, "' command");

  rb->SendBulkString(msg);
}

void AclFamily::Init(facade::Listener* main_listener, UserRegistry* registry) {
  main_listener_ = main_listener;
  registry_ = registry;
  config_registry.RegisterMutable("requirepass", [this](const absl::CommandLineFlag& flag) {
    User::UpdateRequest rqst;
    rqst.passwords.push_back({flag.CurrentValue()});
    registry_->MaybeAddAndUpdate("default", std::move(rqst));
    return true;
  });
  auto acl_file = absl::GetFlag(FLAGS_aclfile);
  if (!acl_file.empty() && Load()) {
    return;
  }
  registry_->Init(&CategoryToIdx(), &reverse_cat_table_, &CategoryToCommandsIndex());
}

std::string AclFamily::AclCatToString(uint32_t acl_category, User::Sign sign) const {
  std::string res = sign == User::Sign::PLUS ? "+@" : "-@";
  if (acl_category == acl::ALL) {
    absl::StrAppend(&res, "all");
    return res;
  }

  const auto& index = CategoryToIdx().at(acl_category);
  absl::StrAppend(&res, absl::AsciiStrToLower(reverse_cat_table_[index]));
  return res;
}

std::string AclFamily::AclCommandToString(size_t family, uint64_t mask, User::Sign sign) const {
  // This is constant but can be optimized with an indexer
  const auto& rev_index = CommandsRevIndexer();
  std::string res;
  std::string prefix = (sign == User::Sign::PLUS) ? "+" : "-";
  if (mask == ALL_COMMANDS) {
    for (const auto& cmd : rev_index[family]) {
      absl::StrAppend(&res, prefix, absl::AsciiStrToLower(cmd), " ");
    }
    res.pop_back();
    return res;
  }

  size_t pos = 0;
  while (mask != 0) {
    ++pos;
    mask = mask >> 1;
  }
  --pos;
  absl::StrAppend(&res, prefix, absl::AsciiStrToLower(rev_index[family][pos]));
  return res;
}

namespace {
struct CategoryAndMetadata {
  User::CategoryChange change;
  User::ChangeMetadata metadata;
};

struct CommandAndMetadata {
  User::CommandChange change;
  User::ChangeMetadata metadata;
};

using MergeResult = std::vector<std::variant<CategoryAndMetadata, CommandAndMetadata>>;

MergeResult MergeTables(const User::CategoryChanges& categories,
                        const User::CommandChanges& commands) {
  MergeResult result;
  for (auto [cat, meta] : categories) {
    result.push_back(CategoryAndMetadata{cat, meta});
  }

  for (auto [cmd, meta] : commands) {
    result.push_back(CommandAndMetadata{cmd, meta});
  }

  std::sort(result.begin(), result.end(), [](const auto& l, const auto& r) {
    auto fetch = [](const auto& l) { return l.metadata.seq_no; };
    return std::visit(fetch, l) < std::visit(fetch, r);
  });

  return result;
}

using MaterializedContents = std::optional<std::vector<std::vector<std::string_view>>>;

MaterializedContents MaterializeFileContents(std::vector<std::string>* usernames,
                                             std::string_view file_contents) {
  // This is fine, a very large file will top at 1-2 mb. And that's for 5000+ users with 400
  // characters per line
  std::vector<std::string_view> commands = absl::StrSplit(file_contents, "\n");
  std::vector<std::vector<std::string_view>> materialized;
  materialized.reserve(commands.size());
  usernames->reserve(commands.size());
  for (auto& command : commands) {
    if (command.empty())
      continue;
    std::vector<std::string_view> cmds = absl::StrSplit(command, ' ', absl::SkipEmpty());
    if (!absl::EqualsIgnoreCase(cmds[0], "USER") || cmds.size() < 4) {
      return {};
    }

    usernames->push_back(std::string(cmds[1]));
    cmds.erase(cmds.begin(), cmds.begin() + 2);
    materialized.push_back(cmds);
  }
  return materialized;
}

struct ParseKeyResult {
  std::string glob;
  KeyOp op;
  bool all_keys{false};
  bool reset_keys{false};
};

std::optional<ParseKeyResult> MaybeParseAclKey(std::string_view command) {
  if (absl::EqualsIgnoreCase(command, "ALLKEYS") || command == "~*") {
    return ParseKeyResult{"", {}, true};
  }

  if (absl::EqualsIgnoreCase(command, "RESETKEYS")) {
    return ParseKeyResult{"", {}, false, true};
  }

  auto op = KeyOp::READ_WRITE;

  if (absl::StartsWith(command, "%RW")) {
    command = command.substr(3);
  } else if (absl::StartsWith(command, "%R")) {
    op = KeyOp::READ;
    command = command.substr(2);
  } else if (absl::StartsWith(command, "%W")) {
    op = KeyOp::WRITE;
    command = command.substr(2);
  }

  if (!absl::StartsWith(command, "~")) {
    return {};
  }

  auto key = command.substr(1);
  if (key.empty()) {
    return {};
  }
  return ParseKeyResult{std::string(key), op};
}

struct ParsePubSubResult {
  std::string glob;
  bool has_asterisk{false};
  bool all_channels{false};
  bool reset_channels{false};
};

std::optional<ParsePubSubResult> MaybeParseAclPubSub(std::string_view command) {
  if (absl::EqualsIgnoreCase(command, "ALLCHANNELS") || command == "&*") {
    return ParsePubSubResult{"", false, true, false};
  }

  if (absl::EqualsIgnoreCase(command, "RESETCHANNELS")) {
    return ParsePubSubResult{"", false, false, true};
  }

  if (absl::StartsWith(command, "&") && command.size() >= 2) {
    const auto glob = command.substr(1);
    const bool has_asterisk = glob.find('*') != std::string_view::npos;
    return ParsePubSubResult{std::string(glob), has_asterisk};
  }

  return {};
}

std::optional<size_t> MaybeParseAclDflySelect(std::string_view command, uint32_t dbnum) {
  if (!absl::StartsWith(command, "$")) {
    return std::nullopt;
  }

  size_t res = 0;
  if (absl::SimpleAtoi(command.substr(1), &res) && res < dbnum) {
    return {res};
  }

  if (absl::EqualsIgnoreCase(command.substr(1), "ALL")) {
    return {std::numeric_limits<size_t>::max()};
  }

  return std::nullopt;
}

std::string PrettyPrintSha(std::string_view pass, bool all) {
  if (all) {
    return absl::BytesToHexString(pass);
  }
  return absl::BytesToHexString(pass.substr(0, 15)).substr(0, 15);
};

std::optional<User::UpdatePass> MaybeParsePassword(std::string_view command, bool hashed) {
  using UpPass = User::UpdatePass;
  if (command == "nopass") {
    return UpPass{"", false, true};
  }

  if (command == "resetpass") {
    return UpPass{"", false, false, true};
  }

  if (command[0] == '>' || (hashed && command[0] == '#')) {
    return UpPass{std::string(command.substr(1))};
  }

  if (command[0] == '<') {
    return UpPass{std::string(command.substr(1)), true};
  }

  return {};
}

std::optional<bool> MaybeParseStatus(std::string_view command) {
  if (command == "ON") {
    return true;
  }
  if (command == "OFF") {
    return false;
  }
  return {};
}

std::string PasswordsToString(const absl::flat_hash_set<std::string>& passwords, bool nopass,
                              bool full_sha) {
  if (nopass) {
    return "nopass ";
  }
  std::string result;
  for (const auto& pass : passwords) {
    absl::StrAppend(&result, "#", PrettyPrintSha(pass, full_sha), " ");
  }

  return result;
}

std::string AclKeysToString(const AclKeys& keys) {
  if (keys.all_keys) {
    return "~*";
  }
  std::string result;
  for (auto& [pattern, op] : keys.key_globs) {
    if (op == KeyOp::READ_WRITE) {
      absl::StrAppend(&result, "~", pattern, " ");
      continue;
    }
    std::string op_str = (op == KeyOp::READ) ? "R" : "W";
    absl::StrAppend(&result, "%", op_str, "~", pattern, " ");
  }

  if (!result.empty()) {
    result.pop_back();
  }
  return result;
}

std::string AclPubSubToString(const AclPubSub& pub_sub) {
  if (pub_sub.all_channels) {
    return "&*";
  }

  std::string result = "resetchannels ";

  for (const auto& [glob, has_asterisk] : pub_sub.globs) {
    absl::StrAppend(&result, "&", glob, " ");
  }

  if (result.back() == ' ') {
    result.pop_back();
  }

  return result;
}

void SendAclSecurityEvents(const AclLog::LogEntry& entry, facade::RedisReplyBuilder* rb) {
  rb->StartArray(12);
  rb->SendSimpleString("reason");
  using Reason = AclLog::Reason;
  std::string reason;
  if (entry.reason == Reason::COMMAND) {
    reason = "COMMAND";
  } else if (entry.reason == Reason::KEY) {
    reason = "KEY";
  } else if (entry.reason == Reason::PUB_SUB) {
    reason = "PUB_SUB";
  } else {
    reason = "AUTH";
  }

  rb->SendSimpleString(reason);
  rb->SendSimpleString("object");
  rb->SendSimpleString(entry.object);
  rb->SendSimpleString("username");
  rb->SendSimpleString(entry.username);
  rb->SendSimpleString("age-seconds");

  auto now_diff = std::chrono::system_clock::now() - entry.entry_creation;
  auto secs = std::chrono::duration_cast<std::chrono::seconds>(now_diff);
  auto left_over = now_diff - std::chrono::duration_cast<std::chrono::microseconds>(secs);
  auto age = absl::StrCat(secs.count(), ".", left_over.count());
  rb->SendSimpleString(absl::StrCat(age));
  rb->SendSimpleString("client-info");
  rb->SendSimpleString(entry.client_info);
  rb->SendSimpleString("timestamp-created");
  rb->SendLong(entry.entry_creation.time_since_epoch().count());
}

std::string AclDbToString(size_t db) {
  return std::numeric_limits<size_t>::max() == db ? "all" : absl::StrCat(db);
}

// Fetches the connections that predicate P evaluates to true and shuts them
// down gracefully.
template <typename P>
void TraverseEvictImpl(P predicate, facade::Listener* main_listener, util::ProactorPool* pool) {
  auto close_cb = [&](unsigned idx, util::ProactorBase* p) {
    std::vector<facade::Connection::WeakRef> connections;
    auto traverse_cb = [&](unsigned id, util::Connection* conn) {
      auto connection = static_cast<facade::Connection*>(conn);
      auto ctx = connection->cntx();
      if (predicate(ctx)) {
        connections.push_back(connection->Borrow());
      }
    };

    main_listener->TraverseConnectionsOnThread(traverse_cb, UINT32_MAX, nullptr);

    for (auto& tcon : connections) {
      facade::Connection* conn = tcon.Get();
      if (conn && conn->socket()->proactor()->GetPoolIndex() == p->GetPoolIndex()) {
        // preemptive for TlsSocket
        conn->ShutdownSelfBlocking();
      }
    }
  };

  pool->AwaitFiberOnAll(close_cb);
}

}  // namespace

std::string AclFamily::AclCatAndCommandToString(const User::CategoryChanges& cat,
                                                const User::CommandChanges& cmds) const {
  std::string result;

  auto tables = MergeTables(cat, cmds);

  auto cat_visitor = [&result, this](const CategoryAndMetadata& val) {
    const auto& [change, meta] = val;
    absl::StrAppend(&result, AclCatToString(change, meta.sign), " ");
  };

  auto cmd_visitor = [&result, this](const CommandAndMetadata& val) {
    const auto& [change, meta] = val;
    const auto [family, bit_index] = change;
    absl::StrAppend(&result, AclCommandToString(family, bit_index, meta.sign), " ");
  };

  Overloaded visitor{cat_visitor, cmd_visitor};

  for (auto change : tables) {
    std::visit(visitor, change);
  }

  if (!result.empty()) {
    result.pop_back();
  }

  return result;
}

using OptCat = std::optional<uint32_t>;

// bool == true if +
// bool == false if -
std::pair<OptCat, bool> AclFamily::MaybeParseAclCategory(std::string_view command) const {
  if (absl::EqualsIgnoreCase(command, "ALLCOMMANDS")) {
    return {cat_table_.at("ALL"), true};
  }

  if (absl::EqualsIgnoreCase(command, "NOCOMMANDS")) {
    return {cat_table_.at("ALL"), false};
  }

  if (absl::StartsWith(command, "+@")) {
    auto res = cat_table_.find(command.substr(2));
    if (res == cat_table_.end()) {
      return {};
    }
    return {res->second, true};
  }

  if (absl::StartsWith(command, "-@")) {
    auto res = cat_table_.find(command.substr(2));
    if (res == cat_table_.end()) {
      return {};
    }
    return {res->second, false};
  }

  return {};
}

std::optional<std::string> AclFamily::MaybeParseNamespace(std::string_view command) const {
  constexpr std::string_view kPrefix = "NAMESPACE:";
  if (absl::StartsWith(command, kPrefix)) {
    return std::string(command.substr(kPrefix.size()));
  }
  return std::nullopt;
}

std::pair<AclFamily::OptCommand, bool> AclFamily::MaybeParseAclCommand(
    std::string_view command) const {
  if (absl::StartsWith(command, "+")) {
    auto res = cmd_registry_->Find(command.substr(1));
    if (!res || res->IsAlias()) {
      return {};
    }
    std::pair<size_t, uint64_t> cmd{res->GetFamily(), res->GetBitIndex()};
    return {cmd, true};
  }

  if (absl::StartsWith(command, "-")) {
    auto res = cmd_registry_->Find(command.substr(1));
    if (!res || res->IsAlias()) {
      return {};
    }
    std::pair<size_t, uint64_t> cmd{res->GetFamily(), res->GetBitIndex()};
    return {cmd, false};
  }

  return {};
}

using facade::ErrorReply;

std::variant<User::UpdateRequest, ErrorReply> AclFamily::ParseAclSetUser(
    const facade::ArgRange& args, bool hashed, bool has_all_keys, bool has_all_channels) const {
  User::UpdateRequest req;

  for (std::string_view arg : args) {
    if (auto pass = MaybeParsePassword(facade::ToSV(arg), hashed); pass) {
      req.passwords.push_back(std::move(*pass));

      if (hashed && absl::StartsWith(facade::ToSV(arg), "#")) {
        req.passwords.back().is_hashed = true;
      }
      continue;
    }

    if (auto res = MaybeParseAclKey(facade::ToSV(arg)); res) {
      auto& [glob, op, all_keys, reset_keys] = *res;
      if ((has_all_keys && !all_keys && !reset_keys) ||
          (req.allow_all_keys && !all_keys && !reset_keys)) {
        return ErrorReply(absl::StrCat(
            "Error in ACL SETUSER modifier \'", facade::ToSV(arg),
            "\': Adding a pattern after the * pattern (or the "
            "'allkeys' flag) is not valid and does not have any effect. Try 'resetkeys' to start "
            "with an empty list of patterns"));
      }

      req.allow_all_keys = all_keys;
      req.reset_all_keys = reset_keys;
      if (reset_keys) {
        has_all_keys = false;
      }
      req.keys.push_back({std::move(glob), op, all_keys, reset_keys});
      continue;
    }

    if (auto res = MaybeParseAclPubSub(facade::ToSV(arg)); res) {
      auto& [glob, has_asterisk, all_channels, reset_channels] = *res;
      if ((has_all_channels && !all_channels && !reset_channels) ||
          (req.all_channels && !all_channels && !reset_channels)) {
        return ErrorReply(
            absl::StrCat("ERR Error in ACL SETUSER modifier \'", facade::ToSV(arg),
                         "\': Adding a pattern after the * pattern (or the 'allchannels' flag) is "
                         "not valid and does not have any effect. Try 'resetchannels' to start "
                         "with an empty list of channels"));
      }

      req.all_channels = all_channels;
      req.reset_channels = reset_channels;
      if (reset_channels) {
        has_all_channels = false;
      }
      req.pub_sub.push_back({std::move(glob), has_asterisk, all_channels, reset_channels});
      continue;
    }

    if (auto res = MaybeParseAclDflySelect(facade::ToSV(arg), dbnum_); res) {
      if (req.select_db) {
        return ErrorReply("ERR Error, select db $ was used twice");
      }
      req.select_db = res;
      continue;
    }

    std::string command = absl::AsciiStrToUpper(arg);

    if (auto status = MaybeParseStatus(command); status) {
      if (req.is_active) {
        return ErrorReply("Multiple ON/OFF are not allowed");
      }
      req.is_active = *status;
      continue;
    }

    auto [cat, add] = MaybeParseAclCategory(command);
    if (cat) {
      using Sign = User::Sign;
      using Val = std::pair<Sign, uint32_t>;
      auto val = add ? Val{Sign::PLUS, *cat} : Val{Sign::MINUS, *cat};
      req.updates.push_back(val);
      continue;
    }

    auto ns = MaybeParseNamespace(command);
    if (ns.has_value()) {
      req.ns = *ns;
      continue;
    }

    auto [cmd, sign] = MaybeParseAclCommand(command);
    if (!cmd) {
      return ErrorReply(absl::StrCat("Unrecognized parameter ", command));
    }

    using Sign = User::Sign;
    using Val = User::UpdateRequest::CommandsValueType;
    auto [index, bit] = *cmd;
    auto val = sign ? Val{Sign::PLUS, index, bit} : Val{Sign::MINUS, index, bit};
    req.updates.push_back(val);
  }

  return req;
}

void AclFamily::BuildIndexers(RevCommandsIndexStore families) {
  size_t family_count = acl::NumberOfFamilies(families.size());
  CommandsRevIndexer(std::move(families));
  CategoryToCommandsIndexStore index;
  cmd_registry_->Traverse([&](std::string_view, auto& cid) {
    const uint32_t cat = cid.acl_categories();
    const size_t family = cid.GetFamily();
    DCHECK_LT(family, family_count);
    const uint64_t bit_index = cid.GetBitIndex();
    for (size_t i = 0; i < 32; ++i) {
      if (cat & 1 << i) {
        std::string_view cat_name = reverse_cat_table_[i];
        if (index[cat_name].empty()) {
          index[cat_name].resize(CommandsRevIndexer().size());
        }
        index[cat_name][family] |= bit_index;
      }
    }
  });

  CategoryToCommandsIndex(std::move(index));
  CategoryToIdxStore idx_store;
  for (size_t i = 0; i < 32; ++i) {
    idx_store[1 << i] = i;
  }
  CategoryToIdx(std::move(idx_store));
}

void AclFamily::Help(CmdArgList args, CommandContext* cmd_cntx) {
  string_view help_arr[] = {
      "ACL <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
      "CAT [<category>]",
      "    List all commands that belong to <category>, or all command categories",
      "    when no category is specified.",
      "DELUSER <username> [<username> ...]",
      "    Delete a list of users.",
      "DRYRUN <username> <command> [<arg> ...]",
      "    Returns whether the user can execute the given command without executing the command.",
      "GETUSER <username>",
      "    Get the user's details.",
      "GENPASS [<bits>]",
      "    Generate a secure 256-bit user password. The optional `bits` argument can",
      "    be used to specify a different size.",
      "LIST",
      "    Show users details in config file format.",
      "LOAD",
      "    Reload users from the ACL file.",
      "LOG [<count> | RESET]",
      "    Show the ACL log entries.",
      "SAVE",
      "    Save the current config to the ACL file.",
      "SETUSER <username> <attribute> [<attribute> ...]",
      "    Create or modify a user with the specified attributes.",
      "USERS",
      "    List all the registered usernames.",
      "WHOAMI",
      "    Return the current connection username.",
      "HELP",
      "    Print this help."};
  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  return rb->SendSimpleStrArr(help_arr);
}

using MemberFunc = void (AclFamily::*)(CmdArgList args, CommandContext* cmd_cntx);

CommandId::Handler HandlerFunc(AclFamily* acl, MemberFunc f) {
  return [=](CmdArgList args, CommandContext* cmd_cntx) { return (acl->*f)(args, cmd_cntx); };
}

#define HFUNC(x) SetHandler(HandlerFunc(this, &AclFamily::x))

constexpr uint32_t kAcl = acl::CONNECTION;
constexpr uint32_t kList = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kSetUser = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kDelUser = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kWhoAmI = acl::SLOW;
constexpr uint32_t kSave = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kLoad = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kLog = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kUsers = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kCat = acl::SLOW;
constexpr uint32_t kGetUser = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kDryRun = acl::ADMIN | acl::SLOW | acl::DANGEROUS;
constexpr uint32_t kGenPass = acl::SLOW;
constexpr uint32_t kHelp = acl::SLOW;

// We can't implement the ACL commands and its respective subcommands LIST, CAT, etc
// the usual way, (that is, one command called ACL which then dispatches to the subcommand
// based on the second argument) because each of the subcommands has different ACL
// categories. Therefore, to keep it compatible with the CommandId, I need to treat them
// as separate commands in the registry. This is the least intrusive change because it's very
// easy to handle that case explicitly in `DispatchCommand`.

void AclFamily::Register(dfly::CommandRegistry* registry) {
  using CI = dfly::CommandId;
  const uint32_t kAclMask = CO::ADMIN | CO::NOSCRIPT | CO::LOADING;
  registry->StartFamily();
  *registry << CI{"ACL", CO::NOSCRIPT | CO::LOADING, 0, 0, 0, acl::kAcl}.HFUNC(Acl);
  *registry << CI{"ACL LIST", kAclMask, 1, 0, 0, acl::kList}.HFUNC(List);
  *registry << CI{"ACL SETUSER", kAclMask, -2, 0, 0, acl::kSetUser}.HFUNC(SetUser);
  *registry << CI{"ACL DELUSER", kAclMask, -2, 0, 0, acl::kDelUser}.HFUNC(DelUser);
  *registry << CI{"ACL WHOAMI", kAclMask, 1, 0, 0, acl::kWhoAmI}.HFUNC(WhoAmI);
  *registry << CI{"ACL SAVE", kAclMask, 1, 0, 0, acl::kSave}.HFUNC(Save);
  *registry << CI{"ACL LOAD", kAclMask, 1, 0, 0, acl::kLoad}.HFUNC(Load);
  *registry << CI{"ACL LOG", kAclMask, 0, 0, 0, acl::kLog}.HFUNC(Log);
  *registry << CI{"ACL USERS", kAclMask, 1, 0, 0, acl::kUsers}.HFUNC(Users);
  *registry << CI{"ACL CAT", kAclMask, -1, 0, 0, acl::kCat}.HFUNC(Cat);
  *registry << CI{"ACL GETUSER", kAclMask, 2, 0, 0, acl::kGetUser}.HFUNC(GetUser);
  *registry << CI{"ACL DRYRUN", kAclMask, 3, 0, 0, acl::kDryRun}.HFUNC(DryRun);
  *registry << CI{"ACL GENPASS", CO::NOSCRIPT | CO::LOADING, -1, 0, 0, acl::kGenPass}.HFUNC(
      GenPass);
  *registry << CI{"ACL HELP", kAclMask, 0, 0, 0, acl::kHelp}.HFUNC(Help);
  cmd_registry_ = registry;

  // build indexers
  BuildIndexers(cmd_registry_->GetFamilies());
}

#undef HFUNC

}  // namespace dfly::acl


================================================
FILE: src/server/acl/acl_family.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <optional>
#include <string_view>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "facade/facade_types.h"
#include "helio/util/proactor_pool.h"
#include "server/acl/acl_commands_def.h"
#include "server/acl/user_registry.h"
#include "server/command_registry.h"
#include "server/execution_state.h"

namespace facade {
class SinkReplyBuilder;
class Listener;
}  // namespace facade

namespace dfly {

using facade::CmdArgList;

class ConnectionContext;
namespace acl {

class AclFamily final {
 public:
  explicit AclFamily(UserRegistry* registry, util::ProactorPool* pool);

  void Register(CommandRegistry* registry);
  void Init(facade::Listener* listener, UserRegistry* registry);

 private:
  using SinkReplyBuilder = facade::SinkReplyBuilder;

  void Acl(CmdArgList args, CommandContext* cmd_cntx);
  void List(CmdArgList args, CommandContext* cmd_cntx);
  void SetUser(CmdArgList args, CommandContext* cmd_cntx);
  void DelUser(CmdArgList args, CommandContext* cmd_cntx);
  void WhoAmI(CmdArgList args, CommandContext* cmd_cntx);
  void Save(CmdArgList args, CommandContext* cmd_cntx);
  void Load(CmdArgList args, CommandContext* cmd_cntx);
  // Helper function for bootstrap
  bool Load();
  void Log(CmdArgList args, CommandContext* cmd_cntx);
  void Users(CmdArgList args, CommandContext* cmd_cntx);
  void Cat(CmdArgList args, CommandContext* cmd_cntx);
  void GetUser(CmdArgList args, CommandContext* cmd_cntx);
  void DryRun(CmdArgList args, CommandContext* cmd_cntx);
  void GenPass(CmdArgList args, CommandContext* cmd_cntx);
  void Help(CmdArgList args, CommandContext* cmd_cntx);

  // Helper function that updates all open connections and their
  // respective ACL fields on all the available proactor threads
  using Commands = std::vector<uint64_t>;
  void StreamUpdatesToAllProactorConnections(const std::string& user,
                                             const Commands& update_commands,
                                             const AclKeys& update_keys,
                                             const AclPubSub& update_pub_sub, size_t db);

  // Helper function that closes all open connection from the deleted user
  void EvictOpenConnectionsOnAllProactors(const absl::flat_hash_set<std::string_view>& user);

  // Helper function that closes all open connections for users in the registry
  void EvictOpenConnectionsOnAllProactorsWithRegistry(const UserRegistry::RegistryType& registry);

  // Helper function that loads the acl state of an acl file into the user registry
  GenericError LoadToRegistryFromFile(std::string_view full_path, SinkReplyBuilder* builder);

  // Serializes the whole registry into a string
  std::string RegistryToString() const;

  std::string AclCatToString(uint32_t acl_category, User::Sign sign) const;

  std::string AclCommandToString(size_t family, uint64_t mask, User::Sign sign) const;

  // Serializes category and command to string
  std::string AclCatAndCommandToString(const User::CategoryChanges& cat,
                                       const User::CommandChanges& cmds) const;

  using OptCat = std::optional<uint32_t>;
  std::pair<OptCat, bool> MaybeParseAclCategory(std::string_view command) const;

  using OptCommand = std::optional<std::pair<size_t, uint64_t>>;
  std::pair<OptCommand, bool> MaybeParseAclCommand(std::string_view command) const;

  std::optional<std::string> MaybeParseNamespace(std::string_view command) const;

  std::variant<User::UpdateRequest, facade::ErrorReply> ParseAclSetUser(
      const facade::ArgRange& args, bool hashed = false, bool has_all_keys = false,
      bool has_all_channels = false) const;

  void BuildIndexers(RevCommandsIndexStore families);

  // Data members

  facade::Listener* main_listener_{nullptr};
  UserRegistry* registry_;
  CommandRegistry* cmd_registry_;
  util::ProactorPool* pool_;

  // Indexes

  // See definitions for NONE and ALL in facade/acl_commands_def.h
  const CategoryIndexTable cat_table_{{"KEYSPACE", KEYSPACE},
                                      {"READ", READ},
                                      {"WRITE", WRITE},
                                      {"SET", SET},
                                      {"SORTEDSET", SORTEDSET},
                                      {"LIST", LIST},
                                      {"HASH", HASH},
                                      {"STRING", STRING},
                                      {"BITMAP", BITMAP},
                                      {"HYPERLOG", HYPERLOGLOG},
                                      {"GEO", GEO},
                                      {"STREAM", STREAM},
                                      {"PUBSUB", PUBSUB},
                                      {"ADMIN", ADMIN},
                                      {"FAST", FAST},
                                      {"SLOW", SLOW},
                                      {"BLOCKING", BLOCKING},
                                      {"DANGEROUS", DANGEROUS},
                                      {"CONNECTION", CONNECTION},
                                      {"TRANSACTION", TRANSACTION},
                                      {"SCRIPTING", SCRIPTING},
                                      {"CMS", CMS},
                                      {"BLOOM", BLOOM},
                                      {"FT_SEARCH", FT_SEARCH},
                                      {"SEARCH", FT_SEARCH},  // Alias for FT_SEARCH
                                      {"THROTTLE", THROTTLE},
                                      {"JSON", JSON},
                                      {"ALL", ALL}};

  // bit 0 at index 0
  // bit 1 at index 1
  // bit n at index n
  const ReverseCategoryIndexTable reverse_cat_table_{
      "KEYSPACE",  "READ",      "WRITE",     "SET",       "SORTEDSET",  "LIST",        "HASH",
      "STRING",    "BITMAP",    "HYPERLOG",  "GEO",       "STREAM",     "PUBSUB",      "ADMIN",
      "FAST",      "SLOW",      "BLOCKING",  "DANGEROUS", "CONNECTION", "TRANSACTION", "SCRIPTING",
      "_RESERVED", "_RESERVED", "_RESERVED", "_RESERVED", "_RESERVED",  "_RESERVED",   "CMS",
      "BLOOM",     "FT_SEARCH", "THROTTLE",  "JSON"};

  // We need this to act as a const member, since the initialization of const data members
  // must be done on the constructor. However, these are initialized a little later, when
  // we Register the commands
  const CategoryToIdxStore& CategoryToIdx(CategoryToIdxStore store = {}) const {
    static CategoryToIdxStore cat_idx = std::move(store);
    return cat_idx;
  }

  const RevCommandsIndexStore& CommandsRevIndexer(RevCommandsIndexStore store = {}) const {
    static RevCommandsIndexStore rev_index_store = std::move(store);
    return rev_index_store;
  }

  const CategoryToCommandsIndexStore& CategoryToCommandsIndex(
      CategoryToCommandsIndexStore store = {}) const {
    static CategoryToCommandsIndexStore index = std::move(store);
    return index;
  }

  size_t dbnum_ = 0;

  // Only for testing interface
 public:
  // Helper accessors for tests. Do not use them directly.
  const ReverseCategoryIndexTable& GetRevTable() const {
    return reverse_cat_table_;
  }

  // We could make CommandsRevIndexer public, but I want this to be
  // clear that this is for TESTING so do not use this in the codebase
  const RevCommandsIndexStore& GetCommandsRevIndexer() const {
    return CommandsRevIndexer();
  }
};

}  // namespace acl
}  // namespace dfly


================================================
FILE: src/server/acl/acl_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/acl/acl_family.h"

#include <absl/container/flat_hash_map.h>
#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/test_utils.h"

using namespace testing;

ABSL_DECLARE_FLAG(std::vector<std::string>, rename_command);
ABSL_DECLARE_FLAG(std::vector<std::string>, command_alias);

namespace dfly {

class AclFamilyTest : public BaseFamilyTest {
 protected:
};

class AclFamilyTestRename : public BaseFamilyTest {
  void SetUp() override {
    absl::SetFlag(&FLAGS_rename_command, {"ACL=ROCKS"});
    absl::SetFlag(&FLAGS_command_alias, {"___SET=SET"});
    ResetService();
  }
};

TEST_F(AclFamilyTest, AclSetUser) {
  TestInitAclFam();
  auto resp = Run({"ACL", "SETUSER"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl setuser' command"));

  resp = Run({"ACL", "SETUSER", "kostas", "ONN"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter ONN"));

  resp = Run({"ACL", "SETUSER", "kostas", "+@nonsense"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter +@NONSENSE"));

  resp = Run({"ACL", "SETUSER", "vlad"});
  EXPECT_THAT(resp, "OK");
  resp = Run({"ACL", "LIST"});
  auto vec = resp.GetVec();
  EXPECT_THAT(vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                        "user vlad off resetchannels -@all $all"));

  resp = Run({"ACL", "SETUSER", "vlad", "+ACL"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                        "user vlad off resetchannels -@all +acl $all"));

  resp = Run({"ACL", "SETUSER", "vlad", "on", ">pass", ">temp"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec.size(), 2);
  auto contains_vlad = [](const auto& vec) {
    const std::string default_user = "user default on nopass ~* &* +@all $all";
    const std::string a_permutation =
        "user vlad on #a6864eb339b0e1f #d74ff0ee8da3b98 resetchannels -@all +acl $all";
    const std::string b_permutation =
        "user vlad on #d74ff0ee8da3b98 #a6864eb339b0e1f resetchannels -@all +acl $all";
    std::string_view other;
    if (vec[0] == default_user) {
      other = vec[1].GetView();
    } else if (vec[1] == default_user) {
      other = vec[0].GetView();
    } else {
      return false;
    }

    return other == a_permutation || other == b_permutation;
  };

  EXPECT_THAT(contains_vlad(vec), true);

  resp = Run({"AUTH", "vlad", "pass"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"AUTH", "vlad", "temp"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"AUTH", "default", R"("")"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "vlad", ">another"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "vlad", "<another"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec.size(), 2);
  EXPECT_THAT(contains_vlad(vec), true);

  resp = Run({"ACL", "SETUSER", "vlad", "resetpass"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                        "user vlad on resetchannels -@all +acl $all"));

  // +@NONE should not exist anymore. It's not in the spec.
  resp = Run({"ACL", "SETUSER", "rand", "+@NONE"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter +@NONE"));

  resp = Run({"ACL", "SETUSER", "rand", "ALLCOMMANDS"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                        "user vlad on resetchannels -@all +acl $all",
                                        "user rand off resetchannels +@all $all"));

  resp = Run({"ACL", "SETUSER", "rand", "NOCOMMANDS"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  vec = resp.GetVec();
  EXPECT_THAT(vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                        "user vlad on resetchannels -@all +acl $all",
                                        "user rand off resetchannels -@all $all"));
}

TEST_F(AclFamilyTest, AclDelUser) {
  TestInitAclFam();
  auto resp = Run({"ACL", "DELUSER"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl deluser' command"));

  resp = Run({"ACL", "DELUSER", "default"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ACL", "DELUSER", "NOTEXISTS"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ACL", "SETUSER", "kostas", "ON"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "DELUSER", "KOSTAS", "NONSENSE"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ACL", "DELUSER", "kostas"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"ACL", "DELUSER", "kostas"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ACL", "LIST"});
  EXPECT_THAT(resp.GetString(), "user default on nopass ~* &* +@all $all");

  Run({"ACL", "SETUSER", "michael", "ON"});
  Run({"ACL", "SETUSER", "kobe", "ON"});
  resp = Run({"ACL", "DELUSER", "michael", "kobe"});
  EXPECT_THAT(resp, IntArg(2));
}

TEST_F(AclFamilyTest, AclList) {
  TestInitAclFam();
  auto resp = Run({"ACL", "LIST", "NONSENSE"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl list' command"));

  resp = Run({"ACL", "SETUSER", "kostas", ">pass", "+@admin"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "adi", ">pass", "+@fast"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "LIST"});
  auto vec = resp.GetVec();
  EXPECT_THAT(
      vec, UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                "user kostas off #d74ff0ee8da3b98 resetchannels -@all +@admin $all",
                                "user adi off #d74ff0ee8da3b98 resetchannels -@all +@fast $all"));
}

TEST_F(AclFamilyTest, AclAuth) {
  TestInitAclFam();
  auto resp = Run({"AUTH", "default", R"("")"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "shahar", ">mypass"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"AUTH", "shahar", "wrongpass"});
  EXPECT_THAT(resp, ErrArg("WRONGPASS invalid username-password pair or user is disabled."));

  resp = Run({"AUTH", "shahar", "mypass"});
  EXPECT_THAT(resp, ErrArg("WRONGPASS invalid username-password pair or user is disabled."));

  // Activate the user
  resp = Run({"ACL", "SETUSER", "shahar", "ON", "+@fast"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"AUTH", "shahar", "mypass"});
  EXPECT_THAT(resp, "OK");
}

TEST_F(AclFamilyTest, AclWhoAmI) {
  TestInitAclFam();
  auto resp = Run({"ACL", "WHOAMI", "WHO"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl whoami' command"));

  resp = Run({"ACL", "SETUSER", "kostas", "ON", ">pass", "+@SLOW"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"AUTH", "kostas", "pass"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "WHOAMI"});
  EXPECT_THAT(resp, "User is kostas");
}

TEST_F(AclFamilyTest, TestAllCategories) {
  const auto* fam = TestInitAclFam();
  for (auto& cat : fam->GetRevTable()) {
    if (cat != "_RESERVED") {
      auto resp = Run({"ACL", "SETUSER", "kostas", absl::StrCat("+@", cat)});
      EXPECT_THAT(resp, "OK");

      resp = Run({"ACL", "LIST"});
      EXPECT_THAT(resp.GetVec(),
                  UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                       absl::StrCat("user kostas off resetchannels -@all ", "+@",
                                                    absl::AsciiStrToLower(cat), " $all")));

      resp = Run({"ACL", "SETUSER", "kostas", absl::StrCat("-@", cat)});
      EXPECT_THAT(resp, "OK");

      resp = Run({"ACL", "LIST"});
      EXPECT_THAT(resp.GetVec(),
                  UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                       absl::StrCat("user kostas off resetchannels -@all ", "-@",
                                                    absl::AsciiStrToLower(cat), " $all")));

      resp = Run({"ACL", "DELUSER", "kostas"});
      EXPECT_THAT(resp, IntArg(1));
    }
  }

  for (auto& cat : fam->GetRevTable()) {
    if (cat != "_RESERVED") {
      auto resp = Run({"ACL", "SETUSER", "kostas", absl::StrCat("+@", cat)});
      EXPECT_THAT(resp, "OK");
    }
  }
  // This won't work because of __RESERVED
  // TODO(fix this)
  //  auto resp = Run({"ACL", "LIST"});
  //  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("user default on nopass +@ALL",
  //  absl::StrCat("user kostas off nopass ", "+@ALL")));
  //

  // TODO(Bug here fix none/all)
  //  auto resp = Run({"ACL", "SETUSER", "kostas", "+@NONE"});
  //  EXPECT_THAT(resp, "OK");
  //
  //  resp = Run({"ACL", "LIST"});
  //  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("user default on nopass +@ALL", "user kostas
  //  off nopass +@NONE"));
}

TEST_F(AclFamilyTest, TestAllCommands) {
  const auto* fam = TestInitAclFam();
  const auto& rev_indexer = fam->GetCommandsRevIndexer();
  for (const auto& family : rev_indexer) {
    for (const auto& command_name : family) {
      auto resp = Run({"ACL", "SETUSER", "kostas", absl::StrCat("+", command_name)});
      EXPECT_THAT(resp, "OK");

      resp = Run({"ACL", "LIST"});
      EXPECT_THAT(resp.GetVec(),
                  UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                       absl::StrCat("user kostas off resetchannels -@all ", "+",
                                                    absl::AsciiStrToLower(command_name), " $all")));

      resp = Run({"ACL", "SETUSER", "kostas", absl::StrCat("-", command_name)});

      resp = Run({"ACL", "LIST"});
      EXPECT_THAT(resp.GetVec(),
                  UnorderedElementsAre("user default on nopass ~* &* +@all $all",
                                       absl::StrCat("user kostas off resetchannels -@all ", "-",
                                                    absl::AsciiStrToLower(command_name), " $all")));

      resp = Run({"ACL", "DELUSER", "kostas"});
      EXPECT_THAT(resp, IntArg(1));
    }
  }
}

TEST_F(AclFamilyTest, TestUsers) {
  TestInitAclFam();
  auto resp = Run({"ACL", "SETUSER", "abhra", "ON"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "ari"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "USERS"});
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("default", "abhra", "ari"));
}

TEST_F(AclFamilyTest, TestCat) {
  TestInitAclFam();
  auto resp = Run({"ACL", "CAT", "nonsense"});
  EXPECT_THAT(resp, ErrArg("ERR Unknown category: NONSENSE"));

  resp = Run({"ACL", "CAT"});
  EXPECT_GE(resp.GetVec().size(), 24u);

  resp = Run({"ACL", "CAT", "STRING"});

  EXPECT_THAT(resp.GetVec(),
              IsSupersetOf({"GETSET", "GETRANGE", "INCRBYFLOAT", "GETDEL",  "DECRBY", "PREPEND",
                            "SETEX",  "MSET",     "SET",         "PSETEX",  "SUBSTR", "DECR",
                            "STRLEN", "INCR",     "INCRBY",      "MGET",    "GET",    "SETNX",
                            "GETEX",  "APPEND",   "MSETNX",      "SETRANGE"}));
}

TEST_F(AclFamilyTest, TestGetUser) {
  TestInitAclFam();
  auto resp = Run({"ACL", "GETUSER", "kostas"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"ACL", "GETUSER", "default"});
  const auto& vec = resp.GetVec();
  EXPECT_THAT(vec[0], "flags");
  EXPECT_THAT(vec[1].GetVec(), UnorderedElementsAre("on", "nopass"));
  EXPECT_THAT(vec[2], "passwords");
  EXPECT_TRUE(vec[3].GetVec().empty());
  EXPECT_THAT(vec[4], "commands");
  EXPECT_THAT(vec[5], "+@all");
  EXPECT_THAT(vec[6], "keys");
  EXPECT_THAT(vec[7], "~*");
  EXPECT_THAT(vec[8], "channels");
  EXPECT_THAT(vec[9], "&*");

  resp = Run({"ACL", "SETUSER", "kostas", "+@STRING", "+HSET"});
  resp = Run({"ACL", "GETUSER", "kostas"});
  const auto& kvec = resp.GetVec();
  EXPECT_THAT(kvec[0], "flags");
  EXPECT_THAT(kvec[1].GetVec(), UnorderedElementsAre("off"));
  EXPECT_THAT(kvec[2], "passwords");
  EXPECT_TRUE(kvec[3].GetVec().empty());
  EXPECT_THAT(kvec[4], "commands");
  EXPECT_THAT(kvec[5], "-@all +@string +hset");
  EXPECT_THAT(kvec[6], "keys");
  EXPECT_THAT(kvec[7], RespArray(ElementsAre()));
  EXPECT_THAT(kvec[8], "channels");
  EXPECT_THAT(kvec[9], "resetchannels");
}

TEST_F(AclFamilyTest, TestDryRun) {
  TestInitAclFam();
  auto resp = Run({"ACL", "DRYRUN"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl dryrun' command"));

  resp = Run({"ACL", "DRYRUN", "default"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl dryrun' command"));

  resp = Run({"ACL", "DRYRUN", "default", "get", "more"});
  EXPECT_THAT(resp, ErrArg("ERR wrong number of arguments for 'acl dryrun' command"));

  resp = Run({"ACL", "DRYRUN", "kostas", "more"});
  EXPECT_THAT(resp, ErrArg("ERR User 'kostas' not found"));

  resp = Run({"ACL", "DRYRUN", "default", "nope"});
  EXPECT_THAT(resp, ErrArg("ERR Command 'NOPE' not found"));

  resp = Run({"ACL", "DRYRUN", "default", "SET"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "kostas", "+GET"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "DRYRUN", "kostas", "GET"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "DRYRUN", "kostas", "SET"});
  EXPECT_THAT(resp, "This user has no permissions to run the 'SET' command");
}

TEST_F(AclFamilyTest, AclGenPassTooManyArguments) {
  TestInitAclFam();

  auto resp = Run({"ACL", "GENPASS", "1", "2"});
  EXPECT_THAT(resp.GetString(),
              "ERR Unknown subcommand or wrong number of arguments for 'GENPASS'. Try ACL HELP.");
}

TEST_F(AclFamilyTest, AclGenPassOutOfRange) {
  std::string expectedError =
      "ERR ACL GENPASS argument must be the number of bits for the output password, a positive "
      "number up to 4096";

  auto resp = Run({"ACL", "GENPASS", "-1"});
  EXPECT_THAT(resp.GetString(), expectedError);

  resp = Run({"ACL", "GENPASS", "0"});
  EXPECT_THAT(resp.GetString(), expectedError);

  resp = Run({"ACL", "GENPASS", "4097"});
  EXPECT_THAT(resp.GetString(), expectedError);
}

TEST_F(AclFamilyTest, AclGenPass) {
  auto resp = Run({"ACL", "GENPASS"});
  auto actualPassword = resp.GetString();

  // should be 256 bits or 64 bytes in hex
  EXPECT_THAT(actualPassword.length(), 64);

  // 1 bit - 4 bits should all produce a single hex character
  for (int i = 1; i <= 4; i++) {
    resp = Run({"ACL", "GENPASS", std::to_string(i)});
    EXPECT_THAT(resp.GetString().length(), 1);
  }
  // 5 bits - 8 bits should all produce two hex characters
  for (int i = 5; i <= 8; i++) {
    resp = Run({"ACL", "GENPASS", std::to_string(i)});
    EXPECT_THAT(resp.GetString().length(), 2);
  }

  // and the pattern continues
  resp = Run({"ACL", "GENPASS", "9"});
  EXPECT_THAT(resp.GetString().length(), 3);
}

TEST_F(AclFamilyTestRename, AclRename) {
  auto resp = Run({"ACL", "SETUSER", "billy"});
  EXPECT_THAT(resp, ErrArg("ERR unknown command `ACL`"));

  resp = Run({"ROCKS", "SETUSER", "billy", "ON", ">mypass"});
  EXPECT_THAT(resp.GetString(), "OK");

  resp = Run({"ROCKS", "DELUSER", "billy"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(AclFamilyTest, TestKeys) {
  TestInitAclFam();
  auto resp = Run({"ACL", "SETUSER", "temp", "~foo", "~bar*"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  auto& vec = resp.GetVec();
  EXPECT_THAT(vec[6], "keys");
  EXPECT_THAT(vec[7], "~foo ~bar*");

  resp = Run({"ACL", "SETUSER", "temp", "~*", "~foo"});
  EXPECT_THAT(resp, ErrArg("ERR Error in ACL SETUSER modifier '~foo': Adding a pattern after the * "
                           "pattern (or the 'allkeys' flag) is not valid and does not have any "
                           "effect. Try 'resetkeys' to start with an empty list of patterns"));

  resp = Run({"ACL", "SETUSER", "temp", "~*"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "SETUSER", "temp", "~foo"});
  EXPECT_THAT(resp, ErrArg("ERR Error in ACL SETUSER modifier '~foo': Adding a pattern after the * "
                           "pattern (or the 'allkeys' flag) is not valid and does not have any "
                           "effect. Try 'resetkeys' to start with an empty list of patterns"));

  resp = Run({"ACL", "SETUSER", "temp", "resetkeys"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  EXPECT_TRUE(resp.GetVec()[7].GetVec().empty());

  resp = Run({"ACL", "SETUSER", "temp", "%R~foo"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  EXPECT_THAT(resp.GetVec()[7], "%R~foo");

  resp = Run({"ACL", "SETUSER", "temp", "resetkeys", "%W~foo"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  EXPECT_THAT(resp.GetVec()[7], "%W~foo");

  resp = Run({"ACL", "SETUSER", "temp", "resetkeys", "%RW~foo"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  EXPECT_THAT(resp.GetVec()[7], "~foo");

  resp = Run({"ACL", "SETUSER", "temp", "resetkeys", "%K~foo"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter %K~FOO"));

  resp = Run({"ACL", "SETUSER", "temp", "resetkeys", "%Rfoo"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter %RFOO"));
}

TEST_F(AclFamilyTest, TestPubSub) {
  TestInitAclFam();

  auto resp = Run({"ACL", "SETUSER", "temp", "&foo", "&b*r"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  auto vec = resp.GetVec();
  EXPECT_THAT(vec[8], "channels");
  EXPECT_THAT(vec[9], "resetchannels &foo &b*r");

  resp = Run({"ACL", "SETUSER", "temp", "allchannels", "&bar"});
  EXPECT_THAT(resp, ErrArg("ERR Error in ACL SETUSER modifier '&bar': Adding a pattern after the * "
                           "pattern (or the 'allchannels' flag) is "
                           "not valid and does not have any effect. Try 'resetchannels' to start "
                           "with an empty list of channels"));

  resp = Run({"ACL", "SETUSER", "temp", "allchannels"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  vec = resp.GetVec();
  EXPECT_THAT(vec[8], "channels");
  EXPECT_THAT(vec[9], "&*");

  resp = Run({"ACL", "SETUSER", "temp", "resetchannels", "&foo"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"ACL", "GETUSER", "temp"});
  vec = resp.GetVec();
  EXPECT_THAT(vec[8], "channels");
  EXPECT_THAT(vec[9], "resetchannels &foo");

  resp =
      Run("ACL setuser demo on resetkeys resetchannels ~app|managed-resources|* "
          "&app|managed-resources|* +publish +ping >passwd");
  resp = Run("AUTH demo passwd");
  EXPECT_THAT(resp, "OK");

  resp = Run("publish app|managed-resources|xyz test");
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(AclFamilyTest, TestAlias) {
  auto resp = Run({"ACL", "SETUSER", "luke", "+___SET"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter +___SET"));

  resp = Run({"ACL", "SETUSER", "leia", "-___SET"});
  EXPECT_THAT(resp, ErrArg("ERR Unrecognized parameter -___SET"));

  resp = Run({"ACL", "SETUSER", "anakin", "+SET"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"ACL", "SETUSER", "jarjar", "allcommands"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"ACL", "DRYRUN", "jarjar", "___SET"});
  EXPECT_THAT(resp, ErrArg("ERR Command '___SET' not found"));
  EXPECT_EQ(Run({"ACL", "DRYRUN", "jarjar", "SET"}), "OK");
}

TEST_F(AclFamilyTest, TestAclLogUB) {
  auto resp = Run({"ACL", "LOG"});
  EXPECT_TRUE(resp.GetVec().empty());

  resp = Run({"ACL", "LOG", "2", "RESET"});
  EXPECT_THAT(resp, ErrArg("ERR index out of range"));
}

}  // namespace dfly


================================================
FILE: src/server/acl/acl_log.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/acl/acl_log.h"

#include <chrono>
#include <iterator>

#include "base/flags.h"
#include "base/logging.h"
#include "facade/dragonfly_connection.h"
#include "server/conn_context.h"

ABSL_FLAG(uint32_t, acllog_max_len, 32,
          "Specify the number of log entries. Logs are kept locally for each thread "
          "and therefore the total number of entries are acllog_max_len * threads");

namespace dfly::acl {

AclLog::AclLog() : total_entries_allowed_(absl::GetFlag(FLAGS_acllog_max_len)) {
}

void AclLog::Add(const ConnectionContext& cntx, std::string object, Reason reason,
                 std::string tried_to_auth) {
  if (total_entries_allowed_ == 0) {
    return;
  }

  if (log_.size() == total_entries_allowed_) {
    log_.pop_back();
  }

  std::string username;
  // We can't use a conditional here because the result is the common type which is a const-ref
  if (tried_to_auth.empty()) {
    username = cntx.authed_username;
  } else {
    username = std::move(tried_to_auth);
  }

  std::string client_info = cntx.conn()->GetClientInfo();
  using clock = std::chrono::system_clock;
  LogEntry entry = {std::move(username), std::move(client_info), std::move(object), reason,
                    clock::now()};
  log_.push_front(std::move(entry));
}

void AclLog::Reset() {
  log_.clear();
}

AclLog::LogType AclLog::GetLog(size_t number_of_entries) const {
  auto start = log_.begin();
  auto end = log_.size() <= number_of_entries ? log_.end() : std::next(start, number_of_entries);
  return {start, end};
}

void AclLog::SetTotalEntries(size_t total_entries) {
  if (log_.size() > total_entries) {
    log_.erase(std::next(log_.begin(), total_entries), log_.end());
  }

  total_entries_allowed_ = total_entries;
}

}  // namespace dfly::acl


================================================
FILE: src/server/acl/acl_log.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <chrono>
#include <deque>
#include <string>

namespace dfly {

class ConnectionContext;

namespace acl {

class AclLog {
 public:
  explicit AclLog();

  enum class Reason { COMMAND, AUTH, KEY, PUB_SUB };

  struct LogEntry {
    std::string username;
    std::string client_info;
    std::string object;
    Reason reason;
    using TimePoint = std::chrono::time_point<std::chrono::system_clock>;
    TimePoint entry_creation = TimePoint::max();

    friend bool operator<(const LogEntry& lhs, const LogEntry& rhs) {
      return lhs.entry_creation < rhs.entry_creation;
    }
  };

  void Add(const ConnectionContext& cntx, std::string object, Reason reason,
           std::string tried_to_auth = "");
  void Reset();

  using LogType = std::deque<LogEntry>;

  LogType GetLog(size_t number_of_entries) const;

  void SetTotalEntries(size_t total_entries);

 private:
  LogType log_;
  size_t total_entries_allowed_;
};

}  // namespace acl
}  // namespace dfly


================================================
FILE: src/server/acl/user.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/acl/user.h"

#include <openssl/sha.h>

#include <limits>

#include "absl/container/flat_hash_set.h"
#include "absl/strings/escaping.h"
#include "core/overloaded.h"

namespace dfly::acl {

namespace {
std::string StringSHA256(std::string_view password) {
  std::string hash;
  hash.resize(SHA256_DIGEST_LENGTH);
  SHA256(reinterpret_cast<const unsigned char*>(password.data()), password.size(),
         reinterpret_cast<unsigned char*>(hash.data()));
  return hash;
}

}  // namespace

User::User() {
  commands_ = std::vector<uint64_t>(NumberOfFamilies(), 0);
}

void User::Update(UpdateRequest&& req, const CategoryToIdxStore& cat_to_id,
                  const ReverseCategoryIndexTable& reverse_cat,
                  const CategoryToCommandsIndexStore& cat_to_commands) {
  for (auto& pass : req.passwords) {
    if (pass.nopass) {
      SetNopass();
      continue;
    }
    if (pass.unset) {
      UnsetPassword(pass.password);
      continue;
    }
    if (pass.reset_password) {
      password_hashes_.clear();
      continue;
    }
    SetPasswordHash(pass.password, pass.is_hashed);
  }

  auto cat_visitor = [&, this](UpdateRequest::CategoryValueType cat) {
    auto [sign, category] = cat;
    if (sign == Sign::PLUS) {
      SetAclCategoriesAndIncrSeq(category, cat_to_id, reverse_cat, cat_to_commands);
      return;
    }
    UnsetAclCategoriesAndIncrSeq(category, cat_to_id, reverse_cat, cat_to_commands);
  };

  auto cmd_visitor = [this](UpdateRequest::CommandsValueType cmd) {
    auto [sign, index, bit_index] = cmd;
    if (sign == Sign::PLUS) {
      SetAclCommandsAndIncrSeq(index, bit_index);
      return;
    }
    UnsetAclCommandsAndIncrSeq(index, bit_index);
  };

  Overloaded visitor{cat_visitor, cmd_visitor};

  for (auto req : req.updates) {
    std::visit(visitor, req);
  }

  if (!req.keys.empty()) {
    SetKeyGlobs(std::move(req.keys));
  }

  if (!req.pub_sub.empty()) {
    SetPubSub(std::move(req.pub_sub));
  }

  if (req.is_active) {
    SetIsActive(*req.is_active);
  }

  SetSelectDb(req.select_db);

  SetNamespace(req.ns);
}

void User::SetPasswordHash(std::string_view password, bool is_hashed) {
  nopass_ = false;
  if (is_hashed) {
    std::string binary;
    if (absl::HexStringToBytes(password, &binary)) {
      password_hashes_.insert(binary);
    } else {
      LOG(ERROR) << "Invalid password hash: " << password;
    }
    return;
  }
  password_hashes_.insert(StringSHA256(password));
}

void User::UnsetPassword(std::string_view password) {
  password_hashes_.erase(StringSHA256(password));
}

void User::SetNamespace(const std::string& ns) {
  namespace_ = ns;
}

void User::SetSelectDb(std::optional<size_t> db) {
  if (db) {
    db_ = *db;
  }
}

size_t User::Db() const {
  return db_;
}

const std::string& User::Namespace() const {
  return namespace_;
}

bool User::HasPassword(std::string_view password) const {
  if (nopass_) {
    return true;
  }
  return password_hashes_.contains(StringSHA256(password));
}

void User::SetAclCategoriesAndIncrSeq(uint32_t cat, const CategoryToIdxStore& cat_to_id,
                                      const ReverseCategoryIndexTable& reverse_cat,
                                      const CategoryToCommandsIndexStore& cat_to_commands) {
  acl_categories_ |= cat;
  if (cat == acl::ALL) {
    SetAclCommands(std::numeric_limits<size_t>::max(), 0);
  } else {
    auto id = cat_to_id.at(cat);
    std::string_view name = reverse_cat[id];
    const auto& commands_group = cat_to_commands.at(name);
    for (size_t fam_id = 0; fam_id < commands_group.size(); ++fam_id) {
      SetAclCommands(fam_id, commands_group[fam_id]);
    }
  }

  CategoryChange change{cat};
  cat_changes_[change] = ChangeMetadata{Sign::PLUS, seq_++};
}

void User::UnsetAclCategoriesAndIncrSeq(uint32_t cat, const CategoryToIdxStore& cat_to_id,
                                        const ReverseCategoryIndexTable& reverse_cat,
                                        const CategoryToCommandsIndexStore& cat_to_commands) {
  acl_categories_ ^= cat;
  if (cat == acl::ALL) {
    UnsetAclCommands(std::numeric_limits<size_t>::max(), 0);
  } else {
    auto id = cat_to_id.at(cat);
    std::string_view name = reverse_cat[id];
    const auto& commands_group = cat_to_commands.at(name);
    for (size_t fam_id = 0; fam_id < commands_group.size(); ++fam_id) {
      UnsetAclCommands(fam_id, commands_group[fam_id]);
    }
  }

  CategoryChange change{cat};
  cat_changes_[change] = ChangeMetadata{Sign::MINUS, seq_++};
}

void User::SetAclCommands(size_t index, uint64_t bit_index) {
  if (index == std::numeric_limits<size_t>::max()) {
    for (auto& family : commands_) {
      family = ALL_COMMANDS;
    }
    return;
  }
  commands_[index] |= bit_index;
}

void User::SetAclCommandsAndIncrSeq(size_t index, uint64_t bit_index) {
  SetAclCommands(index, bit_index);
  CommandChange change{index, bit_index};
  cmd_changes_[change] = ChangeMetadata{Sign::PLUS, seq_++};
}

void User::UnsetAclCommands(size_t index, uint64_t bit_index) {
  if (index == std::numeric_limits<size_t>::max()) {
    for (auto& family : commands_) {
      family = NONE_COMMANDS;
    }
    return;
  }
  SetAclCommands(index, bit_index);
  commands_[index] ^= bit_index;
}

void User::UnsetAclCommandsAndIncrSeq(size_t index, uint64_t bit_index) {
  UnsetAclCommands(index, bit_index);
  CommandChange change{index, bit_index};
  cmd_changes_[change] = ChangeMetadata{Sign::MINUS, seq_++};
}

uint32_t User::AclCategory() const {
  return acl_categories_;
}

std::vector<uint64_t> User::AclCommands() const {
  return commands_;
}

const std::vector<uint64_t>& User::AclCommandsRef() const {
  return commands_;
}

void User::SetIsActive(bool is_active) {
  is_active_ = is_active;
}

bool User::IsActive() const {
  return is_active_;
}

const absl::flat_hash_set<std::string>& User::Passwords() const {
  return password_hashes_;
}

bool User::HasNopass() const {
  return nopass_;
}

const AclKeys& User::Keys() const {
  return keys_;
}

const AclPubSub& User::PubSub() const {
  return pub_sub_;
}

const User::CategoryChanges& User::CatChanges() const {
  return cat_changes_;
}

const User::CommandChanges& User::CmdChanges() const {
  return cmd_changes_;
}

void User::SetKeyGlobs(std::vector<UpdateKey> keys) {
  for (auto& key : keys) {
    if (key.all_keys) {
      keys_.key_globs.clear();
      keys_.all_keys = true;
    } else if (key.reset_keys) {
      keys_.key_globs.clear();
      keys_.all_keys = false;
    } else {
      keys_.key_globs.push_back({std::move(key.key), key.op});
    }
  }
}

void User::SetPubSub(std::vector<UpdatePubSub> pub_sub) {
  for (auto& pattern : pub_sub) {
    if (pattern.all_channels) {
      pub_sub_.globs.clear();
      pub_sub_.all_channels = true;
    } else if (pattern.reset_channels) {
      pub_sub_.globs.clear();
      pub_sub_.all_channels = false;
    } else {
      pub_sub_.globs.push_back({std::move(pattern.pattern), pattern.has_asterisk});
    }
  }
}

void User::SetNopass() {
  nopass_ = true;
  password_hashes_.clear();
}

}  // namespace dfly::acl


================================================
FILE: src/server/acl/user.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <limits>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "server/acl/acl_commands_def.h"

namespace dfly::acl {

class User final {
 public:
  enum class Sign : int8_t { PLUS, MINUS };

  struct UpdateKey {
    std::string key;
    KeyOp op;
    bool all_keys = false;
    bool reset_keys = false;
  };

  struct UpdatePass {
    std::string password;
    // Set to denote remove password
    bool unset{false};
    bool nopass{false};
    bool reset_password{false};
    bool is_hashed{false};
  };

  struct UpdatePubSub {
    std::string pattern;
    bool has_asterisk{false};
    bool all_channels{false};
    bool reset_channels{false};
  };

  struct UpdateRequest {
    std::vector<UpdatePass> passwords;

    std::optional<bool> is_active{};

    bool is_hashed{false};

    // Categories and commands
    using CategoryValueType = std::pair<Sign, uint32_t>;
    // If index s numberic_limits::max() then it's a +all flag
    using CommandsValueType = std::tuple<Sign, size_t /*index*/, uint64_t /*bit*/>;
    using UpdateType = std::vector<std::variant<CategoryValueType, CommandsValueType>>;
    UpdateType updates;

    // keys
    std::vector<UpdateKey> keys;
    bool reset_all_keys{false};
    bool allow_all_keys{false};

    // pub/sub
    std::vector<UpdatePubSub> pub_sub;
    bool reset_channels{false};
    bool all_channels{false};

    // TODO allow reset all
    // bool reset_all{false};

    // DFLY specific
    std::optional<size_t> select_db;
    std::string ns;
  };

  using CategoryChange = uint32_t;
  using CommandChange = std::pair<size_t, uint64_t>;

  struct ChangeMetadata {
    Sign sign;
    size_t seq_no;
  };

  /* Used for default user
   * password = nopass
   * acl_categories = +@all
   * is_active = true;
   */
  User();

  User(const User&) = delete;
  User(User&&) = default;

  // For single step updates
  void Update(UpdateRequest&& req, const CategoryToIdxStore& cat_to_id,
              const ReverseCategoryIndexTable& reverse_cat,
              const CategoryToCommandsIndexStore& cat_to_commands);

  bool HasPassword(std::string_view password) const;

  uint32_t AclCategory() const;

  std::vector<uint64_t> AclCommands() const;
  const std::vector<uint64_t>& AclCommandsRef() const;

  bool IsActive() const;

  const absl::flat_hash_set<std::string>& Passwords() const;

  bool HasNopass() const;

  // Selector maps a command string (like HSET, SET etc) to
  // its respective ID within the commands vector.
  static size_t Selector(std::string_view);

  const AclKeys& Keys() const;

  const AclPubSub& PubSub() const;

  const std::string& Namespace() const;

  size_t Db() const;

  using CategoryChanges = absl::flat_hash_map<CategoryChange, ChangeMetadata>;
  using CommandChanges = absl::flat_hash_map<CommandChange, ChangeMetadata>;

  const CategoryChanges& CatChanges() const;

  const CommandChanges& CmdChanges() const;

 private:
  void SetAclCategoriesAndIncrSeq(uint32_t cat, const CategoryToIdxStore& cat_to_id,
                                  const ReverseCategoryIndexTable& reverse_cat,
                                  const CategoryToCommandsIndexStore& cat_to_commands);
  void UnsetAclCategoriesAndIncrSeq(uint32_t cat, const CategoryToIdxStore& cat_to_id,
                                    const ReverseCategoryIndexTable& reverse_cat,
                                    const CategoryToCommandsIndexStore& cat_to_commands);

  // For ACL commands
  void SetAclCommands(size_t index, uint64_t bit_index);
  void UnsetAclCommands(size_t index, uint64_t bit_index);

  void SetAclCommandsAndIncrSeq(size_t index, uint64_t bit_index);
  void UnsetAclCommandsAndIncrSeq(size_t index, uint64_t bit_index);

  // For is_active flag
  void SetIsActive(bool is_active);

  // For passwords
  void SetPasswordHash(std::string_view password, bool is_hashed);
  void UnsetPassword(std::string_view password);

  // For ACL key globs
  void SetKeyGlobs(std::vector<UpdateKey> keys);

  // For ACL pub/sub
  void SetPubSub(std::vector<UpdatePubSub> pub_sub);

  void SetNamespace(const std::string& ns);

  void SetSelectDb(std::optional<size_t> db);

  // Set NOPASS and remove all passwords
  void SetNopass();

  // Passwords for each user
  absl::flat_hash_set<std::string> password_hashes_;
  // if `nopass` is used
  bool nopass_ = false;

  uint32_t acl_categories_{NONE};
  // Each element index in the vector corresponds to a familly of commands
  // Each bit in the uin64_t field at index id, corresponds to a specific
  // command of that family. Look on TableCommandBuilder and on Service::Register
  // on how this mapping is built during the startup/registration of commands
  std::vector<uint64_t> commands_;

  // We also need to track all the explicit changes (ACL SETUSER) of acl's in-order.
  // To speed up insertion we use the flat_hash_map and a seq_ variable which is a
  // strictly monotonically increasing number that is used for ordering. Both of these
  // indexers are merged and then sorted by the seq_ number when for example we print
  // the ACL rules of each user via ACL LIST.
  CategoryChanges cat_changes_;
  CommandChanges cmd_changes_;
  // Global modification order for changes in rules for acl commands and categories
  size_t seq_ = 0;

  // Glob patterns for the keys that a user is allowed to read/write
  AclKeys keys_;

  // Glob patterns for pub/sub channels
  AclPubSub pub_sub_;

  // if the user is on/off
  bool is_active_{false};

  std::string namespace_;

  // if db == std::numeric_limits<size_t>::max() then all db's.
  // Otherwise user restricted to the value of db_
  size_t db_{std::numeric_limits<size_t>::max()};
};

}  // namespace dfly::acl


================================================
FILE: src/server/acl/user_registry.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/acl/user_registry.h"

#include <limits>
#include <mutex>

#include "base/flags.h"
#include "facade/facade_types.h"
#include "server/acl/acl_commands_def.h"

ABSL_DECLARE_FLAG(std::string, requirepass);

using namespace util;

namespace dfly::acl {

void UserRegistry::MaybeAddAndUpdate(std::string_view username, User::UpdateRequest req) {
  std::unique_lock<fb2::SharedMutex> lock(mu_);
  auto& user = registry_[username];
  user.Update(std::move(req), *cat_to_id_table_, *reverse_cat_table_, *cat_to_commands_table_);
}

bool UserRegistry::RemoveUser(std::string_view username) {
  std::unique_lock<fb2::SharedMutex> lock(mu_);
  return registry_.erase(username);
}

UserCredentials UserRegistry::GetCredentials(std::string_view username) const {
  std::shared_lock<fb2::SharedMutex> lock(mu_);
  auto it = registry_.find(username);
  if (it == registry_.end()) {
    return {};
  }
  auto& user = it->second;
  return {user.AclCategory(), user.AclCommands(), user.Keys(),
          user.PubSub(),      user.Namespace(),   user.Db()};
}

bool UserRegistry::IsUserActive(std::string_view username) const {
  std::shared_lock<fb2::SharedMutex> lock(mu_);
  auto it = registry_.find(username);
  if (it == registry_.end()) {
    return false;
  }
  return it->second.IsActive();
}

bool UserRegistry::AuthUser(std::string_view username, std::string_view password) const {
  std::shared_lock<fb2::SharedMutex> lock(mu_);
  const auto& user = registry_.find(username);
  if (user == registry_.end()) {
    return false;
  }

  return user->second.IsActive() && user->second.HasPassword(password);
}

UserRegistry::RegistryViewWithLock UserRegistry::GetRegistryWithLock() const {
  std::shared_lock<fb2::SharedMutex> lock(mu_);
  return {std::move(lock), registry_};
}

UserRegistry::RegistryWithWriteLock UserRegistry::GetRegistryWithWriteLock() {
  std::unique_lock<fb2::SharedMutex> lock(mu_);
  return {std::move(lock), registry_};
}

UserRegistry::UserWithWriteLock::UserWithWriteLock(std::unique_lock<fb2::SharedMutex> lk,
                                                   const User& user, bool exists)
    : user(user), exists(exists), registry_lk_(std::move(lk)) {
}

User::UpdateRequest UserRegistry::DefaultUserUpdateRequest() const {
  // Assign field by field to supress an annoying compiler warning
  User::UpdateRequest req;
  req.passwords = std::vector<User::UpdatePass>{{"", false, true}};
  req.is_active = true;
  req.updates = {std::pair<User::Sign, uint32_t>{User::Sign::PLUS, acl::ALL}};
  req.keys = {User::UpdateKey{"~*", KeyOp::READ_WRITE, true, false}};
  req.pub_sub = {User::UpdatePubSub{"", false, true, false}};
  return req;
}

void UserRegistry::Init(const CategoryToIdxStore* cat_to_id_table,
                        const ReverseCategoryIndexTable* reverse_cat_table,
                        const CategoryToCommandsIndexStore* cat_to_commands_table) {
  // if there exists an acl file to load from, requirepass
  // will not overwrite the default's user password loaded from
  // that file. Loading the default's user password from a file
  // has higher priority than the deprecated flag
  cat_to_id_table_ = cat_to_id_table;
  reverse_cat_table_ = reverse_cat_table;
  cat_to_commands_table_ = cat_to_commands_table;
  auto default_user = DefaultUserUpdateRequest();
  auto maybe_password = absl::GetFlag(FLAGS_requirepass);
  if (!maybe_password.empty()) {
    default_user.passwords.front().password = std::move(maybe_password);
    default_user.passwords.front().nopass = false;
  } else if (const char* env_var = getenv("DFLY_PASSWORD"); env_var) {
    default_user.passwords.front().password = env_var;
    default_user.passwords.front().nopass = false;
  } else if (const char* env_var = getenv("DFLY_requirepass"); env_var) {
    default_user.passwords.front().password = env_var;
    default_user.passwords.front().nopass = false;
  }
  MaybeAddAndUpdate("default", std::move(default_user));
}

}  // namespace dfly::acl


================================================
FILE: src/server/acl/user_registry.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <algorithm>
#include <shared_mutex>
#include <string>
#include <utility>
#include <vector>

#include "server/acl/user.h"
#include "util/fibers/synchronization.h"

namespace dfly::acl {

class UserRegistry {
 private:
  template <template <typename T> typename LockT, typename RegT> class RegistryWithLock;

 public:
  UserRegistry() = default;

  UserRegistry(const UserRegistry&) = delete;
  UserRegistry(UserRegistry&&) = delete;

  void Init(const CategoryToIdxStore* cat_to_id_table,
            const ReverseCategoryIndexTable* reverse_cat_table,
            const CategoryToCommandsIndexStore* cat_to_commands_table);

  using RegistryType = absl::flat_hash_map<std::string, User>;

  // Acquires a write lock of mu_
  // If the user with name `username` does not exist, it's added in the store with
  // the exact fields found in req
  // If the user exists, the bitfields are updated with a `logical and` operation
  void MaybeAddAndUpdate(std::string_view username, User::UpdateRequest req);

  // Acquires a write lock on mu_
  // Removes user from the store
  // kills already existing connections from the removed user
  bool RemoveUser(std::string_view username);

  // Acquires a read lock
  UserCredentials GetCredentials(std::string_view username) const;

  // Acquires a read lock
  bool IsUserActive(std::string_view username) const;

  // Acquires a read lock
  bool AuthUser(std::string_view username, std::string_view password) const;

  using RegistryViewWithLock = RegistryWithLock<std::shared_lock, const RegistryType&>;
  using RegistryWithWriteLock = RegistryWithLock<std::unique_lock, RegistryType&>;

  // Helper function used for printing users via ACL LIST
  RegistryViewWithLock GetRegistryWithLock() const;

  // Helper function to propagate a write lock outside the registry's scope
  RegistryWithWriteLock GetRegistryWithWriteLock();

  // Helper class for accessing a user with a ReadLock outside the scope of UserRegistry
  class UserWithWriteLock {
   public:
    UserWithWriteLock(std::unique_lock<util::fb2::SharedMutex> lk, const User& user, bool exists);
    const User& user;
    const bool exists;

   private:
    std::unique_lock<util::fb2::SharedMutex> registry_lk_;
  };

  User::UpdateRequest DefaultUserUpdateRequest() const;

 private:
  RegistryType registry_;
  mutable util::fb2::SharedMutex mu_;

  // Helper class for accessing the registry with a ReadLock outside the scope of UserRegistry
  template <template <typename T> typename LockT, typename RegT> class RegistryWithLock {
   public:
    RegistryWithLock(LockT<util::fb2::SharedMutex> lk, RegT reg)
        : registry(reg), registry_lk_(std::move(lk)) {
    }
    RegT registry;

   private:
    LockT<util::fb2::SharedMutex> registry_lk_;
  };

  const CategoryToIdxStore* cat_to_id_table_;
  const ReverseCategoryIndexTable* reverse_cat_table_;
  const CategoryToCommandsIndexStore* cat_to_commands_table_;
};

}  // namespace dfly::acl


================================================
FILE: src/server/acl/validator.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/acl/validator.h"

#include <absl/strings/numbers.h>

#include "base/logging.h"
#include "core/glob_matcher.h"
#include "facade/dragonfly_connection.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/server_state.h"
#include "server/transaction.h"

namespace dfly::acl {

namespace {

bool Matches(std::string_view pattern, std::string_view target) {
  GlobMatcher matcher(pattern, true);
  return matcher.Matches(target);
};

bool ValidateCommand(const std::vector<uint64_t>& acl_commands, const CommandId& id) {
  const size_t index = id.GetFamily();
  const uint64_t command_mask = id.GetBitIndex();
  DCHECK_LT(index, acl_commands.size());

  return (acl_commands[index] & command_mask) != 0;
}

[[nodiscard]] std::pair<bool, AclLog::Reason> IsPubSubCommandAuthorized(
    bool literal_match, const std::vector<uint64_t>& acl_commands, const AclPubSub& pub_sub,
    CmdArgList tail_args, const CommandId& id) {
  if (!ValidateCommand(acl_commands, id)) {
    return {false, AclLog::Reason::COMMAND};
  }

  auto iterate_globs = [&](std::string_view target) {
    for (auto& [glob, has_asterisk] : pub_sub.globs) {
      if (literal_match && (glob == target)) {
        return true;
      }
      if (!literal_match && Matches(glob, target)) {
        return true;
      }
    }
    return false;
  };

  bool allowed = true;
  if (!pub_sub.all_channels) {
    std::string_view name = id.name();
    if (name == "PUBLISH" || name == "SPUBLISH") {
      auto channel = tail_args[0];
      allowed &= iterate_globs(facade::ToSV(channel));
    } else {
      for (auto channel : tail_args) {
        allowed &= iterate_globs(facade::ToSV(channel));
      }
    }
  }

  return {allowed, AclLog::Reason::PUB_SUB};
}

}  // namespace

[[nodiscard]] bool IsUserAllowedToInvokeCommand(const ConnectionContext& cntx, const CommandId& id,
                                                ArgSlice tail_args) {
  if (cntx.skip_acl_validation) {
    return true;
  }

  if (id.IsAlias()) {
    return false;
  }

  std::pair<bool, AclLog::Reason> auth_res;

  if (auto pkind = id.PubSubKind(); pkind) {
    bool is_pattern = *pkind == CO::PubSubKind::PATTERN;
    auth_res =
        IsPubSubCommandAuthorized(is_pattern, cntx.acl_commands, cntx.pub_sub, tail_args, id);
  } else {
    auth_res = IsUserAllowedToInvokeCommandGeneric(cntx, id, tail_args);
  }

  const auto [is_authed, reason] = auth_res;

  if (!is_authed) {
    auto& log = ServerState::tlocal()->acl_log;
    log.Add(cntx, std::string(id.name()), reason);
  }

  return is_authed;
}

[[nodiscard]] std::pair<bool, AclLog::Reason> IsUserAllowedToInvokeCommandGeneric(
    const ConnectionContext& cntx, const CommandId& id, CmdArgList tail_args) {
  const size_t max = std::numeric_limits<size_t>::max();
  // Once we support ranges this must change
  const bool reject_move_command = cntx.acl_db_idx != max && id.name() == "MOVE";
  const bool reject_trans_command =
      cntx.acl_db_idx != max && cntx.acl_db_idx != cntx.db_index() && id.IsTransactional();
  if (reject_move_command || reject_trans_command) {
    return {false, AclLog::Reason::AUTH};
  }
  size_t res = 0;
  if (tail_args.size() == 1 && id.name() == "SELECT" && absl::SimpleAtoi(tail_args[0], &res) &&
      cntx.acl_db_idx != max && cntx.acl_db_idx != res) {
    return {false, AclLog::Reason::AUTH};
  }

  const auto& acl_commands = cntx.acl_commands;
  const auto& keys = cntx.keys;
  if (!ValidateCommand(acl_commands, id)) {
    return {false, AclLog::Reason::COMMAND};
  }

  const bool is_read_command = id.IsReadOnly();
  const bool is_write_command = id.IsJournaled();

  auto iterate_globs = [&](auto target) {
    for (auto& [elem, op] : keys.key_globs) {
      if (Matches(elem, target)) {
        if (is_read_command && (op == KeyOp::READ || op == KeyOp::READ_WRITE)) {
          return true;
        }
        if (is_write_command && (op == KeyOp::WRITE || op == KeyOp::READ_WRITE)) {
          return true;
        }
      }
    }
    return false;
  };

  bool keys_allowed = true;
  if (!keys.all_keys && id.first_key_pos() != 0 && (is_read_command || is_write_command)) {
    auto keys_index = DetermineKeys(&id, tail_args);
    DCHECK(keys_index);

    for (std::string_view key : keys_index->Range(tail_args))
      keys_allowed &= iterate_globs(key);
  }

  return {keys_allowed, AclLog::Reason::KEY};
}

}  // namespace dfly::acl


================================================
FILE: src/server/acl/validator.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <utility>

#include "facade/facade_types.h"
#include "server/acl/acl_log.h"
#include "server/command_registry.h"

namespace dfly::acl {

struct AclKeys;
struct AclPubSub;

std::pair<bool, AclLog::Reason> IsUserAllowedToInvokeCommandGeneric(const ConnectionContext& cntx,
                                                                    const CommandId& id,
                                                                    facade::CmdArgList tail_args);

bool IsUserAllowedToInvokeCommand(const ConnectionContext& cntx, const CommandId& id,
                                  facade::CmdArgList tail_args);
}  // namespace dfly::acl


================================================
FILE: src/server/bitops_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/ascii.h>
#include <absl/strings/match.h>

#include <nonstd/expected.hpp>

#include "base/logging.h"
#include "facade/cmd_arg_parser.h"
#include "facade/op_status.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/namespaces.h"
#include "server/transaction.h"
#include "src/core/overloaded.h"
#include "util/varz.h"

namespace dfly {
using namespace facade;
using namespace std;

namespace {

using ShardStringResults = vector<OpResult<string>>;
const int32_t OFFSET_FACTOR = 8;  // number of bits in byte
const char* OR_OP_NAME = "OR";
const char* XOR_OP_NAME = "XOR";
const char* AND_OP_NAME = "AND";
const char* NOT_OP_NAME = "NOT";

using BitsStrVec = vector<string>;

// The following is the list of the functions that would handle the
// commands that handle the bit operations
void BitPos(CmdArgList args, CommandContext* cmd_cntx);
void BitCount(CmdArgList args, CommandContext* cmd_cntx);
void BitField(CmdArgList args, CommandContext* cmd_cntx);
void BitFieldRo(CmdArgList args, CommandContext* cmd_cntx);
void BitOp(CmdArgList args, CommandContext* cmd_cntx);
void GetBit(CmdArgList args, CommandContext* cmd_cntx);
void SetBit(CmdArgList args, CommandContext* cmd_cntx);

OpResult<string> ReadValue(const DbContext& context, string_view key, EngineShard* shard);
OpResult<bool> ReadValueBitsetAt(const OpArgs& op_args, string_view key, uint32_t offset);
OpResult<std::size_t> CountBitsForValue(const OpArgs& op_args, string_view key, int64_t start,
                                        int64_t end, bool bit_value);
OpResult<int64_t> FindFirstBitWithValue(const OpArgs& op_args, string_view key, bool value,
                                        int64_t start, int64_t end, bool as_bit);
string GetString(const PrimeValue& pv);
bool SetBitValue(uint32_t offset, bool bit_value, string* entry);
std::size_t CountBitSetByByteIndices(string_view at, std::size_t start, std::size_t end);
std::size_t CountBitSet(string_view str, int64_t start, int64_t end, bool bits);
std::size_t CountBitSetByBitIndices(string_view at, std::size_t start, std::size_t end);
string RunBitOperationOnValues(string_view op, const BitsStrVec& values);

// ------------------------------------------------------------------------- //

// This function can be used for any case where we allowing out of bound
// access where the default in this case would be 0 -such as bitop
uint8_t GetByteAt(string_view s, std::size_t at) {
  return at >= s.size() ? 0 : s[at];
}

// For XOR, OR, AND operations on a collection of bytes
template <typename BitOp, typename SkipOp>
string BitOpString(BitOp operation_f, SkipOp skip_f, const BitsStrVec& values, string new_value) {
  // at this point, values are not empty
  std::size_t max_size = new_value.size();

  if (values.size() > 1) {
    for (std::size_t i = 0; i < max_size; i++) {
      std::uint8_t new_entry = operation_f(GetByteAt(values[0], i), GetByteAt(values[1], i));
      for (std::size_t j = 2; j < values.size(); ++j) {
        new_entry = operation_f(new_entry, GetByteAt(values[j], i));
        if (skip_f(new_entry)) {
          break;
        }
      }
      new_value[i] = new_entry;
    }
    return new_value;
  } else {
    return values[0];
  }
}

// Helper functions to support operations
// so we would not need to check which
// operations to run in the look (unlike
// https://github.com/redis/redis/blob/c2b0c13d5c0fab49131f6f5e844f80bfa43f6219/src/bitops.c#L607)
constexpr bool SkipAnd(uint8_t byte) {
  return byte == 0x0;
}

constexpr bool SkipOr(uint8_t byte) {
  return byte == 0xff;
}

constexpr bool SkipXor(uint8_t) {
  return false;
}

constexpr uint8_t AndOp(uint8_t left, uint8_t right) {
  return left & right;
}

constexpr uint8_t OrOp(uint8_t left, uint8_t right) {
  return left | right;
}

constexpr uint8_t XorOp(uint8_t left, uint8_t right) {
  return left ^ right;
}

string BitOpNotString(string from) {
  std::transform(from.begin(), from.end(), from.begin(), [](auto c) { return ~c; });
  return from;
}

//  Bits manipulation functions
constexpr int32_t GetBitIndex(uint32_t offset) noexcept {
  return offset % OFFSET_FACTOR;
}

constexpr int32_t GetNormalizedBitIndex(uint32_t offset) noexcept {
  return (OFFSET_FACTOR - 1) - GetBitIndex(offset);
}

constexpr int32_t GetByteIndex(uint32_t offset) noexcept {
  return offset / OFFSET_FACTOR;
}

uint8_t GetByteValue(string_view str, uint32_t offset) {
  return static_cast<uint8_t>(str[GetByteIndex(offset)]);
}

constexpr bool CheckBitStatus(uint8_t byte, uint32_t offset) {
  return byte & (0x1 << offset);
}

constexpr std::uint8_t CountBitsRange(std::uint8_t byte, std::uint8_t from, uint8_t to) {
  int count = 0;
  for (int i = from; i < to; i++) {
    count += CheckBitStatus(byte, GetNormalizedBitIndex(i));
  }
  return count;
}

// Count the number of bits that are on, on bytes boundaries: i.e. Start and end are the indices for
// bytes locations inside str CountBitSetByByteIndices
std::size_t CountBitSetByByteIndices(string_view at, std::size_t start, std::size_t end) {
  if (start >= end) {
    return 0;
  }
  end = std::min(end, at.size());  // don't overflow
  std::uint32_t count =
      std::accumulate(std::next(at.begin(), start), std::next(at.begin(), end), 0,
                      [](auto counter, uint8_t ch) { return counter + absl::popcount(ch); });
  return count;
}

// Count the number of bits that are on, on bits boundaries: i.e. Start and end are the indices for
// bits locations inside str
std::size_t CountBitSetByBitIndices(string_view at, std::size_t start, std::size_t end) {
  auto first_byte_index = GetByteIndex(start);
  auto last_byte_index = GetByteIndex(end);
  if (start % OFFSET_FACTOR == 0 && end % OFFSET_FACTOR == 0) {
    return CountBitSetByByteIndices(at, first_byte_index, last_byte_index);
  }
  const auto last_bit_first_byte =
      first_byte_index != last_byte_index ? OFFSET_FACTOR : GetBitIndex(end);
  const auto first_byte = GetByteValue(at, start);
  std::uint32_t count = CountBitsRange(first_byte, GetBitIndex(start), last_bit_first_byte);
  if (first_byte_index < last_byte_index) {
    first_byte_index++;
    const auto last_byte = GetByteValue(at, end);
    count += CountBitsRange(last_byte, 0, GetBitIndex(end));
    count += CountBitSetByByteIndices(at, first_byte_index, last_byte_index);
  }
  return count;
}

// Returns normalized offset of `offset` in `size`. `size` is assumed to be a size of a container,
// and as such the returned value is always in the range [0, size]. If `offset` is negative, it is
// treated as an offset from the end and is normalized to be a positive offset from the start.
int64_t NormalizedOffset(int64_t size, int64_t offset) {
  if (offset < 0) {
    offset = size + offset;
  }
  return std::min(std::max(offset, int64_t{0}), size);
}

// General purpose function to count the number of bits that are on.
// The parameters for start, end and bits are defaulted to the start of the string,
// end of the string and bits are false.
// Note that when bits is false, it means that we are looking on byte boundaries.
std::size_t CountBitSet(string_view str, int64_t start, int64_t end, bool bits) {
  const int64_t strlen = bits ? str.size() * OFFSET_FACTOR : str.size();

  if (start < 0)
    start = strlen + start;
  if (end < 0)
    end = strlen + end;

  end = min(end, strlen);

  if (strlen == 0 || start > end)
    return 0;

  start = max(start, int64_t(0));
  end = max(end, int64_t(0));

  ++end;
  return bits ? CountBitSetByBitIndices(str, start, end)
              : CountBitSetByByteIndices(str, start, end);
}

// return true if bit is on
bool GetBitValue(const string& entry, uint32_t offset) {
  const auto byte_val{GetByteValue(entry, offset)};
  const auto index{GetNormalizedBitIndex(offset)};
  return CheckBitStatus(byte_val, index);
}

constexpr uint8_t TurnBitOn(uint8_t on, uint32_t offset) {
  return on |= 1 << offset;
}

constexpr uint8_t TurnBitOff(uint8_t on, uint32_t offset) {
  return on &= ~(1 << offset);
}

bool SetBitValue(uint32_t offset, bool bit_value, string* entry) {
  // we need to return the old value after setting the value for offset
  const auto old_value{GetBitValue(*entry, offset)};  // save this as the return value
  auto byte{GetByteValue(*entry, offset)};
  const auto bit_index{GetNormalizedBitIndex(offset)};
  byte = bit_value ? TurnBitOn(byte, bit_index) : TurnBitOff(byte, bit_index);
  (*entry)[GetByteIndex(offset)] = byte;
  return old_value;
}

// ------------------------------------------------------------------------- //

class ElementAccess {
 private:
  string_view key_;
  DbContext context_;
  mutable DbSlice::ItAndUpdater updater_;

 public:
  ElementAccess(string_view key, const OpArgs& args) : key_{key}, context_{args.db_cntx} {
  }

  /* If allow_wrong_type = true - it still finds the element even if it's WRONG_TYPE. This is used
     for blind updates. See BITOP operation. */
  OpStatus Find(bool allow_wrong_type);

  bool IsNewEntry() const {
    return updater_.is_new;
  }

  string Value() const;

  bool GetByteAtIndex(size_t idx, uint8_t* res) const;
  void SetByteAtIndex(size_t idx, uint8_t value) const;

  void Commit(string_view new_value) const;

  // return nullopt when key exists but it's not encoded as string
  // return true if key exists and false if it doesn't
  std::optional<bool> Exists();
};

std::optional<bool> ElementAccess::Exists() {
  auto& db_slice = context_.ns->GetCurrentDbSlice();
  auto res = db_slice.FindReadOnly(context_, key_, OBJ_STRING);
  if (res.status() == OpStatus::WRONG_TYPE) {
    return {};
  }
  return res.status() != OpStatus::KEY_NOTFOUND;
}

OpStatus ElementAccess::Find(bool allow_wrong_type) {
  auto& db_slice = context_.ns->GetCurrentDbSlice();
  // If we allow wrong type, we use nullopt to indicate that we don't care about the type.
  auto op_res = db_slice.AddOrFind(
      context_, key_, allow_wrong_type ? std::nullopt : std::optional<unsigned>{OBJ_STRING});
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  updater_ = std::move(add_res);

  return OpStatus::OK;
}

string ElementAccess::Value() const {
  return IsNewEntry() ? string{} : GetString(updater_.it->second);
}

bool ElementAccess::GetByteAtIndex(size_t idx, uint8_t* res) const {
  DCHECK(!IsNewEntry());
  return updater_.it->second.GetByteAtIndex(idx, res);
}

void ElementAccess::SetByteAtIndex(size_t idx, uint8_t val) const {
  DCHECK(!IsNewEntry());
  DCHECK_LT(idx, updater_.it->second.Size());
  auto [success, _] = updater_.it->second.SetByteAtIndex(idx, val);
  if (success) {
    updater_.post_updater.Run();
  }
}

void ElementAccess::Commit(string_view new_value) const {
  if (new_value.empty()) {
    if (!IsNewEntry()) {
      updater_.post_updater.Run();
    } else {
      // No need to run, it was a new entry and it got removed
      updater_.post_updater.Cancel();
    }
    context_.ns->GetCurrentDbSlice().Del(context_, updater_.it);
  } else {
    if (!IsNewEntry() && updater_.it->second.ObjType() != OBJ_STRING) {
      updater_.post_updater.ReduceHeapUsage();
    }
    updater_.it->second.SetString(new_value);
    updater_.post_updater.Run();
  }
}

// =============================================
// Set a new value to a given bit

OpResult<bool> BitNewValue(const OpArgs& args, string_view key, uint32_t offset, bool bit_value) {
  ElementAccess element_access{key, args};
  auto& db_slice = args.GetDbSlice();
  DCHECK(db_slice.IsDbValid(args.db_cntx.db_index));
  bool old_value = false;

  auto find_res = element_access.Find(false);

  if (find_res != OpStatus::OK) {
    VLOG(1) << "Find failed for key: " << key << " with error: " << find_res;
    return find_res;
  }

  const size_t byte_index = GetByteIndex(offset);

  // Create a new entry
  if (element_access.IsNewEntry()) {
    VLOG(2) << "Creating new key: " << key << " with size: " << (byte_index + 1) << " bytes";
    string new_entry(byte_index + 1, 0);
    old_value = SetBitValue(offset, bit_value, &new_entry);
    element_access.Commit(new_entry);
    return old_value;
  }

  // Get byte where bit offset is located. If offset is out of bound it means
  // that we need to extend the string otherwise we just update.
  uint8_t existing_byte;
  if (element_access.GetByteAtIndex(byte_index, &existing_byte)) {
    VLOG(2) << "Updating key: " << key << " at byte index: " << byte_index;
    uint32_t bit_index = GetNormalizedBitIndex(offset);
    old_value = CheckBitStatus(existing_byte, bit_index);
    if (old_value != bit_value) {
      existing_byte =
          bit_value ? TurnBitOn(existing_byte, bit_index) : TurnBitOff(existing_byte, bit_index);
      element_access.SetByteAtIndex(byte_index, existing_byte);
    }
  } else {
    VLOG(2) << "Extending key: " << key << " to " << (byte_index + 1) << " bytes";
    string existing_entry{element_access.Value()};
    existing_entry.resize(byte_index + 1, 0);
    SetBitValue(offset, bit_value, &existing_entry);
    // We always need to commit the extended key
    element_access.Commit(existing_entry);
  }

  return old_value;
}

// ---------------------------------------------------------

string RunBitOperationOnValues(string_view op, const BitsStrVec& values) {
  // This function accept an operation (either OR, XOR, NOT or OR), and run bit operation
  // on all the values we got from the database. Note that in case that one of the values
  // is shorter than the other it would return a 0 and the operation would continue
  // until we ran the longest value. The function will return the resulting new value
  std::size_t max_len = 0;
  std::size_t max_len_index = 0;

  const auto BitOperation = [&]() {
    if (op == OR_OP_NAME) {
      string default_str{values[max_len_index]};
      return BitOpString(OrOp, SkipOr, values, std::move(default_str));
    } else if (op == XOR_OP_NAME) {
      return BitOpString(XorOp, SkipXor, values, string(max_len, 0));
    } else if (op == AND_OP_NAME) {
      return BitOpString(AndOp, SkipAnd, values, string(max_len, 0));
    } else if (op == NOT_OP_NAME) {
      return BitOpNotString(values[0]);
    } else {
      LOG(FATAL) << "Operation not supported '" << op << "'";
      return string{};  // otherwise we will have warning of not returning value
    }
  };

  if (values.empty()) {  // this is ok in case we don't have the src keys
    return string{};
  }
  // The new result is the max length input
  max_len = values[0].size();
  for (std::size_t i = 1; i < values.size(); ++i) {
    if (values[i].size() > max_len) {
      max_len = values[i].size();
      max_len_index = i;
    }
  }
  return BitOperation();
}

OpResult<string> CombineResultOp(ShardStringResults result, string_view op) {
  // take valid result for each shard
  BitsStrVec values;
  for (auto&& res : result) {
    if (res) {
      auto v = res.value();
      values.emplace_back(std::move(v));
    } else {
      if (res.status() != OpStatus::KEY_NOTFOUND) {
        // something went wrong, just bale out
        return res;
      }
    }
  }

  // and combine them to single result
  return RunBitOperationOnValues(op, values);
}

// For bitop not - we cannot accumulate
OpResult<string> RunBitOpNot(const OpArgs& op_args, string_view key) {
  // if we found the value, just return, if not found then skip, otherwise report an error
  DbSlice& db_slice = op_args.GetDbSlice();
  auto find_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STRING);
  if (find_res) {
    return GetString(find_res.value()->second);
  } else {
    return find_res.status();
  }
}

// Read only operation where we are running the bit operation on all the
// values that belong to same shard.
OpResult<string> RunBitOpOnShard(string_view op, const OpArgs& op_args, ShardArgs::Iterator start,
                                 ShardArgs::Iterator end) {
  DCHECK(start != end);
  if (op == NOT_OP_NAME) {
    return RunBitOpNot(op_args, *start);
  }

  DbSlice& db_slice = op_args.GetDbSlice();
  BitsStrVec values;

  // collect all the value for this shard
  for (; start != end; ++start) {
    auto find_res = db_slice.FindReadOnly(op_args.db_cntx, *start, OBJ_STRING);
    if (find_res) {
      values.emplace_back(GetString(find_res.value()->second));
    } else {
      if (find_res.status() == OpStatus::KEY_NOTFOUND) {
        continue;  // this is allowed, just return empty string per Redis
      } else {
        return find_res.status();
      }
    }
  }
  // Run the operation on all the values that we found
  string op_result = RunBitOperationOnValues(op, values);
  return op_result;
}

template <typename T>
void HandleOpValueResult(const OpResult<T>& result, SinkReplyBuilder* builder) {
  static_assert(std::is_integral<T>::value,
                "we are only handling types that are integral types in the return types from "
                "here");
  if (result) {
    builder->SendLong(result.value());
  } else {
    switch (result.status()) {
      case OpStatus::WRONG_TYPE:
        builder->SendError(kWrongTypeErr);
        break;
      case OpStatus::OUT_OF_MEMORY:
        builder->SendError(kOutOfMemory);
        break;
      default:
        builder->SendLong(0);  // in case we don't have the value we should just send 0
        break;
    }
  }
}

// ------------------------------------------------------------------------- //
//  Impl for the command functions
void BitPos(CmdArgList args, CommandContext* cmd_cntx) {
  // Support for the command BITPOS
  // See details at https://redis.io/commands/bitpos/
  auto* builder = cmd_cntx->rb();
  if (args.size() < 1 || args.size() > 5) {
    return builder->SendError(kSyntaxErr);
  }

  string_view key = ArgS(args, 0);

  int32_t value{0};
  int64_t start = 0;
  int64_t end = std::numeric_limits<int64_t>::max();
  bool as_bit = false;

  if (!absl::SimpleAtoi(ArgS(args, 1), &value)) {
    return builder->SendError(kInvalidIntErr);
  } else if (value != 0 && value != 1) {
    return builder->SendError("The bit argument must be 1 or 0");
  }

  if (args.size() >= 3) {
    if (!absl::SimpleAtoi(ArgS(args, 2), &start)) {
      return builder->SendError(kInvalidIntErr);
    }

    if (args.size() >= 4) {
      if (!absl::SimpleAtoi(ArgS(args, 3), &end)) {
        return builder->SendError(kInvalidIntErr);
      }

      if (args.size() >= 5) {
        string arg = absl::AsciiStrToUpper(ArgS(args, 4));
        if (arg == "BIT") {
          as_bit = true;
        } else if (arg == "BYTE") {
          as_bit = false;
        } else {
          return builder->SendError(kSyntaxErr);
        }
      }
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return FindFirstBitWithValue(t->GetOpArgs(shard), key, value, start, end, as_bit);
  };
  OpResult<int64_t> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  HandleOpValueResult(res, builder);
}

void BitCount(CmdArgList args, CommandContext* cmd_cntx) {
  // Support for the command BITCOUNT
  // See details at https://redis.io/commands/bitcount/
  // Please note that if the key don't exists, it would return 0

  CmdArgParser parser(args);
  auto key = parser.Next<string_view>();

  std::pair<int64_t, int64_t> start_end;
  if (parser.HasNext()) {
    auto tuple_result = parser.Next<int64_t, int64_t>();
    start_end = std::make_pair(std::get<0>(tuple_result), std::get<1>(tuple_result));
  } else {
    start_end = std::make_pair(0, std::numeric_limits<int64_t>::max());
  }

  bool as_bit = parser.HasNext() ? parser.MapNext("BYTE", false, "BIT", true) : false;
  if (!parser.Finalize()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }
  auto cb = [&, start_end](Transaction* t, EngineShard* shard) {
    return CountBitsForValue(t->GetOpArgs(shard), key, start_end.first, start_end.second, as_bit);
  };
  OpResult<std::size_t> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  HandleOpValueResult(res, cmd_cntx->rb());
}

// GCC yields a wrong warning about uninitialized optional use
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

enum class EncodingType { UINT, INT, NILL };

struct CommonAttributes {
  EncodingType type;
  size_t encoding_bit_size;
  size_t offset;
};

// We either return the result of the subcommand (int64_t) or nullopt
// to represent overflow/underflow failures
using ResultType = std::optional<int64_t>;

struct Overflow {
  enum Policy { WRAP, SAT, FAIL };

  // Used to check for unsigned overflow/underflow.
  // If incr is non zero, we check for overflows in the expression incr + *value
  // If incr is zero, we check for overflows in the expression *value
  // If the overflow fails because of Policy::FAIL, it returns false. Otherwise, true.
  // The result of handling the overflow is stored in the pointer value
  bool UIntOverflow(int64_t incr, size_t total_bits, int64_t* value) const;

  // Used to check for signed overflow/underflow.
  // If incr is non zero, we check for overflows in the expression incr + *value
  // If incr is zero, we check for overflows in the expression *value
  // If the overflow fails because of Policy::FAIL, it returns false. Otherwise, true.
  // The result of handling the overflow is stored in the pointer value
  bool IntOverflow(size_t total_bits, int64_t incr, bool add, int64_t* value) const;

  Policy type = WRAP;
};

bool Overflow::UIntOverflow(int64_t incr, size_t total_bits, int64_t* value) const {
  // total up to 63 bits -- we do not support 64 bit unsigned
  const uint64_t max = (1UL << total_bits) - 1;

  uint64_t incr_value = incr;
  if (incr_value + *value > max) {
    switch (type) {
      case Overflow::WRAP:
        // safe to do, won't overflow, both incr and value are <= than 2^63 - 1
        *value = (incr_value + *value) & max;
        break;
      case Overflow::SAT:
        *value = max;
        break;
      case Overflow::FAIL:
        *value = 0;
        return false;
    }
    return true;
  }

  *value = incr_value + *value;
  return true;
}

bool Overflow::IntOverflow(size_t total_bits, int64_t incr, bool add, int64_t* value) const {
  // This is exactly how redis handles signed overflow and we use the exact same chore
  const int64_t int_max = std::numeric_limits<int64_t>::max();
  const int64_t max = (total_bits == 64) ? int_max : ((1L << (total_bits - 1)) - 1);
  const int64_t min = (-max) - 1;
  auto switch_overflow = [&](int64_t wrap_case, int64_t sat_case, int64_t i) {
    switch (type) {
      case Overflow::WRAP: {
        uint64_t msb = 1UL << (total_bits - 1);
        uint64_t a = *value, b = incr;
        // Perform addition as unsigned so that's defined
        uint64_t c = a + b;
        if (total_bits < 64) {
          uint64_t mask = static_cast<uint64_t>(-1) << total_bits;
          if (c & msb) {
            c |= mask;
          } else {
            c &= ~mask;
          }
        }
        *value = c;
        break;
      }
      case Overflow::SAT:
        *value = sat_case;
        break;
      case Overflow::FAIL:
        *value = 0;
        return false;
    }
    return true;
  };

  // maxincr/minincr can overflow but it won't be an issue because we only use them
  // after checking 'value' range, so when they are used no overflow
  // happens. 'uint64_t' cast is there just to prevent undefined behavior on
  // overflow */
  int64_t maxincr = static_cast<uint64_t>(max) - *value;
  int64_t minincr = min - *value;

  // overflow
  if (*value > max || (total_bits != 64 && incr > maxincr) ||
      (*value >= 0 && incr > 0 && incr > maxincr)) {
    return switch_overflow(min, max, 1);
  }

  // underflow
  if (*value < min || (total_bits != 64 && incr < minincr) ||
      (*value < 0 && incr < 0 && incr < minincr)) {
    return switch_overflow(max, min, -1);
  }

  *value = *value + incr;

  return true;
}

class Get {
 public:
  explicit Get(CommonAttributes attr) : attr_(attr) {
  }

  // Apply the GET subcommand to the bitfield bytes.
  // Return either the subcommand result (int64_t) or empty optional if failed because of
  // Policy:FAIL
  ResultType ApplyTo(Overflow ov, const string* bitfield) const;

 private:
  CommonAttributes attr_;
};

ResultType Get::ApplyTo(Overflow ov, const string* bitfield) const {
  const auto& bytes = *bitfield;
  const int32_t total_bytes = static_cast<int32_t>(bytes.size());
  const size_t offset = attr_.offset;
  auto last_byte_offset = GetByteIndex(attr_.offset + attr_.encoding_bit_size - 1);

  if (GetByteIndex(offset) >= total_bytes) {
    return 0;
  }

  const string* result_str = bitfield;
  string buff;
  uint32_t lsb = attr_.offset + attr_.encoding_bit_size - 1;
  if (last_byte_offset >= total_bytes) {
    buff = *bitfield;
    buff.resize(last_byte_offset + 1, 0);
    result_str = &buff;
  }

  const bool is_negative =
      CheckBitStatus(GetByteValue(bytes, offset), GetNormalizedBitIndex(offset));

  int64_t result = 0;
  for (size_t i = 0; i < attr_.encoding_bit_size; ++i) {
    uint8_t byte{GetByteValue(*result_str, lsb)};
    int32_t index = GetNormalizedBitIndex(lsb);
    int64_t old_bit = CheckBitStatus(byte, index);
    result |= old_bit << i;
    --lsb;
  }

  if (is_negative && attr_.type == EncodingType::INT && result > 0) {
    result |= -1L ^ ((1L << attr_.encoding_bit_size) - 1);
  }

  return result;
}

class Set {
 public:
  explicit Set(CommonAttributes attr, int64_t value) : attr_(attr), set_value_(value) {
  }

  // Apply the SET subcommand to the bitfield value.
  // Return either the subcommand result (int64_t) or empty optional if failed because of
  // Policy:FAIL Updates the bitfield to contain the new value
  ResultType ApplyTo(Overflow ov, string* bitfield);

 private:
  // Helper function that delegates overflow checking to the Overflow object
  bool HandleOverflow(Overflow ov);

  CommonAttributes attr_;
  int64_t set_value_;
};

ResultType Set::ApplyTo(Overflow ov, string* bitfield) {
  string& bytes = *bitfield;
  const int32_t total_bytes = static_cast<int32_t>(bytes.size());
  auto last_byte_offset = GetByteIndex(attr_.offset + attr_.encoding_bit_size - 1) + 1;
  const size_t offset = attr_.offset;
  if (last_byte_offset > total_bytes) {
    bytes.resize(last_byte_offset, 0);
  }

  if (!HandleOverflow(ov)) {
    return {};
  }

  uint32_t lsb = attr_.offset + attr_.encoding_bit_size - 1;
  int64_t old_value = 0;

  const bool is_negative =
      CheckBitStatus(GetByteValue(*bitfield, offset), GetNormalizedBitIndex(offset));
  for (size_t i = 0; i < attr_.encoding_bit_size; ++i) {
    bool bit_value = (set_value_ >> i) & 0x01;
    uint8_t byte{GetByteValue(bytes, lsb)};
    int32_t index = GetNormalizedBitIndex(lsb);
    int64_t old_bit = CheckBitStatus(byte, index);
    byte = bit_value ? TurnBitOn(byte, index) : TurnBitOff(byte, index);
    bytes[GetByteIndex(lsb)] = byte;
    old_value |= old_bit << i;
    --lsb;
  }

  if (is_negative && attr_.type == EncodingType::INT && old_value > 0) {
    // Sign extension for negative signed integers.
    // Is creates a mask that sets all upper bits to 1
    // and converts positive old_value (15) to correct negative value (-1)
    // Example: 4-bit field 1111 should be -1, not 15.
    old_value |= -1L ^ ((1L << attr_.encoding_bit_size) - 1);
  }

  return old_value;
}

bool Set::HandleOverflow(Overflow ov) {
  size_t total_bits = attr_.encoding_bit_size;
  if (attr_.type == EncodingType::UINT) {
    return ov.UIntOverflow(0, attr_.encoding_bit_size, &set_value_);
  }

  return ov.IntOverflow(total_bits, 0, false, &set_value_);
}

class IncrBy {
 public:
  explicit IncrBy(CommonAttributes attr, int64_t val) : attr_(attr), incr_value_(val) {
  }

  // Apply the INCRBY subcommand to the bitfield value.
  // Return either the subcommand result (int64_t) or empty optional if failed because of
  // Policy:FAIL Updates the bitfield to contain the new incremented value
  ResultType ApplyTo(Overflow ov, string* bitfield);

 private:
  // Helper function that delegates overflow checking to the Overflow object
  bool HandleOverflow(Overflow ov, int64_t* previous);

  CommonAttributes attr_;
  int64_t incr_value_;
};

ResultType IncrBy::ApplyTo(Overflow ov, string* bitfield) {
  string& bytes = *bitfield;
  Get get(attr_);
  auto res = get.ApplyTo(ov, &bytes);
  const int32_t total_bytes = static_cast<int32_t>(bytes.size());
  auto last_byte_offset = GetByteIndex(attr_.offset + attr_.encoding_bit_size - 1);

  if (last_byte_offset >= total_bytes) {
    bytes.resize(last_byte_offset + 1, 0);
  }

  if (!HandleOverflow(ov, &*res)) {
    return {};
  }

  Set set(attr_, *res);
  set.ApplyTo(ov, &bytes);
  return *res;
}

bool IncrBy::HandleOverflow(Overflow ov, int64_t* previous) {
  if (attr_.type == EncodingType::UINT) {
    return ov.UIntOverflow(incr_value_, attr_.encoding_bit_size, previous);
  }

  const size_t total_bits = attr_.encoding_bit_size;
  return ov.IntOverflow(total_bits, incr_value_, true, previous);
}

// Subcommand types for each of the subcommands of the BITFIELD command
using Command = std::variant<Get, Set, Overflow, IncrBy>;

using Result = std::optional<ResultType>;

// Visitor for all the subcommand variants. Calls ApplyTo, to execute the subcommand
class CommandApplyVisitor {
 public:
  explicit CommandApplyVisitor(string bitfield) : bitfield_(std::move(bitfield)) {
  }

  Result operator()(Get get) {
    return get.ApplyTo(overflow_, &bitfield_);
  }

  template <typename T> Result operator()(T update) {
    should_commit_ = true;
    return update.ApplyTo(overflow_, &bitfield_);
  }

  Result operator()(Overflow overflow) {
    overflow_ = overflow;
    return {};
  }

  string_view Bitfield() const {
    return bitfield_;
  }

  bool ShouldCommit() const {
    return should_commit_;
  }

 private:
  // Most recent overflow object encountered. We cache it to make the overflow
  // policy changes stick among different subcommands
  Overflow overflow_;
  // This will be commited if it was updated
  string bitfield_;
  // If either of the subcommands SET|INCRBY is used we should persist the changes.
  // Otherwise, we only used a read only subcommand (GET)
  bool should_commit_ = false;
};

// A lit of subcommands used in BITFIELD command
using CommandList = vector<Command>;

// Helper class used in the shard cb that abstracts away the iteration and execution of subcommands
class StateExecutor {
 public:
  explicit StateExecutor(ElementAccess access) : access_{std::move(access)} {
  }

  //  Iterates over all of the parsed subcommands and executes them one by one. At the end,
  //  if an update subcommand SET|INCRBY was used, commit back the changes via the ElementAccess
  //  object
  OpResult<vector<ResultType>> Execute(const CommandList& commands);

 private:
  ElementAccess access_;
};

OpResult<vector<ResultType>> StateExecutor::Execute(const CommandList& commands) {
  auto res = access_.Exists();
  if (!res) {
    return {OpStatus::WRONG_TYPE};
  }
  string value;
  if (*res) {
    access_.Find(false);
    value = access_.Value();
  }

  vector<ResultType> results;
  CommandApplyVisitor visitor(std::move(value));
  for (auto& command : commands) {
    auto res = std::visit(visitor, command);
    if (res) {
      results.push_back(*res);
    }
  }

  if (visitor.ShouldCommit()) {
    access_.Find(false);
    access_.Commit(visitor.Bitfield());
  }

  return results;
}

const char kInvalidBitfieldTypeErr[] =
    "invalid bitfield type. use something like i16 u8. note that u64 is not supported but i64 is.";

nonstd::expected<CommonAttributes, string> ParseCommonAttr(CmdArgParser* parser) {
  CommonAttributes parsed;
  using nonstd::make_unexpected;

  auto [encoding, offset_str] = parser->Next<string_view, string_view>();

  if (encoding.empty()) {
    return make_unexpected(kSyntaxErr);
  }

  // Check case-sensitivity - only lowercase 'u' and 'i' are allowed
  if (encoding[0] == 'u') {
    parsed.type = EncodingType::UINT;
  } else if (encoding[0] == 'i') {
    parsed.type = EncodingType::INT;
  } else {
    return make_unexpected(kInvalidBitfieldTypeErr);
  }

  string_view bits = encoding.substr(1);

  // Additional validation: check if bits part contains any invalid characters
  for (char c : bits) {
    if (!std::isdigit(c)) {
      return make_unexpected(kInvalidBitfieldTypeErr);
    }
  }

  if (!absl::SimpleAtoi(bits, &parsed.encoding_bit_size)) {
    return make_unexpected(kSyntaxErr);
  }

  if (parsed.encoding_bit_size <= 0 || parsed.encoding_bit_size > 64) {
    return make_unexpected(kInvalidBitfieldTypeErr);
  }

  if (parsed.encoding_bit_size == 64 && parsed.type == EncodingType::UINT) {
    return make_unexpected(kInvalidBitfieldTypeErr);
  }

  bool is_proxy = false;
  if (absl::StartsWith(offset_str, "#")) {
    offset_str = offset_str.substr(1);
    is_proxy = true;
  }
  if (!absl::SimpleAtoi(offset_str, &parsed.offset)) {
    return make_unexpected(kSyntaxErr);
  }
  if (is_proxy) {
    parsed.offset = parsed.offset * parsed.encoding_bit_size;
  }
  return parsed;
}

// Parses a list of arguments (without key) to a CommandList.
// Returns the CommandList if the parsing completed succefully or string
// to indicate an error
nonstd::expected<CommandList, string> ParseToCommandList(CmdArgList args, bool read_only) {
  enum class Cmds { OVERFLOW_OPT, GET_OPT, SET_OPT, INCRBY_OPT };
  CommandList result;

  using nonstd::make_unexpected;

  CmdArgParser parser(args);
  while (parser.HasNext()) {
    auto cmd = parser.MapNext("OVERFLOW", Cmds::OVERFLOW_OPT, "GET", Cmds::GET_OPT, "SET",
                              Cmds::SET_OPT, "INCRBY", Cmds::INCRBY_OPT);
    if (parser.TakeError()) {
      return make_unexpected(kSyntaxErr);
    }

    if (cmd == Cmds::OVERFLOW_OPT) {
      // BITFIELD_RO shouldn't support this cmd, but it is ignored in Valkey so we ignore it too
      using pol = Overflow::Policy;
      auto res = parser.MapNext("SAT", pol::SAT, "WRAP", pol::WRAP, "FAIL", pol::FAIL);
      if (!parser.HasError()) {
        result.push_back(Overflow{res});
        continue;
      }
      parser.TakeError();
      return make_unexpected(kSyntaxErr);
    }

    auto maybe_attr = ParseCommonAttr(&parser);
    if (!maybe_attr.has_value()) {
      parser.TakeError();
      return make_unexpected(std::move(maybe_attr.error()));
    }

    auto attr = maybe_attr.value();
    if (cmd == Cmds::GET_OPT) {
      result.push_back(Command(Get(attr)));
      continue;
    }

    if (read_only) {
      return make_unexpected("BITFIELD_RO only supports the GET subcommand");
    }

    int64_t value = parser.Next<int64_t>();
    if (parser.TakeError()) {
      return make_unexpected(kSyntaxErr);
    }
    if (cmd == Cmds::SET_OPT) {
      result.push_back(Command(Set(attr, value)));
      continue;
    }

    if (cmd == Cmds::INCRBY_OPT) {
      result.push_back(Command(IncrBy(attr, value)));
      continue;
    }
    parser.TakeError();
    return make_unexpected(kSyntaxErr);
  }

  return result;
}

void SendResults(const vector<ResultType>& results, SinkReplyBuilder* builder) {
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  const size_t total = results.size();
  if (total == 0) {
    rb->SendEmptyArray();
    return;
  }

  RedisReplyBuilder::ArrayScope scope{rb, results.size()};
  for (const auto& elem : results) {
    if (elem)
      rb->SendLong(*elem);
    else
      rb->SendNull();
  }
}

void BitFieldGeneric(CmdArgList args, bool read_only, Transaction* tx, SinkReplyBuilder* builder) {
  if (args.size() == 1) {
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    rb->SendEmptyArray();
    return;
  }
  auto key = ArgS(args, 0);
  auto maybe_ops_list = ParseToCommandList(args.subspan(1), read_only);

  if (!maybe_ops_list.has_value()) {
    builder->SendError(maybe_ops_list.error());
    return;
  }
  CommandList cmd_list = std::move(maybe_ops_list.value());

  auto cb = [&cmd_list, &key](Transaction* t, EngineShard* shard) -> OpResult<vector<ResultType>> {
    StateExecutor executor(ElementAccess(key, t->GetOpArgs(shard)));
    return executor.Execute(cmd_list);
  };

  OpResult<vector<ResultType>> res = tx->ScheduleSingleHopT(std::move(cb));

  if (res == OpStatus::WRONG_TYPE) {
    builder->SendError(kWrongTypeErr);
    return;
  }

  SendResults(*res, builder);
}

void BitField(CmdArgList args, CommandContext* cmd_cntx) {
  BitFieldGeneric(args, false, cmd_cntx->tx(), cmd_cntx->rb());
}

void BitFieldRo(CmdArgList args, CommandContext* cmd_cntx) {
  BitFieldGeneric(args, true, cmd_cntx->tx(), cmd_cntx->rb());
}

#ifndef __clang__
#pragma GCC diagnostic pop
#endif

void BitOp(CmdArgList args, CommandContext* cmd_cntx) {
  static const std::array<string_view, 4> BITOP_OP_NAMES{OR_OP_NAME, XOR_OP_NAME, AND_OP_NAME,
                                                         NOT_OP_NAME};
  string op = absl::AsciiStrToUpper(ArgS(args, 0));
  string_view dest_key = ArgS(args, 1);
  bool illegal = std::none_of(BITOP_OP_NAMES.begin(), BITOP_OP_NAMES.end(),
                              [&op](auto val) { return op == val; });

  auto* builder = cmd_cntx->rb();
  if (illegal || (op == NOT_OP_NAME && args.size() > 3)) {
    return builder->SendError(kSyntaxErr);  // too many arguments
  }

  // Multi shard access - read only
  ShardStringResults result_set(shard_set->size(), OpStatus::KEY_NOTFOUND);
  ShardId dest_shard = Shard(dest_key, result_set.size());

  auto shard_bitop = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    DCHECK(!largs.Empty());
    ShardArgs::Iterator start = largs.begin(), end = largs.end();
    if (shard->shard_id() == dest_shard) {
      CHECK_EQ(*start, dest_key);
      ++start;
      if (start == end) {  // no more keys to check
        return OpStatus::OK;
      }
    }
    OpArgs op_args = t->GetOpArgs(shard);
    result_set[shard->shard_id()] = RunBitOpOnShard(op, op_args, start, end);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(shard_bitop), false);  // we still have more work to do
  // All result from each shard
  const auto joined_results = CombineResultOp(result_set, op);
  // Second phase - save to target key if successful
  if (!joined_results) {
    cmd_cntx->tx()->Conclude();
    cmd_cntx->SendError(joined_results.status());
    return;
  } else {
    auto op_result = joined_results.value();
    auto store_cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == dest_shard) {
        ElementAccess operation{dest_key, t->GetOpArgs(shard)};
        auto find_res = operation.Find(true);

        // BITOP command acts as a blind update. If the key existed and its type
        // was not a string we still want to Commit with the new value.
        if (find_res == OpStatus::OK || find_res == OpStatus::WRONG_TYPE) {
          operation.Commit(op_result);

          if (shard->journal()) {
            if (op_result.empty()) {
              // We need to delete it if the key exists. If it doesn't, we just
              // skip it and do not send it to the replica at all.
              if (!operation.IsNewEntry()) {
                RecordJournal(t->GetOpArgs(shard), "DEL", {dest_key});
              }
            } else {
              RecordJournal(t->GetOpArgs(shard), "SET", {dest_key, op_result});
            }
          }
        }
      }
      return OpStatus::OK;
    };

    cmd_cntx->tx()->Execute(std::move(store_cb), true);
    builder->SendLong(op_result.size());
  }
}

void GetBit(CmdArgList args, CommandContext* cmd_cntx) {
  // Support for the command "GETBIT key offset"
  // see https://redis.io/commands/getbit/

  uint32_t offset{0};
  string_view key = ArgS(args, 0);

  if (!absl::SimpleAtoi(ArgS(args, 1), &offset)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return ReadValueBitsetAt(t->GetOpArgs(shard), key, offset);
  };
  OpResult<bool> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  HandleOpValueResult(res, cmd_cntx->rb());
}

void SetBit(CmdArgList args, CommandContext* cmd_cntx) {
  // Support for the command "SETBIT key offset new_value"
  // see https://redis.io/commands/setbit/

  CmdArgParser parser(args);
  auto [key, offset, value] = parser.Next<string_view, uint32_t, FInt<0, 1>>();

  if (auto err = parser.TakeError(); err) {
    return cmd_cntx->SendError(err.MakeReply());
  }

  auto cb = [&, &key = key, &offset = offset, &value = value](Transaction* t, EngineShard* shard) {
    return BitNewValue(t->GetOpArgs(shard), key, offset, value != 0);
  };

  OpResult<bool> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  HandleOpValueResult(res, cmd_cntx->rb());
}

// ------------------------------------------------------------------------- //
// This are the "callbacks" that we're using from above
string GetString(const PrimeValue& pv) {
  string res;
  pv.GetString(&res);
  return res;
}

OpResult<bool> ReadValueBitsetAt(const OpArgs& op_args, string_view key, uint32_t offset) {
  DbSlice& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STRING);

  if (!it_res.ok()) {
    return it_res.status();
  }

  const PrimeValue& pv = it_res.value()->second;

  uint8_t byte_value = 0;
  if (!pv.GetByteAtIndex(GetByteIndex(offset), &byte_value)) {
    return false;
  }

  const auto bit_index = GetNormalizedBitIndex(offset);
  return CheckBitStatus(byte_value, bit_index);
}

OpResult<string> ReadValue(const DbContext& context, string_view key, EngineShard* shard) {
  DbSlice& db_slice = context.GetDbSlice(shard->shard_id());
  auto it_res = db_slice.FindReadOnly(context, key, OBJ_STRING);
  if (!it_res.ok()) {
    return it_res.status();
  }

  const PrimeValue& pv = it_res.value()->second;

  return GetString(pv);
}

OpResult<std::size_t> CountBitsForValue(const OpArgs& op_args, string_view key, int64_t start,
                                        int64_t end, bool bit_value) {
  OpResult<string> result = ReadValue(op_args.db_cntx, key, op_args.shard);

  if (result) {  // if this is not found, just return 0 - per Redis
    return CountBitSet(result.value(), start, end, bit_value);
  } else {
    return result.status();
  }
}

// Returns the bit position (where MSB is 0, LSB is 7) of the leftmost bit that
// equals `value` in `byte`. Returns 8 if not found.
std::size_t GetFirstBitWithValueInByte(uint8_t byte, bool value) {
  if (value) {
    return absl::countl_zero(byte);
  } else {
    return absl::countl_one(byte);
  }
}

int64_t FindFirstBitWithValueAsBit(string_view value_str, bool bit_value, int64_t start,
                                   int64_t end) {
  for (int64_t i = start; i <= end; ++i) {
    if (static_cast<size_t>(GetByteIndex(i)) >= value_str.size()) {
      break;
    }
    const uint8_t current_byte = GetByteValue(value_str, i);
    bool current_bit = CheckBitStatus(current_byte, GetNormalizedBitIndex(i));
    if (current_bit != bit_value) {
      continue;
    }

    return i;
  }

  return -1;
}

int64_t FindFirstBitWithValueAsByte(string_view value_str, bool bit_value, int64_t start,
                                    int64_t end) {
  for (int64_t i = start; i <= end; ++i) {
    if (static_cast<size_t>(i) >= value_str.size()) {
      break;
    }
    const uint8_t current_byte = value_str[i];
    const uint8_t kNotFoundByte = bit_value ? 0 : std::numeric_limits<uint8_t>::max();
    if (current_byte == kNotFoundByte) {
      continue;
    }

    return i * OFFSET_FACTOR + GetFirstBitWithValueInByte(current_byte, bit_value);
  }

  return -1;
}

OpResult<int64_t> FindFirstBitWithValue(const OpArgs& op_args, string_view key, bool bit_value,
                                        int64_t start, int64_t end, bool as_bit) {
  OpResult<string> value = ReadValue(op_args.db_cntx, key, op_args.shard);

  // non-existent keys are handled exactly as in Redis's implementation,
  // even though it contradicts its docs:
  //     If a clear bit isn't found in the specified range, the function returns -1
  //     as the user specified a clear range and there are no 0 bits in that range
  if (!value) {
    return bit_value ? -1 : 0;
  }

  string_view value_str = value.value();
  int64_t size = value_str.size();
  if (as_bit) {
    size *= OFFSET_FACTOR;
  }

  int64_t normalized_start = NormalizedOffset(size, start);
  int64_t normalized_end = NormalizedOffset(size, end);
  if (normalized_start > normalized_end) {
    return -1;  // Return -1 for negative ranges, per Redis
  }

  int64_t position;
  if (as_bit) {
    position = FindFirstBitWithValueAsBit(value_str, bit_value, normalized_start, normalized_end);
  } else {
    position = FindFirstBitWithValueAsByte(value_str, bit_value, normalized_start, normalized_end);
  }

  if (position == -1 && !bit_value && static_cast<size_t>(start) < value_str.size() &&
      end == std::numeric_limits<int64_t>::max()) {
    // Returning bit-size of the value, compatible with Redis (but is a weird API).
    return value_str.size() * OFFSET_FACTOR;
  } else {
    return position;
  }
}

}  // namespace

void RegisterBitopsFamily(CommandRegistry* registry) {
  using CI = CommandId;
  registry->StartFamily(acl::BITMAP);
  *registry << CI{"BITPOS", CO::CommandOpt::READONLY, -3, 1, 1}.SetHandler(&BitPos)
            << CI{"BITCOUNT", CO::READONLY, -2, 1, 1}.SetHandler(&BitCount)
            << CI{"BITFIELD", CO::JOURNALED, -2, 1, 1}.SetHandler(&BitField)
            << CI{"BITFIELD_RO", CO::FAST | CO::READONLY, -2, 1, 1}.SetHandler(&BitFieldRo)
            << CI{"BITOP", CO::JOURNALED | CO::NO_AUTOJOURNAL, -4, 2, -1}.SetHandler(&BitOp)
            << CI{"GETBIT", CO::READONLY | CO::FAST, 3, 1, 1}.SetHandler(&GetBit)
            << CI{"SETBIT", CO::JOURNALED | CO::DENYOOM, 4, 1, 1}.SetHandler(&SetBit);
}

}  // namespace dfly


================================================
FILE: src/server/bitops_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include <bitset>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>
#include <string_view>

#include "absl/strings/str_cat.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/test_utils.h"
#include "server/transaction.h"

using namespace testing;
using namespace std;
using namespace util;
using absl::StrCat;

namespace dfly {

class Bytes {
  using char_t = std::uint8_t;
  using string_type = std::basic_string<char_t>;

 public:
  enum State { GOOD, ERROR, NIL };

  Bytes(std::initializer_list<std::uint8_t> bytes) : data_(bytes.size(), 0) {
    // note - we want this to be like its would be used in redis where most significate bit is to
    // the "left"
    std::copy(rbegin(bytes), rend(bytes), data_.begin());
  }

  explicit Bytes(unsigned long long n) : data_(sizeof(n), 0) {
    FromNumber(n);
  }

  static Bytes From(unsigned long long x) {
    return Bytes(x);
  }

  explicit Bytes(State state) : state_{state} {
  }

  Bytes(const char_t* ch, std::size_t len) : data_(ch, len) {
  }

  Bytes(const char* ch, std::size_t len) : Bytes(reinterpret_cast<const char_t*>(ch), len) {
  }

  explicit Bytes(std::string_view from) : Bytes(from.data(), from.size()) {
  }

  static Bytes From(RespExpr&& r);

  std::size_t Size() const {
    return data_.size();
  }

  operator std::string_view() const {
    return std::string_view(reinterpret_cast<const char*>(data_.data()), Size());
  }

  std::ostream& Print(std::ostream& os) const;

  std::ostream& PrintHex(std::ostream& os) const;

 private:
  template <typename T> void FromNumber(T num) {
    // note - we want this to be like its would be used in redis where most significate bit is to
    // the "left"
    std::size_t i = 0;
    for (const char_t* s = reinterpret_cast<const char_t*>(&num); i < sizeof(T); s++, i++) {
      data_[i] = *s;
    }
  }

  string_type data_;
  State state_ = GOOD;
};

Bytes Bytes::From(RespExpr&& r) {
  if (r.type == RespExpr::STRING) {
    return Bytes(ToSV(r.GetBuf()));
  } else {
    if (r.type == RespExpr::NIL || r.type == RespExpr::NIL_ARRAY) {
      return Bytes{Bytes::NIL};
    } else {
      return Bytes(Bytes::ERROR);
    }
  }
}

std::ostream& Bytes::Print(std::ostream& os) const {
  if (state_ == GOOD) {
    for (auto c : data_) {
      std::bitset<8> b{c};
      os << b << ":";
    }
  } else {
    if (state_ == NIL) {
      os << "nil";
    } else {
      os << "error";
    }
  }
  return os;
}

std::ostream& Bytes::PrintHex(std::ostream& os) const {
  if (state_ == GOOD) {
    for (auto c : data_) {
      os << std::hex << std::setfill('0') << std::setw(2) << (std::uint16_t)c << ":";
    }
  } else {
    if (state_ == NIL) {
      os << "nil";
    } else {
      os << "error";
    }
  }
  return os;
}

inline bool operator==(const Bytes& left, const Bytes& right) {
  return static_cast<const std::string_view&>(left) == static_cast<const std::string_view&>(right);
}

inline bool operator!=(const Bytes& left, const Bytes& right) {
  return !(left == right);
}

inline Bytes operator"" _b(unsigned long long x) {
  return Bytes::From(x);
}

inline Bytes operator"" _b(const char* x, std::size_t s) {
  return Bytes{x, s};
}

inline Bytes operator"" _b(const char* x) {
  return Bytes{x, std::strlen(x)};
}

inline std::ostream& operator<<(std::ostream& os, const Bytes& bs) {
  return bs.PrintHex(os);
}

class BitOpsFamilyTest : public BaseFamilyTest {
 protected:
  // only for bitop XOR, OR, AND tests
  void BitOpSetKeys();
};

// for the bitop tests we need to test with multiple keys as the issue
// is that we need to make sure that accessing multiple shards creates
// the correct result
// Since this is bit operations, we are using the bytes data type
// that makes the verification more ergonomics.
const std::pair<std::string_view, Bytes> KEY_VALUES_BIT_OP[] = {
    {"first_key", 0xFFAACC01_b},
    {"key_second", {0x1, 0xBB}},
    {"_this_is_the_third_key", {0x01, 0x05, 0x15, 0x20, 0xAA, 0xCC}},
    {"the_last_key_we_have", 0xAACC_b}};

// For the bitop XOR OR and AND we are setting these keys/value pairs
void BitOpsFamilyTest::BitOpSetKeys() {
  auto resp = Run({"set", KEY_VALUES_BIT_OP[0].first, KEY_VALUES_BIT_OP[0].second});
  EXPECT_EQ(resp, "OK");
  resp = Run({"set", KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[1].second});
  EXPECT_EQ(resp, "OK");
  resp = Run({"set", KEY_VALUES_BIT_OP[2].first, KEY_VALUES_BIT_OP[2].second});
  EXPECT_EQ(resp, "OK");
  resp = Run({"set", KEY_VALUES_BIT_OP[3].first, KEY_VALUES_BIT_OP[3].second});
  EXPECT_EQ(resp, "OK");
}

const long EXPECTED_VALUE_SETBIT[] = {0, 1, 1, 0, 0, 0,
                                      0, 1, 0, 1, 1, 0};  // taken from running this on redis
const int32_t ITERATIONS = sizeof(EXPECTED_VALUE_SETBIT) / sizeof(EXPECTED_VALUE_SETBIT[0]);

TEST_F(BitOpsFamilyTest, GetBit) {
  auto resp = Run({"set", "foo", "abc"});

  EXPECT_EQ(resp, "OK");

  for (int32_t i = 0; i < ITERATIONS; i++) {
    EXPECT_EQ(EXPECTED_VALUE_SETBIT[i], CheckedInt({"getbit", "foo", std::to_string(i)}));
  }

  // make sure that when accessing bit that is not in the range its working and we are
  // getting 0
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", std::to_string(strlen("abc") + 5)}));
}

TEST_F(BitOpsFamilyTest, SetBitExistingKey) {
  // this test would test when we have the value in place and
  // we are overriding and existing key
  // so there are no allocations of keys
  auto resp = Run({"set", "foo", "abc"});

  EXPECT_EQ(resp, "OK");

  // we are setting all to 1s first, we are expecting to get the old values
  for (int32_t i = 0; i < ITERATIONS; i++) {
    EXPECT_EQ(EXPECTED_VALUE_SETBIT[i], CheckedInt({"setbit", "foo", std::to_string(i), "1"}));
  }

  for (int32_t i = 0; i < ITERATIONS; i++) {
    EXPECT_EQ(1, CheckedInt({"getbit", "foo", std::to_string(i)}));
  }
}

TEST_F(BitOpsFamilyTest, SetBitMissingKey) {
  // This test would run without pre-allocated existing key
  // so we need to allocate the key as part of setting the values
  for (int32_t i = 0; i < ITERATIONS; i++) {  // we are setting all to 1s first, we are expecting
    // get 0s since we didn't have this key before
    EXPECT_EQ(0, CheckedInt({"setbit", "foo", std::to_string(i), "1"}));
  }
  // now all that we set are at 1s
  for (int32_t i = 0; i < ITERATIONS; i++) {
    EXPECT_EQ(1, CheckedInt({"getbit", "foo", std::to_string(i)}));
  }
}

TEST_F(BitOpsFamilyTest, SetBitIncorrectValues) {
  EXPECT_EQ(0, CheckedInt({"setbit", "foo", "0", "1"}));
  EXPECT_THAT(Run({"setbit", "foo", "1", "-1"}),
              ErrArg("ERR value is not an integer or out of range"));
  EXPECT_THAT(Run({"setbit", "foo", "2", "11"}),
              ErrArg("ERR value is not an integer or out of range"));
  EXPECT_THAT(Run({"setbit", "foo", "3", "a"}),
              ErrArg("ERR value is not an integer or out of range"));
  EXPECT_THAT(Run({"setbit", "foo", "4", "O"}),
              ErrArg("ERR value is not an integer or out of range"));
  EXPECT_EQ(1, CheckedInt({"getbit", "foo", "0"}));
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "1"}));
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "2"}));
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "3"}));
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "4"}));
}

TEST_F(BitOpsFamilyTest, SetBitExtendExistingKey) {
  // This test verifies SETBIT correctly extends an existing key beyond its current length.
  // It sets up a small 3-byte key ("abc") and then sets a bit far beyond byte index 2,
  // ensuring the string is extended with zeros and the bit is set correctly.
  auto resp = Run({"set", "foo", "abc"});
  EXPECT_EQ(resp, "OK");

  // Verify initial string length is 3 bytes (24 bits)
  EXPECT_EQ(3, CheckedInt({"strlen", "foo"}));

  // Set bit at offset 100 (byte index 12, bit 4 within that byte)
  // This should extend the string from 3 bytes to 13 bytes
  // The old value should be 0 since the string didn't extend that far
  EXPECT_EQ(0, CheckedInt({"setbit", "foo", "100", "1"}));

  // Verify the string was extended to 13 bytes (100 bits / 8 = 12.5, rounded up to 13)
  EXPECT_EQ(13, CheckedInt({"strlen", "foo"}));

  // Verify the bit at offset 100 is now set to 1
  EXPECT_EQ(1, CheckedInt({"getbit", "foo", "100"}));

  // Verify bits in the extended region (between original end and new bit) are 0
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "24"}));  // First bit after "abc"
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "50"}));  // Middle of extended region
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "99"}));  // Just before the set bit

  // Verify original bits are unchanged
  EXPECT_EQ(EXPECTED_VALUE_SETBIT[0], CheckedInt({"getbit", "foo", "0"}));
  EXPECT_EQ(EXPECTED_VALUE_SETBIT[1], CheckedInt({"getbit", "foo", "1"}));
  EXPECT_EQ(EXPECTED_VALUE_SETBIT[2], CheckedInt({"getbit", "foo", "2"}));

  // Set the same bit to 0 and verify we get back 1 (the current value)
  EXPECT_EQ(1, CheckedInt({"setbit", "foo", "100", "0"}));
  EXPECT_EQ(0, CheckedInt({"getbit", "foo", "100"}));
}

const int32_t EXPECTED_VALUES_BYTES_BIT_COUNT[] = {  // got this from redis 0 as start index
    4, 7, 11, 14, 17, 21, 21, 21, 21};

const int32_t BYTES_EXPECTED_VALUE_LEN =
    sizeof(EXPECTED_VALUES_BYTES_BIT_COUNT) / sizeof(EXPECTED_VALUES_BYTES_BIT_COUNT[0]);

TEST_F(BitOpsFamilyTest, BitCountByte) {
  // This would run without the bit flag - meaning it count on bytes boundaries
  auto resp = Run({"set", "foo", "farbar"});
  EXPECT_EQ(resp, "OK");
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo2"}));  // on none existing key we are expecting 0

  for (int32_t i = 0; i < BYTES_EXPECTED_VALUE_LEN; i++) {
    EXPECT_EQ(EXPECTED_VALUES_BYTES_BIT_COUNT[i],
              CheckedInt({"bitcount", "foo", "0", std::to_string(i)}));
  }
  EXPECT_EQ(21, CheckedInt({"bitcount", "foo"}));  // the total number of bits in this value
}

TEST_F(BitOpsFamilyTest, BitCountByteSubRange) {
  // This test test using some sub ranges of bit count on bytes
  auto resp = Run({"set", "foo", "farbar"});
  EXPECT_EQ(resp, "OK");
  EXPECT_EQ(3, CheckedInt({"bitcount", "foo", "1", "1"}));
  EXPECT_EQ(7, CheckedInt({"bitcount", "foo", "1", "2"}));
  EXPECT_EQ(4, CheckedInt({"bitcount", "foo", "2", "2"}));
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo", "3", "2"}));  // illegal range
  EXPECT_EQ(10, CheckedInt({"bitcount", "foo", "-3", "-1"}));
  EXPECT_EQ(13, CheckedInt({"bitcount", "foo", "-5", "-2"}));
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo", "-1", "-2"}));  // illegal range
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo", "1", "0"}));    // illegal range
}

TEST_F(BitOpsFamilyTest, BitCountByteBitSubRange) {
  // This test test using some sub ranges of bit count on bytes
  auto resp = Run({"set", "foo", "abcdef"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"bitcount", "foo", "bar", "BIT"});
  ASSERT_THAT(resp, ErrArg("value is not an integer or out of range"));

  EXPECT_EQ(1, CheckedInt({"bitcount", "foo", "1", "1", "BIT"}));
  EXPECT_EQ(2, CheckedInt({"bitcount", "foo", "1", "2", "BIT"}));
  EXPECT_EQ(1, CheckedInt({"bitcount", "foo", "2", "2", "BIT"}));
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo", "3", "2", "bit"}));  // illegal range
  EXPECT_EQ(2, CheckedInt({"bitcount", "foo", "-3", "-1", "bit"}));
  EXPECT_EQ(2, CheckedInt({"bitcount", "foo", "-5", "-2", "bit"}));
  EXPECT_EQ(4, CheckedInt({"bitcount", "foo", "1", "9", "bit"}));
  EXPECT_EQ(7, CheckedInt({"bitcount", "foo", "2", "19", "bit"}));
  EXPECT_EQ(0, CheckedInt({"bitcount", "foo", "-1", "-2", "bit"}));  // illegal range
}

// ------------------------- BITOP tests

const auto EXPECTED_LEN_BITOP =
    std::max(KEY_VALUES_BIT_OP[0].second.Size(), KEY_VALUES_BIT_OP[1].second.Size());
const auto EXPECTED_LEN_BITOP2 = std::max(EXPECTED_LEN_BITOP, KEY_VALUES_BIT_OP[2].second.Size());
const auto EXPECTED_LEN_BITOP3 = std::max(EXPECTED_LEN_BITOP2, KEY_VALUES_BIT_OP[3].second.Size());

TEST_F(BitOpsFamilyTest, BitOpsAnd) {
  BitOpSetKeys();
  auto resp = Run({"bitop", "foo", "bar", "abc"});  // should failed this is illegal operation
  ASSERT_THAT(resp, ErrArg("syntax error"));
  // run with none existing keys, should return 0
  EXPECT_EQ(0, CheckedInt({"bitop", "and", "dest_key", "1", "2", "3"}));

  // bitop AND single key
  EXPECT_EQ(KEY_VALUES_BIT_OP[0].second.Size(),
            CheckedInt({"bitop", "and", "foo_out", KEY_VALUES_BIT_OP[0].first}));

  auto res = Bytes::From(Run({"get", "foo_out"}));
  EXPECT_EQ(res, KEY_VALUES_BIT_OP[0].second);

  // this will 0 all values other than one bit it would end with result with length ==
  //     FOO_KEY_VALUE && value == BAR_KEY_VALUE
  EXPECT_EQ(EXPECTED_LEN_BITOP, CheckedInt({"bitop", "and", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                            KEY_VALUES_BIT_OP[1].first}));
  const auto EXPECTED_RESULT = Bytes((0xffaacc01 & 0x1BB));  // first and second values
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(res, EXPECTED_RESULT);

  // test bitop AND with 3 keys
  EXPECT_EQ(EXPECTED_LEN_BITOP2,
            CheckedInt({"bitop", "and", "foo-out", KEY_VALUES_BIT_OP[0].first,
                        KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first}));
  const auto EXPECTED_RES2 = Bytes((0xffaacc01 & 0x1BB & 0x01051520AACC));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES2, res);

  // test bitop AND with 4 parameters
  const auto EXPECTED_RES3 = Bytes((0xffaacc01 & 0x1BB & 0x01051520AACC & 0xAACC));
  EXPECT_EQ(EXPECTED_LEN_BITOP3, CheckedInt({"bitop", "and", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                             KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first,
                                             KEY_VALUES_BIT_OP[3].first}));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES3, res);
}

TEST_F(BitOpsFamilyTest, BitOpsOr) {
  BitOpSetKeys();

  EXPECT_EQ(0, CheckedInt({"bitop", "or", "dest_key", "1", "2", "3"}));

  // bitop or single key
  EXPECT_EQ(KEY_VALUES_BIT_OP[0].second.Size(),
            CheckedInt({"bitop", "or", "foo_out", KEY_VALUES_BIT_OP[0].first}));

  auto res = Bytes::From(Run({"get", "foo_out"}));
  EXPECT_EQ(res, KEY_VALUES_BIT_OP[0].second);

  // bitop OR 2 keys
  EXPECT_EQ(EXPECTED_LEN_BITOP, CheckedInt({"bitop", "or", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                            KEY_VALUES_BIT_OP[1].first}));
  const auto EXPECTED_RESULT = Bytes((0xffaacc01 | 0x1BB));  // first or second values
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(res, EXPECTED_RESULT);

  // bitop OR with 3 keys
  EXPECT_EQ(EXPECTED_LEN_BITOP2,
            CheckedInt({"bitop", "or", "foo-out", KEY_VALUES_BIT_OP[0].first,
                        KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first}));
  const auto EXPECTED_RES2 = Bytes((0xffaacc01 | 0x1BB | 0x01051520AACC));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES2, res);

  // bitop OR with 4 keys
  const auto EXPECTED_RES3 = Bytes((0xffaacc01 | 0x1BB | 0x01051520AACC | 0xAACC));
  EXPECT_EQ(EXPECTED_LEN_BITOP3, CheckedInt({"bitop", "or", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                             KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first,
                                             KEY_VALUES_BIT_OP[3].first}));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES3, res);
}

TEST_F(BitOpsFamilyTest, BitOpsXor) {
  BitOpSetKeys();

  EXPECT_EQ(0, CheckedInt({"bitop", "or", "dest_key", "1", "2", "3"}));

  // bitop XOR on single key
  EXPECT_EQ(KEY_VALUES_BIT_OP[0].second.Size(),
            CheckedInt({"bitop", "xor", "foo_out", KEY_VALUES_BIT_OP[0].first}));
  auto res = Bytes::From(Run({"get", "foo_out"}));
  EXPECT_EQ(res, KEY_VALUES_BIT_OP[0].second);

  // bitop on XOR with two keys
  EXPECT_EQ(EXPECTED_LEN_BITOP, CheckedInt({"bitop", "xor", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                            KEY_VALUES_BIT_OP[1].first}));
  const auto EXPECTED_RESULT = Bytes((0xffaacc01 ^ 0x1BB));  // first xor second values
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(res, EXPECTED_RESULT);

  // bitop XOR with 3 keys
  EXPECT_EQ(EXPECTED_LEN_BITOP2,
            CheckedInt({"bitop", "xor", "foo-out", KEY_VALUES_BIT_OP[0].first,
                        KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first}));
  const auto EXPECTED_RES2 = Bytes((0xffaacc01 ^ 0x1BB ^ 0x01051520AACC));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES2, res);

  // bitop XOR with 4 keys
  const auto EXPECTED_RES3 = Bytes((0xffaacc01 ^ 0x1BB ^ 0x01051520AACC ^ 0xAACC));
  EXPECT_EQ(EXPECTED_LEN_BITOP3, CheckedInt({"bitop", "xor", "foo-out", KEY_VALUES_BIT_OP[0].first,
                                             KEY_VALUES_BIT_OP[1].first, KEY_VALUES_BIT_OP[2].first,
                                             KEY_VALUES_BIT_OP[3].first}));
  res = Bytes::From(Run({"get", "foo-out"}));
  EXPECT_EQ(EXPECTED_RES3, res);
}

TEST_F(BitOpsFamilyTest, BitOpsNot) {
  // should failed this is illegal number of args
  auto resp = Run({"bitop", "not", "bar", "abc", "efg"});
  ASSERT_THAT(resp, ErrArg("syntax error"));

  // Make sure that this works with none existing key as well
  EXPECT_EQ(0, CheckedInt({"bitop", "NOT", "bit-op-not-none-existing-key-results",
                           "this-key-do-not-exists"}));
  ASSERT_THAT(Run({"get", "bit-op-not-none-existing-key-results"}), ArgType(RespExpr::Type::NIL));

  EXPECT_EQ(Run({"set", "foo", "bar"}), "OK");
  EXPECT_EQ(0, CheckedInt({"bitop", "NOT", "foo", "this-key-do-not-exists"}));
  ASSERT_THAT(Run({"get", "foo"}), ArgType(RespExpr::Type::NIL));

  // Change the type of foo. Bitops is similar to set command. It's a blind update.
  ASSERT_THAT(Run({"hset", "foo", "bar", "val"}), IntArg(1));
  EXPECT_EQ(0, CheckedInt({"bitop", "NOT", "foo", "this-key-do-not-exists"}));
  ASSERT_THAT(Run({"get", "foo"}), ArgType(RespExpr::Type::NIL));

  // test bitop not
  resp = Run({"set", KEY_VALUES_BIT_OP[0].first, KEY_VALUES_BIT_OP[0].second});
  EXPECT_EQ(KEY_VALUES_BIT_OP[0].second.Size(),
            CheckedInt({"bitop", "not", "foo_out", KEY_VALUES_BIT_OP[0].first}));
  auto res = Bytes::From(Run({"get", "foo_out"}));

  const auto NOT_RESULTS = Bytes(~0xFFAACC01ull);
  EXPECT_EQ(res, NOT_RESULTS);
}

TEST_F(BitOpsFamilyTest, BitOpOverwritesNonStringKeyAccounting) {
  string long_value(128, 'a');
  auto resp = Run({"set", "src", long_value});
  EXPECT_EQ(resp, "OK");

  resp = Run({"rpush", "dest", "a", "b", "c"});
  EXPECT_THAT(resp, IntArg(3));

  Metrics before = GetMetrics();
  ASSERT_FALSE(before.db_stats.empty());
  const size_t list_before = before.db_stats[0].memory_usage_by_type[OBJ_LIST];
  const size_t str_before = before.db_stats[0].memory_usage_by_type[OBJ_STRING];
  ASSERT_GT(list_before, 0u);

  resp = Run({"bitop", "or", "dest", "src"});
  EXPECT_THAT(resp, IntArg(128));
  EXPECT_EQ(Run({"type", "dest"}), "string");
  EXPECT_EQ(Run({"get", "dest"}), long_value);

  Metrics after = GetMetrics();
  const size_t list_after = after.db_stats[0].memory_usage_by_type[OBJ_LIST];
  const size_t str_after = after.db_stats[0].memory_usage_by_type[OBJ_STRING];
  EXPECT_EQ(0, list_after);
  EXPECT_GT(str_after, str_before);
}

TEST_F(BitOpsFamilyTest, BitPos) {
  ASSERT_EQ(Run({"set", "a", "\x00\x00\x06\xff\xf0"_b}), "OK");

  // Find clear bits
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0"}));
  EXPECT_EQ(8, CheckedInt({"bitpos", "a", "0", "1"}));
  EXPECT_EQ(16, CheckedInt({"bitpos", "a", "0", "2"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "103"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "0"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "100"}));
  EXPECT_EQ(8, CheckedInt({"bitpos", "a", "0", "1", "100"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "-3"}));
  EXPECT_EQ(8, CheckedInt({"bitpos", "a", "0", "1", "-2"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "3"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "4"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "-2"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "-2", "-1"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "-1"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "-100"}));

  // Find clear bits, explicitly mention "BYTE"
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "103", "BYTE"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "0", "BYTE"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "100", "BYTE"}));
  EXPECT_EQ(8, CheckedInt({"bitpos", "a", "0", "1", "100", "BYTE"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "-3", "BYTE"}));
  EXPECT_EQ(8, CheckedInt({"bitpos", "a", "0", "1", "-2", "BYTE"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "-2", "-1", "BYTE"}));

  // Find clear bits using "BIT"
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "103", "BIT"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "0", "100", "0", "BIT"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "100", "BIT"}));
  EXPECT_EQ(1, CheckedInt({"bitpos", "a", "0", "1", "100", "BIT"}));
  EXPECT_EQ(2, CheckedInt({"bitpos", "a", "0", "2", "100", "BIT"}));
  EXPECT_EQ(16, CheckedInt({"bitpos", "a", "0", "16", "100", "BIT"}));
  EXPECT_EQ(23, CheckedInt({"bitpos", "a", "0", "21", "100", "BIT"}));
  EXPECT_EQ(36, CheckedInt({"bitpos", "a", "0", "24", "100", "BIT"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "a", "0", "0", "-3", "BIT"}));
  EXPECT_EQ(1, CheckedInt({"bitpos", "a", "0", "1", "-2", "BIT"}));
  EXPECT_EQ(38, CheckedInt({"bitpos", "a", "0", "-2", "-1", "BIT"}));

  // Find set bits
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "1"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "2"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "a", "1", "3"}));
  EXPECT_EQ(32, CheckedInt({"bitpos", "a", "1", "4"}));
  EXPECT_EQ(32, CheckedInt({"bitpos", "a", "1", "-1"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "a", "1", "-2"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "-3"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "-4"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "-5"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "-6"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "-100"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "0"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "1"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "3"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "100"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "2", "2"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "2", "3"}));
  EXPECT_EQ(32, CheckedInt({"bitpos", "a", "1", "-1", "-1"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "a", "1", "-2", "-1"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "-1", "-2"}));

  // Find set bits, explicitly mention "BYTE"
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "0", "BYTE"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "1", "BYTE"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "3", "BYTE"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "100", "BYTE"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "2", "2", "BYTE"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "2", "3", "BYTE"}));
  EXPECT_EQ(32, CheckedInt({"bitpos", "a", "1", "-1", "-1", "BYTE"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "a", "1", "-2", "-1", "BYTE"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "-1", "-2", "BYTE"}));

  // Find set bits using "BIT"
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "0", "BIT"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "0", "1", "BIT"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "21", "BIT"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "21", "21", "BIT"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "21", "100", "BIT"}));
  EXPECT_EQ(21, CheckedInt({"bitpos", "a", "1", "0", "100", "BIT"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "-1", "-1", "BIT"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "a", "1", "-4", "-1", "BIT"}));
  EXPECT_EQ(35, CheckedInt({"bitpos", "a", "1", "-5", "-1", "BIT"}));
  EXPECT_EQ(34, CheckedInt({"bitpos", "a", "1", "-6", "-1", "BIT"}));

  // Make sure we behave like Redis does when looking for clear bits in an all-set string.
  ASSERT_EQ(Run({"set", "b", "\xff\xff\xff"_b}), "OK");
  EXPECT_EQ(24, CheckedInt({"bitpos", "b", "0"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "b", "0", "0"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "b", "0", "1"}));
  EXPECT_EQ(24, CheckedInt({"bitpos", "b", "0", "2"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "b", "0", "3"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "b", "0", "0", "1"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "b", "0", "0", "1", "BYTE"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "b", "0", "0", "3"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "b", "0", "0", "3", "BYTE"}));

  ASSERT_EQ(Run({"set", "empty", ""_b}), "OK");
  EXPECT_EQ(-1, CheckedInt({"bitpos", "empty", "0"}));
  EXPECT_EQ(-1, CheckedInt({"bitpos", "empty", "0", "1"}));

  // Non-existent key should be treated like padded with zeros string.
  EXPECT_EQ(-1, CheckedInt({"bitpos", "d", "1"}));
  EXPECT_EQ(0, CheckedInt({"bitpos", "d", "0"}));

  // Make sure we accept only 0 and 1 for the bit mode arguement.
  const auto argument_must_be_0_or_1_error = ErrArg("ERR The bit argument must be 1 or 0");
  ASSERT_THAT(Run({"bitpos", "d", "2"}), argument_must_be_0_or_1_error);
  ASSERT_THAT(Run({"bitpos", "d", "42"}), argument_must_be_0_or_1_error);
  ASSERT_THAT(Run({"bitpos", "d", "-1"}), argument_must_be_0_or_1_error);
}

TEST_F(BitOpsFamilyTest, BitFieldParsing) {
  const auto syntax_error = ErrArg("ERR syntax error");
  // Parsing Errors
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0", "0", "55"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0", "0", "get", "u1"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u1"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u1", "0"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "0", "15"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "get"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0", "0", "set"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "overflow"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "nonsense"}), syntax_error);

  // Range errors
  auto expected_error = ErrArg(
      "ERR invalid bitfield type. use something like i16 u8. note that u64 is not supported but "
      "i64 is.");

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u0", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u0", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u64", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u65", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i65", "0", "0"}), expected_error);

  expected_error = ErrArg("BITFIELD_RO only supports the GET subcommand");
  ASSERT_THAT(Run({"bitfield_ro", "foo", "set", "u1", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield_ro", "foo", "incrby", "i64", "0", "15"}), expected_error);
}

TEST_F(BitOpsFamilyTest, BitFieldCreate) {
  // check that SET, INCR create the key when it does not exist
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "0"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u1", "1", "1"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "1"}), IntArg(1));
}

TEST_F(BitOpsFamilyTest, BitFieldOverflowUnderflow) {
  Run({"bitfield", "foo", "set", "u2", "0", "2"});

  // unsigned 1bit
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u1", "0", "2"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "0"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u1", "1", "2"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "1"}), IntArg(0));

  // unsigned 63bit
  int64_t max = std::numeric_limits<int64_t>::max();
  Run({"bitfield", "foo", "set", "i64", "0", StrCat(max)});
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i64", "0", "1"}), IntArg(-max - 1));

  // signed 1 bit
  Run({"bitfield", "foo", "set", "i1", "0", "-2"});
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i1", "0"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i1", "0", "-1"}), IntArg(-1));
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i1", "0", "-1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i1", "0", "-3"}), IntArg(-1));

  int64_t min = std::numeric_limits<int64_t>::min();
  Run({"bitfield", "foo", "set", "i8", "0", StrCat(min)});
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "0"}), IntArg(0));

  // signed 64 bit
  Run({"bitfield", "foo", "set", "i64", "0", StrCat(min)});
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i64", "0", "-1"}), IntArg(max));

  // overflow sat
  // unsigned 8 bit
  Run({"bitfield", "foo", "set", "u1", "0", "0"});
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "u8", "0", "300"}), IntArg(255));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "u8", "0", "10"}), IntArg(255));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "0"}), IntArg(255));

  // unsigned 63 bit
  Run({"bitfield", "foo", "set", "u63", "0", "0"});
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "set", "u63", "0", StrCat(max)}),
              IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "u63", "0", "10"}), IntArg(max));

  // signed 8 bit
  Run({"bitfield", "foo", "set", "u8", "0", "0"});
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "set", "i8", "0", "300"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "i8", "0", "-127"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "i8", "0", "-255"}),
              IntArg(-128));

  // signed 64 bit
  Run({"bitfield", "foo", "set", "i64", "0", "0"});
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "set", "i64", "0", StrCat(max)}),
              IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "i64", "0", "100"}),
              IntArg(max));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i64", "0"}), IntArg(max));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "set", "i64", "0", StrCat(min)}),
              IntArg(max));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "sat", "incrby", "i64", "0", "-100"}),
              IntArg(min));

  // overflow fail
  // unsigned
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "set", "u8", "0", "300"}),
              ArgType(RespExpr::Type::NIL));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "incrby", "u1", "0", "10"}),
              ArgType(RespExpr::Type::NIL));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "incrby", "u1", "0", "-10"}),
              ArgType(RespExpr::Type::NIL));

  // signed
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "incrby", "i8", "0", "300"}),
              ArgType(RespExpr::Type::NIL));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "incrby", "i1", "0", "10"}),
              ArgType(RespExpr::Type::NIL));
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "incrby", "i1", "0", "-10"}),
              ArgType(RespExpr::Type::NIL));

  // stickiness of overflow among operations in a chain
  ASSERT_THAT(Run({"bitfield", "foo", "overflow", "fail", "set", "u8", "0", "300", "set", "u1", "0",
                   "400"}),
              RespArray(ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL))));
}

TEST_F(BitOpsFamilyTest, BitFieldOperations) {
  // alligned offset reads/writes unsigned
  Run({"bitfield", "foo", "set", "u32", "0", "0"});
  // Set the bit battern 01111000 00000001 00000001 00001010
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "0", "120"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "0"}), IntArg(120));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "8", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "8"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "16", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "16"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "24", "10"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "24"}), IntArg(10));

  ASSERT_THAT(Run({"bitfield", "foo", "get", "u32", "0"}), IntArg(2013331722));

  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u8", "0", "120"}), IntArg(240));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "0"}), IntArg(240));

  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "u16", "0", "120"}), IntArg(61561));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u16", "0"}), IntArg(61561));

  // alligned offset reads/writes signed
  Run({"bitfield", "foo", "set", "u32", "0", "0"});
  // Set the bit battern 10001000 11111111 11111111 11110110
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "0", "-120"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "0"}), IntArg(-120));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "8", "-1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "8"}), IntArg(-1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "16", "-1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "16"}), IntArg(-1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "24", "-10"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "24"}), IntArg(-10));

  ASSERT_THAT(Run({"bitfield", "foo", "get", "i32", "0"}), IntArg(-1996488714));

  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i8", "0", "-8"}), IntArg(-128));

  // nonalligned offset reads/writes unsigned
  Run({"bitfield", "foo", "set", "i64", "0", "0"});
  // Set the bit battern 00000000 10000000 10000000 10000000 10000000
  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "1", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "1"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "9", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "9"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "17", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "17"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "u8", "25", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "25"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "0"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "8"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "16"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "24"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "32"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "get", "u33", "0"}), IntArg(16843009));

  // nonalligned offset reads/writes signed
  Run({"bitfield", "foo", "set", "i64", "0", "0"});
  // Set the bit battern 1111111 11111111 0000000 000000001
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "1", "-1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "1"}), IntArg(-1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "9", "-1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "9"}), IntArg(-1));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "17", "0"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "17"}), IntArg(0));

  ASSERT_THAT(Run({"bitfield", "foo", "set", "i8", "25", "1"}), IntArg(0));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i8", "25"}), IntArg(1));

  ASSERT_THAT(Run({"bitfield", "foo", "get", "i32", "1"}), IntArg(-65535));

  // chaining
  Run({
      "bitfield", "foo", "set", "u1", "0", "1", "set", "u1", "1", "1", "set", "u1",
      "2",        "1",   "set", "u1", "3", "1", "set", "u1", "4", "1", "set", "u1",
      "5",        "1",   "set", "u1", "6", "1", "set", "u1", "7", "1",
  });

  ASSERT_THAT(Run({"bitfield", "foo", "get", "u8", "0"}), IntArg(255));

  ASSERT_THAT(Run({
                  "bitfield",
                  "foo",
                  "set",
                  "u1",
                  "0",
                  "0",
                  "incrby",
                  "u1",
                  "0",
                  "1",
                  "get",
                  "u1",
                  "0",
              }),
              RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));

  // check for positional offsets
  Run({"bitfield", "foo", "set", "u8", "#0", "1", "set", "u8", "#1", "1", "set", "u8", "#2", "1"});

  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "7"}), IntArg(1));
  ASSERT_THAT(Run({"bitfield", "foo", "get", "u1", "15"}), IntArg(1));
}

TEST_F(BitOpsFamilyTest, BitFieldLargeOffset) {
  Run({"set", "foo", "bar"});

  auto resp = Run({"bitfield", "foo", "get", "u32", "0", "overflow", "fail", "incrby", "u32", "0",
                   "4294967295"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1650553344), ArgType(RespExpr::NIL))));

  resp = Run({"strlen", "foo"});
  EXPECT_THAT(resp, 4);

  resp = Run({"get", "foo"});
  EXPECT_THAT(ToSV(resp.GetBuf()), Eq(std::string_view("bar\0", 4)));

  resp = Run({"bitfield", "foo", "get", "u32", "4294967295"});
  EXPECT_THAT(resp, 0);
}

TEST_F(BitOpsFamilyTest, BitFieldIssue5237_SetOverflowSat) {
  Run({"set", "key:bitfield_set", "\xff\xf0\x00"});
  auto resp = Run({"bitfield", "key:bitfield_set", "overflow", "sat", "set", "i4", "0", "8", "set",
                   "i4", "4", "7"});

  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(-1), IntArg(-1))));
}

TEST_F(BitOpsFamilyTest, BitFieldIssue5237_IncrbyCorrectness) {
  Run({"set", "key:bitfield_incr", "\xff\xf0\x00"});
  auto resp = Run(
      {"bitfield", "key:bitfield_incr", "incrby", "u8", "0", "85", "incrby", "u8", "16", "170"});

  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(84), IntArg(170))));
}

TEST_F(BitOpsFamilyTest, BitFieldIssue5237_InvalidTypeUppercase_Set) {
  auto expected_error = ErrArg(
      "ERR invalid bitfield type. use something like i16 u8. note that u64 is not supported but "
      "i64 is.");

  ASSERT_THAT(Run({"bitfield", "key:bitfield_set:wrong:args", "set", "I8", "0", "0"}),
              expected_error);
}

TEST_F(BitOpsFamilyTest, BitFieldIssue5237_InvalidTypeUppercase_Get) {
  auto expected_error = ErrArg(
      "ERR invalid bitfield type. use something like i16 u8. note that u64 is not supported but "
      "i64 is.");

  ASSERT_THAT(Run({"bitfield", "key:bitfield_get:wrong:args", "get", "I8", "0"}), expected_error);
}

TEST_F(BitOpsFamilyTest, BitFieldAdditionalWrongArguments) {
  // Additional tests to match Python test coverage
  const auto syntax_error = ErrArg("ERR syntax error");
  auto expected_error = ErrArg(
      "ERR invalid bitfield type. use something like i16 u8. note that u64 is not supported but "
      "i64 is.");

  // Additional invalid encoding types (from Python tests)
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i-42", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i5?", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i-42", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i5?", "0", "0"}), expected_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i0", "0", "0"}), expected_error);

  // Test negative offsets (should be syntax error)
  ASSERT_THAT(Run({"bitfield", "foo", "get", "i16", "-1"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i16", "-1", "0"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i16", "-1", "1"}), syntax_error);

  // Test invalid values for SET and INCRBY (generates syntax error during parsing)
  ASSERT_THAT(Run({"bitfield", "foo", "set", "i16", "0", "foo"}), syntax_error);
  ASSERT_THAT(Run({"bitfield", "foo", "incrby", "i16", "0", "bar"}), syntax_error);
}

TEST_F(BitOpsFamilyTest, BitFieldNoOps) {
  EXPECT_THAT(Run({"BITFIELD", "k", "OVERFLOW", "SAT"}), RespArray(ElementsAre()));
  EXPECT_THAT(Run({"BITFIELD", "k"}), RespArray(ElementsAre()));
  EXPECT_THAT(Run({"BITFIELD_RO", "k", "OVERFLOW", "SAT"}), RespArray(ElementsAre()));
  EXPECT_THAT(Run({"BITFIELD_RO", "k"}), RespArray(ElementsAre()));
}

}  // end of namespace dfly


================================================
FILE: src/server/blocking_controller.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/blocking_controller.h"

#include <absl/container/inlined_vector.h>

#include <boost/smart_ptr/intrusive_ptr.hpp>

#include "base/logging.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"
#include "server/transaction.h"

namespace dfly {

using namespace std;

struct WatchItem {
  Transaction* trans;
  KeyReadyChecker key_ready_checker;

  Transaction* get() const {
    return trans;
  }

  WatchItem(Transaction* t, KeyReadyChecker krc) : trans(t), key_ready_checker(std::move(krc)) {
  }
};

struct BlockingController::WatchQueue {
  deque<WatchItem> items;

  // Updated  by both coordinator and shard threads but at different times.
  enum State { SUSPENDED, ACTIVE } state = SUSPENDED;

  auto Find(Transaction* tx) const {
    return find_if(items.begin(), items.end(),
                   [tx](const WatchItem& wi) { return wi.get() == tx; });
  }
};

// Watch state per db.
struct BlockingController::DbWatchTable {
  // Watch queues per key
  absl::flat_hash_map<std::string, std::unique_ptr<WatchQueue>> queue_map;

  // awakened keys point to blocked keys that can potentially be unblocked.
  absl::flat_hash_set<std::string> awakened_keys;

  // returns true if awake event was added.
  // Requires that the key queue be in the required state.
  bool AddAwakeEvent(string_view key);

  // Returns true if awakened tx was removed from the queue.
  bool UnwatchTx(string_view key, Transaction* tx);
};

bool BlockingController::DbWatchTable::UnwatchTx(string_view key, Transaction* tx) {
  auto wq_it = queue_map.find(key);

  // With multiple same keys we may have misses because the first iteration
  // on the same key could remove the queue.
  if (wq_it == queue_map.end())
    return false;

  WatchQueue* wq = wq_it->second.get();
  DCHECK(!wq->items.empty());

  bool res = false;
  if (wq->state == WatchQueue::ACTIVE && wq->items.front().get() == tx) {
    wq->items.pop_front();

    // We suspend the queue and add keys to re-verification.
    // If they are still present, this queue will be reactivated below.
    wq->state = WatchQueue::SUSPENDED;

    if (!wq->items.empty())
      awakened_keys.insert(wq_it->first);  // send for further validation.
    res = true;
  } else {
    // tx can be is_awakened == true because of some other key and this queue would be
    // in suspended and we still need to clean it up.
    // the suspended item does not have to be the first one in the queue.
    // This shard has not been awakened and in case this transaction in the queue
    // we must clean it up.
    if (auto it = wq->Find(tx); it != wq->items.end()) {
      wq->items.erase(it);
    }
  }

  if (wq->items.empty()) {
    DVLOG(1) << "queue_map.erase";
    awakened_keys.erase(wq_it->first);
    queue_map.erase(wq_it);
  }
  return res;
}

BlockingController::BlockingController(EngineShard* owner, Namespace* ns) : owner_(owner), ns_(ns) {
}

BlockingController::~BlockingController() {
}

bool BlockingController::DbWatchTable::AddAwakeEvent(string_view key) {
  auto it = queue_map.find(key);

  if (it == queue_map.end() || it->second->state != WatchQueue::SUSPENDED)
    return false;  /// nobody watches this key or state does not match.

  return awakened_keys.insert(it->first).second;
}

// Removes tx from its watch queues if tx appears there.
void BlockingController::RemovedWatched(Keys keys, Transaction* tx) {
  DCHECK(tx);
  VLOG(1) << "FinalizeBlocking [" << owner_->shard_id() << "]" << tx->DebugId();

  bool removed = awakened_transactions_.erase(tx);
  DCHECK(!removed || (tx->DEBUG_GetLocalMask(owner_->shard_id()) & Transaction::AWAKED_Q));

  auto dbit = watched_dbs_.find(tx->GetDbIndex());

  // Can happen if it was the only transaction in the queue and it was notified and removed.
  if (dbit == watched_dbs_.end())
    return;

  DbWatchTable& wt = *dbit->second;

  // Add keys of processed transaction so we could awake the next one in the queue
  // in case those keys still exist.
  for (string_view key : keys) {
    bool removed_awakened = wt.UnwatchTx(key, tx);
    CHECK(!removed_awakened || removed)
        << tx->DebugId() << " " << key << " " << tx->DEBUG_GetLocalMask(owner_->shard_id());
  }

  if (wt.queue_map.empty()) {
    watched_dbs_.erase(dbit);
  }
  awakened_indices_.emplace(tx->GetDbIndex());
}

// Runs on the shard thread.
void BlockingController::NotifyPending() {
  const Transaction* tx = owner_->GetContTx();
  CHECK(tx == nullptr) << tx->DebugId();

  DbContext context;
  context.ns = ns_;
  context.time_now_ms = GetCurrentTimeMs();

  for (DbIndex index : awakened_indices_) {
    auto dbit = watched_dbs_.find(index);
    if (dbit == watched_dbs_.end())
      continue;

    context.db_index = index;
    DbWatchTable& wt = *dbit->second;  // pointer stability due to node_hash_map
    for (string_view key : wt.awakened_keys) {
      DVLOG(1) << "Processing awakened key " << key;
      auto w_it = wt.queue_map.find(key);
      CHECK(w_it != wt.queue_map.end());

      WatchQueue* wq = w_it->second.get();
      NotifyWatchQueue(key, wq, context);
      if (wq->items.empty())
        wt.queue_map.erase(w_it);
    }
    wt.awakened_keys.clear();

    if (wt.queue_map.empty()) {
      watched_dbs_.erase(dbit);
    }
  }
  awakened_indices_.clear();
}

void BlockingController::AddWatched(Keys watch_keys, KeyReadyChecker krc, Transaction* trans) {
  auto [dbit, added] = watched_dbs_.emplace(trans->GetDbIndex(), nullptr);
  if (added) {
    dbit->second = make_unique<DbWatchTable>();
  }

  DbWatchTable& wt = *dbit->second;

  for (auto key : watch_keys) {
    auto [res, inserted] = wt.queue_map.emplace(key, nullptr);
    if (inserted)
      res->second = make_unique<WatchQueue>();

    if (!res->second->items.empty()) {
      Transaction* last = res->second->items.back().get();
      DCHECK_GT(last->GetUseCount(), 0u);

      // Duplicate keys case. We push only once per key.
      if (last == trans)
        continue;
    }
    DVLOG(2) << "Emplace " << trans->DebugId() << " to watch " << key;
    res->second->items.emplace_back(trans, krc);
  }
}

// Called from commands like lpush.
void BlockingController::Awaken(DbIndex db_index, string_view db_key) {
  auto it = watched_dbs_.find(db_index);
  if (it == watched_dbs_.end())
    return;

  DbWatchTable& wt = *it->second;
  DCHECK(!wt.queue_map.empty());

  if (wt.AddAwakeEvent(db_key)) {
    VLOG(1) << "Touch: db(" << db_index << ") " << db_key;
    awakened_indices_.insert(db_index);
  }
}

// Marks the queue as active and notifies the first transaction in the queue.
void BlockingController::NotifyWatchQueue(std::string_view key, WatchQueue* wq,
                                          const DbContext& context) {
  DCHECK_EQ(wq->state, WatchQueue::SUSPENDED);

  auto& queue = wq->items;
  ShardId sid = owner_->shard_id();

  // In the most cases we shouldn't have skipped elements at all
  absl::InlinedVector<dfly::WatchItem, 4> skipped;
  while (!queue.empty()) {
    auto& wi = queue.front();
    Transaction* head = wi.get();
    // We check may the transaction be notified otherwise move it to the end of the queue
    if (wi.key_ready_checker(owner_, context, head, key)) {
      DVLOG(2) << "WQ-Pop " << head->DebugId() << " from key " << key << " committed txid "
               << owner_->committed_txid();
      if (head->NotifySuspended(sid, key)) {
        wq->state = WatchQueue::ACTIVE;
        // We deliberately keep the notified transaction in the queue to know which queue
        // must handled when this transaction finished.
        awakened_transactions_.insert(head);
        break;
      }
    } else {
      skipped.push_back(std::move(wi));
    }

    queue.pop_front();
  }
  std::move(skipped.begin(), skipped.end(), std::back_inserter(queue));
}

size_t BlockingController::NumWatched(DbIndex db_indx) const {
  auto it = watched_dbs_.find(db_indx);
  if (it == watched_dbs_.end())
    return 0;

  return it->second->queue_map.size();
}

vector<string> BlockingController::GetWatchedKeys(DbIndex db_indx) const {
  vector<string> res;
  auto it = watched_dbs_.find(db_indx);

  if (it != watched_dbs_.end()) {
    for (const auto& k_v : it->second->queue_map) {
      res.push_back(k_v.first);
    }
  }

  return res;
}

}  // namespace dfly


================================================
FILE: src/server/blocking_controller.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/btree_map.h>
#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include "base/string_view_sso.h"
#include "server/tx_base.h"

namespace dfly {

// Used for tracking keys of blocking transactions and properly notifying them.
// First, keys are marked as watched and associated with an owner transaction. A mutating
// transaction marks them as touched, and once it concludes, the watching transactions are notified.
class BlockingController {
 public:
  explicit BlockingController(EngineShard* owner, Namespace* ns);
  ~BlockingController();

  using Keys = ShardArgs;

  bool HasAwakedTransaction() const {
    return !awakened_transactions_.empty();
  }

  const auto& awakened_transactions() const {
    return awakened_transactions_;
  }

  // Associate given keys with transaction, checked via the krc checker
  void AddWatched(Keys watch_keys, KeyReadyChecker krc, Transaction* me);

  // Remove transaction from watching these keys
  void RemovedWatched(Keys keys, Transaction* tx);

  // Mark given key as awakened. Called by commands mutating this key.
  void Awaken(DbIndex db_index, std::string_view key);

  // Notify transactions of awakened keys
  void NotifyPending();

  // Used in tests and debugging functions.
  size_t NumWatched(DbIndex db_indx) const;
  std::vector<std::string> GetWatchedKeys(DbIndex db_indx) const;

 private:
  struct WatchQueue;
  struct DbWatchTable;

  void NotifyWatchQueue(std::string_view key, WatchQueue* wqm, const DbContext& context);

  EngineShard* owner_;
  Namespace* ns_;

  // TODO: check if unique_ptr indirection is required
  absl::flat_hash_map<DbIndex, std::unique_ptr<DbWatchTable>> watched_dbs_;  // watched keys
  absl::flat_hash_set<DbIndex> awakened_indices_;  // watched_dbs_ with awakened keys

  // Transactions that got awakened with NotifySuspended
  // TODO: Used only for one DCHECK
  absl::flat_hash_set<Transaction*> awakened_transactions_;
};
}  // namespace dfly


================================================
FILE: src/server/blocking_controller_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/blocking_controller.h"

#include <gmock/gmock.h>

#include "base/logging.h"
#include "facade/facade_stats.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"
#include "server/server_state.h"
#include "server/transaction.h"
#include "util/fibers/pool.h"

namespace dfly {

using namespace util;
using namespace std;
using namespace std::chrono;
using namespace testing;

constexpr size_t kNumThreads = 3;

class BlockingControllerTest : public Test {
 protected:
  BlockingControllerTest() : cid_("blpop", 0, -3, 1, -2, acl::NONE) {
  }
  void SetUp() override;
  void TearDown() override;

  static void SetUpTestSuite() {
    ServerState::Init(kNumThreads, kNumThreads, nullptr, nullptr);
    facade::tl_facade_stats = new facade::FacadeStats;
  }

  std::unique_ptr<ProactorPool> pp_;
  boost::intrusive_ptr<Transaction> trans_;
  CommandId cid_;
  StringVec str_vec_;
  CmdArgVec arg_vec_;
};

void BlockingControllerTest::SetUp() {
  pp_.reset(fb2::Pool::Epoll(kNumThreads));
  pp_->Run();
  pp_->AwaitBrief([](unsigned index, ProactorBase* p) {
    ServerState::Init(index, kNumThreads, nullptr, nullptr);
    if (facade::tl_facade_stats == nullptr) {
      facade::tl_facade_stats = new facade::FacadeStats;
    }
  });

  shard_set = new EngineShardSet(pp_.get());
  shard_set->Init(kNumThreads, nullptr);

  trans_.reset(new Transaction{&cid_});

  str_vec_.assign({"x", "z", "0"});
  for (auto& s : str_vec_) {
    arg_vec_.emplace_back(s);
  }

  trans_->InitByArgs(&namespaces->GetDefaultNamespace(), 0, {arg_vec_.data(), arg_vec_.size()});
  CHECK_EQ(0u, Shard("x", shard_set->size()));
  CHECK_EQ(2u, Shard("z", shard_set->size()));

  const TestInfo* const test_info = UnitTest::GetInstance()->current_test_info();
  LOG(INFO) << "Starting " << test_info->name();
}

void BlockingControllerTest::TearDown() {
  shard_set->PreShutdown();
  shard_set->Shutdown();
  delete shard_set;

  pp_->Stop();
  pp_.reset();
}

TEST_F(BlockingControllerTest, Basic) {
  trans_->ScheduleSingleHop([&](Transaction* t, EngineShard* shard) {
    BlockingController bc(shard, &namespaces->GetDefaultNamespace());
    auto keys = t->GetShardArgs(shard->shard_id());
    bc.AddWatched(
        keys, [](auto...) { return true; }, t);
    EXPECT_EQ(1, bc.NumWatched(0));

    bc.RemovedWatched(keys, t);
    EXPECT_EQ(0, bc.NumWatched(0));
    return OpStatus::OK;
  });
}

TEST_F(BlockingControllerTest, Timeout) {
  time_point tp = steady_clock::now() + chrono::milliseconds(10);
  bool blocked;
  bool paused;

  facade::OpStatus status = trans_->WaitOnWatch(
      tp, Transaction::kShardArgs, [](auto...) { return true; }, &blocked, &paused);

  EXPECT_EQ(status, facade::OpStatus::TIMED_OUT);
  unsigned num_watched = shard_set->Await(

      0, [&] {
        return namespaces->GetDefaultNamespace()
            .GetBlockingController(EngineShard::tlocal()->shard_id())
            ->NumWatched(0);
      });

  EXPECT_EQ(0, num_watched);
  trans_.reset();
}

}  // namespace dfly


================================================
FILE: src/server/bloom_family.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/bloom.h"
#include "facade/cmd_arg_parser.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/transaction.h"

namespace dfly {

using namespace facade;
using namespace std;

namespace {

constexpr double kDefaultFpProb = 0.01;
constexpr double kDefaultGrowFactor = 2;
struct SbfParams {
  uint32_t init_capacity;
  double error;
  double grow_factor = kDefaultGrowFactor;

  bool ok() const {
    return error > 0 and error < 0.5;
  }
};

using AddResult = absl::InlinedVector<OpResult<bool>, 4>;
using ExistsResult = absl::InlinedVector<bool, 4>;

OpStatus OpReserve(const SbfParams& params, const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_SBF);
  RETURN_ON_BAD_STATUS(op_res);

  if (!op_res->is_new)
    return OpStatus::KEY_EXISTS;

  PrimeValue& pv = op_res->it->second;
  pv.SetSBF(params.init_capacity, params.error, params.grow_factor);

  return OpStatus::OK;
}

// Returns true, if item was added, false if it was already "present".
OpResult<AddResult> OpAdd(const OpArgs& op_args, string_view key, CmdArgList items) {
  auto& db_slice = op_args.GetDbSlice();

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_SBF);
  RETURN_ON_BAD_STATUS(op_res);

  PrimeValue& pv = op_res->it->second;

  if (op_res->is_new) {
    pv.SetSBF(0, kDefaultFpProb, kDefaultGrowFactor);
  }

  SBF* sbf = pv.GetSBF();
  AddResult result(items.size());
  for (size_t i = 0; i < items.size(); ++i) {
    result[i] = sbf->Add(ToSV(items[i]));
  }
  return result;
}

OpResult<ExistsResult> OpExists(const OpArgs& op_args, string_view key, CmdArgList items) {
  auto& db_slice = op_args.GetDbSlice();
  OpResult op_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_SBF);
  if (!op_res)
    return op_res.status();
  auto it = (*op_res);

  const SBF* sbf = it->second.GetSBF();
  ExistsResult result(items.size());

  for (size_t i = 0; i < items.size(); ++i) {
    result[i] = sbf->Exists(ToSV(items[i]));
  }

  return result;
}

void CmdReserve(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  string_view key = parser.Next();
  SbfParams params;

  tie(params.error, params.init_capacity) = parser.Next<double, uint32_t>();
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (parser.TakeError())
    return rb->SendError(kSyntaxErr);

  if (!params.ok())
    return rb->SendError("error rate is out of range", kSyntaxErrType);

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpReserve(params, t->GetOpArgs(shard), key);
  };

  OpStatus res = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (res == OpStatus::KEY_EXISTS) {
    return rb->SendError("item exists");
  }
  return rb->SendError(res);
}

void CmdAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAdd(t->GetOpArgs(shard), key, args);
  };

  OpResult res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  OpStatus status = res.status();
  if (res) {
    if (res->front())
      return cmd_cntx->SendLong(*res->front());
    else
      status = res->front().status();
  }

  return cmd_cntx->SendError(status);
}

void CmdExists(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);
  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExists(t->GetOpArgs(shard), key, args);
  };

  OpResult res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  return cmd_cntx->SendLong(res ? res->front() : 0);
}

void CmdMAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAdd(t->GetOpArgs(shard), key, args);
  };

  RedisReplyBuilder* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  OpResult res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!res) {
    return rb->SendError(res.status());
  }
  const AddResult& add_res = *res;

  RedisReplyBuilder::ArrayScope scope{rb, add_res.size()};
  for (const OpResult<bool>& val : add_res) {
    if (val) {
      rb->SendLong(*val);
    } else {
      rb->SendError(val.status());
    }
  }
}

void CmdMExists(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExists(t->GetOpArgs(shard), key, args);
  };

  OpResult res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  RedisReplyBuilder::ArrayScope scope{rb, args.size()};
  for (size_t i = 0; i < args.size(); ++i) {
    rb->SendLong(res ? res->at(i) : 0);
  }
}

}  // namespace

using CI = CommandId;

#define HFUNC(x) SetHandler(&Cmd##x)

void RegisterBloomFamily(CommandRegistry* registry) {
  registry->StartFamily();

  *registry << CI{"BF.RESERVE", CO::JOURNALED | CO::DENYOOM | CO::FAST, -4, 1, 1, acl::BLOOM}.HFUNC(
                   Reserve)
            << CI{"BF.ADD", CO::JOURNALED | CO::DENYOOM | CO::FAST, 3, 1, 1, acl::BLOOM}.HFUNC(Add)
            << CI{"BF.MADD", CO::JOURNALED | CO::DENYOOM | CO::FAST, -3, 1, 1, acl::BLOOM}.HFUNC(
                   MAdd)
            << CI{"BF.EXISTS", CO::READONLY | CO::FAST, 3, 1, 1, acl::BLOOM}.HFUNC(Exists)
            << CI{"BF.MEXISTS", CO::READONLY | CO::FAST, -3, 1, 1, acl::BLOOM}.HFUNC(MExists);
};

}  // namespace dfly


================================================
FILE: src/server/bloom_family_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/facade_test.h"
#include "server/test_utils.h"

namespace dfly {

using testing::ElementsAre;

class BloomFamilyTest : public BaseFamilyTest {
 protected:
};

TEST_F(BloomFamilyTest, Basic) {
  auto resp = Run({"bf.reserve", "b1", "0.1", "32"});
  EXPECT_EQ(resp, "OK");
  EXPECT_EQ(Run({"type", "b1"}), "MBbloom--");
  EXPECT_THAT(Run({"bf.add", "b1", "a"}), IntArg(1));
  EXPECT_THAT(Run({"bf.add", "b1", "b"}), IntArg(1));
  EXPECT_THAT(Run({"bf.add", "b1", "b"}), IntArg(0));
  EXPECT_THAT(Run({"bf.add", "b2", "b"}), IntArg(1));
  EXPECT_EQ(Run({"type", "b2"}), "MBbloom--");

  EXPECT_THAT(Run({"bf.exists", "b2", "c"}), IntArg(0));
  EXPECT_THAT(Run({"bf.exists", "b3", "c"}), IntArg(0));
  EXPECT_THAT(Run({"bf.exists", "b2", "b"}), IntArg(1));
  Run({"set", "str", "foo"});
  EXPECT_THAT(Run({"bf.exists", "str", "b"}), IntArg(0));
}

TEST_F(BloomFamilyTest, Multiple) {
  auto resp = Run({"bf.mexists", "bf1", "a", "b", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), IntArg(0), IntArg(0))));

  Run({"set", "str", "foo"});
  resp = Run({"bf.mexists", "str", "a", "b", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), IntArg(0), IntArg(0))));

  resp = Run({"bf.madd", "str", "a"});
  EXPECT_THAT(resp, ErrArg("WRONG"));

  resp = Run({"bf.madd", "bf1", "a", "b", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
  resp = Run({"bf.madd", "bf1", "a", "b", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), IntArg(0), IntArg(0))));
  resp = Run({"bf.mexists", "bf1", "a", "b", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
}

}  // namespace dfly


================================================
FILE: src/server/channel_store.cc
================================================
#include "server/channel_store.h"

// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/container/fixed_array.h>

#include "base/logging.h"
#include "core/glob_matcher.h"
#include "facade/dragonfly_connection.h"
#include "server/cluster/slot_set.h"
#include "server/cluster_support.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/server_state.h"

namespace dfly {
using namespace std;

namespace {

// Build functor for sending messages to connection
auto BuildSender(string_view channel, facade::ArgRange messages, bool sharded = false,
                 bool unsubscribe = false) {
  absl::FixedArray<string_view, 1> views(messages.Size());
  size_t messages_size = accumulate(messages.begin(), messages.end(), 0,
                                    [](int sum, string_view str) { return sum + str.size(); });
  auto buf = shared_ptr<char[]>{new char[channel.size() + messages_size]};
  {
    memcpy(buf.get(), channel.data(), channel.size());
    char* ptr = buf.get() + channel.size();

    size_t i = 0;
    for (string_view message : messages) {
      memcpy(ptr, message.data(), message.size());
      views[i++] = {ptr, message.size()};
      ptr += message.size();
    }
  }

  return [channel, buf = std::move(buf), views = std::move(views), sharded, unsubscribe](
             facade::Connection* conn, string pattern) {
    string_view channel_view{buf.get(), channel.size()};
    for (std::string_view message_view : views) {
      conn->SendPubMessageAsync(
          {std::move(pattern), buf, channel_view, message_view, sharded, unsubscribe});
    }
  };
}

}  // namespace

bool ChannelStore::Subscriber::ByThread(const Subscriber& lhs, const Subscriber& rhs) {
  return ByThreadId(lhs, rhs.LastKnownThreadId());
}

bool ChannelStore::Subscriber::ByThreadId(const Subscriber& lhs, const unsigned thread) {
  return lhs.LastKnownThreadId() < thread;
}

ChannelStore::UpdatablePointer::UpdatablePointer(const UpdatablePointer& other) {
  ptr.store(other.ptr.load(memory_order_relaxed), memory_order_relaxed);
}

ChannelStore::SubscribeMap* ChannelStore::UpdatablePointer::Get() const {
  return ptr.load(memory_order_acquire);  // sync pointed memory
}

void ChannelStore::UpdatablePointer::Set(ChannelStore::SubscribeMap* sm) {
  ptr.store(sm, memory_order_release);  // sync pointed memory
}

ChannelStore::SubscribeMap* ChannelStore::UpdatablePointer::operator->() const {
  return Get();
}

const ChannelStore::SubscribeMap& ChannelStore::UpdatablePointer::operator*() const {
  return *Get();
}

void ChannelStore::ChannelMap::Add(string_view key, ConnectionContext* me, uint32_t thread_id) {
  auto it = find(key);
  if (it == end())
    it = emplace(key, new SubscribeMap{}).first;
  it->second->emplace(me, thread_id);
}

void ChannelStore::ChannelMap::Remove(string_view key, ConnectionContext* me) {
  if (auto it = find(key); it != end()) {
    it->second->erase(me);
    if (it->second->empty())
      erase(it);
  }
}

void ChannelStore::ChannelMap::DeleteAll() {
  for (auto [k, ptr] : *this)
    delete ptr.Get();
}

ChannelStore::ChannelStore() : channels_{new ChannelMap{}}, patterns_{new ChannelMap{}} {
  control_block.most_recent = this;
}

ChannelStore::ChannelStore(ChannelMap* channels, ChannelMap* patterns)
    : channels_{channels}, patterns_{patterns} {
}

void ChannelStore::Destroy() {
  control_block.update_mu.lock();
  control_block.update_mu.unlock();

  auto* store = control_block.most_recent.load(memory_order_relaxed);
  for (auto* chan_map : {store->channels_, store->patterns_}) {
    chan_map->DeleteAll();
    delete chan_map;
  }
  delete control_block.most_recent;
}

ChannelStore::ControlBlock ChannelStore::control_block;

unsigned ChannelStore::SendMessages(std::string_view channel, facade::ArgRange messages,
                                    bool sharded) const {
  vector<Subscriber> subscribers = FetchSubscribers(channel);
  if (subscribers.empty())
    return 0;

  // Make sure none of the threads publish buffer limits is reached. We don't reserve memory ahead
  // and don't prevent the buffer from possibly filling, but the approach is good enough for
  // limiting fast producers. Most importantly, we can use DispatchBrief below as we block here
  int32_t last_thread = -1;

  for (auto& sub : subscribers) {
    int sub_thread = sub.LastKnownThreadId();
    DCHECK_LE(last_thread, sub_thread);
    if (last_thread == sub_thread)  // skip same thread
      continue;

    if (sub.IsExpired())
      continue;

    // Make sure the connection thread has enough memory budget to accept the message.
    // This is a heuristic and not entirely hermetic since the connection memory might
    // get filled again.
    facade::Connection::EnsureMemoryBudget(sub_thread);
    last_thread = sub_thread;
  }

  auto subscribers_ptr = make_shared<decltype(subscribers)>(std::move(subscribers));
  auto cb = [subscribers_ptr, send = BuildSender(channel, messages, sharded)](unsigned idx, auto*) {
    auto it = lower_bound(subscribers_ptr->begin(), subscribers_ptr->end(), idx,
                          ChannelStore::Subscriber::ByThreadId);
    while (it != subscribers_ptr->end() && it->LastKnownThreadId() == idx) {
      if (auto* ptr = it->Get(); ptr && ptr->cntx() != nullptr)
        send(ptr, it->pattern);
      it++;
    }
  };
  shard_set->pool()->DispatchBrief(std::move(cb));

  return subscribers_ptr->size();
}

vector<ChannelStore::Subscriber> ChannelStore::FetchSubscribers(string_view channel) const {
  vector<Subscriber> res;

  if (auto it = channels_->find(channel); it != channels_->end())
    Fill(*it->second, string{}, &res);

  for (const auto& [pat, subs] : *patterns_) {
    GlobMatcher matcher{pat, true};
    if (matcher.Matches(channel))
      Fill(*subs, pat, &res);
  }

  sort(res.begin(), res.end(), Subscriber::ByThread);
  return res;
}

void ChannelStore::Fill(const SubscribeMap& src, const string& pattern, vector<Subscriber>* out) {
  out->reserve(out->size() + src.size());
  for (const auto [cntx, thread_id] : src) {
    // `cntx` is expected to be valid as it unregisters itself from the channel_store before
    // closing.
    CHECK(cntx->conn_state.subscribe_info);
    Subscriber sub{cntx->conn()->Borrow(), pattern};
    out->push_back(std::move(sub));
  }
}

std::vector<string> ChannelStore::ListChannels(const string_view pattern) const {
  vector<string> res;
  GlobMatcher matcher{pattern, true};
  for (const auto& [channel, _] : *channels_) {
    if (pattern.empty() || matcher.Matches(channel))
      res.push_back(channel);
  }
  return res;
}

size_t ChannelStore::PatternCount() const {
  return patterns_->size();
}

void ChannelStore::UnsubscribeAfterClusterSlotMigration(const cluster::SlotSet& deleted_slots) {
  if (deleted_slots.Empty()) {
    return;
  }

  const uint32_t tid = util::ProactorBase::me()->GetPoolIndex();
  ChannelStoreUpdater csu(false, false, nullptr, tid);

  for (const auto& [channel, _] : *channels_) {
    auto channel_slot = KeySlot(channel);
    if (deleted_slots.Contains(channel_slot)) {
      csu.Record(channel);
    }
  }

  csu.ApplyAndUnsubscribe();
}

// TODO: Reuse common code with Send function
// TODO: Find proper solution to hacky `force_unsubscribe` flag or at least move logic out of io
void ChannelStore::UnsubscribeConnectionsFromDeletedSlots(const ChannelsSubMap& sub_map,
                                                          uint32_t idx) {
  for (const auto& [channel, subscribers] : sub_map) {
    // ignored by pub sub handler because should_unsubscribe is true
    std::string msg = "__ignore__";
    auto send = BuildSender(channel, {facade::ArgSlice{msg}}, false, true);

    auto it = lower_bound(subscribers.begin(), subscribers.end(), idx,
                          ChannelStore::Subscriber::ByThreadId);
    while (it != subscribers.end() && it->LastKnownThreadId() == idx) {
      // if ptr->cntx() is null, a connection might have closed or be in the process of closing
      if (auto* ptr = it->Get(); ptr && ptr->cntx() != nullptr) {
        DCHECK(it->pattern.empty());
        send(ptr, it->pattern);
      }
      ++it;
    }
  }
}

ChannelStoreUpdater::ChannelStoreUpdater(bool pattern, bool to_add, ConnectionContext* cntx,
                                         uint32_t thread_id)
    : pattern_{pattern}, to_add_{to_add}, cntx_{cntx}, thread_id_{thread_id} {
}

void ChannelStoreUpdater::Record(string_view key) {
  ops_.emplace_back(key);
}

pair<ChannelStore::ChannelMap*, bool> ChannelStoreUpdater::GetTargetMap(ChannelStore* store) {
  auto* target = pattern_ ? store->patterns_ : store->channels_;

  for (auto key : ops_) {
    auto it = target->find(key);
    DCHECK(it != target->end() || to_add_);
    // We need to make a copy, if we are going to add or delete new map slot.
    if ((to_add_ && it == target->end()) || (!to_add_ && it->second->size() == 1))
      return {new ChannelStore::ChannelMap{*target}, true};
  }

  return {target, false};
}

void ChannelStoreUpdater::Modify(ChannelMap* target, string_view key) {
  using SubscribeMap = ChannelStore::SubscribeMap;

  auto it = target->find(key);

  // New key, add new slot.
  if (to_add_ && it == target->end()) {
    target->emplace(key, new SubscribeMap{{cntx_, thread_id_}});
    return;
  }

  // Last entry for key, remove slot.
  if (!to_add_ && it->second->size() == 1) {
    DCHECK(it->second->begin()->first == cntx_);
    freelist_.push_back(it->second.Get());
    target->erase(it);
    return;
  }

  // RCU update existing SubscribeMap entry.
  DCHECK(!it->second->empty());
  auto* replacement = new SubscribeMap{*it->second};
  if (to_add_)
    replacement->emplace(cntx_, thread_id_);
  else
    replacement->erase(cntx_);

  // The pointer can still be in use, so delay freeing it
  // until the dispatch and update the slot atomically.
  freelist_.push_back(it->second.Get());
  it->second.Set(replacement);
}

void ChannelStoreUpdater::Apply() {
  // Wait for other updates to finish, lock the control block and update store pointer.
  auto& cb = ChannelStore::control_block;
  cb.update_mu.lock();
  auto* store = cb.most_recent.load(memory_order_relaxed);

  // Get target map (copied if needed) and apply operations.
  auto [target, copied] = GetTargetMap(store);
  for (auto key : ops_)
    Modify(target, key);

  // Prepare replacement.
  auto* replacement = store;
  if (copied) {
    auto* new_chans = pattern_ ? store->channels_ : target;
    auto* new_patterns = pattern_ ? target : store->patterns_;
    replacement = new ChannelStore{new_chans, new_patterns};
  }

  // Update control block and unlock it.
  cb.most_recent.store(replacement, memory_order_relaxed);
  cb.update_mu.unlock();

  // Update thread local references. Readers fetch subscribers via FetchSubscribers,
  // which runs without preemption, and store references to them in self container Subscriber
  // structs. This means that any point on the other thread is safe to update the channel store.
  // Regardless of whether we need to replace, we dispatch to make sure all
  // queued SubscribeMaps in the freelist are no longer in use.
  shard_set->pool()->AwaitBrief([](unsigned idx, util::ProactorBase*) {
    ServerState::tlocal()->UpdateChannelStore(
        // Do not use memory_order_relaxed, we need to fetch the latest value of
        // the control block
        ChannelStore::control_block.most_recent.load(std::memory_order_seq_cst));
  });

  // Delete previous map and channel store.
  if (copied) {
    delete (pattern_ ? store->patterns_ : store->channels_);
    delete store;
  }

  for (auto ptr : freelist_)
    delete ptr;
}

void ChannelStoreUpdater::ApplyAndUnsubscribe() {
  DCHECK(to_add_ == false);
  DCHECK(pattern_ == false);
  DCHECK(cntx_ == nullptr);

  if (ops_.empty()) {
    return;
  }

  // Wait for other updates to finish, lock the control block and update store pointer.
  auto& cb = ChannelStore::control_block;
  cb.update_mu.lock();
  auto* store = cb.most_recent.load(memory_order_relaxed);

  // Deep copy, we will remove channels
  auto* target = new ChannelStore::ChannelMap{*store->channels_};

  for (auto key : ops_) {
    auto it = target->find(key);
    freelist_.push_back(it->second.Get());
    target->erase(it);
    continue;
  }

  // Prepare replacement.
  auto* replacement = new ChannelStore{target, store->patterns_};

  // Update control block and unlock it.
  cb.most_recent.store(replacement, memory_order_relaxed);
  cb.update_mu.unlock();

  // FetchSubscribers is not thead safe so we need to fetch here before we do the hop below.
  // Bonus points because now we compute subscribers only once.
  absl::flat_hash_map<std::string_view, std::vector<ChannelStore::Subscriber>> subs;
  for (auto channel : ops_) {
    auto channel_subs = ServerState::tlocal()->channel_store()->FetchSubscribers(channel);
    DCHECK(!subs.contains(channel));
    subs[channel] = std::move(channel_subs);
  }
  // Update thread local references. Readers fetch subscribers via FetchSubscribers,
  // which runs without preemption, and store references to them in self container Subscriber
  // structs. This means that any point on the other thread is safe to update the channel store.
  // Regardless of whether we need to replace, we dispatch to make sure all
  // queued SubscribeMaps in the freelist are no longer in use.
  shard_set->pool()->AwaitFiberOnAll([&subs](unsigned idx, util::ProactorBase*) {
    ServerState::tlocal()->UnsubscribeSlotsAndUpdateChannelStore(
        subs, ChannelStore::control_block.most_recent.load(memory_order_relaxed));
  });

  // Delete previous map and channel store.
  delete store->channels_;
  delete store;

  for (auto ptr : freelist_)
    delete ptr;
}

}  // namespace dfly


================================================
FILE: src/server/channel_store.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>

#include <string_view>

#include "facade/connection_ref.h"
#include "facade/facade_types.h"
#include "util/fibers/synchronization.h"

namespace dfly {

class ConnectionContext;
class ChannelStoreUpdater;

namespace cluster {
class SlotSet;
}

// ChannelStore manages PUB/SUB subscriptions.
//
// Updates are carried out via RCU (read-copy-update). Each thread stores a pointer to ChannelStore
// in its local ServerState and uses it for reads. Whenever an update needs to be performed,
// a new ChannelStore is constructed with the requested modifications and broadcasted to all
// threads.
//
// ServerState ChannelStore* -> ChannelMap* -> atomic<SubscribeMap*> (cntx -> thread)
//
// Specifically, whenever a new channel is registered or a channel is removed fully,
// a new ChannelMap for the specified type (channel/pattern) needs to be constructed. However, if
// only a single SubscribeMap is modified (no map ChannelMap slots are added or removed),
// we can update only it with a simpler version of RCU, as SubscribeMap is stored as an atomic
// pointer inside ChannelMap.
//
// To prevent parallel (and thus overlapping) updates, a centralized ControlBlock is used.
// Update operations are carried out by the ChannelStoreUpdater.
//
// A centralized ChannelStore, contrary to sharded storage, avoids contention on a single shard
// thread for heavy throughput on a single channel and thus seamlessly scales on multiple threads
// even with a small number of channels. In general, it has a slightly lower latency, due to the
// fact that no hop is required to fetch the subscribers.
class ChannelStore {
  friend class ChannelStoreUpdater;

 public:
  struct Subscriber : public facade::ConnectionRef {
    Subscriber(ConnectionRef ref, const std::string& pattern)
        : facade::ConnectionRef(std::move(ref)), pattern(pattern) {
    }

    // Sort by thread-id. Subscriber without owner comes first.
    static bool ByThread(const Subscriber& lhs, const Subscriber& rhs);
    static bool ByThreadId(const Subscriber& lhs, const unsigned thread);

    std::string pattern;  // non-empty if registered via psubscribe
  };

  ChannelStore();

  // Send messages to channel, block on connection backpressure
  unsigned SendMessages(std::string_view channel, facade::ArgRange messages, bool sharded) const;

  // Fetch all subscribers for channel, including matching patterns.
  std::vector<Subscriber> FetchSubscribers(std::string_view channel) const;

  std::vector<std::string> ListChannels(const std::string_view pattern) const;

  size_t PatternCount() const;

  void UnsubscribeAfterClusterSlotMigration(const cluster::SlotSet& deleted_slots);

  using ChannelsSubMap =
      absl::flat_hash_map<std::string_view, std::vector<ChannelStore::Subscriber>>;
  void UnsubscribeConnectionsFromDeletedSlots(const ChannelsSubMap& sub_map, uint32_t idx);

  // Destroy current instance and delete it.
  static void Destroy();

 private:
  using ThreadId = unsigned;

  // Subscribers for a single channel/pattern.
  using SubscribeMap = absl::flat_hash_map<ConnectionContext*, ThreadId>;

  // Wrapper around atomic pointer that allows copying and moving.
  // Made to overcome restrictions of absl::flat_hash_map.
  // Copy/Move don't need to be atomic with RCU.
  struct UpdatablePointer {
    UpdatablePointer(SubscribeMap* sm) : ptr{sm} {
    }

    UpdatablePointer(const UpdatablePointer& other);

    SubscribeMap* Get() const;
    void Set(SubscribeMap* sm);

    SubscribeMap* operator->() const;
    const SubscribeMap& operator*() const;

   private:
    std::atomic<SubscribeMap*> ptr;
  };

  // SubscriberMaps for channels/patterns.
  struct ChannelMap : absl::flat_hash_map<std::string, UpdatablePointer> {
    void Add(std::string_view key, ConnectionContext* me, uint32_t thread_id);
    void Remove(std::string_view key, ConnectionContext* me);

    // Delete all stored SubscribeMap pointers.
    void DeleteAll();
  };

  // Centralized controller to prevent overlaping updates.
  struct ControlBlock {
    std::atomic<ChannelStore*> most_recent;
    util::fb2::Mutex update_mu;  // locked during updates.
  };

 private:
  static ControlBlock control_block;

  ChannelStore(ChannelMap* channels, ChannelMap* patterns);

  static void Fill(const SubscribeMap& src, const std::string& pattern,
                   std::vector<Subscriber>* out);

  ChannelMap* channels_;
  ChannelMap* patterns_;
};

// Performs RCU (read-copy-update) updates to the channel store.
// See ChannelStore header top for design details.
// Queues operations and performs them with Apply().
class ChannelStoreUpdater {
 public:
  ChannelStoreUpdater(bool pattern, bool to_add, ConnectionContext* cntx, uint32_t thread_id);

  void Record(std::string_view key);
  void Apply();

  // Used for cluster when slots migrate. We need to:
  // 1. Remove the channel from the copy.
  // 2. Unsuscribe all the connections from each channel.
  // 3. Update the control block pointer.
  void ApplyAndUnsubscribe();

 private:
  using ChannelMap = ChannelStore::ChannelMap;

  // Get target map and flag whether it was copied.
  // Must be called with locked control block.
  std::pair<ChannelMap*, bool> GetTargetMap(ChannelStore* store);

  // Apply modify operation to target map.
  void Modify(ChannelMap* target, std::string_view key);

 private:
  bool pattern_;
  bool to_add_;
  ConnectionContext* cntx_;
  uint32_t thread_id_;

  // Pending operations.
  std::vector<std::string_view> ops_;

  // Replaced SubscribeMaps that need to be deleted safely.
  std::vector<ChannelStore::SubscribeMap*> freelist_;
};

}  // namespace dfly


================================================
FILE: src/server/cluster/CMakeLists.txt
================================================
SET(DF_CLUSTER_SRCS
    cluster/cluster_config.cc cluster/cluster_family.cc cluster/incoming_slot_migration.cc
    cluster/outgoing_slot_migration.cc cluster/cluster_defs.cc cluster/cluster_utility.cc
    cluster/coordinator.cc
    PARENT_SCOPE)


================================================
FILE: src/server/cluster/cluster_config.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "cluster_config.h"

#include <absl/container/flat_hash_set.h>
#include <absl/strings/match.h>

#include <optional>
#include <string_view>

#include "base/logging.h"
#include "core/json/json_object.h"

using namespace std;

namespace dfly::cluster {

namespace {

thread_local shared_ptr<ClusterConfig> tl_cluster_config;

bool HasValidNodeIds(const ClusterShardInfos& new_config) {
  absl::flat_hash_set<string_view> nodes;

  auto CheckAndInsertNode = [&](string_view node) {
    auto [_, inserted] = nodes.insert(node);
    return inserted;
  };

  for (const auto& shard : new_config) {
    if (!CheckAndInsertNode(shard.master.id)) {
      LOG(ERROR) << "Master " << shard.master.id << " appears more than once";
      return false;
    }
    for (const auto& replica : shard.replicas) {
      if (!CheckAndInsertNode(replica.id)) {
        LOG(ERROR) << "Replica " << replica.id << " appears more than once";
        return false;
      }
    }
  }

  return true;
}

bool IsConfigValid(const ClusterShardInfos& new_config) {
  // Make sure that all slots are set exactly once.
  vector<bool> slots_found(kMaxSlotNum + 1);

  if (!HasValidNodeIds(new_config)) {
    return false;
  }

  for (const auto& shard : new_config) {
    for (const auto& slot_range : shard.slot_ranges) {
      if (slot_range.start > slot_range.end) {
        LOG(ERROR) << "Invalid cluster config: start=" << slot_range.start
                   << " is larger than end=" << slot_range.end;
        return false;
      }

      for (SlotId slot = slot_range.start; slot <= slot_range.end; ++slot) {
        if (slot >= slots_found.size()) {
          LOG(ERROR) << "Invalid cluster config: slot=" << slot
                     << " is bigger than allowed max=" << slots_found.size();
          return false;
        }

        if (slots_found[slot]) {
          LOG(ERROR) << "Invalid cluster config: slot=" << slot
                     << " was already configured by another slot range.";
          return false;
        }

        slots_found[slot] = true;
      }
    }
  }

  if (!all_of(slots_found.begin(), slots_found.end(), [](bool b) { return b; }) > 0UL) {
    LOG(ERROR) << "Invalid cluster config: some slots were missing.";
    return false;
  }

  return true;
}
}  // namespace

/* static */
shared_ptr<ClusterConfig> ClusterConfig::CreateFromConfig(string_view my_id,
                                                          const ClusterShardInfos& config) {
  if (!IsConfigValid(config)) {
    return nullptr;
  }

  shared_ptr<ClusterConfig> result(new ClusterConfig());

  result->my_id_ = my_id;
  result->config_ = config;

  for (const auto& shard : result->config_) {
    const bool is_master = shard.master.id == my_id;
    const bool owned_by_me =
        is_master || any_of(shard.replicas.begin(), shard.replicas.end(),
                            [&](const ClusterNodeInfo& node) { return node.id == my_id; });
    if (owned_by_me) {
      result->my_slots_.Set(shard.slot_ranges, true);
      if (is_master) {
        result->is_master_ = true;
        result->my_outgoing_migrations_ = shard.migrations;
      }
    } else {
      for (const auto& m : shard.migrations) {
        if (my_id == m.node_info.id) {
          auto incoming_migration = m;
          // for incoming migration we need the source node
          incoming_migration.node_info.id = shard.master.id;
          result->my_incoming_migrations_.push_back(std::move(incoming_migration));
        }
      }
    }
  }

  return result;
}

namespace {
constexpr string_view kInvalidConfigPrefix = "Invalid JSON cluster config: "sv;

template <typename T> optional<T> ReadNumeric(const TmpJson& obj) {
  if (!obj.is_number()) {
    LOG(ERROR) << kInvalidConfigPrefix << "object is not a number " << obj;
    return nullopt;
  }

  try {
    return obj.as<T>();
  } catch (const std::exception& e) {
    LOG(ERROR) << kInvalidConfigPrefix << "number conversion error: " << e.what();
    return nullopt;
  }
}

optional<SlotRanges> GetClusterSlotRanges(const TmpJson& slots) {
  if (!slots.is_array()) {
    LOG(ERROR) << kInvalidConfigPrefix << "slot_ranges is not an array " << slots;
    return nullopt;
  }

  std::vector<SlotRange> ranges;

  for (const auto& range : slots.array_range()) {
    if (!range.is_object()) {
      LOG(ERROR) << kInvalidConfigPrefix << "slot_ranges element is not an object " << range;
      return nullopt;
    }

    optional<SlotId> start = ReadNumeric<SlotId>(range.at_or_null("start"));
    optional<SlotId> end = ReadNumeric<SlotId>(range.at_or_null("end"));
    if (!start.has_value() || !end.has_value()) {
      return nullopt;
    }

    ranges.push_back({.start = start.value(), .end = end.value()});
  }

  return SlotRanges(ranges);
}

optional<ClusterExtendedNodeInfo> ParseClusterNode(const TmpJson& json) {
  if (!json.is_object()) {
    LOG(ERROR) << kInvalidConfigPrefix << "node config is not an object " << json;
    return nullopt;
  }

  ClusterExtendedNodeInfo node;

  {
    auto id = json.at_or_null("id");
    if (!id.is_string()) {
      LOG(ERROR) << kInvalidConfigPrefix << "invalid id for node " << json;
      return nullopt;
    }
    node.id = std::move(id).as_string();
  }

  {
    auto ip = json.at_or_null("ip");
    if (!ip.is_string()) {
      LOG(ERROR) << kInvalidConfigPrefix << "invalid ip for node " << json;
      return nullopt;
    }
    node.ip = std::move(ip).as_string();
  }

  {
    auto port = ReadNumeric<uint16_t>(json.at_or_null("port"));
    if (!port.has_value()) {
      return nullopt;
    }
    node.port = port.value();
  }

  {
    auto health = json.at_or_null("health");
    if (!health.is_null()) {
      if (!health.is_string()) {
        LOG(ERROR) << kInvalidConfigPrefix << "invalid health status for node " << json;
      } else {
        auto health_str = std::move(health).as_string();
        if (absl::EqualsIgnoreCase(health_str, "FAIL")) {
          node.health = NodeHealth::FAIL;
        } else if (absl::EqualsIgnoreCase(health_str, "LOADING")) {
          node.health = NodeHealth::LOADING;
        } else if (absl::EqualsIgnoreCase(health_str, "ONLINE")) {
          node.health = NodeHealth::ONLINE;
        } else if (absl::EqualsIgnoreCase(health_str, "HIDDEN")) {
          node.health = NodeHealth::HIDDEN;
        } else {
          LOG(ERROR) << kInvalidConfigPrefix << "invalid health status for node: " << health_str;
        }
      }
    }
  }

  return node;
}

optional<std::vector<MigrationInfo>> ParseMigrations(const TmpJson& json) {
  std::vector<MigrationInfo> res;
  if (json.is_null()) {
    return res;
  }

  if (!json.is_array()) {
    LOG(INFO) << "no migrations found: " << json;
    return nullopt;
  }

  for (const auto& element : json.array_range()) {
    auto node_id = element.at_or_null("node_id");
    auto ip = element.at_or_null("ip");
    auto port = ReadNumeric<uint16_t>(element.at_or_null("port"));
    auto slots = GetClusterSlotRanges(element.at_or_null("slot_ranges"));

    if (!node_id.is_string() || !ip.is_string() || !port || !slots) {
      LOG(ERROR) << kInvalidConfigPrefix << "invalid migration json " << json;
      return nullopt;
    }

    res.emplace_back(MigrationInfo{
        .slot_ranges = std::move(*slots),
        .node_info =
            ClusterNodeInfo{.id = node_id.as_string(), .ip = ip.as_string(), .port = *port}});
  }
  return res;
}

optional<ClusterShardInfos> BuildClusterConfigFromJson(const TmpJson& json) {
  std::vector<ClusterShardInfo> config;

  if (!json.is_array()) {
    LOG(ERROR) << kInvalidConfigPrefix << "not an array " << json;
    return nullopt;
  }

  for (const auto& element : json.array_range()) {
    ClusterShardInfo shard;

    if (!element.is_object()) {
      LOG(ERROR) << kInvalidConfigPrefix << "shard element is not an object " << element;
      return nullopt;
    }

    auto slots = GetClusterSlotRanges(element.at_or_null("slot_ranges"));
    if (!slots.has_value()) {
      return nullopt;
    }
    shard.slot_ranges = std::move(slots).value();

    auto master = ParseClusterNode(element.at_or_null("master"));
    if (!master.has_value()) {
      return nullopt;
    }
    shard.master = std::move(master).value();

    auto replicas = element.at_or_null("replicas");
    if (!replicas.is_array()) {
      LOG(ERROR) << kInvalidConfigPrefix << "replicas is not an array " << replicas;
      return nullopt;
    }

    for (const auto& replica : replicas.array_range()) {
      auto node = ParseClusterNode(replica);
      if (!node.has_value()) {
        return nullopt;
      }
      shard.replicas.push_back(std::move(node).value());
    }

    auto migrations = ParseMigrations(element.at_or_null("migrations"));
    if (!migrations) {
      return nullopt;
    }
    shard.migrations = std::move(*migrations);

    config.push_back(std::move(shard));
  }

  return ClusterShardInfos(config);
}
}  // namespace

/* static */
shared_ptr<ClusterConfig> ClusterConfig::CreateFromConfig(string_view my_id,
                                                          std::string_view json_str) {
  optional<TmpJson> json_config = JsonFromString(json_str);
  if (!json_config.has_value()) {
    LOG(ERROR) << "Can't parse JSON for ClusterConfig " << json_str;
    return nullptr;
  }

  optional<ClusterShardInfos> config = BuildClusterConfigFromJson(json_config);
  if (!config.has_value()) {
    return nullptr;
  }

  return CreateFromConfig(my_id, config.value());
}

std::shared_ptr<ClusterConfig> ClusterConfig::CloneWithChanges(
    const SlotRanges& enable_slots, const SlotRanges& disable_slots) const {
  auto new_config = std::make_shared<ClusterConfig>(*this);
  new_config->my_slots_.Set(enable_slots, true);
  new_config->my_slots_.Set(disable_slots, false);
  return new_config;
}

std::shared_ptr<ClusterConfig> ClusterConfig::CloneWithoutMigrations() const {
  auto new_config = std::make_shared<ClusterConfig>(*this);
  new_config->my_incoming_migrations_.clear();
  new_config->my_outgoing_migrations_.clear();
  return new_config;
}

bool ClusterConfig::IsMySlot(SlotId id) const {
  if (id > kMaxSlotNum) {
    DCHECK(false) << "Requesting a non-existing slot id " << id;
    return false;
  }

  return my_slots_.Contains(id);
}

bool ClusterConfig::IsMySlot(std::string_view key) const {
  return IsMySlot(KeySlot(key));
}

ClusterNodeInfo ClusterConfig::GetMasterNodeForSlot(SlotId id) const {
  CHECK_LE(id, kMaxSlotNum) << "Requesting a non-existing slot id " << id;
  for (const auto& shard : config_) {
    if (shard.slot_ranges.Contains(id)) {
      if (shard.master.id == my_id_) {
        // The only reason why this function call and shard.master == my_id_ is the slot was
        // migrated
        for (const auto& m : shard.migrations) {
          if (m.slot_ranges.Contains(id)) {
            for (const auto& shard : config_) {
              if (shard.master.id == m.node_info.id) {
                return shard.master;
              }
            }
          }
        }
      }
      return shard.master;
    }
  }

  DCHECK(false) << "Can't find master node for slot " << id;
  return {};
}

ClusterShardInfos ClusterConfig::GetConfig() const {
  return config_;
}

const SlotSet& ClusterConfig::GetOwnedSlots() const {
  return my_slots_;
}

static std::vector<MigrationInfo> GetMissingMigrations(const std::vector<MigrationInfo>& haystack,
                                                       const std::vector<MigrationInfo>& needle) {
  std::vector<MigrationInfo> res;
  for (const auto& h : haystack) {
    if (find(needle.begin(), needle.end(), h) == needle.end()) {
      res.push_back(h);
    }
  }
  return res;
}

std::vector<MigrationInfo> ClusterConfig::GetNewOutgoingMigrations(
    const std::shared_ptr<ClusterConfig>& prev) const {
  return prev ? GetMissingMigrations(my_outgoing_migrations_, prev->my_outgoing_migrations_)
              : my_outgoing_migrations_;
}

std::vector<MigrationInfo> ClusterConfig::GetNewIncomingMigrations(
    const std::shared_ptr<ClusterConfig>& prev) const {
  return prev ? GetMissingMigrations(my_incoming_migrations_, prev->my_incoming_migrations_)
              : my_incoming_migrations_;
}

std::vector<MigrationInfo> ClusterConfig::GetFinishedOutgoingMigrations(
    const std::shared_ptr<ClusterConfig>& prev) const {
  return prev ? GetMissingMigrations(prev->my_outgoing_migrations_, my_outgoing_migrations_)
              : std::vector<MigrationInfo>();
}

std::vector<MigrationInfo> ClusterConfig::GetFinishedIncomingMigrations(
    const std::shared_ptr<ClusterConfig>& prev) const {
  return prev ? GetMissingMigrations(prev->my_incoming_migrations_, my_incoming_migrations_)
              : std::vector<MigrationInfo>();
}

std::shared_ptr<ClusterConfig> ClusterConfig::Current() {
  return tl_cluster_config;
}

void ClusterConfig::SetCurrent(std::shared_ptr<ClusterConfig> config) {
  tl_cluster_config = std::move(config);
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_config.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <string_view>
#include <vector>

#include "src/server/cluster/slot_set.h"

namespace dfly::cluster {

class ClusterConfig {
 public:
  // Returns an instance with `config` if it is valid.
  // Returns heap-allocated object as it is too big for a stack frame.
  static std::shared_ptr<ClusterConfig> CreateFromConfig(std::string_view my_id,
                                                         const ClusterShardInfos& config);

  // Parses `json_config` into `ClusterShardInfos` and calls the above overload.
  static std::shared_ptr<ClusterConfig> CreateFromConfig(std::string_view my_id,
                                                         std::string_view json_config);

  std::shared_ptr<ClusterConfig> CloneWithChanges(const SlotRanges& enable_slots,
                                                  const SlotRanges& disable_slots) const;

  std::shared_ptr<ClusterConfig> CloneWithoutMigrations() const;

  // If key is in my slots ownership return true
  bool IsMySlot(SlotId id) const;
  bool IsMySlot(std::string_view key) const;

  const std::string& MyId() const {
    return my_id_;
  }

  bool is_master() const {
    return is_master_;
  }

  // Returns the master configured for `id`.
  ClusterNodeInfo GetMasterNodeForSlot(SlotId id) const;

  ClusterShardInfos GetConfig() const;

  // Use wisely, only after a deep copy of ClusterConfig and
  // to edit the config in place.
  ClusterShardInfos& GetMutableConfig() {
    return config_;
  }

  const SlotSet& GetOwnedSlots() const;

  std::vector<MigrationInfo> GetNewOutgoingMigrations(
      const std::shared_ptr<ClusterConfig>& prev) const;
  std::vector<MigrationInfo> GetNewIncomingMigrations(
      const std::shared_ptr<ClusterConfig>& prev) const;
  std::vector<MigrationInfo> GetFinishedOutgoingMigrations(
      const std::shared_ptr<ClusterConfig>& prev) const;
  std::vector<MigrationInfo> GetFinishedIncomingMigrations(
      const std::shared_ptr<ClusterConfig>& prev) const;

  std::vector<MigrationInfo> GetIncomingMigrations() const {
    return my_incoming_migrations_;
  }

  // Returns a thread-local pointer.
  static std::shared_ptr<ClusterConfig> Current();

  // Set a thread-local pointer.
  static void SetCurrent(std::shared_ptr<ClusterConfig> config);

 private:
  struct SlotEntry {
    const ClusterShardInfo* shard = nullptr;
    bool owned_by_me = false;
  };

  ClusterConfig() = default;

  bool is_master_ = false;
  std::string my_id_;
  ClusterShardInfos config_;

  SlotSet my_slots_;
  std::vector<MigrationInfo> my_outgoing_migrations_;
  std::vector<MigrationInfo> my_incoming_migrations_;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_config_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/cluster_config.h"

#include <gmock/gmock-matchers.h>

#include <jsoncons/json.hpp>

#include "base/gtest.h"
#include "base/logging.h"
#include "server/test_utils.h"

using namespace std;
using namespace testing;
using Node = dfly::cluster::ClusterNodeInfo;

namespace dfly::cluster {

MATCHER_P(NodeMatches, expected, "") {
  return arg.id == expected.id && arg.ip == expected.ip && arg.port == expected.port;
}

class ClusterConfigTest : public BaseFamilyTest {
 protected:
  const string kMyId = "my-id";
};

inline string_view GetTag(string_view key) {
  return LockTagOptions::instance().Tag(key);
}

TEST_F(ClusterConfigTest, KeyTagTest) {
  SetTestFlag("lock_on_hashtags", "true");

  EXPECT_EQ(GetTag("{user1000}.following"), "user1000");

  EXPECT_EQ(GetTag("foo{{bar}}zap"), "{bar");

  EXPECT_EQ(GetTag("foo{bar}{zap}"), "bar");

  string_view key = " foo{}{bar}";
  EXPECT_EQ(key, GetTag(key));

  key = "{}foo{bar}{zap}";
  EXPECT_EQ(key, GetTag(key));

  SetTestFlag("locktag_delimiter", ":");
  TEST_InvalidateLockTagOptions();

  key = "{user1000}.following";
  EXPECT_EQ(GetTag(key), key);

  EXPECT_EQ(GetTag("bull:queue1:123"), "queue1");
  EXPECT_EQ(GetTag("bull:queue:1:123"), "queue");
  EXPECT_EQ(GetTag("bull:queue:1:123:456:789:1000"), "queue");

  key = "bull::queue:1:123";
  EXPECT_EQ(GetTag(key), key);

  SetTestFlag("locktag_delimiter", ":");
  SetTestFlag("locktag_skip_n_end_delimiters", "0");
  SetTestFlag("locktag_prefix", "bull");
  TEST_InvalidateLockTagOptions();
  EXPECT_EQ(GetTag("bull:queue:123"), "queue");
  EXPECT_EQ(GetTag("bull:queue:123:456:789:1000"), "queue");

  key = "not-bull:queue1:123";
  EXPECT_EQ(GetTag(key), key);

  SetTestFlag("locktag_delimiter", ":");
  SetTestFlag("locktag_skip_n_end_delimiters", "1");
  SetTestFlag("locktag_prefix", "bull");
  TEST_InvalidateLockTagOptions();

  key = "bull:queue1:123";
  EXPECT_EQ(GetTag(key), key);
  EXPECT_EQ(GetTag("bull:queue:1:123"), "queue:1");
  EXPECT_EQ(GetTag("bull:queue:1:123:456:789:1000"), "queue:1");

  key = "bull::queue:1:123";
  EXPECT_EQ(GetTag(key), key);

  SetTestFlag("locktag_delimiter", "|");
  SetTestFlag("locktag_skip_n_end_delimiters", "2");
  SetTestFlag("locktag_prefix", "");
  TEST_InvalidateLockTagOptions();

  EXPECT_EQ(GetTag("|a|b|c|d|e"), "a|b|c");
}

TEST_F(ClusterConfigTest, ConfigSetInvalidEmpty) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, ClusterShardInfos{}), nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMissingSlots) {
  EXPECT_EQ(
      ClusterConfig::CreateFromConfig(
          kMyId,
          {{.slot_ranges = SlotRanges({{.start = 0, .end = 16000}}),
            .master = {{.id = "other", .ip = "192.168.0.100", .port = 7000}, NodeHealth::ONLINE},
            .replicas = {},
            .migrations = {}}}),
      nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidDoubleBookedSlot) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(
                kMyId, ClusterShardInfos(
                           {{.slot_ranges = SlotRanges({{.start = 0, .end = 0x3FFF}}),
                             .master = {{.id = "other", .ip = "192.168.0.100", .port = 7000},
                                        NodeHealth::ONLINE},
                             .replicas = {},
                             .migrations = {}},
                            {.slot_ranges = SlotRanges({{.start = 0, .end = 0}}),
                             .master = {{.id = "other2", .ip = "192.168.0.101", .port = 7001},
                                        NodeHealth::ONLINE},
                             .replicas = {},
                             .migrations = {}}})),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidSlotId) {
  EXPECT_EQ(
      ClusterConfig::CreateFromConfig(
          kMyId,
          {{.slot_ranges = SlotRanges({{.start = 0, .end = 0x3FFF + 1}}),
            .master = {{.id = "other", .ip = "192.168.0.100", .port = 7000}, NodeHealth::ONLINE},
            .replicas = {},
            .migrations = {}}}),
      nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetOk) {
  auto config = ClusterConfig::CreateFromConfig(
      kMyId, {{.slot_ranges = SlotRanges({{.start = 0, .end = 0x3FFF}}),
               .master = {{.id = "other", .ip = "192.168.0.100", .port = 7000}, NodeHealth::ONLINE},
               .replicas = {},
               .migrations = {}}});
  EXPECT_NE(config, nullptr);
  EXPECT_THAT(config->GetMasterNodeForSlot(0),
              NodeMatches(Node{.id = "other", .ip = "192.168.0.100", .port = 7000}));
  EXPECT_TRUE(config->GetOwnedSlots().Empty());
}

TEST_F(ClusterConfigTest, ConfigSetOkWithReplica) {
  auto config = ClusterConfig::CreateFromConfig(
      kMyId,
      {{.slot_ranges = SlotRanges({{.start = 0, .end = 0x3FFF}}),
        .master = {{.id = "other-master", .ip = "192.168.0.100", .port = 7000}, NodeHealth::ONLINE},
        .replicas = {{{.id = "other-replica", .ip = "192.168.0.101", .port = 7001},
                      NodeHealth::ONLINE}},
        .migrations = {}}});
  EXPECT_NE(config, nullptr);
  EXPECT_THAT(config->GetMasterNodeForSlot(0),
              NodeMatches(Node{.id = "other-master", .ip = "192.168.0.100", .port = 7000}));
}

TEST_F(ClusterConfigTest, ConfigSetMultipleInstances) {
  auto config = ClusterConfig::CreateFromConfig(
      kMyId,
      ClusterShardInfos(
          {{.slot_ranges = SlotRanges({{.start = 0, .end = 5'000}}),
            .master = {{.id = "other-master", .ip = "192.168.0.100", .port = 7000},
                       NodeHealth::ONLINE},
            .replicas = {{{.id = "other-replica", .ip = "192.168.0.101", .port = 7001},
                          NodeHealth::ONLINE}},
            .migrations = {}},
           {.slot_ranges = SlotRanges({{.start = 5'001, .end = 10'000}}),
            .master = {{.id = kMyId, .ip = "192.168.0.102", .port = 7002}, NodeHealth::ONLINE},
            .replicas = {{{.id = "other-replica2", .ip = "192.168.0.103", .port = 7003},
                          NodeHealth::ONLINE}},
            .migrations = {}},
           {.slot_ranges = SlotRanges({{.start = 10'001, .end = 0x3FFF}}),
            .master = {{.id = "other-master3", .ip = "192.168.0.104", .port = 7004},
                       NodeHealth::ONLINE},
            .replicas = {{{.id = "other-replica3", .ip = "192.168.0.105", .port = 7005},
                          NodeHealth::ONLINE}},
            .migrations = {}}}));
  EXPECT_NE(config, nullptr);
  SlotSet owned_slots = config->GetOwnedSlots();
  EXPECT_EQ(owned_slots.ToSlotRanges().Size(), 1);
  EXPECT_EQ(owned_slots.Count(), 5'000);

  {
    for (int i = 0; i <= 5'000; ++i) {
      EXPECT_THAT(config->GetMasterNodeForSlot(i),
                  NodeMatches(Node{.id = "other-master", .ip = "192.168.0.100", .port = 7000}));
      EXPECT_FALSE(config->IsMySlot(i));
      EXPECT_FALSE(owned_slots.Contains(i));
    }
  }
  {
    for (int i = 5'001; i <= 10'000; ++i) {
      EXPECT_THAT(config->GetMasterNodeForSlot(i),
                  NodeMatches(Node{.id = kMyId, .ip = "192.168.0.102", .port = 7002}));
      EXPECT_TRUE(config->IsMySlot(i));
      EXPECT_TRUE(owned_slots.Contains(i));
    }
  }
  {
    for (int i = 10'001; i <= 0x3FFF; ++i) {
      EXPECT_THAT(config->GetMasterNodeForSlot(i),
                  NodeMatches(Node{.id = "other-master3", .ip = "192.168.0.104", .port = 7004}));
      EXPECT_FALSE(config->IsMySlot(i));
      EXPECT_FALSE(owned_slots.Contains(i));
    }
  }
}

TEST_F(ClusterConfigTest, ConfigSetInvalidSlotRanges) {
  // Note that slot_ranges is not an object
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": "0,16383",
                    "master": {
                      "id": "abcd1234",
                      "ip": "10.0.0.1",
                      "port": 7000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidSlotRangeStart) {
  // Note that slot_ranges.start is not a number
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": "0",
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcd1234",
                      "ip": "10.0.0.1",
                      "port": 7000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidSlotRangeEnd) {
  // Note that slot_ranges.end is not a number
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": "16383"
                      }
                    ],
                    "master": {
                      "id": "abcd1234",
                      "ip": "10.0.0.1",
                      "port": 7000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMissingMaster) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ]
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMasterNotObject) {
  // Note that master is not an object
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": 123,
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMasterMissingId) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "ip": "10.0.0.0",
                      "port": 8000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMasterMissingIp) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "port": 8000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMasterMissingPort) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0"
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidMissingReplicas) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0",
                      "port": 8000
                    }
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidRepeatingMasterId) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 10000
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0",
                      "port": 8000
                    },
                    "replicas": []
                  },
                  {
                    "slot_ranges": [
                      {
                        "start": 10001,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0",
                      "port": 8000
                    },
                    "replicas": []
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidRepeatingReplicaId) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0",
                      "port": 8000
                    },
                    "replicas": [
                      {
                        "id": "xyz",
                        "ip": "10.0.0.1",
                        "port": 8001
                      },
                      {
                        "id": "xyz",
                        "ip": "10.0.0.2",
                        "port": 8002
                      }
                    ]
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetInvalidRepeatingMasterAndReplicaId) {
  EXPECT_EQ(ClusterConfig::CreateFromConfig(kMyId, R"json(
                [
                  {
                    "slot_ranges": [
                      {
                        "start": 0,
                        "end": 16383
                      }
                    ],
                    "master": {
                      "id": "abcdefg",
                      "ip": "10.0.0.0",
                      "port": 8000
                    },
                    "replicas": [
                      {
                        "id": "abcdefg",
                        "ip": "10.0.0.1",
                        "port": 8001
                      }
                    ]
                  }
                ])json"),
            nullptr);
}

TEST_F(ClusterConfigTest, ConfigSetMigrations) {
  const auto* config_str = R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id1" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json";

  auto config1 = ClusterConfig::CreateFromConfig("id0", config_str);
  EXPECT_EQ(
      config1->GetNewOutgoingMigrations(nullptr),
      (std::vector<MigrationInfo>{{.slot_ranges = SlotRanges({{7000, 8000}}),
                                   .node_info = {.id = "id1", .ip = "127.0.0.1", .port = 9001}}}));

  EXPECT_TRUE(config1->GetFinishedOutgoingMigrations(nullptr).empty());
  EXPECT_TRUE(config1->GetNewIncomingMigrations(nullptr).empty());
  EXPECT_TRUE(config1->GetFinishedIncomingMigrations(nullptr).empty());

  auto config2 = ClusterConfig::CreateFromConfig("id1", config_str);
  EXPECT_EQ(
      config2->GetNewIncomingMigrations(nullptr),
      (std::vector<MigrationInfo>{{.slot_ranges = SlotRanges({{7000, 8000}}),
                                   .node_info = {.id = "id0", .ip = "127.0.0.1", .port = 9001}}}));

  EXPECT_TRUE(config2->GetFinishedOutgoingMigrations(nullptr).empty());
  EXPECT_TRUE(config2->GetNewOutgoingMigrations(nullptr).empty());
  EXPECT_TRUE(config2->GetFinishedIncomingMigrations(nullptr).empty());

  auto config3 = ClusterConfig::CreateFromConfig("id2", config_str);
  EXPECT_TRUE(config3->GetFinishedOutgoingMigrations(nullptr).empty());
  EXPECT_TRUE(config3->GetNewIncomingMigrations(nullptr).empty());
  EXPECT_TRUE(config3->GetFinishedIncomingMigrations(nullptr).empty());
  EXPECT_TRUE(config3->GetNewOutgoingMigrations(nullptr).empty());

  const auto* config_str2 = R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 6999 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": []
    },
    {
      "slot_ranges": [ { "start": 7000, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json";

  auto config4 = ClusterConfig::CreateFromConfig("id0", config_str2);
  auto config5 = ClusterConfig::CreateFromConfig("id1", config_str2);

  EXPECT_EQ(
      config4->GetFinishedOutgoingMigrations(config1),
      (std::vector<MigrationInfo>{{.slot_ranges = SlotRanges({{7000, 8000}}),
                                   .node_info = {.id = "id1", .ip = "127.0.0.1", .port = 9001}}}));
  EXPECT_TRUE(config4->GetNewIncomingMigrations(config1).empty());
  EXPECT_TRUE(config4->GetFinishedIncomingMigrations(config1).empty());
  EXPECT_TRUE(config4->GetNewOutgoingMigrations(config1).empty());

  EXPECT_EQ(
      config5->GetFinishedIncomingMigrations(config2),
      (std::vector<MigrationInfo>{{.slot_ranges = SlotRanges({{7000, 8000}}),
                                   .node_info = {.id = "id0", .ip = "127.0.0.1", .port = 9001}}}));
  EXPECT_TRUE(config5->GetNewIncomingMigrations(config2).empty());
  EXPECT_TRUE(config5->GetFinishedOutgoingMigrations(config2).empty());
  EXPECT_TRUE(config5->GetNewOutgoingMigrations(config2).empty());
}

TEST_F(ClusterConfigTest, InvalidConfigMigrationsWithoutIP) {
  auto config = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "port" : 9001, "node_id": "id1" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json");

  EXPECT_EQ(config, nullptr);
}

TEST_F(ClusterConfigTest, SlotSetAPI) {
  {
    SlotSet ss(false);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges());
    EXPECT_FALSE(ss.All());
    EXPECT_TRUE(ss.Empty());
  }
  {
    SlotSet ss(true);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, SlotRange::kMaxSlotId}}));
    EXPECT_TRUE(ss.All());
    EXPECT_FALSE(ss.Empty());
  }
  {
    SlotSet ss(SlotRanges({{0, 1000}, {1001, 2000}}));
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({SlotRange{0, 2000}}));
    EXPECT_EQ(ss.Count(), 2001);

    for (uint16_t i = 0; i < 2000; ++i) {
      EXPECT_TRUE(ss.Contains(i));
    }
    for (uint16_t i = 2001; i <= SlotRange::kMaxSlotId; ++i) {
      EXPECT_FALSE(ss.Contains(i));
    }

    EXPECT_FALSE(ss.All());
    EXPECT_FALSE(ss.Empty());

    ss.Set(5010, true);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, 2000}, {5010, 5010}}));

    ss.Set(SlotRanges({{5000, 5100}}), true);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, 2000}, {5000, 5100}}));

    ss.Set(5050, false);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, 2000}, {5000, 5049}, {5051, 5100}}));

    ss.Set(5500, false);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, 2000}, {5000, 5049}, {5051, 5100}}));

    ss.Set(SlotRanges({{5090, 5100}}), false);
    EXPECT_EQ(ss.ToSlotRanges(), SlotRanges({{0, 2000}, {5000, 5049}, {5051, 5089}}));

    SlotSet ss1(SlotRanges({{1001, 2000}}));

    EXPECT_EQ(ss.GetRemovedSlots(ss1).ToSlotRanges(),
              SlotRanges({{0, 1000}, {5000, 5049}, {5051, 5089}}));
    EXPECT_EQ(ss1.GetRemovedSlots(ss).ToSlotRanges(), SlotRanges());
  }
}

TEST_F(ClusterConfigTest, ConfigComparison) {
  auto config1 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id1" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json");

  EXPECT_EQ(config1->GetConfig(), config1->GetConfig());

  auto config2 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 16383 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id1" }]
    }
  ])json");
  EXPECT_NE(config1->GetConfig(), config2->GetConfig());
  EXPECT_EQ(config2->GetConfig(), config2->GetConfig());

  auto config3 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9002, "node_id": "id1" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json");
  EXPECT_NE(config1->GetConfig(), config3->GetConfig());
  EXPECT_NE(config2->GetConfig(), config3->GetConfig());
  EXPECT_EQ(config3->GetConfig(), config3->GetConfig());

  auto config4 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id2" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json");

  EXPECT_NE(config1->GetConfig(), config4->GetConfig());
  EXPECT_NE(config2->GetConfig(), config4->GetConfig());
  EXPECT_NE(config3->GetConfig(), config4->GetConfig());
  EXPECT_EQ(config4->GetConfig(), config4->GetConfig());

  auto config5 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 8000 } ],
      "master": { "id": "id2", "ip": "localhost", "port": 3000 },
      "replicas": [],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id1" }]
    },
    {
      "slot_ranges": [ { "start": 8001, "end": 16383 } ],
      "master": { "id": "id1", "ip": "localhost", "port": 3001 },
      "replicas": []
    }
  ])json");
  EXPECT_NE(config1->GetConfig(), config5->GetConfig());
  EXPECT_NE(config2->GetConfig(), config5->GetConfig());
  EXPECT_NE(config3->GetConfig(), config5->GetConfig());
  EXPECT_NE(config4->GetConfig(), config5->GetConfig());
  EXPECT_EQ(config5->GetConfig(), config5->GetConfig());
}

TEST_F(ClusterConfigTest, NodesHealth) {
  auto config1 = ClusterConfig::CreateFromConfig("id0", R"json(
  [
    {
      "slot_ranges": [ { "start": 0, "end": 16383 } ],
      "master": { "id": "id0", "ip": "localhost", "port": 3000, "health" : "online" },
      "replicas": [{ "id": "id1", "ip": "localhost", "port": 3001, "health" : "loading" },
                   { "id": "id2", "ip": "localhost", "port": 3002, "health" : "fail" }],
      "migrations": [{ "slot_ranges": [ { "start": 7000, "end": 8000 } ]
                     , "ip": "127.0.0.1", "port" : 9001, "node_id": "id1" }]
    }

  ])json");

  EXPECT_EQ(config1->GetConfig().begin()->master.health, NodeHealth::ONLINE);
  EXPECT_EQ(config1->GetConfig().begin()->replicas.front().health, NodeHealth::LOADING);
  EXPECT_EQ(config1->GetConfig().begin()->replicas.back().health, NodeHealth::FAIL);
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_defs.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "cluster_defs.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>

#include "base/logging.h"
#include "cluster_config.h"
#include "facade/error.h"
#include "slot_set.h"

using namespace std;

namespace dfly::cluster {
std::string SlotRange::ToString() const {
  return absl::StrCat("[", start, ", ", end, "]");
}

SlotRanges::SlotRanges(std::vector<SlotRange> ranges) : ranges_(std::move(ranges)) {
  std::sort(ranges_.begin(), ranges_.end());
}

void SlotRanges::Merge(const SlotRanges& sr) {
  ranges_.reserve(ranges_.size() + sr.Size());
  for (const auto& r : sr) {
    ranges_.push_back(r);
  }
  std::sort(ranges_.begin(), ranges_.end());
}

std::string SlotRanges::ToString() const {
  return absl::StrJoin(ranges_, ", ", [](std::string* out, SlotRange range) {
    absl::StrAppend(out, range.ToString());
  });
}

std::string MigrationInfo::ToString() const {
  return absl::StrCat(node_info.id, ",", node_info.ip, ":", node_info.port, " (",
                      slot_ranges.ToString(), ")");
}

bool ClusterShardInfo::operator==(const ClusterShardInfo& r) const {
  if (slot_ranges == r.slot_ranges && master == r.master) {
    auto lreplicas = replicas;
    auto lmigrations = migrations;
    auto rreplicas = r.replicas;
    auto rmigrations = r.migrations;
    std::sort(lreplicas.begin(), lreplicas.end());
    std::sort(lmigrations.begin(), lmigrations.end());
    std::sort(rreplicas.begin(), rreplicas.end());
    std::sort(rmigrations.begin(), rmigrations.end());
    return lreplicas == rreplicas && lmigrations == rmigrations;
  }
  return false;
}

ClusterShardInfos::ClusterShardInfos(std::vector<ClusterShardInfo> infos)
    : infos_(std::move(infos)) {
  std::sort(infos_.begin(), infos_.end());
}

facade::ErrorReply SlotOwnershipError(SlotId slot_id) {
  const auto cluster_config = ClusterConfig::Current();
  if (!cluster_config)
    return facade::ErrorReply{facade::kClusterNotConfigured};

  if (!cluster_config->IsMySlot(slot_id)) {
    // See more details here: https://redis.io/docs/reference/cluster-spec/#moved-redirection
    cluster::ClusterNodeInfo master = cluster_config->GetMasterNodeForSlot(slot_id);
    return facade::ErrorReply{absl::StrCat("-MOVED ", slot_id, " ", master.ip, ":", master.port),
                              "MOVED"};
  }
  return facade::ErrorReply{facade::OpStatus::OK};
}

std::string_view ToString(NodeHealth nh) {
  switch (nh) {
    case NodeHealth::FAIL:
      return "fail";
    case NodeHealth::LOADING:
      return "loading";
    case NodeHealth::ONLINE:
      return "online";
    case NodeHealth::HIDDEN:
      DCHECK(false);  // shouldn't be used
      return "hidden";
  }
  DCHECK(false);
  return "undefined_health";
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_defs.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include "facade/cmd_arg_parser.h"
#include "facade/facade_types.h"
#include "server/cluster_support.h"

namespace dfly::cluster {

// A SlotId validated to be within [0, kMaxSlotNum], usable directly with CmdArgParser::Next().
using ParsedSlotId = facade::FInt<SlotId{0}, SlotId{kMaxSlotNum}>;

struct SlotRange {
  static constexpr SlotId kMaxSlotId = 0x3FFF;
  SlotId start = 0;
  SlotId end = 0;

  bool operator==(const SlotRange& r) const noexcept {
    return start == r.start && end == r.end;
  }

  bool operator<(const SlotRange& r) const noexcept {
    return start < r.start || (start == r.start && end < r.end);
  }

  bool IsValid() const noexcept {
    return start <= end && start <= kMaxSlotId && end <= kMaxSlotId;
  }

  bool Contains(SlotId id) const noexcept {
    return id >= start && id <= end;
  }

  std::string ToString() const;
};

class SlotRanges {
 public:
  SlotRanges() = default;
  explicit SlotRanges(std::vector<SlotRange> ranges);

  bool Contains(SlotId id) const noexcept {
    for (const auto& sr : ranges_) {
      if (sr.Contains(id))
        return true;
    }
    return false;
  }

  size_t Size() const noexcept {
    return ranges_.size();
  }

  bool Empty() const noexcept {
    return ranges_.empty();
  }

  void Merge(const SlotRanges& sr);

  bool operator==(const SlotRanges& r) const noexcept {
    return ranges_ == r.ranges_;
  }

  std::string ToString() const;

  auto begin() const noexcept {
    return ranges_.cbegin();
  }

  auto end() const noexcept {
    return ranges_.cend();
  }

 private:
  std::vector<SlotRange> ranges_;
};

struct ClusterNodeInfo {
  std::string id;
  std::string ip;
  uint16_t port = 0;

  bool operator==(const ClusterNodeInfo& r) const noexcept {
    return port == r.port && ip == r.ip && id == r.id;
  }

  bool operator<(const ClusterNodeInfo& r) const noexcept {
    return id < r.id;
  }
};

enum class NodeHealth : std::uint8_t { FAIL, LOADING, ONLINE, HIDDEN };
std::string_view ToString(NodeHealth nh);

struct ClusterExtendedNodeInfo : ClusterNodeInfo {
  NodeHealth health = NodeHealth::ONLINE;
  bool operator==(const ClusterExtendedNodeInfo& r) const noexcept {
    return health == r.health && ClusterNodeInfo::operator==(r);
  }
};

struct MigrationInfo {
  SlotRanges slot_ranges;
  ClusterNodeInfo node_info;

  bool operator==(const MigrationInfo& r) const noexcept {
    return node_info == r.node_info && slot_ranges == r.slot_ranges;
  }

  bool operator<(const MigrationInfo& r) const noexcept {
    return node_info < r.node_info;
  }

  std::string ToString() const;
};

struct ClusterShardInfo {
  SlotRanges slot_ranges;
  ClusterExtendedNodeInfo master;
  std::vector<ClusterExtendedNodeInfo> replicas;
  std::vector<MigrationInfo> migrations;

  bool operator==(const ClusterShardInfo& r) const;

  bool operator<(const ClusterShardInfo& r) const noexcept {
    return master < r.master;
  }
};

class ClusterShardInfos {
 public:
  ClusterShardInfos() = default;
  ClusterShardInfos(std::vector<ClusterShardInfo> infos);
  ClusterShardInfos(ClusterShardInfo info) : infos_({info}) {
  }

  auto begin() const noexcept {
    return infos_.cbegin();
  }

  auto end() const noexcept {
    return infos_.cend();
  }

  auto begin() noexcept {
    return infos_.begin();
  }

  auto end() noexcept {
    return infos_.end();
  }

  auto size() const noexcept {
    return infos_.size();
  }

  bool empty() const noexcept {
    return infos_.empty();
  }

  bool operator==(const ClusterShardInfos& r) const noexcept {
    return infos_ == r.infos_;
  }

  bool operator!=(const ClusterShardInfos& r) const noexcept {
    return infos_ != r.infos_;
  }

  auto Unwrap() const {
    return infos_;
  }

 private:
  std::vector<ClusterShardInfo> infos_;
};

// MigrationState constants are ordered in state changing order
enum class MigrationState : uint8_t { C_CONNECTING, C_SYNC, C_ERROR, C_FINISHED, C_FATAL };

// Errors during slot migration
static constexpr std::string_view kUnknownMigration = "UNKNOWN_MIGRATION";
static constexpr std::string_view kIncomingMigrationOOM = "INCOMING_MIGRATION_OOM";

// return error message if slot doesn't belong to this node
facade::ErrorReply SlotOwnershipError(SlotId slot_id);

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_family.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/cluster_family.h"

#include <absl/cleanup/cleanup.h>
#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>

#include <memory>
#include <mutex>
#include <string>

#include "base/flags.h"
#include "base/logging.h"
#include "facade/cmd_arg_parser.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/channel_store.h"
#include "server/cluster/coordinator.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/dflycmd.h"
#include "server/error.h"
#include "server/journal/journal.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/server_family.h"
#include "server/server_state.h"
#include "util/fibers/synchronization.h"

ABSL_FLAG(std::string, cluster_announce_ip, "",
          "IP address that Dragonfly announces to cluster clients");

ABSL_FLAG(std::string, cluster_node_id, "",
          "ID within a cluster, used for slot assignment. MUST be unique. If empty, uses master "
          "replication ID (random string)");

ABSL_DECLARE_FLAG(int32_t, port);
ABSL_DECLARE_FLAG(uint16_t, announce_port);
ABSL_DECLARE_FLAG(bool, managed_service_info);

namespace dfly {
namespace acl {
constexpr uint32_t kCluster = SLOW;
// Reconsider to maybe more sensible defaults
constexpr uint32_t kDflyCluster = ADMIN | SLOW;
constexpr uint32_t kReadOnly = FAST | CONNECTION;
constexpr uint32_t kReadWrite = FAST | CONNECTION;
constexpr uint32_t kDflyMigrate = ADMIN | SLOW | DANGEROUS;
}  // namespace acl
}  // namespace dfly

namespace dfly::cluster {
namespace {

using namespace std;
using namespace facade;
using namespace util;
using Payload = journal::Entry::Payload;
using CI = CommandId;

constexpr char kIdNotFound[] = "syncid not found";

constexpr string_view kClusterDisabled =
    "Cluster is disabled. Enabled via passing --cluster_mode=emulated|yes";

}  // namespace

ClusterFamily::ClusterFamily(ServerFamily* server_family) : server_family_(server_family) {
  CHECK_NOTNULL(server_family_);

  InitializeCluster();

  id_ = absl::GetFlag(FLAGS_cluster_node_id);
  if (id_.empty()) {
    id_ = server_family_->master_replid();
  } else if (IsClusterEmulated()) {
    LOG(ERROR) << "Setting --cluster_node_id in emulated mode is unsupported";
    exit(1);
  }
}

void ClusterFamily::Shutdown() {
  Coordinator::Current().Shutdown();
  shard_set->pool()->at(0)->Await([this]() ABSL_LOCKS_EXCLUDED(set_config_mu) {
    PreparedToRemoveOutgoingMigrations outgoing_migrations;  // should be removed without mutex lock
    {
      util::fb2::LockGuard lk(set_config_mu);
      if (!ClusterConfig::Current())
        return;

      auto empty_config = ClusterConfig::Current()->CloneWithoutMigrations();
      outgoing_migrations = TakeOutOutgoingMigrations(empty_config, ClusterConfig::Current());
      RemoveIncomingMigrations(
          empty_config->GetFinishedIncomingMigrations(ClusterConfig::Current()));

      util::fb2::LockGuard migration_lk(migration_mu_);
      DCHECK(outgoing_migration_jobs_.empty());
      DCHECK(incoming_migrations_jobs_.empty());
    }
  });
}

std::optional<ClusterShardInfos> ClusterFamily::GetShardInfos(ConnectionContext* cntx) const {
  if (IsClusterEmulated()) {
    return {GetEmulatedShardInfo(cntx)};
  }

  if (ClusterConfig::Current() != nullptr) {
    return ClusterConfig::Current()->GetConfig();
  }
  return nullopt;
}

ClusterShardInfo ClusterFamily::GetEmulatedShardInfo(ConnectionContext* cntx) const {
  ClusterShardInfo info{.slot_ranges = SlotRanges({{.start = 0, .end = kMaxSlotNum}}),
                        .master = {},
                        .replicas = {},
                        .migrations = {}};

  optional<Metrics::ReplicaInfo> repl_info = server_family_->GetReplicaSummary();
  ServerState& etl = *ServerState::tlocal();
  if (!repl_info) {
    DCHECK(etl.is_master);
    std::string cluster_announce_ip = absl::GetFlag(FLAGS_cluster_announce_ip);
    std::string preferred_endpoint =
        cluster_announce_ip.empty() ? cntx->conn()->LocalBindAddress() : cluster_announce_ip;
    uint16_t cluster_announce_port = absl::GetFlag(FLAGS_announce_port);
    uint16_t preferred_port = cluster_announce_port == 0
                                  ? static_cast<uint16_t>(absl::GetFlag(FLAGS_port))
                                  : cluster_announce_port;

    info.master = {{.id = id_, .ip = preferred_endpoint, .port = preferred_port},
                   NodeHealth::ONLINE};

    if (cntx->conn()->IsPrivileged() || !absl::GetFlag(FLAGS_managed_service_info)) {
      for (const auto& replica : server_family_->GetDflyCmd()->GetReplicasRoleInfo()) {
        info.replicas.push_back({{.id = replica.id,
                                  .ip = replica.address,
                                  .port = static_cast<uint16_t>(replica.listening_port)},
                                 NodeHealth::ONLINE});
      }
    }
  } else {
    // TODO: We currently don't save the master's ID in the replica
    info.master = {{.id = "", .ip = repl_info->summary.host, .port = repl_info->summary.port},
                   NodeHealth::ONLINE};
    info.replicas.push_back({{.id = id_,
                              .ip = cntx->conn()->LocalBindAddress(),
                              .port = static_cast<uint16_t>(absl::GetFlag(FLAGS_port))},
                             NodeHealth::ONLINE});
  }

  return info;
}

void ClusterFamily::ClusterHelp(SinkReplyBuilder* builder) {
  string_view help_arr[] = {
      "CLUSTER <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
      "SLOTS",
      "   Return information about slots range mappings. Each range is made of:",
      "   start, end, master and replicas IP addresses, ports and ids.",
      "NODES",
      "   Return cluster configuration seen by node. Output format:",
      "   <id> <ip:port> <flags> <master> <pings> <pongs> <epoch> <link> <slot> ...",
      "INFO",
      "  Return information about the cluster",
      "HELP",
      "    Prints this help.",
  };
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  return rb->SendSimpleStrArr(help_arr);
}

namespace {
void ClusterShardsImpl(const ClusterShardInfos& config, SinkReplyBuilder* builder) {
  // For more details https://redis.io/commands/cluster-shards/
  constexpr unsigned int kEntrySize = 4;
  auto* rb = static_cast<RedisReplyBuilder*>(builder);

  auto WriteNode = [&](const ClusterExtendedNodeInfo& node, string_view role) {
    constexpr unsigned int kNodeSize = 14;
    rb->StartArray(kNodeSize);
    rb->SendBulkString("id");
    rb->SendBulkString(node.id);
    rb->SendBulkString("endpoint");
    rb->SendBulkString(node.ip);
    rb->SendBulkString("ip");
    rb->SendBulkString(node.ip);
    rb->SendBulkString("port");
    rb->SendLong(node.port);
    rb->SendBulkString("role");
    rb->SendBulkString(role);
    rb->SendBulkString("replication-offset");
    rb->SendLong(0);
    rb->SendBulkString("health");
    rb->SendBulkString(ToString(node.health));
  };

  rb->StartArray(config.size());
  for (const auto& shard : config) {
    rb->StartArray(kEntrySize);
    rb->SendBulkString("slots");

    rb->StartArray(shard.slot_ranges.Size() * 2);
    for (const auto& slot_range : shard.slot_ranges) {
      rb->SendLong(slot_range.start);
      rb->SendLong(slot_range.end);
    }

    rb->SendBulkString("nodes");
    rb->StartArray(1 + shard.replicas.size());
    WriteNode(shard.master, "master");
    for (const auto& replica : shard.replicas) {
      WriteNode(replica, "replica");
    }
  }
}
}  // namespace

void ClusterFamily::ClusterShards(SinkReplyBuilder* builder, ConnectionContext* cntx) {
  auto config = GetShardInfos(cntx);
  if (config) {
    // we need to remove hiden replicas
    auto shards_info = config->Unwrap();
    for (auto& shard : shards_info) {
      auto new_end = std::remove_if(shard.replicas.begin(), shard.replicas.end(),
                                    [](const auto& r) { return r.health == NodeHealth::HIDDEN; });
      shard.replicas.erase(new_end, shard.replicas.end());
    }
    return ClusterShardsImpl({shards_info}, builder);
  }
  return builder->SendError(kClusterNotConfigured);
}

namespace {
void ClusterSlotsImpl(ClusterShardInfos config, SinkReplyBuilder* builder) {
  // For more details https://redis.io/commands/cluster-slots/
  auto* rb = static_cast<RedisReplyBuilder*>(builder);

  auto WriteNode = [&](const ClusterNodeInfo& node) {
    constexpr unsigned int kNodeSize = 3;
    rb->StartArray(kNodeSize);
    rb->SendBulkString(node.ip);
    rb->SendLong(node.port);
    rb->SendBulkString(node.id);
  };

  unsigned int slot_ranges = 0;

  // we need to remove hiden and fail replicas
  auto shards_info = config.Unwrap();
  for (auto& shard : shards_info) {
    slot_ranges += shard.slot_ranges.Size();
    auto new_end = std::remove_if(shard.replicas.begin(), shard.replicas.end(), [](const auto& r) {
      return r.health == NodeHealth::HIDDEN || r.health == NodeHealth::FAIL ||
             r.health == NodeHealth::LOADING;
    });
    shard.replicas.erase(new_end, shard.replicas.end());
  }

  config = {shards_info};

  rb->StartArray(slot_ranges);
  for (const auto& shard : config) {
    for (const auto& slot_range : shard.slot_ranges) {
      const unsigned int array_size =
          /* slot-start, slot-end */ 2 + /* master */ 1 + /* replicas */ shard.replicas.size();
      rb->StartArray(array_size);
      rb->SendLong(slot_range.start);
      rb->SendLong(slot_range.end);
      WriteNode(shard.master);
      for (const auto& replica : shard.replicas) {
        WriteNode(replica);
      }
    }
  }
}
}  // namespace

void ClusterFamily::ClusterSlots(SinkReplyBuilder* builder, ConnectionContext* cntx) {
  auto shard_infos = GetShardInfos(cntx);
  if (shard_infos) {
    return ClusterSlotsImpl(*shard_infos, builder);
  }
  return builder->SendError(kClusterNotConfigured);
}

namespace {
void ClusterNodesImpl(const ClusterShardInfos& config, string_view my_id,
                      SinkReplyBuilder* builder) {
  // For more details https://redis.io/commands/cluster-nodes/

  string result;

  auto WriteNode = [&](const ClusterExtendedNodeInfo& node, string_view role, string_view master_id,
                       const SlotRanges& ranges) {
    absl::StrAppend(&result, node.id, " ");

    absl::StrAppend(&result, node.ip, ":", node.port, "@", node.port, " ");

    if (my_id == node.id) {
      absl::StrAppend(&result, "myself,");
    }
    absl::StrAppend(&result, role, " ");

    absl::StrAppend(&result, master_id, " ");

    absl::StrAppend(&result,
                    node.health != NodeHealth::FAIL ? "0 0 0 connected" : "0 0 0 disconnected");

    for (const auto& range : ranges) {
      absl::StrAppend(&result, " ", range.start);
      if (range.start != range.end) {
        absl::StrAppend(&result, "-", range.end);
      }
    }

    // Separate lines with only \n, not \r\n, see #2726
    absl::StrAppend(&result, "\n");
  };

  for (const auto& shard : config) {
    WriteNode(shard.master, "master", "-", shard.slot_ranges);
    for (const auto& replica : shard.replicas) {
      // Only the master prints ranges, so we send an empty set for replicas.
      if (replica.health != NodeHealth::HIDDEN) {
        WriteNode(replica, "slave", shard.master.id, {});
      }
    }
  }

  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  return rb->SendBulkString(result);
}
}  // namespace

void ClusterFamily::ClusterNodes(SinkReplyBuilder* builder, ConnectionContext* cntx) {
  auto shard_infos = GetShardInfos(cntx);
  if (shard_infos) {
    return ClusterNodesImpl(*shard_infos, id_, builder);
  }
  return builder->SendError(kClusterNotConfigured);
}

namespace {
void ClusterInfoImpl(const ClusterShardInfos& config, SinkReplyBuilder* builder) {
  std::string msg;
  auto append = [&msg](absl::AlphaNum a1, absl::AlphaNum a2) {
    // Separate lines with \r\n, not \n, see #2726
    absl::StrAppend(&msg, a1, ":", a2, "\r\n");
  };

  // Initialize response variables to emulated mode.
  string_view state = "ok"sv;
  SlotId slots_assigned = kMaxSlotNum + 1;
  size_t known_nodes = 1;
  long epoch = 1;
  size_t cluster_size = 1;

  if (config.empty()) {
    state = "fail"sv;
    slots_assigned = 0;
    cluster_size = 0;
    known_nodes = 0;
  } else {
    known_nodes = 0;
    cluster_size = 0;
    for (const auto& shard_config : config) {
      known_nodes += 1;  // For master
      known_nodes += shard_config.replicas.size();

      if (!shard_config.slot_ranges.Empty()) {
        ++cluster_size;
      }
    }
  }

  append("cluster_state", state);
  append("cluster_slots_assigned", slots_assigned);
  append("cluster_slots_ok", slots_assigned);  // We do not support other failed nodes.
  append("cluster_slots_pfail", 0);
  append("cluster_slots_fail", 0);
  append("cluster_known_nodes", known_nodes);
  append("cluster_size", cluster_size);
  append("cluster_current_epoch", epoch);
  append("cluster_my_epoch", 1);
  append("cluster_stats_messages_ping_sent", 1);
  append("cluster_stats_messages_pong_sent", 1);
  append("cluster_stats_messages_sent", 1);
  append("cluster_stats_messages_ping_received", 1);
  append("cluster_stats_messages_pong_received", 1);
  append("cluster_stats_messages_meet_received", 0);
  append("cluster_stats_messages_received", 1);
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  rb->SendBulkString(msg);
}
}  // namespace

void ClusterFamily::ClusterInfo(SinkReplyBuilder* builder, ConnectionContext* cntx) {
  auto shard_infos = GetShardInfos(cntx);
  return ClusterInfoImpl(shard_infos.value_or(ClusterShardInfos{}), builder);
}

void ClusterFamily::KeySlot(CmdArgList args, SinkReplyBuilder* builder) {
  if (args.size() != 2) {
    return builder->SendError(WrongNumArgsError("CLUSTER KEYSLOT"));
  }

  SlotId id = dfly::KeySlot(ArgS(args, 1));
  return builder->SendLong(id);
}

void ClusterFamily::Cluster(CmdArgList args, CommandContext* cmd_cntx) {
  // In emulated cluster mode, all slots are mapped to the same host, and number of cluster
  // instances is thus 1.
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  auto* builder = cmd_cntx->rb();
  if (!IsClusterEnabledOrEmulated()) {
    return builder->SendError(kClusterDisabled);
  }

  if (sub_cmd == "KEYSLOT") {
    return KeySlot(args, builder);
  }

  if (args.size() > 1) {
    return builder->SendError(WrongNumArgsError(absl::StrCat("CLUSTER ", sub_cmd)));
  }

  auto* cntx = cmd_cntx->server_conn_cntx();
  if (sub_cmd == "HELP") {
    return ClusterHelp(builder);
  } else if (sub_cmd == "MYID") {
    return ClusterMyId(builder);
  } else if (sub_cmd == "SHARDS") {
    return ClusterShards(builder, cntx);
  } else if (sub_cmd == "SLOTS") {
    return ClusterSlots(builder, cntx);
  } else if (sub_cmd == "NODES") {
    return ClusterNodes(builder, cntx);
  } else if (sub_cmd == "INFO") {
    return ClusterInfo(builder, cntx);
  } else {
    return builder->SendError(facade::UnknownSubCmd(sub_cmd, "CLUSTER"), facade::kSyntaxErrType);
  }
}

void ClusterFamily::ReadOnly(CmdArgList args, CommandContext* cmd_cntx) {
  cmd_cntx->rb()->SendOk();
}

void ClusterFamily::ReadWrite(CmdArgList args, CommandContext* cmd_cntx) {
  if (!IsClusterEmulated()) {
    return cmd_cntx->SendError(kClusterDisabled);
  }
  cmd_cntx->rb()->SendOk();
}

void ClusterFamily::DflyCluster(CmdArgList args, CommandContext* cmd_cntx) {
  auto* builder = cmd_cntx->rb();
  auto* cntx = cmd_cntx->server_conn_cntx();
  if (!(IsClusterEnabled() || (IsClusterEmulated() && cntx->journal_emulated))) {
    return builder->SendError("Cluster is disabled. Use --cluster_mode=yes to enable.");
  }

  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));
  args.remove_prefix(1);  // remove subcommand name
  if (sub_cmd == "GETSLOTINFO") {
    return DflyClusterGetSlotInfo(args, cmd_cntx);
  } else if (sub_cmd == "CONFIG") {
    return DflyClusterConfig(args, cmd_cntx);
  } else if (sub_cmd == "FLUSHSLOTS") {
    return DflyClusterFlushSlots(args, cmd_cntx);
  } else if (sub_cmd == "SLOT-MIGRATION-STATUS") {
    return DflySlotMigrationStatus(args, cmd_cntx);
  }

  return builder->SendError(UnknownSubCmd(sub_cmd, "DFLYCLUSTER"), kSyntaxErrType);
}

void ClusterFamily::ClusterMyId(SinkReplyBuilder* builder) {
  builder->SendSimpleString(id_);
}

namespace {

void DeleteSlots(const SlotRanges& slots_ranges) {
  if (slots_ranges.Empty()) {
    return;
  }

  auto cb = [&](auto*) {
    EngineShard* shard = EngineShard::tlocal();
    if (shard == nullptr)
      return;

    namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).FlushSlots(slots_ranges);
  };
  shard_set->pool()->AwaitFiberOnAll(std::move(cb));

  auto* channel_store = ServerState::tlocal()->channel_store();
  auto deleted = SlotSet(slots_ranges);
  channel_store->UnsubscribeAfterClusterSlotMigration(deleted);
}

void WriteFlushSlotsToJournal(const SlotRanges& slot_ranges) {
  if (slot_ranges.Empty()) {
    return;
  }

  // Build args
  vector<string> args;
  args.reserve(slot_ranges.Size() + 1);
  args.push_back("FLUSHSLOTS");
  for (SlotRange range : slot_ranges) {
    args.push_back(absl::StrCat(range.start));
    args.push_back(absl::StrCat(range.end));
  }

  // Build view
  vector<string_view> args_view(args.size());
  for (size_t i = 0; i < args.size(); ++i) {
    args_view[i] = args[i];
  }

  auto cb = [&](auto*) {
    EngineShard* shard = EngineShard::tlocal();
    if (shard == nullptr) {
      return;
    }

    if (!shard->journal()) {
      return;
    }

    // Send journal entry
    // TODO: Break slot migration upon FLUSHSLOTS
    journal::RecordEntry(/* txid= */ 0, journal::Op::COMMAND, /* dbid= */ 0, nullopt,
                         Payload("DFLYCLUSTER", args_view));
  };
  shard_set->pool()->AwaitFiberOnAll(std::move(cb));
}
}  // namespace

void ClusterFamily::DflyClusterConfig(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() != 1) {
    return cmd_cntx->SendError(WrongNumArgsError("DFLYCLUSTER CONFIG"));
  }

  string_view json_str = ArgS(args, 0);
  shared_ptr<ClusterConfig> new_config = ClusterConfig::CreateFromConfig(id_, json_str);
  if (new_config == nullptr) {
    LOG(WARNING) << "Can't set cluster config";
    return cmd_cntx->SendError("Invalid cluster configuration.");
  } else if (ClusterConfig::Current() &&
             ClusterConfig::Current()->GetConfig() == new_config->GetConfig()) {
    return cmd_cntx->SendOk();
  }

  PreparedToRemoveOutgoingMigrations outgoing_migrations;  // should be removed without mutex lock

  {
    VLOG(1) << "Setting new cluster config: " << json_str;
    util::fb2::LockGuard gu(set_config_mu);

    outgoing_migrations = TakeOutOutgoingMigrations(new_config, ClusterConfig::Current());
    RemoveIncomingMigrations(new_config->GetFinishedIncomingMigrations(ClusterConfig::Current()));

    SlotRanges enable_slots, disable_slots;

    {
      util::fb2::LockGuard lk(migration_mu_);
      // If migration state is changed simultaneously, the changes to config will be applied after
      // set_config_mu is unlocked and even if we apply the same changes 2 times it's not a problem
      for (const auto& m : incoming_migrations_jobs_) {
        if (m->GetState() == MigrationState::C_FINISHED) {
          enable_slots.Merge(m->GetSlots());
        }
      }
      for (const auto& m : outgoing_migration_jobs_) {
        if (m->GetState() == MigrationState::C_FINISHED) {
          disable_slots.Merge(m->GetSlots());
        }
      }
    }

    new_config = new_config->CloneWithChanges(enable_slots, disable_slots);

    StartNewSlotMigrations(*new_config);

    SlotSet before =
        ClusterConfig::Current() ? ClusterConfig::Current()->GetOwnedSlots() : SlotSet(true);

    auto* conn = cmd_cntx->conn();
    // Ignore blocked commands because we filter them with CancelBlockingOnThread
    DispatchTracker tracker{server_family_->GetNonPriviligedListeners(), conn,
                            true /* ignore paused */, true /* ignore blocked */};

    auto blocking_filter = [&new_config](ArgSlice keys) {
      bool moved =
          any_of(keys.begin(), keys.end(), [&](auto k) { return !new_config->IsMySlot(k); });
      return moved ? OpStatus::KEY_MOVED : OpStatus::OK;
    };

    auto cb = [this, &tracker, &new_config, blocking_filter](util::ProactorBase*) {
      server_family_->CancelBlockingOnThread(blocking_filter);
      ClusterConfig::SetCurrent(new_config);
      tracker.TrackOnThread();
    };

    server_family_->service().proactor_pool().AwaitFiberOnAll(std::move(cb));
    DCHECK(ClusterConfig::Current() != nullptr);

    if (!tracker.Wait(absl::Seconds(1))) {
      LOG(WARNING) << "Cluster config change timed for: " << MyID();
    }

    SlotSet after = ClusterConfig::Current()->GetOwnedSlots();
    if (ServerState::tlocal()->is_master) {
      auto deleted_slots = (before.GetRemovedSlots(after)).ToSlotRanges();
      deleted_slots.Merge(outgoing_migrations.slot_ranges);
      DeleteSlots(deleted_slots);
      LOG_IF(INFO, !deleted_slots.Empty())
          << "Flushing newly unowned slots: " << deleted_slots.ToString();
      WriteFlushSlotsToJournal(deleted_slots);
    }
  }

  return cmd_cntx->SendOk();
}

void ClusterFamily::DflyClusterGetSlotInfo(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  parser.ExpectTag("SLOTS");
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  vector<std::pair<SlotId, SlotStats>> slots_stats;
  while (parser.HasNext()) {
    auto arg = parser.Next<std::string_view>();
    // Check if argument contains a dash for range notation (e.g., "1-100")
    size_t dash_pos = arg.find('-');
    if (dash_pos != std::string_view::npos && dash_pos > 0) {
      // Parse as range: start-end
      std::string_view start_str = arg.substr(0, dash_pos);
      std::string_view end_str = arg.substr(dash_pos + 1);

      uint32_t start_slot, end_slot;
      if (!absl::SimpleAtoi(start_str, &start_slot) || !absl::SimpleAtoi(end_str, &end_slot)) {
        return cmd_cntx->SendError("Invalid slot range format");
      }

      if (start_slot > kMaxSlotNum || end_slot > kMaxSlotNum) {
        return cmd_cntx->SendError("Invalid slot id");
      }

      // Swap if range is specified in reverse order (e.g., "100-0")
      if (start_slot > end_slot) {
        std::swap(start_slot, end_slot);
      }

      for (uint32_t sid = start_slot; sid <= end_slot; ++sid) {
        slots_stats.emplace_back(sid, SlotStats{});
      }
    } else {
      // Parse as single slot id
      uint32_t sid;
      if (!absl::SimpleAtoi(arg, &sid)) {
        return cmd_cntx->SendError(kInvalidIntErr);
      }
      if (sid > kMaxSlotNum) {
        return cmd_cntx->SendError("Invalid slot id");
      }
      slots_stats.emplace_back(sid, SlotStats{});
    }
  }

  if (slots_stats.empty()) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  fb2::Mutex mu;

  auto cb = [&](auto*) ABSL_LOCKS_EXCLUDED(mu) {
    EngineShard* shard = EngineShard::tlocal();
    if (shard == nullptr)
      return;

    util::fb2::LockGuard lk(mu);
    for (auto& [slot, data] : slots_stats) {
      data += namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).GetSlotStats(slot);
    }
  };

  shard_set->pool()->AwaitFiberOnAll(std::move(cb));

  rb->StartArray(slots_stats.size());

  for (const auto& slot_data : slots_stats) {
    rb->StartArray(9);
    rb->SendLong(slot_data.first);
    rb->SendBulkString("key_count");
    rb->SendLong(slot_data.second.key_count);
    rb->SendBulkString("total_reads");
    rb->SendLong(slot_data.second.total_reads);
    rb->SendBulkString("total_writes");
    rb->SendLong(slot_data.second.total_writes);

    // Account for both the values and the table space of the entries.
    // Each entry is comprised from CompactObj for key and CompactObj for value.
    // Sometimes the values are very small and table space becomes significant.
    rb->SendBulkString("memory_bytes");
    rb->SendLong(slot_data.second.memory_bytes +
                 slot_data.second.key_count * sizeof(CompactObj) * 2);
  }
}

void ClusterFamily::DflyClusterFlushSlots(CmdArgList args, CommandContext* cmd_cntx) {
  LOG(INFO) << "Got DFLYCLUSTER FLUSHSLOTS " << args;

  std::vector<SlotRange> slot_ranges;

  CmdArgParser parser(args);
  do {
    auto [slot_start, slot_end] = parser.Next<ParsedSlotId, ParsedSlotId>();
    RETURN_ON_PARSE_ERROR(parser, cmd_cntx);
    if (slot_start > slot_end) {
      return cmd_cntx->SendError("Invalid slot range");
    }
    slot_ranges.emplace_back(SlotRange{slot_start, slot_end});
  } while (parser.HasNext());

  DeleteSlots(SlotRanges(std::move(slot_ranges)));

  return cmd_cntx->SendOk();
}

void ClusterFamily::StartNewSlotMigrations(const ClusterConfig& new_config) {
  // TODO Add validating and error processing
  auto out_migrations = new_config.GetNewOutgoingMigrations(ClusterConfig::Current());
  auto in_migrations = new_config.GetNewIncomingMigrations(ClusterConfig::Current());

  util::fb2::LockGuard lk(migration_mu_);

  for (auto& m : out_migrations) {
    auto migration = make_shared<OutgoingMigration>(std::move(m), this, server_family_);
    outgoing_migration_jobs_.emplace_back(migration);
    migration->Start();
  }

  for (auto& m : in_migrations) {
    auto migration = make_shared<IncomingSlotMigration>(m.node_info.id, &server_family_->service(),
                                                        m.slot_ranges);
    incoming_migrations_jobs_.emplace_back(migration);
  }
}

static string_view StateToStr(MigrationState state) {
  switch (state) {
    case MigrationState::C_CONNECTING:
      return "CONNECTING"sv;
    case MigrationState::C_SYNC:
      return "SYNC"sv;
    case MigrationState::C_ERROR:
      return "ERROR"sv;
    case MigrationState::C_FINISHED:
      return "FINISHED"sv;
    case MigrationState::C_FATAL:
      return "FATAL"sv;
  }
  DCHECK(false) << "Unknown State value " << static_cast<underlying_type_t<MigrationState>>(state);
  return "UNDEFINED_STATE"sv;
}

void ClusterFamily::DflySlotMigrationStatus(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);

  util::fb2::LockGuard lk(migration_mu_);

  string_view node_id;
  if (parser.HasNext()) {
    node_id = parser.Next<std::string_view>();
    RETURN_ON_PARSE_ERROR(parser, cmd_cntx);
  }

  struct Reply {
    string_view direction;
    string node_id;
    string_view state;
    size_t keys_number;
    string error;
  };
  vector<Reply> reply;
  reply.reserve(incoming_migrations_jobs_.size() + outgoing_migration_jobs_.size());

  auto append_answer = [&reply](string_view direction, string node_id, string_view filter,
                                MigrationState state, size_t keys_number, string error) {
    if (filter.empty() || filter == node_id) {
      error = error.empty() ? "0" : error;
      reply.emplace_back(
          Reply{direction, std::move(node_id), StateToStr(state), keys_number, std::move(error)});
    }
  };

  for (const auto& m : incoming_migrations_jobs_) {
    append_answer("in", m->GetSourceID(), node_id, m->GetState(), m->GetKeyCount(),
                  m->GetErrorStr());
  }
  for (const auto& m : outgoing_migration_jobs_) {
    append_answer("out", m->GetMigrationInfo().node_info.id, node_id, m->GetState(),
                  m->GetKeyCount(), m->GetErrorStr());
  }

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->StartArray(reply.size());
  for (const auto& r : reply) {
    rb->StartArray(5);
    rb->SendBulkString(r.direction);
    rb->SendBulkString(r.node_id);
    rb->SendBulkString(r.state);
    rb->SendLong(r.keys_number);
    rb->SendBulkString(r.error);
  }
}

void ClusterFamily::DflyMigrate(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  args.remove_prefix(1);

  if (sub_cmd == "INIT") {
    InitMigration(args, cmd_cntx);
  } else if (sub_cmd == "FLOW") {
    DflyMigrateFlow(args, cmd_cntx);
  } else if (sub_cmd == "ACK") {
    DflyMigrateAck(args, cmd_cntx);
  } else {
    cmd_cntx->SendError(facade::UnknownSubCmd(sub_cmd, "DFLYMIGRATE"), facade::kSyntaxErrType);
  }
}

std::shared_ptr<IncomingSlotMigration> ClusterFamily::GetIncomingMigration(
    std::string_view source_id) {
  util::fb2::LockGuard lk(migration_mu_);
  for (const auto& mj : incoming_migrations_jobs_) {
    if (mj->GetSourceID() == source_id) {
      return mj;
    }
  }
  return nullptr;
}

ClusterFamily::PreparedToRemoveOutgoingMigrations::~PreparedToRemoveOutgoingMigrations() = default;

[[nodiscard]] ClusterFamily::PreparedToRemoveOutgoingMigrations
ClusterFamily::TakeOutOutgoingMigrations(shared_ptr<ClusterConfig> new_config,
                                         shared_ptr<ClusterConfig> old_config) {
  auto migrations = new_config->GetFinishedOutgoingMigrations(old_config);
  util::fb2::LockGuard lk(migration_mu_);
  SlotRanges removed_slots;
  PreparedToRemoveOutgoingMigrations res;
  for (const auto& m : migrations) {
    auto it = std::find_if(outgoing_migration_jobs_.begin(), outgoing_migration_jobs_.end(),
                           [&m](const auto& om) {
                             // we can have only one migration per target-source pair
                             return m.node_info.id == om->GetMigrationInfo().node_info.id;
                           });
    DCHECK(it != outgoing_migration_jobs_.end());
    DCHECK(it->get() != nullptr);
    OutgoingMigration& migration = *it->get();
    const auto& slots = migration.GetSlots();
    removed_slots.Merge(slots);
    LOG(INFO) << "Outgoing migration cancelled: slots " << slots.ToString() << " to "
              << migration.GetHostIp() << ":" << migration.GetPort();
    migration.Finish();
    res.migrations.push_back(std::move(*it));
    outgoing_migration_jobs_.erase(it);
  }

  // Flush non-owned migrations
  SlotSet migration_slots(removed_slots);
  res.slot_ranges = migration_slots.GetRemovedSlots(new_config->GetOwnedSlots()).ToSlotRanges();

  // Flushing of removed slots is done outside this function.
  return res;
}

namespace {

// returns removed incoming migration
bool RemoveIncomingMigrationImpl(std::vector<std::shared_ptr<IncomingSlotMigration>>& jobs,
                                 string_view source_id) {
  auto it = std::find_if(jobs.begin(), jobs.end(), [source_id](const auto& im) {
    // we can have only one migration per target-source pair
    return source_id == im->GetSourceID();
  });
  if (it == jobs.end()) {
    return false;
  }
  DCHECK(it->get() != nullptr);
  std::shared_ptr<IncomingSlotMigration> migration = *it;

  // Flush non-owned migrations
  SlotSet migration_slots(migration->GetSlots());
  SlotSet removed = migration_slots.GetRemovedSlots(ClusterConfig::Current()->GetOwnedSlots());

  migration->Stop();
  // all migration fibers has migration shared_ptr so the object can be removed later
  jobs.erase(it);

  // TODO make it outside in one run with other slots that should be flushed
  if (!removed.Empty()) {
    auto removed_ranges = removed.ToSlotRanges();
    LOG_IF(WARNING, migration->GetState() == MigrationState::C_FINISHED)
        << "Flushing slots of removed FINISHED migration " << migration->GetSourceID()
        << ", slots: " << removed_ranges.ToString();
    DeleteSlots(removed_ranges);
  }

  return true;
}
}  // namespace

void ClusterFamily::RemoveIncomingMigrations(const std::vector<MigrationInfo>& migrations) {
  util::fb2::LockGuard lk(migration_mu_);
  for (const auto& m : migrations) {
    RemoveIncomingMigrationImpl(incoming_migrations_jobs_, m.node_info.id);
    VLOG(1) << "Migration was canceled from: " << m.node_info.id;
  }
}

void ClusterFamily::InitMigration(CmdArgList args, CommandContext* cmd_cntx) {
  VLOG(1) << "Create incoming migration, args: " << args;
  CmdArgParser parser{args};

  auto [source_id, flows_num] = parser.Next<string_view, uint32_t>();

  std::vector<SlotRange> slots;
  do {
    auto [slot_start, slot_end] = parser.Next<SlotId, SlotId>();
    slots.emplace_back(SlotRange{slot_start, slot_end});
  } while (parser.HasNext());

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  SlotRanges slot_ranges(std::move(slots));

  std::shared_ptr<IncomingSlotMigration> migration;
  {
    util::fb2::LockGuard lk(migration_mu_);

    auto it = find_if(incoming_migrations_jobs_.begin(), incoming_migrations_jobs_.end(),
                      [source_id = source_id, &slot_ranges](const auto& migration) {
                        return migration->GetSourceID() == source_id &&
                               migration->GetSlots() == slot_ranges;
                      });

    if (it != incoming_migrations_jobs_.end()) {
      migration = *it;
    }
  }

  if (!migration) {
    VLOG(1) << "Unrecognized incoming migration from " << source_id;
    return cmd_cntx->SendSimpleString(kUnknownMigration);
  }

  if (migration->GetState() != MigrationState::C_CONNECTING) {
    migration->Stop();
    auto slots = migration->GetSlots();
    LOG(INFO) << "Flushing slots during migration reinitialization " << migration->GetSourceID()
              << ", slots: " << slots.ToString();
    DeleteSlots(slots);
  }

  if (migration->GetState() == MigrationState::C_FATAL) {
    return cmd_cntx->SendError(absl::StrCat("-", kIncomingMigrationOOM));
  }

  migration->Init(flows_num);

  return cmd_cntx->SendOk();
}

void ClusterFamily::DflyMigrateFlow(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto [source_id, shard_id] = parser.Next<std::string_view, uint32_t>();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  VLOG(1) << "Create flow " << source_id << " shard_id: " << shard_id;

  cmd_cntx->conn()->SetName(absl::StrCat("migration_flow_", source_id));

  auto migration = GetIncomingMigration(source_id);

  if (!migration) {
    return cmd_cntx->SendError(kIdNotFound);
  }

  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  DCHECK(conn_cntx->sync_dispatch);
  // we do this to be ignored by the dispatch tracker
  // TODO provide a more clear approach
  conn_cntx->sync_dispatch = false;

  cmd_cntx->SendOk();

  // Try migrating the connection if we have the same shard configuration
  if (migration->ShardNum() == shard_set->size() &&
      int32_t(shard_id) != fb2::ProactorBase::me()->GetPoolIndex()) {
    DCHECK_LT(shard_id, shard_set->size());
    if (bool success = conn_cntx->conn()->Migrate(shard_set->pool()->at(shard_id)); !success) {
      cmd_cntx->SendError("invalid state");
      return;
    }
  }

  migration->StartFlow(shard_id, conn_cntx->conn()->socket());
}

void ClusterFamily::ApplyMigrationSlotRangeToConfig(std::string_view node_id,
                                                    const SlotRanges& slots, bool is_incoming) {
  VLOG(1) << "Update config for slots ranges: " << slots.ToString() << " for " << MyID() << " : "
          << node_id;
  util::fb2::LockGuard gu(set_config_mu);
  util::fb2::LockGuard lk(migration_mu_);

  bool is_migration_valid = false;
  if (is_incoming) {
    for (const auto& mj : incoming_migrations_jobs_) {
      if (mj->GetSourceID() == node_id && slots == mj->GetSlots()) {
        is_migration_valid = true;
        break;
      }
    }
  } else {
    for (const auto& mj : outgoing_migration_jobs_) {
      if (mj->GetMigrationInfo().node_info.id == node_id &&
          mj->GetMigrationInfo().slot_ranges == slots) {
        is_migration_valid = true;
        break;
      }
    }
  }
  if (!is_migration_valid) {
    LOG(WARNING) << "Config wasn't updated for slots ranges: " << slots.ToString() << " for "
                 << MyID() << " : " << node_id;
    return;
  }

  auto new_config = is_incoming ? ClusterConfig::Current()->CloneWithChanges(slots, {})
                                : ClusterConfig::Current()->CloneWithChanges({}, slots);

  auto blocking_filter = [&new_config](ArgSlice keys) {
    bool moved = any_of(keys.begin(), keys.end(), [&](auto k) { return !new_config->IsMySlot(k); });
    return moved ? OpStatus::KEY_MOVED : OpStatus::OK;
  };
  // we don't need to use DispatchTracker here because for IncomingMingration we don't have
  // connectionas that should be tracked and for Outgoing migration we do it under Pause
  server_family_->service().proactor_pool().AwaitFiberOnAll(
      [this, &new_config, &blocking_filter](util::ProactorBase*) {
        server_family_->CancelBlockingOnThread(blocking_filter);
        ClusterConfig::SetCurrent(new_config);
      });
  DCHECK(ClusterConfig::Current() != nullptr);
  VLOG(1) << "Config is updated for slots ranges: " << slots.ToString() << " for " << MyID()
          << " : " << node_id;
}

void ClusterFamily::DflyMigrateAck(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto [source_id, attempt] = parser.Next<std::string_view, long>();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  VLOG(1) << "DFLYMIGRATE ACK" << args;
  auto in_migrations = ClusterConfig::Current()->GetIncomingMigrations();
  auto m_it =
      std::find_if(in_migrations.begin(), in_migrations.end(),
                   [source_id = source_id](const auto& m) { return m.node_info.id == source_id; });
  if (m_it == in_migrations.end()) {
    LOG(WARNING) << "migration isn't in config";
    return cmd_cntx->SendSimpleString(kUnknownMigration);
  }

  auto migration = GetIncomingMigration(source_id);
  if (!migration)
    return cmd_cntx->SendError(kIdNotFound);

  if (!migration->Join(attempt)) {
    if (migration->GetState() == MigrationState::C_FATAL) {
      return cmd_cntx->SendError(absl::StrCat("-", kIncomingMigrationOOM));
    } else {
      return cmd_cntx->SendError("Join timeout happened");
    }
  }

  ApplyMigrationSlotRangeToConfig(migration->GetSourceID(), migration->GetSlots(), true);

  return cmd_cntx->rb()->SendLong(attempt);
}

void ClusterFamily::PauseAllIncomingMigrations(bool pause) {
  util::fb2::LockGuard lk(migration_mu_);
  LOG_IF(ERROR, incoming_migrations_jobs_.empty()) << "No incoming migrations!";
  for (auto& im : incoming_migrations_jobs_) {
    im->Pause(pause);
  }
}

size_t ClusterFamily::MigrationsErrorsCount() const {
  util::fb2::LockGuard lk(migration_mu_);

  size_t error_num = 0;

  for (const auto& mj : incoming_migrations_jobs_) {
    error_num += mj->GetErrorsCount();
  }

  for (const auto& mj : outgoing_migration_jobs_) {
    error_num += mj->GetErrorsCount();
  }

  return error_num;
}

void ClusterFamily::ReconcileMasterSlots(std::string_view repl_id) {
  util::fb2::LockGuard gu(set_config_mu);
  util::fb2::LockGuard lk(migration_mu_);

  auto config = ClusterConfig::Current();

  // Sanity -- we should not reach there
  if (!config) {
    LOG(ERROR) << "Cluster config after takeover is empty";
    return;
  }

  for (auto& info : config->GetMutableConfig()) {
    // we are updating the old config
    if (info.master.id == id_) {
      if (!info.replicas.empty()) {
        auto target = std::find_if(info.replicas.begin(), info.replicas.end(),
                                   [repl_id](const auto& e) { return e.id == repl_id; });

        if (target == info.replicas.end()) {
          auto topology =
              absl::StrCat("[",
                           absl::StrJoin(info.replicas, ",",
                                         [](std::string* out, const auto& r) { *out = r.id; }),
                           "]");
          LOG(ERROR) << "info.master.id=" << id_ << ". Missing repl_id=" << repl_id
                     << " from cluster topology " << topology
                     << ". Slot redirection after takeover corrupted.";

          return;
        }

        info.master = *target;
        info.replicas.clear();
      }
      return;
    }
  }
}

void ClusterFamily::ReconcileReplicaSlots() {
  util::fb2::LockGuard gu(set_config_mu);
  util::fb2::LockGuard lk(migration_mu_);

  auto config = ClusterConfig::Current();

  // Sanity -- we should not reach there
  if (!config) {
    LOG(ERROR) << "Cluster config after takeover is empty";
    return;
  }

  auto new_config = ClusterConfig::Current()->CloneWithChanges({}, {});
  // Replace master with replica in shard config.
  bool found = false;
  for (ClusterShardInfo& info : new_config->GetMutableConfig()) {
    for (const auto& replica : info.replicas) {
      if (replica.id == id_) {
        info.master = replica;
        // New master has no replicas
        info.replicas.clear();
        found = true;
        break;
      }
    }
    if (found)
      break;
  }

  LOG_IF(ERROR, !found) << "Did not find replica in the cluster map";

  server_family_->service().proactor_pool().AwaitFiberOnAll(
      [&new_config](util::ProactorBase*) { ClusterConfig::SetCurrent(new_config); });
}

using EngineFunc = void (ClusterFamily::*)(CmdArgList args, CommandContext* cmd_cntx);

inline CommandId::Handler HandlerFunc(ClusterFamily* se, EngineFunc f) {
  return [=](CmdArgList args, CommandContext* cmd_cntx) { return (se->*f)(args, cmd_cntx); };
}

#define HFUNC(x) SetHandler(HandlerFunc(this, &ClusterFamily::x))

void ClusterFamily::Register(CommandRegistry* registry) {
  registry->StartFamily();
  *registry << CI{"CLUSTER", CO::READONLY | CO::LOADING, -2, 0, 0, acl::kCluster}.HFUNC(Cluster)
            << CI{"DFLYCLUSTER",    CO::ADMIN | CO::GLOBAL_TRANS | CO::HIDDEN, -2, 0, 0,
                  acl::kDflyCluster}
                   .HFUNC(DflyCluster)
            << CI{"READONLY", CO::READONLY, 1, 0, 0, acl::kReadOnly}.HFUNC(ReadOnly)
            << CI{"READWRITE", CO::READONLY, 1, 0, 0, acl::kReadWrite}.HFUNC(ReadWrite)
            << CI{"DFLYMIGRATE", CO::ADMIN | CO::HIDDEN, -1, 0, 0, acl::kDflyMigrate}.HFUNC(
                   DflyMigrate);
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_family.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>

#include "facade/conn_context.h"
#include "facade/facade_types.h"
#include "server/cluster/cluster_config.h"
#include "server/cluster/incoming_slot_migration.h"
#include "server/cluster/outgoing_slot_migration.h"

namespace facade {
class SinkReplyBuilder;
}  // namespace facade

namespace dfly {
class ServerFamily;
class CommandRegistry;
class ConnectionContext;
class CommandContext;
}  // namespace dfly

namespace dfly::cluster {

class ClusterFamily {
 public:
  explicit ClusterFamily(ServerFamily* server_family);

  void Register(CommandRegistry* registry);

  void Shutdown() ABSL_LOCKS_EXCLUDED(set_config_mu);

  void ApplyMigrationSlotRangeToConfig(std::string_view node_id, const SlotRanges& slots,
                                       bool is_outgoing);

  const std::string& MyID() const {
    return id_;
  }

  // Only for debug purpose. Pause/Resume all incoming migrations
  void PauseAllIncomingMigrations(bool pause) ABSL_LOCKS_EXCLUDED(migration_mu_);

  size_t MigrationsErrorsCount() const ABSL_LOCKS_EXCLUDED(migration_mu_);

  // Helper functions to be used during takeover from both nodes (master and replica).
  // It reconciles the cluster configuration for both nodes to reflect the node
  // role changes after the takeover.
  // For the taking over node it's called at the end of the ReplTakeOver flow
  // and for the taken over node it's called at the end of the dflycmd::TakeOver
  void ReconcileMasterSlots(std::string_view repl_id)
      ABSL_LOCKS_EXCLUDED(set_config_mu, migration_mu_);

  void ReconcileReplicaSlots() ABSL_LOCKS_EXCLUDED(set_config_mu, migration_mu_);

 private:
  using SinkReplyBuilder = facade::SinkReplyBuilder;

  // Cluster commands compatible with Redis
  void Cluster(CmdArgList args, CommandContext* cmd_cntx);
  void ClusterHelp(SinkReplyBuilder* builder);
  void ClusterShards(SinkReplyBuilder* builder, ConnectionContext* cntx);
  void ClusterSlots(SinkReplyBuilder* builder, ConnectionContext* cntx);
  void ClusterNodes(SinkReplyBuilder* builder, ConnectionContext* cntx);
  void ClusterInfo(SinkReplyBuilder* builder, ConnectionContext* cntx);
  void ClusterMyId(SinkReplyBuilder* builder);

  void KeySlot(CmdArgList args, SinkReplyBuilder* builder);

  void ReadOnly(CmdArgList args, CommandContext* cmd_cntx);
  void ReadWrite(CmdArgList args, CommandContext* cmd_cntx);

  // Custom Dragonfly commands for cluster management
  void DflyCluster(CmdArgList args, CommandContext* cmd_cntx);
  void DflyClusterConfig(CmdArgList args, CommandContext* cmd_cntx);

  void DflyClusterGetSlotInfo(CmdArgList args, CommandContext* cmd_cntx)
      ABSL_LOCKS_EXCLUDED(migration_mu_);
  void DflyClusterFlushSlots(CmdArgList args, CommandContext* cmd_cntx);
  void DflySlotMigrationStatus(CmdArgList args, CommandContext* cmd_cntx)
      ABSL_LOCKS_EXCLUDED(migration_mu_);

  // DFLYMIGRATE is internal command defines several steps in slots migrations process
  void DflyMigrate(CmdArgList args, CommandContext* cmd_cntx);

  // DFLYMIGRATE INIT is internal command to create incoming migration object
  void InitMigration(CmdArgList args, CommandContext* cmd_cntx) ABSL_LOCKS_EXCLUDED(migration_mu_);

  // DFLYMIGRATE FLOW initiate second step in slots migration procedure
  // this request should be done for every shard on the target node
  // this method assocciate connection and shard that will be the data
  // source for migration
  void DflyMigrateFlow(CmdArgList args, CommandContext* cmd_cntx);

  void DflyMigrateAck(CmdArgList args, CommandContext* cmd_cntx);

  std::shared_ptr<IncomingSlotMigration> GetIncomingMigration(std::string_view source_id)
      ABSL_LOCKS_EXCLUDED(migration_mu_);

  void StartNewSlotMigrations(const ClusterConfig& new_config);

  // must be destroyed excluded set_config_mu and migration_mu_ locks
  struct PreparedToRemoveOutgoingMigrations {
    std::vector<std::shared_ptr<OutgoingMigration>> migrations;
    SlotRanges slot_ranges;
    ~PreparedToRemoveOutgoingMigrations() ABSL_LOCKS_EXCLUDED(migration_mu_, set_config_mu);
  };

  [[nodiscard]] PreparedToRemoveOutgoingMigrations TakeOutOutgoingMigrations(
      std::shared_ptr<ClusterConfig> new_config, std::shared_ptr<ClusterConfig> old_config)
      ABSL_LOCKS_EXCLUDED(migration_mu_);
  void RemoveIncomingMigrations(const std::vector<MigrationInfo>& migrations)
      ABSL_LOCKS_EXCLUDED(migration_mu_);

  mutable util::fb2::Mutex migration_mu_;  // guard migrations operations
  // holds all incoming slots migrations that are currently in progress.
  std::vector<std::shared_ptr<IncomingSlotMigration>> incoming_migrations_jobs_
      ABSL_GUARDED_BY(migration_mu_);

  // holds all outgoing slots migrations that are currently in progress
  std::vector<std::shared_ptr<OutgoingMigration>> outgoing_migration_jobs_
      ABSL_GUARDED_BY(migration_mu_);

  std::optional<ClusterShardInfos> GetShardInfos(ConnectionContext* cntx) const;

  ClusterShardInfo GetEmulatedShardInfo(ConnectionContext* cntx) const;

  // Guards set configuration, so that we won't handle 2 in parallel.
  mutable util::fb2::Mutex set_config_mu;

  std::string id_;

  ServerFamily* server_family_ = nullptr;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_family_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/flags/reflection.h>
#include <gmock/gmock-matchers.h>
#include <gtest/gtest-matchers.h>

#include <string>
#include <string_view>

#include "absl/strings/str_replace.h"
#include "absl/strings/substitute.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

namespace dfly::cluster {
namespace {

using namespace std;
using namespace testing;

class ClusterFamilyTest : public BaseFamilyTest {
 public:
  ClusterFamilyTest() {
    SetTestFlag("cluster_mode", "yes");
  }

 protected:
  static constexpr string_view kInvalidConfiguration = "Invalid cluster configuration";

  string GetMyId() {
    return Run({"cluster", "myid"}).GetString();
  }

  void ConfigSingleNodeCluster(string id) {
    string config_template = R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": 16383
            }
          ],
          "master": {
            "id": "$0",
            "ip": "10.0.0.1",
            "port": 7000,
            "health": "online"
          },
          "replicas": []
        }
      ])json";
    string config = absl::Substitute(config_template, id);
    EXPECT_EQ(RunPrivileged({"dflycluster", "config", config}), "OK");
  }
};

TEST_F(ClusterFamilyTest, ClusterConfigInvalidJSON) {
  EXPECT_THAT(RunPrivileged({"dflycluster", "config", "invalid JSON"}),
              ErrArg("Invalid cluster configuration."));

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:fail"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:0"));

  EXPECT_THAT(Run({"cluster", "shards"}), ErrArg("Cluster is not yet configured"));
  EXPECT_THAT(Run({"cluster", "slots"}), ErrArg("Cluster is not yet configured"));
  EXPECT_THAT(Run({"cluster", "nodes"}), ErrArg("Cluster is not yet configured"));
}

TEST_F(ClusterFamilyTest, ClusterConfigInvalidConfig) {
  EXPECT_THAT(RunPrivileged({"dflycluster", "config", "[]"}), ErrArg(kInvalidConfiguration));

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:fail"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:0"));
}

TEST_F(ClusterFamilyTest, ClusterConfigInvalidMissingSlots) {
  EXPECT_THAT(RunPrivileged({"dflycluster", "config", R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": 100
            }
          ],
          "master": {
            "id": "abcd1234",
            "ip": "10.0.0.1",
            "port": 7000
          },
          "replicas": []
        }
      ])json"}),
              ErrArg(kInvalidConfiguration));

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:fail"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:0"));
}

TEST_F(ClusterFamilyTest, ClusterConfigInvalidOverlappingSlots) {
  EXPECT_THAT(RunPrivileged({"dflycluster", "config", R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": 1000
            }
          ],
          "master": {
            "id": "abcd1234",
            "ip": "10.0.0.1",
            "port": 7000
          },
          "replicas": []
        },
        {
          "slot_ranges": [
            {
              "start": 800,
              "end": 16383
            }
          ],
          "master": {
            "id": "abcd1234",
            "ip": "10.0.0.1",
            "port": 7000
          },
          "replicas": []
        }
      ])json"}),
              ErrArg(kInvalidConfiguration));

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:fail"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:0"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:0"));
}

TEST_F(ClusterFamilyTest, ClusterConfigNoReplicas) {
  ConfigSingleNodeCluster("abcd1234");
  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:ok"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:1"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:1"));

  EXPECT_THAT(Run({"cluster", "shards"}),
              RespArray(ElementsAre("slots",                                            //
                                    RespArray(ElementsAre(IntArg(0), IntArg(16'383))),  //
                                    "nodes",                                            //
                                    RespArray(ElementsAre(                              //
                                        RespArray(ElementsAre(                          //
                                            "id", "abcd1234",                           //
                                            "endpoint", "10.0.0.1",                     //
                                            "ip", "10.0.0.1",                           //
                                            "port", IntArg(7000),                       //
                                            "role", "master",                           //
                                            "replication-offset", IntArg(0),            //
                                            "health", "online")))))));

  EXPECT_THAT(Run({"get", "x"}).GetString(),
              testing::MatchesRegex(R"(MOVED [0-9]+ 10.0.0.1:7000)"));

  EXPECT_THAT(Run({"cluster", "slots"}),
              RespArray(ElementsAre(IntArg(0),              //
                                    IntArg(16'383),         //
                                    RespArray(ElementsAre(  //
                                        "10.0.0.1",         //
                                        IntArg(7'000),      //
                                        "abcd1234")))));

  EXPECT_EQ(Run({"cluster", "nodes"}),
            "abcd1234 10.0.0.1:7000@7000 master - 0 0 0 connected 0-16383\n");
}

TEST_F(ClusterFamilyTest, ClusterConfigFull) {
  EXPECT_EQ(RunPrivileged({"dflycluster", "config", R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": 16383
            }
          ],
          "master": {
            "id": "abcd1234",
            "ip": "10.0.0.1",
            "port": 7000,
            "health": "online"
          },
          "replicas": [
            {
              "id": "wxyz",
              "ip": "10.0.0.10",
              "port": 8000,
              "health": "online"
            }
          ]
        }
      ])json"}),
            "OK");

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:ok"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:2"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:1"));

  EXPECT_THAT(Run({"cluster", "shards"}),
              RespArray(ElementsAre("slots",                                            //
                                    RespArray(ElementsAre(IntArg(0), IntArg(16'383))),  //
                                    "nodes",                                            //
                                    RespArray(ElementsAre(                              //
                                        RespArray(ElementsAre(                          //
                                            "id", "abcd1234",                           //
                                            "endpoint", "10.0.0.1",                     //
                                            "ip", "10.0.0.1",                           //
                                            "port", IntArg(7000),                       //
                                            "role", "master",                           //
                                            "replication-offset", IntArg(0),            //
                                            "health", "online")),                       //
                                        RespArray(ElementsAre(                          //
                                            "id", "wxyz",                               //
                                            "endpoint", "10.0.0.10",                    //
                                            "ip", "10.0.0.10",                          //
                                            "port", IntArg(8000),                       //
                                            "role", "replica",                          //
                                            "replication-offset", IntArg(0),            //
                                            "health", "online")))))));

  EXPECT_THAT(Run({"cluster", "slots"}),
              RespArray(ElementsAre(IntArg(0),              //
                                    IntArg(16'383),         //
                                    RespArray(ElementsAre(  //
                                        "10.0.0.1",         //
                                        IntArg(7'000),      //
                                        "abcd1234")),       //
                                    RespArray(ElementsAre(  //
                                        "10.0.0.10",        //
                                        IntArg(8'000),      //
                                        "wxyz")))));

  EXPECT_EQ(Run({"cluster", "nodes"}),
            "abcd1234 10.0.0.1:7000@7000 master - 0 0 0 connected 0-16383\n"
            "wxyz 10.0.0.10:8000@8000 slave abcd1234 0 0 0 connected\n");
}

TEST_F(ClusterFamilyTest, ClusterConfigFullMultipleInstances) {
  EXPECT_EQ(RunPrivileged({"dflycluster", "config", R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": 10000
            }
          ],
          "master": {
            "id": "abcd1234",
            "ip": "10.0.0.1",
            "port": 7000,
            "health": "fail"
          },
          "replicas": [
            {
              "id": "wxyz",
              "ip": "10.0.0.10",
              "port": 8000,
              "health": "online"
            }
          ]
        },
        {
          "slot_ranges": [
            {
              "start": 10001,
              "end": 16383
            }
          ],
          "master": {
            "id": "efgh7890",
            "ip": "10.0.0.2",
            "port": 7001,
            "health": "online"
          },
          "replicas": [
            {
              "id": "qwerty",
              "ip": "10.0.0.11",
              "port": 8001,
              "health": "online"
            },
             {
              "id": "qwerty1",
              "ip": "10.0.0.12",
              "port": 8002,
              "health": "loading"
            },
             {
              "id": "qwerty2",
              "ip": "10.0.0.13",
              "port": 8003,
              "health": "fail"
            },
             {
              "id": "qwerty3",
              "ip": "10.0.0.14",
              "port": 8004,
              "health": "hidden"
            }
          ]
        }
      ])json"}),
            "OK");

  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:ok"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:7"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:2"));

  EXPECT_THAT(Run({"cluster", "shards"}),
              RespArray(ElementsAre(
                  RespArray(ElementsAre("slots",                                                 //
                                        RespArray(ElementsAre(IntArg(0), IntArg(10'000))),       //
                                        "nodes",                                                 //
                                        RespArray(ElementsAre(                                   //
                                            RespArray(ElementsAre(                               //
                                                "id", "abcd1234",                                //
                                                "endpoint", "10.0.0.1",                          //
                                                "ip", "10.0.0.1",                                //
                                                "port", IntArg(7000),                            //
                                                "role", "master",                                //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "fail")),                              //
                                            RespArray(ElementsAre(                               //
                                                "id", "wxyz",                                    //
                                                "endpoint", "10.0.0.10",                         //
                                                "ip", "10.0.0.10",                               //
                                                "port", IntArg(8000),                            //
                                                "role", "replica",                               //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "online")))))),                        //
                  RespArray(ElementsAre("slots",                                                 //
                                        RespArray(ElementsAre(IntArg(10'001), IntArg(16'383))),  //
                                        "nodes",                                                 //
                                        RespArray(ElementsAre(                                   //
                                            RespArray(ElementsAre(                               //
                                                "id", "efgh7890",                                //
                                                "endpoint", "10.0.0.2",                          //
                                                "ip", "10.0.0.2",                                //
                                                "port", IntArg(7001),                            //
                                                "role", "master",                                //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "online")),                            //
                                            RespArray(ElementsAre(                               //
                                                "id", "qwerty",                                  //
                                                "endpoint", "10.0.0.11",                         //
                                                "ip", "10.0.0.11",                               //
                                                "port", IntArg(8001),                            //
                                                "role", "replica",                               //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "online")),                            //
                                            RespArray(ElementsAre(                               //
                                                "id", "qwerty1",                                 //
                                                "endpoint", "10.0.0.12",                         //
                                                "ip", "10.0.0.12",                               //
                                                "port", IntArg(8002),                            //
                                                "role", "replica",                               //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "loading")),                           //
                                            RespArray(ElementsAre(                               //
                                                "id", "qwerty2",                                 //
                                                "endpoint", "10.0.0.13",                         //
                                                "ip", "10.0.0.13",                               //
                                                "port", IntArg(8003),                            //
                                                "role", "replica",                               //
                                                "replication-offset", IntArg(0),                 //
                                                "health", "fail")))))))));

  EXPECT_THAT(Run({"cluster", "slots"}),
              RespArray(ElementsAre(                            //
                  RespArray(ElementsAre(IntArg(0),              //
                                        IntArg(10'000),         //
                                        RespArray(ElementsAre(  //
                                            "10.0.0.1",         //
                                            IntArg(7'000),      //
                                            "abcd1234")),       //
                                        RespArray(ElementsAre(  //
                                            "10.0.0.10",        //
                                            IntArg(8'000),      //
                                            "wxyz")))),         //
                  RespArray(ElementsAre(IntArg(10'001),         //
                                        IntArg(16'383),         //
                                        RespArray(ElementsAre(  //
                                            "10.0.0.2",         //
                                            IntArg(7'001),      //
                                            "efgh7890")),       //
                                        RespArray(ElementsAre(  //
                                            "10.0.0.11",        //
                                            IntArg(8'001),      //
                                            "qwerty")))))));

  EXPECT_THAT(Run({"cluster", "nodes"}),
              "abcd1234 10.0.0.1:7000@7000 master - 0 0 0 disconnected 0-10000\n"
              "wxyz 10.0.0.10:8000@8000 slave abcd1234 0 0 0 connected\n"
              "efgh7890 10.0.0.2:7001@7001 master - 0 0 0 connected 10001-16383\n"
              "qwerty 10.0.0.11:8001@8001 slave efgh7890 0 0 0 connected\n"
              "qwerty1 10.0.0.12:8002@8002 slave efgh7890 0 0 0 connected\n"
              "qwerty2 10.0.0.13:8003@8003 slave efgh7890 0 0 0 disconnected\n");

  absl::InsecureBitGen eng;
  while (true) {
    string random_key = GetRandomHex(eng, 40);
    SlotId slot = KeySlot(random_key);
    if (slot > 10'000) {
      continue;
    }

    EXPECT_THAT(Run({"get", random_key}).GetString(),
                testing::MatchesRegex(R"(MOVED [0-9]+ 10.0.0.1:7000)"));
    break;
  }

  while (true) {
    string random_key = GetRandomHex(eng, 40);
    SlotId slot = KeySlot(random_key);
    if (slot <= 10'000) {
      continue;
    }

    EXPECT_THAT(Run({"get", random_key}).GetString(),
                testing::MatchesRegex(R"(MOVED [0-9]+ 10.0.0.2:7001)"));
    break;
  }
}

TEST_F(ClusterFamilyTest, ClusterGetSlotInfoInvalid) {
  constexpr string_view kErr = "ERR syntax error";
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo"}), ErrArg(kErr));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "s"}), ErrArg(kErr));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots"}), ErrArg(kErr));
}

TEST_F(ClusterFamilyTest, ClusterGetSlotInfo) {
  ConfigSingleNodeCluster(GetMyId());

  constexpr string_view kKey = "some-key";
  const SlotId slot = KeySlot(kKey);
  EXPECT_NE(slot, 0) << "We need to choose another key";

  const string value(1'000, '#');  // Long string - to use heap
  EXPECT_EQ(Run({"SET", kKey, value}), "OK");

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", absl::StrCat(slot)}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(0), "key_count", IntArg(0), "total_reads", IntArg(0),
                                "total_writes", IntArg(0), "memory_bytes", IntArg(0))),
          RespArray(ElementsAre(IntArg(slot), "key_count", IntArg(1), "total_reads", IntArg(0),
                                "total_writes", IntArg(1), "memory_bytes", Not(IntArg(0)))))));

  EXPECT_EQ(Run({"GET", kKey}), value);

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", absl::StrCat(slot)}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(0), "key_count", IntArg(0), "total_reads", IntArg(0),
                                "total_writes", IntArg(0), "memory_bytes", IntArg(0))),
          RespArray(ElementsAre(IntArg(slot), "key_count", IntArg(1), "total_reads", IntArg(1),
                                "total_writes", IntArg(1), "memory_bytes", Not(IntArg(0)))))));

  EXPECT_EQ(Run({"SET", kKey, "value2"}), "OK");

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", absl::StrCat(slot)}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(0), "key_count", IntArg(0), "total_reads", IntArg(0),
                                "total_writes", IntArg(0), "memory_bytes", IntArg(0))),
          RespArray(ElementsAre(IntArg(slot), "key_count", IntArg(1), "total_reads", IntArg(1),
                                "total_writes", IntArg(2), "memory_bytes", IntArg(36))))));
}

TEST_F(ClusterFamilyTest, ClusterGetSlotInfoRanges) {
  ConfigSingleNodeCluster(GetMyId());

  // Test basic range syntax: 0-2 should return 3 slots
  auto result = RunPrivileged({"dflycluster", "getslotinfo", "slots", "0-2"});
  ASSERT_EQ(result.GetVec().size(), 3u);
  EXPECT_THAT(result.GetVec()[0], RespArray(ElementsAre(IntArg(0), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[1], RespArray(ElementsAre(IntArg(1), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[2], RespArray(ElementsAre(IntArg(2), _, _, _, _, _, _, _, _)));

  // Test mixed syntax: range + individual slots
  result = RunPrivileged({"dflycluster", "getslotinfo", "slots", "0-1", "5", "10-11"});
  ASSERT_EQ(result.GetVec().size(), 5u);
  EXPECT_THAT(result.GetVec()[0], RespArray(ElementsAre(IntArg(0), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[1], RespArray(ElementsAre(IntArg(1), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[2], RespArray(ElementsAre(IntArg(5), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[3], RespArray(ElementsAre(IntArg(10), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[4], RespArray(ElementsAre(IntArg(11), _, _, _, _, _, _, _, _)));

  // Test reversed range (5-2 should be treated as 2-5)
  result = RunPrivileged({"dflycluster", "getslotinfo", "slots", "5-2"});
  ASSERT_EQ(result.GetVec().size(), 4u);
  EXPECT_THAT(result.GetVec()[0], RespArray(ElementsAre(IntArg(2), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[1], RespArray(ElementsAre(IntArg(3), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[2], RespArray(ElementsAre(IntArg(4), _, _, _, _, _, _, _, _)));
  EXPECT_THAT(result.GetVec()[3], RespArray(ElementsAre(IntArg(5), _, _, _, _, _, _, _, _)));

  // Test invalid slot id in range
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "0-20000"}),
              ErrArg("Invalid slot id"));

  // Test invalid range format
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "abc-def"}),
              ErrArg("Invalid slot range format"));

  // Edge cases with dashes
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "-1"}),
              ErrArg("value is not an integer or out of range"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "1-"}),
              ErrArg("Invalid slot range format"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "1--2"}),
              ErrArg("Invalid slot range format"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "1-2-3"}),
              ErrArg("Invalid slot range format"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "1---2"}),
              ErrArg("Invalid slot range format"));
}

TEST_F(ClusterFamilyTest, ClusterSlotsPopulate) {
  ConfigSingleNodeCluster(GetMyId());

  Run({"debug", "populate", "10000", "key", "4", "SLOTS", "0", "1000"});

  for (int i = 0; i <= 1'000; ++i) {
    EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", absl::StrCat(i)}),
                RespArray(ElementsAre(IntArg(i), "key_count", Not(IntArg(0)), _, _, _, _, _, _)));
  }

  for (int i = 1'001; i <= 16'383; ++i) {
    EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", absl::StrCat(i)}),
                RespArray(ElementsAre(IntArg(i), "key_count", IntArg(0), _, _, _, _, _, _)));
  }
}

TEST_F(ClusterFamilyTest, ClusterEvalCrossslot) {
  ConfigSingleNodeCluster(GetMyId());

  auto res = Run({"EVAL", "return redis.call('MSET', 'x1', 'x1', 'x2', 'x2', 'x3', 'x3');", "3",
                  "x1", "x2", "x3"});

  EXPECT_THAT(res, ErrArg("CROSSSLOT"));

  auto sha =
      Run({"SCRPIT", "LOAD", "return redis.call('MSET', 'x1', 'x1', 'x2', 'x2', 'x3', 'x3');", "3",
           "x1", "x2", "x3"});

  EXPECT_THAT(Run({"EVALSHA", sha.GetString(), "3", "x1", "x2", "x3"}), ErrArg("CROSSSLOT"));
}

TEST_F(ClusterFamilyTest, ClusterMultiExec) {
  ConfigSingleNodeCluster(GetMyId());

  Run({"MULTI"});
  Run({"SET", "X1", "X1"});
  Run({"SET", "X2", "X2"});
  Run({"SET", "X3", "X3"});

  EXPECT_THAT(Run({"EXEC"}), ErrArg("CROSSSLOT"));
}

TEST_F(ClusterFamilyTest, ClusterConfigDeleteSlots) {
  ConfigSingleNodeCluster(GetMyId());

  Run({"debug", "populate", "100000"});

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "1", "2"}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(1), "key_count", Not(IntArg(0)), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(108))),
          RespArray(ElementsAre(IntArg(2), "key_count", Not(IntArg(0)), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(360))))));

  ConfigSingleNodeCluster("abc");

  ExpectConditionWithinTimeout([&]() { return CheckedInt({"dbsize"}) == 0; });

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "1", "2"}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(1), "key_count", IntArg(0), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(0))),
          RespArray(ElementsAre(IntArg(2), "key_count", IntArg(0), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(0))))));
}

// Test issue #1302
TEST_F(ClusterFamilyTest, ClusterConfigDeleteSlotsNoCrashOnShutdown) {
  ConfigSingleNodeCluster(GetMyId());

  Run({"debug", "populate", "100000"});

  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "1", "2"}),
      RespArray(ElementsAre(
          RespArray(ElementsAre(IntArg(1), "key_count", Not(IntArg(0)), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(108))),
          RespArray(ElementsAre(IntArg(2), "key_count", Not(IntArg(0)), "total_reads", IntArg(0),
                                "total_writes", Not(IntArg(0)), "memory_bytes", IntArg(360))))));

  // After running the new config we start a fiber that removes all slots from current instance
  // we immediately shut down to test that we do not crash.
  ConfigSingleNodeCluster("abc");
}

TEST_F(ClusterFamilyTest, ClusterConfigDeleteSomeSlots) {
  string config_template = R"json(
      [
        {
          "slot_ranges": [
            {
              "start": 0,
              "end": $1
            }
          ],
          "master": {
            "id": "$0",
            "ip": "10.0.0.1",
            "port": 7000
          },
          "replicas": []
        },
        {
          "slot_ranges": [
            {
              "start": $2,
              "end": 16383
            }
          ],
          "master": {
            "id": "other",
            "ip": "10.0.0.2",
            "port": 7000
          },
          "replicas": []
        }
      ])json";

  string config = absl::Substitute(config_template, GetMyId(), "8000", "8001");

  EXPECT_EQ(RunPrivileged({"dflycluster", "config", config}), "OK");

  Run({"debug", "populate", "1", "key", "4", "SLOTS", "7999", "7999"});
  Run({"debug", "populate", "2", "key", "4", "SLOTS", "8000", "8000"});

  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "7999", "8000"}),
              RespArray(ElementsAre(
                  RespArray(ElementsAre(IntArg(7999), "key_count", IntArg(1), _, _, _, _, _, _)),
                  RespArray(ElementsAre(IntArg(8000), "key_count", IntArg(2), _, _, _, _, _, _)))));
  EXPECT_THAT(Run({"dbsize"}), IntArg(3));

  // Move ownership over 8000 to other master
  config = absl::Substitute(config_template, GetMyId(), "7999", "8000");
  EXPECT_EQ(RunPrivileged({"dflycluster", "config", config}), "OK");

  // Verify that keys for slot 8000 were deleted, while key for slot 7999 was kept
  ExpectConditionWithinTimeout([&]() { return CheckedInt({"dbsize"}) == 1; });

  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "7999", "8000"}),
              RespArray(ElementsAre(
                  RespArray(ElementsAre(IntArg(7999), "key_count", IntArg(1), _, _, _, _, _, _)),
                  RespArray(ElementsAre(IntArg(8000), "key_count", IntArg(0), _, _, _, _, _, _)))));
}

TEST_F(ClusterFamilyTest, ClusterModeSelectNotAllowed) {
  EXPECT_THAT(Run({"select", "1"}), ErrArg("SELECT is not allowed in cluster mode"));
  EXPECT_EQ(Run({"select", "0"}), "OK");
}

TEST_F(ClusterFamilyTest, ClusterModePubSubNotAllowed) {
  EXPECT_THAT(Run({"PUBLISH", "ch", "message"}),
              ErrArg("PUBLISH is not supported in cluster mode yet"));
  EXPECT_THAT(Run({"SUBSCRIBE", "ch"}), ErrArg("SUBSCRIBE is not supported in cluster mode yet"));
  EXPECT_THAT(Run({"UNSUBSCRIBE", "ch"}),
              ErrArg("UNSUBSCRIBE is not supported in cluster mode yet"));
  EXPECT_THAT(Run({"PSUBSCRIBE", "ch?"}),
              ErrArg("PSUBSCRIBE is not supported in cluster mode yet"));
  EXPECT_THAT(Run({"PUNSUBSCRIBE", "ch?"}),
              ErrArg("PUNSUBSCRIBE is not supported in cluster mode yet"));
}

// SSUBSCRIBE and SPUBLISH work in cluster mode
TEST_F(ClusterFamilyTest, ClusterModePubSub) {
  single_response_ = false;
  ConfigSingleNodeCluster(GetMyId());

  // Ssubscribe works as expected
  auto resp = pp_->at(1)->Await([&] { return Run({"SSUBSCRIBE", "cluster-channel"}); });
  EXPECT_THAT(resp, RespElementsAre("ssubscribe", "cluster-channel", IntArg(1)));

  // Send-receive a single message
  resp = pp_->at(0)->Await([&] {
    return Run({"SPUBLISH", "cluster-channel", "a simple message"});
  });
  EXPECT_THAT(resp, IntArg(1));

  pp_->AwaitFiberOnAll([](util::ProactorBase* pb) {});

  ASSERT_EQ(1, SubscriberMessagesLen("IO1"));
  const auto& msg = GetPublishedMessage("IO1", 0);
  EXPECT_TRUE(msg.is_sharded);
  EXPECT_EQ("cluster-channel", msg.channel);
  EXPECT_EQ("a simple message", msg.message);

  // Sunsubscribe
  resp = pp_->at(1)->Await([&] { return Run({"SUNSUBSCRIBE", "cluster-channel"}); });
  EXPECT_THAT(resp, RespElementsAre("sunsubscribe", "cluster-channel", IntArg(0)));
}

TEST_F(ClusterFamilyTest, ClusterFirstConfigCallDropsEntriesNotOwnedByNode) {
  InitWithDbFilename();

  Run({"debug", "populate", "50000"});

  EXPECT_EQ(Run({"save", "df"}), "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  EXPECT_EQ(Run({"dfly", "load", save_info.file_name}), "OK");
  EXPECT_EQ(CheckedInt({"dbsize"}), 50000);

  ConfigSingleNodeCluster("abcd1234");

  // Make sure `dbsize` all slots were removed
  ExpectConditionWithinTimeout([&]() { return CheckedInt({"dbsize"}) == 0; });
}

TEST_F(ClusterFamilyTest, SnapshotBiggerThanMaxMemory) {
  InitWithDbFilename();
  ConfigSingleNodeCluster(GetMyId());

  Run({"debug", "populate", "50000"});
  EXPECT_EQ(Run({"save", "df"}), "OK");

  max_memory_limit = 10000;
  auto save_info = service_->server_family().GetLastSaveInfo();
  EXPECT_EQ(Run({"dfly", "load", save_info.file_name}), "OK");
}

TEST_F(ClusterFamilyTest, Keyslot) {
  // Example from Redis' command reference: https://redis.io/commands/cluster-keyslot/
  EXPECT_THAT(Run({"cluster", "keyslot", "somekey"}), IntArg(11'058));

  // Test hash tags
  EXPECT_THAT(Run({"cluster", "keyslot", "prefix{somekey}suffix"}), IntArg(11'058));

  EXPECT_EQ(CheckedInt({"cluster", "keyslot", "abc{def}ghi"}),
            CheckedInt({"cluster", "keyslot", "123{def}456"}));
}

TEST_F(ClusterFamilyTest, FlushSlots) {
  EXPECT_EQ(Run({"debug", "populate", "100", "key", "4", "slots", "0", "1"}), "OK");

  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", "1"}),
              RespArray(ElementsAre(
                  RespArray(ElementsAre(IntArg(0), "key_count", Not(IntArg(0)), "total_reads", _,
                                        "total_writes", _, "memory_bytes", _)),
                  RespArray(ElementsAre(IntArg(1), "key_count", Not(IntArg(0)), "total_reads", _,
                                        "total_writes", _, "memory_bytes", _)))));

  ExpectConditionWithinTimeout([&]() {
    return RunPrivileged({"dflycluster", "flushslots", "0", "0"}) == "OK";
  });
  util::ThisFiber::SleepFor(10ms);
  EXPECT_THAT(RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", "1"}),
              RespArray(ElementsAre(
                  RespArray(ElementsAre(IntArg(0), "key_count", IntArg(0), "total_reads", _,
                                        "total_writes", _, "memory_bytes", _)),
                  RespArray(ElementsAre(IntArg(1), "key_count", Not(IntArg(0)), "total_reads", _,
                                        "total_writes", _, "memory_bytes", _)))));

  EXPECT_EQ(RunPrivileged({"dflycluster", "flushslots", "0", "1"}), "OK");
  util::ThisFiber::SleepFor(10ms);
  EXPECT_THAT(
      RunPrivileged({"dflycluster", "getslotinfo", "slots", "0", "1"}),
      RespArray(ElementsAre(RespArray(ElementsAre(IntArg(0), "key_count", IntArg(0), "total_reads",
                                                  _, "total_writes", _, "memory_bytes", _)),
                            RespArray(ElementsAre(IntArg(1), "key_count", IntArg(0), "total_reads",
                                                  _, "total_writes", _, "memory_bytes", _)))));
}

TEST_F(ClusterFamilyTest, FlushSlotsOutOfBounds) {
  EXPECT_THAT(RunPrivileged({"dflycluster", "flushslots", "0", "16384"}),
              ErrArg("value is not an integer or out of range"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "flushslots", "16384", "16384"}),
              ErrArg("value is not an integer or out of range"));
  EXPECT_THAT(RunPrivileged({"dflycluster", "flushslots", "100", "50"}),
              ErrArg("Invalid slot range"));
}

TEST_F(ClusterFamilyTest, FlushSlotsAndImmediatelySetValue) {
  for (int count : {1, 10, 100, 1000, 10000, 100000}) {
    ConfigSingleNodeCluster(GetMyId());

    EXPECT_EQ(Run({"debug", "populate", absl::StrCat(count), "key", "4"}), "OK");
    EXPECT_EQ(Run({"get", "key:0"}), "xxxx");

    EXPECT_THAT(Run({"cluster", "keyslot", "key:0"}), IntArg(2592));
    EXPECT_THAT(Run({"dbsize"}), IntArg(count));
    auto slot_size_response = Run({"dflycluster", "getslotinfo", "slots", "2592"});
    EXPECT_THAT(slot_size_response, RespArray(ElementsAre(_, "key_count", _, "total_reads", _,
                                                          "total_writes", _, "memory_bytes", _)));
    auto slot_size = slot_size_response.GetVec()[2].GetInt();
    EXPECT_TRUE(slot_size.has_value());

    EXPECT_EQ(Run({"dflycluster", "flushslots", "2592", "2592"}), "OK");
    // key:0 should have been removed, so APPEND will end up with key:0 == ZZZZ
    EXPECT_THAT(Run({"append", "key:0", "ZZZZ"}), IntArg(4));
    EXPECT_EQ(Run({"get", "key:0"}), "ZZZZ");
    // db size should be count - (size of slot 2592) + 1, where 1 is for 'key:0'
    ExpectConditionWithinTimeout(
        [&]() { return CheckedInt({"dbsize"}) == (count - *slot_size + 1); });

    ResetService();
  }
}

TEST_F(ClusterFamilyTest, ClusterCrossSlot) {
  ConfigSingleNodeCluster(GetMyId());

  EXPECT_EQ(Run({"SET", "key", "value"}), "OK");
  EXPECT_EQ(Run({"GET", "key"}), "value");

  EXPECT_EQ(Run({"MSET", "key", "value2"}), "OK");
  EXPECT_EQ(Run({"MGET", "key"}), "value2");

  EXPECT_THAT(Run({"MSET", "key", "value", "key2", "value2"}), ErrArg("CROSSSLOT"));
  EXPECT_THAT(Run({"MGET", "key", "key2"}), ErrArg("CROSSSLOT"));
  EXPECT_THAT(Run({"ZINTERSTORE", "key", "2", "key1", "key2"}), ErrArg("CROSSSLOT"));

  EXPECT_EQ(Run({"MSET", "key{tag}", "value", "key2{tag}", "value2"}), "OK");
  EXPECT_THAT(Run({"MGET", "key{tag}", "key2{tag}"}), RespArray(ElementsAre("value", "value2")));
}

class ClusterFamilyEmulatedTest : public ClusterFamilyTest {
 public:
  ClusterFamilyEmulatedTest() {
    SetTestFlag("cluster_mode", "emulated");
    SetTestFlag("cluster_announce_ip", "fake-host");
  }
};

TEST_F(ClusterFamilyEmulatedTest, ClusterInfo) {
  string cluster_info = Run({"cluster", "info"}).GetString();
  EXPECT_THAT(cluster_info, HasSubstr("cluster_state:ok"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_assigned:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_slots_ok:16384"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_known_nodes:1"));
  EXPECT_THAT(cluster_info, HasSubstr("cluster_size:1"));
}

TEST_F(ClusterFamilyEmulatedTest, ClusterShardInfos) {
  EXPECT_THAT(Run({"cluster", "shards"}),
              RespArray(ElementsAre("slots",                                           //
                                    RespArray(ElementsAre(IntArg(0), IntArg(16383))),  //
                                    "nodes",                                           //
                                    RespArray(ElementsAre(                             //
                                        RespArray(ElementsAre(                         //
                                            "id", GetMyId(),                           //
                                            "endpoint", "fake-host",                   //
                                            "ip", "fake-host",                         //
                                            "port", IntArg(6379),                      //
                                            "role", "master",                          //
                                            "replication-offset", IntArg(0),           //
                                            "health", "online")))))));
}

TEST_F(ClusterFamilyEmulatedTest, ClusterSlots) {
  EXPECT_THAT(Run({"cluster", "slots"}),
              RespArray(ElementsAre(IntArg(0),              //
                                    IntArg(16383),          //
                                    RespArray(ElementsAre(  //
                                        "fake-host",        //
                                        IntArg(6379),       //
                                        GetMyId())))));
}

TEST_F(ClusterFamilyEmulatedTest, ClusterNodes) {
  auto res = Run({"cluster", "nodes"});
  EXPECT_THAT(res, GetMyId() + " fake-host:6379@6379 myself,master - 0 0 0 connected 0-16383\n");
}

TEST_F(ClusterFamilyEmulatedTest, ForbidenCommands) {
  auto res = Run({"DFLYCLUSTER", "GETSLOTINFO", "SLOTS", "1"});
  EXPECT_THAT(res, ErrArg("Cluster is disabled. Use --cluster_mode=yes to enable."));
}

}  // namespace
}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_utility.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/cluster_utility.h"

#include "server/cluster/cluster_defs.h"
#include "server/common.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"

using namespace std;

namespace dfly::cluster {

uint64_t GetKeyCount(const SlotRanges& slots) {
  std::atomic_uint64_t keys = 0;

  shard_set->pool()->AwaitFiberOnAll([&](auto*) {
    EngineShard* shard = EngineShard::tlocal();
    if (shard == nullptr)
      return;

    uint64_t shard_keys = 0;
    for (const SlotRange& range : slots) {
      for (SlotId slot = range.start; slot <= range.end; slot++) {
        shard_keys += namespaces->GetDefaultNamespace()
                          .GetDbSlice(shard->shard_id())
                          .GetSlotStats(slot)
                          .key_count;
      }
    }
    keys.fetch_add(shard_keys);
  });

  return keys.load();
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/cluster_utility.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "server/cluster/cluster_defs.h"

namespace dfly::cluster {

uint64_t GetKeyCount(const SlotRanges& slots);

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/coordinator.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/coordinator.h"

#include "base/logging.h"
#include "facade/redis_parser.h"
#include "facade/socket_utils.h"
#include "server/cluster/cluster_config.h"

using namespace std;
using namespace facade;

namespace dfly::cluster {

class Coordinator::CrossShardRequest {
 public:
  CrossShardRequest(std::string cmd, Coordinator::RespCB cb, uint32_t total_shards)
      : command_(std::move(cmd)), cb_(std::move(cb)), shard_processed_(total_shards) {
  }

  const std::string& GetCommand() const {
    return command_;
  }

  template <class... Args> void Exec(Args&&... args) {
    cb_(std::forward<Args>(args)...);
    if (shard_processed_.fetch_sub(1, std::memory_order_relaxed) == 1) {
      future_.Resolve(GenericError{});
    }
  }

  util::fb2::Future<GenericError>& GetFuture() {
    return future_;
  }

 private:
  std::string command_;
  Coordinator::RespCB cb_;
  util::fb2::Future<GenericError> future_;
  std::atomic_uint32_t shard_processed_;
};

class Coordinator::CrossShardClient : public ProtocolClient {
 public:
  CrossShardClient(std::string host, uint16_t port) : ProtocolClient(std::move(host), port) {
  }

  using ProtocolClient::CloseSocket;
  ~CrossShardClient() {
    exec_st_.Cancel();
    waker_.notifyAll();
    CloseSocket();
    send_fb_.Join();
    resp_fb_.Join();
  }

  [[nodiscard]] bool Init() {
    VLOG(1) << "Resolving host DNS to " << server().Description();
    if (error_code ec = ResolveHostDns(); ec) {
      LOG(WARNING) << "Could not resolve host DNS to " << server().Description() << ": "
                   << ec.message();
      exec_st_.ReportError(GenericError(ec, "Could not resolve host dns."));
      return false;
    }
    VLOG(1) << "Start coordinator connection to " << server().Description();
    auto timeout = 3000ms;  // TODO add flag;
    if (auto ec = ConnectAndAuth(timeout, &exec_st_); ec) {
      LOG(WARNING) << "Couldn't connect to " << server().Description() << ": " << ec.message()
                   << ", socket state: " << GetSocketInfo(Sock()->native_handle());
      exec_st_.ReportError(GenericError(ec, "Couldn't connect to source."));
      return false;
    }

    ResetParser(RedisParser::Mode::CLIENT);
    send_fb_ = util::fb2::Fiber("CSS_SendFb", &CrossShardClient::SendFb, this);
    resp_fb_ = util::fb2::Fiber("CSS_RespFb", &CrossShardClient::RespFb, this);
    return true;
  }

  void Cancel() {
    exec_st_.Cancel();
    ShutdownSocket();
  }

  void EnqueueCommand(CrossShardRequestPtr req) {
    {
      std::lock_guard lk(send_mu_);
      send_queue_.push(req);
      ready_to_send_ = true;
    }
    {
      std::lock_guard lk(resp_mu_);
      resp_queue_.push(req);
      ready_to_resp_ = true;
    }

    waker_.notifyAll();
  }

  void SendFb() {
    while (!exec_st_.IsCancelled()) {
      waker_.await([this] { return exec_st_.IsCancelled() || ready_to_send_; });
      if (exec_st_.IsCancelled())
        return;
      std::lock_guard lk(send_mu_);
      while (!send_queue_.empty()) {
        if (auto ec = ProtocolClient::SendCommand(send_queue_.front()->GetCommand()); ec) {
          exec_st_.ReportError(GenericError(
              ec, absl::StrCat("Coordinator could not send command to : ", server().Description(),
                               "socket state: ", GetSocketInfo(Sock()->native_handle()))));
          // TODO reinit connection.
          break;
        }
        send_queue_.pop();
      }
      ready_to_send_ = false;
    }
  }

  void RespFb() {
    while (!exec_st_.IsCancelled()) {
      waker_.await([this] { return exec_st_.IsCancelled() || ready_to_resp_; });
      if (exec_st_.IsCancelled())
        return;
      std::lock_guard lk(resp_mu_);
      constexpr auto timeout = 3000;  // TODO add flag and add usage in ReadRespReply.
      while (!resp_queue_.empty()) {
        auto resp = TakeRespReply(timeout);
        if (!resp) {
          LOG(WARNING) << "Error reading response from " << server().Description() << ": "
                       << resp.error()
                       << ", socket state: " + GetSocketInfo(Sock()->native_handle());

          // TODO make all requests fail in this case.
          // TODO reinit connection.
          LOG(FATAL) << "Coordinator RespFb read error, not implemented recovery yet.";
          break;
        }
        resp_queue_.front()->Exec(*resp);
        resp_queue_.pop();
      }
      ready_to_resp_ = false;
    }
  }

 private:
  std::queue<std::shared_ptr<CrossShardRequest>> send_queue_;
  std::queue<std::shared_ptr<CrossShardRequest>> resp_queue_;

  util::fb2::Fiber send_fb_;
  util::fb2::Fiber resp_fb_;
  util::fb2::EventCount waker_;

  mutable util::fb2::Mutex send_mu_;
  mutable util::fb2::Mutex resp_mu_;
  std::atomic_bool ready_to_send_ = false;
  std::atomic_bool ready_to_resp_ = false;
};

Coordinator& Coordinator::Current() {
  static Coordinator instance;
  return instance;
}

std::shared_ptr<Coordinator::CrossShardClient> Coordinator::GetClient(const std::string& host,
                                                                      uint16_t port) {
  for (const auto& client : clients_) {
    if (client->GetHost() == host && client->GetPort() == port) {
      return client;
    }
  }
  auto new_client = std::make_shared<CrossShardClient>(host, port);
  if (new_client->Init()) {
    clients_.emplace_back(new_client);
    return new_client;
  }
  return nullptr;
}

util::fb2::Future<GenericError> Coordinator::DispatchAll(std::string command, RespCB cb) {
  auto cluster_config = ClusterConfig::Current();
  if (!cluster_config) {
    VLOG(2) << "No cluster config found for coordinator plan creation.";
    LOG(FATAL) << "No cluster config, not implemented logic yet.";
    return {};
  }

  if (!cluster_config->is_master()) {
    VLOG(2) << "Current node isn't master, the command should be executed locally:" << command;
    util::fb2::Future<GenericError> res;
    res.Resolve(GenericError{});
    return res;
  }

  VLOG(2) << "Dispatching command to all shards: " << command;
  auto shards_config = cluster_config->GetConfig();

  auto shard_request = std::make_shared<CrossShardRequest>(std::move(command), std::move(cb),
                                                           shards_config.size() - 1);

  for (const auto& shard : shards_config) {
    if (shard.master.id == cluster_config->MyId()) {
      continue;
    }
    const auto& client = GetClient(shard.master.ip, shard.master.port);
    if (!client) {
      VLOG(1) << "Could not get coordinator client for " << shard.master.ip << ":"
              << shard.master.port;
      cb(RESPObj());  // TODO add error propagation.
      LOG(FATAL) << "No error processing, not implemented logic yet.";
      return {};
    }
    client->EnqueueCommand(shard_request);
  }
  return shard_request->GetFuture();
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/coordinator.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include "server/cluster/cluster_defs.h"
#include "server/protocol_client.h"
#include "util/fibers/future.h"

namespace dfly::cluster {

// Coordinator needs to create and manage connections between nodes in the cluster for cross shard
// commands. All cross-shard commands are dispatched through the Coordinator.
// It can be used to exeute commands on all shards or specific shards.
class Coordinator {
 public:
  using RespCB = std::function<void(const facade::RESPObj&)>;  // TODO add error.

  static Coordinator& Current();
  [[nodiscard]] util::fb2::Future<GenericError> DispatchAll(std::string command, RespCB cb);

  void Shutdown() {
    // TODO add proper shutdown logic. We need to prevent new clients creation. Maybe we need to
    // wait destroying of existing clients.
    clients_.clear();
  }

 private:
  Coordinator() = default;
  class CrossShardClient;
  class CrossShardRequest;
  using CrossShardRequestPtr = std::shared_ptr<Coordinator::CrossShardRequest>;
  std::shared_ptr<CrossShardClient> GetClient(const std::string& host, uint16_t port);
  std::vector<std::shared_ptr<CrossShardClient>> clients_;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/incoming_slot_migration.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/incoming_slot_migration.h"

#include <absl/cleanup/cleanup.h>
#include <absl/strings/str_cat.h>

#include "base/flags.h"
#include "base/logging.h"
#include "cluster_utility.h"
#include "facade/service_interface.h"
#include "facade/socket_utils.h"
#include "server/error.h"
#include "server/journal/executor.h"
#include "server/journal/serializer.h"
#include "server/journal/tx_executor.h"
#include "server/main_service.h"
#include "util/fibers/synchronization.h"

ABSL_DECLARE_FLAG(int, migration_finalization_timeout_ms);
ABSL_FLAG(uint32_t, slot_migration_throttle_us, 0,
          "Incoming migration throttle time in us, we throttle every 100us of migration commands "
          "processing, 0 to disable. Recommended value is 20. Values more than 50 can "
          "significantly reduce migration speed.");

namespace dfly::cluster {

using namespace std;
using namespace util;
using namespace facade;

// ClusterShardMigration manage data receiving in slots migration process.
// It is created per shard on the target node to initiate FLOW step.
class ClusterShardMigration {
 public:
  ClusterShardMigration(uint32_t shard_id, Service* service, IncomingSlotMigration* in_migration,
                        util::fb2::BlockingCounter bc)
      : source_shard_id_(shard_id),
        is_finished_(false),
        socket_(nullptr),
        executor_(service),
        in_migration_(in_migration),
        bc_(bc) {
  }

  void Pause(bool pause) {
    pause_ = pause;
  }

  void Start(ExecutionState* cntx, util::FiberSocketBase* source) ABSL_LOCKS_EXCLUDED(mu_) {
    {
      util::fb2::LockGuard lk(mu_);
      if (is_finished_) {
        return;
      }
      is_finished_ = true;
      socket_ = source;
    }

    absl::Cleanup cleanup([this]() ABSL_LOCKS_EXCLUDED(mu_) {
      util::fb2::LockGuard lk(mu_);
      socket_ = nullptr;
    });
    JournalReader reader{source, 0};
    TransactionReader tx_reader;
    uint64_t last_sleep = fb2::ProactorBase::GetMonotonicTimeNs();

    const uint64_t throttle_us = absl::GetFlag(FLAGS_slot_migration_throttle_us);
    TransactionData tx_data;
    while (cntx->IsRunning()) {
      if (pause_) {
        ThisFiber::SleepFor(100ms);
        continue;
      }

      bool success = tx_reader.NextTxData(&reader, cntx, &tx_data);
      if (!success) {
        if (auto err = cntx->GetError(); err) {
          LOG(WARNING) << "Error reading from migration socket for shard " << source_shard_id_
                       << ": " << err.Format()
                       << ", socket state: " << GetSocketInfo(source->native_handle());
        }
        break;
      }

      while (tx_data.opcode == journal::Op::LSN) {
        VLOG(2) << "Attempt to finalize flow " << source_shard_id_ << " attempt " << tx_data.lsn;
        last_attempt_.store(tx_data.lsn);
        bc_->Dec();  // we can Join the flow now
        // if we get new data, attempt is failed
        if (success = tx_reader.NextTxData(&reader, cntx, &tx_data); !success) {
          VLOG(1) << "Finalized flow " << source_shard_id_;
          return;
        }

        if (in_migration_->GetState() == MigrationState::C_FATAL) {
          VLOG(1) << "Flow finalization " << source_shard_id_
                  << " canceled due memory limit reached";
          return;
        }
        if (!tx_data.command.empty()) {
          VLOG(1) << "Flow finalization failed " << source_shard_id_ << " by "
                  << tx_data.command.Front();
        } else {
          VLOG(1) << "Flow finalization failed " << source_shard_id_ << " by opcode "
                  << (int)tx_data.opcode;
        }

        bc_->Add();  // the flow isn't finished so we lock it again
      }
      if (tx_data.opcode == journal::Op::PING) {
        // TODO check about ping logic
      } else {
        auto err = ExecuteTx(std::move(tx_data), cntx);
        // Break incoming slot migration if command reported OOM
        if (err == std::errc::not_enough_memory) {
          cntx->ReportError(std::string{kIncomingMigrationOOM});
          in_migration_->ReportFatalError(std::string{kIncomingMigrationOOM});
          break;
        }
      }
      if (throttle_us > 0) {
        // every 100us we do sleep for 20us to allow other commands to be processed
        if (uint64_t now = fb2::ProactorBase::GetMonotonicTimeNs(); now - last_sleep > 100000) {
          ThisFiber::SleepFor(std::chrono::microseconds(throttle_us));
          last_sleep = now;
        }
      }
    }

    VLOG(2) << "Flow " << source_shard_id_ << " canceled";
    bc_->Dec();  // we should provide ability to join the flow
  }

  std::error_code Cancel() {
    util::fb2::LockGuard lk(mu_);
    if (socket_ != nullptr) {
      return socket_->proactor()->Await([s = socket_]() {
        if (s->IsOpen()) {
          auto ec = s->Shutdown(SHUT_RDWR);  // Does not Close(), only forbids further I/O.
          LOG_IF(WARNING, ec) << "Error shutting down socket for shard migration: " << ec.message()
                              << ", socket state: " << GetSocketInfo(s->native_handle());
          return ec;
        }
        return std::error_code();
      });
    }
    if (!is_finished_) {
      is_finished_ = true;
      bc_->Dec();  // we should provide ability to join the flow if the Start() wasn't called
    }

    return {};
  }

  long GetLastAttempt() const {
    return last_attempt_.load();
  }

 private:
  std::error_code ExecuteTx(TransactionData&& tx_data, ExecutionState* cntx) {
    if (!cntx->IsRunning()) {
      return {};
    }

    if (!tx_data.IsGlobalCmd()) {
      facade::DispatchResult res = executor_.Execute(tx_data.dbid, tx_data.command);
      return res == facade::DispatchResult::OOM ? make_error_code(errc::not_enough_memory)
                                                : error_code();
    } else {
      // TODO check which global commands should be supported
      std::string error = absl::StrCat("We don't support command: ", tx_data.command[0],
                                       " in cluster migration process.");
      LOG(ERROR) << error;
      cntx->ReportError(error);
      in_migration_->ReportError(error);
    }

    return {};
  }

  uint32_t source_shard_id_;
  util::fb2::Mutex mu_;
  bool is_finished_ ABSL_GUARDED_BY(mu_);
  util::FiberSocketBase* socket_ ABSL_GUARDED_BY(mu_);
  JournalExecutor executor_;
  IncomingSlotMigration* in_migration_;
  util::fb2::BlockingCounter bc_;
  atomic_long last_attempt_{-1};
  atomic_bool pause_ = false;
};

IncomingSlotMigration::IncomingSlotMigration(string source_id, Service* se, SlotRanges slots)
    : source_id_(std::move(source_id)), service_(*se), slots_(std::move(slots)), bc_(0) {
}

IncomingSlotMigration::~IncomingSlotMigration() {
}

void IncomingSlotMigration::Pause(bool pause) {
  VLOG(1) << "Pausing migration " << pause;
  for (auto& flow : shard_flows_) {
    flow->Pause(pause);
  }
}

bool IncomingSlotMigration::Join(long attempt) {
  const absl::Time start = absl::Now();
  const absl::Duration timeout =
      absl::Milliseconds(absl::GetFlag(FLAGS_migration_finalization_timeout_ms));

  while (true) {
    const absl::Time now = absl::Now();
    const absl::Duration passed = now - start;
    VLOG_EVERY_N(1, 10000) << "Checking whether to continue with join " << passed << " vs "
                           << timeout;
    if (passed >= timeout) {
      LOG(WARNING) << "Can't join migration in time for " << source_id_;
      ReportError(GenericError("Can't join migration in time"));
      return false;
    }

    // If any of migration shards reported ERROR (OOM) we can return error
    if (GetState() == MigrationState::C_FATAL) {
      return false;
    }

    // if data was sent after LSN, WaitFor() always returns false so to reduce wait time
    // we check current state and if WaitFor false but GetLastAttempt() == attempt
    // the Join is failed and we can return false
    const auto remaining_time = absl::ToInt64Milliseconds(timeout - passed);
    const auto wait_time = (remaining_time > 100 ? 100 : remaining_time) * 1ms;

    const auto is_attempt_correct =
        std::all_of(shard_flows_.begin(), shard_flows_.end(),
                    [attempt](const auto& flow) { return flow->GetLastAttempt() == attempt; });

    auto wait_res = bc_->WaitFor(wait_time);
    if (is_attempt_correct) {
      if (wait_res) {
        util::fb2::LockGuard lk(state_mu_);
        state_ = MigrationState::C_FINISHED;
        keys_number_ = cluster::GetKeyCount(slots_);
      } else {
        LOG(WARNING) << "Can't join migration because of data after LSN for " << source_id_;
        ReportError(GenericError("Can't join migration in time"));
      }
      return wait_res;
    }
  }
}

void IncomingSlotMigration::Stop() {
  util::fb2::LockGuard lk(state_mu_);
  string_view log_state = state_ == MigrationState::C_FINISHED ? "Finishing" : "Cancelling";
  LOG(INFO) << log_state << " incoming migration of slots " << slots_.ToString();
  cntx_.Cancel();

  for (auto& flow : shard_flows_) {
    if (auto err = flow->Cancel(); err) {
      VLOG(1) << "Error during flow Stop: " << err;
    }
  }

  // Don't wait if we reached FATAL state
  if (state_ == MigrationState::C_FATAL) {
    return;
  }

  // we need to Join the migration process to prevent data corruption
  const absl::Time start = absl::Now();
  const absl::Duration timeout =
      absl::Milliseconds(absl::GetFlag(FLAGS_migration_finalization_timeout_ms));

  while (true) {
    const absl::Time now = absl::Now();
    const absl::Duration passed = now - start;
    VLOG(1) << "Checking whether to continue with stop " << passed << " vs " << timeout;

    if (bc_->WaitFor(absl::ToInt64Milliseconds(timeout - passed) * 1ms)) {
      return;
    } else if (passed >= timeout) {
      LOG(ERROR) << "Can't stop migration in time";
      return;
    }
  }
}

void IncomingSlotMigration::Init(uint32_t shards_num) {
  util::fb2::LockGuard lk(state_mu_);
  cntx_.Reset(nullptr);
  state_ = MigrationState::C_SYNC;

  bc_ = BlockingCounter(shards_num);
  shard_flows_.resize(shards_num);
  for (unsigned i = 0; i < shards_num; ++i) {
    shard_flows_[i].reset(new ClusterShardMigration(i, &service_, this, bc_));
  }
}

void IncomingSlotMigration::StartFlow(uint32_t shard, util::FiberSocketBase* source) {
  shard_flows_[shard]->Start(&cntx_, source);
  VLOG(1) << "Incoming flow " << shard
          << (GetState() == MigrationState::C_FINISHED ? " finished " : " cancelled ") << "for "
          << source_id_;
  if (GetState() == MigrationState::C_FATAL) {
    Stop();
  }
}

size_t IncomingSlotMigration::GetKeyCount() const {
  {
    util::fb2::LockGuard lk(state_mu_);
    if (state_ == MigrationState::C_FINISHED) {
      return keys_number_;
    }
  }

  return cluster::GetKeyCount(slots_);
}

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/incoming_slot_migration.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include "helio/util/fiber_socket_base.h"
#include "server/cluster/cluster_defs.h"
#include "server/execution_state.h"

namespace dfly {
class Service;
}

namespace dfly::cluster {
class ClusterShardMigration;

// The main entity on the target side that manage slots migration process
// Manage connections between the target and source node,
// manage migration process state and data
class IncomingSlotMigration {
 public:
  IncomingSlotMigration(std::string source_id, Service* se, SlotRanges slots);
  ~IncomingSlotMigration();

  // process data from FDLYMIGRATE FLOW cmd
  // executes until Stop called or connection closed
  void StartFlow(uint32_t shard, util::FiberSocketBase* source);

  // Waits until all flows got FIN opcode.
  // returns true if we joined false if timeout is readed
  // After Join we still can get data due to error situation
  [[nodiscard]] bool Join(long attempt);

  // Stop and join the migration, can be called even after migration is finished
  void Stop();

  // Init/Reinit migration
  void Init(uint32_t shards_num);

  MigrationState GetState() const {
    util::fb2::LockGuard lk(state_mu_);
    return state_;
  }

  const SlotRanges& GetSlots() const {
    return slots_;
  }

  const std::string& GetSourceID() const {
    return source_id_;
  }

  size_t ShardNum() const {
    return shard_flows_.size();
  }

  // Switch to  FATAL state and store error message
  void ReportFatalError(dfly::GenericError err) ABSL_LOCKS_EXCLUDED(state_mu_, error_mu_) {
    errors_count_.fetch_add(1, std::memory_order_relaxed);
    util::fb2::LockGuard lk_state(state_mu_);
    util::fb2::LockGuard lk_error(error_mu_);
    state_ = MigrationState::C_FATAL;
    last_error_ = std::move(err);
  }

  void ReportError(dfly::GenericError err) ABSL_LOCKS_EXCLUDED(error_mu_) {
    errors_count_.fetch_add(1, std::memory_order_relaxed);
    util::fb2::LockGuard lk(error_mu_);
    if (GetState() != MigrationState::C_FATAL)
      last_error_ = std::move(err);
  }

  std::string GetErrorStr() const ABSL_LOCKS_EXCLUDED(error_mu_) {
    util::fb2::LockGuard lk(error_mu_);
    return last_error_.Format();
  }

  size_t GetErrorsCount() const {
    return errors_count_.load(std::memory_order_relaxed);
  }

  size_t GetKeyCount() const;

  void Pause(bool pause);

 private:
  std::string source_id_;
  Service& service_;
  std::vector<std::unique_ptr<ClusterShardMigration>> shard_flows_;
  SlotRanges slots_;
  ExecutionState cntx_;

  mutable util::fb2::Mutex error_mu_;
  dfly::GenericError last_error_ ABSL_GUARDED_BY(error_mu_);
  std::atomic<size_t> errors_count_ = 0;

  mutable util::fb2::Mutex state_mu_;
  MigrationState state_ ABSL_GUARDED_BY(state_mu_) = MigrationState::C_CONNECTING;

  // when migration is finished we need to store number of migrated keys
  // because new request can add or remove keys and we get incorrect statistic
  size_t keys_number_ = 0;

  util::fb2::BlockingCounter bc_;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/outgoing_slot_migration.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cluster/outgoing_slot_migration.h"

#include <absl/flags/flag.h>

#include <atomic>

#include "absl/cleanup/cleanup.h"
#include "base/logging.h"
#include "cluster_family.h"
#include "cluster_utility.h"
#include "facade/socket_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/journal/streamer.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/server_family.h"
#include "util/fibers/synchronization.h"

ABSL_FLAG(int, slot_migration_connection_timeout_ms, 2000,
          "Connection creating timeout for migration operations");
ABSL_FLAG(int, migration_finalization_timeout_ms, 30000,
          "Timeout for migration finalization operation");

using namespace std;
using namespace facade;
using namespace util;

namespace dfly::cluster {

class OutgoingMigration::SliceSlotMigration : private ProtocolClient {
 public:
  SliceSlotMigration(DbSlice* slice, ServerContext server_context, SlotSet slots,
                     OutgoingMigration* om)
      : ProtocolClient(server_context), streamer_(slice, std::move(slots), &exec_st_) {
    exec_st_.SwitchErrorHandler([om](auto ge) { om->Finish(std::move(ge)); });
  }

  ~SliceSlotMigration() {
    CloseSocket();
    // it should already be unregistered, this cancel was added to avoid race condition that we
    // possibly have.
    bool unregistered = streamer_.Cancel();
    LOG_IF(DFATAL, unregistered)
        << "Streamer was not unregistered properly. Check code for race conditions.";
    exec_st_.JoinErrorHandler();
  }

  // Send DFLYMIGRATE FLOW
  void PrepareFlow(const std::string& node_id) {
    uint32_t shard_id = EngineShard::tlocal()->shard_id();

    VLOG(1) << "Connecting to source node_id " << node_id << " shard_id " << shard_id;
    auto timeout = absl::GetFlag(FLAGS_slot_migration_connection_timeout_ms) * 1ms;
    if (auto ec = ConnectAndAuth(timeout, &exec_st_); ec) {
      LOG(WARNING) << "Couldn't connect to source node_id " << node_id << " shard_id " << shard_id
                   << ": " << ec.message()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
      exec_st_.ReportError(GenericError(ec, "Couldn't connect to source."));
      return;
    }

    ResetParser(RedisParser::Mode::CLIENT);

    std::string cmd = absl::StrCat("DFLYMIGRATE FLOW ", node_id, " ", shard_id);
    VLOG(1) << "cmd: " << cmd;

    if (auto ec = SendCommandAndReadResponse(cmd); ec) {
      exec_st_.ReportError(GenericError(ec, cmd));
      return;
    }

    if (!CheckRespIsSimpleReply("OK")) {
      exec_st_.ReportError(absl::StrCat("Incorrect response for FLOW cmd: ",
                                        ToSV(LastResponseArgs().front().GetBuf())));
      return;
    }
  }

  // Register db_slice and journal change listeners
  void PrepareSync() {
    streamer_.Start(Sock());
  }

  // Run restore streamer
  void RunSync() {
    streamer_.Run();
  }

  void Cancel() {
    // Shutdown socket and allow IO loops to return.
    ShutdownSocket();
    streamer_.Cancel();
  }

  void Finalize(long attempt) {
    streamer_.SendFinalize(attempt);
  }

  dfly::GenericError GetError() const {
    return exec_st_.GetError();
  }

  using ProtocolClient::CloseSocket;

 private:
  RestoreStreamer streamer_;
};

OutgoingMigration::OutgoingMigration(MigrationInfo info, ClusterFamily* cf, ServerFamily* sf)
    : ProtocolClient(info.node_info.ip, info.node_info.port),
      migration_info_(std::move(info)),
      slot_migrations_(shard_set->size()),
      server_family_(sf),
      cf_(cf),
      tx_(new Transaction{sf->service().FindCmd("DFLYCLUSTER")}) {
  tx_->InitByArgs(&namespaces->GetDefaultNamespace(), 0, {});
}

OutgoingMigration::~OutgoingMigration() {
  main_sync_fb_.JoinIfNeeded();

  exec_st_.JoinErrorHandler();
  // Destroy each flow in its dedicated thread, because we could be the last
  // owner of the db tables
  OnAllShards([](auto& migration) {
    if (migration) {
      migration.reset();
    }
  });

  CloseSocket();
}

bool OutgoingMigration::ChangeState(MigrationState new_state) {
  util::fb2::LockGuard lk(state_mu_);
  if (state_ == MigrationState::C_FINISHED) {
    return false;
  }

  state_ = new_state;
  return true;
}

void OutgoingMigration::OnAllShards(std::function<void(UniqueSliceSlotMigration&)> func) {
  shard_set->RunBlockingInParallel(
      [this, &func](auto* shard) { func(slot_migrations_[shard->shard_id()]); });
}

void OutgoingMigration::Finish(const GenericError& error) {
  auto next_state = MigrationState::C_FINISHED;
  if (error) {
    // If OOM error move to FATAL, non-recoverable  state
    if (error == errc::not_enough_memory) {
      next_state = MigrationState::C_FATAL;
    } else {
      next_state = MigrationState::C_ERROR;
      exec_st_.ReportError(error);
    }
    LOG(WARNING) << "Finish outgoing migration for " << cf_->MyID() << ": "
                 << migration_info_.node_info.id << " with error: " << error.Format();

  } else {
    LOG(INFO) << "Finish outgoing migration for " << cf_->MyID() << ": "
              << migration_info_.node_info.id;
  }

  bool should_cancel_flows = false;
  absl::Cleanup on_exit([this]() { ShutdownSocket(); });

  {
    util::fb2::LockGuard lk(state_mu_);
    switch (state_) {
      case MigrationState::C_FATAL:
      case MigrationState::C_FINISHED:
        return;  // Already finished, nothing else to do

      case MigrationState::C_CONNECTING:
        should_cancel_flows = false;
        break;

      case MigrationState::C_SYNC:
      case MigrationState::C_ERROR:
        should_cancel_flows = true;
        break;
    }
    state_ = next_state;
  }

  if (next_state == MigrationState::C_FATAL) {
    // Fatal state stop any further processing of migration so we need to update error here
    SetLastError(error);
  }

  if (should_cancel_flows) {
    OnAllShards([](auto& migration) {
      CHECK(migration != nullptr);
      migration->Cancel();
    });
    exec_st_.JoinErrorHandler();
  }
}

MigrationState OutgoingMigration::GetState() const {
  util::fb2::LockGuard lk(state_mu_);
  return state_;
}

void OutgoingMigration::SyncFb() {
  VLOG(1) << "Starting outgoing migration fiber for migration " << migration_info_.ToString();

  const absl::Time start_time = absl::Now();

  // we retry starting migration until "cancel" is happened
  while (GetState() != MigrationState::C_FINISHED) {
    if (!ChangeState(MigrationState::C_CONNECTING)) {
      break;
    }

    if (exec_st_.IsError()) {
      ResetError();
      ThisFiber::SleepFor(500ms);  // wait some time before next retry
    }

    VLOG(1) << "Connecting to target node";
    auto timeout = absl::GetFlag(FLAGS_slot_migration_connection_timeout_ms) * 1ms;
    if (auto ec = ConnectAndAuth(timeout, &exec_st_); ec) {
      LOG(WARNING) << "Can't connect to target node " << server().Description()
                   << " for migration: " << ec.message()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
      exec_st_.ReportError(GenericError(ec, "Couldn't connect to source."));
      continue;
    }

    VLOG(1) << "Migration initiating";
    ResetParser(RedisParser::Mode::CLIENT);
    auto cmd = absl::StrCat("DFLYMIGRATE INIT ", cf_->MyID(), " ", slot_migrations_.size());
    for (const auto& s : migration_info_.slot_ranges) {
      absl::StrAppend(&cmd, " ", s.start, " ", s.end);
    }

    if (auto ec = SendCommandAndReadResponse(cmd); ec) {
      LOG(WARNING) << "Could not send INIT command to " << server().Description()
                   << " for migration: " << ec.message()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
      exec_st_.ReportError(GenericError(ec, "Could not send INIT command."));
      continue;
    }

    if (!CheckRespIsSimpleReply("OK")) {
      // Break outgoing migration if INIT from incoming node responded with OOM. Usually this will
      // happen on second iteration after first failed with OOM. Sending second INIT is required to
      // cleanup slots on incoming slot migration node.
      if (CheckRespSimpleError(kIncomingMigrationOOM)) {
        Finish(GenericError{std::make_error_code(errc::not_enough_memory),
                            std::string(kIncomingMigrationOOM)});
        break;
      }
      if (CheckRespIsSimpleReply(kUnknownMigration)) {
        const absl::Duration passed = absl::Now() - start_time;
        // we provide 30 seconds to distribute the config to all nodes to avoid extra errors
        // reporting
        if (passed >= absl::Milliseconds(30000)) {
          exec_st_.ReportError(GenericError(LastResponseArgs().front().GetString()));
        } else {
          ThisFiber::SleepFor(500ms);  // to prevent too many attempts
        }
      } else {
        exec_st_.ReportError(GenericError(LastResponseArgs().front().GetString()));
      }
      continue;
    }

    OnAllShards([this](auto& migration) {
      DbSlice& db_slice = namespaces->GetDefaultNamespace().GetCurrentDbSlice();
      journal::StartInThread();
      migration = std::make_unique<SliceSlotMigration>(&db_slice, server(),
                                                       migration_info_.slot_ranges, this);
    });

    if (!ChangeState(MigrationState::C_SYNC)) {
      break;
    }

    OnAllShards([this](auto& migration) { migration->PrepareFlow(cf_->MyID()); });
    if (!exec_st_.IsRunning()) {
      continue;
    }

    // Global transactional cut for migration to register db_slice and journal
    // listeners
    {
      Transaction::Guard tg{tx_.get()};
      OnAllShards([](auto& migration) { migration->PrepareSync(); });
    }

    if (!exec_st_.IsRunning()) {
      continue;
    }

    OnAllShards([](auto& migration) { migration->RunSync(); });

    if (!exec_st_.IsRunning()) {
      continue;
    }

    long attempt = 0;
    while (GetState() != MigrationState::C_FINISHED && !FinalizeMigration(++attempt)) {
      // Break loop and don't sleep in case of C_FATAL
      if (GetState() == MigrationState::C_FATAL) {
        break;
      }
      // Process commands that were on pause and try again
      VLOG(1) << "Waiting for migration to finalize...";
      ThisFiber::SleepFor(500ms);
    }
    if (!exec_st_.IsRunning()) {
      continue;
    }
    break;
  }

  VLOG(1) << "Exiting outgoing migration fiber for migration " << migration_info_.ToString();
}

bool OutgoingMigration::FinalizeMigration(long attempt) {
  // if it's not the 1st attempt and flows are work correctly we try to
  // reconnect and ACK one more time
  LOG(INFO) << "Finalize migration for " << cf_->MyID() << " : " << migration_info_.node_info.id
            << " attempt " << attempt;
  if (attempt > 1) {
    if (!exec_st_.IsRunning()) {
      return true;
    }
    auto timeout = absl::GetFlag(FLAGS_slot_migration_connection_timeout_ms) * 1ms;
    if (auto ec = ConnectAndAuth(timeout, &exec_st_); ec) {
      LOG(WARNING) << "Couldn't connect to " << cf_->MyID() << " : " << migration_info_.node_info.id
                   << " attempt " << attempt << ": " << ec.message()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
      return false;
    }
  }

  // Migration finalization has to be done via client pause because commands need to
  // be blocked on coordinator level to avoid intializing transactions with stale cluster slot info
  // TODO implement blocking on migrated slots only
  bool is_block_active = true;
  auto is_pause_in_progress = [&is_block_active] { return is_block_active; };
  auto pause_fb_opt =
      dfly::Pause(server_family_->GetNonPriviligedListeners(), &namespaces->GetDefaultNamespace(),
                  nullptr, ClientPause::ALL, is_pause_in_progress);

  DCHECK(pause_fb_opt);
  if (!pause_fb_opt) {
    auto err = absl::StrCat("Migration finalization time out ", cf_->MyID(), " : ",
                            migration_info_.node_info.id, " attempt ", attempt);

    LOG(WARNING) << err;
    SetLastError(std::move(err));
  }

  absl::Cleanup cleanup([&is_block_active, &pause_fb_opt]() {
    if (pause_fb_opt) {
      is_block_active = false;
      pause_fb_opt->JoinIfNeeded();
    }
  });

  LOG(INFO) << "FINALIZE flows for " << cf_->MyID() << " : " << migration_info_.node_info.id;
  OnAllShards([attempt](auto& migration) { migration->Finalize(attempt); });

  auto cmd = absl::StrCat("DFLYMIGRATE ACK ", cf_->MyID(), " ", attempt);
  VLOG(1) << "send " << cmd;

  if (auto err = SendCommand(cmd); err) {
    LOG(WARNING) << "Error during sending DFLYMIGRATE ACK to " << server().Description() << ": "
                 << err.message() << ", socket state: " + GetSocketInfo(Sock()->native_handle());
    return false;
  }

  const absl::Time start = absl::Now();
  const int64_t ack_timeout_ms = absl::GetFlag(FLAGS_migration_finalization_timeout_ms);
  while (true) {
    const absl::Time now = absl::Now();
    const int64_t passed_ms = absl::ToInt64Milliseconds(now - start);
    if (passed_ms >= ack_timeout_ms) {
      LOG(WARNING) << "Timeout fot ACK " << cf_->MyID() << " : " << migration_info_.node_info.id
                   << " attempt " << attempt;
      return false;
    }

    if (auto resp = ReadRespReply(ack_timeout_ms - passed_ms); !resp) {
      LOG(WARNING) << "Error reading response to ACK command from " << server().Description()
                   << ": " << resp.error()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
      return false;
    }

    // Check OOM from incoming slot migration on ACK request
    if (CheckRespSimpleError(kIncomingMigrationOOM)) {
      Finish(GenericError{std::make_error_code(errc::not_enough_memory),
                          std::string(kIncomingMigrationOOM)});
      return false;
    }

    if (!CheckRespFirstTypes({RespExpr::INT64})) {
      LOG(WARNING) << "Incorrect response type for " << cf_->MyID() << " : "
                   << migration_info_.node_info.id << " attempt " << attempt
                   << " msg: " << facade::ToSV(LastResponseArgs().front().GetBuf());
      return false;
    }

    if (const auto res = get<int64_t>(LastResponseArgs().front().u); res == attempt) {
      break;
    } else {
      LOG(WARNING) << "Incorrect attempt payload " << cf_->MyID() << " : "
                   << migration_info_.node_info.id << ", sent " << attempt << " received " << res;
    }
  }

  if (!exec_st_.GetError()) {
    Finish();
    keys_number_ = cluster::GetKeyCount(migration_info_.slot_ranges);
    cf_->ApplyMigrationSlotRangeToConfig(migration_info_.node_info.id, migration_info_.slot_ranges,
                                         false);
  }
  return true;
}

void OutgoingMigration::Start() {
  VLOG(1) << "Resolving host DNS for outgoing migration";
  if (error_code ec = ResolveHostDns(); ec) {
    LOG(WARNING) << "Could not resolve host DNS for outgoing migration to "
                 << server().Description() << ": " << ec.message();
    exec_st_.ReportError(GenericError(ec, "Could not resolve host dns."));
    return;
  }

  main_sync_fb_ = fb2::Fiber("outgoing_migration", &OutgoingMigration::SyncFb, this);
}

size_t OutgoingMigration::GetKeyCount() const {
  util::fb2::LockGuard lk(state_mu_);
  if (state_ == MigrationState::C_FINISHED) {
    return keys_number_;
  }
  return cluster::GetKeyCount(migration_info_.slot_ranges);
}
}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/outgoing_slot_migration.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include "server/cluster/cluster_defs.h"
#include "server/protocol_client.h"
#include "server/transaction.h"

namespace dfly {

class ServerFamily;

}  // namespace dfly
namespace dfly::cluster {
class ClusterFamily;

// Whole outgoing slots migration manager
class OutgoingMigration : private ProtocolClient {
 public:
  OutgoingMigration(MigrationInfo info, ClusterFamily* cf, ServerFamily* sf);
  ~OutgoingMigration();

  // start migration process, sends INIT command to the target node
  void Start();

  // if is_error = false mark migration as FINISHED and cancel migration if it's not finished yet
  // can be called from any thread, but only after Start()
  // if is_error = true and migration is in progress it will be restarted otherwise nothing happens
  void Finish(const GenericError& error = {}) ABSL_LOCKS_EXCLUDED(state_mu_);

  MigrationState GetState() const ABSL_LOCKS_EXCLUDED(state_mu_);

  const std::string& GetHostIp() const {
    return server().host;
  };

  uint16_t GetPort() const {
    return server().port;
  };

  const SlotRanges& GetSlots() const {
    return migration_info_.slot_ranges;
  }

  const MigrationInfo GetMigrationInfo() const {
    return migration_info_;
  }

  void ResetError() {
    if (exec_st_.IsError()) {
      SetLastError(exec_st_.GetError());
      exec_st_.Reset(nullptr);
    }
  }

  void SetLastError(dfly::GenericError err) ABSL_LOCKS_EXCLUDED(error_mu_) {
    if (!err)
      return;
    errors_count_.fetch_add(1, std::memory_order_relaxed);
    util::fb2::LockGuard lk(error_mu_);
    last_error_ = std::move(err);
  }

  std::string GetErrorStr() const ABSL_LOCKS_EXCLUDED(error_mu_) {
    util::fb2::LockGuard lk(error_mu_);
    return last_error_.Format();
  }

  size_t GetErrorsCount() const {
    return errors_count_.load(std::memory_order_relaxed);
  }

  size_t GetKeyCount() const ABSL_LOCKS_EXCLUDED(state_mu_);

 private:
  MigrationState GetStateImpl() const;

  // SliceSlotMigration manages state and data transferring for the corresponding shard
  class SliceSlotMigration;

  using UniqueSliceSlotMigration = std::unique_ptr<SliceSlotMigration>;

  void SyncFb();
  // return true if migration is finalized even with C_ERROR state
  bool FinalizeMigration(long attempt);

  bool ChangeState(MigrationState new_state) ABSL_LOCKS_EXCLUDED(state_mu_);

  void OnAllShards(std::function<void(UniqueSliceSlotMigration&)>);

  MigrationInfo migration_info_;
  std::vector<std::unique_ptr<SliceSlotMigration>> slot_migrations_;
  ServerFamily* server_family_;
  ClusterFamily* cf_;
  mutable util::fb2::Mutex error_mu_;
  dfly::GenericError last_error_ ABSL_GUARDED_BY(error_mu_);
  std::atomic<size_t> errors_count_ = 0;

  util::fb2::Fiber main_sync_fb_;

  mutable util::fb2::Mutex state_mu_;
  MigrationState state_ ABSL_GUARDED_BY(state_mu_) = MigrationState::C_CONNECTING;

  boost::intrusive_ptr<Transaction> tx_;

  // when migration is finished we need to store number of migrated keys
  // because new request can add or remove keys and we get incorrect statistic
  size_t keys_number_ = 0;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster/slot_set.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <bitset>
#include <memory>
#include <vector>

#include "cluster_defs.h"

namespace dfly::cluster {

class SlotSet {
 public:
  static constexpr SlotId kSlotsNumber = SlotRange::kMaxSlotId + 1;
  using TBitSet = std::bitset<kSlotsNumber>;

  SlotSet(bool full_house = false) {
    slots_ = std::make_unique<TBitSet>();
    if (full_house)
      slots_->flip();
  }

  SlotSet(const SlotRanges& slot_ranges) {
    slots_ = std::make_unique<TBitSet>();
    Set(slot_ranges, true);
  }

  SlotSet(const SlotSet& s) {
    slots_ = std::make_unique<TBitSet>(*s.slots_);
  }

  SlotSet(SlotSet&& s) = default;

  bool Contains(SlotId slot) const {
    return slots_->test(slot);
  }

  void Set(const SlotRanges& slot_ranges, bool value) {
    for (const auto& slot_range : slot_ranges) {
      for (auto i = slot_range.start; i <= slot_range.end; ++i) {
        slots_->set(i, value);
      }
    }
  }

  void Set(SlotId slot, bool value) {
    slots_->set(slot, value);
  }

  bool Empty() const {
    return slots_->none();
  }

  size_t Count() const {
    return slots_->count();
  }

  bool All() const {
    return slots_->all();
  }

  // Get SlotSet that are absent in the slots
  SlotSet GetRemovedSlots(const SlotSet& slots) const {
    // we need to avoid stack usage to prevent stack overflow
    SlotSet res(slots);
    res.slots_->flip();
    *res.slots_ &= *slots_;
    return res;
  }

  SlotRanges ToSlotRanges() const {
    std::vector<SlotRange> res;

    for (SlotId i = 0; i < kSlotsNumber; ++i) {
      if (!slots_->test(i)) {
        continue;
      } else {
        auto& range = res.emplace_back(SlotRange{i, i});
        for (++i; i < kSlotsNumber && slots_->test(i); ++i) {
          range.end = i;
        }
      }
    }

    return SlotRanges(res);
  }

 private:
  SlotSet(std::unique_ptr<TBitSet> s) {
    slots_ = std::move(s);
  }

 private:
  std::unique_ptr<TBitSet> slots_;
};

}  // namespace dfly::cluster


================================================
FILE: src/server/cluster_support.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

extern "C" {
#include "redis/crc16.h"
}

#include "base/flags.h"
#include "base/logging.h"
#include "cluster_support.h"
#include "common.h"

using namespace std;

ABSL_FLAG(string, cluster_mode, "",
          "Cluster mode supported. Possible values are "
          "'emulated', 'yes' or ''");

ABSL_FLAG(bool, experimental_cluster_shard_by_slot, false,
          "If true, cluster mode is enabled and sharding is done by slot. "
          "Otherwise, sharding is done by hash tag.");

namespace dfly {

void UniqueSlotChecker::Add(std::string_view key) {
  if (!IsClusterEnabled()) {
    return;
  }

  Add(KeySlot(key));
}

void UniqueSlotChecker::Add(SlotId slot_id) {
  if (!IsClusterEnabled()) {
    return;
  }

  if (slot_id_ == kNoSlotId) {
    slot_id_ = slot_id;
  } else if (slot_id_ != slot_id) {
    slot_id_ = kCrossSlot;
  }
}

optional<SlotId> UniqueSlotChecker::GetUniqueSlotId() const {
  return slot_id_ > kMaxSlotNum ? optional<SlotId>() : slot_id_;
}

using namespace detail;

void InitializeCluster() {
  string cluster_mode_str = absl::GetFlag(FLAGS_cluster_mode);

  if (cluster_mode_str == "emulated") {
    cluster_mode = ClusterMode::kEmulatedCluster;
  } else if (cluster_mode_str == "yes") {
    cluster_mode = ClusterMode::kRealCluster;
  } else if (cluster_mode_str.empty()) {
    cluster_mode = ClusterMode::kNoCluster;
  } else {
    LOG(ERROR) << "Invalid value for flag --cluster_mode. Exiting...";
    exit(1);
  }

  if (cluster_mode != ClusterMode::kNoCluster) {
    cluster_shard_by_slot = absl::GetFlag(FLAGS_experimental_cluster_shard_by_slot);
  }
}

SlotId KeySlot(std::string_view key) {
  string_view tag = LockTagOptions::instance().Tag(key);
  return crc16(tag.data(), tag.length()) & kMaxSlotNum;
}

bool IsClusterShardedByTag() {
  return IsClusterEnabledOrEmulated() || LockTagOptions::instance().enabled;
}

}  // namespace dfly


================================================
FILE: src/server/cluster_support.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <optional>
#include <string_view>

#include "server/common_types.h"

namespace dfly {

namespace detail {

enum class ClusterMode {
  kUninitialized,
  kNoCluster,
  kEmulatedCluster,
  kRealCluster,
};

inline ClusterMode cluster_mode = ClusterMode::kUninitialized;
inline bool cluster_shard_by_slot = false;

};  // namespace detail

constexpr SlotId kMaxSlotNum = 0x3FFF;

// A simple utility class that "aggregates" SlotId-s and can tell whether all inputs were the same.
// Only works when cluster is enabled.
class UniqueSlotChecker {
 public:
  void Add(std::string_view key);
  void Add(SlotId slot_id);

  std::optional<SlotId> GetUniqueSlotId() const;

  bool IsCrossSlot() const {
    return slot_id_ == kCrossSlot;
  }

  void Reset() {
    slot_id_ = kNoSlotId;
  }

 private:
  // kNoSlotId - if slot wasn't set at all
  static constexpr SlotId kNoSlotId = kMaxSlotNum + 1;
  // kCrossSlot - if several different slots were set
  static constexpr SlotId kCrossSlot = kNoSlotId + 1;

  SlotId slot_id_ = kNoSlotId;
};

SlotId KeySlot(std::string_view key);

void InitializeCluster();

inline bool IsClusterEnabled() {
  return detail::cluster_mode == detail::ClusterMode::kRealCluster;
}

inline bool IsClusterEmulated() {
  return detail::cluster_mode == detail::ClusterMode::kEmulatedCluster;
}

inline bool IsClusterEnabledOrEmulated() {
  return IsClusterEnabled() || IsClusterEmulated();
}

inline bool IsClusterShardedBySlot() {
  return detail::cluster_shard_by_slot;
}

bool IsClusterShardedByTag();

}  // namespace dfly


================================================
FILE: src/server/cmd_support.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/cmd_support.h"

#include <absl/cleanup/cleanup.h>

#include "base/logging.h"

namespace dfly::cmd {

bool SingleHopWaiter::await_ready() noexcept {
  auto* tx = cmd_cntx->tx();

  if (!cmd_cntx->IsDeferredReply()) {
    // Use fiber blocking in synchronous mode
    tx->ScheduleSingleHop(callback);
    return true;
  } else {
    // Schedule async hop and keep transaction alive
    tx->SingleHopAsync(callback);
    tx_keepalive_ = tx;
    return false;
  }
}

void SingleHopWaiter::await_suspend(std::coroutine_handle<> handle) const noexcept {
  cmd_cntx->Resolve(tx_keepalive_->Blocker(), handle);
}

facade::OpStatus SingleHopWaiter::await_resume() const noexcept {
  return *cmd_cntx->tx()->LocalResultPtr();
}

void CmdR::Coro::return_value(const facade::ErrorReply& err) const noexcept {
  cmd_cntx->SendError(err);
}

}  // namespace dfly::cmd


================================================
FILE: src/server/cmd_support.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/functional/function_ref.h>

#include <concepts>
#include <coroutine>
#include <variant>

#include "facade/error.h"
#include "facade/op_status.h"
#include "server/conn_context.h"
#include "server/engine_shard.h"
#include "server/transaction.h"
#include "util/fibers/synchronization.h"

namespace dfly::cmd {

// Awaitable sentinel for the single hop of a transaction. Used instead of the
// actual awaitable to allow Promise to inject context implicitly and make command code simple.
using SingleHopSentinel = Transaction::RunnableType;

// Awaitable in command context for the single hop of a transaction with return value
template <typename RT> using SingleHopSentinelT = absl::FunctionRef<RT(Transaction*, EngineShard*)>;

// Perform single hop. Returns awaitable that resolves to resulting OpStatus
SingleHopSentinel SingleHop(const auto& f) {
  return f;
}

// Perform single hop. Returns awaitable that resolves to return value.
auto SingleHopT(const auto& f) -> SingleHopSentinelT<decltype(f(nullptr, nullptr))> {
  return f;
}

// Awaitable object for waiting for the single hop of a transaction to finish.
// Avoids coroutine suspending in synchronous mode, doing a fiber suspend instead.
// In asynchronous mode it registers the promise / blocker on the context.
struct SingleHopWaiter {
  bool await_ready() noexcept;
  void await_suspend(std::coroutine_handle<> handle) const noexcept;
  facade::OpStatus await_resume() const noexcept;

  CommandContext* cmd_cntx;
  Transaction::RunnableType callback;
  boost::intrusive_ptr<Transaction> tx_keepalive_ = nullptr;
};

// Extension of SingleHopWaiter capturing the return value of the callback
template <typename RT> struct SingleHopWaiterT : public SingleHopWaiter {
  static_assert(std::is_base_of_v<facade::OpResultBase, RT>);

  SingleHopWaiterT(CommandContext* cmd_cntx,
                   absl::FunctionRef<RT(Transaction*, EngineShard*)> callback)
      : SingleHopWaiter{cmd_cntx, *this}, callback{callback} {
  }

  OpStatus operator()(Transaction* tx, EngineShard* es) const {
    result = callback(tx, es);
    return result.status();
  }

  RT&& await_resume() noexcept {
    return std::move(result);
  }

  absl::FunctionRef<RT(Transaction*, EngineShard*)> callback;
  mutable RT result;
};

// Return type of async command. No actual use as of now
struct CmdR {
  struct Coro;
  using promise_type = Coro;
};

constexpr CmdR kAborted = {};

// Underlying driver (promise) of coroutine that defines its context
struct CmdR::Coro {
  // Coroutine created of a top level command
  Coro(facade::CmdArgList arg, CommandContext* cmd_cntx) : cmd_cntx{cmd_cntx} {
  }

  // Coroutine created of a internal function with arguments
  template <typename... Ts> Coro(CommandContext* cmd_cntx, const Ts&... ts) : cmd_cntx{cmd_cntx} {
  }

  // Use it waiter directly cases when it needs to stay in scope to keep the transaction alive
  auto& await_transform(SingleHopWaiter& waiter) const {
    return waiter;
  }

  auto await_transform(SingleHopSentinel callback) const {
    return SingleHopWaiter{cmd_cntx, callback};
  }

  template <typename RT> auto await_transform(SingleHopSentinelT<RT> callback) const {
    return SingleHopWaiterT<RT>{cmd_cntx, callback};
  }

  // Return error
  void return_value(const facade::ErrorReply& err) const noexcept;

  // Conclude command without any error
  void return_value(std::nullopt_t) const noexcept {
  }

  // Blank default implementations
  CmdR get_return_object() {
    return {};
  }
  void unhandled_exception() noexcept {
  }
  std::suspend_never initial_suspend() noexcept {
    return {};
  }
  std::suspend_never final_suspend() noexcept {
    return {};
  }

  CommandContext* cmd_cntx;
};

}  // namespace dfly::cmd


================================================
FILE: src/server/cms_family.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/cms.h"
#include "facade/cmd_arg_parser.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/transaction.h"

namespace dfly {

using namespace facade;
using namespace std;

namespace {

constexpr char kCmsNotFound[] = "CMS: key does not exist";
constexpr char kCmsWrongNumKeys[] = "CMS: wrong number of keys";
constexpr char kCmsWrongNumKeysWeights[] = "CMS: wrong number of keys/weights";
constexpr char kCmsCannotParseNumber[] = "CMS: Cannot parse number";

OpStatus OpInitByDim(const OpArgs& op_args, string_view key, uint32_t width, uint32_t depth) {
  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_CMS);
  RETURN_ON_BAD_STATUS(op_res);

  if (!op_res->is_new)
    return OpStatus::KEY_EXISTS;

  PrimeValue& pv = op_res->it->second;
  pv.SetCMS(width, depth);

  return OpStatus::OK;
}

OpStatus OpInitByProb(const OpArgs& op_args, string_view key, double error, double probability) {
  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_CMS);
  RETURN_ON_BAD_STATUS(op_res);

  if (!op_res->is_new)
    return OpStatus::KEY_EXISTS;

  PrimeValue& pv = op_res->it->second;
  CMS* cms = CompactObj::AllocateMR<CMS>(CMS::ErrorRateTag{}, error, probability,
                                         CompactObj::memory_resource());
  pv.SetCMS(cms);

  return OpStatus::OK;
}

OpResult<vector<int64_t>> OpIncrBy(const OpArgs& op_args, string_view key,
                                   const vector<pair<string_view, int64_t>>& items) {
  auto& db_slice = op_args.GetDbSlice();
  OpResult op_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_CMS);
  if (!op_res)
    return op_res.status();

  CMS* cms = op_res->it->second.GetCMS();
  vector<int64_t> result;
  result.reserve(items.size());

  for (const auto& [item, incr] : items) {
    result.push_back(cms->IncrBy(item, incr));
  }

  return result;
}

OpResult<vector<int64_t>> OpQuery(const OpArgs& op_args, string_view key, CmdArgList items) {
  auto& db_slice = op_args.GetDbSlice();
  OpResult op_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_CMS);
  if (!op_res)
    return op_res.status();

  const CMS* cms = op_res.value()->second.GetCMS();
  vector<int64_t> result;
  result.reserve(items.size());

  for (auto arg : items) {
    result.push_back(cms->Query(ToSV(arg)));
  }

  return result;
}

struct CmsInfo {
  uint32_t width = 0;
  uint32_t depth = 0;
  int64_t count = 0;
};

OpResult<CmsInfo> OpInfo(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  OpResult op_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_CMS);
  if (!op_res)
    return op_res.status();

  const CMS* cms = op_res.value()->second.GetCMS();
  return CmsInfo{cms->width(), cms->depth(), cms->total_count()};
}

void CmdInitByDim(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  string_view key = parser.Next();
  uint32_t width, depth;

  tie(width, depth) = parser.Next<uint32_t, uint32_t>();
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  RETURN_ON_PARSE_ERROR(parser, rb);

  if (width == 0 || depth == 0) {
    return rb->SendError("CMS: width and depth must be greater than 0");
  }

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpInitByDim(t->GetOpArgs(shard), key, width, depth);
  };

  OpStatus res = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (res == OpStatus::KEY_EXISTS) {
    return rb->SendError("item exists");
  }
  if (res == OpStatus::OK) {
    return rb->SendOk();
  }
  return rb->SendError(res);
}

void CmdInitByProb(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  string_view key = parser.Next();
  double error, probability;

  tie(error, probability) = parser.Next<double, double>();
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  RETURN_ON_PARSE_ERROR(parser, rb);

  if (error <= 0 || error >= 1) {
    return rb->SendError("CMS: error must be between 0 and 1 exclusive");
  }
  if (probability <= 0 || probability >= 1) {
    return rb->SendError("CMS: probability must be between 0 and 1 exclusive");
  }

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpInitByProb(t->GetOpArgs(shard), key, error, probability);
  };

  OpStatus res = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (res == OpStatus::KEY_EXISTS) {
    return rb->SendError("item exists");
  }
  if (res == OpStatus::OK) {
    return rb->SendOk();
  }
  return rb->SendError(res);
}

void CmdIncrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  // Parse item/increment pairs
  if (args.size() < 2 || args.size() % 2 != 0) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  vector<pair<string_view, int64_t>> items;
  items.reserve(args.size() / 2);

  for (size_t i = 0; i < args.size(); i += 2) {
    string_view item = ToSV(args[i]);
    int64_t incr;
    if (!absl::SimpleAtoi(ToSV(args[i + 1]), &incr)) {
      return cmd_cntx->SendError(kCmsCannotParseNumber);
    }
    if (incr <= 0) {
      return cmd_cntx->SendError("CMS: increment must be a positive integer");
    }
    items.emplace_back(item, incr);
  }

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpIncrBy(t->GetOpArgs(shard), key, items);
  };

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  OpResult<vector<int64_t>> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!res) {
    if (res.status() == OpStatus::KEY_NOTFOUND) {
      return rb->SendError(kCmsNotFound);
    }
    return rb->SendError(res.status());
  }

  SinkReplyBuilder::ReplyScope scope(rb);
  rb->StartArray(res->size());
  for (int64_t count : *res) {
    rb->SendLong(count);
  }
}

void CmdQuery(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  if (args.empty()) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpQuery(t->GetOpArgs(shard), key, args);
  };

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  OpResult<vector<int64_t>> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!res) {
    if (res.status() == OpStatus::KEY_NOTFOUND) {
      return rb->SendError(kCmsNotFound);
    }
    return rb->SendError(res.status());
  }

  SinkReplyBuilder::ReplyScope scope(rb);
  rb->StartArray(res->size());
  for (int64_t count : *res) {
    rb->SendLong(count);
  }
}

void CmdInfo(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpInfo(t->GetOpArgs(shard), key);
  };

  OpResult<CmsInfo> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!res) {
    if (res.status() == OpStatus::KEY_NOTFOUND) {
      return rb->SendError(kCmsNotFound);
    }
    return rb->SendError(res.status());
  }

  {
    SinkReplyBuilder::ReplyScope scope(rb);
    rb->StartArray(6);
    rb->SendBulkString("width");
    rb->SendLong(res->width);
    rb->SendBulkString("depth");
    rb->SendLong(res->depth);
    rb->SendBulkString("count");
    rb->SendLong(res->count);
  }
}

// Structure to hold CMS data collected from a shard when merging
struct CmsShardData {
  size_t src_index = 0;
  string_view key;
  uint32_t width = 0;
  uint32_t depth = 0;
  int64_t count = 0;
  vector<int64_t> counters;

  CmsShardData(size_t src_idx, string_view k, uint32_t w, uint32_t d, int64_t c,
               const int64_t* data, size_t size)
      : src_index(src_idx), key(k), width(w), depth(d), count(c), counters(data, data + size) {
  }
};

struct CmsMergeArgs {
  string_view dest_key;
  vector<string_view> src_keys;
  vector<int64_t> weights;
};

bool ParseMergeArgs(CmdArgList args, RedisReplyBuilder* rb, CmsMergeArgs* out) {
  CmdArgParser parser(args);
  uint32_t num_keys;

  out->dest_key = parser.Next();
  num_keys = parser.Next<uint32_t>();
  if (auto err = parser.TakeError(); err) {
    rb->SendError(err.MakeReply());
    return false;
  }

  if (num_keys == 0) {
    rb->SendError(kCmsWrongNumKeys);
    return false;
  }

  if (parser.Tail().size() < num_keys) {
    rb->SendError(kSyntaxErr);
    return false;
  }

  out->src_keys.reserve(num_keys);
  for (uint32_t i = 0; i < num_keys; ++i) {
    out->src_keys.push_back(parser.Next());
  }

  if (parser.HasNext()) {
    string_view weights_kw = parser.Next();
    if (!absl::EqualsIgnoreCase(weights_kw, "WEIGHTS")) {
      rb->SendError(kCmsWrongNumKeysWeights);
      return false;
    }

    out->weights.reserve(num_keys);
    for (uint32_t i = 0; i < num_keys; ++i) {
      if (!parser.HasNext()) {
        rb->SendError(kCmsWrongNumKeysWeights);
        return false;
      }

      int64_t weight;
      if (!absl::SimpleAtoi(parser.Next(), &weight)) {
        rb->SendError(kCmsCannotParseNumber);
        return false;
      }
      out->weights.push_back(weight);
    }
  }

  if (parser.HasNext()) {
    rb->SendError(kCmsWrongNumKeysWeights);
    return false;
  }

  if (out->weights.empty()) {
    out->weights.resize(num_keys, 1);
  }

  return true;
}

// Merge multiple CMS structures into a destination key.
void CmdMerge(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  CmsMergeArgs merge_args;
  if (!ParseMergeArgs(args, rb, &merge_args)) {
    return;
  }

  // multi-shard implementation
  // 1. fetch from all shards
  // 2. merge to dest
  Transaction* tx = cmd_cntx->tx();

  vector<OpResult<vector<CmsShardData>>> shard_results(shard_set->size(), OpStatus::SKIPPED);

  auto read_cb = [&](Transaction* t, EngineShard* shard) -> OpStatus {
    auto& db_slice = t->GetOpArgs(shard).GetDbSlice();
    const DbContext& db_cntx = t->GetDbContext();
    vector<CmsShardData> cms_list;

    // Check each source key to see if it belongs to this shard
    for (size_t src_idx = 0; src_idx < merge_args.src_keys.size(); ++src_idx) {
      string_view key = merge_args.src_keys[src_idx];
      ShardId key_shard = Shard(key, shard_set->size());
      if (key_shard != shard->shard_id()) {
        continue;
      }

      OpResult src_res = db_slice.FindReadOnly(db_cntx, key, OBJ_CMS);
      if (!src_res) {
        shard_results[shard->shard_id()] = src_res.status();
        return OpStatus::OK;
      }

      const CMS* cms = src_res.value()->second.GetCMS();
      size_t counter_count = cms->NumCounters();
      cms_list.emplace_back(src_idx, key, cms->width(), cms->depth(), cms->total_count(),
                            cms->Data(), counter_count);
    }

    if (!cms_list.empty()) {
      shard_results[shard->shard_id()] = std::move(cms_list);
    }
    return OpStatus::OK;
  };

  tx->Execute(read_cb, false /* do not conclude */);

  // Validate dimensions and make sure we found data for every source.
  uint32_t ref_width = 0, ref_depth = 0;
  size_t seen_sources = 0;

  // Check for errors and validate dimensions.
  for (auto& result : shard_results) {
    if (result.status() == OpStatus::SKIPPED)
      continue;

    if (!result) {
      tx->Conclude();
      if (result.status() == OpStatus::KEY_NOTFOUND) {
        return rb->SendError(kCmsNotFound);
      }
      return rb->SendError(result.status());
    }

    for (auto& cms_data : result.value()) {
      if (seen_sources == 0) {
        ref_width = cms_data.width;
        ref_depth = cms_data.depth;
      } else if (cms_data.width != ref_width || cms_data.depth != ref_depth) {
        tx->Conclude();
        return rb->SendError("CMS: dimension mismatch");
      }
      ++seen_sources;
    }
  }

  if (seen_sources != merge_args.src_keys.size()) {
    tx->Conclude();
    return rb->SendError(kCmsNotFound);
  }

  // Now write merged data to destination shard
  ShardId dest_shard_id = Shard(merge_args.dest_key, shard_set->size());
  OpStatus write_result = OpStatus::OK;

  auto write_cb = [&](Transaction* t, EngineShard* shard) -> OpStatus {
    if (shard->shard_id() != dest_shard_id) {
      return OpStatus::OK;
    }

    auto& db_slice = t->GetOpArgs(shard).GetDbSlice();
    OpResult dest_res = db_slice.FindMutable(t->GetDbContext(), merge_args.dest_key, OBJ_CMS);
    if (!dest_res) {
      write_result = dest_res.status();
      return OpStatus::OK;
    }

    CMS* dest_cms = dest_res->it->second.GetCMS();

    // Validate destination dimensions
    if (ref_width != dest_cms->width() || ref_depth != dest_cms->depth()) {
      write_result = OpStatus::INVALID_VALUE;
      return OpStatus::OK;
    }

    // Reset destination before merging so the result is the weighted sum of sources only.
    dest_cms->Reset();

    for (const auto& result : shard_results) {
      if (result.status() == OpStatus::SKIPPED)
        continue;

      for (const auto& cms_data : result.value()) {
        CMS temp_cms(cms_data.width, cms_data.depth, CompactObj::memory_resource());
        temp_cms.Load(cms_data.count, cms_data.counters.data());

        if (!dest_cms->MergeFrom(temp_cms, merge_args.weights[cms_data.src_index])) {
          write_result = OpStatus::INVALID_VALUE;
          return OpStatus::OK;
        }
      }
    }

    return OpStatus::OK;
  };

  tx->Execute(write_cb, true /* conclude */);

  if (write_result == OpStatus::KEY_NOTFOUND) {
    return rb->SendError(kCmsNotFound);
  }
  if (write_result == OpStatus::INVALID_VALUE) {
    return rb->SendError("CMS: dimension mismatch");
  }
  return rb->SendOk();
}

}  // namespace

using CI = CommandId;

#define HFUNC(x) SetHandler(&Cmd##x)

void RegisterCmsFamily(CommandRegistry* registry) {
  registry->StartFamily(acl::CMS);

  *registry << CI{"CMS.INITBYDIM", CO::DENYOOM | CO::FAST, 4, 1, 1}.HFUNC(InitByDim)
            << CI{"CMS.INITBYPROB", CO::DENYOOM | CO::FAST, 4, 1, 1}.HFUNC(InitByProb)
            << CI{"CMS.INCRBY", CO::DENYOOM | CO::FAST, -4, 1, 1}.HFUNC(IncrBy)
            << CI{"CMS.QUERY", CO::READONLY | CO::FAST, -3, 1, 1}.HFUNC(Query)
            << CI{"CMS.INFO", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(Info)
            << CI{"CMS.MERGE", CO::DENYOOM | CO::VARIADIC_KEYS, -4, 3, 3}.HFUNC(Merge);
}

}  // namespace dfly


================================================
FILE: src/server/cms_family_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "facade/facade_test.h"
#include "server/test_utils.h"

namespace dfly {

using testing::ElementsAre;

class CmsFamilyTest : public BaseFamilyTest {
 protected:
};

TEST_F(CmsFamilyTest, InitByDim) {
  auto resp = Run("cms.initbydim cms1 1000 5");
  EXPECT_EQ(resp, "OK");
  EXPECT_EQ(Run("type cms1"), "CMSk-TYPE");

  resp = Run("cms.initbydim cms1 100 5");
  EXPECT_THAT(resp, ErrArg("item exists"));

  resp = Run("cms.initbydim cms2 0 5");
  EXPECT_THAT(resp, ErrArg("width and depth must be greater than 0"));

  resp = Run("cms.initbydim cms3 5 0");
  EXPECT_THAT(resp, ErrArg("width and depth must be greater than 0"));
}

TEST_F(CmsFamilyTest, InitByProb) {
  auto resp = Run("cms.initbyprob cms1 0.01 0.01");
  EXPECT_EQ(resp, "OK");

  resp = Run("cms.initbyprob cms1 0.01 0.01");
  EXPECT_THAT(resp, ErrArg("item exists"));

  resp = Run("cms.initbyprob cms2 2 0.01");
  EXPECT_THAT(resp, ErrArg("error must be between 0 and 1"));

  resp = Run("cms.initbyprob cms3 0.01 0");
  EXPECT_THAT(resp, ErrArg("probability must be between 0 and 1"));
}

TEST_F(CmsFamilyTest, IncrBy) {
  Run("cms.initbydim cms 100 5");

  auto resp = Run("cms.incrby cms foo 3");
  EXPECT_THAT(resp, IntArg(3));

  resp = Run("cms.incrby cms foo 4 bar 1");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(7), IntArg(1))));

  // Should fail on non-existent key
  resp = Run("cms.incrby noexist foo 1");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));

  // Should fail with invalid number
  resp = Run("cms.incrby cms foo notanumber");
  EXPECT_THAT(resp, ErrArg("CMS: Cannot parse number"));
}

TEST_F(CmsFamilyTest, Query) {
  Run("cms.initbydim cms 100 5");
  Run("cms.incrby cms foo 5 bar 3");

  auto resp = Run("cms.query cms foo");
  EXPECT_THAT(resp, IntArg(5));

  resp = Run("cms.query cms foo bar");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3))));

  resp = Run("cms.query cms noexist");
  EXPECT_THAT(resp, IntArg(0));

  resp = Run("cms.query noexist foo");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));
}

TEST_F(CmsFamilyTest, Info) {
  Run("cms.initbydim cms 1000 5");
  Run("cms.incrby cms foo 5 bar 3 baz 9");

  auto resp = Run("cms.info cms");
  EXPECT_THAT(
      resp, RespArray(ElementsAre("width", IntArg(1000), "depth", IntArg(5), "count", IntArg(17))));

  resp = Run("cms.info noexist");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));
}

TEST_F(CmsFamilyTest, Merge) {
  Run("cms.initbydim A 100 5");
  Run("cms.initbydim B 100 5");
  Run("cms.initbydim C 100 5");

  Run("cms.incrby A foo 5 bar 3 baz 9");
  Run("cms.incrby B foo 2 foobar 3 baz 1");

  auto resp = Run("cms.query A foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3), IntArg(9))));

  resp = Run("cms.query B foo foobar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(2), IntArg(3), IntArg(1))));

  resp = Run("cms.merge C 2 A B");
  EXPECT_EQ(resp, "OK");

  resp = Run("cms.query C foo bar baz foobar");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(7), IntArg(3), IntArg(10), IntArg(3))));

  resp = Run("cms.merge noexist 1 A");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));

  resp = Run("cms.merge C 0 A");
  EXPECT_THAT(resp, ErrArg("CMS: wrong number of keys"));

  resp = Run("cms.merge A 1 B WEIGHTS 4 3");
  EXPECT_THAT(resp, ErrArg("CMS: wrong number of keys/weights"));

  resp = Run("cms.merge A 2 B noexist WEIGHTS 4 3");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));

  // Merge A into B, should return A values (destination is reset before merge)
  resp = Run("cms.merge B 1 A");
  EXPECT_EQ(resp, "OK");
  resp = Run("cms.query B foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3), IntArg(9))));
}

TEST_F(CmsFamilyTest, MergeWithWeights) {
  Run("cms.initbydim A 100 5");
  Run("cms.initbydim B 100 5");
  Run("cms.initbydim C 100 5");

  Run("cms.incrby A foo 5 bar 3 baz 9");
  Run("cms.incrby B foo 2 bar 3 baz 1");

  // Merge with weights: A contributes 2x, B contributes 3x
  // foo: 5*2 + 2*3 = 16
  // bar: 3*2 + 3*3 = 15
  // baz: 9*2 + 1*3 = 21
  auto resp = Run("cms.merge C 2 A B WEIGHTS 2 3");
  EXPECT_EQ(resp, "OK");

  resp = Run("cms.query C foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(16), IntArg(15), IntArg(21))));
}

TEST_F(CmsFamilyTest, MergeWithDuplicateSourceKeysPreservesWeightOrder) {
  Run("cms.initbydim A 100 5");
  Run("cms.initbydim C 100 5");

  Run("cms.incrby A foo 2 bar 4");

  auto resp = Run("cms.merge C 2 A A WEIGHTS 1 3");
  EXPECT_EQ(resp, "OK");

  resp = Run("cms.query C foo bar");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(8), IntArg(16))));

  resp = Run("cms.info C");
  EXPECT_THAT(
      resp, RespArray(ElementsAre("width", IntArg(100), "depth", IntArg(5), "count", IntArg(24))));
}

// Backported from tests/fakeredis/test/test_stack/test_cms.py::test_cms_info
TEST_F(CmsFamilyTest, InfoAfterMerges) {
  Run("cms.initbydim A 1000 5");
  Run("cms.initbydim B 1000 5");
  Run("cms.initbydim C 1000 5");

  Run("cms.incrby A foo 5 bar 3 baz 9");
  Run("cms.incrby B foo 2 bar 3 baz 1");

  auto resp = Run("cms.query A foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3), IntArg(9))));

  resp = Run("cms.query B foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(2), IntArg(3), IntArg(1))));

  resp = Run("cms.merge C 2 A B");
  EXPECT_EQ(resp, "OK");
  resp = Run("cms.query C foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(7), IntArg(6), IntArg(10))));

  resp = Run("cms.merge C 2 A B WEIGHTS 1 2");
  EXPECT_EQ(resp, "OK");
  resp = Run("cms.query C foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(9), IntArg(9), IntArg(11))));

  resp = Run("cms.merge C 2 A B WEIGHTS 2 3");
  EXPECT_EQ(resp, "OK");
  resp = Run("cms.query C foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(16), IntArg(15), IntArg(21))));

  resp = Run("cms.info A");
  EXPECT_THAT(
      resp, RespArray(ElementsAre("width", IntArg(1000), "depth", IntArg(5), "count", IntArg(17))));

  resp = Run("cms.info noexist");
  EXPECT_THAT(resp, ErrArg("CMS: key does not exist"));
}

}  // namespace dfly


================================================
FILE: src/server/collection_family_fallback.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#ifndef WITH_COLLECTION_CMDS

#include "base/logging.h"
#include "server/hset_family.h"
#include "server/set_family.h"
#include "server/stream_family.h"
#include "server/zset_family.h"
namespace dfly {

using namespace std;

namespace {
void Fail() {
  CHECK(false) << "Compiled without command support";
}
}  // namespace

StreamMemTracker::StreamMemTracker() {
}

void StreamMemTracker::UpdateStreamSize(PrimeValue& pv) const {
}

StringMap* HSetFamily::ConvertToStrMap(uint8_t* lp) {
  Fail();
  return nullptr;
}

StringSet* SetFamily::ConvertToStrSet(const intset* is, size_t expected_len) {
  Fail();
  return nullptr;
}

uint32_t SetFamily::MaxIntsetEntries() {
  Fail();
  return 0;
}

LoadBlobResult SetFamily::LoadLPSetBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

LoadBlobResult SetFamily::LoadIntSetBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

LoadBlobResult HSetFamily::LoadZiplistBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

LoadBlobResult HSetFamily::LoadListpackBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

LoadBlobResult ZSetFamily::LoadZiplistBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

LoadBlobResult ZSetFamily::LoadListpackBlob(std::string_view blob, PrimeValue* pv) {
  Fail();
  return LoadBlobResult::kCorrupted;
}

OpResult<ZSetFamily::MScoreResponse> ZSetFamily::ZGetMembers(CmdArgList args, Transaction* tx,
                                                             SinkReplyBuilder* builder) {
  Fail();
  return {};
}

OpResult<ZSetFamily::AddResult> ZSetFamily::OpAdd(const OpArgs& op_args, const ZParams& zparams,
                                                  std::string_view key, ScoredMemberSpan members) {
  Fail();
  return {};
}

OpResult<double> ZSetFamily::OpScore(const OpArgs& op_args, std::string_view key,
                                     std::string_view member) {
  Fail();
  return 0;
}

void ZSetFamily::ZAddGeneric(std::string_view key, const ZParams& zparams, ScoredMemberSpan memb_sp,
                             CommandContext* cmd_cntx) {
  Fail();
}

OpResult<void> ZSetFamily::OpKeyExisted(const OpArgs& op_args, std::string_view key) {
  Fail();
  return {};
}

OpResult<std::vector<ZSetFamily::ScoredArray>> ZSetFamily::OpRanges(
    const std::vector<ZRangeSpec>& range_specs, const OpArgs& op_args, std::string_view key) {
  Fail();
  return {};
}

}  // namespace dfly

#endif


================================================
FILE: src/server/command_families.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

// Included by family object files that implement only their respective registration function.
// Self-registration would require updating the build process to fix linking issues.
namespace dfly {

class CommandRegistry;

void RegisterStringFamily(CommandRegistry*);
void RegisterListFamily(CommandRegistry*);
void RegisterBitopsFamily(CommandRegistry*);
void RegisterGeoFamily(CommandRegistry*);
void RegisterHllFamily(CommandRegistry*);
void RegisterBloomFamily(CommandRegistry*);
void RegisterCmsFamily(CommandRegistry*);
void RegisterJsonFamily(CommandRegistry*);

}  // namespace dfly


================================================
FILE: src/server/command_registry.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/command_registry.h"

#include <absl/container/inlined_vector.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <absl/time/clock.h>
#include <hdr/hdr_histogram.h>

#include "base/bits.h"
#include "base/flags.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "facade/dragonfly_connection.h"
#include "facade/error.h"
#include "server/acl/acl_commands_def.h"

using namespace std;
ABSL_FLAG(vector<string>, rename_command, {},
          "Change the name of commands, format is: <cmd1_name>=<cmd1_new_name>, "
          "<cmd2_name>=<cmd2_new_name>");
ABSL_FLAG(vector<string>, restricted_commands, {},
          "Commands restricted to connections on the admin port");

ABSL_FLAG(vector<string>, oom_deny_commands, {},
          "Additinal commands that will be marked as denyoom");

ABSL_FLAG(vector<string>, command_alias, {},
          "Add an alias for given command(s), format is: <alias>=<original>, <alias>=<original>. "
          "Aliases must be set identically on replicas, if applicable");

ABSL_FLAG(bool, latency_tracking, false, "If true, track latency for commands");

namespace dfly {

using namespace facade;

using absl::AsciiStrToUpper;
using absl::GetFlag;
using absl::StrCat;
using absl::StrSplit;

namespace {

uint32_t ImplicitCategories(uint32_t mask) {
  if (mask & CO::ADMIN)
    mask |= CO::NOSCRIPT;
  return mask;
}

uint32_t ImplicitAclCategories(uint32_t mask) {
  mask = ImplicitCategories(mask);
  uint32_t out = 0;

  if (mask & CO::JOURNALED)
    out |= acl::WRITE;

  if ((mask & CO::READONLY) && ((mask & CO::NOSCRIPT) == 0))
    out |= acl::READ;

  if (mask & CO::ADMIN)
    out |= acl::ADMIN | acl::DANGEROUS;

  // todo pubsub

  if (mask & CO::FAST)
    out |= acl::FAST;

  if (mask & CO::BLOCKING)
    out |= acl::BLOCKING;

  if ((out & acl::FAST) == 0)
    out |= acl::SLOW;

  return out;
}

using CmdLineMapping = absl::flat_hash_map<std::string, std::string>;

CmdLineMapping ParseCmdlineArgMap(const absl::Flag<std::vector<std::string>>& flag) {
  const auto& mappings = absl::GetFlag(flag);
  CmdLineMapping parsed_mappings;
  parsed_mappings.reserve(mappings.size());

  for (const std::string& mapping : mappings) {
    absl::InlinedVector<std::string_view, 2> kv = absl::StrSplit(mapping, '=');
    if (kv.size() != 2) {
      LOG(ERROR) << "Malformed command '" << mapping << "' for " << flag.Name()
                 << ", expected key=value";
      exit(1);
    }

    std::string key = absl::AsciiStrToUpper(kv[0]);
    std::string value = absl::AsciiStrToUpper(kv[1]);

    if (key == value) {
      LOG(ERROR) << "Invalid attempt to map " << key << " to itself in " << flag.Name();
      exit(1);
    }

    if (!parsed_mappings.emplace(std::move(key), std::move(value)).second) {
      LOG(ERROR) << "Duplicate insert to " << flag.Name() << " not allowed";
      exit(1);
    }
  }
  return parsed_mappings;
}

CmdLineMapping OriginalToAliasMap() {
  CmdLineMapping original_to_alias;
  CmdLineMapping alias_to_original = ParseCmdlineArgMap(FLAGS_command_alias);
  original_to_alias.reserve(alias_to_original.size());
  std::for_each(std::make_move_iterator(alias_to_original.begin()),
                std::make_move_iterator(alias_to_original.end()),
                [&original_to_alias](auto&& pair) {
                  original_to_alias.emplace(std::move(pair.second), std::move(pair.first));
                });

  return original_to_alias;
}

constexpr int64_t kLatencyHistogramMinValue = 1;        // Minimum value in usec
constexpr int64_t kLatencyHistogramMaxValue = 1000000;  // Maximum value in usec (1s)
constexpr int32_t kLatencyHistogramPrecision = 2;

}  // namespace

CommandId::CommandId(const char* name, uint32_t mask, int8_t arity, int8_t first_key,
                     int8_t last_key, std::optional<uint32_t> acl_categories)
    : facade::CommandId(name, ImplicitCategories(mask), arity, first_key, last_key,
                        acl_categories.value_or(ImplicitAclCategories(mask))) {
  implicit_acl_ = !acl_categories.has_value();
  bool is_latency_tracked = GetFlag(FLAGS_latency_tracking);
  if (is_latency_tracked) {
    hdr_histogram* hist = nullptr;
    const int init_result = hdr_init(kLatencyHistogramMinValue, kLatencyHistogramMaxValue,
                                     kLatencyHistogramPrecision, &hist);
    CHECK_EQ(init_result, 0) << "failed to initialize histogram for command " << name;
    latency_histogram_ = hist;
  }

  if (name_.rfind("EVAL", 0) == 0)
    kind_multi_ctr_ = CO::MultiControlKind::EVAL;
  else if (base::_in(name_, {"EXEC", "MULTI", "DISCARD"}))
    kind_multi_ctr_ = CO::MultiControlKind::EXEC;
  else if (base::_in(name_, {"PUBLISH", "SUBSCRIBE", "UNSUBSCRIBE"}))
    kind_pubsub_ = CO::PubSubKind::REGULAR;
  else if (base::_in(name_, {"PSUBSCRIBE", "PUNSUBSCRIBE"}))
    kind_pubsub_ = CO::PubSubKind::PATTERN;
  else if (base::_in(name_, {"SPUBLISH", "SSUBSCRIBE", "SUNSUBSCRIBE"}))
    kind_pubsub_ = CO::PubSubKind::SHARDED;
  can_be_monitored_ = (opt_mask_ & CO::ADMIN) == 0 && name_ != "EXEC";

  if (base::_in(name_, {"MSET", "MSETNX"}))
    interleave_step_ = 2;
  else if (name_ == "JSON.MSET")
    interleave_step_ = 3;
}

CommandId::~CommandId() {
  // Aliases share the same latency histogram, so we only close it if this is not an alias.
  if (latency_histogram_ && !is_alias_) {
    hdr_close(latency_histogram_);
  }
}

CommandId CommandId::Clone(const std::string_view name) const {
  CommandId cloned =
      CommandId{name.data(), opt_mask_, arity_, first_key_, last_key_, acl_categories_};
  cloned.handler_ = handler_;
  cloned.opt_mask_ = opt_mask_ | CO::HIDDEN;
  cloned.acl_categories_ = acl_categories_;
  cloned.implicit_acl_ = implicit_acl_;
  cloned.interleave_step_ = interleave_step_;
  cloned.is_alias_ = true;

  // explicit sharing of the object since it's an alias we can do that.
  // I am assuming that the source object lifetime is at least as of the cloned object.
  if (cloned.latency_histogram_) {
    hdr_close(cloned.latency_histogram_);  // Free the histogram in the cloned object.
    cloned.latency_histogram_ = static_cast<hdr_histogram*>(latency_histogram_);
  }
  return cloned;
}

bool CommandId::IsTransactional() const {
  if (first_key_ > 0 || (opt_mask_ & CO::GLOBAL_TRANS) || (opt_mask_ & CO::NO_KEY_TRANSACTIONAL))
    return true;

  if (name_ == "EVAL" || name_ == "EVALSHA" || name_ == "EVAL_RO" || name_ == "EVALSHA_RO" ||
      name_ == "EXEC")
    return true;

  return false;
}

bool CommandId::IsMultiTransactional() const {
  return kind_multi_ctr_.has_value();
}

optional<facade::ErrorReply> CommandId::Validate(CmdArgList tail_args) const {
  if ((arity() > 0 && tail_args.size() + 1 != size_t(arity())) ||
      (arity() < 0 && tail_args.size() + 1 < size_t(-arity()))) {
    string prefix;
    if (name() == "EXEC")
      prefix = "-EXECABORT Transaction discarded because of: ";
    return facade::ErrorReply{prefix + facade::WrongNumArgsError(name()), kSyntaxErrType};
  }

  if (interleave_step_ && tail_args.size() % interleave_step_ != 0) {
    return facade::ErrorReply{facade::WrongNumArgsError(name()), kSyntaxErrType};
  }

  if (validator_)
    return validator_(tail_args);
  return nullopt;
}

void CommandId::ResetStats(unsigned thread_index) {
  command_stats_[thread_index] = {0, 0};
  if (hdr_histogram* h = latency_histogram_; h != nullptr) {
    hdr_reset(h);
    std::atomic_thread_fence(std::memory_order_seq_cst);
  }
}

void CommandId::RecordLatency(unsigned tid, uint64_t latency_usec) const {
  auto& ent = command_stats_[tid];

  ++ent.first;
  ent.second += latency_usec;

  if (latency_histogram_) {
    hdr_record_value_atomic(latency_histogram_, latency_usec);
  }
}

CommandRegistry::CommandRegistry() {
  cmd_rename_map_ = ParseCmdlineArgMap(FLAGS_rename_command);

  for (const string& name : GetFlag(FLAGS_restricted_commands)) {
    restricted_cmds_.emplace(AsciiStrToUpper(name));
  }

  for (const string& name : GetFlag(FLAGS_oom_deny_commands)) {
    oomdeny_cmds_.emplace(AsciiStrToUpper(name));
  }
}

void CommandRegistry::Init(unsigned int thread_count) {
  const CmdLineMapping original_to_alias = OriginalToAliasMap();
  absl::flat_hash_map<std::string, CommandId> alias_to_command_id;
  alias_to_command_id.reserve(original_to_alias.size());
  for (auto& [_, cmd] : cmd_map_) {
    cmd.Init(thread_count);
    if (auto it = original_to_alias.find(cmd.name()); it != original_to_alias.end()) {
      auto alias_cmd = cmd.Clone(it->second);
      alias_cmd.Init(thread_count);
      alias_to_command_id.insert({it->second, std::move(alias_cmd)});
    }
  }
  std::copy(std::make_move_iterator(alias_to_command_id.begin()),
            std::make_move_iterator(alias_to_command_id.end()),
            std::inserter(cmd_map_, cmd_map_.end()));
}

CommandRegistry& CommandRegistry::operator<<(CommandId cmd) {
  string k = string(cmd.name());

  absl::InlinedVector<std::string_view, 2> maybe_subcommand = StrSplit(cmd.name(), " ");
  const bool is_sub_command = maybe_subcommand.size() == 2;
  if (const auto it = cmd_rename_map_.find(maybe_subcommand.front()); it != cmd_rename_map_.end()) {
    if (it->second.empty()) {
      return *this;  // Incase of empty string we want to remove the command from registry.
    }
    k = is_sub_command ? StrCat(it->second, " ", maybe_subcommand[1]) : it->second;
  }

  if (restricted_cmds_.find(k) != restricted_cmds_.end()) {
    cmd.SetRestricted(true);
  }

  if (oomdeny_cmds_.find(k) != oomdeny_cmds_.end()) {
    cmd.SetFlag(CO::DENYOOM);
  }

  cmd.SetFamily(family_of_commands_.size() - 1);
  if (acl_category_)
    cmd.SetAclCategory(*acl_category_);

  if (!is_sub_command || absl::StartsWith(cmd.name(), "ACL")) {
    cmd.SetBitIndex(1ULL << bit_index_);
    family_of_commands_.back().emplace_back(k);
    ++bit_index_;
  } else {
    DCHECK(absl::StartsWith(k, family_of_commands_.back().back()));
    cmd.SetBitIndex(1ULL << (bit_index_ - 1));
  }
  CHECK(cmd_map_.emplace(k, std::move(cmd)).second) << k;

  return *this;
}

void CommandRegistry::StartFamily(std::optional<uint32_t> acl_category) {
  family_of_commands_.emplace_back();
  bit_index_ = 0;
  acl_category_ = acl_category;
}

std::string_view CommandRegistry::RenamedOrOriginal(std::string_view orig) const {
  if (!cmd_rename_map_.empty() && cmd_rename_map_.contains(orig)) {
    return cmd_rename_map_.find(orig)->second;
  }
  return orig;
}

CommandRegistry::FamiliesVec CommandRegistry::GetFamilies() {
  return std::move(family_of_commands_);
}

std::pair<const CommandId*, ParsedArgs> CommandRegistry::FindExtended(string_view cmd,
                                                                      ParsedArgs tail_args) const {
  if (cmd == RenamedOrOriginal("ACL"sv)) {
    if (tail_args.empty()) {
      return {Find(cmd), {}};
    }

    auto second_cmd = absl::AsciiStrToUpper(tail_args.Front());
    string full_cmd = StrCat(cmd, " ", second_cmd);

    return {Find(full_cmd), tail_args.Tail()};
  }

  const CommandId* res = Find(cmd);
  if (!res)
    return {nullptr, {}};

  // A workaround for XGROUP HELP that does not fit our static taxonomy of commands.
  if (tail_args.size() == 1 && res->name() == "XGROUP") {
    if (absl::EqualsIgnoreCase(tail_args.Front(), "HELP")) {
      res = Find("_XGROUP_HELP");
    }
  }
  return {res, tail_args};
}

absl::flat_hash_map<std::string, hdr_histogram*> CommandRegistry::LatencyMap() const {
  absl::flat_hash_map<std::string, hdr_histogram*> cmd_latencies;
  cmd_latencies.reserve(cmd_map_.size());
  for (const auto& [cmd_name, cmd] : cmd_map_) {
    cmd_latencies.insert({absl::AsciiStrToLower(cmd_name), cmd.GetLatencyHist()});
  }
  return cmd_latencies;
}

}  // namespace dfly


================================================
FILE: src/server/command_registry.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/types/span.h>

#include <functional>
#include <optional>

#include "base/function2.hpp"
#include "facade/command_id.h"
#include "facade/facade_types.h"

struct hdr_histogram;

namespace dfly {

namespace CO {

enum CommandOpt : uint32_t {
  READONLY = 1U << 0,
  FAST = 1U << 1,       // Unused?
  JOURNALED = 1U << 2,  // Command is logged to AOF / Journal.
  LOADING = 1U << 3,    // Command allowed during LOADING state.
  DENYOOM = 1U << 4,    // use-memory in redis.

  DANGEROUS = 1U << 5,  // Dangerous commands are logged when used

  VARIADIC_KEYS = 1U << 6,  // arg 2 determines number of keys. Relevant for ZUNIONSTORE, EVAL etc.

  ADMIN = 1U << 7,  // implies NOSCRIPT,
  NOSCRIPT = 1U << 8,
  BLOCKING = 1U << 9,
  HIDDEN = 1U << 10,  // does not show in COMMAND command output
  GLOBAL_TRANS = 1U << 12,
  STORE_LAST_KEY = 1U << 13,  // The command my have a store key as the last argument.

  NO_AUTOJOURNAL = 1U << 15,  // Skip automatically logging command to journal inside transaction.

  // Allows commands without keys to respect transaction ordering and enables journaling by default
  NO_KEY_TRANSACTIONAL = 1U << 16,
  NO_KEY_TX_SPAN_ALL = 1U << 17,  // All shards are active for the no-key-transactional command

  // The same callback can be run multiple times without corrupting the result. Used for
  // opportunistic optimizations where inconsistencies can only be detected afterwards.
  IDEMPOTENT = 1U << 18,
};

enum class PubSubKind : uint8_t { REGULAR = 0, PATTERN = 1, SHARDED = 2 };

// Commands controlling any multi command execution.
// They often need to be handled separately from regular commands in many contexts
enum class MultiControlKind : uint8_t {
  EVAL,  // EVAL, EVAL_RO, EVALSHA, EVALSHA_RO
  EXEC,  // EXEC, MULTI, DISCARD
};

};  // namespace CO

// Per thread vector of command stats. Each entry is {cmd_calls, cmd_latency_agg in usec}.
using CmdCallStats = std::pair<uint64_t, uint64_t>;

class CommandId;
class CommandContext;

// TODO: move it to helio
// Makes sure that the POD T that is passed to the constructor is reset to default state
template <typename T> class MoveOnly {
 public:
  MoveOnly() = default;

  MoveOnly(const MoveOnly&) = delete;
  MoveOnly& operator=(const MoveOnly&) = delete;

  MoveOnly(MoveOnly&& t) noexcept : value_(std::move(t.value_)) {
    t.value_ = T{};  // Reset the passed value to default state
  }

  MoveOnly& operator=(const T& t) noexcept {
    value_ = t;
    return *this;
  }

  operator const T&() const {  // NOLINT
    return value_;
  }

 private:
  T value_{};
};

class CommandId : public facade::CommandId {
 public:
  using CmdArgList = facade::CmdArgList;

  // NOTICE: name must be a literal string, otherwise metrics break! (see cmd_stats_map in
  // server_state.h)
  CommandId(const char* name, uint32_t mask, int8_t arity, int8_t first_key, int8_t last_key,
            std::optional<uint32_t> acl_categories = std::nullopt);

  CommandId(CommandId&& o) = default;

  ~CommandId();

  [[nodiscard]] CommandId Clone(std::string_view name) const;

  void Init(unsigned thread_count) {
    command_stats_ = std::make_unique<CmdCallStats[]>(thread_count);
  }

  using Handler = fu2::function_base<true, true, fu2::capacity_default, false, false,
                                     void(CmdArgList, CommandContext*) const>;
  using ArgValidator = fu2::function_base<true, true, fu2::capacity_default, false, false,
                                          std::optional<facade::ErrorReply>(CmdArgList) const>;

  // Returns the invoke time in usec.
  void Invoke(CmdArgList args, CommandContext* cmd_cntx) const {
    handler_(args, cmd_cntx);
  }

  // Returns error if validation failed, otherwise nullopt
  std::optional<facade::ErrorReply> Validate(CmdArgList tail_args) const;

  bool IsTransactional() const;

  bool IsMultiTransactional() const;

  bool IsReadOnly() const {
    return opt_mask_ & CO::READONLY;
  }

  bool IsJournaled() const {
    return opt_mask_ & CO::JOURNALED;
  }

  bool IsBlocking() const {
    return opt_mask_ & CO::BLOCKING;
  }

  // See deduction logic for details. We don't monitor ADMIN commands
  // and log the final `EXEC` command manually at the end.
  bool CanBeMonitored() const {
    return can_be_monitored_;
  }

  int8_t interleaved_step() const {
    return interleave_step_;
  }

  template <typename RT> CommandId&& SetAsyncHandler(RT f(CmdArgList, CommandContext*)) && {
    support_async_ = true;
    handler_ = [f](CmdArgList args, CommandContext* cntx) { f(args, cntx); };
    return std::move(*this);
  }

  CommandId&& SetHandler(Handler f, bool async_support = false) && {
    support_async_ |= async_support;
    handler_ = std::move(f);
    return std::move(*this);
  }

  CommandId&& SetValidator(ArgValidator f) && {
    validator_ = std::move(f);
    return std::move(*this);
  }

  bool is_multi_key() const {
    return (last_key_ != first_key_) || (opt_mask_ & CO::VARIADIC_KEYS);
  }

  void ResetStats(unsigned thread_index);

  CmdCallStats GetStats(unsigned thread_index) const {
    return command_stats_[thread_index];
  }

  void SetAclCategory(uint32_t mask) {
    if (implicit_acl_)
      acl_categories_ |= mask;
  }

  bool IsAlias() const {
    return is_alias_;
  }

  hdr_histogram* GetLatencyHist() const {
    return latency_histogram_;
  }

  std::optional<CO::PubSubKind> PubSubKind() const {
    return kind_pubsub_;
  }

  // Returns value if this command controls multi command execution (EVAL, EXEC & helpers)
  std::optional<CO::MultiControlKind> MultiControlKind() const {
    return kind_multi_ctr_;
  }

  void RecordLatency(unsigned tid, uint64_t latency_usec) const;

  bool SupportsAsync() const {
    return support_async_;
  }

 private:
  std::optional<CO::PubSubKind> kind_pubsub_;
  std::optional<CO::MultiControlKind> kind_multi_ctr_;

  // The following fields must copy manually in the move constructor.
  bool implicit_acl_;
  bool is_alias_{false};
  bool can_be_monitored_{true};
  bool support_async_{false};
  int8_t interleave_step_{0};

  std::unique_ptr<CmdCallStats[]> command_stats_;
  Handler handler_;
  ArgValidator validator_;
  MoveOnly<hdr_histogram*> latency_histogram_;  // Histogram for command latency in usec
};

class CommandRegistry {
 public:
  CommandRegistry();

  void Init(unsigned thread_count);

  CommandRegistry& operator<<(CommandId cmd);

  const CommandId* Find(std::string_view cmd) const {
    auto it = cmd_map_.find(cmd);
    return it == cmd_map_.end() ? nullptr : &it->second;
  }

  CommandId* Find(std::string_view cmd) {
    auto it = cmd_map_.find(cmd);
    return it == cmd_map_.end() ? nullptr : &it->second;
  }

  using TraverseCb = std::function<void(std::string_view, const CommandId&)>;

  void Traverse(TraverseCb cb) {
    for (const auto& k_v : cmd_map_) {
      cb(k_v.first, k_v.second);
    }
  }

  void ResetCallStats(unsigned thread_index) {
    for (auto& k_v : cmd_map_) {
      k_v.second.ResetStats(thread_index);
    }
  }

  void MergeCallStats(unsigned thread_index,
                      std::function<void(std::string_view, const CmdCallStats&)> cb) const {
    for (const auto& k_v : cmd_map_) {
      auto src = k_v.second.GetStats(thread_index);
      if (src.first == 0)
        continue;
      cb(k_v.second.name(), src);
    }
  }

  void StartFamily(std::optional<uint32_t> acl_category = std::nullopt);

  std::string_view RenamedOrOriginal(std::string_view orig) const;

  using FamiliesVec = std::vector<std::vector<std::string>>;
  FamiliesVec GetFamilies();

  std::pair<const CommandId*, facade::ParsedArgs> FindExtended(std::string_view cmd,
                                                               facade::ParsedArgs tail_args) const;

  absl::flat_hash_map<std::string, hdr_histogram*> LatencyMap() const;

 private:
  absl::flat_hash_map<std::string, CommandId> cmd_map_;
  absl::flat_hash_map<std::string, std::string> cmd_rename_map_;
  absl::flat_hash_set<std::string> restricted_cmds_;
  absl::flat_hash_set<std::string> oomdeny_cmds_;

  FamiliesVec family_of_commands_;
  size_t bit_index_;
  std::optional<uint32_t> acl_category_;  // category of family currently being built
};

}  // namespace dfly


================================================
FILE: src/server/common.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/common.h"

#include <absl/random/random.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <fast_float/fast_float.h>

#include <system_error>

extern "C" {
#include "redis/rdb.h"
}

#include "base/flags.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/glob_matcher.h"
#include "core/interpreter.h"
#include "facade/cmd_arg_parser.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/journal/journal.h"
#include "server/server_state.h"
#include "server/transaction.h"

// We've generalized "hashtags" so that users can specify custom delimiter and closures, see below.
// If I had a time machine, I'd rename this to lock_on_tags.
ABSL_FLAG(bool, lock_on_hashtags, false,
          "When true, locks are done in the {hashtag} level instead of key level. Hashtag "
          "extraction can be further configured with locktag_* flags.");

// We would have used `char` instead of `string`, but that's impossible.
ABSL_FLAG(
    std::string, locktag_delimiter, "",
    "If set, this char is used to extract a lock tag by looking at delimiters, like hash tags. If "
    "unset, regular hashtag extraction is done (with {}). Must be used with --lock_on_hashtags");

ABSL_FLAG(unsigned, locktag_skip_n_end_delimiters, 0,
          "How many closing tag delimiters should we skip when extracting lock tags. 0 for no "
          "skipping. For example, when delimiter is ':' and this flag is 2, the locktag for "
          "':a:b:c:d:e' will be 'a:b:c'.");

ABSL_FLAG(std::string, locktag_prefix, "",
          "Only keys with this prefix participate in tag extraction.");

namespace dfly {

using namespace std;
using namespace util;

namespace {

// Thread-local cache with static linkage.
thread_local std::optional<LockTagOptions> locktag_lock_options;

}  // namespace

void TEST_InvalidateLockTagOptions() {
  locktag_lock_options = nullopt;  // For test main thread
  CHECK(shard_set != nullptr);
  shard_set->pool()->AwaitBrief(
      [](ShardId shard, ProactorBase* proactor) { locktag_lock_options = nullopt; });
}

const LockTagOptions& LockTagOptions::instance() {
  if (!locktag_lock_options.has_value()) {
    string delimiter = absl::GetFlag(FLAGS_locktag_delimiter);
    if (delimiter.empty()) {
      delimiter = "{}";
    } else if (delimiter.size() == 1) {
      delimiter += delimiter;  // Copy delimiter (e.g. "::") so that it's easier to use below
    } else {
      LOG(ERROR) << "Invalid value for locktag_delimiter - must be a single char";
      exit(-1);
    }

    locktag_lock_options = {
        .enabled = absl::GetFlag(FLAGS_lock_on_hashtags),
        .open_locktag = delimiter[0],
        .close_locktag = delimiter[1],
        .skip_n_end_delimiters = absl::GetFlag(FLAGS_locktag_skip_n_end_delimiters),
        .prefix = absl::GetFlag(FLAGS_locktag_prefix),
    };
  }

  return *locktag_lock_options;
}

std::string_view LockTagOptions::Tag(std::string_view key) const {
  if (!absl::StartsWith(key, prefix)) {
    return key;
  }

  const size_t start = key.find(open_locktag);
  if (start == key.npos) {
    return key;
  }

  size_t end = start;
  for (unsigned i = 0; i <= skip_n_end_delimiters; ++i) {
    size_t next = end + 1;
    end = key.find(close_locktag, next);
    if (end == key.npos || end == next) {
      return key;
    }
  }

  return key.substr(start + 1, end - start - 1);
}

const char* GlobalStateName(GlobalState s) {
  switch (s) {
    case GlobalState::ACTIVE:
      return "ACTIVE";
    case GlobalState::LOADING:
      return "LOADING";
    case GlobalState::SHUTTING_DOWN:
      return "SHUTTING DOWN";
    case GlobalState::TAKEN_OVER:
      return "TAKEN OVER";
  }
  ABSL_UNREACHABLE();
}

const char* RdbTypeName(unsigned type) {
  switch (type) {
    case RDB_TYPE_STRING:
      return "string";
    case RDB_TYPE_LIST:
      return "list";
    case RDB_TYPE_SET:
      return "set";
    case RDB_TYPE_ZSET:
      return "zset";
    case RDB_TYPE_HASH:
      return "hash";
    case RDB_TYPE_STREAM_LISTPACKS:
      return "stream";
  }
  return "other";
}

bool ParseDouble(string_view src, double* value) {
  if (src.empty())
    return false;

  if (absl::EqualsIgnoreCase(src, "-inf")) {
    *value = -HUGE_VAL;
  } else if (absl::EqualsIgnoreCase(src, "+inf")) {
    *value = HUGE_VAL;
  } else {
    fast_float::from_chars_result result = fast_float::from_chars(src.data(), src.end(), *value);
    // nan double could be sent as "nan" with any case.
    if (int(result.ec) != 0 || result.ptr != src.end() || isnan(*value))
      return false;
  }
  return true;
}

OpResult<ScanOpts> ScanOpts::TryFrom(CmdArgList args, bool allow_novalues) {
  ScanOpts scan_opts;
  facade::CmdArgParser parser(args);

  while (parser.HasNext()) {
    std::string_view pattern;
    std::string_view type_str;

    if (parser.Check("NOVALUES")) {
      if (!allow_novalues) {
        return facade::OpStatus::SYNTAX_ERR;
      }
      scan_opts.novalues = true;
    } else if (parser.Check("COUNT", &scan_opts.limit)) {
      if (scan_opts.limit == 0)
        scan_opts.limit = 1;
    } else if (parser.Check("MATCH", &pattern)) {
      if (pattern != "*")
        scan_opts.matcher.reset(new GlobMatcher{pattern, true});
    } else if (parser.Check("TYPE", &type_str)) {
      CompactObjType obj_type = ObjTypeFromString(type_str);
      if (obj_type == kInvalidCompactObjType) {
        return facade::OpStatus::SYNTAX_ERR;
      }
      scan_opts.type_filter = obj_type;
    } else if (parser.Check("BUCKET", &scan_opts.bucket_id)) {
      // no-op
    } else if (parser.Check("ATTR")) {
      scan_opts.mask =
          parser.MapNext("v", ScanOpts::Mask::Volatile, "p", ScanOpts::Mask::Permanent, "a",
                         ScanOpts::Mask::Accessed, "u", ScanOpts::Mask::Untouched);
    } else if (parser.Check("MINMSZ", &scan_opts.min_malloc_size)) {
      // no-op
    } else
      return facade::OpStatus::SYNTAX_ERR;
  }  // while

  // Check for parsing errors (e.g. missing values or invalid integers)
  if (auto err = parser.TakeError()) {
    if (err.type == facade::CmdArgParser::INVALID_INT) {
      return facade::OpStatus::INVALID_INT;
    }
    return facade::OpStatus::SYNTAX_ERR;
  }

  return scan_opts;
}

bool ScanOpts::Matches(std::string_view val_name) const {
  return !matcher || matcher->Matches(val_name);
}

std::ostream& operator<<(std::ostream& os, const GlobalState& state) {
  return os << GlobalStateName(state);
}

ScanOpts::~ScanOpts() {
}

BorrowedInterpreter::BorrowedInterpreter(Transaction* tx, ConnectionState* state) {
  // Ensure squashing ignores EVAL. We can't run on a stub context, because it doesn't have our
  // preborrowed interpreter (which can't be shared on multiple threads).
  CHECK(!tx->IsSquashedStub());

  if (auto borrowed = state->exec_info.preborrowed_interpreter; borrowed) {
    // Ensure a preborrowed interpreter is only set for an already running MULTI transaction.
    CHECK_EQ(state->exec_info.state, ConnectionState::ExecInfo::EXEC_RUNNING);

    interpreter_ = borrowed;
  } else {
    // A scheduled transaction occupies a place in the transaction queue and holds locks,
    // preventing other transactions from progressing. Blocking below can deadlock!
    CHECK(!tx->IsScheduled());

    interpreter_ = ServerState::tlocal()->BorrowInterpreter();
    owned_ = true;
  }
}

BorrowedInterpreter::~BorrowedInterpreter() {
  if (owned_)
    ServerState::tlocal()->ReturnInterpreter(interpreter_);
}

}  // namespace dfly


================================================
FILE: src/server/common.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <string_view>
#include <vector>

#include "facade/facade_types.h"
#include "server/common_types.h"

namespace dfly {

using CompactObjType = unsigned;
class GlobMatcher;

// Dependent on ExpirePeriod representation of the value.
constexpr int64_t kMaxExpireDeadlineSec = (1u << 28) - 1;  // 8.5 years
constexpr int64_t kMaxExpireDeadlineMs = kMaxExpireDeadlineSec * 1000;

using facade::ArgS;
using facade::CmdArgList;
using facade::CmdArgVec;
using facade::MutableSlice;
using facade::OpResult;

using StringVec = std::vector<std::string>;

class CommandId;
struct ConnectionState;
class Namespaces;

struct LockTagOptions {
  bool enabled = false;
  char open_locktag = '{';
  char close_locktag = '}';
  unsigned skip_n_end_delimiters = 0;
  std::string prefix;

  // Returns the tag according to the rules defined by this options object.
  std::string_view Tag(std::string_view key) const;

  static const LockTagOptions& instance();
};

std::ostream& operator<<(std::ostream& os, const GlobalState& state);

const char* GlobalStateName(GlobalState gs);

bool ParseHumanReadableBytes(std::string_view str, int64_t* num_bytes);
bool ParseDouble(std::string_view src, double* value);

const char* RdbTypeName(unsigned type);

// Globally used atomics for memory readings
inline std::atomic_uint64_t used_mem_current{0};
inline std::atomic_uint64_t rss_mem_current{0};
// Current value of --maxmemory flag
inline std::atomic_uint64_t max_memory_limit{0};

inline Namespaces* namespaces = nullptr;

// version 5.11 maps to 511 etc.
// set upon server start.
inline unsigned kernel_version = 0;

struct ScanOpts {
  ~ScanOpts();  // because of forward declaration
  ScanOpts() = default;
  ScanOpts(ScanOpts&& other) = default;

  bool Matches(std::string_view val_name) const;
  static OpResult<ScanOpts> TryFrom(CmdArgList args, bool allow_novalues = false);

  std::unique_ptr<GlobMatcher> matcher;
  size_t limit = 10;
  std::optional<CompactObjType> type_filter;
  unsigned bucket_id = UINT_MAX;
  enum class Mask {
    Volatile,   // volatile, keys that have ttl
    Permanent,  // permanent, keys that do not have ttl
    Accessed,   // accessed, the key has been accessed since the last load/flush event, or the last
                // time a flag was reset.
    Untouched,  // untouched, the key has not been accessed/touched.
  };
  std::optional<Mask> mask;
  size_t min_malloc_size = 0;
  bool novalues = false;
};

// I use relative time from Feb 1, 2023 in seconds.
constexpr uint64_t kMemberExpiryBase = 1675209600;

inline uint32_t MemberTimeSeconds(uint64_t now_ms) {
  return (now_ms / 1000) - kMemberExpiryBase;
}

// Ensures availability of an interpreter for EVAL-like commands and it's automatic release.
// If it's part of MULTI, the preborrowed interpreter is returned, otherwise a new is acquired.
struct BorrowedInterpreter {
  BorrowedInterpreter(Transaction* tx, ConnectionState* state);

  ~BorrowedInterpreter();

  // Give up ownership of the interpreter, it must be returned manually.
  Interpreter* Release() && {
    assert(owned_);
    owned_ = false;
    return interpreter_;
  }

  operator Interpreter*() {
    return interpreter_;
  }

 private:
  Interpreter* interpreter_ = nullptr;
  bool owned_ = false;
};

}  // namespace dfly


================================================
FILE: src/server/common_types.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>

namespace dfly {

// Transaction, replication and partitioning identifiers
using LSN = uint64_t;
using TxId = uint64_t;
using TxClock = uint64_t;
using SlotId = std::uint16_t;

// Database and shard identifiers (moved from tx_base.h to reduce compilation dependencies)
using DbIndex = uint16_t;
using ShardId = uint16_t;
using LockFp = uint64_t;  // a key fingerprint used by the LockTable.

constexpr DbIndex kInvalidDbId = DbIndex(-1);
constexpr ShardId kInvalidSid = ShardId(-1);
constexpr DbIndex kMaxDbId = 1024;  // Reasonable starting point.

// Server state and time enums (moved from common.h to reduce compilation dependencies)
enum class GlobalState : uint8_t {
  ACTIVE,
  LOADING,
  SHUTTING_DOWN,
  TAKEN_OVER,
};

enum class TimeUnit : uint8_t { SEC, MSEC };

enum class LoadBlobResult : uint8_t {
  kSuccess,
  kCorrupted,
  kOutOfMemory,
  kEmpty,
};

enum ExpireFlags {
  EXPIRE_ALWAYS = 0,
  EXPIRE_NX = 1 << 0,  // Set expiry only when key has no expiry
  EXPIRE_XX = 1 << 2,  // Set expiry only when the key has expiry
  EXPIRE_GT = 1 << 3,  // GT: Set expiry only when the new expiry is greater than current one
  EXPIRE_LT = 1 << 4,  // LT: Set expiry only when the new expiry is less than current one
};

// Forward declarations for commonly used classes (to reduce header dependencies)
class EngineShard;
class Transaction;
class DbSlice;
class ConnectionContext;
class CommandContext;
class Namespace;
class CommandRegistry;
class Interpreter;

}  // namespace dfly


================================================
FILE: src/server/config_registry.cc
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/config_registry.h"

#include <absl/flags/reflection.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_replace.h>

#include "base/logging.h"
#include "core/glob_matcher.h"
#include "strings/human_readable.h"

namespace dfly {
namespace {
using namespace std;

string NormalizeConfigName(string_view name) {
  return absl::StrReplaceAll(name, {{"-", "_"}, {".", "_"}});
}
}  // namespace

// Convert internal flag name back to user-facing format
// Example: search_query_string_bytes -> search.query-string-bytes
string DenormalizeConfigName(string_view name) {
  string result{name};
  if (absl::StartsWith(result, "search_")) {
    // Replace first underscore after "search" with dot
    result.replace(6, 1, ".");
    // Replace remaining underscores with dashes
    for (size_t i = 7; i < result.size(); ++i) {
      if (result[i] == '_') {
        result[i] = '-';
      }
    }
  }
  return result;
}

// Returns true if the value was updated.
auto ConfigRegistry::Set(string_view config_name, string_view value) -> SetResult {
  string name = NormalizeConfigName(config_name);

  util::fb2::LockGuard lk(mu_);
  auto it = registry_.find(name);
  if (it == registry_.end())
    return SetResult::UNKNOWN;
  if (!it->second.is_mutable)
    return SetResult::READONLY;

  auto cb = it->second.cb;

  absl::CommandLineFlag* flag = absl::FindCommandLineFlag(name);
  CHECK(flag) << config_name;
  if (string error; !flag->ParseFrom(value, &error)) {
    LOG(WARNING) << error;
    return SetResult::INVALID;
  }

  bool success = !cb || cb(*flag);
  return success ? SetResult::OK : SetResult::INVALID;
}

absl::CommandLineFlag* ConfigRegistry::GetFlag(std::string_view config_name) {
  string name = NormalizeConfigName(config_name);

  {
    util::fb2::LockGuard lk(mu_);
    if (!registry_.contains(name))
      return nullptr;
  }

  absl::CommandLineFlag* flag = absl::FindCommandLineFlag(name);
  CHECK(flag);
  return flag;
}

optional<string> ConfigRegistry::Get(string_view config_name) {
  absl::CommandLineFlag* flag = GetFlag(config_name);
  if (!flag) {
    return nullopt;
  }

  // For MemoryBytesFlag, return numeric bytes for compatibility.
  if (flag->IsOfType<strings::MemoryBytesFlag>()) {
    auto val = flag->TryGet<strings::MemoryBytesFlag>();
    if (val.has_value()) {
      return absl::StrCat(val->value);
    }
  }

  return flag->CurrentValue();
}

void ConfigRegistry::Reset() {
  util::fb2::LockGuard lk(mu_);
  registry_.clear();
}

vector<string> ConfigRegistry::List(string_view glob) const {
  string normalized_glob = NormalizeConfigName(glob);
  GlobMatcher matcher(normalized_glob, false /* case insensitive*/);

  vector<string> res;
  util::fb2::LockGuard lk(mu_);

  for (const auto& [name, _] : registry_) {
    if (matcher.Matches(name))
      res.push_back(name);
  }
  return res;
}

void ConfigRegistry::RegisterInternal(string_view config_name, bool is_mutable, WriteCb cb) {
  string name = NormalizeConfigName(config_name);

  absl::CommandLineFlag* flag = absl::FindCommandLineFlag(name);
  CHECK(flag) << "Unknown config name: " << name;

  util::fb2::LockGuard lk(mu_);
  auto [it, inserted] = registry_.emplace(name, Entry{std::move(cb), is_mutable});
  CHECK(inserted) << "Duplicate config name: " << name;
}

void ConfigRegistry::ValidateCustomSetter(std::string_view name, WriteCb setter) const {
  absl::CommandLineFlag* flag = absl::FindCommandLineFlag(name);
  CHECK(flag) << "Unknown config name: " << name;
  if (setter) {
    bool cb_match = setter(*flag);
    CHECK(cb_match) << "Possible type mismatch with setter for flag " << name;
  }
}

}  // namespace dfly


================================================
FILE: src/server/config_registry.h
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/container/flat_hash_map.h>
#include <absl/flags/reflection.h>

#include "util/fibers/synchronization.h"

namespace dfly {

// Allows reading and modifying pre-registered configuration values by string names.
// This class treats dashes (-) are as underscores (_).
class ConfigRegistry {
 public:
  // Accepts the new value as argument. Return true if config was successfully updated.
  using WriteCb = std::function<bool(const absl::CommandLineFlag&)>;

  ConfigRegistry& Register(std::string_view name) {
    RegisterInternal(name, false, {});
    return *this;
  }

  ConfigRegistry& RegisterMutable(std::string_view name, WriteCb cb = {}) {
    RegisterInternal(name, true, std::move(cb));
    return *this;
  }

  template <typename T>
  ConfigRegistry& RegisterSetter(std::string_view name, std::function<void(const T&)> f) {
    ValidateCustomSetter(name,
                         [](const absl::CommandLineFlag& flag) { return flag.IsOfType<T>(); });

    return RegisterMutable(name, [f](const absl::CommandLineFlag& flag) {
      auto res = flag.TryGet<T>();
      if (res.has_value()) {
        f(*res);
        return true;
      }
      return false;
    });
  }

  enum class SetResult : uint8_t {
    OK,
    UNKNOWN,
    READONLY,
    INVALID,
  };

  // Returns true if the value was updated.
  SetResult Set(std::string_view config_name, std::string_view value) ABSL_LOCKS_EXCLUDED(mu_);

  std::optional<std::string> Get(std::string_view config_name) ABSL_LOCKS_EXCLUDED(mu_);

  absl::CommandLineFlag* GetFlag(std::string_view config_name) ABSL_LOCKS_EXCLUDED(mu_);

  void Reset();

  std::vector<std::string> List(std::string_view glob) const ABSL_LOCKS_EXCLUDED(mu_);

 private:
  void RegisterInternal(std::string_view name, bool is_mutable, WriteCb cb)
      ABSL_LOCKS_EXCLUDED(mu_);
  void ValidateCustomSetter(std::string_view name, WriteCb setter) const;

  mutable util::fb2::Mutex mu_;

  struct Entry {
    WriteCb cb;
    bool is_mutable;
  };

  absl::flat_hash_map<std::string, Entry> registry_ ABSL_GUARDED_BY(mu_);
};

inline ConfigRegistry config_registry;

// Convert internal flag name back to user-facing format for search parameters
// Example: search_query_string_bytes -> search.query-string-bytes
std::string DenormalizeConfigName(std::string_view name);

}  // namespace dfly


================================================
FILE: src/server/conn_context.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/conn_context.h"

#include <atomic>

#include "base/logging.h"
#include "common/heap_size.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/channel_store.h"
#include "server/command_registry.h"
#include "server/engine_shard_set.h"
#include "server/server_family.h"
#include "server/server_state.h"
#include "server/transaction.h"
#include "src/facade/dragonfly_connection.h"

namespace dfly {

using namespace std;
using namespace facade;
using cmn::HeapSize;

namespace {
void SendSubscriptionChangedResponse(string_view action, std::optional<string_view> topic,
                                     unsigned count, RedisReplyBuilder* rb) {
  rb->StartCollection(3, CollectionType::PUSH);
  rb->SendBulkString(action);
  if (topic.has_value())
    rb->SendBulkString(topic.value());
  else
    rb->SendNull();
  rb->SendLong(count);
}

vector<string> FormatExecSlowlog(const ConnectionState& state) {
  const auto& info = state.exec_info;
  return {absl::StrCat("num_cmds: ", info.body.size()), absl::StrCat("is_write: ", info.is_write)};
}

vector<string> FormatEvalSlowlog(const ConnectionState& state) {
  if (!state.script_info)  // EVAL failed to initialize (error)
    return {};

  const auto& sinfo = *state.script_info;
  return {
      string{sinfo.stats.sha, sizeof(sinfo.stats.sha)},
      absl::StrCat("num_cmds: ", sinfo.stats.num_commands),
      absl::StrCat("slow_cmds: ", sinfo.stats.slow_commands.load(memory_order_relaxed)),
      absl::StrCat("tx_mode: ", int(sinfo.stats.tx_mode)),
      absl::StrCat("tx_shards: ", int(sinfo.stats.tx_shards)),
      absl::StrCat("is_write: ", !sinfo.read_only),
      absl::StrCat("lock_tags: ", sinfo.lock_tags.size()),
  };
}

}  // namespace

StoredCmd::StoredCmd(const CommandId* cid, facade::ArgSlice args, facade::ReplyMode mode)
    : cid_{cid}, args_{args}, reply_mode_{mode} {
  backed_ = std::make_unique<cmn::BackedArguments>(args.begin(), args.end(), args.size());
  args_ = facade::ParsedArgs{*backed_};
}

CmdArgList StoredCmd::Slice(CmdArgVec* scratch) const {
  return args_.ToSlice(scratch);
}

std::string StoredCmd::FirstArg() const {
  if (NumArgs() == 0) {
    return {};
  }
  return string{args_.Front()};
}

ConnectionContext::ConnectionContext(facade::Connection* owner, acl::UserCredentials cred)
    : facade::ConnectionContext(owner) {
  if (owner) {
    skip_acl_validation = owner->IsPrivileged();
    has_main_or_memcache_listener = owner->IsMainOrMemcache();
  }

  keys = std::move(cred.keys);
  pub_sub = std::move(cred.pub_sub);
  if (cred.acl_commands.empty()) {
    acl_commands = std::vector<uint64_t>(acl::NumberOfFamilies(), acl::NONE_COMMANDS);
  } else {
    acl_commands = std::move(cred.acl_commands);
  }
  acl_db_idx = cred.db;
}

void ConnectionContext::ChangeMonitor(bool start) {
  // Ensure idempotency: MONITOR may be queued multiple times inside MULTI/EXEC.
  if (start == monitor)
    return;

  // This will either remove or register a new connection
  // at the "top level" thread --> ServerState context
  // note that we are registering/removing this connection to the thread at which at run
  // then notify all other threads that there is a change in the number of monitors
  auto& my_monitors = ServerState::tlocal()->Monitors();
  if (start) {
    my_monitors.Add(conn());
  } else {
    VLOG(1) << "connection " << conn()->GetClientId() << " no longer needs to be monitored";
    my_monitors.Remove(conn());
  }
  // Tell other threads that about the change in the number of connection that we monitor
  shard_set->pool()->AwaitBrief(
      [start](unsigned, auto*) { ServerState::tlocal()->Monitors().NotifyChangeCount(start); });
  EnableMonitoring(start);
}

void ConnectionContext::ChangeSubscription(bool to_add, bool to_reply, bool sharded,
                                           CmdArgList args, facade::RedisReplyBuilder* rb) {
  vector<unsigned> result = ChangeSubscriptions(args, false, to_add, to_reply);

  if (to_reply) {
    const string_view actionRegular[2] = {"unsubscribe", "subscribe"};
    const string_view actionSharded[2] = {"sunsubscribe", "ssubscribe"};
    const absl::Span<const string_view> action = sharded ? actionSharded : actionRegular;
    SinkReplyBuilder::ReplyScope scope{rb};
    for (size_t i = 0; i < result.size(); ++i) {
      SendSubscriptionChangedResponse(action[to_add], ArgS(args, i), result[i], rb);
    }
  }
}

void ConnectionContext::ChangePSubscription(bool to_add, bool to_reply, CmdArgList args,
                                            facade::RedisReplyBuilder* rb) {
  vector<unsigned> result = ChangeSubscriptions(args, true, to_add, to_reply);

  if (to_reply) {
    const char* action[2] = {"punsubscribe", "psubscribe"};
    if (result.size() == 0) {
      return SendSubscriptionChangedResponse(action[to_add], std::nullopt, 0, rb);
    }

    SinkReplyBuilder::ReplyScope scope{rb};
    for (size_t i = 0; i < result.size(); ++i) {
      SendSubscriptionChangedResponse(action[to_add], ArgS(args, i), result[i], rb);
    }
  }
}

void ConnectionContext::UnsubscribeAll(bool to_reply, facade::RedisReplyBuilder* rb) {
  if (to_reply && (!conn_state.subscribe_info || conn_state.subscribe_info->channels.empty())) {
    return SendSubscriptionChangedResponse("unsubscribe", std::nullopt, 0, rb);
  }
  StringVec channels(conn_state.subscribe_info->channels.begin(),
                     conn_state.subscribe_info->channels.end());
  CmdArgVec arg_vec(channels.begin(), channels.end());
  ChangeSubscription(false, to_reply, false, CmdArgList{arg_vec}, rb);
}

void ConnectionContext::PUnsubscribeAll(bool to_reply, facade::RedisReplyBuilder* rb) {
  if (to_reply && (!conn_state.subscribe_info || conn_state.subscribe_info->patterns.empty())) {
    return SendSubscriptionChangedResponse("punsubscribe", std::nullopt, 0, rb);
  }

  StringVec patterns(conn_state.subscribe_info->patterns.begin(),
                     conn_state.subscribe_info->patterns.end());
  CmdArgVec arg_vec(patterns.begin(), patterns.end());
  ChangePSubscription(false, to_reply, CmdArgList{arg_vec}, rb);
}

size_t ConnectionState::ExecInfo::UsedMemory() const {
  return HeapSize(body) + HeapSize(watched_keys);
}

void ConnectionState::ExecInfo::AddStoredCmd(const CommandId* cid, ArgSlice args) {
  body.emplace_back(cid, args);
  stored_cmd_bytes += body.back().UsedMemory();
  is_write |= cid->IsJournaled();
}

size_t ConnectionState::ExecInfo::ClearStoredCmds() {
  const size_t used = GetStoredCmdBytes();
  vector<StoredCmd>{}.swap(body);
  stored_cmd_bytes = 0;
  return used;
}

size_t ConnectionState::ScriptInfo::UsedMemory() const {
  return HeapSize(lock_tags) + async_cmds_heap_mem;
}

size_t ConnectionState::SubscribeInfo::UsedMemory() const {
  return HeapSize(channels) + HeapSize(patterns);
}

size_t ConnectionState::UsedMemory() const {
  return HeapSize(exec_info) + HeapSize(script_info) + HeapSize(subscribe_info);
}

size_t ConnectionContext::UsedMemory() const {
  return facade::ConnectionContext::UsedMemory() + HeapSize(conn_state) +
         HeapSize(authed_username) + HeapSize(acl_commands) + HeapSize(keys.key_globs) +
         HeapSize(pub_sub.globs);
}

void ConnectionContext::Unsubscribe(std::string_view channel) {
  auto* sinfo = conn_state.subscribe_info.get();
  DCHECK(sinfo);
  auto erased = sinfo->channels.erase(channel);
  DCHECK(erased);
  if (sinfo->IsEmpty()) {
    conn_state.subscribe_info.reset();
    DCHECK_GE(subscriptions, 1u);
    --subscriptions;
  }
}

vector<unsigned> ConnectionContext::ChangeSubscriptions(CmdArgList channels, bool pattern,
                                                        bool to_add, bool to_reply) {
  vector<unsigned> result(to_reply ? channels.size() : 0, 0);

  if (!to_add && !conn_state.subscribe_info)
    return result;

  if (!conn_state.subscribe_info) {
    DCHECK(to_add);

    conn_state.subscribe_info.reset(new ConnectionState::SubscribeInfo);
    subscriptions++;
  }

  auto& sinfo = *conn_state.subscribe_info.get();
  auto& local_store = pattern ? sinfo.patterns : sinfo.channels;

  int32_t tid = util::ProactorBase::me()->GetPoolIndex();
  DCHECK_GE(tid, 0);

  ChannelStoreUpdater csu{pattern, to_add, this, uint32_t(tid)};

  // Gather all the channels we need to subscribe to / remove.
  size_t i = 0;
  for (string_view channel : channels) {
    if (to_add && local_store.emplace(channel).second)
      csu.Record(channel);
    else if (!to_add && local_store.erase(channel) > 0)
      csu.Record(channel);

    if (to_reply)
      result[i++] = sinfo.SubscriptionCount();
  }

  csu.Apply();

  // Important to reset conn_state.subscribe_info only after all references to it were
  // removed.
  if (!to_add && conn_state.subscribe_info->IsEmpty()) {
    conn_state.subscribe_info.reset();
    DCHECK_GE(subscriptions, 1u);
    subscriptions--;
  }

  return result;
}

void ConnectionState::ExecInfo::Clear() {
  DCHECK(!preborrowed_interpreter);  // Must have been released properly
  state = EXEC_INACTIVE;
  const size_t cleared_size = ClearStoredCmds();
  ServerState::tlocal()->stats.stored_cmd_bytes -= cleared_size;
  is_write = false;
  ClearWatched();
}

void ConnectionState::ExecInfo::ClearWatched() {
  watched_keys.clear();
  watched_dirty.store(false, memory_order_relaxed);
  watched_existed = 0;
}

bool ConnectionState::ClientTracking::ShouldTrackKeys() const {
  if (!IsTrackingOn()) {
    return false;
  }

  if (noloop_ == true) {
    // Once we implement REDIRECT this should return true since noloop
    // without it only affects the current connection
    return false;
  }

  if (option_ == NONE) {
    return true;
  }

  const bool match = (seq_num_ == (1 + caching_seq_num_));
  return option_ == OPTIN ? match : !match;
}

void CommandContext::ReuseInternal() {
  cid_ = nullptr;
  tx_ = nullptr;
  arg_slice_backing.clear();
  start_time_ns = 0;
}

void CommandContext::RecordLatency(facade::ArgSlice tail_args) const {
  DCHECK_GT(start_time_ns, 0u);
  int64_t after = absl::GetCurrentTimeNanos();

  ServerState* ss = ServerState::SafeTLocal();  // Might have migrated thread, read after invocation
  int64_t execution_time_usec = (after - start_time_ns) / 1000;

  cid_->RecordLatency(ss->thread_index(), execution_time_usec);
  DCHECK(conn_cntx_ != nullptr);

  // TODO: we should probably discard more than only blocking commands here
  const auto* conn = server_conn_cntx()->conn();
  if (conn == nullptr || (cid_->opt_mask() & CO::BLOCKING))
    return;

  if (!ss->ShouldLogSlowCmd(execution_time_usec))  // It was not a slow command
    return;

  auto* cntx = static_cast<dfly::ConnectionContext*>(conn_cntx());

  // Log nested commands of scripts that made it into slowlog
  if (auto sinfo = cntx->conn_state.script_info.get(); !cid_->MultiControlKind() && sinfo)
    sinfo->stats.slow_commands.fetch_add(1, memory_order_relaxed);

  vector<string> aux_params;
  CmdArgVec aux_slice;

  // Rewrite arguments for exec/eval with stats
  if (auto mck = cid_->MultiControlKind(); mck) {
    switch (*mck) {
      case CO::MultiControlKind::EXEC:
        if (cid_->name() == "EXEC")
          aux_params = FormatExecSlowlog(cntx->conn_state);
        break;
      case CO::MultiControlKind::EVAL:
        aux_params = FormatEvalSlowlog(cntx->conn_state);
        break;
    };
    aux_slice = {aux_params.begin(), aux_params.end()};
    if (tail_args.size() > 0) {
      if (!aux_params.empty())
        tail_args.remove_prefix(1);  // remove script/sha from eval/evalsha
      aux_slice.insert(aux_slice.end(), tail_args.begin(), tail_args.end());
    }
    tail_args = aux_slice;
  }

  ServerState::SafeTLocal()->GetSlowLog().Add(cid_->name(), tail_args, conn->GetName(),
                                              conn->RemoteEndpointStr(), execution_time_usec,
                                              absl::GetCurrentTimeNanos() / 1000);
}

}  // namespace dfly


================================================
FILE: src/server/conn_context.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_set.h>

#include "facade/conn_context.h"
#include "facade/parsed_command.h"
#include "facade/reply_mode.h"
#include "server/acl/acl_commands_def.h"
#include "server/common.h"
#include "server/tx_base.h"
#include "server/version.h"

namespace dfly {

class EngineShardSet;
class ChannelStore;
class Interpreter;
struct FlowInfo;

// Stores command id and arguments for delayed invocation.
// Used for storing MULTI/EXEC commands.
class StoredCmd {
 public:
  // Deep copy of args, creates backing storage internally.
  StoredCmd(const CommandId* cid, ArgSlice args, facade::ReplyMode mode = facade::ReplyMode::FULL);

  // Shallow copy of args.
  StoredCmd(const CommandId* cid, facade::ParsedArgs args)
      : cid_{cid}, args_{args}, reply_mode_(facade::ReplyMode::FULL) {
  }

  size_t NumArgs() const {
    return args_.size();
  }

  size_t UsedMemory() const {
    return backed_ ? backed_->HeapMemory() + sizeof(*backed_) : 0;
  }

  facade::ArgSlice Slice(CmdArgVec* scratch) const;
  std::string FirstArg() const;

  const CommandId* Cid() const {
    return cid_;
  }

  facade::ReplyMode ReplyMode() const {
    return reply_mode_;
  }

 private:
  const CommandId* cid_;     // underlying command
  facade::ParsedArgs args_;  // arguments

  // TODO: we could optimize the storage further by introducing StoredCmdCollection and
  // keep the backing storage there. Then this class will only use shallow copies.
  std::unique_ptr<cmn::BackedArguments> backed_;
  facade::ReplyMode reply_mode_;  // reply mode
};

struct ConnectionState {
  // MULTI-EXEC transaction related data.
  struct ExecInfo {
    enum ExecState : uint8_t { EXEC_INACTIVE, EXEC_COLLECT, EXEC_RUNNING, EXEC_ERROR };

    ExecInfo() = default;
    // ExecInfo is immovable due to being referenced from DbSlice.
    ExecInfo(ExecInfo&&) = delete;

    bool IsCollecting() const {
      return state == EXEC_COLLECT;
    }

    bool IsRunning() const {
      return state == EXEC_RUNNING;
    }

    // Resets to blank state after EXEC or DISCARD
    void Clear();

    // Resets local watched keys info. Does not unregister the keys from DbSlices.
    void ClearWatched();

    size_t UsedMemory() const;

    // Deep copies arguments and updates the stored_cmd_bytes.
    void AddStoredCmd(const CommandId* cid, ArgSlice args);

    // Empties the body vector and resets stored_cmd_bytes to 0. Returns the size before data was
    // cleared.
    size_t ClearStoredCmds();

    // Returns memory used by the body field without iterating over each stored command
    size_t GetStoredCmdBytes() const {
      return stored_cmd_bytes + body.capacity() * sizeof(StoredCmd);
    }

    ExecState state = EXEC_INACTIVE;
    std::vector<StoredCmd> body;
    bool is_write = false;

    std::vector<std::pair<DbIndex, std::string>> watched_keys;  // List of keys registered by WATCH
    std::atomic_bool watched_dirty = false;  // Set if a watched key was changed before EXEC
    uint32_t watched_existed = 0;            // Number of times watch was called on an existing key

    // If the transaction contains EVAL calls, preborrow an interpreter that will be used for all of
    // them. This has to be done to avoid potentially blocking when borrowing interpreters amid
    // executing the multi transaction, which can create deadlocks by blocking other transactions
    // that already borrowed all available interpreters but wait for keys to be unlocked.
    Interpreter* preborrowed_interpreter = nullptr;

    // The total size of all stored commands kept in "body". Does not include memory allocated by
    // the "body" vector.
    size_t stored_cmd_bytes = 0;
  };

  // Lua-script related data.
  struct ScriptInfo {
    size_t UsedMemory() const;

    absl::flat_hash_set<LockTag> lock_tags;  // declared tags
    bool read_only = false;

    size_t async_cmds_heap_mem = 0;     // bytes used by async_cmds
    size_t async_cmds_heap_limit = 0;   // max bytes allowed for async_cmds
    std::vector<StoredCmd> async_cmds;  // aggregated by acall

    struct Stats {
      char sha[40];                            // sha of script
      unsigned num_commands = 0;               // total number of command executed
      std::atomic_uint32_t slow_commands = 0;  // commands that made it into slowlog

      uint8_t tx_mode = 0;     // value of Transaction::MultiMode
      unsigned tx_shards = 0;  // Number of shards on the transaction
    } stats;
  };

  // PUB-SUB messaging related data.
  struct SubscribeInfo {
    bool IsEmpty() const {
      return channels.empty() && patterns.empty();
    }

    unsigned SubscriptionCount() const {
      return channels.size() + patterns.size();
    }

    size_t UsedMemory() const;

    // TODO: to provide unique_strings across service. This will allow us to use string_view here.
    absl::flat_hash_set<std::string> channels;
    absl::flat_hash_set<std::string> patterns;
  };

  struct ReplicationInfo {
    // If this server is master, and this connection is from a secondary replica,
    // then it holds positive sync session id.
    uint32_t repl_session_id = 0;
    uint32_t repl_flow_id = UINT32_MAX;
    std::string repl_ip_address;
    uint32_t repl_listening_port = 0;
    DflyVersion repl_version = DflyVersion::VER1;
  };

  struct SquashingInfo {
    // Pointer to the original underlying context of the base command.
    // Only const access it possible for reading from multiple threads,
    // each squashing thread has its own proxy context that contains this info.
    const ConnectionContext* owner = nullptr;
  };

  size_t UsedMemory() const;

  // Client tracking is a per-connection state machine that adheres to the requirements
  // of the CLIENT TRACKING command. Note that the semantics described below are enforced
  // by the tests in server_family_test. The rules are:
  // 1. If CLIENT TRACKING is ON then each READ command must be tracked. Invalidation
  //    messages are sent `only once`. Subsequent changes of the same key require the
  //    client to re-read the key in order to receive the next invalidation message.
  // 2. CLIENT TRACKING ON OPTIN turns on optional tracking. Read commands are not
  //    tracked unless the client issues a CLIENT CACHING YES command which conditionally
  //    allows the tracking of the command that follows CACHING YES). For example:
  //    >> CLIENT TRACKING ON
  //    >> CLIENT CACHING YES
  //    >> GET foo  <--------------------- From now foo is being tracked
  //    However:
  //    >> CLIENT TRACKING ON
  //    >> CLIENT CACHING YES
  //    >> SET foo bar
  //    >> GET foo <--------------------- is *NOT* tracked since GET does not succeed CACHING
  //    Also, in the context of multi transactions, CLIENT CACHING YES is *STICKY*:
  //    >> CLIENT TRACKING ON
  //    >> CLIENT CACHING YES
  //    >> MULTI
  //    >>   GET foo
  //    >>   SET foo bar
  //    >>   GET brother_foo
  //    >> EXEC
  //    From this point onwards `foo` and `get` keys are tracked. Same aplies if CACHING YES
  //    is used within the MULTI/EXEC block.
  //
  // The state machine implements the above rules. We need to track:
  // 1. If TRACKING is ON and OPTIN
  // 2. Stickiness of CACHING as described above
  //
  // We introduce a monotonic counter called sequence number which we increment only:
  // * On InvokeCmd when we are not Collecting (multi)
  // We introduce another counter called caching_seq_num which is set to seq_num
  // when the users sends a CLIENT CACHING YES command
  // If seq_num == caching_seq_num + 1 then we know that we should Track().
  class ClientTracking {
   public:
    enum Options : uint8_t {
      NONE,   // NO subcommand, that is no OPTIN and no OUTPUT was used when CLIENT TRACKING was
              // called. We track all keys of read commands.
      OPTIN,  // OPTIN was used with CLIENT TRACKING. We only track keys of read commands preceded
              // by CACHING TRUE command.
      OPTOUT  // OPTOUT was used with CLIENT TRACKING. We track all keys of read commands except the
              // ones preceded by a CACHING FALSE command.
    };

    // Sets to true when CLIENT TRACKING is ON
    void SetClientTracking(bool is_on) {
      tracking_enabled_ = is_on;
    }

    // Increment current sequence number
    void IncrementSequenceNumber() {
      ++seq_num_;
    }

    // Set if OPTIN/OPTOUT subcommand is used in CLIENT TRACKING
    void SetOption(Options option) {
      option_ = option;
    }

    void SetNoLoop(bool noloop) {
      noloop_ = noloop;
    }

    // Check if the keys should be tracked. Result adheres to the state machine described above.
    bool ShouldTrackKeys() const;

    // Check only if CLIENT TRACKING is ON
    bool IsTrackingOn() const {
      return tracking_enabled_;
    }

    // Called by CLIENT CACHING YES and caches the current seq_num_
    void SetCachingSequenceNumber(bool is_multi) {
      // We need -1 when we are in multi
      caching_seq_num_ = is_multi && seq_num_ != 0 ? seq_num_ - 1 : seq_num_;
    }

    void ResetCachingSequenceNumber() {
      caching_seq_num_ = 1;
    }

    bool HasOption(Options option) const {
      return option_ == option;
    }

   private:
    // a flag indicating whether the client has turned on client tracking.
    bool tracking_enabled_ = false;
    bool noloop_ = false;
    Options option_ = NONE;
    // sequence number
    size_t seq_num_ = 0;
    size_t caching_seq_num_ = 1;
  };

 public:
  DbIndex db_index = 0;

  ExecInfo exec_info;
  ReplicationInfo replication_info;

  std::unique_ptr<ScriptInfo> script_info;
  std::unique_ptr<SubscribeInfo> subscribe_info;
  ClientTracking tracking_info_;
};

class ConnectionContext : public facade::ConnectionContext {
 public:
  ConnectionContext(facade::Connection* owner, dfly::acl::UserCredentials cred);

  struct DebugInfo {
    uint32_t shards_count = 0;
    TxClock clock = 0;
  };

  DebugInfo last_command_debug;

  // TODO: to introduce proper accessors.
  Namespace* ns = nullptr;
  Transaction* transaction = nullptr;

  ConnectionState conn_state;

  DbIndex db_index() const {
    return conn_state.db_index;
  }

  void ChangeSubscription(bool to_add, bool to_reply, bool sharded, CmdArgList args,
                          facade::RedisReplyBuilder* rb);

  void ChangePSubscription(bool to_add, bool to_reply, CmdArgList args,
                           facade::RedisReplyBuilder* rb);
  void UnsubscribeAll(bool to_reply, facade::RedisReplyBuilder* rb);
  void PUnsubscribeAll(bool to_reply, facade::RedisReplyBuilder* rb);
  void ChangeMonitor(bool start);  // either start or stop monitor on a given connection

  size_t UsedMemory() const override;

  virtual void Unsubscribe(std::string_view channel) override;

  // Whether this connection is a connection from a replica to its master.
  // This flag is true only on replica side, where we need to setup a special ConnectionContext
  // instance that helps applying commands coming from master.
  bool is_replicating = false;

  bool monitor = false;  // when a monitor command is sent over a given connection, we need to aware
                         // of it as a state for the connection
  bool journal_emulated = false;  // whether it is used to dispatch journal commands

  // Reference to a master-side FlowInfo for this connection if it is a replication connection.
  FlowInfo* master_repl_flow = nullptr;

  // The related connection is bound to main listener or serves the memcached protocol
  bool has_main_or_memcache_listener = false;

  // ACLs.
  // The following variables represent the ACL rules of the context.
  // Each command, before run, is authorized against those rules by
  // IsUserAllowedToInvokeCmd(and variants) in validator.cc

  // Username
  std::string authed_username{"default"};

  // Each entry in the list is a bitfield representing a specific command family,
  // where each bit corresponds to an individual command within that family.
  // Together, these entries encode the user's full ACL to commands.
  // The index 'i' in 'acl_commands[i]' refers to the command family based on
  // its registration order at runtime. For more details, see acl_commands_def.h.
  std::vector<uint64_t> acl_commands;

  // Keyspace. Each key referenced in a command must match (any) of the rules (globs).
  dfly::acl::AclKeys keys;

  // Pub/sub channels. Each channel referenced in a command must match (any) of the rules (globs).
  dfly::acl::AclPubSub pub_sub;

  // db index, std::numeric_limits<size_t>::max for ALL db's. Dragonfly specific extension.
  size_t acl_db_idx = std::numeric_limits<size_t>::max();

  // Skip ACL validation, used by internal commands and commands run on admin port
  bool skip_acl_validation = false;

 private:
  void EnableMonitoring(bool enable) {
    subscriptions++;  // required to support the monitoring
    monitor = enable;
  }

  std::vector<unsigned> ChangeSubscriptions(CmdArgList channels, bool pattern, bool to_add,
                                            bool to_reply);
};

class CommandContext : public facade::ParsedCommand {
 public:
  CommandContext() = default;
  CommandContext(facade::SinkReplyBuilder* rb, facade::ConnectionContext* conn_cntx) {
    Init(rb, conn_cntx);
  }

  void SetupTx(const CommandId* cid, Transaction* tx) {
    cid_ = cid;
    tx_ = tx;
  }

  void UpdateCid(const CommandId* cid) {
    cid_ = cid;
  }

  virtual size_t GetSize() const override {
    return sizeof(CommandContext);
  }

  ConnectionContext* server_conn_cntx() const {
    return static_cast<ConnectionContext*>(conn_cntx_);
  }

  void RecordLatency(facade::ArgSlice tail_args) const;

  facade::Connection* conn() const {
    return conn_cntx_->conn();
  }

  facade::SinkReplyBuilder* SwapReplier(facade::SinkReplyBuilder* new_rb) {
    return std::exchange(rb_, new_rb);
  }

  Transaction* tx() const {
    return tx_;
  }

  const CommandId* cid() const {
    return cid_;
  }

  uint64_t start_time_ns = 0;

  // Stores backing array for tail args slice
  CmdArgVec arg_slice_backing;

 protected:
  void ReuseInternal() final;

  Transaction* tx_ = nullptr;
  const CommandId* cid_ = nullptr;
};

}  // namespace dfly


================================================
FILE: src/server/container_utils.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/container_utils.h"

#include "base/flags.h"
#include "base/logging.h"
#include "core/detail/listpack_wrap.h"
#include "core/qlist.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"
#include "server/transaction.h"
#include "src/facade/op_status.h"

extern "C" {
#include "redis/intset.h"
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/util.h"
}

namespace dfly::container_utils {
using namespace std;
namespace {

struct ShardFFResult {
  PrimeKey key;
  ShardId sid = kInvalidSid;
};

// Returns (iterator, args-index) if found, KEY_NOTFOUND otherwise.
// If multiple keys are found, returns the first index in the ArgSlice.
OpResult<std::pair<DbSlice::ConstIterator, unsigned>> FindFirstReadOnly(const DbSlice& db_slice,
                                                                        const DbContext& cntx,
                                                                        const ShardArgs& args,
                                                                        int req_obj_type) {
  DCHECK(!args.Empty());

  for (auto it = args.begin(); it != args.end(); ++it) {
    OpResult<DbSlice::ConstIterator> res = db_slice.FindReadOnly(cntx, *it, req_obj_type);
    if (res)
      return make_pair(res.value(), unsigned(it.index()));
    if (res.status() != OpStatus::KEY_NOTFOUND)
      return res.status();
  }

  VLOG(2) << "FindFirst not found";
  return OpStatus::KEY_NOTFOUND;
}

// Find first non-empty key of a single shard transaction, pass it to `func` and return the key.
// If no such key exists or a wrong type is found, the apropriate status is returned.
// Optimized version of `FindFirstNonEmpty` below.
OpResult<string> FindFirstNonEmptySingleShard(Transaction* trans, int req_obj_type,
                                              BlockingResultCb func) {
  DCHECK_EQ(trans->GetUniqueShardCnt(), 1u);
  string key;
  auto cb = [&](Transaction* t, EngineShard* shard) -> Transaction::RunnableResult {
    ShardId sid = shard->shard_id();
    auto args = t->GetShardArgs(sid);
    auto ff_res = FindFirstReadOnly(t->GetDbSlice(sid), t->GetDbContext(), args, req_obj_type);

    if (ff_res == OpStatus::WRONG_TYPE)
      return OpStatus::WRONG_TYPE;

    if (ff_res == OpStatus::KEY_NOTFOUND)
      return {OpStatus::KEY_NOTFOUND, Transaction::RunnableResult::AVOID_CONCLUDING};

    CHECK(ff_res.ok());  // No other errors possible
    ff_res->first->first.GetString(&key);
    func(t, shard, key);
    return OpStatus::OK;
  };

  // Schedule single hop and hopefully find a key, otherwise avoid concluding
  OpStatus status = trans->ScheduleSingleHop(cb);
  if (status == OpStatus::OK)
    return key;
  return status;
}

// Find first non-empty key (sorted by order in command arguments) and return it,
// otherwise return not found or wrong type error.
OpResult<ShardFFResult> FindFirstNonEmpty(Transaction* trans, int req_obj_type) {
  DCHECK_GT(trans->GetUniqueShardCnt(), 1u);

  using FFResult = std::tuple<PrimeKey, unsigned, ShardId>;  // key, argument index, sid
  VLOG(2) << "FindFirst::Find " << trans->DebugId();

  // Holds Find results: (iterator to a found key, and its index in the passed arguments).
  // See DbSlice::FindFirst for more details.
  std::vector<OpResult<FFResult>> find_res(shard_set->size());
  std::fill(find_res.begin(), find_res.end(), OpStatus::KEY_NOTFOUND);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardId sid = shard->shard_id();
    auto args = t->GetShardArgs(sid);
    auto ff_res = FindFirstReadOnly(t->GetDbSlice(sid), t->GetDbContext(), args, req_obj_type);
    if (ff_res) {
      find_res[shard->shard_id()] =
          FFResult{ff_res->first->first.AsRef(), ff_res->second, shard->shard_id()};
    } else {
      find_res[shard->shard_id()] = ff_res.status();
    }
    return OpStatus::OK;
  };

  trans->Execute(std::move(cb), false);

  // If any key is of the wrong type, report it immediately
  if (std::find(find_res.begin(), find_res.end(), OpStatus::WRONG_TYPE) != find_res.end())
    return OpStatus::WRONG_TYPE;

  // Order result by their keys position in the command arguments, push errors to back
  auto comp = [](const OpResult<FFResult>& lhs, const OpResult<FFResult>& rhs) {
    if (!lhs || !rhs)
      return lhs.ok();
    size_t i1 = std::get<1>(*lhs);
    size_t i2 = std::get<1>(*rhs);
    return i1 < i2;
  };

  // Find first element by the order above, so the first key. Returns error only if all are errors
  auto it = std::min_element(find_res.begin(), find_res.end(), comp);
  DCHECK(it != find_res.end());

  if (*it == OpStatus::KEY_NOTFOUND)
    return OpStatus::KEY_NOTFOUND;

  CHECK(it->ok());  // No other errors than WRONG_TYPE and KEY_NOTFOUND
  FFResult& res = **it;
  return ShardFFResult{std::get<PrimeKey>(res).AsRef(), std::get<ShardId>(res)};
}

}  // namespace

using namespace std;

bool IterateList(const PrimeValue& pv, const IterateFunc& func, size_t start, size_t end) {
  DCHECK_LE(start, end);
  bool success = true;
  size_t len = pv.Size();
  if (len == 0) {
    return true;
  }

  if (end >= len) {
    end = len - 1;
    if (start > end) {
      return true;
    }
  }

  if (pv.Encoding() == kEncodingListPack) {
    uint8_t* lp = static_cast<uint8_t*>(pv.RObjPtr());
    uint8_t* p = lpSeek(lp, start);
    while (p && start <= end) {
      unsigned int slen;
      long long lval;
      uint8_t* vstr = lpGetValue(p, &slen, &lval);

      if (vstr) {
        success = func(ContainerEntry{reinterpret_cast<const char*>(vstr), slen});
      } else {
        success = func(ContainerEntry{lval});
      }

      if (!success)
        break;

      p = lpNext(lp, p);
      start++;
    }
    return success;
  }

  DCHECK_EQ(pv.Encoding(), kEncodingQL2);
  QList* ql = static_cast<QList*>(pv.RObjPtr());

  ql->Iterate(
      [&](const CollectionEntry& entry) {
        success = func(entry);
        return success;
      },
      start, end);
  return success;
}

bool IterateSet(const PrimeValue& pv, const IterateFunc& func) {
  bool success = true;
  if (pv.Encoding() == kEncodingIntSet) {
    intset* is = static_cast<intset*>(pv.RObjPtr());
    int64_t ival;
    int ii = 0;

    while (success && intsetGet(is, ii++, &ival)) {
      success = func(ContainerEntry{ival});
    }
  } else {
    for (sds ptr : *static_cast<StringSet*>(pv.RObjPtr())) {
      if (!func(ContainerEntry{ptr, sdslen(ptr)})) {
        success = false;
        break;
      }
    }
  }

  return success;
}

bool IterateSortedSet(const PrimeValue& pv, const IterateSortedFunc& func, size_t start, size_t end,
                      bool reverse, bool use_score) {
  size_t llen = pv.Size();
  if (llen == 0)
    return true;

  if (end >= llen)
    end = llen - 1;

  if (start > end || start >= llen)
    return true;

  size_t rangelen = end - start + 1;

  if (pv.Encoding() == OBJ_ENCODING_LISTPACK) {
    uint8_t* zl = static_cast<uint8_t*>(pv.RObjPtr());
    uint8_t *eptr, *sptr;
    uint8_t* vstr;
    unsigned int vlen;
    long long vlong;
    double score = 0.0;

    if (reverse) {
      eptr = lpSeek(zl, -2 - long(2 * start));
    } else {
      eptr = lpSeek(zl, 2 * start);
    }
    DCHECK(eptr);

    sptr = lpNext(zl, eptr);

    bool success = true;
    while (success && rangelen--) {
      DCHECK(eptr != NULL && sptr != NULL);
      vstr = lpGetValue(eptr, &vlen, &vlong);

      // don't bother to extract the score if it's gonna be ignored.
      if (use_score)
        score = detail::ZzlGetScore(sptr);

      if (vstr == NULL) {
        success = func(ContainerEntry{vlong}, score);
      } else {
        success = func(ContainerEntry{reinterpret_cast<const char*>(vstr), vlen}, score);
      }

      if (reverse) {
        detail::ZzlPrev(zl, &eptr, &sptr);
      } else {
        detail::ZzlNext(zl, &eptr, &sptr);
      };
    }
    return success;
  } else {
    CHECK_EQ(pv.Encoding(), OBJ_ENCODING_SKIPLIST);
    auto* smap = static_cast<detail::SortedMap*>(pv.RObjPtr());
    return smap->Iterate(start, rangelen, reverse, [&](sds ele, double score) {
      return func(ContainerEntry{ele, sdslen(ele)}, score);
    });
  }
  return false;
}

bool IterateMap(const PrimeValue& pv, const IterateKVFunc& func) {
  bool finished = true;

  if (pv.Encoding() == kEncodingListPack) {
    detail::ListpackWrap lw{static_cast<uint8_t*>(pv.RObjPtr())};
    for (const auto [key, val] : lw) {
      if (!func(ContainerEntry{key.data(), key.size()}, ContainerEntry{val.data(), val.size()})) {
        finished = false;
        break;
      }
    }
  } else {
    StringMap* sm = static_cast<StringMap*>(pv.RObjPtr());
    for (const auto& k_v : *sm) {
      if (!func(ContainerEntry{k_v.first, sdslen(k_v.first)},
                ContainerEntry{k_v.second, sdslen(k_v.second)})) {
        finished = false;
        break;
      }
    }
  }
  return finished;
}

StringMap* GetStringMap(const PrimeValue& pv, const DbContext& db_context) {
  DCHECK_EQ(pv.Encoding(), kEncodingStrMap2);
  StringMap* res = static_cast<StringMap*>(pv.RObjPtr());
  uint32_t map_time = MemberTimeSeconds(db_context.time_now_ms);
  res->set_time(map_time);
  return res;
}

OpResult<string> RunCbOnFirstNonEmptyBlocking(Transaction* trans, int req_obj_type,
                                              BlockingResultCb func, unsigned limit_ms,
                                              bool* block_flag, bool* pause_flag) {
  string result_key;

  // Fast path. If we have only a single shard, we can run opportunistically with a single hop.
  // If we don't find anything, we abort concluding and keep scheduled.
  // Slow path: schedule, find results from shards, execute action if found.
  OpResult<ShardFFResult> result;
  if (trans->GetUniqueShardCnt() == 1) {
    auto res = FindFirstNonEmptySingleShard(trans, req_obj_type, func);
    if (res.ok()) {
      return res;
    } else {
      result = res.status();
    }
  } else {
    result = FindFirstNonEmpty(trans, req_obj_type);
  }

  // If a non-empty key exists, execute the callback immediately
  if (result.ok()) {
    auto cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == result->sid) {
        result->key.GetString(&result_key);
        func(t, shard, result_key);
      }
      return OpStatus::OK;
    };
    trans->Execute(std::move(cb), true);
    return result_key;
  }

  // Abort on possible errors: wrong type, etc
  if (result.status() != OpStatus::KEY_NOTFOUND) {
    trans->Conclude();
    return result.status();
  }

  // Multi transactions are not allowed to block
  if (trans->IsMulti()) {
    trans->Conclude();
    return OpStatus::TIMED_OUT;
  }

  DCHECK(trans->IsScheduled());  // single shard optimization didn't forget to schedule
  VLOG(1) << "Blocking " << trans->DebugId();

  // If timeout (limit_ms) is zero, block indefinitely
  auto limit_tp = Transaction::time_point::max();
  if (limit_ms > 0) {
    using namespace std::chrono;
    limit_tp = steady_clock::now() + milliseconds(limit_ms);
  }

  auto* ns = &trans->GetNamespace();
  const auto key_checker = [req_obj_type, ns](EngineShard* owner, const DbContext& context,
                                              Transaction*, std::string_view key) -> bool {
    return ns->GetDbSlice(owner->shard_id()).FindReadOnly(context, key, req_obj_type).ok();
  };

  auto status =
      trans->WaitOnWatch(limit_tp, Transaction::kShardArgs, key_checker, block_flag, pause_flag);

  if (status != OpStatus::OK)
    return status;

  auto cb = [&](Transaction* t, EngineShard* shard) {
    if (auto wake_key = t->GetWakeKey(shard->shard_id()); wake_key) {
      result_key = *wake_key;
      func(t, shard, result_key);
    }
    return OpStatus::OK;
  };
  trans->Execute(std::move(cb), true);
  return result_key;
}

}  // namespace dfly::container_utils


================================================
FILE: src/server/container_utils.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include "base/logging.h"
#include "core/collection_entry.h"
#include "core/compact_object.h"
#include "facade/op_status.h"
#include "server/table.h"

extern "C" {
#include "redis/listpack.h"
}

#include <functional>

namespace dfly {

class StringMap;

namespace container_utils {

// IsContainer returns true if the iterator points to a container type.
inline bool IsContainer(const PrimeValue& pv) {
  unsigned type = pv.ObjType();
  return (type == OBJ_LIST || type == OBJ_SET || type == OBJ_ZSET);
}

using ContainerEntry = CollectionEntry;

using IterateFunc = std::function<bool(ContainerEntry)>;
using IterateSortedFunc = std::function<bool(ContainerEntry, double)>;
using IterateKVFunc = std::function<bool(ContainerEntry, ContainerEntry)>;

// Iterate over all values in [start, end] range (inclusive) and call func(val).
// Iteration stops as soon
// as func return false. Returns true if it successfully processed all elements
// without breaking.
bool IterateList(const PrimeValue& pv, const IterateFunc& func, size_t start = 0,
                 size_t end = SIZE_MAX);

// Iterate over all values and call func(val). Iteration stops as soon
// as func return false. Returns true if it successfully processed all elements
// without stopping.
bool IterateSet(const PrimeValue& pv, const IterateFunc& func);

// Iterate over all values and call func(val). Iteration stops as soon
// as func return false. Returns true if it successfully processed all elements
// without stopping.
bool IterateSortedSet(const PrimeValue& pv, const IterateSortedFunc& func, size_t start = 0,
                      size_t end = SIZE_MAX, bool reverse = false, bool use_score = false);

bool IterateMap(const PrimeValue& pv, const IterateKVFunc& func);

// Get StringMap pointer from primetable value. Sets expire time from db_context
StringMap* GetStringMap(const PrimeValue& pv, const DbContext& db_context);

using BlockingResultCb =
    std::function<void(Transaction*, EngineShard*, std::string_view /* key */)>;

// Block until a any key of the transaction becomes non-empty and executes the callback.
// If multiple keys are non-empty when this function is called, the callback is executed
// immediately with the first key listed in the tx arguments.
facade::OpResult<std::string> RunCbOnFirstNonEmptyBlocking(Transaction* trans, int req_obj_type,
                                                           BlockingResultCb cb, unsigned limit_ms,
                                                           bool* block_flag, bool* pause_flag);

};  // namespace container_utils

}  // namespace dfly


================================================
FILE: src/server/db_slice.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/db_slice.h"

#include "core/dense_set.h"

extern "C" {
#include "redis/hyperloglog.h"
}

#include <absl/cleanup/cleanup.h>

#include "base/flags.h"
#include "base/logging.h"
#include "core/top_keys.h"
#include "facade/dragonfly_connection.h"
#include "search/doc_index.h"
#include "server/channel_store.h"
#include "server/cluster/slot_set.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/journal/journal.h"
#include "server/server_state.h"
#include "server/tiered_storage.h"
#include "strings/human_readable.h"
#include "util/fibers/fibers.h"
#include "util/fibers/stacktrace.h"

ABSL_FLAG(uint32_t, max_eviction_per_heartbeat, 100,
          "The maximum number of key-value pairs that will be deleted in each eviction "
          "when heartbeat based eviction is triggered under memory pressure.");

ABSL_FLAG(uint32_t, max_segment_to_consider, 4,
          "The maximum number of dashtable segments to scan in each eviction "
          "when heartbeat based eviction is triggered under memory pressure.");

ABSL_FLAG(double, table_growth_margin, 0.4,
          "Prevents table from growing if number of free slots x average object size x this ratio "
          "is larger than memory budget.");

ABSL_FLAG(std::string, notify_keyspace_events, "",
          "notify-keyspace-events. Only Ex is supported for now");

ABSL_FLAG(bool, cluster_flush_decommit_memory, false, "Decommit memory after flushing slots");

namespace dfly {

using namespace std;
using namespace util;
using absl::GetFlag;
using namespace facade;
using Payload = journal::Entry::Payload;

namespace {

constexpr auto kPrimeSegmentSize = PrimeTable::kSegBytes;

// mi_malloc good size is 32768. i.e. we have malloc waste of 1.5%.
static_assert(kPrimeSegmentSize <= 32304);

void AccountObjectMemory(string_view key, unsigned type, int64_t size, DbTable* db) {
  DCHECK_NE(db, nullptr);
  if (size == 0)
    return;

  DbTableStats& stats = db->stats;

  stats.AddTypeMemoryUsage(type, size);

  if (db->slots_stats) {
    db->slots_stats[KeySlot(key)].memory_bytes += size;
  }
}

class PrimeEvictionPolicy {
 public:
  static constexpr bool can_evict = true;  // we implement eviction functionality.
  static constexpr bool can_gc = true;

  // mem_offset - memory_offset that we should account for in addition to DbSlice::memory_budget.
  // May be negative.
  PrimeEvictionPolicy(const DbContext& cntx, bool can_evict, ssize_t mem_offset, ssize_t soft_limit,
                      DbSlice* db_slice, bool apply_memory_limit)
      : db_slice_(db_slice),
        mem_offset_(mem_offset),
        soft_limit_(soft_limit),
        cntx_(cntx),
        can_evict_(can_evict),
        apply_memory_limit_(apply_memory_limit) {
  }

  // A hook function that is called every time a segment is full and requires splitting.
  void RecordSplit(PrimeTable::Segment_t* segment) {
    DVLOG(2) << "split: " << segment->SlowSize() << "/" << segment->capacity();
  }
  void OnMove(PrimeTable::Cursor source, PrimeTable::Cursor dest) {
    moved_items_.push_back(std::make_pair(source, dest));
  }

  bool CanGrow(const PrimeTable& tbl) const;

  unsigned GarbageCollect(const PrimeTable::HotBuckets& eb, PrimeTable* me);
  unsigned Evict(const PrimeTable::HotBuckets& eb, PrimeTable* me);

  unsigned evicted() const {
    return evicted_;
  }

  unsigned checked() const {
    return checked_;
  }
  const DbSlice::MovedItemsVec& moved_items() {
    return moved_items_;
  }

 private:
  DbSlice::MovedItemsVec moved_items_;
  DbSlice* db_slice_;
  ssize_t mem_offset_;
  ssize_t soft_limit_ = 0;
  const DbContext cntx_;

  unsigned evicted_ = 0;
  unsigned checked_ = 0;

  // unlike static constexpr can_evict, this parameter tells whether we can evict
  // items in runtime.
  const bool can_evict_;
  const bool apply_memory_limit_;
};

bool PrimeEvictionPolicy::CanGrow(const PrimeTable& tbl) const {
  ssize_t mem_available = db_slice_->memory_budget() + mem_offset_;
  if (!apply_memory_limit_ || mem_available > soft_limit_)
    return true;

  DCHECK_LE(tbl.size(), tbl.capacity());
  DCHECK_GT(tbl.size(), 0u);

  // We take a conservative stance here -
  // we estimate how much memory we will take with the current capacity
  // even though we may currently use less memory.
  // see https://github.com/dragonflydb/dragonfly/issues/256#issuecomment-1227095503
  size_t table_free_items = ((tbl.capacity() - tbl.size()) + PrimeTable::kSegCapacity);

  size_t obj_memory_usage = db_slice_->GetDBTable(cntx_.db_index)->stats.obj_memory_usage;
  size_t avg_obj_size = obj_memory_usage / tbl.size();

  // Catch significant discrepancies in average object size estimation.
  // Note that this may happen if for example, db0 hosts a lot of small keys,
  // db1 hosts huge keys etc. The goal of this comparison is to detect these cases and
  // confirm that discrepancy is justified. Once we gather empirical evidence,
  // we can remove this check and drop `db_slice_->bytes_per_object()` computation entirely.
  if (avg_obj_size * 20 < db_slice_->bytes_per_object() ||
      avg_obj_size > db_slice_->bytes_per_object() * 20) {
    LOG_EVERY_T(WARNING, 1) << "Avg object size estimation for the table is " << avg_obj_size
                            << " vs "
                            << " overall object size estimation " << db_slice_->bytes_per_object();
  }
  size_t obj_bytes_estimation =
      (avg_obj_size * table_free_items) * GetFlag(FLAGS_table_growth_margin);

  bool can_grow = mem_available > int64_t(PrimeTable::kSegBytes + obj_bytes_estimation);
  if (can_grow) {
    VLOG(1) << "free_items: " << table_free_items << ", obj_bytes: " << avg_obj_size << " vs "
            << db_slice_->bytes_per_object() << " "
            << " mem_available: " << mem_available;
  } else {
    LOG_EVERY_T(INFO, 1) << "Can't grow, free_items " << table_free_items
                         << ", obj_bytes: " << avg_obj_size << " vs "
                         << db_slice_->bytes_per_object() << " "
                         << " mem_available: " << mem_available;
  }

  return can_grow;
}

unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotBuckets& eb, PrimeTable* me) {
  unsigned res = 0;

  if (db_slice_->WillBlockOnJournalWrite()) {
    return res;
  }

  // Disable flush journal changes to prevent preemtion in GarbageCollect.
  journal::DisableFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());

  // bool should_print = (eb.key_hash % 128) == 0;

  // based on tests - it's more efficient to pass regular buckets to gc.
  // stash buckets are filled last so much smaller change they have expired items.
  string scratch;
  unsigned num_buckets =
      std::min<unsigned>(PrimeTable::HotBuckets::kRegularBuckets, eb.num_buckets);
  for (unsigned i = 0; i < num_buckets; ++i) {
    auto bucket_it = eb.at(i);
    for (; !bucket_it.is_done(); ++bucket_it) {
      if (bucket_it->first.HasExpire()) {
        string_view key = bucket_it->first.GetSlice(&scratch);
        ++checked_;
        auto [prime_it, exp_it] = db_slice_->ExpireIfNeeded(
            cntx_, DbSlice::Iterator(bucket_it, StringOrView::FromView(key)));
        if (prime_it.is_done())
          ++res;
      }
    }
  }

  return res;
}

unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotBuckets& eb, PrimeTable* me) {
  if (!can_evict_ || db_slice_->WillBlockOnJournalWrite())
    return 0;

  // Disable flush journal changes to prevent preemtion in evict.
  journal::DisableFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());

  constexpr size_t kNumStashBuckets = ABSL_ARRAYSIZE(eb.probes.by_type.stash_buckets);

  // choose "randomly" a stash bucket to evict an item.
  auto bucket_it = eb.probes.by_type.stash_buckets[eb.key_hash % kNumStashBuckets];
  auto last_slot_it = bucket_it;
  last_slot_it += (PrimeTable::kSlotNum - 1);
  if (!last_slot_it.is_done()) {
    // don't evict sticky items
    if (last_slot_it->first.IsSticky()) {
      return 0;
    }

    DbTable* table = db_slice_->GetDBTable(cntx_.db_index);
    auto& lt = table->trans_locks;
    string scratch;
    string_view key = last_slot_it->first.GetSlice(&scratch);
    // do not evict locked keys
    if (lt.Find(LockTag(key)).has_value())
      return 0;

    // log the evicted keys to journal.
    if (auto journal = db_slice_->shard_owner()->journal(); journal) {
      RecordExpiryBlocking(cntx_.db_index, key);
    }
    db_slice_->Del(cntx_, DbSlice::Iterator(last_slot_it, StringOrView::FromView(key)));

    ++evicted_;
  }
  me->ShiftRight(bucket_it);

  return 1;
}

class AsyncDeleter {
 public:
  static void EnqueDeletion(uint32_t next, DenseSet* ds);
  static void Shutdown();

 private:
  static constexpr uint32_t kClearStepSize = 1024;
  struct ClearNode {
    DenseSet* ds;
    uint32_t cursor;
    ClearNode* next;

    ClearNode(DenseSet* d, uint32_t c, ClearNode* n) : ds(d), cursor(c), next(n) {
    }
  };

  // Asynchronously deletes entries during the cpu-idle time.
  static int32_t IdleCb();

  // We add async deletion requests to a linked list and process them asynchronously
  // in each thread.
  static __thread ClearNode* head_;
};

__thread AsyncDeleter::ClearNode* AsyncDeleter::head_ = nullptr;

void AsyncDeleter::EnqueDeletion(uint32_t next, DenseSet* ds) {
  bool launch_task = (head_ == nullptr);

  // register ds
  head_ = new ClearNode{ds, next, head_};
  ProactorBase* pb = ProactorBase::me();
  DCHECK(pb);
  DVLOG(2) << "Adding async deletion task, thread " << pb->GetPoolIndex() << " " << launch_task;
  if (launch_task) {
    pb->AddOnIdleTask(&IdleCb);
  }
}

void AsyncDeleter::Shutdown() {
  // we do not bother with deleting objects scheduled for asynchronous deletion
  // during the shutdown. this should work well because we destroy mimalloc heap anyways.
  while (head_) {
    auto* next = head_->next;
    delete head_;
    head_ = next;
  }
}

int32_t AsyncDeleter::IdleCb() {
  if (head_ == nullptr)
    return -1;  // unregister itself.

  auto* current = head_;

  DVLOG(2) << "IdleCb " << current->cursor;
  uint32_t next = current->ds->ClearStep(current->cursor, kClearStepSize);
  if (next == current->ds->BucketCount()) {  // reached the end.
    CompactObj::DeleteMR<DenseSet>(current->ds);
    head_ = current->next;
    delete current;
  } else {
    current->cursor = next;
  }
  return ProactorBase::kOnIdleMaxLevel;
};

inline void TouchTopKeysIfNeeded(string_view key, DbTable::SampleTopKeys* sample) {
  if (sample) {
    sample->top_keys->Touch(key);
    ++sample->total_samples;
  }
}

inline void TouchHllIfNeeded(string_view key, DbTable::SampleUniqueKeys* sample) {
  if (sample) {
    HllBufferPtr hll_buf;
    hll_buf.size = getDenseHllSize();
    hll_buf.hll = sample->dense_hll;
    pfadd_dense(hll_buf, reinterpret_cast<const uint8_t*>(key.data()), key.size());
    ++sample->total_samples;
  }
}

inline void TouchValuesHistogramIfNeeded(const PrimeValue& pv, base::Histogram* hist) {
  if (hist) {
    hist->Add(pv.Size());
  }
}

inline bool MayDeleteAsynchronously(const PrimeValue& pv) {
  unsigned obj_type = pv.ObjType();
  return (obj_type == OBJ_SET || obj_type == OBJ_HASH) && pv.Encoding() == kEncodingStrMap2;
}

}  // namespace

#define ADD(x) (x) += o.x

DbStats& DbStats::operator+=(const DbStats& o) {
  constexpr size_t kDbSz = sizeof(DbStats) - sizeof(DbTableStats);
  static_assert(kDbSz == 24);

  DbTableStats::operator+=(o);

  ADD(key_count);
  ADD(prime_capacity);
  ADD(table_mem_usage);

  return *this;
}

SliceEvents& SliceEvents::operator+=(const SliceEvents& o) {
  static_assert(sizeof(SliceEvents) == 136, "You should update this function with new fields");

  ADD(evicted_keys);
  ADD(hard_evictions);
  ADD(expired_keys);
  ADD(garbage_collected);
  ADD(stash_unloaded);
  ADD(bumpups);
  ADD(garbage_checked);
  ADD(hits);
  ADD(misses);
  ADD(mutations);
  ADD(insertion_rejections);
  ADD(update);
  ADD(ram_hits);
  ADD(ram_cool_hits);
  ADD(ram_misses);
  ADD(huff_encode_total);
  ADD(huff_encode_success);
  return *this;
}

#undef ADD

class DbSlice::PrimeBumpPolicy {
 public:
  bool CanBump(const CompactObj& obj) const {
    return !obj.IsSticky();
  }
  void OnMove(PrimeTable::Cursor source, PrimeTable::Cursor dest) {
    moved_items_.push_back(std::make_pair(source, dest));
  }

  const DbSlice::MovedItemsVec& moved_items() {
    return moved_items_;
  }

 private:
  DbSlice::MovedItemsVec moved_items_;
};

DbSlice::DbSlice(uint32_t index, bool cache_mode, EngineShard* owner)
    : shard_id_(index),
      cache_mode_(cache_mode),
      owner_(owner),
      client_tracking_map_(owner->memory_resource()) {
  db_arr_.emplace_back();
  CreateDb(0);
  expire_base_[0] = expire_base_[1] = 0;

  std::string keyspace_events = GetFlag(FLAGS_notify_keyspace_events);
  if (!keyspace_events.empty() && keyspace_events != "Ex") {
    LOG(ERROR) << "Only Ex is currently supported";
    exit(0);
  }
  expired_keys_events_recording_ = !keyspace_events.empty();
}

DbSlice::~DbSlice() {
  // we do not need this code but it's easier to debug in case we encounter
  // memory allocation bugs during delete operations.

  for (auto& db : db_arr_) {
    if (!db)
      continue;
    db.reset();
  }

  AsyncDeleter::Shutdown();
}

auto DbSlice::GetStats() const -> Stats {
  Stats s;
  s.events = events_;
  s.db_stats.resize(db_arr_.size());

  for (size_t i = 0; i < db_arr_.size(); ++i) {
    if (!db_arr_[i])
      continue;
    const auto& db_wrap = *db_arr_[i];
    DbStats& stats = s.db_stats[i];
    stats = db_wrap.stats;
    stats.key_count = db_wrap.prime.size();
    stats.prime_capacity = db_wrap.prime.capacity();
    stats.table_mem_usage = db_wrap.table_memory();
  }
  auto co_stats = CompactObj::GetStatsThreadLocal();
  s.small_string_bytes = co_stats.small_string_bytes;
  s.events.huff_encode_total = co_stats.huff_encode_total;
  s.events.huff_encode_success = co_stats.huff_encode_success;

  return s;
}

SlotStats DbSlice::GetSlotStats(SlotId sid) const {
  CHECK(db_arr_[0]);
  return db_arr_[0]->slots_stats[sid];
}

void DbSlice::Reserve(DbIndex db_ind, size_t key_size) {
  ActivateDb(db_ind);

  auto& db = db_arr_[db_ind];
  DCHECK(db);

  db->prime.Reserve(key_size);
}

DbSlice::AutoUpdater::AutoUpdater() {
}

DbSlice::AutoUpdater::AutoUpdater(AutoUpdater&& o) noexcept {
  *this = std::move(o);
}

DbSlice::AutoUpdater& DbSlice::AutoUpdater::operator=(AutoUpdater&& o) noexcept {
  Run();
  fields_ = o.fields_;
  o.Cancel();
  return *this;
}

DbSlice::AutoUpdater::~AutoUpdater() {
  Run();
}

void DbSlice::AutoUpdater::ReduceHeapUsage() {
  AccountObjectMemory(fields_.key, fields_.it->second.ObjType(), -fields_.orig_value_heap_size,
                      fields_.db_slice->GetDBTable(fields_.db_ind));
  fields_.orig_value_heap_size = 0;  // Reset to avoid double accounting.
}

void DbSlice::AutoUpdater::Run() {
  if (fields_.db_slice == nullptr) {
    return;
  }

  // Check that AutoUpdater does not run after a key was removed.
  // If this CHECK() failed for you, it probably means that you deleted a key while having an auto
  // updater in scope. You'll probably want to call Run() (or Cancel() - but be careful).
  DCHECK(IsValid(fields_.db_slice->db_arr_[fields_.db_ind]->prime.Find(fields_.key)));

  CHECK_NE(fields_.db_slice, nullptr);

  ssize_t delta = static_cast<int64_t>(fields_.it->second.MallocUsed()) -
                  static_cast<int64_t>(fields_.orig_value_heap_size);
  AccountObjectMemory(fields_.key, fields_.it->second.ObjType(), delta,
                      fields_.db_slice->GetDBTable(fields_.db_ind));
  fields_.db_slice->PostUpdate(fields_.db_ind, fields_.key);
  Cancel();  // Reset to not run again
}

void DbSlice::AutoUpdater::Cancel() {
  this->fields_ = {};
}

DbSlice::AutoUpdater::AutoUpdater(DbIndex db_ind, std::string_view key, const Iterator& it,
                                  DbSlice* db_slice)
    : fields_{.db_slice = db_slice,
              .db_ind = db_ind,
              .it = it,
              .key = key,
              .orig_value_heap_size = it->second.MallocUsed()} {
  DCHECK(IsValid(it));
}

DbSlice::ItAndUpdater DbSlice::FindMutable(const Context& cntx, string_view key) {
  return std::move(FindMutableInternal(cntx, key, std::nullopt).value());
}

OpResult<DbSlice::ItAndUpdater> DbSlice::FindMutable(const Context& cntx, string_view key,
                                                     unsigned req_obj_type) {
  return FindMutableInternal(cntx, key, req_obj_type);
}

OpResult<DbSlice::ItAndUpdater> DbSlice::FindMutableInternal(const Context& cntx, string_view key,
                                                             std::optional<unsigned> req_obj_type) {
  auto res = FindInternal(cntx, key, req_obj_type, UpdateStatsMode::kMutableStats);
  if (!res.ok()) {
    return res.status();
  }

  auto it = Iterator(res->it, StringOrView::FromView(key));
  auto exp_it = ExpIterator(res->exp_it, StringOrView::FromView(key));
  PreUpdateBlocking(cntx.db_index, it);
  // PreUpdate() might have caused a deletion of `it`
  if (res->it.IsOccupied()) {
    DCHECK_GE(db_arr_[cntx.db_index]->stats.obj_memory_usage, res->it->second.MallocUsed());

    return {{it, exp_it, AutoUpdater{cntx.db_index, key, it, this}}};
  } else {
    return OpStatus::KEY_NOTFOUND;
  }
}

DbSlice::ItAndExpConst DbSlice::FindReadOnly(const Context& cntx, std::string_view key) const {
  auto res = FindInternal(cntx, key, std::nullopt, UpdateStatsMode::kReadStats);
  return {ConstIterator(res->it, StringOrView::FromView(key)),
          ExpConstIterator(res->exp_it, StringOrView::FromView(key))};
}

OpResult<DbSlice::ConstIterator> DbSlice::FindReadOnly(const Context& cntx, string_view key,
                                                       unsigned req_obj_type) const {
  auto res = FindInternal(cntx, key, req_obj_type, UpdateStatsMode::kReadStats);
  if (res.ok()) {
    return ConstIterator(res->it, StringOrView::FromView(key));
  }
  return res.status();
}

auto DbSlice::FindInternal(const Context& cntx, string_view key, optional<unsigned> req_obj_type,
                           UpdateStatsMode stats_mode) const -> OpResult<PrimeItAndExp> {
  if (!IsDbValid(cntx.db_index)) {  // Can it even happen?
    LOG(DFATAL) << "Invalid db index " << cntx.db_index;
    return OpStatus::KEY_NOTFOUND;
  }

  auto& db = *db_arr_[cntx.db_index];
  PrimeItAndExp res;
  res.it = db.prime.Find(key);
  int miss_weight = (stats_mode == UpdateStatsMode::kReadStats);

  if (!IsValid(res.it)) {
    events_.misses += miss_weight;
    db.stats.events.misses += miss_weight;
    return OpStatus::KEY_NOTFOUND;
  }

  TouchTopKeysIfNeeded(key, db.sample_top_keys);
  TouchHllIfNeeded(key, db.sample_unique_keys);
  TouchValuesHistogramIfNeeded(res.it->second, db.sample_values_hist);

  if (req_obj_type.has_value() && res.it->second.ObjType() != req_obj_type.value()) {
    events_.misses += miss_weight;
    db.stats.events.misses += miss_weight;
    return OpStatus::WRONG_TYPE;
  }

  if (res.it->first.HasExpire()) {  // check expiry state
    res = ExpireIfNeeded(cntx, res.it);
    if (!IsValid(res.it)) {
      events_.misses += miss_weight;
      db.stats.events.misses += miss_weight;
      return OpStatus::KEY_NOTFOUND;
    }
  }

  DCHECK(IsValid(res.it));

  if (IsCacheMode()) {
    fetched_items_.insert({res.it->first.HashCode(), cntx.db_index});
  }

  switch (stats_mode) {
    case UpdateStatsMode::kMutableStats:
      events_.mutations++;
      break;
    case UpdateStatsMode::kReadStats:
      events_.hits++;
      db.stats.events.hits++;
      if (db.slots_stats) {
        db.slots_stats[KeySlot(key)].total_reads++;
      }
      if (res.it->second.IsExternal()) {
        if (res.it->second.IsCool())
          events_.ram_cool_hits++;
        else
          events_.ram_misses++;
      } else {
        events_.ram_hits++;
      }
      break;
  }

  auto& pv = res.it->second;

  // Cancel any pending stashes of looked up values
  // Rationale: we either look it up for reads - and then it's hot, or alternatively,
  // we follow up with modifications, so the pending stash becomes outdated.
  if (pv.HasStashPending()) {
    owner_->tiered_storage()->CancelStash(cntx.db_index, key, &pv);
  }

  // Fetch back cool items
  if (pv.IsExternal() && pv.IsCool()) {
    pv = owner_->tiered_storage()->Warmup(cntx.db_index, pv.GetCool());
  }

  // Mark this entry as being looked up. We use key (first) deliberately to preserve the hotness
  // attribute of the entry in case of value overrides.
  res.it->first.SetTouched(true);

  return res;
}

OpResult<DbSlice::ItAndUpdater> DbSlice::AddOrFind(const Context& cntx, string_view key,
                                                   std::optional<unsigned> req_obj_type) {
  return AddOrFindInternal(cntx, key, req_obj_type);
}

OpResult<DbSlice::ItAndUpdater> DbSlice::AddOrFindInternal(const Context& cntx, string_view key,
                                                           std::optional<unsigned> req_obj_type) {
  DCHECK(IsDbValid(cntx.db_index));

  DbTable& db = *db_arr_[cntx.db_index];
  auto res = FindInternal(cntx, key, req_obj_type, UpdateStatsMode::kMutableStats);

  if (res.ok()) {
    Iterator it(res->it, StringOrView::FromView(key));
    ExpIterator exp_it(res->exp_it, StringOrView::FromView(key));
    PreUpdateBlocking(cntx.db_index, it);

    // PreUpdate() might have caused a deletion of `it`
    if (res->it.IsOccupied()) {
      return ItAndUpdater{
          .it = it, .exp_it = exp_it, .post_updater{cntx.db_index, key, it, this}, .is_new = false};
    } else {
      res = OpStatus::KEY_NOTFOUND;
    }
  } else if (res == OpStatus::WRONG_TYPE) {
    return OpStatus::WRONG_TYPE;
  }

  auto status = res.status();
  CHECK(status == OpStatus::KEY_NOTFOUND || status == OpStatus::OUT_OF_MEMORY) << status;

  // It's a new entry.
  CallChangeCallbacks(cntx.db_index, ChangeReq{key});

  ssize_t memory_offset = -key.size();
  size_t reclaimed = 0;
  // If we are low on memory due to cold storage, free some memory.
  if (owner_->tiered_storage()) {
    // At least 40KB bytes to cover potential segment split.
    ssize_t red_line = std::max<size_t>(key.size() * 2, 40_KB);
    if (memory_budget_ < red_line) {
      size_t goal = red_line - memory_budget_;
      reclaimed = owner_->tiered_storage()->ReclaimMemory(goal);
      memory_budget_ += reclaimed;
    }

    // CoolMemoryUsage is the memory that we can always reclaim, like in the block above,
    // therefore we include it for PrimeEvictionPolicy considerations.
    memory_offset += owner_->tiered_storage()->CoolMemoryUsage();
  }

  // In case we are loading from rdb file or replicating we want to disable conservative memory
  // checks (inside PrimeEvictionPolicy::CanGrow) and reject insertions only after we pass max
  // memory limit. When loading a snapshot created by the same server configuration (memory and
  // number of shards) we will create a different dash table segment directory tree, because the
  // tree shape is related to the order of entries insertion. Therefore when loading data from
  // snapshot or from replication the conservative memory checks might fail as the new tree might
  // have more segments. Because we dont want to fail loading a snapshot from the same server
  // configuration we disable this checks on loading and replication.
  bool apply_memory_limit =
      !owner_->IsReplica() && !(ServerState::tlocal()->gstate() == GlobalState::LOADING);

  // If we are over limit in non-cache scenario, just be conservative and throw.
  if (apply_memory_limit && !IsCacheMode() && memory_budget_ + memory_offset < 0) {
    LOG_EVERY_T(WARNING, 1) << "AddOrFind: over limit, budget: " << memory_budget_
                            << " reclaimed: " << reclaimed << " offset: " << memory_offset;
    events_.insertion_rejections++;
    return OpStatus::OUT_OF_MEMORY;
  }

  ssize_t soft_budget_limit =
      (0.3 * max_memory_limit.load(memory_order_relaxed)) / shard_set->size();
  PrimeEvictionPolicy evp{cntx,          (IsCacheMode() && !owner_->IsReplica()),
                          memory_offset, soft_budget_limit,
                          this,          apply_memory_limit};

  // Fast-path if change_cb_ is empty so we Find or Add using
  // the insert operation: twice more efficient.
  PrimeIterator it;

  ssize_t table_before = db.prime.mem_usage();

  try {
    it = db.prime.InsertNew(key, PrimeValue{}, evp);
  } catch (bad_alloc& e) {
    LOG_EVERY_T(WARNING, 1) << "AddOrFind: InsertNew failed, budget: " << memory_budget_
                            << " reclaimed: " << reclaimed << " offset: " << memory_offset;
    events_.insertion_rejections++;
    return OpStatus::OUT_OF_MEMORY;
  }
  CallMovedCallbacks(cntx.db_index, evp.moved_items());

  events_.mutations++;
  ssize_t table_increase = db.prime.mem_usage() - table_before;
  memory_budget_ -= table_increase;

  if (memory_budget_ < 0 && apply_memory_limit) {
    // We may reach the state when our memory usage is below the limit even if we
    // do not add new segments. For example, we have half full segments
    // and we add new objects or update the existing ones and our memory usage grows.
    // We do not require for a single operation to unload the whole negative debt.
    // Instead, we create a positive, converging force that should help with freeing enough memory.
    // Free at least K bytes or 3% of the total debt.
    // TODO: to reenable and optimize this - this call significantly slows down server
    // when evictions are running.
#if 0
    size_t evict_goal = std::max<size_t>(512, (-evp.mem_budget()) / 32);
    auto [items, bytes] = FreeMemWithEvictionStep(cntx.db_index, it.segment_id(), evict_goal);
    events_.hard_evictions += items;
#endif
  }

  table_memory_ += table_increase;
  entries_count_++;

  if (it->first.IsInline()) {
    ++db.stats.inline_keys;
  } else {
    AccountObjectMemory(key, OBJ_KEY, it->first.MallocUsed(), &db);  // Account for key
  }

  DCHECK_EQ(it->second.MallocUsed(), 0UL);  // Make sure accounting is no-op
  it.SetVersion(NextVersion());

  TouchTopKeysIfNeeded(key, db.sample_top_keys);
  TouchHllIfNeeded(key, db.sample_unique_keys);

  events_.garbage_collected = db.prime.garbage_collected();
  events_.stash_unloaded = db.prime.stash_unloaded();
  events_.evicted_keys += evp.evicted();
  db.stats.events.evicted_keys += evp.evicted();
  events_.garbage_checked += evp.checked();
  if (db.slots_stats) {
    SlotId sid = KeySlot(key);
    db.slots_stats[sid].key_count += 1;
  }

  return ItAndUpdater{
      .it = Iterator(it, StringOrView::FromView(key)),
      .exp_it = ExpIterator{},
      .post_updater{cntx.db_index, key, Iterator(it, StringOrView::FromView(key)), this},
      .is_new = true};
}

void DbSlice::ActivateDb(DbIndex db_ind) {
  if (db_arr_.size() <= db_ind)
    db_arr_.resize(db_ind + 1);
  CreateDb(db_ind);
}

void DbSlice::Del(Context cntx, Iterator it, DbTable* db_table, bool async) {
  CHECK(IsValid(it));

  ExpIterator exp_it;
  DbTable* table = db_table ? db_table : db_arr_[cntx.db_index].get();
  auto obj_type = it->second.ObjType();

  if (doc_del_cb_ && (obj_type == OBJ_JSON || obj_type == OBJ_HASH)) {
    string tmp;
    string_view key = it->first.GetSlice(&tmp);
    doc_del_cb_(key, cntx, it->second);
  }

  PerformDeletionAtomic(it, exp_it, table, async);
}

void DbSlice::DelMutable(Context cntx, ItAndUpdater it_updater) {
  it_updater.post_updater.Run();
  Del(cntx, it_updater.it);
}

void DbSlice::FlushSlotsFb(const cluster::SlotSet& slot_ids) {
  VLOG(1) << "Start FlushSlotsFb";
  // Slot deletion can take time as it traverses all the database, hence it runs in fiber.
  // We want to flush all the data of a slot that was added till the time the call to FlushSlotsFb
  // was made. Therefore we delete slots entries with version < next_version
  uint64_t next_version = 0;
  uint64_t del_count = 0;

  // Explicitly copy table smart pointer to keep reference count up (flushall drops it)
  boost::intrusive_ptr<DbTable> table = db_arr_.front();
  size_t memory_before = table->table_memory() + table->stats.obj_memory_usage;

  DbContext db_cntx;
  db_cntx.time_now_ms = GetCurrentTimeMs();
  db_cntx.db_index = table->index;

  std::string tmp;
  auto iterate_bucket = [&](PrimeTable::bucket_iterator it) {
    it.AdvanceIfNotOccupied();
    while (!it.is_done()) {
      std::string_view key = it->first.GetSlice(&tmp);
      SlotId sid = KeySlot(key);
      if (slot_ids.Contains(sid) && it.GetVersion() < next_version) {
        // We use copy of table smart pointer and pass it as table because FLLUSHALL can drop table.
        Del(db_cntx, Iterator::FromPrime(it), table.get());
        ++del_count;
      }
      ++it;
    }
  };

  auto on_change = [&](DbIndex db_index, const ChangeReq& req) {
    FiberAtomicGuard fg;
    PrimeTable* table = GetTables(db_index).first;

    if (const PrimeTable::bucket_iterator* bit = req.update()) {
      if (!bit->is_done() && bit->GetVersion() < next_version) {
        iterate_bucket(*bit);
      }
    } else {
      string_view key = get<string_view>(req.change);
      table->CVCUponInsert(next_version, key,
                           [next_version, iterate_bucket](PrimeTable::bucket_iterator it) {
                             DCHECK_LT(it.GetVersion(), next_version);
                             iterate_bucket(it);
                           });
    }
  };
  next_version = RegisterOnChange(std::move(on_change));

  ServerState& etl = *ServerState::tlocal();
  PrimeTable* pt = &table->prime;
  PrimeTable::Cursor cursor;

  do {
    PrimeTable::Cursor next = pt->TraverseBuckets(cursor, iterate_bucket);
    cursor = next;
    ThisFiber::Yield();
  } while (cursor && etl.gstate() != GlobalState::SHUTTING_DOWN);

  VLOG(1) << "FlushSlotsFb del count is: " << del_count;
  UnregisterOnChange(next_version);

  if (absl::GetFlag(FLAGS_cluster_flush_decommit_memory)) {
    int64_t start = absl::GetCurrentTimeNanos();
    etl.DecommitMemory(ServerState::kDataHeap);
    int64_t took = absl::GetCurrentTimeNanos() - start;
    size_t memory_after = table->table_memory() + table->stats.obj_memory_usage;

    LOG(INFO) << "Memory decommit took " << took << "ns, deleted " << del_count << ", memory delta "
              << (memory_before - memory_after);
  }
}

void DbSlice::FlushSlots(const cluster::SlotRanges& slot_ranges) {
  cluster::SlotSet slot_set(slot_ranges);
  InvalidateSlotWatches(slot_set);
  fb2::Fiber("flush_slots", [this, slot_set = std::move(slot_set)]() mutable {
    FlushSlotsFb(slot_set);
  }).Detach();
}

util::fb2::Fiber DbSlice::FlushDbIndexes(const std::vector<DbIndex>& indexes) {
  bool clear_tiered = owner_->tiered_storage() != nullptr;

  if (clear_tiered)
    RemoveOffloadedEntriesFromTieredStorage(indexes, db_arr_);

  DbTableArray flush_db_arr(db_arr_.size());

  for (DbIndex index : indexes) {
    if (index == 0) {
      // TODO: Async dealloc?
      // TODO: Drop of global HNSW index doesn't respect per-shard ordering
      owner_->search_indices()->DropAllIndices();
    }

    table_memory_ -= db_arr_[index]->table_memory();
    entries_count_ -= db_arr_[index]->prime.size();

    InvalidateDbWatches(index);
    flush_db_arr[index] = std::move(db_arr_[index]);

    CreateDb(index);
    std::swap(db_arr_[index]->trans_locks, flush_db_arr[index]->trans_locks);
  }

  LOG_IF(DFATAL, !fetched_items_.empty())
      << "Some operation might bumped up items outside of a transaction";

  auto cb = [flush_db_arr = std::move(flush_db_arr)]() mutable {
    flush_db_arr.clear();
    ServerState::tlocal()->DecommitMemory(ServerState::kDataHeap | ServerState::kBackingHeap |
                                          ServerState::kGlibcmalloc);
  };

  return {"flush_dbs", std::move(cb)};
}

util::fb2::Fiber DbSlice::FlushDb(DbIndex db_ind) {
  DVLOG(1) << "Flushing db " << db_ind;

  // clear client tracking map.
  client_tracking_map_.clear();

  if (db_ind != kDbAll)  // Flush a single database if a specific index is provided
    return FlushDbIndexes({db_ind});

  std::vector<DbIndex> indexes;
  indexes.reserve(db_arr_.size());
  for (DbIndex i = 0; i < db_arr_.size(); ++i) {
    if (db_arr_[i]) {
      indexes.push_back(i);
    }
  }

  return FlushDbIndexes(indexes);
}

void DbSlice::AddExpire(DbIndex db_ind, const Iterator& main_it, uint64_t at) {
  bool had_expire = main_it->first.HasExpire();
  bool was_inline = main_it->first.IsInline();
  ssize_t old_malloc = static_cast<ssize_t>(main_it->first.MallocUsed());

  main_it->first.SetExpireTime(at);

  auto& db = *db_arr_[db_ind];
  ssize_t new_malloc = static_cast<ssize_t>(main_it->first.MallocUsed());
  if (was_inline && !main_it->first.IsInline()) {
    --db.stats.inline_keys;
    AccountObjectMemory(main_it.key(), OBJ_KEY, new_malloc, &db);
  } else if (new_malloc != old_malloc) {
    AccountObjectMemory(main_it.key(), OBJ_KEY, new_malloc - old_malloc, &db);
  }

  if (!had_expire)
    ++db.stats.expire_count;
}

bool DbSlice::RemoveExpire(DbIndex db_ind, const Iterator& main_it) {
  if (!main_it->first.HasExpire())
    return false;

  DCHECK(!main_it->first.IsInline());  // SDS_TTL_TAG is never inline
  ssize_t old_malloc = static_cast<ssize_t>(main_it->first.MallocUsed());

  main_it->first.ClearExpireTime();

  auto& db = *db_arr_[db_ind];
  ssize_t new_malloc = static_cast<ssize_t>(main_it->first.MallocUsed());
  if (main_it->first.IsInline()) {
    AccountObjectMemory(main_it.key(), OBJ_KEY, -old_malloc, &db);
    ++db.stats.inline_keys;
  } else if (new_malloc != old_malloc) {
    AccountObjectMemory(main_it.key(), OBJ_KEY, new_malloc - old_malloc, &db);
  }

  --db.stats.expire_count;
  return true;
}

bool DbSlice::SetMCFlag(DbIndex db_ind, const PrimeKey& key, uint32_t flag) {
  DCHECK(!key.IsRef());

  auto& db = *db_arr_[db_ind];
  string scratch;
  if (flag == 0 && !db.mcflag.Empty()) {
    auto mcit = db.mcflag.Find(key.GetSlice(&scratch));
    if (mcit != db.mcflag.end()) {
      db.mcflag.Erase(mcit);
      return true;
    }
  } else if (flag != 0) {
    auto [it, _] = db.mcflag.Insert(key.GetSlice(&scratch), flag);
    it->second = flag;
    return true;
  }
  return false;
}

uint32_t DbSlice::GetMCFlag(DbIndex db_ind, const PrimeKey& key) const {
  auto& db = *db_arr_[db_ind];
  string scratch;
  auto it = db.mcflag.Find(key.GetSlice(&scratch));
  if (it.is_done()) {
    LOG(DFATAL) << "Internal error, inconsistent state, mcflag should be present but not found "
                << key.ToString();
    return 0;
  }
  return it->second;
}

OpResult<DbSlice::ItAndUpdater> DbSlice::AddNew(const Context& cntx, string_view key,
                                                PrimeValue obj, uint64_t expire_at_ms) {
  auto op_result = AddOrUpdateInternal(cntx, key, std::move(obj), expire_at_ms, false);
  RETURN_ON_BAD_STATUS(op_result);
  auto& res = *op_result;
  CHECK(res.is_new);

  return DbSlice::ItAndUpdater{
      .it = res.it, .exp_it = res.exp_it, .post_updater = std::move(res.post_updater)};
}

int64_t DbSlice::ExpireParams::Cap(int64_t value, TimeUnit unit) {
  return unit == TimeUnit::SEC ? min(value, kMaxExpireDeadlineSec)
                               : min(value, kMaxExpireDeadlineMs);
}

pair<int64_t, int64_t> DbSlice::ExpireParams::Calculate(uint64_t now_ms, bool cap) const {
  if (persist)
    return {0, 0};

  // return a negative absolute time if we overflow.
  if (unit == TimeUnit::SEC && value > INT64_MAX / 1000) {
    return {0, -1};
  }

  int64_t msec = (unit == TimeUnit::SEC) ? value * 1000 : value;
  int64_t rel_msec = absolute ? msec - now_ms : msec;
  if (cap)
    rel_msec = Cap(rel_msec, TimeUnit::MSEC);
  return make_pair(rel_msec, now_ms + rel_msec);
}

OpResult<int64_t> DbSlice::UpdateExpire(const Context& cntx, Iterator prime_it,
                                        ExpIterator expire_it, const ExpireParams& params) {
  constexpr uint64_t kPersistValue = 0;
  DCHECK(params.IsDefined());
  DCHECK(IsValid(prime_it));

  if (params.persist) {  // Persist means remove expiry
    RemoveExpire(cntx.db_index, prime_it);
    return kPersistValue;
  }

  auto [rel_msec, abs_msec] = params.Calculate(cntx.time_now_ms, false);
  if (abs_msec < 0 || rel_msec > kMaxExpireDeadlineMs) {
    return OpStatus::OUT_OF_RANGE;
  }

  int64_t current_cmp = numeric_limits<int64_t>::max();  // inf if no expiry is set
  bool satisfied = params.expire_options == ExpireFlags::EXPIRE_ALWAYS;

  if (prime_it->first.HasExpire()) {
    current_cmp = prime_it->first.GetExpireTime();
    satisfied |= (params.expire_options & ExpireFlags::EXPIRE_XX);
  } else {
    satisfied |= (params.expire_options & ExpireFlags::EXPIRE_NX);
  }

  satisfied |= (params.expire_options & ExpireFlags::EXPIRE_LT) && (abs_msec < current_cmp);
  satisfied |= (params.expire_options & ExpireFlags::EXPIRE_GT) && (abs_msec > current_cmp);

  if (!satisfied)
    return OpStatus::SKIPPED;

  // If we update and the new value is already expired, delete the key
  if (rel_msec <= 0) {
    Del(cntx, prime_it);
    return -1;
  }

  AddExpire(cntx.db_index, prime_it, abs_msec);
  return abs_msec;
}

OpResult<DbSlice::ItAndUpdater> DbSlice::AddOrUpdateInternal(const Context& cntx,
                                                             std::string_view key, PrimeValue obj,
                                                             uint64_t expire_at_ms,
                                                             bool force_update) {
  DCHECK(!obj.IsRef());

  auto op_result = AddOrFind(cntx, key, std::nullopt);
  RETURN_ON_BAD_STATUS(op_result);

  auto& res = *op_result;
  if (!res.is_new && !force_update)  // have not inserted.
    return op_result;

  auto& it = res.it;

  it->second = std::move(obj);

  if (expire_at_ms) {
    AddExpire(cntx.db_index, it, expire_at_ms);
  } else {
    RemoveExpire(cntx.db_index, it);
  }

  return op_result;
}

OpResult<DbSlice::ItAndUpdater> DbSlice::AddOrUpdate(const Context& cntx, string_view key,
                                                     PrimeValue obj, uint64_t expire_at_ms) {
  return AddOrUpdateInternal(cntx, key, std::move(obj), expire_at_ms, true);
}

size_t DbSlice::DbSize(DbIndex db_ind) const {
  DCHECK_LT(db_ind, db_array_size());

  if (IsDbValid(db_ind)) {
    return db_arr_[db_ind]->prime.size();
  }
  return 0;
}

bool DbSlice::Acquire(IntentLock::Mode mode, const KeyLockArgs& lock_args) {
  if (lock_args.fps.empty()) {  // Can be empty for NO_KEY_TRANSACTIONAL commands.
    return true;
  }
  DCHECK_LT(lock_args.db_index, db_array_size());

  auto& lt = db_arr_[lock_args.db_index]->trans_locks;
  bool lock_acquired = true;

  if (lock_args.fps.size() == 1) {
    lock_acquired = lt.Acquire(lock_args.fps.front(), mode);
    uniq_fps_ = {lock_args.fps.front()};  // needed only for tests.
  } else {
    uniq_fps_.clear();

    for (LockFp fp : lock_args.fps) {
      if (uniq_fps_.insert(fp).second) {
        lock_acquired &= lt.Acquire(fp, mode);
      }
    }
  }

  DVLOG(2) << "Acquire " << IntentLock::ModeName(mode) << " for " << lock_args.fps[0]
           << " has_acquired: " << lock_acquired;

  return lock_acquired;
}

void DbSlice::Release(IntentLock::Mode mode, const KeyLockArgs& lock_args) {
  if (lock_args.fps.empty()) {  // Can be empty for NO_KEY_TRANSACTIONAL commands.
    return;
  }

  DVLOG(2) << "Release " << IntentLock::ModeName(mode) << " for " << lock_args.fps[0];
  auto& lt = db_arr_[lock_args.db_index]->trans_locks;
  if (lock_args.fps.size() == 1) {
    uint64_t fp = lock_args.fps.front();
    lt.Release(fp, mode);
  } else {
    uniq_fps_.clear();
    for (LockFp fp : lock_args.fps) {
      if (uniq_fps_.insert(fp).second) {
        lt.Release(fp, mode);
      }
    }
  }
  uniq_fps_.clear();
}

bool DbSlice::CheckLock(IntentLock::Mode mode, DbIndex dbid, uint64_t fp) const {
  const auto& lt = db_arr_[dbid]->trans_locks;
  auto lock = lt.Find(fp);
  if (lock) {
    return lock->Check(mode);
  }
  return true;
}

void DbSlice::PreUpdateBlocking(DbIndex db_ind, const Iterator& it) {
  CallChangeCallbacks(db_ind, ChangeReq{it.GetInnerIt()});  // blocking point.
  auto inner_it = it.GetInnerIt();                          // must call again to launder.
  inner_it.SetVersion(NextVersion());
}

void DbSlice::PostUpdate(DbIndex db_ind, std::string_view key) {
  auto& db = *db_arr_[db_ind];
  auto& watched_keys = db.watched_keys;
  if (!watched_keys.empty()) {
    // Check if the key is watched.
    if (auto wit = watched_keys.find(key); wit != watched_keys.end()) {
      for (auto* dirty_ptr : wit->second)
        dirty_ptr->store(true, memory_order_relaxed);
      // No connections need to watch it anymore.
      watched_keys.erase(wit);
    }
  }

  ++events_.update;

  if (db.slots_stats) {
    db.slots_stats[KeySlot(key)].total_writes += 1;
  }

  if (!client_tracking_map_.empty()) {
    QueueInvalidationTrackingMessageAtomic(key);
  }
}

DbSlice::ItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, Iterator it) const {
  auto res = ExpireIfNeeded(cntx, it.GetInnerIt());
  return {.it = Iterator::FromPrime(res.it), .exp_it = ExpIterator::FromPrime(res.exp_it)};
}

DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it) const {
  if (!it->first.HasExpire()) {
    LOG(DFATAL) << "Invalid call to ExpireIfNeeded";
    return {it, ExpireIterator{}};
  }

  int64_t expire_time = it->first.GetExpireTime();

  // Never do expiration on replica or if expiration is disabled.
  if (int64_t(cntx.time_now_ms) < expire_time || owner_->IsReplica() || !expire_allowed_) {
    return {it, ExpireIterator{}};
  }

  string scratch;
  string_view key = it->first.GetSlice(&scratch);

  // Replicate expiry
  if (auto journal = owner_->journal(); journal) {
    RecordExpiryBlocking(cntx.db_index, key);
  }

  auto& db = db_arr_[cntx.db_index];
  if (expired_keys_events_recording_)
    db->expired_keys_events_.emplace_back(key);

  auto obj_type = it->second.ObjType();
  if (doc_del_cb_ && (obj_type == OBJ_JSON || obj_type == OBJ_HASH)) {
    doc_del_cb_(key, cntx, it->second);
  }

  const_cast<DbSlice*>(this)->PerformDeletionAtomic(Iterator(it, StringOrView::FromView(key)),
                                                    ExpIterator{}, db.get());

  ++events_.expired_keys;
  db->stats.events.expired_keys++;

  return {PrimeIterator{}, ExpireIterator{}};
}

void DbSlice::ExpireAllIfNeeded() {
  // We hold no locks to any of the keys so we should Wait() here such that
  // we don't preempt in ExpireIfNeeded
  serialization_latch_.Wait();
  // Disable flush journal changes to prevent preemtion in traverse.
  journal::DisableFlushGuard journal_flush_guard(owner_->journal());

  for (DbIndex db_index = 0; db_index < db_arr_.size(); db_index++) {
    if (!db_arr_[db_index])
      continue;
    auto& db = *db_arr_[db_index];

    auto cb = [&](PrimeTable::iterator prime_it) {
      if (prime_it->first.HasExpire()) {
        ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it);
      }
    };

    PrimeTable::Cursor cursor;
    do {
      cursor = db.prime.Traverse(cursor, cb);
    } while (cursor);
  }
}

uint64_t DbSlice::RegisterOnChange(ChangeCallback cb) {
  return change_cb_.emplace_back(NextVersion(), std::move(cb)).first;
}

uint64_t DbSlice::RegisterOnMove(MovedCallback cb) {
  ++next_moved_id_;
  moved_cb_.emplace_back(next_moved_id_, cb);
  return next_moved_id_;
}

// Ordering invariant (PIT mode):
//   When the traversal fiber visits a bucket in BucketSaveCb, earlier-registered snapshots
//   (those with snapshot_version_ < this snapshot's version) may not have serialized this bucket
//   yet. FlushChangeToEarlierCallbacks invokes their OnDbChange callbacks so they serialize the
//   bucket before the current snapshot stamps it with its own version. Without this, an earlier
//   snapshot could miss the bucket entirely — its traversal already passed it, and the version
//   stamp from the current snapshot would cause the earlier snapshot's OnDbChange to skip it.
void DbSlice::FlushChangeToEarlierCallbacks(DbIndex db_ind, Iterator it, uint64_t upper_bound) {
  unique_lock<LocalLatch> lk(serialization_latch_);

  uint64_t bucket_version = it.GetVersion();
  // change_cb_ is ordered by version.
  DVLOG(2) << "Running callbacks in dbid " << db_ind << " with bucket_version=" << bucket_version
           << ", upper_bound=" << upper_bound;

  const size_t limit = change_cb_.size();
  auto ccb = change_cb_.begin();
  for (size_t i = 0; i < limit; ++i) {
    uint64_t cb_version = ccb->first;
    DCHECK_LE(cb_version, upper_bound);
    if (cb_version == upper_bound) {
      return;
    }
    if (bucket_version < cb_version) {
      ccb->second(db_ind, ChangeReq{it.GetInnerIt()});
    }
    ++ccb;
  }
}

//! Unregisters the callback.
void DbSlice::UnregisterOnChange(uint64_t id) {
  serialization_latch_.Wait();
  auto it = find_if(change_cb_.begin(), change_cb_.end(),
                    [id](const auto& cb) { return cb.first == id; });
  CHECK(it != change_cb_.end());
  change_cb_.erase(it);
}

void DbSlice::UnregisterOnMoved(uint64_t id) {
  serialization_latch_.Wait();
  auto it =
      find_if(moved_cb_.begin(), moved_cb_.end(), [id](const auto& cb) { return cb.first == id; });
  CHECK(it != moved_cb_.end());
  moved_cb_.erase(it);
}

auto DbSlice::DeleteExpiredStep(const Context& cntx, unsigned count) -> DeleteExpiredStats {
  auto& db = *db_arr_[cntx.db_index];
  DeleteExpiredStats result;

  std::string stash;

  unsigned checked = 0;
  auto cb = [&](PrimeTable::iterator it) {
    result.traversed++;

    if (!it->first.HasExpire())
      return;

    checked++;

    string_view key = it->first.GetSlice(&stash);
    if (!CheckLock(IntentLock::EXCLUSIVE, cntx.db_index, key))
      return;

    int64_t ttl = it->first.GetExpireTime() - cntx.time_now_ms;
    if (ttl <= 0) {
      result.deleted_bytes += it->first.MallocUsed() + it->second.MallocUsed();
      ExpireIfNeeded(cntx, it);
      ++result.deleted;
    }
  };

  unsigned i = 0;

  auto quota_remains = [] {
    // Break out of traversal if we spent more than 1ms
    return base::CycleClock::ToUsec(ThisFiber::GetRunningTimeCycles()) < 1000;
  };

  for (; i < count / 3 && quota_remains(); ++i) {
    db.expire_cursor = db.prime.Traverse(db.expire_cursor, cb);
  }

  // Continue traversing if we had a strong deletion rate among checked TTL keys.
  if (result.deleted * 4 > checked) {
    for (; i < count && quota_remains(); ++i) {
      db.expire_cursor = db.prime.Traverse(db.expire_cursor, cb);
    }
  }

  // Send and clear accumulated expired key events
  if (auto& events = db_arr_[cntx.db_index]->expired_keys_events_; !events.empty()) {
    ChannelStore* store = ServerState::tlocal()->channel_store();
    store->SendMessages(absl::StrCat("__keyevent@", cntx.db_index, "__:expired"), events, false);
    events.clear();
  }

  return result;
}

int32_t DbSlice::GetNextSegmentForEviction(int32_t segment_id, DbIndex db_ind) const {
  // wraps around if we reached the end
  return db_arr_[db_ind]->prime.NextSeg((size_t)segment_id) %
         db_arr_[db_ind]->prime.GetSegmentCount();
}

pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStepAtomic(DbIndex db_ind, const Context& cntx,
                                                              size_t starting_segment_id,
                                                              size_t increase_goal_bytes) {
  // Disable flush journal changes to prevent preemtion
  journal::DisableFlushGuard journal_flush_guard(shard_owner()->journal());
  FiberAtomicGuard guard;
  DCHECK(!owner_->IsReplica());

  size_t evicted_items = 0, evicted_bytes = 0;

  if (owner_->tiered_storage()) {
    evicted_bytes = owner_->tiered_storage()->ReclaimMemory(increase_goal_bytes);
    if (evicted_bytes >= increase_goal_bytes)
      return {0, evicted_bytes};
  }

  if ((!IsCacheMode()) || !expire_allowed_)
    return {0, 0};

  auto max_eviction_per_hb = GetFlag(FLAGS_max_eviction_per_heartbeat);
  auto max_segment_to_consider = GetFlag(FLAGS_max_segment_to_consider);

  auto time_start = absl::GetCurrentTimeNanos();
  auto& db_table = db_arr_[db_ind];
  constexpr int32_t num_slots = PrimeTable::Segment_t::kSlotNum;

  string tmp;

  bool record_keys = owner_->journal() || expired_keys_events_recording_;
  vector<string> keys_to_journal;

  for (int32_t slot_id = num_slots - 1; slot_id >= 0; --slot_id) {
    for (int32_t bucket_id = PrimeTable::LargestBucketId(); bucket_id >= 0; --bucket_id) {
      // pick a random segment to start with in each eviction,
      // as segment_id does not imply any recency, and random selection should be fair enough
      int32_t segment_id = starting_segment_id;
      for (size_t num_seg_visited = 0; num_seg_visited < max_segment_to_consider;
           ++num_seg_visited, segment_id = GetNextSegmentForEviction(segment_id, db_ind)) {
        const auto& segment = db_table->prime.GetSegment(segment_id);
        if (unsigned(bucket_id) >= segment->num_buckets())
          bucket_id = segment->num_buckets() - 1;
        const auto& bucket = segment->GetBucket(bucket_id);
        if (bucket.IsEmpty() || !bucket.IsBusy(slot_id))
          continue;

        auto evict_it = db_table->prime.GetIterator(segment_id, bucket_id, slot_id);
        // TODO: consider evicting inline entries as well

        bool has_allocated = evict_it->second.HasAllocated() || evict_it->first.HasAllocated();
        if (evict_it->first.IsSticky() || !has_allocated)
          continue;

        // check if the key is locked by looking up transaction table.
        const auto& lt = db_table->trans_locks;
        string_view key = evict_it->first.GetSlice(&tmp);
        if (lt.Find(LockTag(key)).has_value())
          continue;

        if (record_keys)
          keys_to_journal.emplace_back(key);

        evicted_bytes += evict_it->first.MallocUsed() + evict_it->second.MallocUsed();
        ++evicted_items;

        Del(cntx, Iterator(evict_it, StringOrView::FromView(key)));

        // returns when whichever condition is met first
        if ((evicted_items == max_eviction_per_hb) || (evicted_bytes >= increase_goal_bytes))
          goto finish;
      }
    }
  }

finish:
  // send the deletion to the replicas.
  for (string_view key : keys_to_journal) {
    if (auto journal = owner_->journal(); journal)
      // Won't block because we disabled journal flushing. See first line of this function.
      RecordExpiryBlocking(db_ind, key);

    if (expired_keys_events_recording_)
      db_table->expired_keys_events_.emplace_back(key);
  }

  // This might not always be atomic on exceptional cases -- see comments on the function
  // declaration.
  SendQueuedInvalidationMessagesAsync();
  auto time_finish = absl::GetCurrentTimeNanos();
  events_.evicted_keys += evicted_items;
  db_arr_[db_ind]->stats.events.evicted_keys += evicted_items;
  DVLOG(2) << "Eviction time (us): " << (time_finish - time_start) / 1000;
  return pair<uint64_t, size_t>{evicted_items, evicted_bytes};
}

void DbSlice::CreateDb(DbIndex db_ind) {
  auto& db = db_arr_[db_ind];
  if (!db) {
    db.reset(new DbTable{owner_->memory_resource(), db_ind});
    table_memory_ += db->table_memory();
  }
}

void DbSlice::RegisterWatchedKey(DbIndex db_indx, std::string_view key,
                                 std::atomic_bool* dirty_ptr) {
  // Because we might insert while another fiber is preempted
  db_arr_[db_indx]->watched_keys[key].push_back(dirty_ptr);
}

void DbSlice::UnregisterConnectionWatches(absl::Span<const std::pair<DbIndex, std::string>> keys,
                                          const std::atomic_bool* dirty_ptr) {
  for (const auto& [db_indx, key] : keys) {
    auto& watched_keys = db_arr_[db_indx]->watched_keys;
    if (auto it = watched_keys.find(key); it != watched_keys.end()) {
      it->second.erase(std::remove(it->second.begin(), it->second.end(), dirty_ptr),
                       it->second.end());
      if (it->second.empty())
        watched_keys.erase(it);
    }
  }
}

void DbSlice::InvalidateDbWatches(DbIndex db_indx) {
  for (const auto& [key, conn_list] : db_arr_[db_indx]->watched_keys) {
    for (auto* dirty_ptr : conn_list)
      dirty_ptr->store(true, memory_order_relaxed);
  }
}

void DbSlice::InvalidateSlotWatches(const cluster::SlotSet& slot_ids) {
  for (const auto& [key, conn_list] : db_arr_[0]->watched_keys) {
    SlotId sid = KeySlot(key);
    if (!slot_ids.Contains(sid)) {
      continue;
    }
    for (auto* dirty_ptr : conn_list)
      dirty_ptr->store(true, memory_order_relaxed);
  }
}

void DbSlice::RemoveOffloadedEntriesFromTieredStorage(absl::Span<const DbIndex> indices,
                                                      const DbTableArray& db_arr) const {
  // Currently being used only for tiered storage.
  TieredStorage* tiered_storage = shard_owner()->tiered_storage();
  string scratch;
  for (DbIndex index : indices) {
    const auto& db_ptr = db_arr[index];
    if (!db_ptr)
      continue;

    // Delete all tiered entries
    PrimeTable::Cursor cursor;
    do {
      cursor = db_ptr->prime.Traverse(cursor, [&](PrimeIterator it) {
        if (it->second.IsExternal()) {
          tiered_storage->Delete(index, &it->second);
        } else if (it->second.HasStashPending()) {
          tiered_storage->CancelStash(index, it->first.GetSlice(&scratch), &it->second);
        }
      });
    } while (cursor);

    // While tiered_storage may delete some of its entries asynchronously, it updates
    // stats.tiered_entries immediately during the Delete call, therefore tiered_entries
    // should be zero by this point.
    CHECK_EQ(db_ptr->stats.tiered_entries, 0u);
  }
}

void DbSlice::SetDocDeletionCallback(DocDeletionCallback ddcb) {
  doc_del_cb_ = std::move(ddcb);
}

void DbSlice::ResetUpdateEvents() {
  events_.update = 0;
}

void DbSlice::ResetEvents() {
  events_ = {};
  for (auto& db : db_arr_) {
    if (db) {
      db->stats.events = {};
    }
  }
}

void DbSlice::SetNotifyKeyspaceEvents(std::string_view notify_keyspace_events) {
  expired_keys_events_recording_ = !notify_keyspace_events.empty();
}

void DbSlice::QueueInvalidationTrackingMessageAtomic(std::string_view key) {
  FiberAtomicGuard guard;
  auto it = client_tracking_map_.find(key);
  if (it == client_tracking_map_.end()) {
    return;
  }

  ConnectionHashSet moved_set = std::move(it->second);
  client_tracking_map_.erase(it);

  auto [pend_it, inserted] = pending_send_map_.emplace(key, std::move(moved_set));
  if (!inserted) {
    ConnectionHashSet& client_set = pend_it->second;
    for (auto& weak_ref : moved_set) {
      client_set.insert(weak_ref);
    }
  }
}

void DbSlice::SendQueuedInvalidationMessagesCb(const TrackingMap& track_map,
                                               unsigned calling_thread_id) const {
  for (auto& [key, client_list] : track_map) {
    for (auto& weak_ref : client_list) {
      if (weak_ref.IsExpired() || (weak_ref.LastKnownThreadId() != calling_thread_id)) {
        continue;  // Expired or migrated.
      }
      auto* conn = weak_ref.Get();
      auto* cntx = static_cast<ConnectionContext*>(conn->cntx());
      if (cntx && cntx->conn_state.tracking_info_.IsTrackingOn()) {
        conn->SendInvalidationMessageAsync({key});
      }
    }
  }
}

void DbSlice::SendQueuedInvalidationMessages() {
  // We run while loop because when we block below, we might have new items added to
  // pending_send_map_.
  while (!pending_send_map_.empty()) {
    // Notify all the clients. this function is not efficient,
    // because it broadcasts to all threads unrelated to the subscribers for the key.
    auto local_map = std::move(pending_send_map_);
    pending_send_map_ = {};
    auto cb = [&](unsigned thread_id, util::ProactorBase*) {
      SendQueuedInvalidationMessagesCb(local_map, thread_id);
    };

    shard_set->pool()->AwaitBrief(std::move(cb));
  }
}

// This function might preempt if the task queue within DispatchBrief is full and we can't
// enqueue the callback. Although a rare case, this code might not be atomic.
void DbSlice::SendQueuedInvalidationMessagesAsync() {
  if (pending_send_map_.empty()) {
    return;
  }
  // DispatchBrief will copy local_map
  auto cb = [lm = std::move(pending_send_map_), this](unsigned idx, util::ProactorBase*) {
    SendQueuedInvalidationMessagesCb(lm, idx);
  };

  shard_set->pool()->DispatchBrief(std::move(cb));
}

void DbSlice::StartSampleTopK(DbIndex db_ind, uint32_t min_freq) {
  auto& db = *db_arr_[db_ind];
  if (db.sample_top_keys) {
    LOG(INFO) << "Sampling already started for db " << db_ind;
    return;
  }

  TopKeys::Options opts;
  opts.min_key_count_to_record = min_freq;
  db.sample_top_keys = new DbTable::SampleTopKeys;
  db.sample_top_keys->top_keys = new TopKeys(opts);
}

auto DbSlice::StopSampleTopK(DbIndex db_ind) -> SamplingResult {
  auto& db = *db_arr_[db_ind];

  if (!db.sample_top_keys) {
    LOG(WARNING) << "Sampling not started for db " << db_ind;
    return {};
  }

  auto fmap = db.sample_top_keys->top_keys->GetTopKeys();
  SamplingResult result;
  result.total_samples = db.sample_top_keys->total_samples;
  delete db.sample_top_keys;
  db.sample_top_keys = nullptr;

  result.top_keys.reserve(fmap.size());
  while (!fmap.empty()) {
    auto node = fmap.extract(fmap.begin());  // Clear the map to avoid memory leak.
    result.top_keys.emplace_back(std::move(node.key()), node.mapped());
  }
  return result;
}

void DbSlice::StartSampleKeys(DbIndex db_ind) {
  auto& db = *db_arr_[db_ind];
  if (db.sample_unique_keys) {
    LOG(INFO) << "Sampling already started for db " << db_ind;
    return;
  }

  HllBufferPtr hll_buf;
  hll_buf.size = getDenseHllSize();
  hll_buf.hll = new uint8_t[hll_buf.size];
  CHECK_EQ(0, createDenseHll(hll_buf));
  db.sample_unique_keys = new DbTable::SampleUniqueKeys;
  db.sample_unique_keys->dense_hll = hll_buf.hll;
}

// Returns number of unique keys sampled.
auto DbSlice::StopSampleKeys(DbIndex db_ind) -> UniqueSampleResult {
  auto& db = *db_arr_[db_ind];
  if (!db.sample_unique_keys) {
    LOG(INFO) << "Keys sampling not started for db " << db_ind;
    return {};
  }
  HllBufferPtr hll_buf;
  hll_buf.hll = db.sample_unique_keys->dense_hll;
  hll_buf.size = getDenseHllSize();
  UniqueSampleResult result;
  result.unique_keys_count = pfcountSingle(hll_buf);
  result.total_samples = db.sample_unique_keys->total_samples;

  delete db.sample_unique_keys;
  db.sample_unique_keys = nullptr;

  return result;
}

void DbSlice::StartSampleValues(DbIndex db_ind) {
  auto& db = *db_arr_[db_ind];
  if (db.sample_values_hist) {
    LOG(INFO) << "Sampling already started for db " << db_ind;
    return;
  }

  db.sample_values_hist = new base::Histogram();
}

unique_ptr<base::Histogram> DbSlice::StopSampleValues(DbIndex db_ind) {
  auto& db = *db_arr_[db_ind];
  if (!db.sample_values_hist) {
    LOG(INFO) << "Values sampling not started for db " << db_ind;
    return {};
  }

  return unique_ptr<base::Histogram>{exchange(db.sample_values_hist, nullptr)};
}

void DbSlice::PerformDeletionAtomic(const Iterator& del_it, const ExpIterator& exp_it,
                                    DbTable* table, bool async) {
  FiberAtomicGuard guard;
  size_t table_before = table->table_memory();

  if (del_it->second.HasFlag()) {
    if (!SetMCFlag(table->index, del_it->first, 0)) {
      LOG(DFATAL) << "Internal error, inconsistent state, mcflag should be present but not found "
                  << del_it->first.ToString();
    }
  }

  DbTableStats& stats = table->stats;

  if (del_it->first.HasExpire())
    --stats.expire_count;

  PrimeValue& pv = del_it->second;

  if (pv.HasStashPending()) {
    string scratch;
    string_view key = del_it->first.GetSlice(&scratch);
    shard_owner()->tiered_storage()->CancelStash(table->index, key, &pv);
  } else if (pv.IsExternal()) {
    shard_owner()->tiered_storage()->Delete(table->index, &del_it->second);
  }

  ssize_t value_heap_size = pv.MallocUsed(), key_size_used = del_it->first.MallocUsed();
  if (del_it->first.IsInline()) {
    --stats.inline_keys;
  } else {
    AccountObjectMemory(del_it.key(), OBJ_KEY, -key_size_used, table);  // Key
  }
  AccountObjectMemory(del_it.key(), pv.ObjType(), -value_heap_size, table);  // Value

  if (async && MayDeleteAsynchronously(pv)) {
    DenseSet* ds = (DenseSet*)pv.RObjPtr();
    pv.SetRObjPtr(nullptr);
    const size_t kClearStepSize = 512;

    uint32_t next = ds->ClearStep(0, kClearStepSize);
    if (next < ds->BucketCount()) {
      AsyncDeleter::EnqueDeletion(next, ds);
    } else {
      CompactObj::DeleteMR<DenseSet>(ds);
    }
  }

  if (table->slots_stats) {
    SlotId sid = KeySlot(del_it.key());
    table->slots_stats[sid].key_count -= 1;
  }

  table->prime.Erase(del_it.GetInnerIt());

  // Note, currently we do not shrink our tables upon deletion.
  // This DCHECK ensures that if we decide to do so, we will have to update table_memory_
  // accordingly.
  DCHECK_EQ(table->table_memory(), table_before);

  --entries_count_;
  memory_budget_ += (value_heap_size + key_size_used);

  if (!client_tracking_map_.empty()) {
    QueueInvalidationTrackingMessageAtomic(del_it.key());
  }
}

void DbSlice::OnCbFinishBlocking() {
  if (IsCacheMode()) {
    // move fetched items to local variable
    auto fetched_items = std::move(fetched_items_);
    fetched_items_ = {};
    for (const auto& [key_hash, db_index] : fetched_items) {
      auto& db = *db_arr_[db_index];

      // We intentionally don't do extra key checking on this callback to speedup
      // fetching. Probability of having hash collision is quite low and for bumpup
      // purposes it should be fine if different key (with same hash) is returned.
      auto predicate = [](const PrimeKey&) { return true; };

      PrimeIterator it = db.prime.FindFirst(key_hash, predicate);

      if (!IsValid(it)) {
        continue;
      }

      if (!change_cb_.empty()) {
        auto bump_cb = [&](PrimeTable::bucket_iterator bit) {
          CallChangeCallbacks(db_index, ChangeReq{bit});
        };
        db.prime.CVCUponBump(change_cb_.back().first, it, bump_cb);
      }

      // We must not change the bucket's internal order during serialization
      serialization_latch_.Wait();
      PrimeBumpPolicy policy;
      auto bump_it = db.prime.BumpUp(it, policy);
      if (bump_it != it) {  // the item was bumped
        ++events_.bumpups;
      }
      CallMovedCallbacks(db_index, policy.moved_items());
    }
  }

  // Sends only if !pending_send_map_.empty()
  SendQueuedInvalidationMessages();
}

void DbSlice::CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const {
  if (change_cb_.empty())
    return;

  // does not preempt, just increments the counter.
  unique_lock<LocalLatch> lk(serialization_latch_);

  const size_t limit = change_cb_.size();
  auto ccb = change_cb_.begin();
  for (size_t i = 0; i < limit; ++i) {
    CHECK(ccb->second);
    ccb->second(id, cr);
    ++ccb;
  }
}

void DbSlice::CallMovedCallbacks(
    DbIndex id, const std::vector<std::pair<PrimeTable::Cursor, PrimeTable::Cursor>>& moved_items) {
  if (moved_cb_.empty())
    return;

  // does not preempt, just increments the counter.
  unique_lock<LocalLatch> lk(serialization_latch_);

  const size_t limit = moved_cb_.size();
  auto ccb = moved_cb_.begin();
  for (size_t i = 0; i < limit; ++i) {
    CHECK(ccb->second);
    ccb->second(id, moved_items);
    ++ccb;
  }
}

}  // namespace dfly


================================================
FILE: src/server/db_slice.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include <atomic>

#include "common/string_or_view.h"
#include "core/mi_memory_resource.h"
#include "facade/connection_ref.h"
#include "facade/op_status.h"
#include "server/common.h"
#include "server/common_types.h"
#include "server/synchronization.h"
#include "server/table.h"
#include "server/tx_base.h"
#include "util/fibers/fibers.h"
#include "util/fibers/synchronization.h"

namespace dfly {

namespace cluster {
class SlotRanges;
class SlotSet;
}  // namespace cluster

using facade::OpResult;

struct DbStats : public DbTableStats {
  // number of active keys.
  size_t key_count = 0;

  // total number of slots in prime dictionary (key capacity).
  size_t prime_capacity = 0;

  // Memory used by dictionaries.
  size_t table_mem_usage = 0;

  // We override additional DbStats fields explicitly in DbSlice::GetStats().
  using DbTableStats::operator=;

  DbStats& operator+=(const DbStats& o);
};

struct SliceEvents {
  // Number of eviction events.
  size_t evicted_keys = 0;

  // evictions that were performed when we have a negative memory budget.
  size_t hard_evictions = 0;
  size_t expired_keys = 0;
  size_t garbage_checked = 0;
  size_t garbage_collected = 0;
  size_t stash_unloaded = 0;
  size_t bumpups = 0;  // how many bump-upds we did.

  // hits/misses on keys
  size_t hits = 0;
  size_t misses = 0;
  size_t mutations = 0;

  // ram hit/miss when tiering is enabled
  size_t ram_hits = 0;
  size_t ram_cool_hits = 0;
  size_t ram_misses = 0;

  // how many insertions were rejected due to OOM.
  size_t insertion_rejections = 0;

  // how many updates and insertions of keys between snapshot intervals
  size_t update = 0;

  uint64_t huff_encode_total = 0, huff_encode_success = 0;

  SliceEvents& operator+=(const SliceEvents& o);
};

class DbSlice {
  DbSlice(const DbSlice&) = delete;
  void operator=(const DbSlice&) = delete;

 public:
  // Auto-laundering iterator wrapper. Laundering means re-finding keys if they moved between
  // buckets.
  template <typename T> class IteratorT {
   public:
    IteratorT() = default;

    IteratorT(T it, StringOrView key)
        : it_(it), fiber_epoch_(util::fb2::FiberSwitchEpoch()), key_(std::move(key)) {
    }

    static IteratorT FromPrime(T it) {
      if (!IsValid(it)) {
        return IteratorT();
      }

      std::string key;
      it->first.GetString(&key);
      return IteratorT(it, StringOrView::FromString(std::move(key)));
    }

    IteratorT(const IteratorT& o) = default;
    IteratorT(IteratorT&& o) = default;
    IteratorT& operator=(const IteratorT& o) = default;
    IteratorT& operator=(IteratorT&& o) = default;

    // Do NOT store this iterator in a variable, as it will not be laundered automatically.
    const T& GetInnerIt() const {
      LaunderIfNeeded();
      return it_;
    }

    auto operator->() const {
      return GetInnerIt().operator->();
    }

    auto is_done() const {
      return GetInnerIt().is_done();
    }

    std::string_view key() const {
      return key_.view();
    }

    auto IsOccupied() const {
      return GetInnerIt().IsOccupied();
    }

    auto GetVersion() const {
      return GetInnerIt().GetVersion();
    }

   private:
    void LaunderIfNeeded() const;  // const is a lie

    mutable T it_;
    mutable uint64_t fiber_epoch_ = 0;
    StringOrView key_;
  };

  using Iterator = IteratorT<PrimeIterator>;
  using ConstIterator = IteratorT<PrimeConstIterator>;
  using ExpIterator = IteratorT<ExpireIterator>;
  using ExpConstIterator = IteratorT<ExpireConstIterator>;

  class AutoUpdater {
   public:
    AutoUpdater();
    AutoUpdater(const AutoUpdater& o) = delete;
    AutoUpdater& operator=(const AutoUpdater& o) = delete;
    AutoUpdater(AutoUpdater&& o) noexcept;
    AutoUpdater& operator=(AutoUpdater&& o) noexcept;
    ~AutoUpdater();

    // Removes the memory usage attributed to the iterator and resets orig_heap_size.
    // Used when the existing object is overridden by a new one.
    void ReduceHeapUsage();

    void Run();
    void Cancel();

   private:
    // Wrap members in a struct to auto generate operator=
    struct Fields {
      DbSlice* db_slice = nullptr;
      DbIndex db_ind = 0;

      // TODO: remove `it` from ItAndUpdater as it's redundant with respect to this iterator.
      Iterator it;
      std::string_view key;

      // The following fields are calculated at init time
      size_t orig_value_heap_size = 0;
    };

    AutoUpdater(DbIndex db_ind, std::string_view key, const Iterator& it, DbSlice* db_slice);

    friend class DbSlice;

    Fields fields_ = {};
  };

  struct Stats {
    // DbStats db;
    std::vector<DbStats> db_stats;
    SliceEvents events;
    size_t small_string_bytes = 0;
  };

  using Context = DbContext;
  using ChangeReq = dfly::ChangeReq;

  // Called before deleting an element to notify the search indices.
  using DocDeletionCallback =
      std::function<void(std::string_view, const Context&, const PrimeValue& pv)>;

  struct ExpireParams {
    bool IsDefined() const {
      return persist || value > INT64_MIN;
    }

    static int64_t Cap(int64_t value, TimeUnit unit);

    // Calculate relative and absolue timepoints.
    std::pair<int64_t, int64_t> Calculate(uint64_t now_msec, bool cap) const;

    // Return true if relative expiration is in the past
    bool IsExpired(uint64_t now_msec) const {
      return Calculate(now_msec, false).first < 0;
    }

   public:
    int64_t value = INT64_MIN;  // undefined
    TimeUnit unit = TimeUnit::SEC;

    bool absolute = false;
    bool persist = false;        // persist means remove all expiry
    int32_t expire_options = 0;  // ExpireFlags
  };

  DbSlice(uint32_t index, bool cache_mode, EngineShard* owner);
  ~DbSlice();

  // Activates `db_ind` database if it does not exist (see ActivateDb below).
  void Reserve(DbIndex db_ind, size_t key_size);

  // Returns statistics for the whole db slice. A bit heavy operation.
  Stats GetStats() const;

  // Returns slot statistics for db 0.
  SlotStats GetSlotStats(SlotId sid) const;

  void UpdateExpireBase(uint64_t now, unsigned generation) {
    expire_base_[generation & 1] = now;
  }

  void UpdateMemoryParams(int64_t budget, size_t bytes_per_object) {
    memory_budget_ = budget;
    bytes_per_object_ = bytes_per_object;
  }

  ssize_t memory_budget() const {
    return memory_budget_;
  }

  size_t bytes_per_object() const {
    return bytes_per_object_;
  }

  int64_t ExpireTime(const ExpirePeriod& val) const {
    return expire_base_[0] + val.duration_ms();
  }

  ExpirePeriod FromAbsoluteTime(uint64_t time_ms) const {
    return ExpirePeriod{time_ms - expire_base_[0]};
  }

  struct ItAndUpdater {
    Iterator it;
    ExpIterator exp_it;
    AutoUpdater post_updater;
    bool is_new = false;
  };

  ItAndUpdater FindMutable(const Context& cntx, std::string_view key);
  OpResult<ItAndUpdater> FindMutable(const Context& cntx, std::string_view key,
                                     unsigned req_obj_type);

  struct ItAndExpConst {
    ConstIterator it;
    ExpConstIterator exp_it;
  };

  ItAndExpConst FindReadOnly(const Context& cntx, std::string_view key) const;
  OpResult<ConstIterator> FindReadOnly(const Context& cntx, std::string_view key,
                                       unsigned req_obj_type) const;

  // Consider using req_obj_type to specify the type of object you expect.
  // Because it can evaluate to bugs like this:
  // - We already have a key but with another type you expect.
  // - During FindMutable we will not use req_obj_type, so the object type will not be checked.
  // - AddOrFind will return the object with this key but with a different type.
  // - Then you will update this object with a different type, which will lead to an error.
  // If you proved the key type on your own, please add a comment there why don't specify
  // req_obj_type
  OpResult<ItAndUpdater> AddOrFind(const Context& cntx, std::string_view key,
                                   std::optional<unsigned> req_obj_type);

  // Same as AddOrSkip, but overwrites in case entry exists.
  OpResult<ItAndUpdater> AddOrUpdate(const Context& cntx, std::string_view key, PrimeValue obj,
                                     uint64_t expire_at_ms);

  // Adds a new entry. Requires: key does not exist in this slice.
  // Returns the iterator to the newly added entry.
  // Returns OpStatus::OUT_OF_MEMORY if bad_alloc is thrown
  OpResult<ItAndUpdater> AddNew(const Context& cntx, std::string_view key, PrimeValue obj,
                                uint64_t expire_at_ms);

  // Update entry expiration. Return epxiration timepoint in abs milliseconds, or -1 if the entry
  // already expired and was deleted;
  facade::OpResult<int64_t> UpdateExpire(const Context& cntx, Iterator prime_it, ExpIterator exp_it,
                                         const ExpireParams& params);

  // Adds expiry on a key. If the key already has expiry, updates it.
  void AddExpire(DbIndex db_ind, const Iterator& main_it, uint64_t at);

  // Removes expiry from a key. Returns true if expiry existed and was removed.
  bool RemoveExpire(DbIndex db_ind, const Iterator& main_it);

  // Returns false if no action was taken, true if the mc flag was set or removed.
  bool SetMCFlag(DbIndex db_ind, const PrimeKey& key, uint32_t flag);

  uint32_t GetMCFlag(DbIndex db_ind, const PrimeKey& key) const;

  // Creates a database with index `db_ind`. If such database exists does nothing.
  void ActivateDb(DbIndex db_ind);

  // Deletes the iterator. The iterator must be valid.
  // Context argument is used only for document removal and it just needs
  // timestamp field. Last argument, db_table, is optional and is used only in FlushSlotsCb.
  // If async is set, AsyncDeleter will enqueue deletion of the object
  void Del(Context cntx, Iterator it, DbTable* db_table = nullptr, bool async = false);

  // Deletes a key after FindMutable(). Runs post_updater before deletion
  // to update memory accounting while the key is still valid.
  // Takes ownership of it_updater (pass by value with move semantics).
  void DelMutable(Context cntx, ItAndUpdater it_updater);

  constexpr static DbIndex kDbAll = 0xFFFF;

  // Flushes db_ind or all databases if kDbAll is passed
  util::fb2::Fiber FlushDb(DbIndex db_ind);

  // Flushes the data of given slot ranges.
  void FlushSlots(const cluster::SlotRanges& slot_ranges);

  EngineShard* shard_owner() const {
    return owner_;
  }

  ShardId shard_id() const {
    return shard_id_;
  }

  void OnCbFinishBlocking();

  bool Acquire(IntentLock::Mode m, const KeyLockArgs& lock_args);
  void Release(IntentLock::Mode m, const KeyLockArgs& lock_args);

  // Returns true if the key can be locked under m. Does not lock.
  bool CheckLock(IntentLock::Mode mode, DbIndex dbid, uint64_t fp) const;
  bool CheckLock(IntentLock::Mode mode, DbIndex dbid, std::string_view key) const {
    return CheckLock(mode, dbid, LockTag(key).Fingerprint());
  }

  size_t db_array_size() const {
    return db_arr_.size();
  }

  bool IsDbValid(DbIndex id) const {
    return id < db_arr_.size() && bool(db_arr_[id]);
  }

  auto CopyDBTablePtr(DbIndex id) {
    return db_arr_[id];
  }

  DbTable* GetDBTable(DbIndex id) {
    return db_arr_[id].get();
  }

  const DbTable* GetDBTable(DbIndex id) const {
    return db_arr_[id].get();
  }

  std::pair<PrimeTable*, ExpireTable*> GetTables(DbIndex id) {
    return std::pair<PrimeTable*, ExpireTable*>(&db_arr_[id]->prime, nullptr);
  }

  // Returns existing keys count in the db.
  size_t DbSize(DbIndex db_ind) const;

  DbTableStats* MutableStats(DbIndex db_ind) {
    return &db_arr_[db_ind]->stats;
  }

  // Check whether 'it' has not expired. Returns it if it's still valid. Otherwise, erases it
  // from both tables and return Iterator{}.
  struct ItAndExp {
    Iterator it;
    ExpIterator exp_it;
  };
  ItAndExp ExpireIfNeeded(const Context& cntx, Iterator it) const;

  // Iterate over all expire table entries and delete expired.
  void ExpireAllIfNeeded();

  // Current version of this slice.
  // We maintain a shared versioning scheme for all databases in the slice.
  uint64_t version() const {
    return version_;
  }

  size_t table_memory() const {
    return table_memory_;
  }

  size_t entries_count() const {
    return entries_count_;
  }

  using ChangeCallback = std::function<void(DbIndex, const ChangeReq&)>;
  // Holds pairs of source and destination cursors for items moved in the dash table
  using MovedItemsVec = std::vector<std::pair<PrimeTable::Cursor, PrimeTable::Cursor>>;
  using MovedCallback = std::function<void(DbIndex, const MovedItemsVec&)>;

  //! Registers the callback to be called for each change.
  //! Returns the registration id which is also the unique version of the dbslice
  //! at a time of the call.
  uint64_t RegisterOnChange(ChangeCallback cb);

  //! Registers the callback to be called after items are moved in table.
  //! Returns the registration id which is also the unique version of the dbslice
  //! at a time of the call.
  uint64_t RegisterOnMove(MovedCallback cb);

  bool HasRegisteredCallbacks() const {
    return !change_cb_.empty();
  }

  // Call registered callbacks with version less than upper_bound.
  void FlushChangeToEarlierCallbacks(DbIndex db_ind, Iterator it, uint64_t upper_bound);

  //! Unregisters the callback.
  void UnregisterOnChange(uint64_t id);

  void UnregisterOnMoved(uint64_t id);

  struct DeleteExpiredStats {
    uint32_t deleted = 0;        // number of deleted items due to expiry.
    uint32_t deleted_bytes = 0;  // total bytes of deleted items.
    uint32_t traversed = 0;      // total number of traversed entries in the prime table.
  };

  // Deletes some amount of possible expired items.
  DeleteExpiredStats DeleteExpiredStep(const Context& cntx, unsigned count);

  // Evicts items with dynamically allocated data from the primary table.
  // Does not shrink tables.
  // Returns number of (elements,bytes) freed due to evictions.
  std::pair<uint64_t, size_t> FreeMemWithEvictionStepAtomic(DbIndex db_indx, const Context& cntx,
                                                            size_t starting_segment_id,
                                                            size_t increase_goal_bytes);

  int32_t GetNextSegmentForEviction(int32_t segment_id, DbIndex db_ind) const;

  const DbTableArray& databases() const {
    return db_arr_;
  }

  void TEST_EnableCacheMode() {
    cache_mode_ = 1;
  }

  bool IsCacheMode() const {
    // During loading time we never bump elements.
    return cache_mode_ && (load_ref_count_ == 0);
  }

  void IncrLoadInProgress() {
    ++load_ref_count_;
  }

  void DecrLoadInProgress() {
    --load_ref_count_;
  }

  bool IsLoadRefCountZero() const {
    return load_ref_count_ == 0;
  }

  // Test hook to inspect last locked keys.
  const auto& TEST_GetLastLockedFps() const {
    return uniq_fps_;
  }

  // Register key to be watched - when touched, set dirty_ptr to true
  void RegisterWatchedKey(DbIndex db_indx, std::string_view key, std::atomic_bool* dirty_ptr);

  // Unregisted all watched key for given dirty_ptr
  void UnregisterConnectionWatches(absl::Span<const std::pair<DbIndex, std::string>> keys,
                                   const std::atomic_bool* dirty_ptr);

  void SetDocDeletionCallback(DocDeletionCallback ddcb);

  // Resets the event counter for updates/insertions
  void ResetUpdateEvents();

  // Resets events_ member. Used by CONFIG RESETSTAT
  void ResetEvents();

  // Controls the expiry/eviction state. The server may enter states where
  // Both evictions and expiries will be stopped for a short period of time.
  void SetExpireAllowed(bool is_allowed) {
    expire_allowed_ = is_allowed;
  }

  // Track keys for the client represented by the the weak reference to its connection.
  void TrackKey(const facade::ConnectionRef& conn_ref, std::string_view key) {
    client_tracking_map_[key].insert(conn_ref);
  }

  // Does not check for non supported events. Callers must parse the string and reject it
  // if it's not empty and not EX.
  void SetNotifyKeyspaceEvents(std::string_view notify_keyspace_events);

  bool WillBlockOnJournalWrite() const {
    return serialization_latch_.IsBlocked();
  }

  LocalLatch* GetLatch() {
    return &serialization_latch_;
  }

  void StartSampleTopK(DbIndex db_ind, uint32_t min_freq);

  struct SamplingResult {
    std::vector<std::pair<std::string, uint64_t>> top_keys;  // key -> frequency pairs.
    uint64_t total_samples = 0;                              // Total number of keys sampled.
  };
  SamplingResult StopSampleTopK(DbIndex db_ind);

  void StartSampleKeys(DbIndex db_ind);

  // Returns number of unique keys sampled.
  struct UniqueSampleResult {
    uint64_t unique_keys_count = 0;  // Number of unique keys sampled.
    uint64_t total_samples = 0;      // Total number of keys sampled.
  };
  UniqueSampleResult StopSampleKeys(DbIndex db_ind);

  void StartSampleValues(DbIndex db_ind);

  // Returns a histogram of sampled values.
  std::unique_ptr<base::Histogram> StopSampleValues(DbIndex db_ind);

 private:
  void PreUpdateBlocking(DbIndex db_ind, const Iterator& it);
  void PostUpdate(DbIndex db_ind, std::string_view key);

  OpResult<ItAndUpdater> AddOrUpdateInternal(const Context& cntx, std::string_view key,
                                             PrimeValue obj, uint64_t expire_at_ms,
                                             bool force_update);

  void FlushSlotsFb(const cluster::SlotSet& slot_ids);
  util::fb2::Fiber FlushDbIndexes(const std::vector<DbIndex>& indexes);

  // Invalidate all watched keys in database. Used on FLUSH.
  void InvalidateDbWatches(DbIndex db_indx);

  // Invalidate all watched keys for given slots. Used on FlushSlots.
  void InvalidateSlotWatches(const cluster::SlotSet& slot_ids);

  // Clear tiered storage entries for the specified indices. Called during flushing some indices.
  void RemoveOffloadedEntriesFromTieredStorage(absl::Span<const DbIndex> indices,
                                               const DbTableArray& db_arr) const;

  void PerformDeletionAtomic(const Iterator& del_it, const ExpIterator& exp_it, DbTable* table,
                             bool async = false);

  // Queues invalidation message to the clients that are tracking the change to a key.
  void QueueInvalidationTrackingMessageAtomic(std::string_view key);
  void SendQueuedInvalidationMessages();
  void SendQueuedInvalidationMessagesAsync();

  void CreateDb(DbIndex index);

  enum class UpdateStatsMode : uint8_t {
    kReadStats,
    kMutableStats,
  };

  struct PrimeItAndExp {
    PrimeIterator it;
    ExpireIterator exp_it;
  };

  PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it) const;

  OpResult<ItAndUpdater> AddOrFindInternal(const Context& cntx, std::string_view key,
                                           std::optional<unsigned> req_obj_type);

  OpResult<PrimeItAndExp> FindInternal(const Context& cntx, std::string_view key,
                                       std::optional<unsigned> req_obj_type,
                                       UpdateStatsMode stats_mode) const;
  OpResult<ItAndUpdater> FindMutableInternal(const Context& cntx, std::string_view key,
                                             std::optional<unsigned> req_obj_type);

  uint64_t NextVersion() {
    return version_++;
  }

  void CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const;
  void CallMovedCallbacks(DbIndex id, const MovedItemsVec& moved_items);

  // We need this because registered callbacks might yield and when they do so we want
  // to avoid Heartbeat or Flushing the db.
  // This latch protects us against this case.
  mutable LocalLatch serialization_latch_;

  ShardId shard_id_;
  uint8_t cache_mode_ : 1;

  EngineShard* owner_;

  int64_t expire_base_[2];  // Used for expire logic, represents a real clock.
  bool expire_allowed_ = true;

  uint64_t version_ = 1;  // Used to version entries in the PrimeTable.
  uint64_t next_moved_id_ = 1;

  // Estimation of available memory dedicated to this shard.
  // Recalculated periodically by dividing free memory left among all shards equally
  ssize_t memory_budget_ = SSIZE_MAX / 2;
  size_t bytes_per_object_ = 0;

  size_t table_memory_ = 0;
  uint64_t entries_count_ = 0;
  unsigned load_ref_count_ = 0;

  mutable SliceEvents events_;  // we may change this even for const operations.

  DbTableArray db_arr_;

  // key for bump up items pair contains <key hash, db_index>
  using FetchedItemKey = std::pair<uint64_t, DbIndex>;

  struct FpHasher {
    size_t operator()(uint64_t val) const {
      return val;
    }
    size_t operator()(const FetchedItemKey& val) const {
      return val.first;
    }
  };

  // Used in temporary computations in Acquire/Release.
  mutable absl::flat_hash_set<uint64_t, FpHasher> uniq_fps_;

  // ordered from the smallest to largest version.
  std::list<std::pair<uint64_t, ChangeCallback>> change_cb_;

  std::list<std::pair<uint32_t, MovedCallback>> moved_cb_;

  // Used in temporary computations in Find item and CbFinish
  // This set is used to hold fingerprints of key accessed during the run of
  // a transaction callback (not the whole transaction).
  // We track them to avoid bumping them again (in any direction) so that the iterators to
  // the fetched keys will not be invalidated. We must do it for atomic operations,
  // for operations that preempt in the middle we have another mechanism -
  // auto laundering iterators, so in case of preemption we do not mind that fetched_items are
  // cleared or changed.
  mutable absl::flat_hash_set<FetchedItemKey, FpHasher> fetched_items_;

  // Registered by shard indices on when first document index is created.
  DocDeletionCallback doc_del_cb_;

  // Record whenever a key expired to DbTable::expired_keys_events_ for keyspace notifications
  bool expired_keys_events_recording_ = true;

  struct Hash {
    size_t operator()(const facade::ConnectionRef& c) const {
      return std::hash<uint32_t>()(c.GetClientId());
    }
  };

  // the following type definitions are confusing, and they are for achieving memory
  // usage tracking for client_tracking_map_ data structure through C++'s memory resource and
  // and polymorphic allocator (new C++ features)
  // the declarations below meant to say:
  // absl::flat_hash_map<std::string,
  //                    absl::flat_hash_set<facade::Connection::WeakRef, Hash>>
  //                    client_tracking_map_
  using HashSetAllocator = PMR_NS::polymorphic_allocator<facade::ConnectionRef>;

  using ConnectionHashSet =
      absl::flat_hash_set<facade::ConnectionRef, Hash,
                          absl::container_internal::hash_default_eq<facade::ConnectionRef>,
                          HashSetAllocator>;

  using AllocatorType = PMR_NS::polymorphic_allocator<std::pair<std::string, ConnectionHashSet>>;

  using TrackingMap =
      absl::flat_hash_map<std::string, ConnectionHashSet,
                          absl::container_internal::hash_default_hash<std::string>,
                          absl::container_internal::hash_default_eq<std::string>, AllocatorType>;
  TrackingMap client_tracking_map_, pending_send_map_;

  void SendQueuedInvalidationMessagesCb(const TrackingMap& track_map, unsigned idx) const;

  class PrimeBumpPolicy;
};

inline bool IsValid(const DbSlice::Iterator& it) {
  return dfly::IsValid(it.GetInnerIt());
}

inline bool IsValid(const DbSlice::ConstIterator& it) {
  return dfly::IsValid(it.GetInnerIt());
}

inline bool IsValid(const DbSlice::ExpIterator& it) {
  return dfly::IsValid(it.GetInnerIt());
}

inline bool IsValid(const DbSlice::ExpConstIterator& it) {
  return dfly::IsValid(it.GetInnerIt());
}

template <typename T> void DbSlice::IteratorT<T>::LaunderIfNeeded() const {
  if (!dfly::IsValid(it_)) {
    return;
  }

  uint64_t current_epoch = util::fb2::FiberSwitchEpoch();
  if (current_epoch != fiber_epoch_) {
    if (!it_.IsOccupied() || it_->first != key_.view()) {
      it_ = it_.owner().Find(key_.view());
    }
    fiber_epoch_ = current_epoch;
  }
}

}  // namespace dfly


================================================
FILE: src/server/debugcmd.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/debugcmd.h"

#include "core/detail/gen_utils.h"

#define HUF_STATIC_LINKING_ONLY

extern "C" {
#include "huff/hist.h"
#include "huff/huf.h"
#include "redis/redis_aux.h"
}

#include <absl/cleanup/cleanup.h>
#include <absl/random/random.h>
#include <absl/strings/escaping.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <lz4.h>
#include <zdict.h>
#include <zstd.h>

#include <algorithm>
#include <filesystem>
#include <numeric>

#include "base/flags.h"
#include "base/logging.h"
#include "core/huff_coder.h"
#include "core/qlist.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "facade/cmd_arg_parser.h"
#include "facade/dragonfly_connection.h"
#include "server/blocking_controller.h"
#include "server/container_utils.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/main_service.h"
#include "server/multi_command_squasher.h"
#include "server/namespaces.h"
#include "server/rdb_load.h"
#include "server/server_state.h"
#include "server/string_stats.h"
#include "server/transaction.h"

using namespace std;

ABSL_DECLARE_FLAG(string, dir);
ABSL_DECLARE_FLAG(string, dbfilename);
ABSL_DECLARE_FLAG(bool, df_snapshot_format);

ABSL_FLAG(bool, background_debug_jobs, false, "Use background fibers for debug jobs");

namespace dfly {

using namespace util;
using boost::intrusive_ptr;
using namespace facade;
using absl::StrAppend;
using absl::StrCat;

namespace {

struct ObjInfo {
  unsigned type = 0;
  unsigned encoding;
  unsigned bucket_id = 0;
  unsigned slot_id = 0;

  // for lists - how many nodes do they have.
  unsigned num_nodes = 0;
  unsigned num_compressed = 0;

  enum LockStatus : uint8_t { NONE, S, X } lock_status = NONE;

  int64_t ttl = INT64_MAX;
  optional<uint32_t> external_len;

  bool has_sec_precision = false;
  bool found = false;
};

struct ValueCompressInfo {
  size_t raw_size = 0;
  size_t compressed_size = 0;
};

std::string GenerateValue(size_t val_size, bool random_value, absl::InsecureBitGen* gen) {
  if (random_value) {
    return GetRandomHex(*gen, val_size);
  } else {
    return string(val_size, 'x');
  }
}

tuple<const CommandId*, absl::InlinedVector<string, 5>> GeneratePopulateCommand(
    string_view type, std::string key, size_t val_size, bool random_value, uint32_t elements,
    const CommandRegistry& registry, absl::InsecureBitGen* gen) {
  absl::InlinedVector<string, 5> args;
  args.push_back(std::move(key));

  const CommandId* cid = nullptr;
  if (type == "STRING") {
    cid = registry.Find("SET");
    args.push_back(GenerateValue(val_size, random_value, gen));
  } else if (type == "LIST") {
    cid = registry.Find("LPUSH");
    for (uint32_t i = 0; i < elements; ++i) {
      args.push_back(GenerateValue(val_size, random_value, gen));
    }
  } else if (type == "SET") {
    cid = registry.Find("SADD");
    for (size_t i = 0; i < elements; ++i) {
      args.push_back(GenerateValue(val_size, random_value, gen));
    }
  } else if (type == "HASH") {
    cid = registry.Find("HSET");
    for (size_t i = 0; i < elements; ++i) {
      args.push_back(GenerateValue(val_size / 2, random_value, gen));
      args.push_back(GenerateValue(val_size / 2, random_value, gen));
    }
  } else if (type == "ZSET") {
    cid = registry.Find("ZADD");
    for (size_t i = 0; i < elements; ++i) {
      args.push_back(StrCat((*gen)() % val_size));
      args.push_back(GenerateValue(val_size, random_value, gen));
    }
  } else if (type == "JSON") {
    cid = registry.Find("JSON.MERGE");
    args.push_back("$");

    string json = "{";
    for (size_t i = 0; i < elements; ++i) {
      absl::StrAppend(&json, "\"", GenerateValue(val_size / 2, random_value, gen), "\":\"",
                      GenerateValue(val_size / 2, random_value, gen), "\",");
    }
    json[json.size() - 1] = '}';  // Replace last ',' with '}'
    args.push_back(json);
  } else if (type == "STREAM") {
    cid = registry.Find("XADD");
    args.push_back("*");
    for (size_t i = 0; i < elements; ++i) {
      args.push_back(GenerateValue(val_size / 2, random_value, gen));
      args.push_back(GenerateValue(val_size / 2, random_value, gen));
    }
  }

  return {cid, args};
}

struct ObjHist {
  base::Histogram key_len;
  base::Histogram val_len;    // overall malloc-used size of the value.
  base::Histogram card;       // for sets, hashmaps etc - it's number of entries.
  base::Histogram entry_len;  // for sets, hashmaps etc - it's the length of each entry.
  base::Histogram listpack;   // for listpack encodings - the malloc used of the listpack.
};

// Returns number of O(1) steps executed.
void AddObjHist(PrimeIterator it, ObjHist* hist) {
  using namespace container_utils;
  const PrimeValue& pv = it->second;
  size_t val_len = 0;

  auto per_entry_cb = [&](ContainerEntry entry) {
    if (entry.IsString()) {
      val_len += entry.size();
      hist->entry_len.Add(entry.size());
    } else {
      val_len += 8;  // size of long
    }
    return true;
  };

  hist->key_len.Add(it->first.MallocUsed());

  if (pv.ObjType() == OBJ_LIST) {
    IterateList(pv, per_entry_cb);
    if (pv.Encoding() == kEncodingQL2) {
      const QList* ql = static_cast<QList*>(pv.RObjPtr());
      val_len = ql->MallocUsed(true);
    } else if (pv.Encoding() == kEncodingListPack) {
      val_len = pv.MallocUsed();
      hist->listpack.Add(val_len);
    }
  } else if (pv.ObjType() == OBJ_ZSET) {
    IterateSortedSet(pv, [&](ContainerEntry entry, double) { return per_entry_cb(entry); });
    val_len = 0;  // reset - will be calculated below.
    if (pv.Encoding() == OBJ_ENCODING_LISTPACK) {
      hist->listpack.Add(pv.MallocUsed());
    }
  } else if (pv.ObjType() == OBJ_SET) {
    IterateSet(pv, per_entry_cb);
    val_len = 0;  // reset - will be calculated below.
    if (pv.Encoding() == kEncodingIntSet) {
      hist->listpack.Add(pv.MallocUsed());
    }
  } else if (pv.ObjType() == OBJ_HASH) {
    IterateMap(pv, [&](ContainerEntry key, ContainerEntry value) {
      hist->entry_len.Add(key.size() + value.size());
      return true;
    });
    if (pv.Encoding() == kEncodingListPack) {
      hist->listpack.Add(pv.MallocUsed());
    }
  }
  // TODO: streams

  if (val_len == 0) {
    // Fallback
    val_len = pv.MallocUsed(true);
  }

  hist->val_len.Add(val_len);

  if (pv.ObjType() != OBJ_STRING && pv.ObjType() != OBJ_JSON)
    hist->card.Add(pv.Size());
}

// ObjType -> ObjHist
//
using ObjHistMap = absl::flat_hash_map<unsigned, unique_ptr<ObjHist>>;

void MergeObjHistMap(ObjHistMap&& src, ObjHistMap* dest) {
  for (auto& [obj_type, src_hist] : src) {
    auto& dest_hist = (*dest)[obj_type];
    if (!dest_hist) {
      dest_hist = std::move(src_hist);
    } else {
      dest_hist->key_len.Merge(src_hist->key_len);
      dest_hist->val_len.Merge(src_hist->val_len);
      dest_hist->card.Merge(src_hist->card);
      dest_hist->entry_len.Merge(src_hist->entry_len);
      dest_hist->listpack.Merge(src_hist->listpack);
    }
  }
}

struct SegmentInfo {
  base::Histogram hist;
};

void DoSegmentHist(EngineShard* shard, ConnectionContext* cntx, SegmentInfo* info) {
  auto& db_slice = cntx->ns->GetDbSlice(shard->shard_id());
  DbTable* dbt = db_slice.GetDBTable(cntx->db_index());
  if (dbt == nullptr)
    return;

  unsigned steps = 0;
  auto& prime = dbt->prime;
  for (size_t i = 0; i < prime.GetSegmentCount(); i = prime.NextSeg(i)) {
    const auto* segment = prime.GetSegment(i);

    info->hist.Add(segment->SlowSize());
    if (++steps % 2000 == 0) {
      ThisFiber::Yield();
    }
  }
}

struct HufHist {
  static constexpr unsigned kMaxSymbol = 255;
  array<unsigned, kMaxSymbol + 1> hist;  // histogram of symbols.
  unsigned max_symbol = 0;               // what is the max symbol of the histogram.

  HufHist() {
    hist.fill(0);
  }

  void Merge(const HufHist& other) {
    max_symbol = std::max(max_symbol, other.max_symbol);
    for (unsigned i = 0; i <= max_symbol; ++i) {
      hist[i] += other.hist[i];
    }
  }

  unsigned MaxFreqCount() const;
};

unsigned HufHist::MaxFreqCount() const {
  unsigned max_freq = 0;
  for (unsigned i = 0; i < kMaxSymbol; ++i) {
    if (hist[i] > max_freq) {
      max_freq = hist[i];
    }
  }
  return max_freq;
}

constexpr unsigned kMaxFreqPerShard = 1U << 20;
constexpr unsigned kMaxFreqTotal = static_cast<unsigned>((1U << 31) * 0.9);

void DoComputeHist(CompactObjType type, EngineShard* shard, ConnectionContext* cntx,
                   HufHist* dest) {
  auto& db_slice = cntx->ns->GetDbSlice(shard->shard_id());
  DbTable* dbt = db_slice.GetDBTable(cntx->db_index());
  CHECK(dbt);

  PrimeTable::Cursor cursor;
  unsigned steps = 0;
  string scratch;
  constexpr size_t kMaxLen = 512;
  PrimeTable& table = dbt->prime;

  do {
    cursor = table.Traverse(cursor, [&](PrimeIterator it) {
      scratch.clear();
      ++steps;
      if (type == kInvalidCompactObjType) {  // KEYSPACE
        if (it->first.MallocUsed() > 0) {
          it->first.GetString(&scratch);
        }
      } else if (type == OBJ_STRING && it->second.ObjType() == OBJ_STRING) {
        if (it->second.MallocUsed() > 0) {
          it->second.GetString(&scratch);
        }
      } else if (type == OBJ_ZSET && it->second.ObjType() == OBJ_ZSET) {
        container_utils::IterateSortedSet(
            it->second, [&](container_utils::ContainerEntry entry, double) {
              ++steps;
              if (entry.IsString()) {
                HIST_add(dest->hist.data(), entry.data(), entry.size());
              }
              return true;
            });
      } else if (type == OBJ_LIST && it->second.ObjType() == OBJ_LIST) {
        container_utils::IterateList(it->second, [&](container_utils::ContainerEntry entry) {
          ++steps;
          if (entry.IsString()) {
            HIST_add(dest->hist.data(), entry.data(), entry.size());
          }
          return true;
        });
      } else if (type == OBJ_HASH && it->second.ObjType() == OBJ_HASH) {
        container_utils::IterateMap(it->second, [&](container_utils::ContainerEntry key,
                                                    container_utils::ContainerEntry value) {
          ++steps;
          if (key.IsString()) {
            HIST_add(dest->hist.data(), key.data(), key.size());
          }
          if (value.IsString()) {
            HIST_add(dest->hist.data(), value.data(), value.size());
          }
          return true;
        });
      }

      if (!scratch.empty()) {
        size_t len = std::min(scratch.size(), kMaxLen);
        HIST_add(dest->hist.data(), scratch.data(), len);
      }
    });

    if (steps >= 40000) {
      if (dest->MaxFreqCount() > kMaxFreqPerShard) {
        break;
      }

      steps = 0;
      ThisFiber::Yield();
    }
  } while (cursor);
  dest->max_symbol = HufHist::kMaxSymbol;
  while (dest->max_symbol && dest->hist[dest->max_symbol] == 0)
    --dest->max_symbol;
}

ObjInfo InspectOp(ConnectionContext* cntx, string_view key) {
  auto& db_slice = cntx->ns->GetCurrentDbSlice();
  auto db_index = cntx->db_index();
  auto* pt = db_slice.GetTables(db_index).first;

  PrimeIterator it = pt->Find(key);
  ObjInfo oinfo;
  if (IsValid(it)) {
    const PrimeValue& pv = it->second;

    oinfo.found = true;
    oinfo.type = pv.ObjType();
    oinfo.encoding = pv.Encoding();
    oinfo.bucket_id = it.bucket_id();
    oinfo.slot_id = it.slot_id();

    if (pv.ObjType() == OBJ_LIST && pv.Encoding() == kEncodingQL2) {
      const QList* qlist = static_cast<const QList*>(pv.RObjPtr());
      oinfo.num_nodes = qlist->node_count();
      auto* node = qlist->Head();

      while (node) {
        if (node->encoding == QUICKLIST_NODE_ENCODING_LZF) {
          ++oinfo.num_compressed;
        }
        node = node->next;
      }
    }

    if (pv.IsExternal()) {
      oinfo.external_len.emplace(pv.GetExternalSlice().second);
    }

    if (it->first.HasExpire()) {
      time_t exp_time = it->first.GetExpireTime();
      oinfo.ttl = exp_time - GetCurrentTimeMs();
      oinfo.has_sec_precision = false;  // Embedded TTL is always ms precision.
    }
  }

  if (!db_slice.CheckLock(IntentLock::EXCLUSIVE, db_index, key)) {
    oinfo.lock_status =
        db_slice.CheckLock(IntentLock::SHARED, db_index, key) ? ObjInfo::S : ObjInfo::X;
  }

  return oinfo;
}

OpResult<ValueCompressInfo> EstimateCompression(ConnectionContext* cntx, string_view key) {
  auto& db_slice = cntx->ns->GetCurrentDbSlice();
  auto db_index = cntx->db_index();
  auto* pt = db_slice.GetTables(db_index).first;

  PrimeIterator it = pt->Find(key);
  if (!IsValid(it)) {
    return OpStatus::KEY_NOTFOUND;
  }

  // Only strings are supported right now.
  if (it->second.ObjType() != OBJ_STRING && it->second.ObjType() != OBJ_LIST) {
    return OpStatus::WRONG_TYPE;
  }
  ValueCompressInfo info;

  if (it->second.ObjType() == OBJ_LIST) {
    if (it->second.Encoding() != kEncodingQL2) {
      return OpStatus::WRONG_TYPE;
    }

    const QList* src = static_cast<const QList*>(it->second.RObjPtr());
    info.raw_size = src->MallocUsed(true);
    QList qlist(-2, 1);
    auto copy_cb = [&](QList::Entry entry) {
      qlist.Push(entry.view(), QList::HEAD);
      return true;
    };
    src->Iterate(copy_cb, 0, -1);
    info.compressed_size = qlist.MallocUsed(true);
    return info;
  }

  string scratch;
  string_view value = it->second.GetSlice(&scratch);

  info.raw_size = value.size();
  info.compressed_size = info.raw_size;

  if (info.raw_size >= 32) {
    size_t compressed_size = ZSTD_compressBound(value.size());
    unique_ptr<char[]> compressed(new char[compressed_size]);
    info.compressed_size =
        ZSTD_compress(compressed.get(), compressed_size, value.data(), value.size(), 5);
  }

  return info;
};

const char* EncodingName(unsigned obj_type, unsigned encoding) {
  switch (obj_type) {
    case OBJ_STRING:
      return "raw";
    case OBJ_LIST:
      switch (encoding) {
        case kEncodingQL2:
          return "quicklist";
        case kEncodingListPack:
          return "listpack";
      }
      break;
    case OBJ_SET:
      ABSL_FALLTHROUGH_INTENDED;
    case OBJ_ZSET:
      ABSL_FALLTHROUGH_INTENDED;
    case OBJ_HASH:
      switch (encoding) {
        case kEncodingIntSet:
          return "intset";
        case kEncodingStrMap2:
          return "dense_set";
        case OBJ_ENCODING_SKIPLIST:  // we kept the old enum for zset
          return "btree";
        case OBJ_ENCODING_LISTPACK:
          ABSL_FALLTHROUGH_INTENDED;
        case kEncodingListPack:
          return "listpack";
      }
      break;
    case OBJ_JSON:
      switch (encoding) {
        case kEncodingJsonCons:
          return "jsoncons";
        case kEncodingJsonFlat:
          return "jsonflat";
      }
      break;
    case OBJ_STREAM:
      return "stream";
  }
  return "unknown";
}

struct IOStat {
  uint64_t conn_received = 0;
  uint64_t curr_conn_count = 0;
  uint64_t cmd_total = 0, pipelined_cmd_total = 0;
  size_t io_read_bytes = 0;
  uint64_t io_reads_total = 0;

  void From(const facade::FacadeStats& fs);
  void Print(RedisReplyBuilder* rb) const;

  IOStat& operator-=(const IOStat& other);
};

void IOStat::From(const facade::FacadeStats& fs) {
  conn_received = fs.conn_stats.conn_received_cnt;
  curr_conn_count = fs.conn_stats.num_conns_main;
  cmd_total = fs.conn_stats.command_cnt_main;
  pipelined_cmd_total = fs.conn_stats.pipelined_cmd_cnt;
  io_read_bytes = fs.conn_stats.io_read_bytes;
  io_reads_total = fs.conn_stats.io_read_cnt;
}

void IOStat::Print(RedisReplyBuilder* rb) const {
  rb->StartCollection(6, CollectionType::MAP);
  rb->SendSimpleString("connections_received");
  rb->SendLong(conn_received);
  rb->SendSimpleString("current_conn_count");
  rb->SendLong(curr_conn_count);
  rb->SendSimpleString("commands_total");
  rb->SendLong(cmd_total);
  rb->SendSimpleString("pipelined_commands_total");
  rb->SendLong(pipelined_cmd_total);
  rb->SendSimpleString("io_read_bytes");
  rb->SendLong(io_read_bytes);
  rb->SendSimpleString("io_reads_total");
  rb->SendLong(io_reads_total);
}

IOStat& IOStat::operator-=(const IOStat& other) {
  conn_received -= other.conn_received;
  curr_conn_count -= other.curr_conn_count;
  cmd_total -= other.cmd_total;
  pipelined_cmd_total -= other.pipelined_cmd_total;
  io_read_bytes -= other.io_read_bytes;
  io_reads_total -= other.io_reads_total;

  return *this;
}

// Traverse over all entries on all databases, manage cpu time automatically
template <typename F> void TraverseAllEntries(bool background, ConnectionContext* cntx, F&& f) {
  util::fb2::BlockingCounter bc{0};
  for (uint32_t i = 0; i < shard_set->size(); ++i) {
    bc->Add(1);
    util::ProactorBase* dest = shard_set->pool()->at(i);

    auto cb = [f /* copy per thread */, bc, cntx, background]() mutable {
      auto* shard = EngineShard::tlocal();
      auto& db_slice = cntx->ns->GetDbSlice(shard->shard_id());

      for (unsigned i = 0; i < db_slice.db_array_size(); ++i) {
        boost::intrusive_ptr<DbTable> dbt = db_slice.CopyDBTablePtr(i);
        if (!dbt)
          continue;

        PrimeTable::Cursor cursor;
        do {
          cursor = dbt->prime.Traverse(cursor, f);
          if (background) {
            ThisFiber::Yield();
          } else if (base::CycleClock::ToUsec(ThisFiber::GetRunningTimeCycles()) >= 500) {
            ThisFiber::Yield();
          }
        } while (cursor);
      }
      bc->Dec();
    };
    dest->DispatchBrief([cb, background]() mutable {
      using namespace util::fb2;
      Fiber::Opts opts{
          .priority = background ? FiberPriority::BACKGROUND : FiberPriority::NORMAL,
          .name = "Debug/Traverse",
      };
      Fiber(opts, std::move(cb)).Detach();
    });
  }
  bc->Wait();
}

}  // namespace

DebugCmd::DebugCmd(ServerFamily* owner, cluster::ClusterFamily* cf, ConnectionContext* cntx)
    : sf_(*owner), cf_(*cf), cntx_(cntx) {
}

void DebugCmd::Run(CmdArgList args, CommandContext* cmd_cntx) {
  string subcmd = absl::AsciiStrToUpper(ArgS(args, 0));
  if (subcmd == "HELP") {
    string_view help_arr[] = {
        "DEBUG <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        "EXEC",
        "    Show the descriptors of the MULTI/EXEC transactions that were processed by ",
        "    the server. For each EXEC/i descriptor, 'i' is the number of shards it touches. ",
        "    Each descriptor details the commands it contained followed by number of their ",
        "    arguments. Each descriptor is prefixed by its frequency count",
        "OBJECT <key> [COMPRESS]",
        "    Show low-level info about `key` and associated value.",
        "RELOAD [option ...]",
        "    Save the RDB on disk and reload it back to memory. Valid <option> values:",
        "    * NOSAVE: the database will be loaded from an existing RDB file.",
        "    Examples:",
        "    * DEBUG RELOAD NOSAVE: replace the current database with the contents of an",
        "      existing RDB file.",
        "REPLICA PAUSE/RESUME",
        "    Stops replica from reconnecting to master, or resumes",
        "MIGRATION PAUSE/RESUME",
        "    Stops/resumes incoming migration process only in the SYNC state",
        "REPLICA OFFSET",
        "    Return sync id and array of number of journal commands executed for each replica flow",
        "WATCHED",
        "    Shows the watched keys as a result of BLPOP and similar operations.",
        "POPULATE <count> [prefix] [size] [RAND] [SLOTS start end] [TYPE type] [ELEMENTS elements]"
        " [EXPIRE start end]",
        "    Create <count> string keys named key:<num> with value value:<num>.",
        "    If <prefix> is specified then it is used instead of the 'key' prefix.",
        "    If <size> is specified then X character is concatenated multiple times to value:<num>",
        "    to meet value size.",
        "    If RAND is specified then value will be set to random hex string in specified size.",
        "    If SLOTS is specified then create keys only in given slots range.",
        "    TYPE specifies data type (must be STRING/LIST/SET/HASH/ZSET/JSON/STREAM), default "
        "STRING.",
        "    ELEMENTS specifies how many sub elements if relevant (like entries in a list / set).",
        "    EXPIRE specifies key expire ttl range.",
        "OBJHIST",
        "    Prints histogram of object sizes.",
        "STACKTRACE",
        "    Prints the stacktraces of all current fibers to the logs.",
        "SHARDS",
        "    Prints memory usage and key stats per shard, as well as min/max indicators.",
        "TOPK ON [min_freq] | OFF [max_keys]",
        "    Turns on or off sampling of topk keys. Provides top keys with at least <min_freq> ",
        "    during the sampling period. The results are returned in descending order of frequency",
        "    when calling TOPK OFF command. First result is the sampled keys count.",
        "KEYS ON | OFF",
        "    Turns on/off counting of unique keys. Results are returned when calling ",
        "    KEYS OFF command. The results is array with two integers: unique keys count and ",
        "    sampled keys count.",
        "VALUES ON | OFF",
        "    Turns on/off measurement of value length distribution. Results are returned when ",
        "    calling VALUES OFF command.",
        "TX",
        "    Performs transaction analysis per shard.",
        "TRAFFIC <path>/<file_prefix> | [STOP]",
        "    Use <path>/<file_prefix> to start traffic logging to the specified path.",
        "    All recorded files will have the specified prefix.",
        "    Use 'STOP' or do not specify any arguments to stop traffic logging.",
        "RECVSIZE [<tid> | ENABLE | DISABLE]",
        "    Prints the histogram of the received request sizes on the given thread",
        "COMPRESSION [IMPORT <bintable> | EXPORT | SET <bintable>] [type]",
        "    Estimate the compressibility of values of the given type. if no type is given, ",
        "    checks compressibility of keys. If IN is specified, then the provided ",
        "    bintable is used to check compressibility. If OUT is specified, then ",
        "    the serialized table is printed as well",
        "IOSTATS [PS]",
        "    Prints IO stats per thread. If PS is specified, prints thread-level stats ",
        "    per second.",
        "SEGMENTS",
        "    Prints segment info for the current database.",
        "COMPACT-TABLE threshold",
        "    Attempts to merge underutilized segments in dash table",
        "UNIQ-STRS",
        "    Prints per-object unique string stats and estimated dedup savings across shards.",
        "HELP",
        "    Prints this help.",
    };
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    return rb->SendSimpleStrArr(help_arr);
  }

  VLOG(1) << "subcmd " << subcmd;

  if (subcmd == "POPULATE") {
    return Populate(args, cmd_cntx);
  }

  if (subcmd == "RELOAD") {
    return Reload(args, cmd_cntx);
  }

  if (subcmd == "REPLICA" && args.size() == 2) {
    return Replica(args, cmd_cntx);
  }

  if (subcmd == "MIGRATION" && args.size() == 2) {
    return Migration(args, cmd_cntx);
  }

  if (subcmd == "WATCHED") {
    return Watched(cmd_cntx);
  }

  if (subcmd == "OBJECT" && args.size() >= 2) {
    string_view key = ArgS(args, 1);
    args.remove_prefix(2);
    return Inspect(key, args, cmd_cntx);
  }

  if (subcmd == "TX") {
    return TxAnalysis(cmd_cntx);
  }

  if (subcmd == "OBJHIST") {
    return ObjHist(cmd_cntx);
  }

  if (subcmd == "STACKTRACE") {
    return Stacktrace(cmd_cntx);
  }

  if (subcmd == "SHARDS") {
    return Shards(cmd_cntx);
  }

  if (subcmd == "EXEC") {
    return Exec(cmd_cntx);
  }

  if (subcmd == "TRAFFIC") {
    return LogTraffic(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "RECVSIZE" && args.size() == 2) {
    return RecvSize(ArgS(args, 1), cmd_cntx);
  }

  if (subcmd == "TOPK" && args.size() >= 2) {
    return Topk(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "KEYS" && args.size() >= 2) {
    return Keys(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "VALUES" && args.size() >= 2) {
    return Values(args.subspan(1), cmd_cntx);
  }
  if (subcmd == "COMPRESSION") {
    return Compression(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "IOSTATS") {
    return IOStats(args.subspan(1), cmd_cntx);
  }
  if (subcmd == "SEGMENTS") {
    return Segments(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "COMPACT-TABLE") {
    return CompactTable(args.subspan(1), cmd_cntx);
  }

  if (subcmd == "UNIQ-STRS") {
    return CountUniqueStrings(cmd_cntx);
  }

  string reply = UnknownSubCmd(subcmd, "DEBUG");
  return cmd_cntx->SendError(reply, kSyntaxErrType);
}

void DebugCmd::Shutdown() {
  // disable traffic logging
  shard_set->pool()->AwaitFiberOnAll([](auto*) { facade::Connection::StopTrafficLogging(); });
}

void DebugCmd::Reload(CmdArgList args, CommandContext* cmd_cntx) {
  bool save = true;

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  for (size_t i = 1; i < args.size(); ++i) {
    string opt = absl::AsciiStrToUpper(ArgS(args, i));
    VLOG(1) << "opt " << opt;

    if (opt == "NOSAVE") {
      save = false;
    } else {
      return cmd_cntx->SendError("DEBUG RELOAD only supports the NOSAVE options.");
    }
  }

  if (save) {
    string err_details;
    VLOG(1) << "Performing save";

    GenericError ec = sf_.DoSave();
    if (ec) {
      return cmd_cntx->SendError(ec.Format());
    }
  }

  string last_save_file = sf_.GetLastSaveInfo().file_name;

  sf_.FlushAll(cntx_->ns);

  if (auto fut_ec = sf_.Load(last_save_file, ServerFamily::LoadExistingKeys::kFail); fut_ec) {
    GenericError ec = fut_ec->Get();
    if (ec) {
      string msg = ec.Format();
      LOG(WARNING) << "Could not load file " << msg;
      return cmd_cntx->SendError(msg);
    }
  }

  rb->SendOk();
}

void DebugCmd::Replica(CmdArgList args, CommandContext* cmd_cntx) {
  args.remove_prefix(1);

  string opt = absl::AsciiStrToUpper(ArgS(args, 0));

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (opt == "PAUSE" || opt == "RESUME") {
    sf_.PauseReplication(opt == "PAUSE");
    return rb->SendOk();
  } else if (opt == "OFFSET") {
    const auto offset_info = sf_.GetReplicaOffsetInfo();
    if (offset_info) {
      rb->StartArray(2);
      rb->SendBulkString(offset_info.value().sync_id);
      rb->StartArray(offset_info.value().flow_offsets.size());
      for (uint64_t offset : offset_info.value().flow_offsets) {
        rb->SendLong(offset);
      }
      return;
    } else {
      return cmd_cntx->SendError("I am master");
    }
  }
  return cmd_cntx->SendError(UnknownSubCmd("replica", "DEBUG"));
}

void DebugCmd::Migration(CmdArgList args, CommandContext* cmd_cntx) {
  args.remove_prefix(1);

  string opt = absl::AsciiStrToUpper(ArgS(args, 0));
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (opt == "PAUSE" || opt == "RESUME") {
    cf_.PauseAllIncomingMigrations(opt == "PAUSE");
    return rb->SendOk();
  }
  return cmd_cntx->SendError(UnknownSubCmd("MIGRATION", "DEBUG"));
}

enum PopulateFlag { FLAG_RAND, FLAG_TYPE, FLAG_ELEMENTS, FLAG_SLOT, FLAG_EXPIRE, FLAG_UNKNOWN };

// Populate arguments format:
// required: (total count) (key prefix) (val size)
// optional: [RAND | TYPE typename | ELEMENTS element num | SLOTS (key value)+ | EXPIRE start end]
optional<DebugCmd::PopulateOptions> DebugCmd::ParsePopulateArgs(CmdArgList args,
                                                                CommandContext* cmd_cntx) {
  CmdArgParser parser(args.subspan(1));
  PopulateOptions options;

  options.total_count = parser.Next<uint64_t>();
  options.prefix = parser.NextOrDefault<string_view>("key");
  options.val_size = parser.NextOrDefault<uint32_t>(16);
  while (parser.HasNext()) {
    PopulateFlag flag = parser.MapNext("RAND", FLAG_RAND, "TYPE", FLAG_TYPE, "ELEMENTS",
                                       FLAG_ELEMENTS, "SLOTS", FLAG_SLOT, "EXPIRE", FLAG_EXPIRE);
    switch (flag) {
      case FLAG_RAND:
        options.populate_random_values = true;
        break;
      case FLAG_TYPE:
        options.type = absl::AsciiStrToUpper(parser.Next<string_view>());
        break;
      case FLAG_ELEMENTS:
        options.elements = parser.Next<uint32_t>();
        break;
      case FLAG_SLOT: {
        auto [start, end] = parser.Next<FInt<0, 16383>, FInt<0, 16383>>();
        options.slot_range = cluster::SlotRange{SlotId(start), SlotId(end)};
        break;
      }
      case FLAG_EXPIRE: {
        auto [min_ttl, max_ttl] = parser.Next<uint32_t, uint32_t>();
        if (min_ttl >= max_ttl) {
          cmd_cntx->SendError(kExpiryOutOfRange);
          (void)parser.TakeError();
          return nullopt;
        }
        options.expire_ttl_range = std::make_pair(min_ttl, max_ttl);
        break;
      }
      default:
        LOG(FATAL) << "Unexpected flag in PopulateArgs. Args: " << args;
        break;
    }
  }
  if (parser.HasError()) {
    cmd_cntx->SendError(parser.TakeError().MakeReply());
    return nullopt;
  }
  if (options.val_size == 0) {
    cmd_cntx->SendError("val_size must be positive");
    return nullopt;
  }
  return options;
}

void DebugCmd::Populate(CmdArgList args, CommandContext* cmd_cntx) {
  optional<PopulateOptions> options = ParsePopulateArgs(args, cmd_cntx);
  if (!options.has_value()) {
    return;
  }
  DCHECK(sf_.AreAllReplicasInStableSync());

  ProactorPool& pp = sf_.service().proactor_pool();
  size_t runners_count = pp.size();
  vector<pair<uint64_t, uint64_t>> ranges(runners_count - 1);
  uint64_t batch_size = options->total_count / runners_count;
  size_t from = 0;
  for (size_t i = 0; i < ranges.size(); ++i) {
    ranges[i].first = from;
    ranges[i].second = batch_size;
    from += batch_size;
  }
  ranges.emplace_back(from, options->total_count - from);

  vector<fb2::Fiber> fb_arr(ranges.size());
  for (size_t i = 0; i < ranges.size(); ++i) {
    auto range = ranges[i];

    // whatever we do, we should not capture i by reference.
    fb_arr[i] = pp.at(i)->LaunchFiber([range, options, this] {
      this->PopulateRangeFiber(range.first, range.second, options.value());
    });
  }
  for (auto& fb : fb_arr)
    fb.Join();

  cmd_cntx->rb()->SendOk();

  DCHECK(sf_.AreAllReplicasInStableSync());
}

void DebugCmd::PopulateRangeFiber(uint64_t from, uint64_t num_of_keys,
                                  const PopulateOptions& options) {
  ThisFiber::SetName("populate_range");
  VLOG(1) << "PopulateRange: " << from << "-" << (from + num_of_keys - 1);

  string key = StrCat(options.prefix, ":");
  size_t prefsize = key.size();
  DbIndex db_indx = cntx_->db_index();
  EngineShardSet& ess = *shard_set;
  std::vector<PopulateBatch> ps(ess.size(), PopulateBatch{db_indx});

  uint64_t index = from;
  uint64_t to = from + num_of_keys;
  uint64_t added = 0;
  while (added < num_of_keys) {
    if ((index >= to) && ((index - to) % options.total_count == 0)) {
      index = index - num_of_keys + options.total_count;
    }
    key.resize(prefsize);  // shrink back

    StrAppend(&key, index);

    if (options.slot_range.has_value()) {
      // Each fiber will add num_of_keys. Keys are in the form of <key_prefix>:<index>
      // We need to make sure that different fibers will not add the same key.
      // Fiber starting <key_prefix>:<from> to <key_prefix>:<from+num_of_keys-1>
      // then continue to <key_prefix>:<from+total_count> to
      // <key_prefix>:<from+total_count+num_of_keys-1> and continue until num_of_keys are added.

      // Add keys only in slot range.
      SlotId sid = KeySlot(key);
      if (sid < options.slot_range->start || sid > options.slot_range->end) {
        ++index;
        continue;
      }
    }
    ShardId sid = Shard(key, ess.size());

    auto& shard_batch = ps[sid];
    shard_batch.index[shard_batch.sz++] = index;
    ++added;
    ++index;

    if (shard_batch.sz == 32) {
      ess.Add(sid, [this, index, options, shard_batch]() {
        DoPopulateBatch(options, shard_batch);
        if (index % 50 == 0) {
          ThisFiber::Yield();
        }
      });

      // we capture shard_batch by value so we can override it here.
      shard_batch.sz = 0;
    }
  }

  ess.AwaitRunningOnShardQueue([&](EngineShard* shard) {
    DoPopulateBatch(options, ps[shard->shard_id()]);
    // Debug populate does not use transaction framework therefore we call OnCbFinishBlocking
    // manually after running the callback Note that running debug populate while running
    // flushall/db can cause dcheck fail because the finish cb is executed just when we finish
    // populating the database.
    cntx_->ns->GetDbSlice(shard->shard_id()).OnCbFinishBlocking();
  });
}

void DebugCmd::Exec(CommandContext* cmd_cntx) {
  EngineShardSet& ess = *shard_set;
  fb2::Mutex mu;
  std::map<string, unsigned> freq_cnt;

  ess.pool()->AwaitFiberOnAll([&](auto*) {
    for (const auto& k_v : ServerState::tlocal()->exec_freq_count) {
      unique_lock lk(mu);
      freq_cnt[k_v.first] += k_v.second;
    }
  });

  string res;
  for (const auto& k_v : freq_cnt) {
    StrAppend(&res, k_v.second, ":", k_v.first, "\n");
  }
  StrAppend(&res, "--------------------------\n");

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(res);
}

void DebugCmd::LogTraffic(CmdArgList args, CommandContext* cmd_cntx) {
  optional<string> path;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (ProactorBase::me()->GetKind() != ProactorBase::IOURING) {
    return cmd_cntx->SendError("Traffic recording supported only on iouring");
  }

  if (args.size() == 1 && absl::AsciiStrToUpper(facade::ToSV(args.front())) != "STOP"sv) {
    path = ArgS(args, 0);
    LOG(INFO) << "Logging to traffic to " << *path << "*.bin";
  } else {
    LOG(INFO) << "Traffic logging stopped";
  }

  shard_set->pool()->AwaitFiberOnAll([path](auto*) {
    if (path)
      facade::Connection::StartTrafficLogging(*path);
    else
      facade::Connection::StopTrafficLogging();
  });
  rb->SendOk();
}

void DebugCmd::Inspect(string_view key, CmdArgList args, CommandContext* cmd_cntx) {
  EngineShardSet& ess = *shard_set;
  ShardId sid = Shard(key, ess.size());
  VLOG(1) << "DebugCmd::Inspect " << key;

  bool check_compression = false;
  if (args.size() == 1) {
    check_compression = absl::AsciiStrToUpper(ArgS(args, 0)) == "COMPRESS";
  }
  string resp;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (check_compression) {
    auto cb = [&] { return EstimateCompression(cntx_, key); };
    auto res = ess.Await(sid, std::move(cb));
    if (!res) {
      cmd_cntx->SendError(res.status());
      return;
    }
    StrAppend(&resp, "raw_size: ", res->raw_size, ", compressed_size: ", res->compressed_size);
    if (res->raw_size > 0) {
      StrAppend(&resp, " ratio: ", static_cast<double>(res->compressed_size) / (res->raw_size));
    }
  } else {
    auto cb = [&] { return InspectOp(cntx_, key); };

    ObjInfo res = ess.Await(sid, std::move(cb));

    if (!res.found) {
      cmd_cntx->SendError(kKeyNotFoundErr);
      return;
    }

    StrAppend(&resp, "encoding:", EncodingName(res.type, res.encoding),
              " bucket_id:", res.bucket_id);
    StrAppend(&resp, " slot:", res.slot_id, " shard:", sid);

    if (res.ttl != INT64_MAX) {
      StrAppend(&resp, " ttl:", res.ttl, res.has_sec_precision ? "s" : "ms");
    }

    if (res.external_len) {
      StrAppend(&resp, " spill_len:", *res.external_len);
    }

    if (res.num_nodes) {
      // node count
      StrAppend(&resp, " nc:", res.num_nodes);
    }

    if (res.num_compressed) {
      // compressed nodes
      StrAppend(&resp, " cn:", res.num_compressed);
    }

    if (res.lock_status != ObjInfo::NONE) {
      StrAppend(&resp, " lock:", res.lock_status == ObjInfo::X ? "x" : "s");
    }
  }
  rb->SendSimpleString(resp);
}

void DebugCmd::Watched(CommandContext* cmd_cntx) {
  fb2::Mutex mu;

  vector<string> watched_keys;
  vector<string> awaked_trans;

  auto cb = [&](EngineShard* shard) {
    auto* bc = cntx_->ns->GetBlockingController(shard->shard_id());
    if (bc) {
      auto keys = bc->GetWatchedKeys(cntx_->db_index());

      lock_guard lk(mu);
      watched_keys.insert(watched_keys.end(), keys.begin(), keys.end());
      for (auto* tx : bc->awakened_transactions()) {
        awaked_trans.push_back(StrCat("[", shard->shard_id(), "] ", tx->DebugId()));
      }
    }
  };

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  shard_set->RunBlockingInParallel(cb);
  rb->StartArray(4);
  rb->SendBulkString("awaked");
  rb->SendBulkStrArr(awaked_trans);
  rb->SendBulkString("watched");
  rb->SendBulkStrArr(watched_keys);
}

void DebugCmd::TxAnalysis(CommandContext* cmd_cntx) {
  vector<EngineShard::TxQueueInfo> shard_info(shard_set->size());

  auto cb = [&](EngineShard* shard) {
    auto& info = shard_info[shard->shard_id()];
    info = shard->AnalyzeTxQueue();
  };

  shard_set->RunBriefInParallel(cb);

  string result;
  for (unsigned i = 0; i < shard_set->size(); ++i) {
    const auto& info = shard_info[i];
    StrAppend(&result, "shard", i, ":\n", info.Format(), "\n");
  }
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(result);
}

void DebugCmd::ObjHist(CommandContext* cmd_cntx) {
  vector<ObjHistMap> obj_hist_map_arr(shard_set->size());
  auto cb = [&obj_hist_map_arr](PrimeIterator it) {
    unsigned obj_type = it->second.ObjType();
    auto& hist_ptr = obj_hist_map_arr[EngineShard::tlocal()->shard_id()][obj_type];
    if (!hist_ptr) {
      hist_ptr.reset(new struct ObjHist);
    }
    AddObjHist(it, hist_ptr.get());
  };
  TraverseAllEntries(absl::GetFlag(FLAGS_background_debug_jobs), cntx_, cb);

  for (size_t i = shard_set->size() - 1; i > 0; --i) {
    MergeObjHistMap(std::move(obj_hist_map_arr[i]), &obj_hist_map_arr[0]);
  }

  string result;
  absl::StrAppend(&result, "___begin object histogram___\n\n");

  for (auto& [obj_type, hist_ptr] : obj_hist_map_arr[0]) {
    StrAppend(&result, "OBJECT:", ObjTypeToString(obj_type), "\n");
    StrAppend(&result, "________________________________________________________________\n");
    StrAppend(&result, "Key memory used:\n", hist_ptr->key_len.ToString(), "\n");
    StrAppend(&result, "Values - Total Memory used:\n", hist_ptr->val_len.ToString(), "\n");
    if (hist_ptr->card.count() > 0) {
      StrAppend(&result, "Cardinality histogram (number of elements in sets):\n",
                hist_ptr->card.ToString(), "\n");
    }
    StrAppend(&result, "Items length histogram:\n", hist_ptr->entry_len.ToString(), "\n");
    if (hist_ptr->listpack.count() > 0) {
      StrAppend(&result, "Listpack histogram:\n", hist_ptr->listpack.ToString(), "\n");
    }
  }

  absl::StrAppend(&result, "___end object histogram___\n");
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(result);
}

void DebugCmd::Stacktrace(CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  fb2::Mutex m;
  shard_set->pool()->AwaitFiberOnAll([&m](unsigned index, ProactorBase* base) {
    EngineShard* es = EngineShard::tlocal();
    string txq;
    if (es) {
      EngineShard::TxQueueInfo txq_info = es->AnalyzeTxQueue();
      txq = txq_info.Format();
    }
    std::unique_lock lk(m);
    LOG_IF(INFO, !txq.empty()) << "Shard" << index << ": " << txq;
    fb2::detail::FiberInterface::PrintAllFiberStackTraces();
  });
  base::FlushLogs();
  rb->SendOk();
}

void DebugCmd::Shards(CommandContext* cmd_cntx) {
  struct ShardInfo {
    uint64_t used_memory = 0;
    uint64_t key_count = 0;
    uint64_t prime_capacity = 0;
    uint64_t expire_count = 0;
    uint64_t key_reads = 0;
    size_t avg_object_size = 0;
  };

  vector<ShardInfo> infos(shard_set->size());
  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    auto sid = shard->shard_id();
    auto& db_slice = cntx_->ns->GetDbSlice(sid);
    auto slice_stats = db_slice.GetStats();
    auto& stats = infos[sid];

    stats.used_memory = shard->UsedMemory();
    for (const auto& db_stats : slice_stats.db_stats) {
      stats.key_count += db_stats.key_count;
      stats.prime_capacity += db_stats.prime_capacity;
      stats.expire_count += db_stats.expire_count;
    }
    stats.avg_object_size = db_slice.bytes_per_object();
    stats.key_reads = slice_stats.events.hits + slice_stats.events.misses;
  });

#define ADD_STAT(i, stat) absl::StrAppend(&out, "shard", i, "_", #stat, ": ", infos[i].stat, "\n");
#define MAXMIN_STAT(stat)                                   \
  {                                                         \
    uint64_t minv = std::numeric_limits<uint64_t>::max();   \
    uint64_t maxv = 0;                                      \
    for (const auto& info : infos) {                        \
      minv = std::min(minv, info.stat);                     \
      maxv = std::max(maxv, info.stat);                     \
    }                                                       \
    absl::StrAppend(&out, "max_", #stat, ": ", maxv, "\n"); \
    absl::StrAppend(&out, "min_", #stat, ": ", minv, "\n"); \
  }

  string out;
  absl::StrAppend(&out, "num_shards: ", shard_set->size(), "\n");

  for (size_t i = 0; i < infos.size(); i++) {
    ADD_STAT(i, used_memory);
    ADD_STAT(i, key_count);
    ADD_STAT(i, expire_count);
    ADD_STAT(i, key_reads);

    absl::StrAppend(&out, "shard", i,
                    "_prime_utilization: ", double(infos[i].key_count) / infos[i].prime_capacity,
                    "\n");
    absl::StrAppend(&out, "shard", i, "_avg_object_size: ", infos[i].avg_object_size, "\n");
  }

  MAXMIN_STAT(used_memory);
  MAXMIN_STAT(key_count);
  MAXMIN_STAT(expire_count);
  MAXMIN_STAT(key_reads);

#undef ADD_STAT
#undef MAXMIN_STAT
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(out);
}

void DebugCmd::RecvSize(string_view param, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  uint8_t enable = 2;
  if (absl::EqualsIgnoreCase(param, "ENABLE"))
    enable = 1;
  else if (absl::EqualsIgnoreCase(param, "DISABLE"))
    enable = 0;

  if (enable < 2) {
    shard_set->pool()->AwaitBrief(
        [enable](auto, auto*) { facade::Connection::TrackRequestSize(enable == 1); });
    return rb->SendOk();
  }

  unsigned tid;
  if (!absl::SimpleAtoi(param, &tid) || tid >= shard_set->pool()->size()) {
    return cmd_cntx->SendError(kUintErr);
  }

  string hist;
  shard_set->pool()->at(tid)->AwaitBrief(
      [&]() { facade::Connection::GetRequestSizeHistogramThreadLocal(&hist); });
  rb->SendVerbatimString(hist);
}

void DebugCmd::Topk(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  DCHECK_GE(args.size(), 1u);

  string_view subcmd = ArgS(args, 0);
  if (absl::EqualsIgnoreCase(subcmd, "ON")) {
    uint32_t min_freq = 100;
    if (args.size() > 1) {
      if (!absl::SimpleAtoi(ArgS(args, 1), &min_freq))
        return cmd_cntx->SendError(kUintErr);
    }
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      cntx_->ns->GetDbSlice(es->shard_id()).StartSampleTopK(cntx_->db_index(), min_freq);
    });
    return rb->SendOk();
  }

  if (absl::EqualsIgnoreCase(subcmd, "OFF")) {
    vector<DbSlice::SamplingResult> results(shard_set->size());
    uint32_t max_keys = 50;

    if (args.size() > 1) {
      if (!absl::SimpleAtoi(ArgS(args, 1), &max_keys))
        return cmd_cntx->SendError(kUintErr);
    }

    shard_set->RunBriefInParallel([&](EngineShard* es) {
      results[es->shard_id()] =
          cntx_->ns->GetDbSlice(es->shard_id()).StopSampleTopK(cntx_->db_index());
    });

    vector<pair<uint64_t, string>> items;
    uint64_t total_keys = 0;
    for (const auto& res : results) {
      total_keys += res.total_samples;
      for (const auto& k_v : res.top_keys) {
        items.emplace_back(k_v.second, k_v.first);
        push_heap(items.begin(), items.end(), std::greater<>());
        if (items.size() > max_keys) {
          pop_heap(items.begin(), items.end(), std::greater<>());
          items.pop_back();
        }
      }
    }

    rb->StartArray(2);
    rb->SendLong(total_keys);
    rb->StartArray(items.size());
    for (const auto& k_v : items) {
      rb->SendBulkString(StrCat(k_v.second, ":", k_v.first));
    }
    return;
  }

  return cmd_cntx->SendError(kSyntaxErr);
}

void DebugCmd::Keys(CmdArgList args, CommandContext* cmd_cntx) {
  string_view subcmd = ArgS(args, 0);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (absl::EqualsIgnoreCase(subcmd, "ON")) {
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      cntx_->ns->GetDbSlice(es->shard_id()).StartSampleKeys(cntx_->db_index());
    });
    return rb->SendOk();
  }

  if (absl::EqualsIgnoreCase(subcmd, "OFF")) {
    atomic_uint64_t uniq_keys{0}, total_samples{0};
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      DbSlice::UniqueSampleResult res =
          cntx_->ns->GetDbSlice(es->shard_id()).StopSampleKeys(cntx_->db_index());
      uniq_keys.fetch_add(res.unique_keys_count, memory_order_relaxed);
      total_samples.fetch_add(res.total_samples, memory_order_relaxed);
    });

    uint64_t arr[2] = {uniq_keys.load(), total_samples.load()};
    return rb->SendLongArr(absl::MakeConstSpan(arr));
  }

  return cmd_cntx->SendError(kSyntaxErr);
}

void DebugCmd::Values(CmdArgList args, CommandContext* cmd_cntx) {
  string_view subcmd = ArgS(args, 0);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (absl::EqualsIgnoreCase(subcmd, "ON")) {
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      cntx_->ns->GetDbSlice(es->shard_id()).StartSampleValues(cntx_->db_index());
    });
    return rb->SendOk();
  }

  vector<unique_ptr<base::Histogram>> histograms(shard_set->size());
  if (absl::EqualsIgnoreCase(subcmd, "OFF")) {
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      histograms[es->shard_id()] =
          cntx_->ns->GetDbSlice(es->shard_id()).StopSampleValues(cntx_->db_index());
    });

    base::Histogram merged_histogram;
    for (const auto& hist : histograms) {
      if (hist) {
        merged_histogram.Merge(*hist);
      }
    }
    return rb->SendVerbatimString(merged_histogram.ToString());
  }

  return cmd_cntx->SendError(kSyntaxErr);
}

static size_t PostProcessHist(HufHist* dest) {
  size_t total_freq = 0;
  auto& hist = dest->hist;
  unsigned max_freq = 0;

  for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
    // raw_size may count less characters than the actual size because
    // we may cut the counting early.
    total_freq += hist[i];
    if (hist[i] == 0) {
      hist[i] = 1;  // Avoid zero frequency symbols.
    }
  }

  if (total_freq > kMaxFreqTotal) {
    // huffman encoder has a bug with frequencies too high, so we scale down everything
    // to avoid overflow.
    double scale = static_cast<double>(max_freq) / kMaxFreqTotal;
    for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
      hist[i] = unsigned(hist[i] / scale);
      if (hist[i] == 0) {
        hist[i] = 1;  // Avoid zero frequency symbols.
      }
    }
  }
  return total_freq;
}

void DebugCmd::Compression(CmdArgList args, CommandContext* cmd_cntx) {
  CompactObjType type = kInvalidCompactObjType;
  CmdArgParser parser(args);
  string bintable;
  bool print_bintable = false;

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (parser.Check("SET", &bintable)) {
    // SET <bintable> [type]
    string raw;
    atomic_bool succeed = absl::Base64Unescape(bintable, &raw);
    if (succeed) {
      CompactObj::HuffmanDomain domain = CompactObj::HUFF_KEYS;
      if (parser.HasNext()) {
        string_view type_str = parser.Next();
        type = ObjTypeFromString(type_str);
        if (type != OBJ_STRING) {  // Currently only string type is supported.
          return cmd_cntx->SendError(kSyntaxErr);
        }
        domain = CompactObj::HUFF_STRING_VALUES;
      }
      shard_set->RunBriefInParallel([&](EngineShard* shard) {
        if (!CompactObj::InitHuffmanThreadLocal(domain, raw)) {
          succeed = false;
        }
      });
    }
    return succeed ? rb->SendOk() : cmd_cntx->SendError("Failed to set bintable");
  }

  if (parser.Check("EXPORT")) {
    print_bintable = true;
  } else if (parser.Check("IMPORT", &bintable)) {
    string raw;
    bool succeed = absl::Base64Unescape(bintable, &raw);
    if (succeed) {
      bintable = raw;
    }
  }

  if (parser.HasNext()) {
    string_view type_str = parser.Next();
    type = ObjTypeFromString(type_str);
    if (type == kInvalidCompactObjType) {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  fb2::Mutex mu;
  HufHist hist;
  shard_set->RunBlockingInParallel([&](EngineShard* shard) {
    HufHist local;
    DoComputeHist(type, shard, cntx_, &local);
    std::unique_lock lk(mu);
    hist.Merge(local);
  });

  size_t num_bits = 0, compressed_size = 0, raw_size = 0;
  if (hist.max_symbol) {
    HuffmanEncoder huff_enc;
    string err_msg;

    raw_size = PostProcessHist(&hist);

    if (bintable.empty()) {
      if (!huff_enc.Build(hist.hist.data(), HufHist::kMaxSymbol, &err_msg)) {
        return cmd_cntx->SendError(StrCat("Internal error: ", err_msg));
      }
    } else {
      // Try to read the bintable and create a ctable from it.
      if (!huff_enc.Load(bintable, &err_msg)) {
        return cmd_cntx->SendError(StrCat("Internal error: ", err_msg));
      }
    }
    num_bits = huff_enc.num_bits();
    compressed_size = huff_enc.EstimateCompressedSize(hist.hist.data(), HufHist::kMaxSymbol);

    if (print_bintable) {
      bintable = huff_enc.Export();
    } else {
      bintable.clear();
    }
  }

  unsigned map_len = print_bintable ? 6 : 5;

  rb->StartCollection(map_len, CollectionType::MAP);
  rb->SendSimpleString("max_symbol");
  rb->SendLong(hist.max_symbol);

  rb->SendSimpleString("max_bits");
  rb->SendLong(num_bits);
  rb->SendSimpleString("raw_size");
  rb->SendLong(raw_size);
  rb->SendSimpleString("compressed_size");
  rb->SendLong(compressed_size);
  rb->SendSimpleString("ratio");
  double ratio = raw_size > 0 ? static_cast<double>(compressed_size) / raw_size : 0;
  rb->SendDouble(ratio);
  if (print_bintable) {
    rb->SendSimpleString("bintable");
    rb->SendBulkString(absl::Base64Escape(bintable));
  }
}

void DebugCmd::IOStats(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  bool per_second = !args.empty() && absl::EqualsIgnoreCase(args[0], "PS");
  vector<IOStat> stats(shard_set->pool()->size());

  shard_set->pool()->AwaitBrief(
      [&](unsigned index, ProactorBase*) { stats[index].From(*facade::tl_facade_stats); });

  if (per_second) {
    ThisFiber::SleepFor(1s);
    vector<IOStat> stats2(shard_set->pool()->size());
    shard_set->pool()->AwaitBrief(
        [&](unsigned index, ProactorBase*) { stats2[index].From(*facade::tl_facade_stats); });

    for (size_t i = 0; i < stats.size(); ++i) {
      stats2[i] -= stats[i];
    }
    stats = std::move(stats2);
  }

  rb->StartArray(stats.size());
  for (const auto& stat : stats) {
    stat.Print(rb);
  }
}

void DebugCmd::Segments(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  vector<SegmentInfo> info(shard_set->size());

  shard_set->RunBlockingInParallel([&](EngineShard* shard) {
    auto& hist = info[shard->shard_id()];
    DoSegmentHist(shard, cntx_, &hist);
  });

  base::Histogram hist;
  for (const auto& seg_info : info) {
    hist.Merge(seg_info.hist);
  }
  string result;
  absl::StrAppend(&result, "___begin segment info___\n\n");
  absl::StrAppend(&result, "Segment Capacity: ", PrimeTable::kSegCapacity, "\n");
  absl::StrAppend(&result, "Segment Size Histogram: \n");
  absl::StrAppend(&result, hist.ToString(), "\n");
  rb->SendVerbatimString(result);
}

void DebugCmd::CompactTable(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  double threshold = 0.25;
  if (args.size() > 0) {
    if (!absl::SimpleAtod(facade::ToSV(args[0]), &threshold)) {
      return rb->SendError("Invalid threshold value");
    }
    if (threshold <= 0.0 || threshold > 1.0) {
      return rb->SendError("Threshold must be between 0 and 1");
    }
  }

  const DbIndex db_idx = cmd_cntx->server_conn_cntx()->db_index();
  std::vector<size_t> results(shard_set->size());
  shard_set->RunBlockingInParallel([&](EngineShard* shard) {
    results[shard->shard_id()] = shard->CompactTable(threshold, db_idx);
  });

  rb->SendLong(std::accumulate(results.begin(), results.end(), 0ul));
}

void DebugCmd::CountUniqueStrings(const CommandContext* cmd_cntx) const {
  using PerShardStats = std::array<std::unique_ptr<UniqueStrings>, OBJ_HASH + 1>;

  vector<PerShardStats> all_shards(shard_set->size());
  auto cb = [&all_shards](PrimeIterator it) {
    const unsigned obj_type = it->second.ObjType();
    if (obj_type != OBJ_HASH && obj_type != OBJ_LIST && obj_type != OBJ_SET &&
        obj_type != OBJ_ZSET) {
      return;
    }

    auto& entry = all_shards[EngineShard::tlocal()->shard_id()][obj_type];
    if (!entry) {
      entry = std::make_unique<UniqueStrings>();
    }

    if (obj_type == OBJ_HASH)
      entry->AddHMap(it->second);
    else if (obj_type == OBJ_LIST)
      entry->AddList(it->second);
    else if (obj_type == OBJ_SET)
      entry->AddSet(it->second);
    else if (obj_type == OBJ_ZSET)
      entry->AddZSet(it->second);
  };

  TraverseAllEntries(absl::GetFlag(FLAGS_background_debug_jobs), cntx_, cb);

  std::array<UniqueStrings, OBJ_HASH + 1> summary;
  for (const PerShardStats& shard_stat : all_shards) {
    for (CompactObjType obj_type = OBJ_LIST; obj_type <= OBJ_HASH; ++obj_type) {
      if (shard_stat[obj_type]) {
        summary[obj_type].Add(*shard_stat[obj_type]);
      }
    }
  }

  string result;
  StrAppend(&result, "___begin unique string stats___\n\n");

  for (CompactObjType obj_type = OBJ_LIST; obj_type <= OBJ_HASH; ++obj_type) {
    const UniqueStrings& stats = summary[obj_type];
    if (stats.total_count == 0) {
      continue;
    }
    StrAppend(&result, "OBJECT:", ObjTypeToString(obj_type), "\n");
    StrAppend(&result, "________________________________________________________________\n");
    StrAppend(&result, stats.ToString("Strings"));
    StrAppend(&result, "\n");
  }

  StrAppend(&result, "___end unique string stats___\n");
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(result);
}

void DebugCmd::DoPopulateBatch(const PopulateOptions& options, const PopulateBatch& batch) {
  auto* exec_cid = sf_.service().mutable_registry()->Find("EXEC");
  boost::intrusive_ptr<Transaction> local_tx = new Transaction{exec_cid};
  local_tx->StartMultiNonAtomic();
  boost::intrusive_ptr<Transaction> stub_tx =
      new Transaction{local_tx.get(), EngineShard::tlocal()->shard_id(), nullopt};

  absl::InlinedVector<string_view, 5> args_view;
  facade::CapturingReplyBuilder crb;
  absl::InsecureBitGen gen;
  CommandContext cmd_cntx{&crb, cntx_};
  cmd_cntx.SetupTx(exec_cid, stub_tx.get());

  for (unsigned i = 0; i < batch.sz; ++i) {
    string key = StrCat(options.prefix, ":", batch.index[i]);
    uint32_t elements_left = options.elements;

    // limit rss grow by 32K by limiting the element count in each command.
    // for stream we use 4 fields and (elements / 4) stream entries
    uint32_t max_batch_elements =
        options.type == "STREAM" ? 4 : std::max(32_KB / options.val_size, 1ULL);
    while (elements_left) {
      uint32_t populate_elements = std::min(max_batch_elements, elements_left);
      if (options.type == "STREAM" && populate_elements > 4) {
        // populate_elements % 4 == 0, because we add 4 fields into one stream entry
        populate_elements -= (populate_elements % 4);
      }
      elements_left -= populate_elements;
      auto [cid, args] = GeneratePopulateCommand(options.type, key, options.val_size,
                                                 options.populate_random_values, populate_elements,
                                                 *sf_.service().mutable_registry(), &gen);
      if (!cid) {
        LOG_EVERY_N(WARNING, 10'000) << "Unable to find command, was it renamed?";
        break;
      }

      args_view.clear();
      for (auto& arg : args) {
        args_view.push_back(arg);
      }
      auto args_span = absl::MakeSpan(args_view);
      stub_tx->MultiSwitchCmd(cid);
      crb.SetReplyMode(ReplyMode::NONE);
      stub_tx->InitByArgs(cntx_->ns, cntx_->conn_state.db_index, args_span);
      cmd_cntx.UpdateCid(cid);
      sf_.service().InvokeCmd(args_span, &cmd_cntx);
    }

    if (options.expire_ttl_range.has_value()) {
      uint32_t start = options.expire_ttl_range->first;
      uint32_t end = options.expire_ttl_range->second;
      uint32_t expire_ttl = rand() % (end - start) + start;
      VLOG(1) << "set key " << key << " expire ttl as " << expire_ttl;
      auto cid = sf_.service().mutable_registry()->Find("EXPIRE");
      absl::InlinedVector<string, 5> args;
      args.push_back(std::move(key));
      args.push_back(to_string(expire_ttl));
      args_view.clear();
      for (auto& arg : args) {
        args_view.push_back(arg);
      }
      auto args_span = absl::MakeSpan(args_view);
      crb.SetReplyMode(ReplyMode::NONE);
      stub_tx->MultiSwitchCmd(cid);
      stub_tx->InitByArgs(cntx_->ns, cntx_->conn_state.db_index, args_span);
      cmd_cntx.UpdateCid(cid);
      sf_.service().InvokeCmd(args_span, &cmd_cntx);
    }
  }

  local_tx->UnlockMulti();
}

}  // namespace dfly


================================================
FILE: src/server/debugcmd.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "server/cluster/cluster_defs.h"
#include "server/conn_context.h"

namespace dfly {

namespace cluster {
class ClusterFamily;
}

class EngineShardSet;
class ServerFamily;

class DebugCmd {
 private:
  struct PopulateOptions {
    uint64_t total_count = 0;
    std::string_view prefix{"key"};
    uint32_t val_size = 16;
    bool populate_random_values = false;
    std::string type{"STRING"};
    uint32_t elements = 1;

    std::optional<cluster::SlotRange> slot_range;
    std::optional<std::pair<uint32_t, uint32_t>> expire_ttl_range;
  };

 public:
  DebugCmd(ServerFamily* owner, cluster::ClusterFamily* cf, ConnectionContext* cntx);

  void Run(CmdArgList args, CommandContext* cmd_cntx);

  static void Shutdown();

 private:
  void Populate(CmdArgList args, CommandContext* cmd_cntx);
  static std::optional<PopulateOptions> ParsePopulateArgs(CmdArgList args,
                                                          CommandContext* cmd_cntx);
  void PopulateRangeFiber(uint64_t from, uint64_t count, const PopulateOptions& opts);

  void Reload(CmdArgList args, CommandContext* cmd_cntx);
  void Replica(CmdArgList args, CommandContext* cmd_cntx);
  void Migration(CmdArgList args, CommandContext* cmd_cntx);

  void Exec(CommandContext* cmd_cntx);
  void Inspect(std::string_view key, CmdArgList args, CommandContext* cmd_cntx);
  void Watched(CommandContext* cmd_cntx);
  void TxAnalysis(CommandContext* cmd_cntx);
  void ObjHist(CommandContext* cmd_cntx);
  void Stacktrace(CommandContext* cmd_cntx);
  void Shards(CommandContext* cmd_cntx);
  void LogTraffic(CmdArgList, CommandContext* cmd_cntx);
  void RecvSize(std::string_view param, CommandContext* cmd_cntx);
  void Topk(CmdArgList args, CommandContext* cmd_cntx);
  void Keys(CmdArgList args, CommandContext* cmd_cntx);
  void Values(CmdArgList args, CommandContext* cmd_cntx);
  void Compression(CmdArgList args, CommandContext* cmd_cntx);
  void IOStats(CmdArgList args, CommandContext* cmd_cntx);
  void Segments(CmdArgList args, CommandContext* cmd_cntx);
  void CompactTable(CmdArgList args, CommandContext* cmd_cntx);
  void CountUniqueStrings(const CommandContext* cmd_cntx) const;
  struct PopulateBatch {
    DbIndex dbid;
    uint64_t index[32];
    uint64_t sz = 0;

    explicit PopulateBatch(DbIndex id) : dbid(id) {
    }
  };

  void DoPopulateBatch(const PopulateOptions& options, const PopulateBatch& batch);

  ServerFamily& sf_;
  cluster::ClusterFamily& cf_;
  ConnectionContext* cntx_;
};

}  // namespace dfly


================================================
FILE: src/server/detail/compressor.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/detail/compressor.h"

#include <absl/flags/flag.h>
#include <lz4frame.h>
#include <zstd.h>

#include "base/logging.h"

ABSL_FLAG(int, compression_level, 2, "The compression level to use on zstd/lz4 compression");

namespace dfly::detail {

using namespace std;

class ZstdCompressor : public CompressorImpl {
 public:
  ZstdCompressor() {
    cctx_ = ZSTD_createCCtx();
  }
  ~ZstdCompressor() {
    ZSTD_freeCCtx(cctx_);
  }

  io::Result<io::Bytes> Compress(io::Bytes data);

 private:
  ZSTD_CCtx* cctx_;
  base::PODArray<uint8_t> compr_buf_;
};

io::Result<io::Bytes> ZstdCompressor::Compress(io::Bytes data) {
  size_t buf_size = ZSTD_compressBound(data.size());
  if (compr_buf_.capacity() < buf_size) {
    compr_buf_.reserve(buf_size);
  }
  size_t compressed_size = ZSTD_compressCCtx(cctx_, compr_buf_.data(), compr_buf_.capacity(),
                                             data.data(), data.size(), compression_level_);

  if (ZSTD_isError(compressed_size)) {
    LOG(ERROR) << "ZSTD_compressCCtx failed with error " << ZSTD_getErrorName(compressed_size);
    return nonstd::make_unexpected(make_error_code(errc::operation_not_supported));
  }
  compressed_size_total_ += compressed_size;
  uncompressed_size_total_ += data.size();
  return io::Bytes(compr_buf_.data(), compressed_size);
}

class Lz4Compressor : public CompressorImpl {
 public:
  Lz4Compressor() {
    LZ4F_errorCode_t code = LZ4F_createCompressionContext(&cctx_, LZ4F_VERSION);
    CHECK(!LZ4F_isError(code));
  }

  ~Lz4Compressor() {
    LZ4F_errorCode_t code = LZ4F_freeCompressionContext(cctx_);
    CHECK(!LZ4F_isError(code));
  }

  // compress a string of data
  io::Result<io::Bytes> Compress(io::Bytes data);

 private:
  LZ4F_cctx* cctx_;
};

io::Result<io::Bytes> Lz4Compressor::Compress(io::Bytes data) {
  LZ4F_preferences_t lz4_pref = LZ4F_INIT_PREFERENCES;
  lz4_pref.compressionLevel = compression_level_;
  lz4_pref.frameInfo.contentSize = data.size();

  size_t buf_size = LZ4F_compressFrameBound(data.size(), &lz4_pref);
  if (compr_buf_.capacity() < buf_size) {
    compr_buf_.reserve(buf_size);
  }

  size_t frame_size =
      LZ4F_compressFrame_usingCDict(cctx_, compr_buf_.data(), compr_buf_.capacity(), data.data(),
                                    data.size(), nullptr /* dict */, &lz4_pref);
  if (LZ4F_isError(frame_size)) {
    LOG(ERROR) << "LZ4F_compressFrame failed with error " << LZ4F_getErrorName(frame_size);
    return nonstd::make_unexpected(make_error_code(errc::operation_not_supported));
  }

  compressed_size_total_ += frame_size;
  uncompressed_size_total_ += data.size();
  return io::Bytes(compr_buf_.data(), frame_size);
}

CompressorImpl::CompressorImpl() {
  compression_level_ = absl::GetFlag(FLAGS_compression_level);
}

CompressorImpl::~CompressorImpl() {
  VLOG(1) << "compressed size: " << compressed_size_total_;
  VLOG(1) << "uncompressed size: " << uncompressed_size_total_;
}

unique_ptr<CompressorImpl> CompressorImpl::CreateZstd() {
  return make_unique<ZstdCompressor>();
}

unique_ptr<CompressorImpl> CompressorImpl::CreateLZ4() {
  return make_unique<Lz4Compressor>();
}

}  // namespace dfly::detail


================================================
FILE: src/server/detail/compressor.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>

#include "base/pod_array.h"
#include "io/io.h"

namespace dfly::detail {

class CompressorImpl {
 public:
  static std::unique_ptr<CompressorImpl> CreateZstd();
  static std::unique_ptr<CompressorImpl> CreateLZ4();

  CompressorImpl();
  virtual ~CompressorImpl();
  virtual io::Result<io::Bytes> Compress(io::Bytes data) = 0;

 protected:
  int compression_level_ = 1;
  size_t compressed_size_total_ = 0;
  size_t uncompressed_size_total_ = 0;
  base::PODArray<uint8_t> compr_buf_;
};

}  // namespace dfly::detail


================================================
FILE: src/server/detail/decompress.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/detail/decompress.h"

#include <lz4frame.h>
#include <zstd.h>

#include "base/logging.h"
#include "server/error.h"
#include "server/rdb_extensions.h"

namespace dfly {

namespace detail {

using io::IoBuf;
using rdb::errc;
using namespace std;

inline auto Unexpected(errc ev) {
  return nonstd::make_unexpected(RdbError(ev));
}

class ZstdDecompress : public DecompressImpl {
 public:
  ZstdDecompress() {
    dctx_ = ZSTD_createDCtx();
  }
  ~ZstdDecompress() {
    ZSTD_freeDCtx(dctx_);
  }

  io::Result<io::IoBuf*> Decompress(std::string_view str);

 private:
  ZSTD_DCtx* dctx_;
};

io::Result<io::IoBuf*> ZstdDecompress::Decompress(std::string_view str) {
  // Prepare membuf memory to uncompressed string.
  auto uncomp_size = ZSTD_getFrameContentSize(str.data(), str.size());
  if (uncomp_size == ZSTD_CONTENTSIZE_UNKNOWN) {
    LOG(ERROR) << "Zstd compression missing frame content size";
    return Unexpected(errc::invalid_encoding);
  }
  if (uncomp_size == ZSTD_CONTENTSIZE_ERROR) {
    LOG(ERROR) << "Invalid ZSTD compressed string";
    return Unexpected(errc::invalid_encoding);
  }

  uncompressed_mem_buf_.Reserve(uncomp_size + 1);

  // Uncompress string to membuf
  IoBuf::Bytes dest = uncompressed_mem_buf_.AppendBuffer();
  if (dest.size() < uncomp_size) {
    return Unexpected(errc::out_of_memory);
  }
  size_t const d_size =
      ZSTD_decompressDCtx(dctx_, dest.data(), dest.size(), str.data(), str.size());
  if (d_size == 0 || d_size != uncomp_size) {
    LOG(ERROR) << "Invalid ZSTD compressed string";
    return Unexpected(errc::rdb_file_corrupted);
  }
  uncompressed_mem_buf_.CommitWrite(d_size);

  // Add opcode of compressed blob end to membuf.
  dest = uncompressed_mem_buf_.AppendBuffer();
  if (dest.size() < 1) {
    return Unexpected(errc::out_of_memory);
  }
  dest[0] = RDB_OPCODE_COMPRESSED_BLOB_END;
  uncompressed_mem_buf_.CommitWrite(1);

  return &uncompressed_mem_buf_;
}

class Lz4Decompress : public DecompressImpl {
 public:
  Lz4Decompress() {
    auto result = LZ4F_createDecompressionContext(&dctx_, LZ4F_VERSION);
    CHECK(!LZ4F_isError(result));
  }
  ~Lz4Decompress() {
    auto result = LZ4F_freeDecompressionContext(dctx_);
    CHECK(!LZ4F_isError(result));
  }

  io::Result<base::IoBuf*> Decompress(std::string_view str);

 private:
  LZ4F_dctx* dctx_;
};

io::Result<base::IoBuf*> Lz4Decompress::Decompress(std::string_view data) {
  LZ4F_frameInfo_t frame_info;
  size_t frame_size = data.size();

  // Get content size from frame data
  size_t consumed = frame_size;  // The nb of bytes consumed from data will be written into consumed
  size_t res = LZ4F_getFrameInfo(dctx_, &frame_info, data.data(), &consumed);
  if (LZ4F_isError(res)) {
    LOG(ERROR) << "LZ4F_getFrameInfo failed with error " << LZ4F_getErrorName(res);
    return Unexpected(errc::rdb_file_corrupted);
    ;
  }

  if (frame_info.contentSize == 0) {
    LOG(ERROR) << "Missing frame content size";
    return Unexpected(errc::rdb_file_corrupted);
  }

  // reserve place for uncompressed data and end opcode
  size_t reserve = frame_info.contentSize + 1;
  uncompressed_mem_buf_.Reserve(reserve);
  IoBuf::Bytes dest = uncompressed_mem_buf_.AppendBuffer();
  if (dest.size() < reserve) {
    return Unexpected(errc::out_of_memory);
  }

  // Uncompress data to membuf
  string_view src = data.substr(consumed);
  size_t src_size = src.size();

  size_t ret = 1;
  while (ret != 0) {
    IoBuf::Bytes dest = uncompressed_mem_buf_.AppendBuffer();
    size_t dest_capacity = dest.size();

    // It will read up to src_size bytes from src,
    // and decompress data into dest, of capacity dest_capacity
    // The nb of bytes consumed from src will be written into src_size
    // The nb of bytes decompressed into dest will be written into dest_capacity
    ret = LZ4F_decompress(dctx_, dest.data(), &dest_capacity, src.data(), &src_size, nullptr);
    if (LZ4F_isError(ret)) {
      LOG(ERROR) << "LZ4F_decompress failed with error " << LZ4F_getErrorName(ret);
      return Unexpected(errc::rdb_file_corrupted);
    }
    consumed += src_size;

    uncompressed_mem_buf_.CommitWrite(dest_capacity);
    src = src.substr(src_size);
    src_size = src.size();
  }
  if (consumed != frame_size) {
    return Unexpected(errc::rdb_file_corrupted);
  }
  if (uncompressed_mem_buf_.InputLen() != frame_info.contentSize) {
    return Unexpected(errc::rdb_file_corrupted);
  }

  // Add opcode of compressed blob end to membuf.
  dest = uncompressed_mem_buf_.AppendBuffer();
  if (dest.size() < 1) {
    return Unexpected(errc::out_of_memory);
  }
  dest[0] = RDB_OPCODE_COMPRESSED_BLOB_END;
  uncompressed_mem_buf_.CommitWrite(1);

  return &uncompressed_mem_buf_;
}

unique_ptr<DecompressImpl> DecompressImpl::CreateLZ4() {
  return make_unique<Lz4Decompress>();
}

unique_ptr<DecompressImpl> DecompressImpl::CreateZstd() {
  return make_unique<ZstdDecompress>();
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/decompress.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <memory>

#include "io/io.h"
#include "io/io_buf.h"

namespace dfly {

namespace detail {

class DecompressImpl {
 public:
  static std::unique_ptr<DecompressImpl> CreateLZ4();
  static std::unique_ptr<DecompressImpl> CreateZstd();

  DecompressImpl() : uncompressed_mem_buf_{1U << 14} {
  }
  virtual ~DecompressImpl() {
  }

  virtual io::Result<io::IoBuf*> Decompress(std::string_view str) = 0;

 protected:
  io::IoBuf uncompressed_mem_buf_;
};

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/save_stages_controller.cc
================================================

// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/detail/save_stages_controller.h"

#include <absl/strings/match.h>

#include <numeric>

#include "base/flags.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "server/detail/snapshot_storage.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/script_mgr.h"
#include "server/transaction.h"
#include "strings/human_readable.h"

using namespace std;

ABSL_DECLARE_FLAG(string, dir);
ABSL_DECLARE_FLAG(string, dbfilename);

namespace dfly {
namespace detail {

using namespace util;
using absl::GetFlag;
using absl::StrCat;
using fb2::OpenLinux;

namespace fs = std::filesystem;

namespace {

// Create a directory and all its parents if they don't exist.
error_code CreateDirs(fs::path dir_path) {
  error_code ec;
  fs::file_status dir_status = fs::status(dir_path, ec);
  if (ec == errc::no_such_file_or_directory) {
    fs::create_directories(dir_path, ec);
    if (!ec)
      dir_status = fs::status(dir_path, ec);
  }
  return ec;
}

// modifies 'filename' to be "filename-postfix.extension"
void SetExtension(absl::AlphaNum postfix, string_view extension, fs::path* filename) {
  filename->replace_extension();  // clear if exists
  *filename += StrCat("-", postfix, extension);
}

void ExtendDfsFilenameWithShard(int shard, string_view extension, fs::path* filename) {
  // dragonfly snapshot.
  SetExtension(absl::Dec(shard, absl::kZeroPad4), extension, filename);
}

}  // namespace

GenericError ValidateFilename(const fs::path& filename, bool new_version) {
  if (filename.empty()) {
    return {};
  }

  string filename_str = filename.string();
  if (filename_str.front() == '"') {
    return {
        "filename should not start with '\"', could it be that you put quotes in the flagfile?"};
  }

  bool is_cloud_path = IsCloudPath(filename_str);

  if (!filename.parent_path().empty() && !is_cloud_path) {
    return {absl::StrCat("filename may not contain directory separators (Got \"", filename.c_str(),
                         "\"). dbfilename should specify the filename without the directory")};
  }

  if (!filename.has_extension()) {
    return {};
  }

  if (new_version) {
    if (absl::EqualsIgnoreCase(filename.extension().c_str(), ".rdb")) {
      return {absl::StrCat(
          "DF snapshot format is used but '.rdb' extension was given. Use --nodf_snapshot_format "
          "or remove the filename extension.")};
    } else {
      return {absl::StrCat("DF snapshot format requires no filename extension. Got \"",
                           filename.extension().c_str(), "\"")};
    }
  }
  if (!new_version && !absl::EqualsIgnoreCase(filename.extension().c_str(), ".rdb")) {
    return {absl::StrCat("Bad filename extension \"", filename.extension().c_str(),
                         "\" for SAVE with type RDB")};
  }
  return {};
}

GenericError RdbSnapshot::Start(SaveMode save_mode, const std::string& path,
                                const RdbSaver::GlobalData& glob_data,
                                const std::string& snapshot_id) {
  VLOG(1) << "Saving RDB " << path;

  CHECK_NOTNULL(snapshot_storage_);
  auto res = snapshot_storage_->OpenWriteFile(path);
  if (!res) {
    return res.error();
  }

  auto [file, file_type] = *res;
  io_sink_.reset(file);

  is_linux_file_ = file_type & FileType::IO_URING;
  bool align_writes = (file_type & FileType::DIRECT) != 0;
  saver_.reset(
      new RdbSaver(io_sink_.get(), save_mode, align_writes, snapshot_id, DflyVersion::CURRENT_VER));

  return saver_->SaveHeader(std::move(glob_data));
}

error_code RdbSnapshot::SaveBody() {
  return saver_->SaveBody(cntx_);
}

error_code RdbSnapshot::WaitSnapshotInShard(EngineShard* shard) {
  return saver_->WaitSnapshotInShard(shard);
}

size_t RdbSnapshot::GetSaveBuffersSize() {
  CHECK(saver_);
  return saver_->GetTotalBuffersSize();
}

void RdbSnapshot::FillFreqMap() {
  saver_->FillFreqMap(&freq_map_);
}

RdbSaver::SnapshotStats RdbSnapshot::GetCurrentSnapshotProgress() const {
  CHECK(saver_);
  return saver_->GetCurrentSnapshotProgress();
}

error_code RdbSnapshot::Close() {
#ifdef __linux__
  if (is_linux_file_) {
    return static_cast<LinuxWriteWrapper*>(io_sink_.get())->Close();
  }
#endif

  error_code ec;

  // S3 implementation is stack hungry. We use a fiber to close the file to
  // avoid wasting stack space.
  auto fb = ProactorBase::me()->LaunchFiber(
      fb2::Launch::post, boost::context::fixedsize_stack{40 * 1024}, "write_file_close",
      [&] { ec = static_cast<io::WriteFile*>(io_sink_.get())->Close(); });
  fb.Join();
  return ec;
}

void RdbSnapshot::StartInShard(EngineShard* shard) {
  saver_->StartSnapshotInShard(false, &cntx_, shard);
  started_shards_.fetch_add(1, memory_order_relaxed);
}

SaveStagesController::SaveStagesController(SaveStagesInputs&& inputs)
    : SaveStagesInputs{std::move(inputs)} {
  start_time_ = time(NULL);
}

SaveStagesController::~SaveStagesController() {
  if (!snapshots_.empty() && snapshots_[0].first) {
    LOG(INFO) << "Forcefully closing save controller";
    WaitAllSnapshots();
    Finalize();
  }
}

std::optional<SaveInfo> SaveStagesController::Init() {
  if (auto err = BuildFullPath(); err) {
    shared_err_ = err;
    return GetSaveInfo();
  }

  snapshots_.resize(use_dfs_format_ ? shard_set->size() + 1 : 1);
  for (auto& [snapshot, _] : snapshots_)
    snapshot = make_unique<RdbSnapshot>(fq_threadpool_, snapshot_storage_.get());

  return {};
}

void SaveStagesController::Start() {
  if (use_dfs_format_)
    SaveDfs();
  else
    SaveRdb();
}

void SaveStagesController::WaitAllSnapshots() {
  if (use_dfs_format_) {
    shard_set->RunBlockingInParallel([&](EngineShard* shard) { WaitSnapshotInShard(shard); });
    SaveBody(shard_set->size());
  } else {
    SaveBody(0);
  }
}

SaveInfo SaveStagesController::Finalize() {
  RunStage(&SaveStagesController::CloseCb);

  if (auto err = FinalizeFileMovement(); err) {
    shared_err_ = err;
  }

  return GetSaveInfo();
}

size_t SaveStagesController::GetSaveBuffersSize() {
  std::atomic<size_t> total_bytes{0};

  auto add_snapshot_bytes = [this, &total_bytes](ShardId sid) {
    if (auto& snapshot = snapshots_[sid].first; snapshot && snapshot->HasStarted()) {
      total_bytes.fetch_add(snapshot->GetSaveBuffersSize(), memory_order_relaxed);
    }
  };

  if (!snapshots_.empty()) {
    if (use_dfs_format_) {
      shard_set->RunBriefInParallel([&](EngineShard* es) { add_snapshot_bytes(es->shard_id()); });

    } else {
      // When rdb format save is running, there is only one rdb saver instance, it is running on the
      // connection thread that runs the save command.
      add_snapshot_bytes(0);
    }
  }

  return total_bytes.load(memory_order_relaxed);
}

RdbSaver::SnapshotStats SaveStagesController::GetCurrentSnapshotProgress() const {
  if (snapshots_.empty()) {
    return {0, 0};
  }

  std::vector<RdbSaver::SnapshotStats> results(snapshots_.size());
  auto fetch = [this, &results](ShardId sid) {
    if (auto& snapshot = snapshots_[sid].first; snapshot && snapshot->HasStarted()) {
      results[sid] = snapshot->GetCurrentSnapshotProgress();
    }
  };

  if (use_dfs_format_) {
    shard_set->RunBriefInParallel([&](EngineShard* es) { fetch(es->shard_id()); });
    RdbSaver::SnapshotStats init{0, 0};
    return std::accumulate(
        results.begin(), results.end(), init, [](auto init, auto pr) -> RdbSaver::SnapshotStats {
          return {init.current_keys + pr.current_keys, init.total_keys + pr.total_keys};
        });
  }
  fetch(0);
  return results[0];
}

// In the new version (.dfs) we store a file for every shard and one more summary file.
// Summary file is always last in snapshots array.
void SaveStagesController::SaveDfs() {
  // Extend all filenames with -{sid} or -summary and append .dfs.tmp
  const string_view ext = snapshot_storage_->IsCloud() ? ".dfs" : ".dfs.tmp";
  ShardId sid = 0;
  for (auto& [_, filename] : snapshots_) {
    filename = full_path_;
    if (sid < shard_set->size())
      ExtendDfsFilenameWithShard(sid++, ext, &filename);
    else
      SetExtension("summary", ext, &filename);
  }

  absl::InsecureBitGen gen;
  std::string snapshot_id = GetRandomHex(gen, 32);
  // Save summary file.
  SaveDfsSingle(nullptr, snapshot_id);

  // Save shard files.
  auto cb = [this, &snapshot_id](Transaction* t, EngineShard* shard) {
    SaveDfsSingle(shard, snapshot_id);
    return OpStatus::OK;
  };
  trans_->ScheduleSingleHop(std::move(cb));
}

// Start saving a dfs file on shard
void SaveStagesController::SaveDfsSingle(EngineShard* shard, const std::string& snapshot_id) {
  // for summary file, shard=null and index=shard_set->size(), see SaveDfs() above
  auto& [snapshot, filename] = snapshots_[shard ? shard->shard_id() : shard_set->size()];

  SaveMode mode = shard == nullptr ? SaveMode::SUMMARY : SaveMode::SINGLE_SHARD;
  bool is_summary = (shard == nullptr);
  auto glob_data = RdbSaver::GetGlobalData(service_, is_summary);

  if (auto err = snapshot->Start(mode, filename, glob_data, snapshot_id); err) {
    shared_err_ = err;
    snapshot.reset();
    return;
  }

  if (mode == SaveMode::SINGLE_SHARD)
    snapshot->StartInShard(shard);
}

// Save a single rdb file
void SaveStagesController::SaveRdb() {
  auto& [snapshot, filename] = snapshots_.front();

  filename = full_path_;
  if (!filename.has_extension())
    filename += ".rdb";
  if (!snapshot_storage_->IsCloud())
    filename += ".tmp";

  // RDB is a summary file (contains all global data)
  if (auto err =
          snapshot->Start(SaveMode::RDB, filename, RdbSaver::GetGlobalData(service_, true), "");
      err) {
    snapshot.reset();
    return;
  }

  auto cb = [snapshot = snapshot.get()](Transaction* t, EngineShard* shard) {
    snapshot->StartInShard(shard);
    return OpStatus::OK;
  };
  trans_->ScheduleSingleHop(std::move(cb));
}

uint32_t SaveStagesController::GetCurrentSaveDuration() {
  return time(nullptr) - start_time_;
}

SaveInfo SaveStagesController::GetSaveInfo() {
  SaveInfo info;
  info.save_time = start_time_;
  info.duration_sec = GetCurrentSaveDuration();

  if (shared_err_) {
    info.error = *shared_err_;
    return info;
  }

  fs::path resulting_path = full_path_;
  if (use_dfs_format_)
    SetExtension("summary", ".dfs", &resulting_path);
  else
    resulting_path.replace_extension();  // remove .tmp

  LOG(INFO) << "Saving " << resulting_path << " finished after "
            << strings::HumanReadableElapsedTime(info.duration_sec);

  info.freq_map.clear();
  for (const auto& k_v : rdb_name_map_) {
    info.freq_map.emplace_back(k_v);
  }

  info.file_name = resulting_path.generic_string();

  return info;
}

// Remove .tmp extension or delete files in case of error
GenericError SaveStagesController::FinalizeFileMovement() {
  if (snapshot_storage_->IsCloud())
    return {};
  DVLOG(1) << "FinalizeFileMovement start";

  // If the shared_err is set, the snapshot saving failed
  bool has_error = bool(shared_err_);

  std::error_code ec;
  for (const auto& [_, filename] : snapshots_) {
    if (has_error) {
      filesystem::remove(filename, ec);
    } else {
      filesystem::rename(filename, fs::path{filename}.replace_extension(""), ec);
    }
    if (ec)
      break;
  }
  DVLOG(1) << "FinalizeFileMovement end";
  return GenericError(ec);
}

// Build full path: get dir, try creating dirs, get filename with placeholder
GenericError SaveStagesController::BuildFullPath() {
  fs::path dir_path = cloud_uri_.empty() ? GetFlag(FLAGS_dir) : cloud_uri_;
  if (!dir_path.empty() && cloud_uri_.empty() && !IsCloudPath(GetFlag(FLAGS_dir))) {
    if (auto ec = CreateDirs(dir_path); ec)
      return {ec, "Failed to create directories"};
  }

  fs::path filename = basename_.empty() ? GetFlag(FLAGS_dbfilename) : basename_;
  if (filename.empty())
    return {"filename is not specified"};

  if (auto err = ValidateFilename(filename, use_dfs_format_); err)
    return err;

  SubstituteFilenamePlaceholders(
      &filename, {.ts = "%Y-%m-%dT%H:%M:%S", .year = "%Y", .month = "%m", .day = "%d"});

  tm time_tm;
  localtime_r(&start_time_, &time_tm);
  string src_format = filename.string();
  string dest_buf(src_format.size() + 128, '\0');
  size_t len = strftime(dest_buf.data(), dest_buf.size(), src_format.c_str(), &time_tm);
  if (len == 0)
    return {"invalid dbfilename format"};
  dest_buf.resize(len);

  full_path_ = dir_path / dest_buf;

  return {};
}

void SaveStagesController::SaveBody(unsigned index) {
  CHECK(!use_dfs_format_ || index == shard_set->size());  // used in rdb and df summary file
  if (auto& snapshot = snapshots_[index].first; snapshot && snapshot->HasStarted()) {
    shared_err_ = snapshot->SaveBody();
  }
}

void SaveStagesController::WaitSnapshotInShard(EngineShard* shard) {
  if (auto& snapshot = snapshots_[shard->shard_id()].first; snapshot && snapshot->HasStarted()) {
    shared_err_ = snapshot->WaitSnapshotInShard(shard);
  }
}

void SaveStagesController::CloseCb(unsigned index) {
  if (auto& snapshot = snapshots_[index].first; snapshot && snapshot->HasStarted()) {
    snapshot->FillFreqMap();
    shared_err_ = snapshot->Close();

    unique_lock lk{rdb_name_map_mu_};
    for (const auto& k_v : snapshot->freq_map())
      rdb_name_map_[RdbTypeName(k_v.first)] += k_v.second;
    lk.unlock();
    snapshot.reset();
  }

  if (auto* es = EngineShard::tlocal(); use_dfs_format_ && es)
    namespaces->GetDefaultNamespace().GetDbSlice(es->shard_id()).ResetUpdateEvents();
}

void SaveStagesController::RunStage(void (SaveStagesController::*cb)(unsigned)) {
  if (use_dfs_format_) {
    shard_set->RunBlockingInParallel([&](EngineShard* es) { (this->*cb)(es->shard_id()); });
    (this->*cb)(shard_set->size());
  } else {
    (this->*cb)(0);
  }
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/save_stages_controller.h
================================================

// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <filesystem>

#include "server/rdb_save.h"
#include "util/fibers/fiberqueue_threadpool.h"

namespace dfly {

class Transaction;
class Service;

namespace detail {

class SnapshotStorage;

struct SaveInfo {
  time_t save_time = 0;  // epoch time in seconds.
  uint32_t duration_sec = 0;
  std::string file_name;
  std::vector<std::pair<std::string_view, size_t>> freq_map;  // RDB_TYPE_xxx -> count mapping.
  GenericError error;
};

struct SaveStagesInputs {
  bool use_dfs_format_;
  std::string_view cloud_uri_;
  std::string_view basename_;
  Transaction* trans_;
  Service* service_;
  util::fb2::FiberQueueThreadPool* fq_threadpool_;
  std::shared_ptr<SnapshotStorage> snapshot_storage_;
  // true if the command that triggered this flow is bgsave. false otherwise.
  bool is_bg_save_;
};

class RdbSnapshot {
 public:
  RdbSnapshot(util::fb2::FiberQueueThreadPool* fq_tp, SnapshotStorage* snapshot_storage)
      : snapshot_storage_{snapshot_storage} {
  }

  GenericError Start(SaveMode save_mode, const string& path, const RdbSaver::GlobalData& glob_data,
                     const std::string& snapshot_id);
  void StartInShard(EngineShard* shard);

  error_code SaveBody();
  error_code WaitSnapshotInShard(EngineShard* shard);
  void FillFreqMap();
  error_code Close();
  size_t GetSaveBuffersSize();

  RdbSaver::SnapshotStats GetCurrentSnapshotProgress() const;

  const RdbTypeFreqMap& freq_map() const {
    return freq_map_;
  }

  bool HasStarted() const {
    return started_shards_.load(std::memory_order_relaxed) > 0 ||
           (saver_ && saver_->Mode() == SaveMode::SUMMARY);
  }

 private:
  bool is_linux_file_ = false;
  SnapshotStorage* snapshot_storage_ = nullptr;

  std::atomic_uint32_t started_shards_ = 0;

  unique_ptr<io::Sink> io_sink_;
  unique_ptr<RdbSaver> saver_;
  RdbTypeFreqMap freq_map_;

  ExecutionState cntx_{};
};

struct SaveStagesController : public SaveStagesInputs {
  explicit SaveStagesController(SaveStagesInputs&& input);
  // Objects of this class are used concurrently. Call this function
  // in a mutually exlusive context to avoid data races.
  // Also call this function before any call to `WaitAllSnapshots`
  // Returns empty optional on success and SaveInfo on failure
  std::optional<SaveInfo> Init();
  void Start();

  ~SaveStagesController();

  // Safe to call and no locks required
  void WaitAllSnapshots();

  // Call this function after you `WaitAllSnapshots`to finalize the chore.
  // Performs cleanup of the object internally.
  SaveInfo Finalize();
  size_t GetSaveBuffersSize();
  uint32_t GetCurrentSaveDuration();
  RdbSaver::SnapshotStats GetCurrentSnapshotProgress() const;

  bool IsBgSave() const {
    return is_bg_save_;
  }

 private:
  // In the new version (.dfs) we store a file for every shard and one more summary file.
  // Summary file is always last in snapshots array.
  void SaveDfs();

  // Start saving a dfs file on shard
  void SaveDfsSingle(EngineShard* shard, const std::string& snapshot_id);
  void SaveSnashot(EngineShard* shard);
  void WaitSnapshotInShard(EngineShard* shard);

  // Save a single rdb file
  void SaveRdb();

  SaveInfo GetSaveInfo();

  // Remove .tmp extension or delete files in case of error
  GenericError FinalizeFileMovement();

  // Build full path: get dir, try creating dirs, get filename with placeholder
  GenericError BuildFullPath();

  void SaveBody(unsigned index);

  void CloseCb(unsigned index);

  void RunStage(void (SaveStagesController::*cb)(unsigned));

  time_t start_time_;
  std::filesystem::path full_path_;

  AggregateGenericError shared_err_;
  std::vector<std::pair<std::unique_ptr<RdbSnapshot>, std::filesystem::path>> snapshots_;

  absl::flat_hash_map<string_view, size_t> rdb_name_map_;
  util::fb2::Mutex rdb_name_map_mu_;
  bool is_bg_save_ = false;
};

GenericError ValidateFilename(const std::filesystem::path& filename, bool new_version);

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/snapshot_storage.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#include "server/detail/snapshot_storage.h"

#include <absl/strings/str_replace.h>
#include <absl/strings/strip.h>

#ifdef WITH_AWS
#include <aws/core/auth/AWSCredentialsProvider.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/model/ListObjectsV2Request.h>
#include <aws/s3/model/PutObjectRequest.h>

#include "util/aws/aws.h"
#include "util/aws/credentials_provider_chain.h"
#include "util/aws/s3_endpoint_provider.h"
#include "util/aws/s3_read_file.h"
#include "util/aws/s3_write_file.h"
#endif

#ifdef WITH_GCP
#include "util/cloud/gcp/gcs_file.h"
#endif

#include <regex>

#include "base/logging.h"
#include "io/file_util.h"
#include "server/engine_shard_set.h"
#include "util/cloud/azure/creds_provider.h"
#include "util/cloud/azure/storage.h"
#include "util/fibers/fiber_file.h"
namespace dfly {
namespace detail {

using namespace util;
using namespace std;

namespace {

constexpr string_view kSummarySuffix = "summary.dfs"sv;

pair<string, string> GetBucketPath(string_view path) {
  string_view clean = path;
  auto prefix = absl::StartsWith(clean, kS3Prefix) ? kS3Prefix : kGCSPrefix;
  clean = absl::StripPrefix(clean, prefix);

  size_t pos = clean.find('/');
  if (pos == string_view::npos) {
    return make_pair(string(clean), "");
  }

  string bucket_name{clean.substr(0, pos)};
  string obj_path{clean.substr(pos + 1)};

  return make_pair(std::move(bucket_name), std::move(obj_path));
}

#ifdef __linux__
const int kRdbWriteFlags = O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC | O_DIRECT;
#endif

std::string EscapeRegex(string_view input) {
  // List of regex special characters that need escaping
  // We don't escape "{}" since we use them for our own placeholders.
  constexpr std::string_view chars{"\\.^$|?*+()[]"};
  std::string escaped;

  // Reserve space to avoid multiple reallocations
  escaped.reserve(input.size() * 1.1);

  for (char c : input) {
    // If the character is in our specialChars list, prepend a backslash
    if (chars.find(c) != std::string::npos) {
      escaped += '\\';
    }
    escaped += c;
  }

  return escaped;
}

}  // namespace

string SnapshotStorage::FindMatchingFile(string_view prefix, string_view dbfilename,
                                         vector<SnapStat> keys) {
  std::sort(std::begin(keys), std::end(keys),
            [](const SnapStat& l, const SnapStat& r) { return l.last_modified > r.last_modified; });

  // Create a regex to match the object keys, substituting the timestamp
  // and adding an extension if needed.
  fs::path fl_path{prefix};
  fl_path.append(dbfilename);
  fl_path = EscapeRegex(fl_path.string());

  SubstituteFilenamePlaceholders(&fl_path,
                                 {.ts = "([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2})",
                                  .year = "([0-9]{4})",
                                  .month = "([0-9]{2})",
                                  .day = "([0-9]{2})"});
  if (!fl_path.has_extension()) {
    fl_path += "(-summary.dfs|.rdb)";
  }
  const std::regex re(fl_path.string());

  for (const SnapStat& key : keys) {
    DVLOG(1) << "Checking object key: " << key.name << " against regex: " << fl_path.string();
    std::smatch m;
    if (std::regex_match(key.name, m, re)) {
      return key.name;
    }
  }
  return {};
}

io::Result<SnapshotStorage::ExpandResult, GenericError> SnapshotStorage::ExpandSnapshot(
    const string& load_path) {
  if (!(absl::EndsWith(load_path, ".rdb") || absl::EndsWith(load_path, "summary.dfs"))) {
    return nonstd::make_unexpected(
        GenericError(std::make_error_code(std::errc::invalid_argument), "Bad filename extension"));
  }

  error_code ec = CheckPath(load_path);
  if (ec) {
    return nonstd::make_unexpected(GenericError(ec, "File not found"));
  }

  ExpandResult result;

  // Collect all other files in case we're loading dfs.
  if (absl::EndsWith(load_path, "summary.dfs")) {
    auto res = ExpandFromPath(load_path);
    if (!res) {
      return nonstd::make_unexpected(res.error());
    }
    result = std::move(*res);
    result.push_back(load_path);
  } else {
    result.push_back(load_path);
  }
  return result;
}

FileSnapshotStorage::FileSnapshotStorage(fb2::FiberQueueThreadPool* fq_threadpool)
    : fq_threadpool_{fq_threadpool} {
}

io::Result<std::pair<io::Sink*, uint8_t>, GenericError> FileSnapshotStorage::OpenWriteFile(
    const std::string& path) {
  if (fq_threadpool_) {  // EPOLL
    FiberWriteOptions opts;
    opts.direct = true;

    auto res = OpenFiberWriteFile(path, fq_threadpool_, opts);
    if (!res) {
      return nonstd::make_unexpected(GenericError(res.error(), "Couldn't open file for writing"));
    }

    return std::pair(*res, FileType::FILE | FileType::DIRECT);
  } else {
#ifdef __linux__
    auto res = fb2::OpenLinux(path, kRdbWriteFlags, 0666);
    if (!res) {
      return nonstd::make_unexpected(GenericError(
          res.error(),
          "Couldn't open file for writing (is direct I/O supported by the file system?)"));
    }

    uint8_t file_type = FileType::FILE | FileType::IO_URING;
    if (kRdbWriteFlags & O_DIRECT) {
      file_type |= FileType::DIRECT;
    }
    return std::pair(new LinuxWriteWrapper(res->release()), file_type);
#else
    LOG(FATAL) << "Linux I/O is not supported on this platform";
#endif
  }
}

io::ReadonlyFileOrError FileSnapshotStorage::OpenReadFile(const std::string& path) {
#ifdef __linux__
  if (fq_threadpool_) {
    return OpenFiberReadFile(path, fq_threadpool_);
  } else {
    return fb2::OpenRead(path);
  }
#else
  return OpenFiberReadFile(path, fq_threadpool_);
#endif
}

io::Result<std::string, GenericError> FileSnapshotStorage::LoadPath(std::string_view dir,
                                                                    std::string_view dbfilename) {
  if (dbfilename.empty())
    return {};

  fs::path data_folder;
  if (dir.empty()) {
    data_folder = fs::current_path();
  } else {
    std::error_code file_ec;
    data_folder = fs::canonical(dir, file_ec);
    if (file_ec) {
      return nonstd::make_unexpected(GenericError{file_ec, "Data directory error"});
    }
  }

  LOG(INFO) << "Load snapshot: Searching for snapshot in directory: " << data_folder;

  fs::path fl_path = data_folder.append(dbfilename);
  // If we've found an exact match we're done.
  if (fs::exists(fl_path))
    return fl_path.generic_string();

  SubstituteFilenamePlaceholders(&fl_path, {"*", "*", "*", "*"});
  if (!fl_path.has_extension()) {
    fl_path += "*";
  }
  io::Result<io::StatShortVec> short_vec = io::StatFiles(fl_path.generic_string());
  if (short_vec) {
    std::sort(short_vec->begin(), short_vec->end(),
              [](const io::StatShort& l, const io::StatShort& r) {
                return std::difftime(l.last_modified, r.last_modified) < 0;
              });
    auto it = std::find_if(short_vec->rbegin(), short_vec->rend(), [](const auto& stat) {
      return absl::EndsWith(stat.name, ".rdb") || absl::EndsWith(stat.name, kSummarySuffix);
    });
    if (it != short_vec->rend())
      return it->name;
  } else {
    return nonstd::make_unexpected(
        GenericError(short_vec.error(), "Could not stat snapshot directory"));
  }

  return nonstd::make_unexpected(GenericError(
      std::make_error_code(std::errc::no_such_file_or_directory), "Snapshot not found"));
}

io::Result<vector<string>, GenericError> FileSnapshotStorage::ExpandFromPath(const string& path) {
  string glob = absl::StrReplaceAll(path, {{"summary", "????"}});
  io::Result<io::StatShortVec> files = io::StatFiles(glob);

  if (!files || files->size() == 0) {
    return nonstd::make_unexpected(GenericError(make_error_code(errc::no_such_file_or_directory),
                                                "Cound not find DFS shard files"));
  }

  vector<string> paths;
  for (auto& fstat : *files) {
    paths.push_back(std::move(fstat.name));
  }

  return paths;
}

error_code FileSnapshotStorage::CheckPath(const string& path) {
  error_code ec;
  std::ignore = fs::canonical(path, ec);
  return ec;
}

#ifdef WITH_GCP
GcsSnapshotStorage::~GcsSnapshotStorage() {
  util::http::TlsClient::FreeContext(ctx_);
}

error_code GcsSnapshotStorage::Init(unsigned connect_ms) {
  error_code ec = creds_provider_.Init(connect_ms);
  if (ec)
    return ec;

  ctx_ = util::http::TlsClient::CreateSslContext();
  return ec;
}

io::Result<std::pair<io::Sink*, uint8_t>, GenericError> GcsSnapshotStorage::OpenWriteFile(
    const std::string& path) {
  CHECK(ctx_);

  pair<string, string> bucket_path = GetBucketPath(path);
  fb2::ProactorBase* proactor = fb2::ProactorBase::me();
  unique_ptr<http::ClientPool> conn_pool = cloud::GCS::CreateApiConnectionPool(ctx_, proactor);
  cloud::GcsWriteFileOptions opts;
  opts.creds_provider = &creds_provider_;
  opts.pool = conn_pool.release();
  opts.pool_owned = true;

  io::Result<io::WriteFile*> dest_res =
      cloud::OpenWriteGcsFile(bucket_path.first, bucket_path.second, opts);
  if (!dest_res) {
    return nonstd::make_unexpected(GenericError(dest_res.error(), "Could not open file"));
  }

  return std::pair(*dest_res, FileType::CLOUD);
}

io::ReadonlyFileOrError GcsSnapshotStorage::OpenReadFile(const std::string& path) {
  if (!IsGCSPath(path))
    return nonstd::make_unexpected(GenericError("Invalid GCS path"));

  auto [bucket, key] = GetBucketPath(path);
  fb2::ProactorBase* proactor = fb2::ProactorBase::me();
  unique_ptr<http::ClientPool> conn_pool = cloud::GCS::CreateApiConnectionPool(ctx_, proactor);
  cloud::GcsReadFileOptions opts;
  opts.creds_provider = &creds_provider_;
  opts.pool = conn_pool.release();
  opts.pool_owned = true;

  return cloud::OpenReadGcsFile(bucket, key, opts);
}

io::Result<std::string, GenericError> GcsSnapshotStorage::LoadPath(string_view dir,
                                                                   string_view dbfilename) {
  if (dbfilename.empty())
    return "";

  auto [bucket_name, prefix] = GetBucketPath(dir);

  // GCS needs trailing slash to match prefix sub path
  if (!prefix.empty() && prefix.back() != '/') {
    prefix += '/';
  }

  fb2::ProactorBase* proactor = shard_set->pool()->GetNextProactor();

  io::Result<vector<SnapStat>, GenericError> keys =
      proactor->Await([this, proactor, bucket_name = bucket_name,
                       prefix = prefix]() -> io::Result<vector<SnapStat>, GenericError> {
        cloud::GCS gcs(&creds_provider_, ctx_, proactor);
        vector<SnapStat> res;
        error_code ec =
            gcs.List(bucket_name, prefix, false, [&res](const cloud::StorageListItem& item) {
              res.emplace_back(SnapStat{string(item.key), item.mtime_ns});
            });
        if (ec)
          return nonstd::make_unexpected(GenericError(ec, "Failed to list objects"));
        return res;
      });

  if (!keys) {
    return nonstd::make_unexpected(keys.error());
  }

  auto match_key = FindMatchingFile(prefix, dbfilename, *keys);
  if (!match_key.empty()) {
    return absl::StrCat(kGCSPrefix, bucket_name, "/", match_key);
  }
  return nonstd::make_unexpected(GenericError(
      std::make_error_code(std::errc::no_such_file_or_directory), "Snapshot not found"));
}

io::Result<vector<string>, GenericError> GcsSnapshotStorage::ExpandFromPath(
    const string& load_path) {
  if (!IsGCSPath(load_path))
    return nonstd::make_unexpected(
        GenericError(make_error_code(errc::invalid_argument), "Invalid GCS path"));

  if (!absl::EndsWith(load_path, kSummarySuffix))
    return vector<string>{};

  const auto [bucket_name, obj_path] = GetBucketPath(load_path);
  regex re(absl::StrReplaceAll(obj_path, {{"summary", "[0-9]{4}"}}));
  string_view prefix = absl::StripSuffix(obj_path, kSummarySuffix);

  // Find snapshot shard files if we're loading DFS.
  fb2::ProactorBase* proactor = shard_set->pool()->GetNextProactor();
  auto paths = proactor->Await([&, &bucket_name =
                                       bucket_name]() -> io::Result<vector<string>, GenericError> {
    vector<string> res;
    cloud::GCS gcs(&creds_provider_, ctx_, proactor);

    error_code ec = gcs.List(bucket_name, prefix, false, [&](const cloud::StorageListItem& item) {
      std::smatch m;
      string key{item.key};
      if (std::regex_match(key, m, re)) {
        res.push_back(absl::StrCat(kGCSPrefix, bucket_name, "/", item.key));
      }
    });

    if (ec) {
      return nonstd::make_unexpected(ec);
    }

    return res;
  });

  if (!paths || paths->empty()) {
    return nonstd::make_unexpected(
        GenericError{std::make_error_code(std::errc::no_such_file_or_directory),
                     "Cound not find DFS snapshot shard files"});
  }

  return *paths;
}

error_code GcsSnapshotStorage::CheckPath(const std::string& path) {
  return {};
}
#endif

// AZURE

AzureSnapshotStorage::AzureSnapshotStorage() {
  creds_provider_ = make_unique<util::cloud::azure::Credentials>();
}

AzureSnapshotStorage::~AzureSnapshotStorage() {
  util::http::TlsClient::FreeContext(ctx_);
}

error_code AzureSnapshotStorage::Init(unsigned connect_ms) {
  error_code ec = creds_provider_->Init(connect_ms);
  if (!ec) {
    ctx_ = util::http::TlsClient::CreateSslContext();
  }
  return ec;
}

io::Result<std::pair<io::Sink*, uint8_t>, GenericError> AzureSnapshotStorage::OpenWriteFile(
    const std::string& path) {
  return nonstd::make_unexpected(GenericError("Not implemented"));
}

io::ReadonlyFileOrError AzureSnapshotStorage::OpenReadFile(const std::string& path) {
  if (!IsAzurePath(path))
    return nonstd::make_unexpected(GenericError("Invalid azure path"));

  auto [bucket, key] = GetBucketPath(path);

  return nonstd::make_unexpected(GenericError("Not implemented"));
}

io::Result<std::string, GenericError> AzureSnapshotStorage::LoadPath(string_view dir,
                                                                     string_view dbfilename) {
  if (dbfilename.empty())
    return "";

  auto [bucket_name, prefix] = GetBucketPath(dir);

  // TODO: check if needed
  if (!prefix.empty() && prefix.back() != '/') {
    prefix += '/';
  }

  fb2::ProactorBase* proactor = shard_set->pool()->GetNextProactor();

  io::Result<vector<SnapStat>, GenericError> keys =
      proactor->Await([this, bucket_name = bucket_name,
                       prefix = prefix]() -> io::Result<vector<SnapStat>, GenericError> {
        cloud::azure::Storage azure((cloud::azure::Credentials*)creds_provider_.get());
        vector<SnapStat> res;
        error_code ec =
            azure.List(bucket_name, prefix, false, 500, [&res](const cloud::StorageListItem& item) {
              res.emplace_back(string(item.key), item.mtime_ns);
            });
        if (ec)
          return nonstd::make_unexpected(GenericError(ec, "Failed to list objects"));
        return res;
      });

  if (!keys) {
    return nonstd::make_unexpected(keys.error());
  }

  auto match_key = FindMatchingFile(prefix, dbfilename, *keys);
  if (!match_key.empty()) {
    return absl::StrCat(kGCSPrefix, bucket_name, "/", match_key);
  }
  return nonstd::make_unexpected(GenericError(
      std::make_error_code(std::errc::no_such_file_or_directory), "Snapshot not found"));
}

io::Result<vector<string>, GenericError> AzureSnapshotStorage::ExpandFromPath(
    const string& load_path) {
  if (!IsAzurePath(load_path))
    return nonstd::make_unexpected(
        GenericError(make_error_code(errc::invalid_argument), "Invalid Azure path"));

  if (!absl::EndsWith(load_path, kSummarySuffix))
    return vector<string>{};

  const auto [bucket_name, obj_path] = GetBucketPath(load_path);
  regex re(absl::StrReplaceAll(obj_path, {{"summary", "[0-9]{4}"}}));
  string_view prefix = absl::StripSuffix(obj_path, kSummarySuffix);

  // Find snapshot shard files if we're loading DFS.
  fb2::ProactorBase* proactor = shard_set->pool()->GetNextProactor();
  auto paths = proactor->Await(
      [&, &bucket_name = bucket_name]() -> io::Result<vector<string>, GenericError> {
        vector<string> res;
        cloud::azure::Storage azure(creds_provider_.get());

        error_code ec =
            azure.List(bucket_name, prefix, false, 500, [&](const cloud::StorageListItem& item) {
              std::smatch m;
              string key{item.key};
              if (std::regex_match(key, m, re)) {
                res.push_back(absl::StrCat(kAzurePrefix, bucket_name, "/", item.key));
              }
            });

        if (ec) {
          return nonstd::make_unexpected(ec);
        }

        return res;
      });

  if (!paths || paths->empty()) {
    return nonstd::make_unexpected(
        GenericError{std::make_error_code(std::errc::no_such_file_or_directory),
                     "Cound not find DFS snapshot shard files"});
  }

  return *paths;
}

error_code AzureSnapshotStorage::CheckPath(const std::string& path) {
  return {};
}

#ifdef WITH_AWS
AwsS3SnapshotStorage::AwsS3SnapshotStorage(const std::string& endpoint, bool https,
                                           bool ec2_metadata, bool sign_payload) {
  shard_set->pool()->GetNextProactor()->Await([&] {
    if (!ec2_metadata) {
      setenv("AWS_EC2_METADATA_DISABLED", "true", 0);
    }
    // S3ClientConfiguration may request configuration and credentials from
    // EC2 metadata so must be run in a proactor thread.
    Aws::S3::S3ClientConfiguration s3_conf;
    s3_conf.checksumConfig.responseChecksumValidation =
        Aws::Client::ResponseChecksumValidation::WHEN_REQUIRED;

    LOG(INFO) << "Creating AWS S3 client; region=" << s3_conf.region << "; https=" << std::boolalpha
              << https << "; endpoint=" << endpoint;
    if (!sign_payload) {
      s3_conf.payloadSigningPolicy = Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never;
    }
    std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider =
        std::make_shared<aws::CredentialsProviderChain>();
    // Pass a custom endpoint. If empty uses the S3 endpoint.
    std::shared_ptr<Aws::S3::S3EndpointProviderBase> endpoint_provider =
        std::make_shared<aws::S3EndpointProvider>(endpoint, https);
    s3_ = std::make_shared<Aws::S3::S3Client>(credentials_provider, endpoint_provider, s3_conf);
  });
}

io::Result<std::pair<io::Sink*, uint8_t>, GenericError> AwsS3SnapshotStorage::OpenWriteFile(
    const std::string& path) {
  optional<pair<string, string>> bucket_path = GetBucketPath(path);
  if (!bucket_path) {
    return nonstd::make_unexpected(GenericError("Invalid S3 path"));
  }
  auto [bucket, key] = *bucket_path;

  fb2::ProactorBase* proactor = ProactorBase::me();

  // We run S3 operations via a temporary fiber to avoid agressive stack consumption.
  io::Result<std::pair<io::Sink*, uint8_t>, GenericError> result;
  auto fb = proactor->LaunchFiber(
      fb2::Launch::post, boost::context::fixedsize_stack{40 * 1024}, "open_s3_write", [&] {
        io::Result<aws::S3WriteFile> file = aws::S3WriteFile::Open(bucket, key, s3_);
        if (!file) {
          result = nonstd::make_unexpected(GenericError(file.error(), "Failed to open write file"));
          return;
        }

        aws::S3WriteFile* f = new aws::S3WriteFile(std::move(*file));
        result = std::pair<io::Sink*, uint8_t>(f, FileType::CLOUD);
      });
  fb.Join();
  return result;
}

io::ReadonlyFileOrError AwsS3SnapshotStorage::OpenReadFile(const std::string& path) {
  std::optional<std::pair<std::string, std::string>> bucket_path = GetBucketPath(path);
  if (!bucket_path) {
    return nonstd::make_unexpected(GenericError("Invalid S3 path"));
  }
  auto [bucket, key] = *bucket_path;
  return new aws::S3ReadFile(bucket, key, s3_);
}

io::Result<std::string, GenericError> AwsS3SnapshotStorage::LoadPath(std::string_view dir,
                                                                     std::string_view dbfilename) {
  if (dbfilename.empty())
    return "";

  auto [bucket_name, prefix] = GetBucketPath(dir);

  LOG(INFO) << "Load snapshot: Searching for snapshot in S3 path: " << kS3Prefix << bucket_name
            << "/" << prefix;
  io::Result<std::vector<SnapStat>, GenericError> keys = ListObjects(bucket_name, prefix);
  if (!keys) {
    return nonstd::make_unexpected(keys.error());
  }

  auto match_key = FindMatchingFile(prefix, dbfilename, *keys);
  if (!match_key.empty()) {
    return absl::StrCat(kS3Prefix, bucket_name, "/", match_key);
  }
  return nonstd::make_unexpected(GenericError(
      std::make_error_code(std::errc::no_such_file_or_directory), "Snapshot not found"));
}

io::Result<vector<string>, GenericError> AwsS3SnapshotStorage::ExpandFromPath(
    const string& load_path) {
  optional<pair<string, string>> bucket_path = GetBucketPath(load_path);
  if (!bucket_path) {
    return nonstd::make_unexpected(
        GenericError{std::make_error_code(std::errc::invalid_argument), "Invalid S3 path"});
  }

  auto& [bucket_name, obj_path] = *bucket_path;

  // Limit prefix to objects in the same 'directory' as load_path.
  const size_t pos = obj_path.find_last_of('/');
  const std::string prefix = (pos == std::string_view::npos) ? "" : obj_path.substr(0, pos + 1);

  io::Result<std::vector<SnapStat>, GenericError> list_res = ListObjects(bucket_name, prefix);
  if (!list_res) {
    return nonstd::make_unexpected(list_res.error());
  }

  vector<string> paths;
  obj_path = EscapeRegex(obj_path);
  const std::regex re(absl::StrReplaceAll(obj_path, {{"summary", "[0-9]{4}"}}));

  for (const SnapStat& key : *list_res) {
    std::smatch m;
    DVLOG(1) << "Checking object key: " << key.name << " against regex: " << obj_path;

    if (std::regex_match(key.name, m, re)) {
      paths.push_back(std::string(kS3Prefix) + bucket_name + "/" + key.name);
    }
  }

  if (paths.empty()) {
    return nonstd::make_unexpected(
        GenericError{std::make_error_code(std::errc::no_such_file_or_directory),
                     "Cound not find DFS snapshot shard files"});
  }

  return paths;
}

error_code AwsS3SnapshotStorage::CheckPath(const std::string& path) {
  return {};
}

io::Result<std::vector<AwsS3SnapshotStorage::SnapStat>, GenericError>
AwsS3SnapshotStorage::ListObjects(std::string_view bucket_name, std::string_view prefix) {
  // Each list objects request has a 1000 object limit, so page through the
  // objects if needed.
  std::string continuation_token;
  std::vector<SnapStat> keys;

  // We use a random proactor because this function might be called from the main thread.
  fb2::ProactorBase* proactor = shard_set->pool()->GetNextProactor();

  do {
    Aws::S3::Model::ListObjectsV2Request request;
    request.SetBucket(std::string(bucket_name));
    if (!prefix.empty()) {
      // Ensure prefix ends with '/' to treat it as a directory-like namespace and avoid
      // matching objects with similar prefix names.
      if (prefix.back() == '/') {
        request.SetPrefix(std::string(prefix));
      } else {
        request.SetPrefix(std::string(prefix) + '/');
      }
    }
    request.SetDelimiter("/");

    if (!continuation_token.empty()) {
      request.SetContinuationToken(continuation_token);
    }

    Aws::S3::Model::ListObjectsV2Outcome outcome;

    // We use fibers to wrap the s3 call to avoid stack exhaustion.
    auto fb = proactor->LaunchFiber(
        fb2::Launch::post, boost::context::fixedsize_stack{40 * 1024}, "list_s3",
        [&, &bucket_name = bucket_name] { outcome = s3_->ListObjectsV2(request); });

    fb.Join();

    if (outcome.IsSuccess()) {
      continuation_token = outcome.GetResult().GetNextContinuationToken();
      for (const auto& object : outcome.GetResult().GetContents()) {
        keys.emplace_back(object.GetKey(), object.GetLastModified().Millis());
      }
    } else if (outcome.GetError().GetExceptionName() == "PermanentRedirect") {
      return nonstd::make_unexpected(
          GenericError{"Failed list objects in S3 bucket: Permanent redirect; Ensure your "
                       "configured AWS region matches the S3 bucket region"});
    } else if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_BUCKET) {
      return nonstd::make_unexpected(GenericError{
          "Failed list objects in S3 bucket: Bucket not found: " + std::string(bucket_name)});
    } else if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID) {
      return nonstd::make_unexpected(
          GenericError{"Failed list objects in S3 bucket: Invalid access key ID"});
    } else if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::SIGNATURE_DOES_NOT_MATCH) {
      return nonstd::make_unexpected(
          GenericError{"Failed list objects in S3 bucket: Invalid signature; Check your AWS "
                       "credentials are correct"});
    } else if (outcome.GetError().GetExceptionName() == "InvalidToken") {
      return nonstd::make_unexpected(
          GenericError{"Failed list objects in S3 bucket: Invalid token; Check your AWS "
                       "credentials are correct"});
    } else {
      return nonstd::make_unexpected(GenericError{"Failed list objects in S3 bucket: " +
                                                  outcome.GetError().GetExceptionName()});
    }
  } while (!continuation_token.empty());
  return keys;
}
#endif

#ifdef __linux__
io::Result<size_t> LinuxWriteWrapper::WriteSome(const iovec* v, uint32_t len) {
  io::Result<size_t> res = lf_->WriteSome(v, len, offset_, 0);
  if (res) {
    offset_ += *res;
  }

  return res;
}
#endif

void SubstituteFilenamePlaceholders(fs::path* filename, const FilenameSubstitutions& fns) {
  *filename = absl::StrReplaceAll(
      filename->string(),
      {{"{Y}", fns.year}, {"{m}", fns.month}, {"{d}", fns.day}, {"{timestamp}", fns.ts}});
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/snapshot_storage.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.

#pragma once

#ifdef WITH_AWS
#include <aws/s3/S3Client.h>
#endif

#ifdef WITH_GCP
#include "util/cloud/gcp/gcp_creds_provider.h"
#include "util/cloud/gcp/gcs.h"
#endif

#include <absl/strings/match.h>

#include <filesystem>
#include <string>
#include <string_view>
#include <utility>

#include "io/io.h"
#include "server/execution_state.h"
#include "util/cloud/utils.h"
#include "util/fibers/fiberqueue_threadpool.h"
#include "util/fibers/uring_file.h"

namespace dfly {
namespace detail {

namespace fs = std::filesystem;

constexpr std::string_view kS3Prefix = "s3://";
constexpr std::string_view kGCSPrefix = "gs://";
constexpr std::string_view kAzurePrefix = "az://";

const size_t kBucketConnectMs = 2000;

enum FileType : uint8_t {
  FILE = (1u << 0),
  CLOUD = (1u << 1),
  IO_URING = (1u << 2),
  DIRECT = (1u << 3),
};

class SnapshotStorage {
 public:
  virtual ~SnapshotStorage() = default;

  // Opens the file at the given path, and returns the open file and file
  // type, which is a bitmask of FileType.
  virtual io::Result<std::pair<io::Sink*, uint8_t>, GenericError> OpenWriteFile(
      const std::string& path) = 0;

  virtual io::ReadonlyFileOrError OpenReadFile(const std::string& path) = 0;

  // Returns the path of the RDB file or DFS summary file to load.
  virtual io::Result<std::string, GenericError> LoadPath(std::string_view dir,
                                                         std::string_view dbfilename) = 0;

  using ExpandResult = std::vector<std::string>;
  // Searches for all the relevant snapshot files given the RDB file or DFS summary file path.
  io::Result<ExpandResult, GenericError> ExpandSnapshot(const std::string& load_path);

  virtual bool IsCloud() const {
    return false;
  }

 protected:
  struct SnapStat {
    SnapStat(std::string file_name, int64_t ts) : name(std::move(file_name)), last_modified(ts) {
    }
    std::string name;
    int64_t last_modified;
  };

  // Returns empty string if nothing is matched. vector is passed by value on purpose, as it is
  // been sorted inside.
  static std::string FindMatchingFile(std::string_view prefix, std::string_view dbfilename,
                                      std::vector<SnapStat> keys);

  virtual io::Result<std::vector<std::string>, GenericError> ExpandFromPath(
      const std::string& path) = 0;

  virtual std::error_code CheckPath(const std::string& path) = 0;
};

class FileSnapshotStorage : public SnapshotStorage {
 public:
  explicit FileSnapshotStorage(util::fb2::FiberQueueThreadPool* fq_threadpool);

  io::Result<std::pair<io::Sink*, uint8_t>, GenericError> OpenWriteFile(
      const std::string& path) override;

  io::ReadonlyFileOrError OpenReadFile(const std::string& path) override;

  io::Result<std::string, GenericError> LoadPath(std::string_view dir,
                                                 std::string_view dbfilename) override;

 private:
  io::Result<std::vector<std::string>, GenericError> ExpandFromPath(const std::string& path) final;

  std::error_code CheckPath(const std::string& path) final;
  util::fb2::FiberQueueThreadPool* fq_threadpool_;
};

#ifdef WITH_GCP
class GcsSnapshotStorage : public SnapshotStorage {
 public:
  ~GcsSnapshotStorage();

  std::error_code Init(unsigned connect_ms);

  io::Result<std::pair<io::Sink*, uint8_t>, GenericError> OpenWriteFile(
      const std::string& path) override;

  io::ReadonlyFileOrError OpenReadFile(const std::string& path) override;

  io::Result<std::string, GenericError> LoadPath(std::string_view dir,
                                                 std::string_view dbfilename) override;

  bool IsCloud() const final {
    return true;
  }

 private:
  io::Result<std::vector<std::string>, GenericError> ExpandFromPath(const std::string& path) final;

  std::error_code CheckPath(const std::string& path) final;

  util::cloud::GCPCredsProvider creds_provider_;
  SSL_CTX* ctx_ = NULL;
};
#endif

class AzureSnapshotStorage : public SnapshotStorage {
 public:
  AzureSnapshotStorage();
  ~AzureSnapshotStorage();

  std::error_code Init(unsigned connect_ms);

  io::Result<std::pair<io::Sink*, uint8_t>, GenericError> OpenWriteFile(
      const std::string& path) override;

  io::ReadonlyFileOrError OpenReadFile(const std::string& path) override;

  io::Result<std::string, GenericError> LoadPath(std::string_view dir,
                                                 std::string_view dbfilename) override;

  bool IsCloud() const final {
    return true;
  }

 private:
  io::Result<std::vector<std::string>, GenericError> ExpandFromPath(const std::string& path) final;

  std::error_code CheckPath(const std::string& path) final;

  std::unique_ptr<util::cloud::CredentialsProvider> creds_provider_;
  SSL_CTX* ctx_ = NULL;
};

#ifdef WITH_AWS
class AwsS3SnapshotStorage : public SnapshotStorage {
 public:
  AwsS3SnapshotStorage(const std::string& endpoint, bool https, bool ec2_metadata,
                       bool sign_payload);

  io::Result<std::pair<io::Sink*, uint8_t>, GenericError> OpenWriteFile(
      const std::string& path) override;

  io::ReadonlyFileOrError OpenReadFile(const std::string& path) override;

  io::Result<std::string, GenericError> LoadPath(std::string_view dir,
                                                 std::string_view dbfilename) override;

  bool IsCloud() const final {
    return true;
  }

 private:
  io::Result<std::vector<std::string>, GenericError> ExpandFromPath(const std::string& path) final;

  std::error_code CheckPath(const std::string& path) final;

  // List the objects in the given bucket with the given prefix. This must
  // run from a proactor.
  io::Result<std::vector<SnapStat>, GenericError> ListObjects(std::string_view bucket_name,
                                                              std::string_view prefix);

  std::shared_ptr<Aws::S3::S3Client> s3_;
};

#endif

#ifdef __linux__
// takes ownership over the file.
class LinuxWriteWrapper : public io::Sink {
 public:
  explicit LinuxWriteWrapper(util::fb2::LinuxFile* lf) : lf_(lf) {
  }

  io::Result<size_t> WriteSome(const iovec* v, uint32_t len) final;

  std::error_code Close() {
    return lf_->Close();
  }

 private:
  std::unique_ptr<util::fb2::LinuxFile> lf_;
  off_t offset_ = 0;
};
#endif

struct FilenameSubstitutions {
  std::string_view ts;
  std::string_view year;
  std::string_view month;
  std::string_view day;
};

void SubstituteFilenamePlaceholders(fs::path* filename, const FilenameSubstitutions& fns);

inline bool IsS3Path(std::string_view path) {
  return absl::StartsWith(path, detail::kS3Prefix);
}

inline bool IsGCSPath(std::string_view path) {
  return absl::StartsWith(path, detail::kGCSPrefix);
}

inline bool IsAzurePath(std::string_view path) {
  return absl::StartsWith(path, detail::kAzurePrefix);
}

inline bool IsCloudPath(std::string_view path) {
  return IsS3Path(path) || IsGCSPath(path) || IsAzurePath(path);
}

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/table.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "core/compact_object.h"
#include "core/dash.h"
#include "core/expire_period.h"

namespace dfly {

namespace detail {

using PrimeKey = CompactKey;
using PrimeValue = CompactValue;

struct PrimeTablePolicy {
  enum { kSlotNum = 14, kBucketNum = 56 };

  static constexpr bool kUseVersion = true;

  static uint64_t HashFn(const PrimeKey& s) {
    return s.HashCode();
  }

  static uint64_t HashFn(std::string_view u) {
    return CompactObj::HashCode(u);
  }

  static void DestroyKey(PrimeKey& cs) {
    cs.Reset();
  }

  static void DestroyValue(PrimeValue& o) {
    o.Reset();
  }

  static bool Equal(const PrimeKey& s1, std::string_view s2) {
    return s1 == s2;
  }

  static bool Equal(const PrimeKey& s1, const PrimeKey& s2) {
    return s1 == s2;
  }
};

struct ExpireTablePolicy {
  enum : uint8_t { kSlotNum = 14, kBucketNum = 56 };
  static constexpr bool kUseVersion = false;

  static uint64_t HashFn(const PrimeKey& s) {
    return s.HashCode();
  }

  static uint64_t HashFn(std::string_view u) {
    return CompactObj::HashCode(u);
  }

  static void DestroyKey(PrimeKey& cs) {
    cs.Reset();
  }

  static void DestroyValue(uint32_t val) {
  }

  static bool Equal(const PrimeKey& s1, std::string_view s2) {
    return s1 == s2;
  }
};

}  // namespace detail
}  // namespace dfly


================================================
FILE: src/server/detail/wrapped_json_path.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string_view>
#include <utility>
#include <variant>

#include "base/logging.h"
#include "common/string_or_view.h"
#include "core/json/json_object.h"
#include "core/json/path.h"
#include "facade/op_status.h"

namespace dfly {

using facade::OpResult;
using facade::OpStatus;
using Nothing = std::monostate;
using JsonExpression = jsoncons::jsonpath::jsonpath_expression<JsonType>;

namespace details {
template <typename T>
void OptionalEmplace(bool keep_defined, std::optional<T> src, std::optional<T>* dest);

template <typename T> void OptionalEmplace(bool keep_defined, T src, T* dest);
}  // namespace details

template <typename T>
using JsonPathReadOnlyCallback = absl::FunctionRef<T(std::string_view, const JsonType&)>;

template <typename T = Nothing> struct MutateCallbackResult {
  bool should_be_deleted = false;
  std::optional<T> value;
};

template <typename T>
using JsonPathMutateCallback =
    absl::FunctionRef<MutateCallbackResult<T>(std::optional<std::string_view>, JsonType*)>;

enum class JsonPathType { kV2, kLegacy /*Or V1*/ };
constexpr JsonPathType kDefaultJsonPathType = JsonPathType::kV2;

struct CallbackResultOptions {
 public:
  enum class SavingOrder { kSaveFirst, kSaveLast };
  enum class OnEmpty { kSendNil, kSendWrongType };

  // Default options for WrappedJsonPath::ExecuteReadOnlyCallback
  static CallbackResultOptions DefaultReadOnlyOptions(
      SavingOrder saving_order = SavingOrder::kSaveLast);
  // Default options for WrappedJsonPath::ExecuteMutateCallback
  static CallbackResultOptions DefaultMutateOptions();

  OnEmpty on_empty;
  SavingOrder saving_order{SavingOrder::kSaveLast};
  std::optional<JsonPathType> path_type{std::nullopt};
};

template <typename T> class JsonCallbackResult {
 private:
  template <typename V> struct is_optional : std::false_type {};

  template <typename V> struct is_optional<std::optional<V>> : std::true_type {};

 public:
  using SavingOrder = CallbackResultOptions::SavingOrder;
  using OnEmpty = CallbackResultOptions::OnEmpty;

  JsonCallbackResult() = default;

  explicit JsonCallbackResult(CallbackResultOptions options);

  void AddValue(T value);

  bool Empty() const;

  bool IsV1() const;
  const T& AsV1() const;
  const auto& AsV2() const;

  bool ShouldSendNil() const;
  bool ShouldSendWrongType() const;

 private:
  std::vector<T> result_;
  CallbackResultOptions options_{OnEmpty::kSendWrongType, SavingOrder::kSaveLast,
                                 kDefaultJsonPathType};
};

class WrappedJsonPath {
 public:
  static constexpr std::string_view kV1PathRootElement = ".";
  static constexpr std::string_view kV2PathRootElement = "$";

  WrappedJsonPath(json::Path json_path, cmn::StringOrView path, JsonPathType path_type);

  WrappedJsonPath(JsonExpression expression, cmn::StringOrView path, JsonPathType path_type);

  template <typename T>
  JsonCallbackResult<T> ExecuteReadOnlyCallback(const JsonType* json_entry,
                                                JsonPathReadOnlyCallback<T> cb,
                                                CallbackResultOptions options) const;

  template <typename T>
  OpResult<JsonCallbackResult<std::optional<T>>> ExecuteMutateCallback(
      JsonType* json_entry, JsonPathMutateCallback<T> cb, CallbackResultOptions options) const;

  bool IsLegacyModePath() const;

  bool RefersToRootElement() const;

  // Returns true if this is internal implementation of json path
  // Check AsJsonPath
  bool HoldsJsonPath() const;

  // Internal implementation of json path
  const json::Path& AsJsonPath() const;
  // Jsoncons implementation of json path
  const JsonExpression& AsJsonExpression() const;

  // Returns the path as a string_view.
  std::string_view Path() const;

 private:
  CallbackResultOptions InitializePathType(CallbackResultOptions options) const;

 private:
  std::variant<json::Path, JsonExpression> parsed_path_;
  cmn::StringOrView path_;
  JsonPathType path_type_ = kDefaultJsonPathType;
};

// Implementation
/******************************************************************/
namespace details {

template <typename T>
void OptionalEmplace(bool keep_defined, std::optional<T> src, std::optional<T>* dest) {
  if (!keep_defined || !dest->has_value()) {
    dest->swap(src);
  }
}

template <typename T> void OptionalEmplace(bool keep_defined, T src, T* dest) {
  if (!keep_defined) {
    *dest = std::move(src);
  }
}

}  // namespace details

inline CallbackResultOptions CallbackResultOptions::DefaultReadOnlyOptions(
    SavingOrder saving_order) {
  return CallbackResultOptions{OnEmpty::kSendNil, saving_order};
}

inline CallbackResultOptions CallbackResultOptions::DefaultMutateOptions() {
  return CallbackResultOptions{OnEmpty::kSendWrongType};
}

template <typename T>
JsonCallbackResult<T>::JsonCallbackResult(CallbackResultOptions options) : options_(options) {
}

template <typename T> void JsonCallbackResult<T>::AddValue(T value) {
  if (result_.empty() || !IsV1()) {
    result_.push_back(std::move(value));
    return;
  }

  details::OptionalEmplace(options_.saving_order == SavingOrder::kSaveFirst, std::move(value),
                           &result_.front());
}

template <typename T> bool JsonCallbackResult<T>::Empty() const {
  return result_.empty();
}

template <typename T> bool JsonCallbackResult<T>::IsV1() const {
  return options_.path_type == JsonPathType::kLegacy;
}

template <typename T> const T& JsonCallbackResult<T>::AsV1() const {
  return result_.front();
}

template <typename T> const auto& JsonCallbackResult<T>::AsV2() const {
  return result_;
}

template <typename T> bool JsonCallbackResult<T>::ShouldSendNil() const {
  return IsV1() && options_.on_empty == OnEmpty::kSendNil && result_.empty();
}

template <typename T> bool JsonCallbackResult<T>::ShouldSendWrongType() const {
  if (IsV1()) {
    if (result_.empty() && options_.on_empty == OnEmpty::kSendWrongType)
      return true;

    if constexpr (is_optional<T>::value) {
      return !result_.front().has_value();
    }
  }
  return false;
}

inline WrappedJsonPath::WrappedJsonPath(json::Path json_path, cmn::StringOrView path,
                                        JsonPathType path_type)
    : parsed_path_(std::move(json_path)), path_(std::move(path)), path_type_(path_type) {
}

inline WrappedJsonPath::WrappedJsonPath(JsonExpression expression, cmn::StringOrView path,
                                        JsonPathType path_type)
    : parsed_path_(std::move(expression)), path_(std::move(path)), path_type_(path_type) {
}

template <typename T>
JsonCallbackResult<T> WrappedJsonPath::ExecuteReadOnlyCallback(
    const JsonType* json_entry, JsonPathReadOnlyCallback<T> cb,
    CallbackResultOptions options) const {
  JsonCallbackResult<T> read_result{InitializePathType(options)};

  auto eval_callback = [&cb, &read_result](std::string_view path, const JsonType& val) {
    read_result.AddValue(cb(path, val));
  };

  if (HoldsJsonPath()) {
    const auto& json_path = AsJsonPath();
    json::EvaluatePath(json_path, *json_entry,
                       [&eval_callback](std::optional<std::string_view> key, const JsonType& val) {
                         eval_callback(key ? *key : std::string_view{}, val);
                       });
  } else {
    const auto& json_expression = AsJsonExpression();
    json_expression.evaluate(*json_entry, eval_callback);
  }

  return read_result;
}

template <typename T>
OpResult<JsonCallbackResult<std::optional<T>>> WrappedJsonPath::ExecuteMutateCallback(
    JsonType* json_entry, JsonPathMutateCallback<T> cb, CallbackResultOptions options) const {
  JsonCallbackResult<std::optional<T>> mutate_result{InitializePathType(options)};

  auto mutate_callback = [&cb, &mutate_result](std::optional<std::string_view> path,
                                               JsonType* val) -> bool {
    auto res = cb(path, val);
    if (res.value.has_value()) {
      mutate_result.AddValue(std::move(res.value).value());
    } else if (!mutate_result.IsV1()) {
      mutate_result.AddValue(std::nullopt);
    }
    return res.should_be_deleted;
  };

  if (HoldsJsonPath()) {
    const auto& json_path = AsJsonPath();
    json::MutatePath(json_path, mutate_callback, json_entry);
  } else {
    using namespace jsoncons::jsonpath;
    using namespace jsoncons::jsonpath::detail;
    using Evaluator = jsonpath_evaluator<JsonType, JsonType&>;
    using ValueType = Evaluator::value_type;
    using Reference = Evaluator::reference;
    using JsonSelector = Evaluator::path_expression_type;

    custom_functions<JsonType> funcs = custom_functions<JsonType>();

    std::error_code ec;
    static_resources static_res(funcs);
    Evaluator e;

    JsonSelector expr = e.compile(static_res, path_.view(), ec);
    if (ec) {
      VLOG(1) << "Failed to mutate json with error: " << ec.message();
      return OpStatus::SYNTAX_ERR;
    }

    eval_context<ValueType, Reference> resources;

    auto f = [&mutate_callback](const basic_path_node<char>& path, JsonType& val) {
      mutate_callback(to_string(path), &val);
    };

    expr.evaluate(resources, *json_entry, JsonSelector::path_node_type{}, *json_entry, std::move(f),
                  result_options::nodups | result_options::path);
  }
  return mutate_result;
}

inline bool WrappedJsonPath::IsLegacyModePath() const {
  return path_type_ == JsonPathType::kLegacy;
}

inline bool WrappedJsonPath::RefersToRootElement() const {
  auto path = path_.view();
  return path.empty() || path == kV1PathRootElement || path == kV2PathRootElement;
}

inline bool WrappedJsonPath::HoldsJsonPath() const {
  return std::holds_alternative<json::Path>(parsed_path_);
}

inline const json::Path& WrappedJsonPath::AsJsonPath() const {
  return std::get<json::Path>(parsed_path_);
}

inline const JsonExpression& WrappedJsonPath::AsJsonExpression() const {
  return std::get<JsonExpression>(parsed_path_);
}

inline std::string_view WrappedJsonPath::Path() const {
  return path_.view();
}

inline CallbackResultOptions WrappedJsonPath::InitializePathType(
    CallbackResultOptions options) const {
  if (!options.path_type) {
    options.path_type = path_type_;
  }
  return options;
}

}  // namespace dfly


================================================
FILE: src/server/dfly_bench.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

extern "C" {
#include "redis/crc16.h"
}

#include <absl/container/flat_hash_set.h>
#include <absl/random/random.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_format.h>
#include <absl/strings/str_split.h>

#include <boost/icl/interval_set.hpp>
#include <csignal>
#include <queue>
#include <shared_mutex>
#include <tuple>

#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "base/histogram.h"
#include "base/init.h"
#include "base/random.h"
#include "base/zipf_gen.h"
#include "facade/redis_parser.h"
#include "io/io.h"
#include "io/io_buf.h"
#include "util/fibers/dns_resolve.h"
#include "util/fibers/pool.h"
#include "util/fibers/proactor_base.h"

// A load-test for DragonflyDB that fixes coordinated omission problem.

using std::string;

ABSL_FLAG(uint16_t, p, 6379, "Server port");
ABSL_FLAG(uint32_t, c, 20, "Number of connections per thread");
ABSL_FLAG(int32_t, qps, 20,
          "QPS schedule at which the generator sends requests to the server "
          "per single connection. 0 means - coordinated omission, and positive value will throttle "
          "the actual qps if server is slower than the target qps. "
          "negative value means - hard target, without throttling.");

ABSL_FLAG(uint32_t, n, 1000, "Number of requests to send per connection");
ABSL_FLAG(uint32_t, test_time, 0, "Testing time in seconds");
ABSL_FLAG(string, d, "16",
          "Specify value size as single number for fixed length or use min:max to generate random "
          "value length between min and max.");
ABSL_FLAG(string, h, "localhost", "server hostname/ip");
ABSL_FLAG(uint64_t, key_minimum, 0, "Min value for keys used");
ABSL_FLAG(uint64_t, key_maximum, 50'000'000, "Max value for keys used");
ABSL_FLAG(string, key_prefix, "key:", "keys prefix");
ABSL_FLAG(string, key_dist, "U", "U for uniform, N for normal, Z for zipfian, S for sequential");
ABSL_FLAG(double, zipf_alpha, 0.99, "zipfian alpha parameter");
ABSL_FLAG(uint64_t, seed, 42, "A seed for random data generation");
ABSL_FLAG(uint64_t, key_stddev, 0,
          "Standard deviation for non-uniform distribution, 0 chooses"
          " a default value of (max-min)/6");
ABSL_FLAG(uint32_t, pipeline, 1, "maximum number of pending requests per connection");
ABSL_FLAG(string, ratio, "1:10", "Set:Get ratio");
ABSL_FLAG(string, command, "",
          "custom command with __key__ placeholder for keys, "
          "__data__ for values, __score__ for doubles");
ABSL_FLAG(bool, random_data, true,
          "If true, generate random data for each request, otherwise uses incremental sequences."
          "Applies for __score__ and __data__ placeholders.");
ABSL_FLAG(string, P, "", "protocol can be empty (for RESP) or memcache_text");

ABSL_FLAG(bool, tcp_nodelay, false, "If true, set nodelay option on tcp socket");
ABSL_FLAG(bool, noreply, false, "If true, does not wait for replies. Relevant only for memcached.");

ABSL_FLAG(bool, probe_cluster, true,
          "If false, skips cluster-mode probing and works only in single node mode");

ABSL_FLAG(bool, greet, true,
          "If true, sends a greeting command on each connection, "
          "to make sure the connection succeeded");
ABSL_FLAG(bool, cluster_skip_tags, true,
          "If true, skips tags (compatible with memtier benchmark) in cluster mode, "
          "othewise adds hash tags to keys");
ABSL_FLAG(bool, ascii, true, "If true, use ascii characters for values");
ABSL_FLAG(bool, connect_only, false,
          "If true, will only connect to the server, without sending "
          "loadtest commands");
ABSL_FLAG(string, password, "", "password to authenticate the client");

using namespace std;
using namespace util;
using absl::GetFlag;
using absl::StrFormat;
using facade::RedisParser;
using facade::RespExpr;
using facade::RespVec;
using tcp = ::boost::asio::ip::tcp;
using absl::StrCat;

thread_local base::Xoroshiro128p bit_gen;
thread_local uint64_t seq_val = 1;

atomic_bool terminate_requested = false;

#if __INTELLISENSE__
#pragma diag_suppress 144
#endif

enum Protocol { RESP, MC_TEXT } protocol;
enum DistType { UNIFORM, NORMAL, ZIPFIAN, SEQUENTIAL } dist_type{UNIFORM};
constexpr uint16_t kNumSlots = 16384;

static string GetRandomBlob(size_t len, bool ascii) {
  static bool is_random = GetFlag(FLAGS_random_data);

  std::string res(len, '\0');
  size_t indx = 0;

  for (; indx + 16 <= len; indx += 16) {  // 2 chars per byte
    absl::numbers_internal::FastHexToBufferZeroPad16(is_random ? bit_gen() : seq_val++,
                                                     res.data() + indx);
  }

  DCHECK_LE(indx, len);

  if (indx < len) {
    uint64_t next_val = is_random ? bit_gen() : seq_val++;
    unsigned count = len - indx;

    // extract hex chars from least significant nibble, as it's the one that changes
    // with sequential values.
    for (unsigned j = 0; j < count; ++j) {
      res[indx++] = (next_val & 0x0F) + 'A';  // to ascii (not really hex, but ok for random data)
      next_val >>= 4;
    }
  }

  if (!ascii) {
    for (size_t i = 0; i < len; i++) {
      res[i] += 80;
    }
  }
  return res;
}

uint16_t SlotId(string_view str) {
  return crc16(str.data(), str.size()) % kNumSlots;
}

using SlotRange = pair<uint16_t, uint16_t>;

struct ShardInfo {
  vector<SlotRange> slots;  // list of [start, end] pairs. inclusive.
  tcp::endpoint endpoint;
};

using ClusterShards = vector<ShardInfo>;

class ShardSlots {
 private:
  using IntervalSet = boost::icl::interval_set<uint16_t>;
  using Interval = boost::icl::interval<uint16_t>;

 public:
  void SetClusterSlotRanges(const ClusterShards& cluster_shards) {
    for (auto shard : cluster_shards) {
      IntervalSet shard_slots_;
      for (auto& slot : shard.slots) {
        shard_slots_.insert(Interval::closed(slot.first, slot.second));
      }
      shards_slots_.emplace(shard.endpoint, shard_slots_);
    }
  }

  SlotRange NextSlotRange(const tcp::endpoint& ep, size_t i) {
    std::shared_lock<fb2::SharedMutex> lock(mu_);
    const auto& shard_slot_interval = shards_slots_[ep];
    unsigned index = i % shard_slot_interval.iterative_size();
    const auto& interval = next(shard_slot_interval.begin(), index);
    return SlotRange{boost::icl::first(*interval), boost::icl::last(*interval)};
  }

  bool Empty() const {
    return shards_slots_.empty();
  }

  size_t Size() const {
    return shards_slots_.size();
  }

  vector<tcp::endpoint> Endpoints() const {
    vector<tcp::endpoint> endpoints;
    for (const auto& shard : shards_slots_) {
      endpoints.push_back(shard.first);
    }
    return endpoints;
  }

  void MoveSlot(const tcp::endpoint& src_ep, const tcp::endpoint& dst_ep, uint16_t slot_id) {
    std::unique_lock<fb2::SharedMutex> lock(mu_);
    // Remove slot from source ep
    auto& src_shard_slots = shards_slots_[src_ep];
    // If slot id doesn't exists on source ep we have moved this slot before
    if (src_shard_slots.find(slot_id) == src_shard_slots.end()) {
      return;
    }
    src_shard_slots.subtract(slot_id);
    // Add slot to dest ep
    auto& dst_shard_slots = shards_slots_[dst_ep];
    dst_shard_slots.insert(slot_id);
  }

 private:
  struct Hasher {
    using is_transparent = void;
    size_t operator()(const tcp::endpoint& ep) const {
      std::size_t hash1 = std::hash<string>()(ep.address().to_string());
      std::size_t hash2 = std::hash<unsigned short>()(ep.port());
      return hash1 ^ (hash2 + 0x9e3779b9 + (hash1 << 6) + (hash1 >> 2));
    }
  };

  struct Eq {
    using is_transparent = void;
    bool operator()(const tcp::endpoint& left, const tcp::endpoint& right) const {
      return left == right;
    }
  };

 private:
  fb2::SharedMutex mu_;
  absl::flat_hash_map<tcp::endpoint, IntervalSet, Hasher, Eq> shards_slots_;
};

class KeyGenerator {
 public:
  KeyGenerator(uint32_t min, uint32_t max);

  string operator()(uint16_t from, uint16_t to) const;
  void EnableClusterMode();

  bool IsClusterEnabled() const {
    return !hash_slots_.empty();
  }

 private:
  string prefix_;
  uint64_t min_, max_, range_;
  mutable uint64_t seq_cursor_;
  double stddev_ = 1.0 / 6;
  mutable optional<base::ZipfianGenerator> zipf_;
  vector<string> hash_slots_;
};

class CommandGenerator {
 public:
  explicit CommandGenerator(KeyGenerator* keygen);

  string Next(SlotRange range);

  bool might_hit() const {
    return might_hit_;
  }

  bool noreply() const {
    return noreply_;
  }

 private:
  enum TemplateType : uint8_t { KEY, VALUE, SCORE };

  string FillSet(string_view key);
  string FillGet(string_view key);

  bool IsRandomValueLen() const {
    return value_len_min_ != value_len_max_;
  }

  KeyGenerator* keygen_;
  uint32_t ratio_set_ = 0, ratio_get_ = 0;
  string command_;

  using CmdPart = variant<string_view, TemplateType>;
  vector<CmdPart> cmd_parts_;

  string fixed_len_value_;  // used for fixed value string
  int32_t value_len_min_ = 0, value_len_max_ = 0;
  bool might_hit_ = false;
  bool noreply_ = false;
  bool is_ascii_ = true;
};

CommandGenerator::CommandGenerator(KeyGenerator* keygen) : keygen_(keygen) {
  command_ = GetFlag(FLAGS_command);
  is_ascii_ = GetFlag(FLAGS_ascii);

  pair<string, string> value_len_str = absl::StrSplit(GetFlag(FLAGS_d), ':');
  CHECK(absl::SimpleAtoi(value_len_str.first, &value_len_min_));
  if (!value_len_str.second.empty()) {
    CHECK(absl::SimpleAtoi(value_len_str.second, &value_len_max_));
  } else {
    value_len_max_ = value_len_min_;
  }

  if ((value_len_min_ < 0) || (value_len_max_ < 0) || (value_len_min_ > value_len_max_)) {
    LOG(ERROR) << "Invalid `-d " << GetFlag(FLAGS_d)
               << "` argument. Min and max values should be bigger than 0 and min value should "
                  "be smaller or equal to max. Setting to default (16).";
    value_len_max_ = value_len_min_ = 16;
  }

  if (!IsRandomValueLen()) {
    fixed_len_value_ = string(value_len_min_, is_ascii_ ? 'a' : char(130));
  }

  if (command_.empty()) {
    pair<string, string> ratio_str = absl::StrSplit(GetFlag(FLAGS_ratio), ':');
    CHECK(absl::SimpleAtoi(ratio_str.first, &ratio_set_));
    CHECK(absl::SimpleAtoi(ratio_str.second, &ratio_get_));
    return;
  }

  vector<string_view> parts = absl::StrSplit(command_, ' ', absl::SkipEmpty());
  for (string_view p : parts) {
    if (p == "__key__"sv) {
      cmd_parts_.emplace_back(KEY);
    } else if (p == "__data__"sv) {
      cmd_parts_.emplace_back(VALUE);
    } else if (p == "__score__"sv) {
      cmd_parts_.emplace_back(SCORE);
    } else {
      cmd_parts_.emplace_back(p);
    }
  }

  if (!cmd_parts_.empty()) {
    const string_view* cmd = get_if<string_view>(&cmd_parts_.front());
    if (cmd) {
      might_hit_ = absl::EqualsIgnoreCase(*cmd, "get") || absl::StartsWithIgnoreCase(*cmd, "mget");
    }
  }
}

string CommandGenerator::Next(SlotRange range) {
  noreply_ = false;

  if (command_.empty()) {
    string key = (*keygen_)(range.first, range.second);

    if (absl::Uniform(bit_gen, 0U, ratio_get_ + ratio_set_) < ratio_set_) {
      might_hit_ = false;
      return FillSet(key);
    }
    might_hit_ = true;
    return FillGet(key);
  }

  // For custom commands, we select a random slot and then use it for key generation.
  uint16_t slot_id = 0;

  if (keygen_->IsClusterEnabled()) {
    slot_id = absl::Uniform(absl::IntervalClosedClosed, bit_gen, range.first, range.second);
  }

  string str, gen_cmd;
  absl::StrAppend(&gen_cmd, "*", cmd_parts_.size(), "\r\n");
  for (const CmdPart& part : cmd_parts_) {
    if (auto p = get_if<string_view>(&part)) {
      absl::StrAppend(&gen_cmd, "$", p->size(), "\r\n", *p, "\r\n");
    } else {
      switch (get<TemplateType>(part)) {
        case KEY:
          str = (*keygen_)(slot_id, slot_id);
          break;
        case VALUE: {
          size_t value_len = IsRandomValueLen()
                                 ? absl::Uniform(bit_gen, value_len_min_, value_len_max_)
                                 : fixed_len_value_.size();
          str = GetRandomBlob(value_len, is_ascii_);
          break;
        }
        case SCORE: {
          uniform_real_distribution<double> uniform(0, 1);
          str = absl::StrCat(uniform(bit_gen));
        }
      }
      absl::StrAppend(&gen_cmd, "$", str.size(), "\r\n", str, "\r\n");
    }
  }

  return gen_cmd;
}

string CommandGenerator::FillSet(string_view key) {
  string res;
  string_view value = fixed_len_value_;
  string random_len_value;

  if (IsRandomValueLen()) {
    random_len_value = GetRandomBlob(absl::Uniform(bit_gen, value_len_min_, value_len_max_), true);
    value = random_len_value;
  }

  if (protocol == RESP) {
    absl::StrAppend(&res, "*3\r\n$3\r\nset\r\n$", key.size(), "\r\n", key);
    absl::StrAppend(&res, "\r\n$", value.size(), "\r\n", value, "\r\n");
  } else {
    DCHECK_EQ(protocol, MC_TEXT);
    absl::StrAppend(&res, "set ", key, " 0 0 ", value.size());
    if (GetFlag(FLAGS_noreply)) {
      absl::StrAppend(&res, " noreply");
      noreply_ = true;
    }

    absl::StrAppend(&res, "\r\n", value, "\r\n");
  }
  return res;
}

string CommandGenerator::FillGet(string_view key) {
  return absl::StrCat("get ", key, "\r\n");
}

struct ClientStats {
  base::Histogram total_hist, online_hist;

  uint64_t num_responses = 0;
  uint64_t qps = 0;
  uint64_t hit_count = 0;
  uint64_t hit_opportunities = 0;
  uint64_t num_errors = 0;
  unsigned num_clients = 0;

  ClientStats& operator+=(const ClientStats& o) {
    total_hist.Merge(o.total_hist);
    online_hist.Merge(o.online_hist);

    num_responses += o.num_responses;
    qps += o.qps;
    hit_count += o.hit_count;
    hit_opportunities += o.hit_opportunities;
    num_errors += o.num_errors;
    num_clients += o.num_clients;

    return *this;
  }
};

// Per connection driver.
class Driver {
 public:
  explicit Driver(uint32_t num_reqs, uint32_t time_limit, ClientStats* stats, ProactorBase* p,
                  ShardSlots* ss)
      : num_reqs_(num_reqs), time_limit_(time_limit), shard_slots_(*ss), stats_(*stats) {
    socket_.reset(p->CreateSocket());
    if (time_limit_ > 0)
      num_reqs_ = UINT32_MAX;
  }

  Driver(const Driver&) = delete;
  Driver(Driver&&) = delete;
  Driver& operator=(Driver&&) = delete;

  void Connect(unsigned index, const tcp::endpoint& ep);
  void Run(uint64_t* cycle_ns, CommandGenerator* cmd_gen);
  void Shutdown();

  float done() const {
    if (time_limit_ > 0)
      return double(absl::GetCurrentTimeNanos() - start_ns_) / (time_limit_ * 1e9);
    return double(received_) / num_reqs_;
  }

  unsigned pending_length() const {
    return reqs_.size();
  }

 private:
  void PopRequest();
  void ReceiveFb();
  void ParseRESP();
  void ParseMC();
  void RunCommandAndCheckResultIs(std::string_view cmd, std::string_view expected_res);

  struct Req {
    uint64_t start;
    bool might_hit;
  };

  uint32_t num_reqs_, time_limit_, received_ = 0;
  int64_t start_ns_ = 0;

  tcp::endpoint ep_;
  ShardSlots& shard_slots_;
  ClientStats& stats_;
  unique_ptr<FiberSocketBase> socket_;
  fb2::Fiber receive_fb_;
  queue<Req> reqs_;
  fb2::CondVarAny cnd_;

  facade::RedisParser parser_{RedisParser::Mode::CLIENT, 1 << 16};
  io::IoBuf io_buf_{512};
  unsigned blob_len_ = 0;
};

// Per thread client.
class TLocalClient {
 public:
  explicit TLocalClient(ProactorBase* p, ShardSlots* ss) : p_(p), shard_slots_(ss) {
  }

  TLocalClient(const TLocalClient&) = delete;

  void Connect(const tcp::endpoint& ep, const vector<tcp::endpoint>& shard_endpoints);
  void Disconnect();

  void Start(uint32_t key_min, uint32_t key_max, uint64_t cycle_ns);
  void Join();

  ClientStats stats;

  tuple<float, float> GetMinMaxDone() const {
    float min = 1, max = 0;

    for (unsigned i = 0; i < drivers_.size(); ++i) {
      float done = drivers_[i]->done();
      max = std::max(done, max);
      min = std::min(done, min);
    }

    return {min, max};
  }

  unsigned MaxPending() const {
    unsigned max = 0;
    for (unsigned i = 0; i < drivers_.size(); ++i) {
      if (drivers_[i]->pending_length() > max) {
        max = drivers_[i]->pending_length();
      }
    }
    return max;
  }

  unsigned num_conns() const {
    return drivers_.size();
  }

  void AdjustCycle();

 private:
  ProactorBase* p_;
  ShardSlots* shard_slots_;
  vector<unique_ptr<Driver>> drivers_;
  optional<KeyGenerator> key_gen_;
  optional<CommandGenerator> cmd_gen_;

  vector<fb2::Fiber> driver_fbs_;
  uint64_t cur_cycle_ns_;
  uint64_t target_cycle_;
  int64_t start_time_;
};

KeyGenerator::KeyGenerator(uint32_t min, uint32_t max)
    : min_(min), max_(max), range_(max - min + 1) {
  prefix_ = GetFlag(FLAGS_key_prefix);
  CHECK_GT(range_, 0u);

  seq_cursor_ = min_;
  switch (dist_type) {
    case NORMAL: {
      uint64_t stddev = GetFlag(FLAGS_key_stddev);
      if (stddev != 0) {
        stddev_ = double(stddev) / double(range_);
      }
      break;
    }
    case ZIPFIAN:
      zipf_.emplace(min, max, GetFlag(FLAGS_zipf_alpha));
      break;
    default:;
  }
}

string KeyGenerator::operator()(uint16_t from, uint16_t to) const {
  uint64_t key_suffix = 0;
  uint16_t slot_id = from;
  bool skip_tags = IsClusterEnabled() && GetFlag(FLAGS_cluster_skip_tags);
  string res;

  do {
    switch (dist_type) {
      case UNIFORM:
        key_suffix = absl::Uniform(bit_gen, min_, max_);
        break;
      case NORMAL: {
        double val = absl::Gaussian(bit_gen, 0.5, stddev_);
        key_suffix = min_ + uint64_t(val * range_);
        break;
      }
      case ZIPFIAN:
        key_suffix = zipf_->Next(bit_gen);
        break;
      case SEQUENTIAL:
        key_suffix = seq_cursor_++;
        if (seq_cursor_ > max_)
          seq_cursor_ = min_;
        break;
    }

    if (!skip_tags)
      break;

    // If we skip tags, we must make sure that the key fits the slot range.
    res = absl::StrCat(prefix_, key_suffix);
    slot_id = SlotId(res);
  } while (slot_id < from || slot_id > to);

  // If we are in cluster mode we add the hash slot to the key to make sure it lands in the correct
  // range.
  if (IsClusterEnabled()) {
    if (!skip_tags) {
      if (to > from)
        slot_id = absl::Uniform(absl::IntervalClosedClosed, bit_gen, from, to);
      absl::StrAppend(&res, prefix_, "{", hash_slots_[slot_id], "}", key_suffix);
    }
  } else {
    absl::StrAppend(&res, prefix_, key_suffix);
  }

  return res;
}

void KeyGenerator::EnableClusterMode() {
  hash_slots_.resize(kNumSlots);
  uint32_t i = 0;
  uint32_t num_slots_filled = 0;

  // Precompute the hash slots for each of the slot ids so given the slot id
  // we could generate a key that belongs to that slot.
  while (num_slots_filled < kNumSlots) {
    string key = absl::StrCat(i);
    uint16_t id = SlotId(key);
    if (hash_slots_[id].empty()) {
      hash_slots_[id] = std::move(key);
      num_slots_filled++;
    }
    ++i;
  }
}

void RunCommandAndCheckResultIs(std::string_view cmd, std::string_view expected,
                                FiberSocketBase* socket) {
  auto ec = socket->Write(io::Buffer(cmd));
  CHECK(!ec);

  uint8_t buf[128];
  auto res_sz = socket->Recv(io::MutableBytes(buf));
  CHECK(res_sz) << res_sz.error().message();
  string_view resp = io::View(io::Bytes(buf, *res_sz));
  CHECK_EQ(resp, expected) << resp;
}

void Driver::RunCommandAndCheckResultIs(std::string_view cmd, std::string_view expected_res) {
  ::RunCommandAndCheckResultIs(cmd, expected_res, socket_.get());
}

void Driver::Connect(unsigned index, const tcp::endpoint& ep) {
  VLOG(2) << "Connecting " << index << " to " << ep;
  error_code ec = socket_->Connect(ep);
  CHECK(!ec) << "Could not connect to " << ep << " " << ec;
  if (GetFlag(FLAGS_tcp_nodelay)) {
    int yes = 1;
    CHECK_EQ(0, setsockopt(socket_->native_handle(), IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes)));
  }

  auto password = absl::GetFlag(FLAGS_password);
  if (!password.empty()) {
    auto command = absl::StrCat("AUTH ", password, "\r\n");
    RunCommandAndCheckResultIs(command, "+OK\r\n");
  } else if (absl::GetFlag(FLAGS_greet)) {
    // TCP Connect does not ensure that the connection was indeed accepted by the server.
    // if server backlog is too short the connection will get stuck in the accept queue.
    // Therefore, we send a ping command to ensure that every connection got connected.
    RunCommandAndCheckResultIs("PING\r\n", "+PONG\r\n");
  }
  ep_ = ep;
  receive_fb_ = MakeFiber(fb2::Launch::dispatch, [this] { ReceiveFb(); });
}

void Driver::Run(uint64_t* cycle_ns, CommandGenerator* cmd_gen) {
  start_ns_ = absl::GetCurrentTimeNanos();
  uint32_t pipeline = std::max<uint32_t>(GetFlag(FLAGS_pipeline), 1u);
  bool should_throttle = GetFlag(FLAGS_qps) > 0;

  stats_.num_clients++;
  int64_t time_limit_ns =
      time_limit_ > 0 ? int64_t(time_limit_) * 1'000'000'000 + start_ns_ : INT64_MAX;
  int64_t now = start_ns_;
  SlotRange slot_range{0, kNumSlots - 1};
  CHECK_GT(num_reqs_, 0u);

  uint32_t num_batches = ((num_reqs_ - 1) / pipeline) + 1;

  for (unsigned i = 0; i < num_batches && now < time_limit_ns && !terminate_requested; ++i) {
    if (i == num_batches - 1) {  // last batch
      pipeline = num_reqs_ - i * pipeline;
    }

    string out_buf;
    for (unsigned j = 0; j < pipeline; ++j) {
      // TODO: this skews the distribution if slot ranges are uneven.
      // Ideally we would like to pick randomly a single slot from all the ranges we have
      // and pass it to cmd_gen->Next below.
      if (!shard_slots_.Empty()) {
        slot_range = shard_slots_.NextSlotRange(ep_, i);
      }

      absl::StrAppend(&out_buf, cmd_gen->Next(slot_range));

      Req req;
      req.start = absl::GetCurrentTimeNanos();
      req.might_hit = cmd_gen->might_hit();

      reqs_.push(req);

      if (out_buf.size() >= 8192) {
        error_code ec = socket_->Write(io::Buffer(out_buf));
        out_buf.clear();
        if (ec && FiberSocketBase::IsConnClosed(ec)) {
          // TODO: report failure
          VLOG(1) << "Connection closed";
          break;
        }
        CHECK(!ec) << ec.message();
      }
      if (cmd_gen->noreply()) {
        PopRequest();
      }
    }

    if (!out_buf.empty()) {
      error_code ec = socket_->Write(io::Buffer(out_buf));
      CHECK(!ec || FiberSocketBase::IsConnClosed(ec)) << ec.message();
    }

    now = absl::GetCurrentTimeNanos();
    if (cycle_ns) {
      int64_t target_ts = start_ns_ + i * (*cycle_ns);
      int64_t sleep_ns = target_ts - now;
      if (reqs_.size() > pipeline * 2 && should_throttle && sleep_ns <= 0) {
        sleep_ns = 10'000;
      }

      if (sleep_ns > 0) {
        VLOG(5) << "Sleeping for " << sleep_ns << "ns";
        // There is no point in sending more requests if they are piled up in the server.
        do {
          ThisFiber::SleepFor(chrono::nanoseconds(sleep_ns));
        } while (should_throttle && reqs_.size() > pipeline * 2);
      } else if (i % 256 == 255) {
        ThisFiber::Yield();
        VLOG(5) << "Behind QPS schedule";
      }
    } else {
      // Coordinated omission.

      fb2::NoOpLock lk;
      cnd_.wait(lk, [this] { return reqs_.empty(); });
    }
  }

  int64_t finish = absl::GetCurrentTimeNanos();
  VLOG(1) << "Done queuing " << num_reqs_ << " requests, which took "
          << StrFormat("%.1fs", double(finish - start_ns_) / 1000'000'000)
          << ". Waiting for server processing";

  // TODO: to change to a condvar or something.
  while (!reqs_.empty()) {
    ThisFiber::SleepFor(1ms);
  }
  Shutdown();
}

void Driver::Shutdown() {
  std::ignore = socket_->Shutdown(SHUT_RDWR);  // breaks the receive fiber.
  receive_fb_.Join();
  std::ignore = socket_->Close();
  stats_.num_clients--;
}

static string_view FindLine(io::Bytes buf) {
  if (buf.size() < 2)
    return {};
  for (unsigned i = 0; i < buf.size() - 1; ++i) {
    if (buf[i] == '\r' && buf[i + 1] == '\n') {
      return io::View(buf.subspan(0, i + 2));
    }
  }
  return {};
};

void Driver::PopRequest() {
  uint64_t now = absl::GetCurrentTimeNanos();
  uint64_t usec = (now - reqs_.front().start) / 1000;
  stats_.online_hist.Add(usec);
  stats_.total_hist.Add(usec);
  stats_.hit_opportunities += reqs_.front().might_hit;
  ++received_;
  reqs_.pop();
  if (reqs_.empty()) {
    cnd_.notify_one();
  }
  ++stats_.num_responses;
}

void Driver::ReceiveFb() {
  uint64_t now = absl::GetCurrentTimeNanos();
  while (true) {
    io_buf_.EnsureCapacity(256);
    auto buf = io_buf_.AppendBuffer();
    VLOG(3) << "Socket read: " << reqs_.size();

    ::io::Result<size_t> recv_sz = socket_->Recv(buf);
    CHECK(recv_sz) << recv_sz.error().message();

    if (*recv_sz == 0) {
      LOG_IF(DFATAL, !reqs_.empty())
          << "Broke with " << reqs_.size() << " requests,  received: " << received_;
      // clear reqs - to prevent Driver::Run block on them indefinitely.
      decltype(reqs_)().swap(reqs_);
      break;
    }

    io_buf_.CommitWrite(*recv_sz);

    if (protocol == RESP) {
      ParseRESP();
    } else {
      // MC_TEXT
      ParseMC();
    }
  }
  double usec = (absl::GetCurrentTimeNanos() - now) / 1000;
  if (usec > 0)
    stats_.qps += uint64_t(double(received_) * 1e6 / usec);
  VLOG(1) << "ReceiveFb done";
}

void Driver::ParseRESP() {
  uint32_t consumed = 0;
  RedisParser::Result result = RedisParser::OK;
  RespVec parse_args;
  constexpr string_view kMovedErrorKey = "MOVED"sv;
  boost::system::error_code ec;

  do {
    result = parser_.Parse(io_buf_.InputBuffer(), &consumed, &parse_args);
    if (result == RedisParser::OK && !parse_args.empty()) {
      if (parse_args[0].type == RespExpr::ERROR) {
        string_view error = parse_args[0].GetView();
        VLOG(2) << "Error " << error;
        if (absl::StartsWith(error, kMovedErrorKey)) {
          error = error.substr(kMovedErrorKey.length());
          vector<string_view> parts =
              absl::StrSplit(absl::StripTrailingAsciiWhitespace(error), ' ', absl::SkipEmpty());

          CHECK_EQ(parts.size(), 2u);
          uint32_t slot_id;
          CHECK(absl::SimpleAtoi(parts[0], &slot_id));

          vector<string_view> addr_parts = absl::StrSplit(parts[1], ':');
          CHECK_EQ(2u, addr_parts.size());

          auto host = boost::asio::ip::make_address(addr_parts[0], ec);
          CHECK(!ec) << "make_address failed with error: " << ec.message()
                     << " while parsing address " << addr_parts[0];

          uint32_t port;
          CHECK(absl::SimpleAtoi(addr_parts[1], &port));
          CHECK_LT(port, 65536u);

          shard_slots_.MoveSlot(ep_, tcp::endpoint(host, port), slot_id);
        }
        ++stats_.num_errors;
      } else if (reqs_.front().might_hit && parse_args[0].type != RespExpr::NIL) {
        ++stats_.hit_count;
      }
      parse_args.clear();
      PopRequest();
    }
    io_buf_.ConsumeInput(consumed);
  } while (result == RedisParser::OK && io_buf_.InputLen() > 0);
}

void Driver::ParseMC() {
  while (true) {
    string_view line = FindLine(io_buf_.InputBuffer());
    if (line.empty())
      break;

    CHECK_EQ(line.back(), '\n');
    if (line == "STORED\r\n" || line == "END\r\n") {
      PopRequest();
      blob_len_ = 0;
    } else if (absl::StartsWith(line, "VALUE")) {
      // last token is a blob length.
      auto it = line.rbegin();
      while (it != line.rend() && *it != ' ')
        ++it;
      size_t len = it - line.rbegin() - 2;
      const char* start = &(*it) + 1;
      if (!absl::SimpleAtoi(string(start, len), &blob_len_)) {
        LOG(ERROR) << "Invalid blob len " << line;
        return;
      }
      ++stats_.hit_count;
    } else if (absl::StartsWith(line, "SERVER_ERROR")) {
      ++stats_.num_errors;
      PopRequest();
      blob_len_ = 0;
    } else {
      auto handle = socket_->native_handle();
      CHECK_EQ(blob_len_ + 2, line.size()) << line;
      blob_len_ = 0;
      VLOG(2) << "Got line " << handle << ": " << line;
    }
    io_buf_.ConsumeInput(line.size());
  }
}

void TLocalClient::Connect(const tcp::endpoint& ep, const vector<tcp::endpoint>& endpoints) {
  VLOG(2) << "Connecting client..." << ep;

  unsigned conn_per_shard = GetFlag(FLAGS_c);
  if (shard_slots_->Empty()) {
    drivers_.resize(conn_per_shard);
  } else {
    drivers_.resize(shard_slots_->Size() * conn_per_shard);
  }

  for (auto& driver : drivers_) {
    driver.reset(new Driver{GetFlag(FLAGS_n), GetFlag(FLAGS_test_time), &stats, p_, shard_slots_});
  }
  vector<fb2::Fiber> fbs(drivers_.size());

  for (size_t i = 0; i < fbs.size(); ++i) {
    vector<SlotRange> slots;
    tcp::endpoint shard_ep = ep;
    if (!shard_slots_->Empty()) {
      size_t shard = i / conn_per_shard;
      shard_ep = endpoints[shard];
    }
    fbs[i] =
        fb2::Fiber(StrCat("connect/", i), [&, shard_ep, i] { drivers_[i]->Connect(i, shard_ep); });
  }

  for (auto& fb : fbs)
    fb.Join();
}

void TLocalClient::Disconnect() {
  for (size_t i = 0; i < drivers_.size(); ++i) {
    drivers_[i]->Shutdown();
  }
}

void TLocalClient::Start(uint32_t key_min, uint32_t key_max, uint64_t cycle_ns) {
  key_gen_.emplace(key_min, key_max);
  cmd_gen_.emplace(&key_gen_.value());

  driver_fbs_.resize(drivers_.size());
  if (!shard_slots_->Empty()) {
    key_gen_->EnableClusterMode();
  }
  cur_cycle_ns_ = cycle_ns;
  target_cycle_ = cycle_ns;
  start_time_ = absl::GetCurrentTimeNanos();

  for (size_t i = 0; i < driver_fbs_.size(); ++i) {
    driver_fbs_[i] = fb2::Fiber(StrCat("run/", i), [&, i] {
      drivers_[i]->Run(cur_cycle_ns_ ? &cur_cycle_ns_ : nullptr, &cmd_gen_.value());
    });
  }
}

void TLocalClient::Join() {
  for (auto& fb : driver_fbs_)
    fb.Join();

  VLOG(1) << "Total hits: " << stats.hit_count;
}

void TLocalClient::AdjustCycle() {
  if (cur_cycle_ns_ == 0 || stats.num_responses == 0)
    return;

  // We adjust sleeping cycle per thread, and it's the same for all connection in this thread.
  // We compute the aggregated cycle so far based on responses, and if it
  // is greater than current we increase the current cycle. Otherwise,
  // we try slowly reducing the cycle back to the nominal one.

  int64_t running_time = absl::GetCurrentTimeNanos() - start_time_;
  int64_t real_cycle = running_time * drivers_.size() / stats.num_responses;
  if (real_cycle > cur_cycle_ns_ * 1.05) {
    cur_cycle_ns_ = (cur_cycle_ns_ + real_cycle) / 2;
    VLOG(1) << "Increasing cycle to " << cur_cycle_ns_;
  } else if (cur_cycle_ns_ > target_cycle_) {
    cur_cycle_ns_ -= (cur_cycle_ns_ - target_cycle_) * 0.2;
  }
}

thread_local unique_ptr<TLocalClient> client;

void WatchFiber(size_t num_shards, atomic_bool* finish_signal, ProactorPool* pp) {
  fb2::Mutex mutex;

  int64_t start_time = absl::GetCurrentTimeNanos();
  LOG(INFO) << "Started watching";

  int64_t last_print = start_time;
  uint64_t num_last_resp_cnt = 0;
  num_shards = max<size_t>(num_shards, 1u);
  uint64_t resp_goal = GetFlag(FLAGS_c) * pp->size() * GetFlag(FLAGS_n) * num_shards;
  uint32_t time_limit = GetFlag(FLAGS_test_time);
  bool should_throttle = GetFlag(FLAGS_qps) > 0;

  while (*finish_signal == false) {
    // we sleep with resolution of 1s but print with lower frequency to be more responsive
    // when benchmark finishes.
    ThisFiber::SleepFor(1s);
    if (should_throttle) {
      pp->AwaitBrief([](auto, auto*) { client->AdjustCycle(); });
    }

    int64_t now = absl::GetCurrentTimeNanos();
    if (now - last_print < 5000'000'000LL)  // 5s
      continue;

    ClientStats stats;
    float done_max = 0;
    float done_min = 1;
    unsigned max_pending = 0;

    pp->AwaitFiberOnAll([&](auto* p) {
      auto [mind, maxd] = client->GetMinMaxDone();
      unsigned max_pend = client->MaxPending();

      unique_lock lk(mutex);
      stats += client->stats;
      done_max = max(done_max, maxd);
      done_min = min(done_min, mind);
      max_pending = max(max_pending, max_pend);
      client->stats.online_hist.Clear();
    });

    uint64_t total_ms = (now - start_time) / 1'000'000;
    uint64_t period_ms = (now - last_print) / 1'000'000;
    uint64_t period_resp_cnt = stats.num_responses - num_last_resp_cnt;
    double done_perc = time_limit > 0 ? double(total_ms) / (10 * time_limit)
                                      : double(stats.num_responses) * 100 / resp_goal;
    double hitrate = stats.hit_opportunities > 0
                         ? 100 * double(stats.hit_count) / double(stats.hit_opportunities)
                         : 0;
    unsigned latency = stats.online_hist.Percentile(99);

    CONSOLE_INFO << total_ms / 1000 << "s: " << StrFormat("%.1f", done_perc)
                 << "% done, RPS(now/agg): " << period_resp_cnt * 1000 / period_ms << "/"
                 << stats.num_responses * 1000 / total_ms << ", errs: " << stats.num_errors
                 << ", hitrate: " << StrFormat("%.1f%%", hitrate)
                 << ", clients: " << stats.num_clients << "\n"
                 << "done_min: " << StrFormat("%.2f%%", done_min * 100)
                 << ", done_max: " << StrFormat("%.2f%%", done_max * 100)
                 << ", p99_lat(us): " << latency << ", max_pending: " << max_pending;

    last_print = now;
    num_last_resp_cnt = stats.num_responses;
  }
}

ClusterShards FetchClusterInfo(const tcp::endpoint& ep, ProactorBase* proactor) {
  unique_ptr<FiberSocketBase> socket(proactor->CreateSocket());
  error_code ec = socket->Connect(ep);
  CHECK(!ec) << "Could not connect to " << ep << " " << ec;

  if (const auto password = GetFlag(FLAGS_password); !password.empty()) {
    RunCommandAndCheckResultIs(StrFormat("AUTH %s\r\n", password), "+OK\r\n", socket.get());
  }

  ec = socket->Write(io::Buffer("cluster nodes\r\n"));
  CHECK(!ec);
  facade::RedisParser parser{RedisParser::CLIENT, 1024};
  uint8_t buf[1024];
  RespVec resp_vec;
  while (true) {
    io::Result<size_t> res = socket->Recv(buf);
    CHECK(res) << res.error().message();
    RespExpr::Buffer bytes(buf, *res);
    uint32_t consumed = 0;
    facade::RedisParser::Result result = parser.Parse(bytes, &consumed, &resp_vec);
    if (result == facade::RedisParser::OK) {
      break;
    }
    CHECK_EQ(result, facade::RedisParser::INPUT_PENDING);
  }
  CHECK_EQ(1u, resp_vec.size());
  std::ignore = socket->Close();
  if (resp_vec.front().type == RespExpr::ERROR) {
    LOG(INFO) << "Cluster command failed " << resp_vec.front().GetString();
    return {};
  }
  string cluster_spec = resp_vec.front().GetString();
  LOG(INFO) << "Cluster spec: " << cluster_spec;
  vector<string_view> lines = absl::StrSplit(cluster_spec, '\n', absl::SkipEmpty());
  ClusterShards res;
  for (string_view line : lines) {
    vector<string_view> parts = absl::StrSplit(line, ' ');
    // <id> <ip:port@cport[,hostname]> <flags> <master> <ping-sent> <pong-recv>
    // <config-epoch> <link-state> <slot> <slot> ... <slot>
    if (parts.size() < 9) {
      LOG(WARNING) << "Skipping line: " << line;
      continue;
    }
    ShardInfo shard;
    vector<string_view> addr_parts = absl::StrSplit(parts[1], ':');
    CHECK_EQ(2u, addr_parts.size());
    string host(addr_parts[0]);
    char ip_addr[INET6_ADDRSTRLEN];
    std::error_code ec = fb2::DnsResolve(host, ip_addr);
    CHECK(!ec) << "Could not resolve " << host << " " << ec;
    auto address = ::boost::asio::ip::make_address(ip_addr);

    uint32_t val;
    vector<string_view> port_parts = absl::StrSplit(addr_parts[1], '@');
    CHECK_EQ(2u, port_parts.size());
    CHECK(absl::SimpleAtoi(port_parts[0], &val));
    CHECK_LT(val, 65536u);

    shard.endpoint = tcp::endpoint(address, val);

    string_view flags = parts[2];
    absl::flat_hash_set<string_view> flags_set(absl::StrSplit(flags, ','));
    if (!flags_set.contains("master")) {
      LOG(INFO) << "Skipping non-master node " << shard.endpoint << " " << flags;
      continue;
    }

    for (size_t i = 8; i < parts.size(); ++i) {
      vector<string_view> slots = absl::StrSplit(parts[i], '-');
      if (!absl::SimpleAtoi(slots[0], &val) || val >= kNumSlots) {
        LOG(ERROR) << "Invalid slot definition " << parts[i];
        continue;
      }
      SlotRange slot_range{uint16_t(val), uint16_t(val)};
      if (slots.size() > 1) {
        CHECK(absl::SimpleAtoi(slots[1], &val));
        slot_range.second = val;
      }
      shard.slots.push_back(slot_range);
    }
    res.push_back(shard);
  }

  return res;
}

int main(int argc, char* argv[]) {
  MainInitGuard guard(&argc, &argv);

  unique_ptr<ProactorPool> pp;
#ifdef __linux__
  pp.reset(fb2::Pool::IOUring(256));
#else
  pp.reset(fb2::Pool::Epoll());
#endif
  pp->Run();
  fb2::InitDnsResolver(2000);

  ProactorBase::RegisterSignal({SIGTERM}, pp->GetNextProactor(), [](int) {
    CONSOLE_INFO << "terminate requested";
    terminate_requested = true;
  });

  string proto_str = GetFlag(FLAGS_P);
  if (proto_str == "memcache_text") {
    protocol = MC_TEXT;
  } else {
    CHECK(proto_str.empty());
    protocol = RESP;
  }

  string dist = GetFlag(FLAGS_key_dist);

  if (dist == "U") {
    dist_type = UNIFORM;
  } else if (dist == "N") {
    dist_type = NORMAL;
  } else if (dist == "Z") {
    dist_type = ZIPFIAN;
  } else if (dist == "S") {
    dist_type = SEQUENTIAL;
  } else {
    LOG(FATAL) << "Unknown distribution type: " << dist;
  }

  auto* proactor = pp->GetNextProactor();
  char ip_addr[128];

  error_code ec =
      proactor->Await([&] { return fb2::DnsResolve(GetFlag(FLAGS_h), 2000, ip_addr, proactor); });
  CHECK(!ec) << "Could not resolve " << GetFlag(FLAGS_h) << " " << ec;

  auto address = ::boost::asio::ip::make_address(ip_addr);
  tcp::endpoint ep{address, GetFlag(FLAGS_p)};

  ClusterShards shards;
  if (protocol == RESP && GetFlag(FLAGS_probe_cluster)) {
    shards = proactor->Await([&] { return FetchClusterInfo(ep, proactor); });
  }
  CONSOLE_INFO << "Connecting to "
               << (shards.empty() ? string("single node ")
                                  : absl::StrCat(shards.size(), " shard cluster"));

  if (!shards.empty() && !GetFlag(FLAGS_command).empty() && GetFlag(FLAGS_cluster_skip_tags)) {
    // For custom commands we may need to use the same hashtag for multiple keys.
    LOG(WARNING) << "Enforcing hash tags for custom commands";
    absl::SetFlag(&FLAGS_cluster_skip_tags, false);
  }

  ShardSlots shard_slots;
  shard_slots.SetClusterSlotRanges(shards);
  std::vector<tcp::endpoint> shard_endpoints = shard_slots.Endpoints();
  pp->AwaitBrief([&](unsigned index, auto* p) {
    base::SplitMix64 seed_mix(GetFlag(FLAGS_seed) + index * 0x6a45554a264d72bULL);
    auto seed = seed_mix();
    VLOG(1) << "Seeding bitgen with seed " << seed;
    bit_gen.seed(seed);
  });

  pp->AwaitFiberOnAll([&](unsigned index, auto* p) {
    client = make_unique<TLocalClient>(p, &shard_slots);
    client->Connect(ep, shard_endpoints);
  });

  absl::Duration duration;
  if (absl::GetFlag(FLAGS_connect_only)) {
    pp->AwaitFiberOnAll([&](unsigned index, auto* p) { client->Disconnect(); });
  } else {
    const uint32_t key_minimum = GetFlag(FLAGS_key_minimum);
    const uint32_t key_maximum = GetFlag(FLAGS_key_maximum);
    CHECK_LE(key_minimum, key_maximum);

    uint32_t thread_key_step = 0;
    uint32_t desired_qps = abs(GetFlag(FLAGS_qps));
    bool throttle = GetFlag(FLAGS_qps) > 0;
    const int64_t interval = desired_qps ? 1'000'000'000LL / desired_qps : 0;
    uint64_t num_reqs = GetFlag(FLAGS_n);

    uint64_t total_conn_num = GetFlag(FLAGS_c) * pp->size();
    uint64_t total_requests = num_reqs * total_conn_num;
    uint32_t time_limit = GetFlag(FLAGS_test_time);

    if (dist_type == SEQUENTIAL) {
      thread_key_step = std::max(1UL, (key_maximum - key_minimum + 1) / pp->size());
      if (total_requests > (key_maximum - key_minimum)) {
        CONSOLE_INFO << "Warning: only " << key_maximum - key_minimum
                     << " unique entries will be accessed with " << total_requests
                     << " total requests";
      }
    }

    if (!time_limit) {
      CONSOLE_INFO << "Running " << pp->size() << " threads, sending " << num_reqs
                   << " requests per each connection, or " << total_requests << " requests overall "
                   << (throttle ? "with" : "without") << " throttling";
    }
    if (interval) {
      CONSOLE_INFO << "At a rate of " << desired_qps << " rps per connection, i.e. request every "
                   << interval / 1000 << "us";
      CONSOLE_INFO << "Overall scheduled RPS: " << desired_qps * total_conn_num;
    } else {
      CONSOLE_INFO << "Coordinated omission mode - the rate is determined by the server";
    }

    atomic_bool finish{false};
    pp->AwaitBrief([&](unsigned index, auto* p) {
      uint32_t key_max = (thread_key_step > 0 && index + 1 < pp->size())
                             ? key_minimum + (index + 1) * thread_key_step - 1
                             : key_maximum;
      client->Start(key_minimum + index * thread_key_step, key_max, interval);
    });

    auto watch_fb =
        pp->GetNextProactor()->LaunchFiber([&] { WatchFiber(shards.size(), &finish, pp.get()); });
    const absl::Time start_time = absl::Now();

    // The actual run.
    pp->AwaitFiberOnAll([&](unsigned index, auto* p) { client->Join(); });

    duration = absl::Now() - start_time;
    finish.store(true);
    watch_fb.Join();
  }

  fb2::Mutex mutex;

  LOG(INFO) << "Resetting all threads";

  ClientStats summary;
  pp->AwaitFiberOnAll([&](auto* p) {
    unique_lock lk(mutex);
    summary += client->stats;
    lk.unlock();
    client.reset();
  });

  CONSOLE_INFO << "\nTotal time: " << duration
               << ". Overall number of requests: " << summary.num_responses
               << ", QPS: " << summary.qps << ", P99 lat: " << summary.total_hist.Percentile(99)
               << "us";

  if (summary.num_errors) {
    CONSOLE_INFO << "Got " << summary.num_errors << " error responses!";
  }

  CONSOLE_INFO << "Latency summary, all times are in usec:\n" << summary.total_hist.ToString();
  if (summary.hit_opportunities) {
    CONSOLE_INFO << "----------------------------------\nHit rate: "
                 << 100 * double(summary.hit_count) / double(summary.hit_opportunities) << "%\n";
  }
  pp->Stop();

  return 0;
}


================================================
FILE: src/server/dfly_main.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/flags/parse.h>
#include <absl/flags/usage.h>
#include <absl/flags/usage_config.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <absl/strings/strip.h>

#include "absl/cleanup/cleanup.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/numbers.h"

#ifdef DFLY_ENABLE_MEMORY_TRACKING
#define INJECT_ALLOCATION_TRACKER
#include "core/allocation_tracker.h"
#else
#include <mimalloc-new-delete.h>
#endif

#ifdef __linux__
#include "util/fibers/uring_proactor.h"
#endif

#include <mimalloc.h>
#include <signal.h>

#include <iostream>
#include <memory>

#ifdef USE_AFL
#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#include <thread>
#endif

#include "base/init.h"
#include "base/proc_util.h"  // for GetKernelVersion
#include "facade/dragonfly_listener.h"
#include "io/file.h"
#include "io/file_util.h"
#include "io/proc_reader.h"
#include "server/common.h"
#include "server/generic_family.h"
#include "server/main_service.h"
#include "server/server_family.h"
#include "server/version.h"
#include "server/version_monitor.h"
#include "strings/human_readable.h"
#include "util/accept_server.h"
#include "util/fibers/pool.h"
#include "util/varz.h"

#ifdef __APPLE__
#include <crt_externs.h>
#define environ (*_NSGetEnviron())
#else
extern char** environ;
#endif

using namespace std;

ABSL_DECLARE_FLAG(int32_t, port);
ABSL_DECLARE_FLAG(uint32_t, memcached_port);
ABSL_DECLARE_FLAG(uint16_t, admin_port);
ABSL_DECLARE_FLAG(std::string, admin_bind);
ABSL_DECLARE_FLAG(strings::MemoryBytesFlag, maxmemory);

ABSL_FLAG(string, bind, "",
          "Bind address. If empty - binds on all interfaces. "
          "It's not advised due to security implications.");
ABSL_FLAG(string, pidfile, "", "If not empty - server writes its pid into the file");
ABSL_FLAG(string, unixsocket, "",
          "If not empty - specifies path for the Unix socket that will "
          "be used for listening for incoming connections.");
ABSL_FLAG(string, unixsocketperm, "", "Set permissions for unixsocket, in octal value.");
ABSL_FLAG(bool, force_epoll, false,
          "If true - uses linux epoll engine underneath. "
          "Can fit for kernels older than 5.10.");
ABSL_FLAG(
    string, allocation_tracker, "",
    "Logs stack trace of memory allocation within these ranges. Format is min:max,min:max,....");

ABSL_FLAG(bool, version_check, true,
          "If true, Will monitor for new releases on Dragonfly servers once a day.");

ABSL_FLAG(uint16_t, tcp_backlog, 256, "TCP listen(2) backlog parameter.");
ABSL_FLAG(uint16_t, uring_recv_buffer_cnt, 0,
          "How many buffer ring entries to allocate per thread for io_uring receive operations. "
          "Relevant only for modern kernels with io_uring enabled");

ABSL_FLAG(bool, omit_basic_usage, false, "Omit printing basic usage info.");

#ifdef USE_AFL
ABSL_FLAG(uint32_t, afl_loop_limit, UINT_MAX,
          "AFL++ persistent mode loop limit. Specifies how many fuzzing iterations "
          "to run before restarting the process. Higher values improve performance but "
          "may accumulate state.");
ABSL_FLAG(uint16_t, afl_target_port, 0,
          "Port to send fuzz input to. Defaults to --port (RESP). "
          "Set to --memcached_port to fuzz the memcache protocol.");
#endif

using namespace util;
using namespace facade;
using namespace io;
using absl::GetFlag;
using absl::StrCat;
using strings::HumanReadableNumBytes;

namespace dfly {

namespace {

#if ABSL_HAVE_ADDRESS_SANITIZER
// Increase stack size for all debug builds; tools like ASAN can require more than 50 KB.
constexpr size_t kAsanFactor = 2;
#else
constexpr size_t kAsanFactor = 1;
#endif

#ifdef NDEBUG
constexpr size_t kFiberStackBase = 32_KB;
#else
constexpr size_t kFiberStackBase = 48_KB;
#endif

// Default stack size for fibers. We decrease it by 16 bytes because some allocators
// need additional 8-16 bytes for their internal structures, thus over reserving additional
// memory pages if using round sizes.
constexpr size_t kFiberDefaultStackSize = kFiberStackBase * kAsanFactor - 16;

enum class TermColor : uint8_t { kDefault, kRed, kGreen, kYellow };

// Returns the ANSI color code for the given color. TermColor::kDefault is
// an invalid input.
const char* GetAnsiColorCode(TermColor color) {
  switch (color) {
    case TermColor::kRed:
      return "1";
    case TermColor::kGreen:
      return "2";
    case TermColor::kYellow:
      return "3";
    default:
      return nullptr;
  }
}

string ColorStart(TermColor color) {
  return StrCat("\033[0;3", GetAnsiColorCode(color), "m");
}

// Resets the terminal to default.
const char kColorEnd[] = "\033[m";

string ColoredStr(TermColor color, string_view str) {
  return StrCat(ColorStart(color), str, kColorEnd);
}

bool HelpshortFlags(std::string_view f) {
  return absl::StartsWith(f, "\033[0;32");
}

bool HelpFlags(std::string_view f) {
  return absl::StartsWith(f, "\033[0;3");
}

#define STRING_PP_NX(A) #A
#define STRING_MAKE_PP(A) STRING_PP_NX(A)

// This would create a string value from a "defined" location of the source code
// Note that SOURCE_PATH_FROM_BUILD_ENV is taken from the build system
#define BUILD_LOCATION_PATH STRING_MAKE_PP(SOURCE_PATH_FROM_BUILD_ENV)

string NormalizePaths(std::string_view path) {
  const std::string FULL_PATH = BUILD_LOCATION_PATH;
  const std::string FULL_PATH_SRC = FULL_PATH + "/src";
  const std::string FULL_PATH_HELIO = FULL_PATH + "/helio";

  if (absl::ConsumePrefix(&path, "../src/") || absl::ConsumePrefix(&path, FULL_PATH_SRC))
    return ColoredStr(TermColor::kGreen, path);

  if (absl::ConsumePrefix(&path, "../") || absl::ConsumePrefix(&path, FULL_PATH_HELIO))
    return ColoredStr(TermColor::kYellow, path);

  if (absl::ConsumePrefix(&path, "_deps/"))
    return string(path);

  return string(path);
}

template <typename... Args> unique_ptr<Listener> MakeListener(Args&&... args) {
  auto res = make_unique<Listener>(std::forward<Args>(args)...);
  res->SetConnFiberStackSize(kFiberDefaultStackSize);
  return res;
}

void RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
  uint64_t maxmemory = absl::GetFlag(FLAGS_maxmemory);
  if (maxmemory > 0 && maxmemory < pool->size() * 256_MB) {
    LOG(ERROR) << "There are " << pool->size() << " threads, so "
               << HumanReadableNumBytes(pool->size() * 256_MB) << " are required. Exiting...";
    exit(1);
  }

  Service service(pool);

  auto tcp_disabled = GetFlag(FLAGS_port) == 0u;
  Listener* main_listener = nullptr;

  std::vector<facade::Listener*> listeners;

  // If we ever add a new listener, plz don't change this,
  // we depend on tcp listener to be at the front since we later
  // need to pass it to the AclFamily::Init
  if (!tcp_disabled) {
    auto listener = MakeListener(Protocol::REDIS, &service, Listener::Role::MAIN);
    main_listener = listener.get();
    listeners.push_back(listener.release());
  }

  const auto& bind = GetFlag(FLAGS_bind);

  // Protected mode: if no bind address is specified and no password is set,
  // bind only to localhost to prevent unauthorized remote access.
  // Only enabled when running under systemd (INVOCATION_ID is set) to avoid
  // breaking containerized deployments where binding to localhost would make
  // the service unreachable from the host.
  // GetPassword() checks both --requirepass flag and DFLY_PASSWORD env var.
  bool running_under_systemd = getenv("INVOCATION_ID") != nullptr;
  bool protected_mode = running_under_systemd && bind.empty() && GetPassword().empty();
  const char* bind_addr = nullptr;
  if (protected_mode) {
    bind_addr = "127.0.0.1";
    LOG(WARNING) << "Protected mode enabled. Binding to localhost only because no password is set. "
                 << "To accept remote connections, set a password with --requirepass or "
                 << "specify a bind address with --bind.";
  } else if (!bind.empty()) {
    bind_addr = bind.c_str();
  }

  int32_t port = GetFlag(FLAGS_port);
  // The reason for this code is a bit silly. We want to provide a way to
  // bind any 'random' available port. The way to do that is to call
  // bind with the argument port 0. However we can't expose this functionality
  // as is to our users: Since giving --port=0 to redis DISABLES the network
  // interface that would break users' existing configurations in potentionally
  // unsafe ways. For that reason the user's --port=-1 means to us 'bind port 0'.
  if (port == -1) {
    port = 0;
  } else if (port < 0 || port > 65535) {
    LOG(ERROR) << "Bad port number " << port;
    exit(1);
  }

  auto mc_port = GetFlag(FLAGS_memcached_port);
  string unix_sock = GetFlag(FLAGS_unixsocket);
  bool unlink_uds = false;
  absl::Cleanup maybe_unlink_uds([&unlink_uds, &unix_sock]() {
    if (unlink_uds) {
      unlink(unix_sock.c_str());
    }
  });

  if (!unix_sock.empty()) {
    string perm_str = GetFlag(FLAGS_unixsocketperm);
    uint32_t unix_socket_perm;
    if (perm_str.empty()) {
      // get umask of running process, indicates the permission bits that are turned off
      mode_t umask_val = umask(0);
      umask(umask_val);
      unix_socket_perm = 0777 & ~umask_val;
    } else {
      if (!absl::numbers_internal::safe_strtoi_base(perm_str, &unix_socket_perm, 8) ||
          unix_socket_perm > 0777) {
        LOG(ERROR) << "Invalid unixsocketperm: " << perm_str;
        exit(1);
      }
    }
    unlink(unix_sock.c_str());

    auto uds_listener = MakeListener(Protocol::REDIS, &service);
    error_code ec =
        acceptor->AddUDSListener(unix_sock.c_str(), unix_socket_perm, uds_listener.get());
    if (ec) {
      if (tcp_disabled) {
        LOG(ERROR) << "Could not open unix socket " << unix_sock
                   << ", and TCP listening is disabled (error: " << ec << "). Exiting.";
        exit(1);
      } else {
        LOG(WARNING) << "Could not open unix socket " << unix_sock << ", error " << ec;
      }
    } else {
      LOG(INFO) << "Listening on unix socket " << unix_sock;
      listeners.push_back(uds_listener.release());
      unlink_uds = true;
    }
  } else if (tcp_disabled) {
    LOG(ERROR)
        << "Did not receive a unix socket to listen to, yet TCP listening is disabled. Exiting.";
    exit(1);
  }

  std::uint16_t admin_port = GetFlag(FLAGS_admin_port);
  if (admin_port != 0) {
    const std::string& admin_bind = GetFlag(FLAGS_admin_bind);
    // Note passing the result of c_str() for empty string in optimized mode don't work, we must
    // explicitly set this to null in this case
    const char* interface_addr = admin_bind.empty() ? nullptr : admin_bind.c_str();
    const std::string printable_addr =
        absl::StrCat("admin socket ", interface_addr ? interface_addr : "any", ":", admin_port);
    auto admin_listener = MakeListener(Protocol::REDIS, &service, Listener::Role::PRIVILEGED);

    error_code ec = acceptor->AddListener(interface_addr, admin_port, admin_listener.get());

    if (ec) {
      LOG(ERROR) << "Failed to open " << printable_addr << ", error: " << ec.message();
    } else {
      LOG(INFO) << "Listening on " << printable_addr;
      listeners.push_back(admin_listener.release());
    }
  }

  if (main_listener) {
    error_code ec = acceptor->AddListener(bind_addr, port, main_listener);

    if (ec) {
      LOG(ERROR) << "Could not open port " << port << ", error: " << ec.message();
      exit(1);
    }

    if (port == 0) {
      absl::SetFlag(&FLAGS_port, main_listener->socket()->LocalEndpoint().port());
    }
  }

  if (mc_port > 0 && !tcp_disabled) {
    auto listener = MakeListener(Protocol::MEMCACHE, &service);
    error_code ec = acceptor->AddListener(bind_addr, mc_port, listener.get());
    if (ec) {
      LOG(ERROR) << "Could not open memcached port " << mc_port << ", error: " << ec.message();
      exit(1);
    }
    listeners.push_back(listener.release());
  }

  service.Init(acceptor, listeners);

  VersionMonitor version_monitor;

  // check if it's a production release tag.
  if (GetFlag(FLAGS_version_check) && kGitTag[0] == 'v' && strchr(kGitTag, '-') == nullptr) {
    version_monitor.Run(pool);
  }

  // Start the acceptor loop and wait for the server to shutdown.
  acceptor->Run();
  google::FlushLogFiles(google::INFO);  // Flush the header.

  acceptor->Wait();

  version_monitor.Shutdown();
  service.Shutdown();
}

bool CreatePidFile(const string& path) {
  Result<WriteFile*> res = OpenWrite(path);
  if (!res) {
    LOG(ERROR) << "Failed to open pidfile with error: " << res.error().message() << ". Exiting...";
    return false;
  }

  unique_ptr<WriteFile> wf(res.value());
  auto ec = wf->Write(to_string(getpid()));
  if (ec) {
    LOG(ERROR) << "Failed to write pid into pidfile with error: " << ec.message() << ". Exiting...";
    return false;
  }

  ec = wf->Close();
  if (ec) {
    LOG(WARNING) << "Failed to close pidfile file descriptor with error: " << ec.message() << ".";
  }

  return true;
}

#ifdef __linux__
bool ShouldUseEpollAPI(const base::sys::KernelVersion& kver) {
  if (GetFlag(FLAGS_force_epoll))
    return true;

  if (kver.kernel < 5 || (kver.kernel == 5 && kver.major < 10)) {
    LOG(WARNING) << "Kernel is older than 5.10, switching to epoll engine.";
    return true;
  }

  struct io_uring ring;
  io_uring_params params;
  memset(&params, 0, sizeof(params));

  int iouring_res = io_uring_queue_init_params(1024, &ring, &params);

  if (iouring_res == 0) {
    io_uring_queue_exit(&ring);
    return false;
  }

  iouring_res = -iouring_res;

  if (iouring_res == ENOSYS) {
    LOG(WARNING) << "iouring API is not supported. switching to epoll.";
  } else if (iouring_res == ENOMEM) {
    LOG(WARNING) << "io_uring does not have enough memory. That can happen when your "
                    "max locked memory is too limited. If you run via docker, "
                    "try adding '--ulimit memlock=-1' to \"docker run\" command."
                    "Meanwhile, switching to epoll";
  } else {
    LOG(WARNING) << "Weird error " << iouring_res << " switching to epoll";
  }

  return true;
}

void GetCGroupPath(string* memory_path, string* cpu_path) {
  CHECK(memory_path != nullptr) << "memory_path is null! (this shouldn't happen!)";
  CHECK(cpu_path != nullptr) << "cpu_path is null! (this shouldn't happen!)";

  // Begin by reading /proc/self/cgroup

  auto cg = io::ReadFileToString("/proc/self/cgroup");
  CHECK(cg.has_value()) << "Failed to read /proc/self/cgroup";

  string cgv = std::move(cg).value();

  // Next, depending on cgroup version we either read:
  // N:<cgroup name>:<path> -- in case of v1, in many lines
  // 0::<cgroup name> -- in case of v2, in a single line

  auto stripped = absl::StripAsciiWhitespace(cgv);

  vector<string_view> groups = absl::StrSplit(stripped, '\n');

  if (groups.size() == 1) {
    // for v2 we only read 0::<name>
    size_t pos = cgv.rfind(':');
    if (pos == string::npos) {
      LOG(ERROR) << "Failed to parse cgroupv2 format, got: " << cgv;
      exit(1);
    }

    auto cgroup = string_view(cgv.c_str() + pos + 1);
    string_view cgroup_stripped = absl::StripTrailingAsciiWhitespace(cgroup);

    *memory_path = absl::StrCat("/sys/fs/cgroup/", cgroup_stripped);
    *cpu_path = *memory_path;  // in v2 the path to the cgroup is singular
  } else {
    for (const auto& sv : groups) {
      // in v1 the format is
      // N:s1:2 where N is an integer, s1, s2 strings with s1 maybe empty.
      vector<string_view> entry = absl::StrSplit(sv, ':');
      if (entry.size() != 3u) {
        LOG(ERROR) << "Unsupported group " << sv;
        continue;
      }

      // in v1 there are several 'canonical' cgroups
      // we are interested in the 'memory' and the 'cpu,cpuacct' ones
      // which specify memory and cpu limits, respectively.
      if (entry[1] == "memory")
        *memory_path = absl::StrCat("/sys/fs/cgroup/memory/", entry[2]);

      if (entry[1] == "cpu,cpuacct")
        *cpu_path = absl::StrCat("/sys/fs/cgroup/cpu,cpuacct/", entry[2]);
    }
  }
}

// returns true on success.
bool UpdateResourceLimitsIfInsideContainer(io::MemInfoData* mdata, size_t* max_threads) {
  using absl::StrCat;

  // did we succeed in reading *something*? if not, exit.
  // note that all processes in Linux are in some cgroup, so at the very
  // least we should read something.
  bool read_something = false;

  auto read_mem = [&read_something](string_view path, size_t* output) {
    auto file = io::ReadFileToString(path);
    DVLOG(1) << "container limits: read " << path << ": " << file.value_or("N/A");

    size_t temp = numeric_limits<size_t>::max();

    if (file.has_value()) {
      if (!absl::StartsWith(*file, "max"))
        CHECK(absl::SimpleAtoi(*file, &temp))
            << "Failed in parsing cgroup limits, path: " << path << " (read: " << *file << ")";
      read_something = true;
    }

    *output = min(*output, temp);
  };

  string mem_path, cpu_path;
  GetCGroupPath(&mem_path, &cpu_path);

  if (mem_path.empty() || cpu_path.empty()) {
    return true;  // not a container
  }

  VLOG(1) << "mem_path = " << mem_path;
  VLOG(1) << "cpu_path = " << cpu_path;

  /* Update memory limits */

  // Start by reading global memory limits
  auto parse_limits = [&](std::string_view base_mem) {
    read_mem(StrCat(base_mem, "/memory.limit_in_bytes"), &mdata->mem_total);
    read_mem(StrCat(base_mem, "/memory.max"), &mdata->mem_total);
  };

  // For v1
  constexpr auto base_mem_v1 = "/sys/fs/cgroup/memory"sv;
  parse_limits(base_mem_v1);
  // For v2 if the previous failed
  constexpr auto base_mem_v2 = "/sys/fs/cgroup"sv;
  parse_limits(base_mem_v2);
  // For v2 under /user.slice
  constexpr auto base_mem_v2_slice = "/sys/fs/cgroup/user.slice"sv;
  parse_limits(base_mem_v2_slice);

  // Read cgroup-specific limits
  read_mem(StrCat(mem_path, "/memory.limit_in_bytes"), &mdata->mem_total);
  read_mem(StrCat(mem_path, "/memory.max"), &mdata->mem_total);
  read_mem(StrCat(mem_path, "/memory.high"), &mdata->mem_avail);
  mdata->mem_avail = min(mdata->mem_avail, mdata->mem_total);

  /* Update thread limits */

  auto read_cpu = [&read_something](string_view path, size_t* output) {
    double count{0}, timeshare{1};

    /**
     * Summarized: the function does one of the following:
     *
     * 1. read path/cpu.max -- for v2. The format of this file is:
     *  $COUNT $PERIOD
     * which indicates that we can use upto $COUNT shares in a $PERIOD of time.
     * If $COUNT is max, then we can use as much CPU as the system has. Otherwise,
     * this translates to $COUNT/$PERIOD threads.
     *
     * 2. read path/cpu.cfs_quota_us & path/cpu.cfs_period_us -- same idea, but for v1.
     */

    if (auto cpu = ReadFileToString(StrCat(path, "/cpu.max")); cpu.has_value()) {
      vector<string_view> res = absl::StrSplit(*cpu, ' ');

      // Some linux distributions do not have anything there.
      if (res.size() == 2u) {
        if (res[0] == "max")
          *output = 0u;
        else {
          CHECK(absl::SimpleAtod(res[0], &count))
              << "Failed in parsing cgroupv2 cpu count, path = " << path << " (read: " << *cpu
              << ")";
          CHECK(absl::SimpleAtod(res[1], &timeshare))
              << "Failed in parsing cgroupv2 cpu timeshare, path = " << path << " (read: " << *cpu
              << ")";

          *output = static_cast<size_t>(ceil(count / timeshare));
        }

        read_something = true;
      }
    } else if (auto quota = ReadFileToString(StrCat(path, "/cpu.cfs_quota_us"));
               quota.has_value()) {
      auto period = ReadFileToString(StrCat(path, "/cpu.cfs_period_us"));

      CHECK(period.has_value()) << "Failed to read cgroup cpu.cfs_period_us, but read "
                                   "cpu.cfs_quota_us (this shouldn't happen!)";

      CHECK(absl::SimpleAtod(quota.value(), &count))
          << "Failed in parsing cgroupv1 cpu timeshare, quota = " << path << " (read: " << *quota
          << ")";

      if (count == -1)  // on -1 there is no limit.
        count = 0;

      CHECK(absl::SimpleAtod(period.value(), &timeshare))
          << "Failed in parsing cgroupv1 cpu timeshare, path = " << path << " (read: " << *period
          << ")";

      *output = static_cast<size_t>(count / timeshare);
      read_something = true;
    }
  };

  constexpr auto base_cpu = "/sys/fs/cgroup/cpu"sv;
  read_cpu(base_cpu, max_threads);  // global cpu limits
  constexpr auto base_cpu_v2 = "/sys/fs/cgroup"sv;
  read_cpu(base_cpu_v2, max_threads);  // global cpu limits
  constexpr auto base_cpu_v2_slice = "/sys/fs/cgroup/user.slice"sv;
  read_cpu(base_cpu_v2_slice, max_threads);  // global cpu limits
  read_cpu(cpu_path, max_threads);           // cgroup-specific limits

  if (!read_something) {
    LOG(ERROR) << "Failed in deducing any cgroup limits with paths " << mem_path << " and "
               << cpu_path;
    return false;
  }
  return true;
}

#endif

void SetupAllocationTracker(ProactorPool* pool) {
#ifdef DFLY_ENABLE_MEMORY_TRACKING
  string flag = absl::GetFlag(FLAGS_allocation_tracker);
  vector<pair<size_t, size_t>> track_ranges;
  for (string_view entry : absl::StrSplit(flag, ",", absl::SkipEmpty())) {
    auto separator = entry.find(":");
    if (separator == entry.npos) {
      LOG(ERROR) << "Can't find ':' in element";
      exit(-1);
    }

    pair<size_t, size_t> p;
    if (!absl::SimpleAtoi(entry.substr(0, separator), &p.first)) {
      LOG(ERROR) << "Can't parse first number in pair";
      exit(-1);
    }
    if (!absl::SimpleAtoi(entry.substr(separator + 1), &p.second)) {
      LOG(ERROR) << "Can't parse second number in pair";
      exit(-1);
    }

    track_ranges.push_back(p);
  }

  pool->AwaitBrief([&](unsigned, ProactorBase*) {
    for (auto range : track_ranges) {
      if (!AllocationTracker::Get().Add(
              {.lower_bound = range.first, .upper_bound = range.second, .sample_odds = 1.0})) {
        LOG(ERROR) << "Unable to track allocation range";
        exit(-1);
      }
    }
  });
#endif
}

void RegisterBufRings(ProactorPool* pool) {
#ifdef __linux__
  auto bufcnt = absl::GetFlag(FLAGS_uring_recv_buffer_cnt);
  if (bufcnt == 0) {
    return;
  }

  if (dfly::kernel_version < 602 || pool->at(0)->GetKind() != ProactorBase::IOURING) {
    LOG(WARNING) << "uring_recv_buffer_cnt is only supported on kernels >= 6.2 and with "
                    "io_uring proactor";
    return;
  }

  // We need a power of 2 length.
  bufcnt = absl::bit_ceil(bufcnt);
  pool->AwaitBrief([&](unsigned, ProactorBase* pb) {
    auto up = static_cast<fb2::UringProactor*>(pb);
    int res = up->RegisterBufferRing(facade::kRecvSockGid, bufcnt, facade::kRecvBufSize);
    if (res != 0) {
      LOG(ERROR) << "Failed to register buf ring for proactor "
                 << util::detail::SafeErrorMessage(res);
      exit(1);
    }
  });
  LOG(INFO) << "Registered a bufring with " << bufcnt << " buffers of size " << facade::kRecvBufSize
            << " per thread ";
#endif
}

class MiMallocResource : public PMR_NS::memory_resource {
 private:
  void* do_allocate(std::size_t size, std::size_t align) final {
    return mi_malloc_aligned(size, align);
  }

  void do_deallocate(void* ptr, std::size_t size, std::size_t align) final {
    mi_free_size_aligned(ptr, size, align);
  }

  bool do_is_equal(const PMR_NS::memory_resource& o) const noexcept final {
    return this == &o;
  }
};

MiMallocResource g_mi_resource;

#ifdef USE_AFL
// AFL++ fuzzing helper functions
// These functions support AFL++ persistent mode fuzzing by handling server readiness checks,
// input reading, and test case execution. The __AFL_LOOP macro itself must remain in main()
// due to AFL++ instrumentation requirements.

// Waits for the Dragonfly server to become ready by attempting TCP connections.
// Returns true if server is ready, false otherwise.
// This is necessary because the server starts in a separate thread and we need to
// wait for it to be fully initialized before starting the fuzzing loop.
bool WaitForServerReady(uint16_t port, int max_attempts = 100) {
  for (int i = 0; i < max_attempts; i++) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    int s = socket(AF_INET, SOCK_STREAM, 0);
    if (s >= 0) {
      struct sockaddr_in a = {};
      a.sin_family = AF_INET;
      a.sin_port = htons(port);
      inet_pton(AF_INET, "127.0.0.1", &a.sin_addr);
      if (connect(s, (struct sockaddr*)&a, sizeof(a)) == 0) {
        close(s);
        return true;
      }
      close(s);
    }
  }
  return false;
}

// Configures stdin to non-blocking mode for AFL++ fuzzing.
// Non-blocking mode is required because AFL++ feeds input through stdin,
// and we need to handle cases where input might not be immediately available.
void ConfigureStdinNonBlocking() {
  fcntl(STDIN_FILENO, F_SETFL, fcntl(STDIN_FILENO, F_GETFL) | O_NONBLOCK);
}

// Reads fuzzing input from stdin with retry logic.
// AFL++ provides test cases through stdin, and this function handles reading them
// with appropriate retry logic for non-blocking I/O.
// Returns the number of bytes read, or -1 on error, or 0 if no data available after retries.
ssize_t ReadFuzzInput(char* buffer, size_t buffer_size) {
  ssize_t len = 0;
  for (int attempt = 0; attempt < 100 && len == 0; attempt++) {
    len = read(STDIN_FILENO, buffer, buffer_size);
    if (len < 0 && errno == EAGAIN) {
      usleep(10000);  // Wait 10ms and retry
      continue;
    }
    if (len < 0)
      break;
  }
  return len;
}

// Sends fuzzing input to the Dragonfly server and reads the response.
// This executes one fuzzing iteration by:
// 1. Creating a TCP socket connection to the server
// 2. Sending the fuzzed data
// 3. Reading a response (with timeout to prevent hangs)
// The function uses short timeouts to keep fuzzing fast and prevent AFL++ from stalling.
void SendFuzzInputToServer(uint16_t port, const char* data, ssize_t len) {
  int s = socket(AF_INET, SOCK_STREAM, 0);
  if (s >= 0) {
    struct timeval tv = {.tv_sec = 0, .tv_usec = 200000};
    setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
    setsockopt(s, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));

    struct sockaddr_in a = {};
    a.sin_family = AF_INET;
    a.sin_port = htons(port);
    inet_pton(AF_INET, "127.0.0.1", &a.sin_addr);
    if (connect(s, (struct sockaddr*)&a, sizeof(a)) == 0) {
      send(s, data, len, MSG_NOSIGNAL);
      char r[4096];
      recv(s, r, sizeof(r), 0);
    }
    close(s);
  }
}

// Initializes AFL++ fuzzing by starting the server in a separate thread,
// waiting for it to become ready, and preparing stdin for fuzzing input.
// Returns the server thread handle. The caller is responsible for the fuzzing loop.
std::thread InitAflFuzzing(ProactorPool* pool, AcceptServer* acceptor) {
  // Start server in a separate thread
  std::thread server_thread([pool, acceptor]() {
    dfly::RunEngine(pool, acceptor);
    pool->Stop();
  });

  uint16_t target_port = GetFlag(FLAGS_afl_target_port);
  uint16_t port = target_port ? target_port : GetFlag(FLAGS_port);

  // Wait for server to become ready
  if (!WaitForServerReady(port)) {
    LOG(ERROR) << "AFL++: Server not ready after 100 attempts, exiting...";
    exit(1);
  }

  uint32_t afl_loop_limit = GetFlag(FLAGS_afl_loop_limit);
  LOG(INFO) << "AFL++: Server ready, starting fuzzing loop with limit " << afl_loop_limit
            << " iterations...";

  // Configure stdin for AFL++ input
  ConfigureStdinNonBlocking();

  return server_thread;
}

// Executes one AFL++ fuzzing iteration: reads input from stdin and sends it to the server.
// Returns true if the iteration was successful, false if stdin EOF or error occurred.
bool RunAflFuzzingIteration(uint16_t port) {
  char buf[64 * 1024];

  // Read fuzzing input from stdin
  ssize_t len = ReadFuzzInput(buf, sizeof(buf));

  if (len <= 0)
    return false;  // stdin EOF or error

  // Send fuzzed input to the server
  SendFuzzInputToServer(port, buf, len);
  return true;
}
#endif  // USE_AFL

}  // namespace
}  // namespace dfly

extern "C" void _mi_options_init();

using namespace dfly;

void sigill_hdlr(int signo) {
  LOG(ERROR) << "An attempt to execute an instruction failed."
             << "The root cause might be an old hardware. Exiting...";
  exit(1);
}

void PrintBasicUsageInfo() {
  std::string output =
      "                   .--::--.                   \n"
      "   :+*=:          =@@@@@@@@=          :+*+:   \n"
      "  %@@@@@@%*=.     =@@@@@@@@-     .=*%@@@@@@#  \n"
      "  @@@@@@@@@@@@#+-. .%@@@@#. .-+#@@@@@@@@@@@%  \n"
      "  -@@@@@@@@@@@@@@@@*:#@@#:*@@@@@@@@@@@@@@@@-  \n"
      "    :+*********####-%@%%@%-####********++.    \n"
      "   .%@@@@@@@@@@@@@%:@@@@@@:@@@@@@@@@@@@@@%    \n"
      "   .@@@@@@@@%*+-:   =@@@@=  .:-+*%@@@@@@@%.   \n"
      "     =*+-:           ###*          .:-+*=     \n"
      "                     %@@%                     \n"
      "                     *@@*                     \n"
      "                     +@@=                     \n"
      "                     :##:                     \n"
      "                     :@@:                     \n"
      "                      @@                      \n"
      "                      ..                      \n"
      "* Logs will be written to the first available of the following paths:\n";

  for (const auto& dir : google::GetLoggingDirectories()) {
    const string_view maybe_slash = absl::EndsWith(dir, "/") ? "" : "/";
    absl::StrAppend(&output, dir, maybe_slash, "dragonfly.*\n");
  }

  absl::StrAppend(&output,
                  "* For the available flags type dragonfly [--help | --helpfull]\n"
                  "* Documentation can be found at: https://www.dragonflydb.io/docs\n");

  std::cout << output;
  std::cout.flush();
}

void ParseFlagsFromEnv() {
  const auto& flags = absl::GetAllFlags();
  for (char** env = environ; *env != nullptr; env++) {
    constexpr string_view kPrefix = "DFLY_";
    string_view environ_var = *env;
    if (absl::StartsWith(environ_var, kPrefix)) {
      // Per 'man environ', environment variables are included with their values
      // in the format "name=value". Need to strip them apart, in order to work with flags object
      pair<string_view, string_view> environ_pair =
          absl::StrSplit(absl::StripPrefix(environ_var, kPrefix), absl::MaxSplits('=', 1));
      const auto& [flag_name, flag_value] = environ_pair;
      if (flag_name == "DEV_ENV") {
        continue;  // DFLY_DEV_ENV is used to skip version check.
      }

      auto entry = flags.find(flag_name);
      if (entry != flags.end()) {
        if (absl::flags_internal::WasPresentOnCommandLine(flag_name)) {
          continue;
        }
        string error;
        auto& flag = entry->second;
        bool success = flag->ParseFrom(flag_value, &error);
        if (!success) {
          LOG(FATAL) << "could not parse flag " << flag->Name()
                     << " from environment variable. Error: " << error;
        }
      } else {
        LOG(FATAL) << "unknown environment variable DFLY_" << flag_name;
      }
    }
  }
}

int main(int argc, char* argv[]) {
  absl::SetProgramUsageMessage(
      R"(a modern in-memory store.

Usage: dragonfly [FLAGS]
)");

  absl::FlagsUsageConfig config;
  config.contains_help_flags = dfly::HelpFlags;
  config.contains_helpshort_flags = dfly::HelpshortFlags;
  config.normalize_filename = dfly::NormalizePaths;
  config.version_string = [] {
    string version = StrCat(dfly::kGitTag, "-", dfly::kGitSha);
    return StrCat("dragonfly ", ColoredStr(TermColor::kGreen, version),
                  "\nbuild time: ", ColoredStr(TermColor::kYellow, dfly::kBuildTime), "\n");
  };

  absl::SetFlagsUsageConfig(config);
  google::InitGoogleLogging(argv[0]);
  google::SetLogFilenameExtension(".log");

  MainInitGuard guard(&argc, &argv);

  ParseFlagsFromEnv();

  if (!GetFlag(FLAGS_omit_basic_usage)) {
    PrintBasicUsageInfo();
  }

  LOG(INFO) << "Starting dragonfly " << GetVersion() << "-" << kGitSha;

  struct sigaction act;
  act.sa_handler = sigill_hdlr;
  sigemptyset(&act.sa_mask);
  sigaction(SIGILL, &act, nullptr);

  // Ignore SIGHUP to prevent termination when the parent shell exits
  signal(SIGHUP, SIG_IGN);

  if (GetFlag(FLAGS_port) == 0u) {
    string usock = GetFlag(FLAGS_unixsocket);
    if (usock.length() == 0u) {
      LOG(ERROR) << "received --port 0, yet no unix socket to listen to. Exiting.";
      exit(1);
    }
    LOG(INFO) << "received --port 0, disabling TCP listening.";
    LOG(INFO) << "listening on unix socket " << usock << ".";
  }

  if (GetFlag(FLAGS_dbnum) > dfly::kMaxDbId) {
    LOG(ERROR) << "dbnum is too big. Exiting...";
    return 1;
  }

  string pidfile_path = GetFlag(FLAGS_pidfile);
  if (!pidfile_path.empty()) {
    if (!CreatePidFile(pidfile_path)) {
      return 1;
    }
  }

  io::MemInfoData mem_info = ReadMemInfo().value_or(io::MemInfoData{});
  size_t max_available_threads = 0u;

#ifdef __linux__
  UpdateResourceLimitsIfInsideContainer(&mem_info, &max_available_threads);
#endif

  if (mem_info.swap_total != 0)
    LOG(WARNING) << "SWAP is enabled. Consider disabling it when running Dragonfly.";

  dfly::max_memory_limit = absl::GetFlag(FLAGS_maxmemory);

  if (dfly::max_memory_limit == 0) {
    LOG(INFO) << "maxmemory has not been specified. Deciding myself....";

    size_t available = mem_info.mem_avail;
    size_t maxmemory = size_t(0.8 * available);
    if (maxmemory == 0) {
      LOG(ERROR) << "Could not deduce how much memory available. "
                 << "Use --maxmemory=... to specify explicitly";
      return 1;
    }
    LOG(INFO) << "Found " << HumanReadableNumBytes(available)
              << " available memory. Setting maxmemory to " << HumanReadableNumBytes(maxmemory);

    absl::SetFlag(&FLAGS_maxmemory, maxmemory);
    dfly::max_memory_limit = maxmemory;
  } else {
    string hr_limit = HumanReadableNumBytes(dfly::max_memory_limit);
    if (dfly::max_memory_limit > mem_info.mem_avail)
      LOG(WARNING) << "Got memory limit " << hr_limit << ", however only "
                   << HumanReadableNumBytes(mem_info.mem_avail) << " was found.";
    LOG(INFO) << "Max memory limit is: " << hr_limit;
  }

  // Initialize mi_malloc options
  // export MIMALLOC_VERBOSE=1 to see the options before the override.
  // _default functions override the default options vaues but if the options were set
  // via the environment variables, they will not be overridden.
  mi_option_set_enabled_default(mi_option_show_errors, true);
  mi_option_set_default(mi_option_purge_delay, 0);

  // To see the options after the override, use:
  // mi_options_print();

  fb2::SetDefaultStackResource(&g_mi_resource, kFiberDefaultStackSize);

  {
    unique_ptr<util::ProactorPool> pool;

#ifdef __linux__
    base::sys::KernelVersion kver;
    base::sys::GetKernelVersion(&kver);

    CHECK_LT(kver.major, 99u);
    dfly::kernel_version = kver.kernel * 100 + kver.major;

    bool use_epoll = ShouldUseEpollAPI(kver);

    if (use_epoll) {
      pool.reset(fb2::Pool::Epoll(max_available_threads));
    } else {
      pool.reset(fb2::Pool::IOUring(1024, max_available_threads));  // 1024 - iouring queue size.
    }
#else
    pool.reset(fb2::Pool::Epoll(max_available_threads));
#endif

    pool->Run();

    SetupAllocationTracker(pool.get());
    RegisterBufRings(pool.get());

    AcceptServer acceptor(pool.get(), &g_mi_resource, true);
    acceptor.set_back_log(absl::GetFlag(FLAGS_tcp_backlog));

#ifdef USE_AFL
    //  Persistent mode fuzzing integration:
    // - AFL++ generates test cases and feeds them through stdin
    // - This code reads from stdin and forwards the data to a real TCP connection to the Dragonfly
    //   server
    // - The server runs in a separate thread and processes the fuzzed input as if it came from a
    //   normal client
    // - Each fuzzing iteration: read stdin -> send to server via TCP -> read response -> repeat
    //
    // Process lifecycle:
    // - When stdin closes (EOF), the fuzzing process exits - this is expected behavior
    // - When the fuzzing session completes (__AFL_LOOP finishes), the process exits with code 0
    // - Exiting with code 0 is REQUIRED for AFL++ to work correctly
    // - This also enables "dry run" mode where AFL++ tests that the target can be fuzzed before
    //   starting the actual fuzzing campaign

    std::thread server_thread = dfly::InitAflFuzzing(pool.get(), &acceptor);

    uint16_t target_port = GetFlag(FLAGS_afl_target_port);
    uint16_t port = target_port ? target_port : GetFlag(FLAGS_port);
    uint32_t afl_loop_limit = GetFlag(FLAGS_afl_loop_limit);
    unsigned int loop_iteration = 0;

    // AFL++ persistent mode loop - this macro MUST stay in main() for proper instrumentation
    while (__AFL_LOOP(afl_loop_limit)) {
      loop_iteration++;
      if (!dfly::RunAflFuzzingIteration(port))
        break;  // stdin EOF or error
    }

    // AFL++ fuzzing session completed successfully
    LOG(INFO) << "AFL++: Loop finished after " << loop_iteration << " iterations, exiting...";
    // Use _exit(0) to skip cleanup - required by AFL++ persistent mode
    _exit(0);
#else
    dfly::RunEngine(pool.get(), &acceptor);
    pool->Stop();
#endif

    if (!pidfile_path.empty()) {
      unlink(pidfile_path.c_str());
    }
  }

  return 0;
}


================================================
FILE: src/server/dflycmd.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/dflycmd.h"

#include <absl/random/random.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <absl/strings/strip.h>

#include <limits>
#include <memory>
#include <optional>
#include <utility>

#include "absl/cleanup/cleanup.h"
#include "absl/strings/numbers.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "facade/cmd_arg_parser.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/reply_builder.h"
#include "server/cluster_support.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/journal/journal.h"
#include "server/journal/streamer.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/rdb_save.h"
#include "server/replica.h"
#include "server/server_family.h"
#include "server/server_state.h"
#include "server/transaction.h"
#include "util/fibers/synchronization.h"
using namespace std;

ABSL_DECLARE_FLAG(bool, info_replication_valkey_compatible);
ABSL_DECLARE_FLAG(uint32_t, replication_timeout);
ABSL_DECLARE_FLAG(uint32_t, shard_repl_backlog_len);

namespace dfly {

using namespace facade;
using namespace util;

using std::string;
using util::ProactorBase;

std::string_view SyncStateName(DflyCmd::SyncState sync_state) {
  switch (sync_state) {
    case DflyCmd::SyncState::PREPARATION:
      return "preparation";
    case DflyCmd::SyncState::FULL_SYNC:
      return "full_sync";
    case DflyCmd::SyncState::STABLE_SYNC:
      return absl::GetFlag(FLAGS_info_replication_valkey_compatible) ? "online" : "stable_sync";
    case DflyCmd::SyncState::CANCELLED:
      return "cancelled";
  }
  DCHECK(false) << "Unspported state " << int(sync_state);
  return "unsupported";
}

namespace {
const char kBadMasterId[] = "bad master id";
const char kIdNotFound[] = "syncid not found";
const char kInvalidSyncId[] = "bad sync id";
const char kInvalidState[] = "invalid state";

bool ToSyncId(string_view str, uint32_t* num) {
  if (!absl::StartsWith(str, "SYNC"))
    return false;
  str.remove_prefix(4);

  return absl::SimpleAtoi(str, num);
}

bool WaitReplicaFlowToCatchup(absl::Time end_time, const DflyCmd::ReplicaInfo* replica,
                              EngineShard* shard, bool with_ping) {
  // We don't want any writes to the journal after we send the `PING`,
  // and expirations could ruin that.
  namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(false);

  if (with_ping) {
    // PING forces replica to send the most recent last_acked_lsn.
    // ACKS from the replica are send only every X commands or every 3 seconds (flag configurable)
    // or when forced (by the PING above).
    journal::RecordEntry(0, journal::Op::PING, 0, nullopt, {});
  }

  const FlowInfo* flow = &replica->flows[shard->shard_id()];

  while (flow->last_acked_lsn < journal::GetLsn()) {
    if (absl::Now() > end_time) {
      LOG(WARNING) << "Couldn't synchronize with replica for takeover in time: " << replica->address
                   << ":" << replica->listening_port << ", last acked: " << flow->last_acked_lsn
                   << ", expecting " << journal::GetLsn();
      return false;
    }
    if (!replica->exec_st.IsRunning()) {
      return false;
    }
    LOG_EVERY_T(INFO, 1) << "Replica lsn:" << flow->last_acked_lsn
                         << " master lsn:" << journal::GetLsn()
                         << "; Journal streamer state: " << flow->streamer->FormatInternalState();
    ThisFiber::SleepFor(1ms);
  }

  return true;
}

}  // namespace

void DflyCmd::ReplicaInfo::Cancel() {
  util::fb2::LockGuard lk{shared_mu};
  if (replica_state == SyncState::CANCELLED) {
    return;
  }

  LOG(INFO) << "Disconnecting from replica " << address << ":" << listening_port;

  // Update state and cancel context.
  replica_state = SyncState::CANCELLED;
  exec_st.ReportCancelError();
  // Wait for tasks to finish.
  shard_set->RunBlockingInParallel([this](EngineShard* shard) {
    VLOG(2) << "Disconnecting flow " << shard->shard_id();

    FlowInfo* flow = &flows[shard->shard_id()];
    if (flow->cleanup) {
      flow->cleanup();
    }
    VLOG(2) << "After flow cleanup " << shard->shard_id();
    flow->conn = nullptr;
  });
  // Wait for error handler to quit.
  exec_st.JoinErrorHandler();
  VLOG(1) << "Disconnecting replica " << address << ":" << listening_port;
}

DflyCmd::DflyCmd(ServerFamily* server_family) : sf_(server_family) {
}

void DflyCmd::Run(CmdArgList args, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 1u);
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (sub_cmd == "THREAD") {
    return Thread(args, cmd_cntx);
  }

  if (sub_cmd == "FLOW" && (args.size() >= 4 && args.size() <= 6)) {
    return Flow(args, cmd_cntx);
  }

  if (sub_cmd == "SYNC" && args.size() == 2) {
    return Sync(args, cmd_cntx);
  }

  if (sub_cmd == "STARTSTABLE" && args.size() == 2) {
    return StartStable(args, cmd_cntx);
  }

  if (sub_cmd == "TAKEOVER" && (args.size() == 3 || args.size() == 4)) {
    return TakeOver(args, cmd_cntx);
  }

  if (sub_cmd == "EXPIRE") {
    return Expire(args, cmd_cntx);
  }

  if (sub_cmd == "REPLICAOFFSET" && args.size() == 1) {
    return ReplicaOffset(args, cmd_cntx);
  }

  if (sub_cmd == "LOAD") {
    return Load(args, cmd_cntx);
  }

  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  if (sub_cmd == "HELP") {
    string_view help_arr[] = {
        "DFLY <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        "THREAD",
        "    Returns connection thread index and number of threads",
        "THREAD <thread-id>",
        "    Migrates connection to thread <thread-id>",
        "EXPIRE",
        "    Collects all expired items.",
        "REPLICAOFFSET",
        "    Returns LSN (log sequence number) per shard. These are the sequential ids of the ",
        "    journal entry.",
        "LOAD <filename> [APPEND]",
        "    Loads <filename> RDB/DFS file into the data store.",
        "    * APPEND: Existing keys are NOT removed before loading the file, conflicting ",
        "      keys (that exist in both data store and in file) are overridden.",
        "HELP",
        "    Prints this help.",
    };
    return rb->SendSimpleStrArr(help_arr);
  }

  cmd_cntx->SendError(kSyntaxErr);
}

void DflyCmd::Thread(CmdArgList args, CommandContext* cmd_cntx) {
  util::ProactorPool* pool = shard_set->pool();

  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  if (args.size() == 1) {  // DFLY THREAD : returns connection thread index and number of threads.
    rb->StartArray(2);
    rb->SendLong(ProactorBase::me()->GetPoolIndex());
    rb->SendLong(long(pool->size()));
    return;
  }

  // DFLY THREAD to_thread : migrates current connection to a different thread.
  string_view arg = ArgS(args, 1);
  unsigned num_thread;
  if (!absl::SimpleAtoi(arg, &num_thread)) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  if (num_thread < pool->size()) {
    if (int(num_thread) != ProactorBase::me()->GetPoolIndex()) {
      auto* conn = cmd_cntx->conn();
      if (!conn->Migrate(pool->at(num_thread))) {
        // Listener::PreShutdown() triggered
        if (conn->socket()->IsOpen()) {
          return cmd_cntx->SendError(kInvalidState);
        }
        return;
      }
    }

    return rb->SendOk();
  }

  return cmd_cntx->SendError(kInvalidIntErr);
}

void DflyCmd::Flow(CmdArgList args, CommandContext* cmd_cntx) {
  string_view master_id = ArgS(args, 1);
  string_view sync_id_str = ArgS(args, 2);
  string_view flow_id_str = ArgS(args, 3);

  std::optional<LSN> seqid;
  std::optional<string> last_master_id;
  std::optional<string> last_master_lsn;
  if (args.size() == 5) {
    seqid.emplace();
    if (!absl::SimpleAtoi(ArgS(args, 4), &seqid.value())) {
      return cmd_cntx->SendError(facade::kInvalidIntErr);
    }
  } else if (args.size() == 6) {
    last_master_id = ArgS(args, 4);
    last_master_lsn = ArgS(args, 5);
  }

  VLOG(1) << "Got DFLY FLOW master_id: " << master_id << " sync_id: " << sync_id_str
          << " flow: " << flow_id_str << " seq: " << seqid.value_or(-1);

  if (master_id != sf_->master_replid()) {
    return cmd_cntx->SendError(kBadMasterId);
  }

  unsigned flow_id;
  if (!absl::SimpleAtoi(flow_id_str, &flow_id) || flow_id >= shard_set->size()) {
    return cmd_cntx->SendError(facade::kInvalidIntErr);
  }

  auto [sync_id, replica_ptr] = GetReplicaInfoOrReply(sync_id_str, cmd_cntx);
  if (!sync_id)
    return;

  string eof_token;
  std::string sync_type{"FULL"};
  {
    util::fb2::LockGuard lk{replica_ptr->shared_mu};

    if (replica_ptr->replica_state != SyncState::PREPARATION) {
      return cmd_cntx->SendError(kInvalidState);
    }

    // Set meta info on connection.
    auto* conn_cntx = cmd_cntx->server_conn_cntx();
    cmd_cntx->conn()->SetName(absl::StrCat("repl_flow_", sync_id));
    conn_cntx->conn_state.replication_info.repl_session_id = sync_id;
    conn_cntx->conn_state.replication_info.repl_flow_id = flow_id;
    conn_cntx->replica_conn = true;

    absl::InsecureBitGen gen;
    eof_token = GetRandomHex(gen, 40);

    auto& flow = replica_ptr->flows[flow_id];
    conn_cntx->master_repl_flow = &flow;
    flow.conn = cmd_cntx->conn();
    flow.eof_token = eof_token;
    flow.version = replica_ptr->version;

    if (!conn_cntx->conn()->Migrate(shard_set->pool()->at(flow_id))) {
      // Listener::PreShutdown() triggered
      if (conn_cntx->conn()->socket()->IsOpen()) {
        return cmd_cntx->SendError(kInvalidState);
      }
      return;
    }

    journal::StartInThread();

    std::optional<Replica::LastMasterSyncData> data = sf_->GetLastMasterData();
    std::optional<LSN> lsn_to_start_partial;
    // In this flow the master and the registered replica where synced from the same master.
    if (last_master_id && data && data->id == *last_master_id) {
      ++ServerState::tlocal()->stats.psync_requests_total;
      auto flow_lsn =
          ParseLsnVec(*last_master_lsn, data->last_journal_LSNs.size(), flow_id, cmd_cntx);
      if (!flow_lsn) {
        return;  // ParseLsnVec replies in case of error
      }

      if (IsLSNInPartialSyncBuffer(*flow_lsn)) {
        lsn_to_start_partial.emplace(*flow_lsn);
      }

    } else if (seqid.has_value() && IsLSNInPartialSyncBuffer(*seqid)) {
      lsn_to_start_partial.emplace(*seqid);
    }

    if (lsn_to_start_partial) {
      flow.start_partial_sync_at = *lsn_to_start_partial;
      sync_type = "PARTIAL";
      VLOG(1) << "Partial sync requested from LSN=" << flow.start_partial_sync_at.value()
              << " and is available. (current_lsn=" << journal::GetLsn() << ")";
    }
  }

  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  rb->StartArray(2);
  rb->SendSimpleString(sync_type);
  rb->SendSimpleString(eof_token);
}

void DflyCmd::Sync(CmdArgList args, CommandContext* cmd_cntx) {
  string_view sync_id_str = ArgS(args, 1);

  VLOG(1) << "Got DFLY SYNC " << sync_id_str;

  auto [sync_id, replica_ptr] = GetReplicaInfoOrReply(sync_id_str, cmd_cntx);
  if (!sync_id)
    return;

  util::fb2::LockGuard lk{replica_ptr->shared_mu};
  if (!CheckReplicaStateOrReply(*replica_ptr, SyncState::PREPARATION, cmd_cntx))
    return;

  // Start full sync.
  {
    Transaction::Guard tg{cmd_cntx->tx()};
    AggregateStatus status;

    // Use explicit assignment for replica_ptr, because capturing structured bindings is C++20.
    auto cb = [this, &status, replica_ptr = replica_ptr](EngineShard* shard) {
      status = StartFullSyncInThread(&replica_ptr->flows[shard->shard_id()], &replica_ptr->exec_st,
                                     shard);
    };
    shard_set->RunBlockingInParallel(std::move(cb));

    // TODO: Send better error
    if (*status != OpStatus::OK)
      return cmd_cntx->SendError(kInvalidState);
  }

  LOG(INFO) << "Started sync with replica " << replica_ptr->address << ":"
            << replica_ptr->listening_port;

  // protected by lk above.
  replica_ptr->replica_state = SyncState::FULL_SYNC;

  return cmd_cntx->SendOk();
}

void DflyCmd::StartStable(CmdArgList args, CommandContext* cmd_cntx) {
  string_view sync_id_str = ArgS(args, 1);

  VLOG(1) << "Got DFLY STARTSTABLE " << sync_id_str;

  auto [sync_id, replica_ptr] = GetReplicaInfoOrReply(sync_id_str, cmd_cntx);
  if (!sync_id)
    return;

  util::fb2::LockGuard lk{replica_ptr->shared_mu};
  auto repl_state = replica_ptr->replica_state;
  if (repl_state != SyncState::FULL_SYNC && repl_state != SyncState::PREPARATION) {
    cmd_cntx->SendError(kInvalidState);
    return;
  }

  // Check all flows are connected.
  // This might happen if a flow abruptly disconnected before sending the SYNC request.
  for (const FlowInfo& flow : replica_ptr->flows) {
    if (!flow.conn) {
      cmd_cntx->SendError(kInvalidState);
      return;
    }
  }

  {
    Transaction::Guard tg{cmd_cntx->tx()};
    AggregateStatus status;

    auto cb = [this, &status, replica_ptr = replica_ptr](EngineShard* shard) {
      FlowInfo* flow = &replica_ptr->flows[shard->shard_id()];

      // We are doing partial sync. We never started FullSync so we don't need to stop it.
      bool is_partial = flow->start_partial_sync_at.has_value();
      if (!is_partial) {
        status = StopFullSyncInThread(flow, &replica_ptr->exec_st, shard);
        if (*status != OpStatus::OK) {
          return;
        }
      }

      StartStableSyncInThread(flow, &replica_ptr->exec_st, shard);
    };
    shard_set->RunBlockingInParallel(std::move(cb));

    if (*status != OpStatus::OK)
      return cmd_cntx->SendError(kInvalidState);
  }

  LOG(INFO) << "Transitioned into stable sync with replica " << replica_ptr->address << ":"
            << replica_ptr->listening_port;

  replica_ptr->replica_state = SyncState::STABLE_SYNC;
  return cmd_cntx->SendOk();
}

bool DflyCmd::IsLSNInPartialSyncBuffer(LSN lsn) const {
  const bool exists = journal::GetLsn() == lsn || journal::IsLSNInBuffer(lsn);
  if (!exists) {
    LOG(INFO) << "Partial sync requested from stale LSN=" << lsn
              << " that the replication buffer doesn't contain this anymore (current_lsn="
              << journal::GetLsn() << "). Will perform a full sync of the data.";
    LOG(INFO) << "If this happens often you can control the replication buffer's size with the "
                 "--shard_repl_backlog_len option";
  }
  return exists;
}

std::optional<LSN> DflyCmd::ParseLsnVec(std::string_view last_master_lsn,
                                        size_t last_journal_lsn_size, size_t flow_id,
                                        CommandContext* cmd_cntx) {
  std::vector<std::string_view> lsn_str_vec = absl::StrSplit(last_master_lsn, '-');
  if (lsn_str_vec.size() != last_journal_lsn_size) {
    cmd_cntx->SendError(facade::kSyntaxErr);  // Unexpected flow. LSN vector of same master
                                              // should be the same size on all replicas.
    return std::nullopt;
  }

  std::vector<LSN> lsn_vec;
  lsn_vec.reserve(lsn_str_vec.size());

  for (string_view lsn_str : lsn_str_vec) {
    int64_t value;
    if (!absl::SimpleAtoi(lsn_str, &value)) {
      cmd_cntx->SendError(facade::kInvalidIntErr);
      return std::nullopt;
    }
    lsn_vec.push_back(value);
  }

  DCHECK(flow_id < lsn_vec.size());
  if (flow_id >= lsn_vec.size()) {
    LOG(ERROR) << "Invalid flow_id: " << flow_id << " exceeds LSN vector size: " << lsn_vec.size()
               << ". Disabling partial sync.";
    return std::nullopt;
  }

  return {lsn_vec[flow_id]};
}

// DFLY TAKEOVER <timeout_sec> [SAVE] <sync_id>
// timeout_sec - number of seconds to wait for TAKEOVER to converge.
// SAVE option is used only by tests.
void DflyCmd::TakeOver(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  parser.Next();
  float timeout = std::ceil(parser.Next<float>());
  if (timeout < 0) {
    // allow 0s timeout for tests.
    return cmd_cntx->SendError("timeout is negative");
  }

  bool save_flag = static_cast<bool>(parser.Check("SAVE"));

  string_view sync_id_str = parser.Next<std::string_view>();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  VLOG(1) << "Got DFLY TAKEOVER " << sync_id_str << " time out:" << timeout;

  auto [sync_id, replica_ptr] = GetReplicaInfoOrReply(sync_id_str, cmd_cntx);
  if (!sync_id)
    return;

  {
    dfly::SharedLock lk{replica_ptr->shared_mu};
    if (!CheckReplicaStateOrReply(*replica_ptr, SyncState::STABLE_SYNC, cmd_cntx))
      return;

    auto prev_state = sf_->service().SwitchState(GlobalState::ACTIVE, GlobalState::TAKEN_OVER);
    if (prev_state != GlobalState::ACTIVE) {
      LOG(WARNING) << prev_state << " in progress, could not take over";
      return cmd_cntx->SendError("Takeover failed!");
    }
  }

  auto cluster_config_before = cluster::ClusterConfig::Current();

  LOG(INFO) << "Takeover initiated, locking down the database.";
  absl::Duration timeout_dur = absl::Seconds(timeout);
  absl::Time end_time = absl::Now() + timeout_dur;
  AggregateStatus status;

  // We need to await for all dispatches to finish: Otherwise a transaction might be scheduled
  // after this function exits but before the actual shutdown.
  facade::DispatchTracker tracker{sf_->GetNonPriviligedListeners(), cmd_cntx->conn(), false, false};
  shard_set->pool()->AwaitFiberOnAll([&](unsigned index, auto* pb) {
    sf_->CancelBlockingOnThread();
    tracker.TrackOnThread();
  });

  if (!tracker.Wait(timeout_dur)) {
    LOG(WARNING) << "Couldn't wait for commands to finish dispatching. " << timeout_dur;
    status = OpStatus::TIMED_OUT;

    auto cb = [&](unsigned thread_index, util::Connection* conn) {
      facade::Connection* dcon = static_cast<facade::Connection*>(conn);
      LOG(INFO) << dcon->DebugInfo();
    };

    for (auto* listener : sf_->GetListeners()) {
      listener->TraverseConnections(cb);
    }
  }

  VLOG(1) << "AwaitCurrentDispatches done";

  absl::Cleanup cleanup([] {
    VLOG(2) << "Enabling expiration";
    shard_set->RunBriefInParallel([](EngineShard* shard) {
      namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(true);
    });
  });

  atomic_bool catchup_success = true;
  if (*status == OpStatus::OK) {
    dfly::SharedLock lk{replica_ptr->shared_mu};
    auto cb = [replica_ptr = replica_ptr, end_time, &catchup_success](EngineShard* shard) {
      // PING to force the replica to send the last acked lsn.
      if (!WaitReplicaFlowToCatchup(end_time, replica_ptr.get(), shard, true)) {
        catchup_success.store(false);
      }
    };
    shard_set->RunBlockingInParallel(std::move(cb));
  }

  VLOG(1) << "WaitReplicaFlowToCatchup done";

  if (*status != OpStatus::OK || !catchup_success.load()) {
    sf_->service().SwitchState(GlobalState::TAKEN_OVER, GlobalState::ACTIVE);
    return cmd_cntx->SendError("Takeover failed!");
  }

  cmd_cntx->SendOk();

  atomic_bool rest_catchup_success = true;
  {
    util::fb2::LockGuard mu_lk(mu_);
    for (auto [id, repl_ptr] : replica_infos_) {
      if (replica_ptr == repl_ptr) {
        continue;
      }

      auto cb = [repl_ptr = repl_ptr, end_time, &rest_catchup_success](EngineShard* shard) {
        // We can't PING here as it will advance our LSN and disable partial sync for these nodes.
        // Instead, wait and be optimistic that the end_time is not short. If the nodes didn't sync
        // up in time, it's ok, they will fall back to full sync when reconfigured.
        if (!WaitReplicaFlowToCatchup(end_time, repl_ptr.get(), shard, false)) {
          rest_catchup_success.store(false);
        }
      };
      shard_set->RunBlockingInParallel(std::move(cb));
    }

    if (!rest_catchup_success) {
      LOG(WARNING) << "Some of the replica nodes did not sync in time.";
    }
  }

  if (save_flag) {
    VLOG(1) << "Save snapshot after Takeover.";
    if (auto ec = sf_->DoSave(true); ec) {
      LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
    }
  }

  // For non-cluster mode we shutdown
  if (detail::cluster_mode != detail::ClusterMode::kRealCluster) {
    VLOG(1) << "Takeover accepted, shutting down.";
    std::string save_arg = "NOSAVE";
    MutableSlice sargs(save_arg);
    CommandContext child_cmd_cntx{cmd_cntx->rb(), nullptr};
    sf_->ShutdownCmd(CmdArgList(&sargs, 1), &child_cmd_cntx);
    return;
  }

  auto cluster_config_after = cluster::ClusterConfig::Current();
  if (cluster_config_after.get() != cluster_config_before.get()) {
    LOG(INFO) << "ReconcileMasterSlots() early exit. Config already updated";
    return;
  }
  sf_->service().cluster_family().ReconcileMasterSlots(replica_ptr->id);
}

void DflyCmd::Expire(CmdArgList args, CommandContext* cmd_cntx) {
  cmd_cntx->tx()->ScheduleSingleHop([](Transaction* t, EngineShard* shard) {
    t->GetDbSlice(shard->shard_id()).ExpireAllIfNeeded();
    return OpStatus::OK;
  });

  return cmd_cntx->SendOk();
}

void DflyCmd::ReplicaOffset(CmdArgList args, CommandContext* cmd_cntx) {
  std::vector<LSN> lsns(shard_set->size());
  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    lsns[shard->shard_id()] = shard->journal() ? journal::GetLsn() : 0;
  });

  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendLongArr(absl::MakeConstSpan(lsns));
}

void DflyCmd::Load(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  parser.ExpectTag("LOAD");
  string filename = parser.Next<string>();
  ServerFamily::LoadExistingKeys existing_keys = ServerFamily::LoadExistingKeys::kFail;

  if (parser.HasNext()) {
    parser.ExpectTag("APPEND");
    existing_keys = ServerFamily::LoadExistingKeys::kOverride;
  }

  if (parser.TakeError() || parser.HasNext() || filename.empty()) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  if (existing_keys == ServerFamily::LoadExistingKeys::kFail) {
    sf_->FlushAll(cmd_cntx->server_conn_cntx()->ns);
  }

  if (auto fut_ec = sf_->Load(filename, existing_keys); fut_ec) {
    GenericError ec = fut_ec->Get();
    if (ec) {
      string msg = ec.Format();
      LOG(WARNING) << "Could not load file " << msg;
      return cmd_cntx->SendError(msg);
    }
  }

  cmd_cntx->SendOk();
}

OpStatus DflyCmd::StartFullSyncInThread(FlowInfo* flow, ExecutionState* exec_st,
                                        EngineShard* shard) {
  DCHECK(shard);
  DCHECK(flow->conn);

  // The summary contains the LUA scripts, so make sure at least (and exactly one)
  // of the flows also contain them.
  SaveMode save_mode =
      shard->shard_id() == 0 ? SaveMode::SINGLE_SHARD_WITH_SUMMARY : SaveMode::SINGLE_SHARD;
  flow->saver =
      std::make_unique<RdbSaver>(flow->conn->socket(), save_mode, false, "", flow->version);

  flow->cleanup = [flow, shard]() {
    // socket shutdown is needed before calling saver->Cancel(). Because
    // we might cancel while the write to socket is blocking and
    // therefore if we wont cancel the socket the full sync fiber might
    // not get to pop entries from channel, which can cause dead lock if channel is full and some
    // callbacks are blocked on trying to insert to channel.
    flow->TryShutdownSocket();
    flow->saver->CancelInShard(shard);  // stops writing to journal stream to channel
    flow->saver.reset();
  };

  error_code ec;
  RdbSaver* saver = flow->saver.get();
  if (saver->Mode() == SaveMode::SUMMARY || saver->Mode() == SaveMode::SINGLE_SHARD_WITH_SUMMARY) {
    // Full sync summary - include all global data
    ec = saver->SaveHeader(saver->GetGlobalData(&sf_->service(), true));
  } else {
    // Per-shard - include only search index restore commands
    ec = saver->SaveHeader(saver->GetGlobalData(&sf_->service(), false));
  }
  if (ec) {
    exec_st->ReportError(ec);
    return OpStatus::CANCELLED;
  }

  saver->StartSnapshotInShard(true, exec_st, shard);

  return OpStatus::OK;
}

OpStatus DflyCmd::StopFullSyncInThread(FlowInfo* flow, ExecutionState* exec_st,
                                       EngineShard* shard) {
  DCHECK(shard);

  error_code ec = flow->saver->StopFullSyncInShard(shard);
  if (ec) {
    exec_st->ReportError(ec);
    return OpStatus::CANCELLED;
  }

  ec = flow->conn->socket()->Write(io::Buffer(flow->eof_token));
  if (ec) {
    exec_st->ReportError(ec);
    return OpStatus::CANCELLED;
  }

  // Reset cleanup and saver
  flow->cleanup = []() {};
  flow->saver.reset();
  return OpStatus::OK;
}

void DflyCmd::StartStableSyncInThread(FlowInfo* flow, ExecutionState* exec_st, EngineShard* shard) {
  // Create streamer for shard flows.
  DCHECK(shard);
  DCHECK(flow->conn);

  LSN partial_lsn = flow->start_partial_sync_at.value_or(0);
  JournalStreamer::Config config{
      .should_sent_lsn = true, .init_from_stable_sync = true, .start_partial_sync_at = partial_lsn};
  flow->streamer.reset(new JournalStreamer(exec_st, config));
  flow->streamer->Start(flow->conn->socket());

  // Register cleanup.
  flow->cleanup = [flow]() {
    flow->TryShutdownSocket();
    if (flow->streamer) {
      flow->streamer->Cancel();
    }
  };
}

auto DflyCmd::CreateSyncSession(ConnectionState* state) -> std::pair<uint32_t, unsigned> {
  util::fb2::LockGuard lk(mu_);
  unsigned sync_id = next_sync_id_++;

  unsigned flow_count = shard_set->size();
  auto err_handler = [this, sync_id](const GenericError& err) {
    LOG(INFO) << "Replication error: " << err.Format();

    // Spawn external fiber to allow destructing the context from outside
    // and return from the handler immediately.
    fb2::Fiber("stop_replication", &DflyCmd::StopReplication, this, sync_id).Detach();
  };

  string address = state->replication_info.repl_ip_address;
  uint32_t port = state->replication_info.repl_listening_port;

  LOG(INFO) << "Registered replica " << address << ":" << port;

  auto replica_ptr =
      make_shared<ReplicaInfo>(flow_count, std::move(address), port, std::move(err_handler));
  auto [it, inserted] = replica_infos_.emplace(sync_id, std::move(replica_ptr));
  CHECK(inserted);

  return {it->first, flow_count};
}

auto DflyCmd::GetReplicaInfoFromConnection(ConnectionState* state) -> std::shared_ptr<ReplicaInfo> {
  util::fb2::LockGuard lk(mu_);
  auto it = replica_infos_.find(state->replication_info.repl_session_id);
  if (it == replica_infos_.end()) {
    return nullptr;
  }

  return it->second;
}

void DflyCmd::OnClose(unsigned sync_id) {
  if (!sync_id)
    return;
  StopReplication(sync_id);
}

void DflyCmd::StopReplication(uint32_t sync_id) {
  auto replica_ptr = GetReplicaInfo(sync_id);
  if (!replica_ptr)
    return;
  VLOG(1) << "Stopping replication for sync_id: " << sync_id;

  // Because CancelReplication holds the per-replica mutex,
  // aborting connection will block here until cancellation finishes.
  // This allows keeping resources alive during the cleanup phase.
  replica_ptr->Cancel();

  util::fb2::LockGuard lk(mu_);
  replica_infos_.erase(sync_id);
}

// Because we need to annotate unique_lock
void DflyCmd::BreakStalledFlowsInShard() {
  std::unique_lock global_lock(mu_, try_to_lock);

  // give up on blocking because we run this function periodically in a background fiber,
  // so it will eventually grab the lock.
  if (!global_lock.owns_lock())
    return;

  ShardId sid = EngineShard::tlocal()->shard_id();

  vector<uint32_t> deleted;

  for (auto [sync_id, replica_ptr] : replica_infos_) {
    dfly::SharedLock replica_lock{replica_ptr->shared_mu};

    if (!replica_ptr->flows[sid].saver)
      continue;

    // If saver is present - we are currently using it for full sync.
    int64_t last_write_ns = replica_ptr->flows[sid].saver->GetLastWriteTime();
    int64_t timeout_ns = int64_t(absl::GetFlag(FLAGS_replication_timeout)) * 1'000'000LL;
    int64_t now = absl::GetCurrentTimeNanos();
    if (last_write_ns > 0 && last_write_ns + timeout_ns < now) {
      LOG(INFO) << "Master detected replication timeout, breaking full sync with replica, sync_id: "
                << sync_id << " last_write_ms: " << last_write_ns / 1000'000
                << ", now: " << now / 1000'000;

      deleted.push_back(sync_id);
      replica_lock.unlock();
      replica_ptr->Cancel();
    }
  }

  for (auto sync_id : deleted)
    replica_infos_.erase(sync_id);
}

shared_ptr<DflyCmd::ReplicaInfo> DflyCmd::GetReplicaInfo(uint32_t sync_id) {
  util::fb2::LockGuard lk(mu_);

  auto it = replica_infos_.find(sync_id);
  if (it != replica_infos_.end())
    return it->second;
  return {};
}

std::vector<ReplicaRoleInfo> DflyCmd::GetReplicasRoleInfo() const {
  std::vector<ReplicaRoleInfo> vec;
  util::fb2::LockGuard lk(mu_);

  vec.reserve(replica_infos_.size());
  map replication_lags = ReplicationLagsLocked();

  for (const auto& [id, info] : replica_infos_) {
    LSN lag = replication_lags[id];
    SyncState state = SyncState::PREPARATION;

    // If the replica state being updated, its lag is undefined,
    // the same applies of course if its state is not STABLE_SYNC.
    shared_lock lk(info->shared_mu, try_to_lock);
    if (lk.owns_lock()) {
      state = info->replica_state;
      // If the replica is not in stable sync, its lag is undefined, so we set it to 0.
      if (state != SyncState::STABLE_SYNC) {
        lag = 0;
      }
    } else {
      lag = 0;
    }
    vec.push_back(
        ReplicaRoleInfo{info->id, info->address, info->listening_port, SyncStateName(state), lag});
  }
  return vec;
}

void DflyCmd::GetReplicationMemoryStats(ReplicationMemoryStats* stats) const {
  atomic<size_t> streamer_bytes{0}, full_sync_bytes{0};

  {
    util::fb2::LockGuard lk{mu_};  // prevent state changes
    auto cb = [&](EngineShard* shard) ABSL_NO_THREAD_SAFETY_ANALYSIS {
      for (const auto& [_, info] : replica_infos_) {
        dfly::SharedLock repl_lk{info->shared_mu};

        // flows should not be empty.
        DCHECK(!info->flows.empty());
        if (info->flows.empty())
          continue;

        const auto& flow = info->flows[shard->shard_id()];
        if (flow.streamer)
          streamer_bytes.fetch_add(flow.streamer->UsedBytes(), memory_order_relaxed);
        if (flow.saver)
          full_sync_bytes.fetch_add(flow.saver->GetTotalBuffersSize(), memory_order_relaxed);
      }
    };
    shard_set->RunBlockingInParallel(cb);
  }
  stats->streamer_buf_capacity_bytes += streamer_bytes.load(memory_order_relaxed);
  stats->full_sync_buf_bytes += full_sync_bytes.load(memory_order_relaxed);
}

pair<uint32_t, shared_ptr<DflyCmd::ReplicaInfo>> DflyCmd::GetReplicaInfoOrReply(
    std::string_view id_str, CommandContext* cmd_cntx) {
  uint32_t sync_id;
  if (!ToSyncId(id_str, &sync_id)) {
    cmd_cntx->SendError(kInvalidSyncId);
    return {0, nullptr};
  }

  util::fb2::LockGuard lk(mu_);
  auto sync_it = replica_infos_.find(sync_id);
  if (sync_it == replica_infos_.end()) {
    cmd_cntx->SendError(kIdNotFound);
    return {0, nullptr};
  }

  return {sync_id, sync_it->second};
}

std::map<uint32_t, LSN> DflyCmd::ReplicationLagsLocked() const {
  DCHECK(!mu_.try_lock());  // expects to be under global lock
  if (replica_infos_.empty())
    return {};

  // In each shard we calculate a map of replica id to replication lag in the shard.
  std::vector<std::map<uint32_t, LSN>> shard_lags(shard_set->size());
  shard_set->RunBriefInParallel([&shard_lags, this](EngineShard* shard) {
    auto& lags = shard_lags[shard->shard_id()];
    for (const auto& info : ABSL_TS_UNCHECKED_READ(replica_infos_)) {
      const ReplicaInfo* replica = info.second.get();
      if (shard->journal()) {
        int64_t lag = journal::GetLsn() - replica->flows[shard->shard_id()].last_acked_lsn;
        lags[info.first] = lag;
      }
    }
  });

  // Merge the maps from all shards and derive the maximum lag for each replica.
  std::map<uint32_t, LSN> rv;
  for (const auto& lags : shard_lags) {
    for (auto [replica_id, lag] : lags) {
      rv[replica_id] = std::max(rv[replica_id], lag);
    }
  }
  return rv;
}

void DflyCmd::SetDflyClientVersion(ConnectionState* state, DflyVersion version) {
  auto replica_ptr = GetReplicaInfo(state->replication_info.repl_session_id);
  VLOG(1) << "Client version for session_id=" << state->replication_info.repl_session_id << " is "
          << int(version);

  replica_ptr->version = version;
}

// Must run under locked replica_info.mu.
// TODO: it's a bad design that we enforce replies under a lock because Send can potentially
// block, leading to high contention in some case. Split it and avoid replying under a lock.
bool DflyCmd::CheckReplicaStateOrReply(const ReplicaInfo& repl_info, SyncState expected,
                                       CommandContext* cmd_cntx) {
  if (repl_info.replica_state != expected) {
    cmd_cntx->SendError(kInvalidState);
    return false;
  }

  // Check all flows are connected.
  // This might happen if a flow abruptly disconnected before sending the SYNC request.
  for (const FlowInfo& flow : repl_info.flows) {
    if (!flow.conn) {
      cmd_cntx->SendError(kInvalidState);
      return false;
    }
  }

  return true;
}

void DflyCmd::Shutdown() {
  ReplicaInfoMap pending;
  {
    util::fb2::LockGuard lk(mu_);
    pending = std::move(replica_infos_);
  }

  for (auto& [_, replica_ptr] : pending) {
    replica_ptr->Cancel();
  }
}

void FlowInfo::TryShutdownSocket() {
  // Close socket for clean disconnect.
  if (conn->socket()->IsOpen()) {
    std::ignore = conn->socket()->Shutdown(SHUT_RDWR);
  }
}

FlowInfo::~FlowInfo() {
}

FlowInfo::FlowInfo() {
}

}  // namespace dfly


================================================
FILE: src/server/dflycmd.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/btree_map.h>

#include <atomic>
#include <memory>

#include "server/conn_context.h"
#include "server/execution_state.h"
#include "util/fibers/synchronization.h"

namespace facade {
class RedisReplyBuilder;
}  // namespace facade

namespace util {
class ListenerInterface;
}  // namespace util

namespace dfly {

class EngineShardSet;
class ServerFamily;
class RdbSaver;
class JournalStreamer;
struct ReplicaRoleInfo;
struct ReplicationMemoryStats;

// Stores information related to a single flow.
struct FlowInfo {
  FlowInfo();
  ~FlowInfo();

  // Shutdown associated socket if its still open.
  void TryShutdownSocket();

  facade::Connection* conn = nullptr;

  std::unique_ptr<RdbSaver> saver;            // Saver for full sync phase.
  std::unique_ptr<JournalStreamer> streamer;  // Streamer for stable sync phase
  std::string eof_token;

  DflyVersion version = DflyVersion::VER1;

  std::optional<LSN> start_partial_sync_at;
  uint64_t last_acked_lsn = 0;

  std::function<void()> cleanup;  // Optional cleanup for cancellation.
};

// DflyCmd is responsible for managing replication. A master instance can be connected
// to many replica instances, what is more, each of them can open multiple connections.
// This is why its important to understand replica lifecycle management before making
// any crucial changes.
//
// A ReplicaInfo instance is responsible for managing a replica's state and is accessible by its
// sync_id. Each per-thread connection is called a Flow and is represented by the FlowInfo
// instance, accessible by its index.
//
// An important aspect is synchronization and efficient locking. Two levels of locking are used:
//  1. Global locking.
//    Member  mutex `mu_` is used for synchronizing operations connected with internal data
//    structures.
//  2. Per-replica locking
//    ReplicaInfo contains a separate mutex that is used for replica-only routines. It is held
//    during state transitions (start full sync, start stable state sync), cancellation and member
//    access.
//
// Upon first connection from the replica, a new ReplicaInfo is created.
// It transitions through the following phases:
//  1. Preparation
//    During this start phase the "flows" are set up - one connection for every master thread. Those
//    connections registered by the FLOW command sent from each newly opened connection.
//  2. Full sync
//    This phase is initiated by the SYNC command. It makes sure all flows are connected and the
//    replica is in a valid state.
//  3. Stable state sync
//    After the replica has received confirmation, that each flow is ready to transition, it sends a
//    STARTSTABLE command. This transitions the replica into streaming journal changes.
//  4. Cancellation
//    This can happed due to an error at any phase or through a normal abort. For properly releasing
//    resources we need to run a multi-step cancellation procedure:
//    1. Transition state
//      We obtain the ReplicaInfo lock, transition into the cancelled state and cancel the context.
//    2. Joining tasks
//      Running tasks will stop on receiving the cancellation flag. Each FlowInfo has also an
//      optional cleanup handler, that is invoked after cancelling. This should allow recovering
//      from any state. The flows task will be awaited and joined if present.
//    3. Unlocking the mutex
//      Now that all tasks have finished and all cleanup handlers have run, we can safely release
//      the per-replica mutex, so that all OnClose handlers will unblock and  internal resources
//      will be released by dragonfly. Then the ReplicaInfo is removed from the global map.
//
//
class DflyCmd {
 public:
  // See class comments for state descriptions.
  enum class SyncState { PREPARATION, FULL_SYNC, STABLE_SYNC, CANCELLED };

  // Stores information related to a single replica.
  struct ABSL_LOCKABLE ReplicaInfo {
    ReplicaInfo(unsigned flow_count, std::string address, uint32_t listening_port,
                ExecutionState::ErrHandler err_handler)
        : replica_state{SyncState::PREPARATION},
          exec_st{std::move(err_handler)},
          address{std::move(address)},
          listening_port(listening_port),
          flows{flow_count} {
    }

    // Transition into cancelled state, run cleanup.
    void Cancel();

    SyncState replica_state;  // always guarded by shared_mu
    ExecutionState exec_st;

    std::string id;
    std::string address;
    uint32_t listening_port;
    DflyVersion version = DflyVersion::VER1;

    // Flows describe the state of shard-local flow.
    // They are always indexed by the shard index on the master.
    std::vector<FlowInfo> flows;

    util::fb2::SharedMutex shared_mu;  // See top of header for locking levels.
  };

 public:
  DflyCmd(ServerFamily* server_family);

  void Run(CmdArgList args, CommandContext* cmd_cntx);

  void OnClose(unsigned sync_id);

  // Stop all background processes so we can exit in orderly manner.
  void Shutdown();

  // Create new sync session. Returns (session_id, number of flows)
  std::pair<uint32_t, unsigned> CreateSyncSession(ConnectionState* state) ABSL_LOCKS_EXCLUDED(mu_);

  // Master side access method to replication info of that connection.
  std::shared_ptr<ReplicaInfo> GetReplicaInfoFromConnection(ConnectionState* state);

  // Master-side command. Provides Replica info.
  std::vector<ReplicaRoleInfo> GetReplicasRoleInfo() const ABSL_LOCKS_EXCLUDED(mu_);

  void GetReplicationMemoryStats(ReplicationMemoryStats* out) const ABSL_NO_THREAD_SAFETY_ANALYSIS;

  // Sets metadata.
  void SetDflyClientVersion(ConnectionState* state, DflyVersion version);

  // Tries to break those flows that stuck on socket write for too long time.
  void BreakStalledFlowsInShard() ABSL_NO_THREAD_SAFETY_ANALYSIS;

 private:
  // JOURNAL [START/STOP]
  // Start or stop journaling.
  // void Journal(CmdArgList args, ConnectionContext* cntx);

  // THREAD [to_thread]
  // Return connection thread index or migrate to another thread.
  void Thread(CmdArgList args, CommandContext* cmd_cntx);

  // FLOW <masterid> <syncid> <flowid> [<seqid>]
  // Register connection as flow for sync session.
  // If seqid is given, it means the client wants to try partial sync.
  // If it is possible, return Ok and prepare for a partial sync, else
  // return error and ask the replica to execute FLOW again.
  void Flow(CmdArgList args, CommandContext* cmd_cntx);

  // SYNC <syncid>
  // Initiate full sync.
  void Sync(CmdArgList args, CommandContext* cmd_cntx);

  // STARTSTABLE <syncid>
  // Switch to stable state replication.
  void StartStable(CmdArgList args, CommandContext* cmd_cntx);
  // TAKEOVER <syncid>
  // Shut this master down atomically with replica promotion.
  void TakeOver(CmdArgList args, CommandContext* cmd_cntx);

  // EXPIRE
  // Check all keys for expiry.
  void Expire(CmdArgList args, CommandContext* cmd_cntx);

  // REPLICAOFFSET
  // Return journal records num sent for each flow of replication.
  void ReplicaOffset(CmdArgList args, CommandContext* cmd_cntx);

  void Load(CmdArgList args, CommandContext* cmd_cntx);

  // Start full sync in thread. Start FullSyncFb. Called for each flow.
  facade::OpStatus StartFullSyncInThread(FlowInfo* flow, ExecutionState* cntx, EngineShard* shard);

  // Stop full sync in thread. Run state switch cleanup.
  facade::OpStatus StopFullSyncInThread(FlowInfo* flow, ExecutionState* cntx, EngineShard* shard);

  // Start stable sync in thread. Called for each flow.
  void StartStableSyncInThread(FlowInfo* flow, ExecutionState* cntx, EngineShard* shard);

  // Get ReplicaInfo by sync_id.
  std::shared_ptr<ReplicaInfo> GetReplicaInfo(uint32_t sync_id) ABSL_LOCKS_EXCLUDED(mu_);

  // Find sync info by id or send error reply.
  std::pair<uint32_t, std::shared_ptr<ReplicaInfo>> GetReplicaInfoOrReply(std::string_view id,
                                                                          CommandContext* cmd_cntx)
      ABSL_LOCKS_EXCLUDED(mu_);

  // Check replica is in expected state and flows are set-up correctly.
  bool CheckReplicaStateOrReply(const ReplicaInfo& ri, SyncState expected,
                                CommandContext* cmd_cntx);

  // Main entrypoint for stopping replication.
  void StopReplication(uint32_t sync_id) ABSL_LOCKS_EXCLUDED(mu_);

  std::optional<LSN> ParseLsnVec(std::string_view lsn_vec, size_t last_journal_lsn_size,
                                 size_t flow_id, CommandContext* cmd_cntx);

  // Checks if LSN exists in the partial sync buffer. If not, also LOG that we can't
  // partial sync.
  bool IsLSNInPartialSyncBuffer(LSN lsn) const;

  // Return a map between replication ID to lag. lag is defined as the maximum of difference
  // between the master's LSN and the last acknowledged LSN in over all shards.
  std::map<uint32_t, LSN> ReplicationLagsLocked() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);

  ServerFamily* sf_;  // Not owned
  uint32_t next_sync_id_ = 1;

  using ReplicaInfoMap = absl::btree_map<uint32_t, std::shared_ptr<ReplicaInfo>>;
  ReplicaInfoMap replica_infos_ ABSL_GUARDED_BY(mu_);

  mutable util::fb2::Mutex mu_;  // Guard global operations. See header top for locking levels.
};

std::string_view SyncStateName(DflyCmd::SyncState sync_state);

}  // namespace dfly


================================================
FILE: src/server/dragonfly_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

extern "C" {
#include "redis/sds.h"
#include "redis/zmalloc.h"
}

#include <absl/strings/ascii.h>
#include <absl/strings/str_join.h>
#include <absl/strings/strip.h>
#include <gmock/gmock.h>
#include <reflex/matcher.h>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/main_service.h"
#include "server/test_utils.h"

ABSL_DECLARE_FLAG(float, mem_defrag_threshold);
ABSL_DECLARE_FLAG(float, mem_defrag_waste_threshold);
ABSL_DECLARE_FLAG(uint32_t, mem_defrag_check_sec_interval);
ABSL_DECLARE_FLAG(std::vector<std::string>, rename_command);
ABSL_DECLARE_FLAG(bool, lua_resp2_legacy_float);
ABSL_DECLARE_FLAG(double, eviction_memory_budget_threshold);
ABSL_DECLARE_FLAG(std::vector<std::string>, command_alias);
ABSL_DECLARE_FLAG(bool, latency_tracking);

namespace dfly {

using namespace std;
using namespace util;
using absl::SetFlag;
using absl::StrCat;
using fb2::Fiber;
using ::io::Result;
using testing::AnyOf;
using testing::Contains;
using testing::ElementsAre;
using testing::HasSubstr;
using testing::Key;
using testing::Pair;

namespace {

constexpr unsigned kPoolThreadCount = 4;

const char kKey1[] = "x";
const char kKey2[] = "b";

const char kKeySid0[] = "x";
const char kKeySid1[] = "c";
const char kKeySid2[] = "b";

}  // namespace

// This test is responsible for server and main service
// (connection, transaction etc) families.
class DflyEngineTest : public BaseFamilyTest {
 protected:
  DflyEngineTest() {
    num_threads_ = kPoolThreadCount;
  }
};

class DflyEngineTestWithRegistry : public BaseFamilyTest {
 protected:
  DflyEngineTestWithRegistry() {
    num_threads_ = kPoolThreadCount;
    ResetService();
  }
};

class SingleThreadDflyEngineTest : public BaseFamilyTest {
 protected:
  SingleThreadDflyEngineTest() {
    num_threads_ = 1;
  }
};

class DefragDflyEngineTest : public SingleThreadDflyEngineTest {};

// TODO: to implement equivalent parsing in redis parser.
TEST_F(DflyEngineTest, Sds) {
  int argc;
  sds* argv = sdssplitargs("\r\n", &argc);
  EXPECT_EQ(0, argc);
  sdsfreesplitres(argv, argc);

  argv = sdssplitargs("\026 \020 \200 \277 \r\n", &argc);
  EXPECT_EQ(4, argc);
  EXPECT_STREQ("\026", argv[0]);
  sdsfreesplitres(argv, argc);

  argv = sdssplitargs(R"(abc "oops\n" )"
                      "\r\n",
                      &argc);
  EXPECT_EQ(2, argc);
  EXPECT_STREQ("oops\n", argv[1]);
  sdsfreesplitres(argv, argc);

  argv = sdssplitargs(R"( "abc\xf0" )"
                      "\t'oops\n'  \r\n",
                      &argc);
  ASSERT_EQ(2, argc);
  EXPECT_STREQ("abc\xf0", argv[0]);
  EXPECT_STREQ("oops\n", argv[1]);
  sdsfreesplitres(argv, argc);
}

class DflyRenameCommandTest : public DflyEngineTest {
 protected:
  DflyRenameCommandTest() {
    // rename flushall to myflushall, flushdb command will not be able to execute
    absl::SetFlag(
        &FLAGS_rename_command,
        std::vector<std::string>({"flushall=myflushall", "flushdb=", "ping=abcdefghijklmnop"}));
  }

  absl::FlagSaver _saver;
};

TEST_F(DflyRenameCommandTest, RenameCommand) {
  Run({"set", "a", "1"});
  ASSERT_EQ(1, CheckedInt({"dbsize"}));
  // flushall should not execute anything and should return error, as it was renamed.
  ASSERT_THAT(Run({"flushall"}), ErrArg("unknown command `FLUSHALL`"));

  ASSERT_EQ(1, CheckedInt({"dbsize"}));

  ASSERT_EQ(Run({"myflushall"}), "OK");

  ASSERT_EQ(0, CheckedInt({"dbsize"}));

  ASSERT_THAT(Run({"flushdb", "0"}), ErrArg("unknown command `FLUSHDB`"));

  ASSERT_THAT(Run({""}), ErrArg("unknown command ``"));

  ASSERT_THAT(Run({"ping"}), ErrArg("unknown command `PING`"));
  ASSERT_THAT(Run({"abcdefghijklmnop"}), "PONG");
}

TEST_F(SingleThreadDflyEngineTest, GlobalSingleThread) {
  Run({"set", "a", "1"});
  Run({"move", "a", "1"});
}

TEST_F(DflyEngineTest, LuaErrors) {
  auto resp = Run({"eval", "return redis.error_reply('some error')", "0"});
  EXPECT_THAT(resp, ErrArg("some error"));

  resp = Run({"eval", "return redis.pcall('foo', 'bar')", "0"});
  EXPECT_THAT(resp, ErrArg("ERR unknown command"));

  resp = Run({"eval", "return redis.pcall('incrby', 'foo', 'bar')", "1"});
  EXPECT_THAT(resp, ErrArg("ERR Number of keys can't be greater than number of args"));
}

TEST_F(DflyEngineTest, EvalResp) {
  auto resp = Run({"eval", "return 43", "0"});
  EXPECT_THAT(resp, IntArg(43));

  resp = Run({"eval", "return {5, 'foo', 17.5}", "0"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(5), "foo", "17.5"));

  resp = Run({"eval", "return {map={a=1,b=2}}", "0"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), AnyOf(ElementsAre("a", IntArg(1), "b", IntArg(2)),
                                   ElementsAre("b", IntArg(2), "a", IntArg(1))));
}

TEST_F(DflyEngineTest, EvalPublish) {
  auto resp = pp_->at(1)->Await([&] { return Run({"subscribe", "foo"}); });
  EXPECT_THAT(resp, ArrLen(3));

  resp = Run({"eval", "return redis.call('publish', 'foo', 'bar')", "0"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(DflyEngineTest, EvalBug59) {
  auto resp = Run({"eval", R"(
local epoch
if redis.call('exists', KEYS[2]) ~= 0 then
  epoch = redis.call("hget", KEYS[2], "e")
end
if epoch == false or epoch == nil then
  epoch = ARGV[6]
  redis.call("hset", KEYS[2], "e", epoch)
end
local offset = redis.call("hincrby", KEYS[2], "s", 1)
if ARGV[5] ~= '0' then
	redis.call("expire", KEYS[2], ARGV[5])
end
redis.call("xadd", KEYS[1], "MAXLEN", ARGV[2], offset, "d", ARGV[1])
redis.call("expire", KEYS[1], ARGV[3])
if ARGV[4] ~= '' then
	local payload = "__" .. "p1:" .. offset .. ":" .. epoch .. "__" .. ARGV[1]
	redis.call("publish", ARGV[4], payload)
end

return {offset, epoch}
    )",
                   "2", "x", "y", "1", "2", "3", "4", "5", "6"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), "6"));
}

// Scenario: 1. a lua call A schedules itself on shards 0, 1, 2.
//           2. another lua call B schedules itself on shards 1,2 but on shard 1 (or 2) it
//              schedules itself before A.
//              the order of scheduling: shard 0: A, shard 1: B, A. shard 2: B, A.
//           3. A is executes its first command first, which coincendently runs only on shard 0,
//              hence A finishes before B and then it tries to cleanup.
//           4. There was an incorrect cleanup of multi-transactions that breaks for shard 1 (or 2)
//              because it assume the A is at front of the queue.
TEST_F(DflyEngineTest, EvalBug713) {
  const char* script = "return redis.call('get', KEYS[1])";

  // A
  auto fb0 = pp_->at(1)->LaunchFiber([&] {
    ThisFiber::Yield();
    for (unsigned i = 0; i < 50; ++i) {
      Run({"eval", script, "3", kKeySid0, kKeySid1, kKeySid2});
    }
  });

  // B
  for (unsigned j = 0; j < 50; ++j) {
    Run({"eval", script, "2", kKeySid1, kKeySid2});
  }
  fb0.Join();
}

// Tests deadlock that happenned due to a fact that trans->Schedule was called
// before interpreter->Lock().
//
// The problematic scenario:
// 1. transaction 1 schedules itself and blocks on an interpreter lock
// 2. transaction 2 schedules itself, but meanwhile an interpreter unlocks itself and
//    transaction 2 grabs the lock but can not progress due to transaction 1 already
//    scheduled before.
TEST_F(DflyEngineTest, EvalBug713b) {
  const char* script = "return redis.call('get', KEYS[1])";

  const uint32_t kNumFibers = 20;
  Fiber fibers[kNumFibers];

  for (unsigned j = 0; j < kNumFibers; ++j) {
    fibers[j] = pp_->at(1)->LaunchFiber([j, script, this] {
      for (unsigned i = 0; i < 50; ++i) {
        Run(StrCat("fb", j), {"eval", script, "3", kKeySid0, kKeySid1, kKeySid2});
      }
    });
  }

  for (unsigned j = 0; j < kNumFibers; ++j) {
    fibers[j].Join();
  }
}

TEST_F(DflyEngineTest, EvalSha) {
  auto resp = Run({"script", "load", "return 5"});
  EXPECT_THAT(resp, ArgType(RespExpr::STRING));

  string sha{ToSV(resp.GetBuf())};

  resp = Run({"evalsha", sha, "0"});
  EXPECT_THAT(resp, IntArg(5));

  absl::AsciiStrToUpper(&sha);
  resp = Run({"evalsha", sha, "0"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"evalsha", "foobar", "0"});
  EXPECT_THAT(resp, ErrArg("No matching"));

  resp = Run({"evalsha", "", "0"});
  EXPECT_THAT(resp, ErrArg("No matching"));

  resp = Run({"script", "load", "\n return 5"});

  // Important to keep spaces in order to be compatible with Redis.
  // See https://github.com/dragonflydb/dragonfly/issues/146
  EXPECT_THAT(resp, "c6459b95a0e81df97af6fdd49b1a9e0287a57363");
}

TEST_F(DflyEngineTest, ScriptFlush) {
  auto resp = Run({"script", "load", "return 5"});
  EXPECT_THAT(resp, ArgType(RespExpr::STRING));
  string sha{ToSV(resp.GetBuf())};
  resp = Run({"evalsha", sha, "0"});
  EXPECT_THAT(5, resp.GetInt());
  resp = Run({"script", "exists", sha});
  EXPECT_THAT(1, resp.GetInt());

  resp = Run({"script", "flush"});
  resp = Run({"script", "exists", sha});
  EXPECT_THAT(0, resp.GetInt());
  EXPECT_THAT(Run({"evalsha", sha, "0"}), ErrArg("NOSCRIPT No matching script. Please use EVAL."));

  resp = Run({"script", "load", "return 5"});
  EXPECT_THAT(resp, ArgType(RespExpr::STRING));
  sha = string{ToSV(resp.GetBuf())};
  resp = Run({"evalsha", sha, "0"});
  EXPECT_THAT(5, resp.GetInt());
  resp = Run({"script", "exists", sha});
  EXPECT_THAT(1, resp.GetInt());
}

TEST_F(DflyEngineTestWithRegistry, Hello) {
  auto resp = Run({"hello"});
  ASSERT_THAT(resp, ArrLen(14));
  resp = Run({"hello", "2"});
  ASSERT_THAT(resp, ArrLen(14));

  EXPECT_THAT(
      resp.GetVec(),
      ElementsAre("server", "redis", "version", "7.4.0", "dragonfly_version",
                  ArgType(RespExpr::STRING), "proto", IntArg(2), "id", ArgType(RespExpr::INT64),
                  "mode", testing::AnyOf("standalone", "cluster"), "role", "master"));

  resp = Run({"hello", "3"});
  ASSERT_THAT(resp, ArrLen(14));
  EXPECT_THAT(
      resp.GetVec(),
      ElementsAre("server", "redis", "version", "7.4.0", "dragonfly_version",
                  ArgType(RespExpr::STRING), "proto", IntArg(3), "id", ArgType(RespExpr::INT64),
                  "mode", testing::AnyOf("standalone", "cluster"), "role", "master"));

  EXPECT_THAT(Run({"hello", "2", "AUTH", "uname", "pwd"}),
              ErrArg("WRONGPASS invalid username-password pair or user is disabled."));

  EXPECT_THAT(Run({"hello", "2", "AUTH", "default", "pwd"}),
              ErrArg("WRONGPASS invalid username-password pair or user is disabled."));

  resp = Run({"hello", "3", "AUTH", "default", ""});
  ASSERT_THAT(resp, ErrArg("WRONGPASS invalid username-password pair or user is disabled."));

  TestInitAclFam();

  resp = Run({"hello", "3", "AUTH", "default", "tmp"});
  ASSERT_THAT(resp, ArrLen(14));

  resp = Run({"hello", "3", "AUTH", "default", "tmp", "SETNAME", "myname"});
  ASSERT_THAT(resp, ArrLen(14));
}

using MP = MemcacheParser;

TEST_F(DflyEngineTest, Memcache) {
#if 0
  auto resp = RunMC(MP::SET, "key", MCArgs{"bar", 1});
  EXPECT_THAT(resp, ElementsAre("STORED"));

  resp = RunMC(MP::GETS, "key");
  EXPECT_THAT(resp, ElementsAre("VALUE key 1 3 0", "bar", "END"));

  resp = RunMC(MP::GET, "key");
  EXPECT_THAT(resp, ElementsAre("VALUE key 1 3", "bar", "END"));

  resp = RunMC(MP::ADD, "key", MCArgs{"bar", 1});
  EXPECT_THAT(resp, ElementsAre("NOT_STORED"));

  resp = RunMC(MP::REPLACE, "key2", MCArgs{"bar", 1});
  EXPECT_THAT(resp, ElementsAre("NOT_STORED"));

  resp = RunMC(MP::ADD, "key2", MCArgs{"bar2", 2});
  EXPECT_THAT(resp, ElementsAre("STORED"));

  resp = GetMC(MP::GET, {"key2", "key"});
  EXPECT_THAT(resp, ElementsAre("VALUE key2 2 4", "bar2", "VALUE key 1 3", "bar", "END"));

  resp = RunMC(MP::APPEND, "key2", MCArgs{"val2", 0});
  EXPECT_THAT(resp, ElementsAre("STORED"));
  resp = RunMC(MP::GET, "key2");
  EXPECT_THAT(resp, ElementsAre("VALUE key2 2 8", "bar2val2", "END"));

  resp = RunMC(MP::APPEND, "unkn", MCArgs{"val2", 0});
  EXPECT_THAT(resp, ElementsAre("NOT_STORED"));

  resp = RunMC(MP::GET, "unkn");
  EXPECT_THAT(resp, ElementsAre("END"));

  resp = GetMC(MP::GETS, {"key", "key2", "unknown"});
  EXPECT_THAT(resp, ElementsAre("VALUE key 1 3 0", "bar", "VALUE key2 2 8 0", "bar2val2", "END"));

  EXPECT_THAT(RunMC(MP::SET, "foo", MCArgs{"bar"}), ElementsAre("STORED"));

  EXPECT_THAT(RunMC(MP::SET, "foo", MCArgs{"bar"}), ElementsAre("STORED"));

  // 30 seconds into the future
  auto future_ts = time(nullptr) + 30;
  EXPECT_THAT(GetMC(MP::GAT, {StrCat(future_ts), "foo", "abc", "def", "ghi"}),
              ElementsAre("VALUE foo 0 3", "bar", "END"));

  EXPECT_THAT(GetMC(MP::GAT, {"1000"}),
              ElementsAre("SERVER_ERROR wrong number of arguments for 'gat' command"));
#endif
  EXPECT_THAT(RunMC(MP::SET, "persisted-key", MCArgs{"bar"}), ElementsAre("STORED"));
  // expiry of 0 removes the key expiry
  EXPECT_THAT(GetMC(MP::GAT, {"0", "persisted-key"}),
              ElementsAre("VALUE persisted-key 0 3", "bar", "END"));
}

TEST_F(DflyEngineTest, MemcacheIncr) {
  auto resp = RunMC(MP::INCR, "key", MCArgs{1});
  EXPECT_THAT(resp, ElementsAre("NOT_FOUND"));
  resp = RunMC(MP::SET, "key", MCArgs{"1"});
  EXPECT_THAT(resp, ElementsAre("STORED"));
  resp = RunMC(MP::INCR, "key", MCArgs{5});
  EXPECT_THAT(resp, ElementsAre("6"));
}

TEST_F(DflyEngineTest, MemcacheFlags) {
  using MP = MemcacheParser;

  auto resp = Run("resp", {"SET", "key", "bar", "_MCFLAGS", "42"});
  ASSERT_EQ(resp, "OK");
  MCResponse resp2 = RunMC(MP::GET, "key");
  EXPECT_THAT(resp2, ElementsAre("VALUE key 42 3", "bar", "END"));

  ASSERT_EQ(Run("resp", {"flushdb"}), "OK");
  pp_->AwaitFiberOnAll([](auto*) {
    if (auto* shard = EngineShard::tlocal(); shard) {
      EXPECT_EQ(namespaces->GetDefaultNamespace()
                    .GetDbSlice(shard->shard_id())
                    .GetDBTable(0)
                    ->mcflag.size(),
                0u);
    }
  });
}

TEST_F(DflyEngineTest, LimitMemory) {
  mi_option_enable(mi_option_limit_os_alloc);
  string blob(128, 'a');
  for (size_t i = 0; i < 10000; ++i) {
    auto resp = Run({"set", absl::StrCat(blob, i), blob});
    ASSERT_EQ(resp, "OK");
  }
}

TEST_F(DflyEngineTest, FlushAll) {
  auto fb0 = pp_->at(0)->LaunchFiber([&] { Run({"flushall"}); });

  auto fb1 = pp_->at(1)->LaunchFiber([&] {
    Run({"select", "2"});

    for (size_t i = 1; i < 100; ++i) {
      RespExpr resp = Run({"set", "foo", "bar"});
      ASSERT_EQ(resp, "OK");
      ThisFiber::Yield();
    }
  });

  fb0.Join();
  fb1.Join();
}

TEST_F(DflyEngineTest, OOM) {
  max_memory_limit = 300000;
  size_t i = 0;
  RespExpr resp;
  for (; i < 10000; i += 3) {
    resp = Run({"mset", StrCat("key", i), "bar", StrCat("key", i + 1), "bar", StrCat("key", i + 2),
                "bar"});
    if (resp != "OK")
      break;
    ASSERT_EQ(resp, "OK");
  }
  EXPECT_THAT(resp, ErrArg("Out of mem"));

  string_view commands[5] = {"set", "rpush", "sadd", "zadd", "hset"};
  for (unsigned j = 0; j < ABSL_ARRAYSIZE(commands); ++j) {
    string_view cmd = commands[j];
    vector<string_view> run_args({cmd, ""});
    if (cmd == "zadd") {
      run_args.push_back("1.1");
    } else if (cmd == "hset") {
      run_args.push_back("foo");
    }
    run_args.push_back("bar");

    for (unsigned i = 0; i < 5000; ++i) {
      auto str = StrCat("key", cmd, i);
      run_args[1] = str;
      resp = Run(run_args);

      if (resp.type == RespExpr::ERROR)
        break;

      ASSERT_THAT(resp, testing::AnyOf(IntArg(1), "OK")) << cmd;
    }
    EXPECT_THAT(resp, ErrArg("Out of mem"));
  }
}

/// Reproduces the case where items with expiry data were evicted,
/// and then written with the same key.
TEST_F(DflyEngineTest, Bug207) {
  max_memory_limit = 300000 * 4;

  // The threshold is set to 0.3 to trigger eviction earlier and prevent OOM.
  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_eviction_memory_budget_threshold, 0.3);

  shard_set->TEST_EnableCacheMode();

  /* The value should be large enough to avoid being inlined. Heartbeat evicts only objects for
   * which HasAllocated() returns true. */
  std::string value(1000, '.');

  ssize_t i = 0;
  RespExpr resp;
  for (; i < 1000; ++i) {
    resp = Run({"setex", StrCat("key", i), "30", value});
    ASSERT_EQ(resp, "OK");
  }

  auto metrics = GetMetrics();
  EXPECT_GT(metrics.events.evicted_keys, 0) << FormatMetrics(metrics);

  for (; i > 0; --i) {
    resp = Run({"setex", StrCat("key", i), "30", "bar"});
    ASSERT_EQ(resp, "OK");
  }
}

TEST_F(DflyEngineTest, StickyEviction) {
  max_memory_limit = 600000;  // 0.6mb
  shard_set->TEST_EnableCacheMode();

  string tmp_val(100, '.');

  ssize_t failed = -1;

  for (ssize_t i = 0; i < 4500; ++i) {
    string key = StrCat("volatile", i);
    ASSERT_EQ("OK", Run({"set", key, tmp_val}));
    usleep(1);
  }

  bool done = false;
  for (ssize_t i = 0; !done && i < 5000; ++i) {
    string key = StrCat("key", i);
    while (true) {
      if (Run({"set", key, tmp_val}) != "OK") {
        failed = i;
        done = true;
        break;
      }

      // Eviction could have happened right after set, before stick. If so, try again
      if (Run({"stick", key}).GetInt() == 1) {
        break;
      }
    }
  }

  ASSERT_GE(failed, 0);
  // Make sure none of the sticky values was evicted
  for (ssize_t i = 0; i < failed; ++i) {
    ASSERT_THAT(Run({"exists", StrCat("key", i)}), IntArg(1));
  }
}

TEST_F(DflyEngineTest, ZeroAllocationEviction) {
  max_memory_limit = 500000;  // 0.5mb
  shard_set->TEST_EnableCacheMode();

  // Create entries with zero-allocation values (small integers)
  // but with long keys to consume memory
  string long_key_prefix(50, 'k');  // 50 character prefix

  vector<string> keys;
  int successful_sets = 0;
  for (int i = 0; i < 1000; ++i) {
    string key = StrCat(long_key_prefix, i);
    auto result = Run({"set", key, to_string(i)});  // small integer value
    if (result == "OK") {
      keys.emplace_back(key);
      successful_sets++;
    } else {
      break;  // Stop when we hit memory limit
    }
  }

  ASSERT_GT(successful_sets, 10) << "Should be able to set at least some keys";

  // Fill up more memory to trigger eviction
  string large_value(500, 'v');
  for (int i = 0; i < 500; ++i) {
    string key = StrCat("trigger", i);
    Run({"set", key, large_value});  // This will trigger eviction
  }

  // Verify that some zero-allocation entries were evicted
  int evicted_count = 0;
  for (const string& key : keys) {
    if (Run({"exists", key}).GetInt() == 0) {
      evicted_count++;
    }
  }

  // Should have evicted some entries with zero-allocation values
  // but not external (disk storage) entries
  EXPECT_GT(evicted_count, 0) << "Zero-allocation entries should be evicted under memory pressure";
}

TEST_F(DflyEngineTest, PSubscribe) {
  single_response_ = false;
  auto resp = pp_->at(1)->Await([&] { return Run({"psubscribe", "a*", "b*"}); });
  EXPECT_THAT(resp, ArrLen(3));
  resp = pp_->at(0)->Await([&] { return Run({"publish", "ab", "foo"}); });
  EXPECT_THAT(resp, IntArg(1));

  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});

  ASSERT_EQ(1, SubscriberMessagesLen("IO1"));

  const auto& msg = GetPublishedMessage("IO1", 0);
  EXPECT_EQ("foo", msg.message);
  EXPECT_EQ("ab", msg.channel);
  EXPECT_EQ("a*", msg.pattern);
}

TEST_F(DflyEngineTest, PSubscribeMatchOnlyStar) {
  single_response_ = false;
  auto resp = pp_->at(1)->Await([&] { return Run({"psubscribe", "*"}); });
  EXPECT_THAT(resp, ArrLen(3));
  resp = pp_->at(0)->Await([&] { return Run({"PUBLISH", "1234567890123456", "abc"}); });
  EXPECT_THAT(resp, IntArg(1));

  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});

  ASSERT_EQ(1, SubscriberMessagesLen("IO1"));

  const auto& msg = GetPublishedMessage("IO1", 0);
  EXPECT_EQ("abc", msg.message);
  EXPECT_EQ("1234567890123456", msg.channel);
  EXPECT_EQ("*", msg.pattern);
}

TEST_F(DflyEngineTest, Unsubscribe) {
  auto resp = Run({"unsubscribe", "a"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("unsubscribe", "a", IntArg(0)));

  resp = Run({"unsubscribe"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("unsubscribe", ArgType(RespExpr::NIL), IntArg(0)));

  single_response_ = false;

  Run({"subscribe", "a", "b"});

  resp = Run({"unsubscribe", "a"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("unsubscribe", "a", IntArg(1)));

  resp = Run({"unsubscribe"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("unsubscribe", "b", IntArg(0)));
}

TEST_F(DflyEngineTest, PUnsubscribe) {
  auto resp = Run({"punsubscribe", "a*"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("punsubscribe", "a*", IntArg(0)));

  resp = Run({"punsubscribe"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("punsubscribe", ArgType(RespExpr::NIL), IntArg(0)));

  single_response_ = false;
  Run({"psubscribe", "a*", "b*"});

  resp = Run({"punsubscribe", "a*"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("punsubscribe", "a*", IntArg(1)));

  resp = Run({"punsubscribe"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("punsubscribe", "b*", IntArg(0)));
}

TEST_F(DflyEngineTest, Bug468) {
  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");
  resp = Run({"SET", "foo", "bar", "EX", "moo"});
  ASSERT_EQ(resp, "QUEUED");

  resp = Run({"exec"});
  ASSERT_THAT(resp, ErrArg("not an integer"));

  ASSERT_FALSE(IsLocked(0, "foo"));

  resp = Run({"eval", "return redis.call('set', 'foo', 'bar', 'EX', 'moo')", "1", "foo"});
  ASSERT_THAT(resp, ErrArg("not an integer"));

  ASSERT_FALSE(IsLocked(0, "foo"));
}

TEST_F(DflyEngineTest, Bug496) {
  shard_set->RunBlockingInParallel([](EngineShard* shard) {
    auto& db = namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id());

    int cb_hits = 0;
    uint32_t cb_id =
        db.RegisterOnChange([&cb_hits](DbIndex, const DbSlice::ChangeReq&) { cb_hits++; });

    {
      auto res = *db.AddOrFind({}, "key-1", std::nullopt);
      EXPECT_TRUE(res.is_new);
      EXPECT_EQ(cb_hits, 1);
    }

    {
      auto res = *db.AddOrFind({}, "key-1", std::nullopt);
      EXPECT_FALSE(res.is_new);
      EXPECT_EQ(cb_hits, 2);
    }

    {
      auto res = *db.AddOrFind({}, "key-2", std::nullopt);
      EXPECT_TRUE(res.is_new);
      EXPECT_EQ(cb_hits, 3);
    }

    db.UnregisterOnChange(cb_id);
  });
}

TEST_F(DflyEngineTest, Issue607) {
  // https://github.com/dragonflydb/dragonfly/issues/607

  Run({"SET", "key", "value1"});
  EXPECT_EQ(Run({"GET", "key"}), "value1");

  Run({"SET", "key", "value2"});
  EXPECT_EQ(Run({"GET", "key"}), "value2");

  Run({"EXPIRE", "key", "1000"});

  Run({"SET", "key", "value3"});
  EXPECT_EQ(Run({"GET", "key"}), "value3");
}

TEST_F(DflyEngineTest, Issue679) {
  // https://github.com/dragonflydb/dragonfly/issues/679

  Run({"HMSET", "a", "key", "val"});
  Run({"EXPIRE", "a", "1000"});
  Run({"HMSET", "a", "key", "vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"});
  Run({"EXPIRE", "a", "1001"});
}

TEST_F(DflyEngineTest, Issue742) {
  // https://github.com/dragonflydb/dragonfly/issues/607
  // The stack was not cleaned in case of an error and it blew up.
  for (int i = 0; i < 3'000; i++) {
    Run({"EVAL", "redis.get(KEYS[1], KEYS[2], KEYS[3], KEYS[4], KEYS[5])", "5", "k1", "k2", "k3",
         "k4", "k5"});
  }
}

TEST_F(DefragDflyEngineTest, TestDefragOption) {
  GTEST_SKIP() << "Defragmentation check takes too long. Disabling this test";

  // mem_defrag_threshold is based on RSS statistic, but we don't count it in the test
  absl::SetFlag(&FLAGS_mem_defrag_threshold, 0.0);
  absl::SetFlag(&FLAGS_mem_defrag_check_sec_interval, 0);
  absl::SetFlag(&FLAGS_mem_defrag_waste_threshold, 0.1);

  //  Fill data into dragonfly and then check if we have
  //  any location in memory to defrag. See issue #448 for details about this.
  constexpr size_t kMaxMemoryForTest = 1'100'000;
  constexpr int kNumberOfKeys = 1'000;  // this fill the memory
  constexpr int kKeySize = 637;
  constexpr int kMaxDefragTriesForTests = 30;
  constexpr int kFactor = 4;

  max_memory_limit = kMaxMemoryForTest;  // control memory size so no need for too many keys
  std::vector<std::string> keys2delete;
  keys2delete.push_back("del");

  // create keys that we would like to remove, try to make it none adjusting locations
  for (int i = 0; i < kNumberOfKeys; i += kFactor) {
    keys2delete.push_back("key-name:" + std::to_string(i));
  }

  std::vector<std::string_view> keys(keys2delete.begin(), keys2delete.end());

  Run({"SELECT", "2"});

  RespExpr resp = Run(
      {"DEBUG", "POPULATE", std::to_string(kNumberOfKeys), "key-name", std::to_string(kKeySize)});
  ASSERT_EQ(resp, "OK");
  auto r = CheckedInt({"DBSIZE"});

  ASSERT_EQ(r, kNumberOfKeys);

  shard_set->pool()->AwaitFiberOnAll([&](unsigned index, ProactorBase* base) {
    EngineShard* shard = EngineShard::tlocal();
    ASSERT_FALSE(shard == nullptr);  // we only have one and its should not be empty!
    ThisFiber::SleepFor(100ms);

    // make sure that the task that collect memory usage from all shard ran
    // for at least once, and that no defrag was done yet.
    auto stats = shard->stats();
    for (int i = 0; i < 3; i++) {
      ThisFiber::SleepFor(100ms);
      EXPECT_EQ(stats.defrag_realloc_total, 0);
    }
  });

  ArgSlice delete_cmd(keys);
  r = CheckedInt(delete_cmd);
  LOG(INFO) << "finish deleting memory entries " << r;
  // the first element in this is the command del so size is one less
  ASSERT_EQ(r, keys2delete.size() - 1);
  // At this point we need to see whether we did running the task and whether the task did something
  shard_set->pool()->AwaitFiberOnAll([&](unsigned index, ProactorBase* base) {
    EngineShard* shard = EngineShard::tlocal();
    ASSERT_TRUE(shard != nullptr);  // we only have one and its should not be empty!
    // a "busy wait" to ensure that memory defragmentations was successful:
    // the task ran and did it work
    auto stats = shard->stats();
    for (int i = 0; i < kMaxDefragTriesForTests && stats.defrag_realloc_total == 0; i++) {
      stats = shard->stats();
      ThisFiber::SleepFor(220ms);
    }
    // make sure that we successfully found places to defrag in memory
    EXPECT_GT(stats.defrag_realloc_total, 0);
    EXPECT_GE(stats.defrag_attempt_total, stats.defrag_realloc_total);
  });
}

TEST_F(DefragDflyEngineTest, DefragEventuallyFinishes) {
  Run("DEBUG POPULATE 5000 key 256");
  Run("FT.CREATE index ON HASH PREFIX 1 doc: SCHEMA t TAG WITHSUFFIXTRIE");
  for (int i = 0; i < 1000; ++i) {
    Run(absl::StrFormat("HSET doc:%d t category%d", i, i));
  }

  shard_set->pool()->AwaitFiberOnAll([&](unsigned, ProactorBase*) {
    auto* shard = EngineShard::tlocal();
    if (!shard)
      return;

    // Try to run defrag at least this many times and stop early if cursor reaches the end (winds
    // back to 0)
    constexpr auto max_attempts = 500;

    std::vector<uint64_t> cursor_states;
    cursor_states.reserve(max_attempts);

    cursor_states.push_back(shard->GetDefragCursor());
    EXPECT_EQ(cursor_states.back(), 0);

    for (int i = 0; i < max_attempts; ++i) {
      PageUsage page_usage{CollectPageStats::NO, 0, CycleQuota{CycleQuota::kDefaultDefragQuota}};
      page_usage.SetForceReallocate(true);

      shard->DoDefrag(&page_usage);
      cursor_states.push_back(shard->GetDefragCursor());
      if (cursor_states.back() == 0)
        return;
    }

    // Defrag ran at least once
    EXPECT_GT(cursor_states.size(), 1);
    EXPECT_EQ(cursor_states.back(), 0)
        << "did not conclude defragmenting in " << cursor_states.size() << " runs";

    EXPECT_GT(shard->stats().defrag_realloc_total, 0);
    EXPECT_GE(shard->stats().defrag_attempt_total, shard->stats().defrag_realloc_total);
  });
}

TEST_F(DflyEngineTest, Issue752) {
  // https://github.com/dragonflydb/dragonfly/issues/752
  // local_result_ member was not reset between commands
  Run({"multi"});
  auto resp = Run({"llen", kKey1});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"del", kKey1, kKey2});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"exec"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(0)));
}

TEST_F(DflyEngineTest, Latency) {
  Run({"latency", "latest"});
}

TEST_F(DflyEngineTest, EvalBug2664) {
  absl::FlagSaver fs;
  SetFlag(&FLAGS_lua_resp2_legacy_float, true);

  auto resp = Run({"eval", "return 42.9", "0"});
  EXPECT_THAT(resp, IntArg(42));
  resp = Run({"eval", "return -3.8", "0"});
  EXPECT_THAT(resp, IntArg(-3));

  resp = Run({"hello", "3"});
  ASSERT_THAT(resp, ArrLen(14));

  resp = Run({"eval", "return 42.9", "0"});
  EXPECT_THAT(resp, IntArg(42));
}

TEST_F(DflyEngineTest, MemoryUsage) {
  for (unsigned i = 0; i < 1000; ++i) {
    Run({"rpush", "l1", StrCat("val", i)});
  }

  for (unsigned i = 0; i < 1000; ++i) {
    Run({"rpush", "l2", StrCat(string(200, 'a'), i)});
  }
  auto resp = Run({"memory", "usage", "l1"});
  EXPECT_GT(*resp.GetInt(), 8000);

  resp = Run({"memory", "usage", "l2"});
  EXPECT_GT(*resp.GetInt(), 100000);
}

// MEMORY USAGE without a key caused a DCHECK crash in CmdArgParser destructor
// because the parser error was never consumed.
TEST_F(DflyEngineTest, MemoryUsageNoKey) {
  auto resp = Run({"memory", "usage"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(DflyEngineTest, DebugObject) {
  Run({"set", "key", "value"});
  Run({"lpush", "l1", "a", "b"});
  Run({"sadd", "s1", "1", "2", "3"});
  Run({"sadd", "s2", "a", "b", "c"});
  Run({"zadd", "z1", "1", "a", "2", "b", "3", "c"});
  Run({"hset", "h1", "a", "1", "b", "2", "c", "3"});
  auto resp = Run({"debug", "object", "key"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:raw"));
  resp = Run({"debug", "object", "l1"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:listpack"));
  resp = Run({"debug", "object", "s1"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:intset"));
  resp = Run({"debug", "object", "s2"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:dense_set"));
  resp = Run({"debug", "object", "z1"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:listpack"));

  // Test promotion to quicklist
  Run({"lpush", "l1", string(3000, 'x')});
  resp = Run({"debug", "object", "l1"});
  EXPECT_THAT(resp.GetString(), HasSubstr("encoding:quicklist"));
}

TEST_F(DflyEngineTest, StreamMemInfo) {
  for (int i = 1; i < 2; ++i) {
    Run({"XADD", "test", std::to_string(i), "var", "val" + std::to_string(i)});
  }

  int64_t stream_mem_first = GetMetrics().db_stats[0].memory_usage_by_type[OBJ_STREAM];
  EXPECT_GT(stream_mem_first, 0);

  auto dump = Run({"dump", "test"});
  Run({"del", "test"});
  Run({"restore", "test", "0", facade::ToSV(dump.GetBuf())});

  int64_t stream_mem_second = GetMetrics().db_stats[0].memory_usage_by_type[OBJ_STREAM];

  // stream_mem_first != stream_mem_second due to a preallocation in XADD command (see
  // STREAM_LISTPACK_MAX_PRE_ALLOCATE)
  EXPECT_GT(stream_mem_second, 0);
}

TEST_F(DflyEngineTest, ReplicaofRejectOnLoad) {
  service_->SwitchState(GlobalState::ACTIVE, GlobalState::LOADING);

  RespExpr res = Run({"REPLICAOF", "localhost", "3779"});

  ASSERT_THAT(res, ErrArg("LOADING Dragonfly is loading the dataset in memory"));
}

// TODO: to test transactions with a single shard since then all transactions become local.
// To consider having a parameter in dragonfly engine controlling number of shards
// unconditionally from number of cpus. TO TEST BLPOP under multi for single/multi argument case.

TEST_F(DflyEngineTest, CommandMetricLabels) {
  EXPECT_EQ(Run({"SET", "foo", "bar"}), "OK");
  EXPECT_EQ(Run({"GET", "foo"}), "bar");
  const Metrics metrics = GetMetrics();

  // The test connection counts as other
  EXPECT_EQ(metrics.facade_stats.conn_stats.command_cnt_other, 2);
  EXPECT_EQ(metrics.facade_stats.conn_stats.command_cnt_main, 0);
  EXPECT_EQ(metrics.facade_stats.conn_stats.num_conns_main, 0);
  EXPECT_EQ(metrics.facade_stats.conn_stats.num_conns_other, 0);
}

TEST_F(DflyEngineTest, Huffman) {
  // enable compression for keys optimized for letter a.
  auto resp = Run({"debug", "compression", "set", "GBDgCpXW/////7/pygS5t9x7792qU1trLQ=="});
  EXPECT_EQ(resp, "OK");

  // for string values optimized for letter x.
  resp = Run({"debug", "compression", "set", "ChD4bAf/D/bPSwY=", "string"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"debug", "populate", "200000", "aaaaaaaaaaaaaaaaaaaaaaaaaa", "32"});
  EXPECT_EQ(resp, "OK");

  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.events.huff_encode_success, 400000);  // each key and value
  EXPECT_LT(metrics.heap_used_bytes, 14'000'000);         // less than 15mb
}

TEST_F(DflyEngineTest, MemoryKeys) {
  Run({"debug", "populate", "10000", "abcd_efgh_ijkl_mnop", "10"});
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.db_stats[0].memory_usage_by_type[OBJ_KEY], 100000);
}

// Verify that inline_keys, expire_count, and OBJ_KEY memory stay consistent
// when expire is added/removed on inline keys (regression for memory underflow bug).
TEST_F(DflyEngineTest, ExpireInlineKeyAccounting) {
  // Keys short enough to be stored inline (kInlineLen = 16).
  constexpr int kCount = 10;
  for (int i = 0; i < kCount; i++)
    Run({"set", absl::StrCat("k", i), "v"});

  auto stats = GetMetrics().db_stats[0];
  EXPECT_EQ(stats.inline_keys, kCount);
  EXPECT_EQ(stats.expire_count, 0u);
  EXPECT_EQ(stats.memory_usage_by_type[OBJ_KEY], 0);

  // Setting expire transitions inline -> SDS_TTL_TAG (heap-allocated).
  for (int i = 0; i < kCount; i++)
    Run({"expire", absl::StrCat("k", i), "3600"});

  stats = GetMetrics().db_stats[0];
  EXPECT_EQ(stats.inline_keys, 0u);
  EXPECT_EQ(stats.expire_count, kCount);
  EXPECT_GT(stats.memory_usage_by_type[OBJ_KEY], 0);

  // PERSIST transitions SDS_TTL_TAG -> inline again.
  for (int i = 0; i < kCount; i++)
    Run({"persist", absl::StrCat("k", i)});

  stats = GetMetrics().db_stats[0];
  EXPECT_EQ(stats.inline_keys, kCount);
  EXPECT_EQ(stats.expire_count, 0u);
  EXPECT_EQ(stats.memory_usage_by_type[OBJ_KEY], 0);

  // Re-expire then delete: prior bug caused memory accounting underflow on deletion.
  for (int i = 0; i < kCount; i++)
    Run({"expire", absl::StrCat("k", i), "3600"});
  for (int i = 0; i < kCount; i++)
    Run({"del", absl::StrCat("k", i)});

  stats = GetMetrics().db_stats[0];
  EXPECT_EQ(stats.inline_keys, 0u);
  EXPECT_EQ(stats.expire_count, 0u);
  EXPECT_EQ(stats.memory_usage_by_type[OBJ_KEY], 0);
}

class DflyCommandAliasTest : public DflyEngineTest {
 protected:
  DflyCommandAliasTest() {
    SetFlag(&FLAGS_command_alias, {"___set=set", "___ping=ping"});
    SetFlag(&FLAGS_latency_tracking, true);
  }

  absl::FlagSaver saver_;
};

TEST_F(DflyCommandAliasTest, Aliasing) {
  EXPECT_EQ(Run({"SET", "foo", "bar"}), "OK");
  EXPECT_EQ(Run({"___SET", "a", "b"}), "OK");
  EXPECT_EQ(Run({"GET", "foo"}), "bar");
  EXPECT_EQ(Run({"GET", "a"}), "b");
  EXPECT_EQ(Run({"___ping"}), "PONG");

  Metrics metrics = GetMetrics();
  const auto& stats = metrics.cmd_stats_map;

  EXPECT_THAT(stats, Contains(Pair("___set", Key(1))));
  EXPECT_THAT(stats, Contains(Pair("set", Key(1))));
  EXPECT_THAT(stats, Contains(Pair("___ping", Key(1))));
  EXPECT_THAT(stats, Contains(Pair("get", Key(2))));

  // test stats within multi-exec
  EXPECT_EQ(Run({"multi"}), "OK");
  EXPECT_EQ(Run({"___set", "a", "x"}), "QUEUED");
  EXPECT_EQ(Run({"exec"}), "OK");

  metrics = GetMetrics();
  EXPECT_THAT(metrics.cmd_stats_map, Contains(Pair("___set", Key(2))));
  EXPECT_THAT(metrics.cmd_stats_map, Contains(Pair("set", Key(1))));
  EXPECT_THAT(metrics.cmd_stats_map, Contains(Pair("multi", Key(1))));
  EXPECT_THAT(metrics.cmd_stats_map, Contains(Pair("exec", Key(1))));
}

TEST_F(DflyCommandAliasTest, AliasesShareHistogramPtr) {
  EXPECT_EQ(Run({"SET", "foo", "bar"}), "OK");
  EXPECT_EQ(Run({"___SET", "a", "b"}), "OK");
  EXPECT_EQ(Run({"___ping"}), "PONG");

  const auto command_histograms = GetMetrics().cmd_latency_map;
  for (const auto& key : {"set", "___set", "___ping", "ping"}) {
    EXPECT_TRUE(command_histograms.contains(key));
  }

  EXPECT_EQ(command_histograms.at("set"), command_histograms.at("___set"));
  EXPECT_EQ(command_histograms.at("ping"), command_histograms.at("___ping"));
}

}  // namespace dfly


================================================
FILE: src/server/engine_shard.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/engine_shard.h"

#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_format.h>

#include <memory>

#include "base/flags.h"
#include "core/huff_coder.h"
#include "core/page_usage/page_usage_stats.h"
#include "io/proc_reader.h"

extern "C" {
#include "redis/zmalloc.h"
}
#include "server/blocking_controller.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/namespaces.h"
#include "server/search/doc_index.h"
#include "server/server_state.h"
#include "server/snapshot.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"
#include "util/fibers/proactor_base.h"

using namespace std;

ABSL_FLAG(float, mem_defrag_threshold, 0.7,
          "Minimum percentage of used memory relative to maxmemory cap before running "
          "defragmentation");

ABSL_FLAG(uint32_t, mem_defrag_check_sec_interval, 60,
          "Number of seconds between every defragmentation necessity check");

ABSL_FLAG(float, mem_defrag_waste_threshold, 0.2,
          "The ratio of wasted/committed memory above which we run defragmentation");

ABSL_FLAG(float, mem_defrag_page_utilization_threshold, 0.8,
          "memory page under utilization threshold. Ratio between used and committed size, below "
          "this, memory in this page will defragmented");

ABSL_FLAG(int32_t, hz, 100,
          "Base frequency at which the server performs other background tasks. "
          "Warning: not advised to decrease in production.");

ABSL_FLAG(string, tiered_prefix, "",
          "Enables tiered storage if set. "
          "The string denotes the path and prefix of the files "
          " associated with tiered storage. Stronly advised to use "
          "high performance NVME ssd disks for this. Also, seems that pipeline_squash does "
          "not work well with tiered storage, so it's advised to set it to 0.");

ABSL_FLAG(bool, enable_heartbeat_eviction, true,
          "Enable eviction during heartbeat when memory is under pressure.");
ABSL_FLAG(bool, enable_heartbeat_rss_eviction, true,
          "Enable eviction during heartbeat when rss memory is under pressure. Eviction based "
          "on used_memory will still be enabled.");
ABSL_FLAG(double, eviction_memory_budget_threshold, 0.1,
          "Eviction starts when the free memory (including RSS memory) drops below "
          "eviction_memory_budget_threshold * max_memory_limit.");
ABSL_FLAG(bool, background_heartbeat, false, "Whether to run heartbeat as a background fiber");
ABSL_DECLARE_FLAG(uint32_t, max_eviction_per_heartbeat);

namespace dfly {

using absl::GetFlag;
using namespace util;

namespace {

constexpr uint64_t kCursorDoneState = 0u;

bool HasContendedLocks(ShardId shard_id, Transaction* trx, const DbTable* table) {
  auto is_contended = [table](LockFp fp) { return table->trans_locks.Find(fp)->IsContended(); };

  if (trx->IsMulti()) {
    auto fps = trx->GetMultiFps();
    for (const auto& [sid, fp] : fps) {
      if (sid == shard_id && is_contended(fp))
        return true;
    }
  } else {
    KeyLockArgs lock_args = trx->GetLockArgs(shard_id);
    for (size_t i = 0; i < lock_args.fps.size(); ++i) {
      if (is_contended(lock_args.fps[i]))
        return true;
    }
  }

  return false;
}

constexpr size_t kQueueLen = 64;

optional<uint32_t> GetPeriodicCycleMs() {
  int hz = GetFlag(FLAGS_hz);
  if (hz <= 0)
    return nullopt;

  uint32_t clock_cycle_ms = 1000 / hz;
  if (clock_cycle_ms == 0)
    clock_cycle_ms = 1;
  return clock_cycle_ms;
}

size_t CalculateHowManyBytesToEvictOnShard(size_t global_memory_limit, size_t global_used_memory,
                                           size_t shard_memory_threshold) {
  if (global_used_memory > global_memory_limit) {
    // Used memory is above the limit, we need to evict all bytes
    return (global_used_memory - global_memory_limit) / shard_set->size() + shard_memory_threshold;
  }

  const size_t shard_budget = (global_memory_limit - global_used_memory) / shard_set->size();
  return shard_budget < shard_memory_threshold ? (shard_memory_threshold - shard_budget) : 0;
}

class HuffmanCheckTask {
 public:
  HuffmanCheckTask() {
    hist_.fill(0);
  }

  int32_t Run(DbSlice* db_slice);

 private:
  PrimeTable::Cursor cursor_;

  static constexpr unsigned kMaxSymbol = 255;
  array<unsigned, kMaxSymbol + 1> hist_;  // histogram of symbols.
  string scratch_;
};

int32_t HuffmanCheckTask::Run(DbSlice* db_slice) {
  DbTable* db_table = db_slice->GetDBTable(0);  // we currently support only default db.
  if (!db_table)
    return -1;

  // incrementally aggregate frequency histogram.
  auto& prime = db_table->prime;

  constexpr uint32_t kMaxTraverses = 512;
  uint32_t traverses_count = 0;
  do {
    cursor_ = prime.Traverse(cursor_, [&](PrimeIterator it) {
      if (!it->first.IsInline()) {
        string_view val = it->first.GetSlice(&scratch_);
        for (unsigned char c : val) {
          hist_[c]++;
        }

        if (val.size() > 1024) {
          traverses_count = kMaxTraverses;  // return early.
          string{}.swap(scratch_);          // free memory.
        }
      }
    });
    traverses_count++;
  } while (traverses_count < kMaxTraverses && cursor_);

  if (cursor_)
    return 4;  // priority to continue later.

  // Finished scanning the table, now normalize the table.
  constexpr unsigned kMaxFreqTotal = static_cast<unsigned>((1U << 31) * 0.9);
  size_t total_freq = std::accumulate(hist_.begin(), hist_.end(), 0UL);
  if (total_freq == 0)
    return -1;

  // to avoid overflow.
  double scale = total_freq > kMaxFreqTotal ? static_cast<double>(total_freq) / kMaxFreqTotal : 1.0;
  for (unsigned i = 0; i <= kMaxSymbol; i++) {
    hist_[i] = static_cast<unsigned>(hist_[i] / scale);
    if (hist_[i] == 0) {
      hist_[i] = 1;  // Avoid zero frequency symbols.
    }
  }

  // Build the huffman table. We currently output the table to logs and just increase
  // the metric counter to signal that we built a table.

  HuffmanEncoder huff_enc;
  string error_msg;
  if (huff_enc.Build(hist_.data(), kMaxSymbol, &error_msg)) {
    size_t compressed_size = huff_enc.EstimateCompressedSize(hist_.data(), kMaxSymbol);
    LOG(INFO) << "Huffman table built, reducing character count from " << total_freq << " to "
              << compressed_size << ", compression ratio " << double(compressed_size) / total_freq;
    string bintable = huff_enc.Export();
    LOG(INFO) << "Huffman binary table: " << absl::Base64Escape(bintable);
    db_slice->shard_owner()->stats().huffman_tables_built++;
  } else {
    LOG(WARNING) << "Huffman build failed: " << error_msg;
  }

  return -1;  // task completed.
}

}  // namespace

__thread EngineShard* EngineShard::shard_ = nullptr;
uint64_t TEST_current_time_ms = 0;

string EngineShard::TxQueueInfo::Format() const {
  string res;

  if (tx_total > 0) {
    absl::StrAppend(&res, "tx armed ", tx_armed, ", total: ", tx_total, ",global:", tx_global,
                    ",runnable:", tx_runnable, "\n");
    absl::StrAppend(&res, ", head: ", head.debug_id_info, "\n");
  }
  if (total_locks > 0) {
    absl::StrAppend(&res, "locks total:", total_locks, ",contended:", contended_locks, "\n");
  }
  if (max_contention_score > 0) {
    absl::StrAppend(&res, "max contention score: ", max_contention_score,
                    ", lock: ", max_contention_lock, "\n");
  }

  return res;
}

EngineShard::Stats& EngineShard::Stats::operator+=(const Stats& o) {
  static_assert(sizeof(Stats) == 152);

#define ADD(x) x += o.x

  ADD(defrag_attempt_total);
  ADD(defrag_realloc_total);
  ADD(defrag_task_invocation_total);
  ADD(defrag_skipped_mem_under_threshold);
  ADD(defrag_skipped_within_check_interval);
  ADD(defrag_skipped_not_enough_fragmentation);
  ADD(poll_execution_total);
  ADD(tx_ooo_total);
  ADD(tx_optimistic_total);
  ADD(tx_batch_schedule_calls_total);
  ADD(tx_batch_scheduled_items_total);
  ADD(total_heartbeat_expired_keys);
  ADD(total_heartbeat_expired_bytes);
  ADD(total_heartbeat_expired_calls);
  ADD(total_migrated_keys);
  ADD(huffman_tables_built);
  ADD(stream_sequential_accesses);
  ADD(stream_random_accesses);
  ADD(stream_fetch_all_accesses);

#undef ADD
  return *this;
}

void EngineShard::DefragTaskState::UpdateScanState(uint64_t cursor_val) {
  cursor = cursor_val;
  // Once we're done with a db, jump to the next
  if (cursor == kCursorDoneState) {
    dbid++;
  }
}

void EngineShard::DefragTaskState::ResetScanState() {
  dbid = cursor = 0u;
}

// This function checks 3 things:
// 1. Don't try memory fragmentation if we don't use "enough" memory (control by
// mem_defrag_threshold flag)
// 2. We have memory blocks that can be better utilized (there is a "wasted memory" in them).
// 3. in case the above is OK, make sure that we have a "gap" between usage and commited memory
// (control by mem_defrag_waste_threshold flag)
EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequired() {
  using enum SkipReason;
  if (cursor > kCursorDoneState) {
    VLOG(2) << "cursor: " << cursor;
    return NotSkipped;
  }

  size_t limit = max_memory_limit.load(memory_order_relaxed);

  const std::size_t memory_per_shard = limit / shard_set->size();
  if (memory_per_shard < (1 << 16)) {  // Too small.
    return MemoryTooLow;
  }

  thread_local fragmentation_info finfo{
      .committed = 0, .committed_golden = 0, .wasted = 0, .bin = 0};

  const std::size_t global_threshold = double(limit) * GetFlag(FLAGS_mem_defrag_threshold);
  if (global_threshold > rss_mem_current.load(memory_order_relaxed)) {
    finfo.bin = 0;  // reset.
    return MemoryBelowThreshold;
  }

  if (finfo.bin == 0) {  // did not start the iterative checking yet
    const auto now = time(nullptr);
    const auto seconds_from_prev_check = now - last_check_time;
    const auto mem_defrag_interval = GetFlag(FLAGS_mem_defrag_check_sec_interval);

    if (seconds_from_prev_check < mem_defrag_interval) {
      return CheckWithinInterval;
    }

    // start checking.
    finfo.committed = finfo.committed_golden = 0;
    finfo.wasted = 0;
    page_utilization_threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
  }

  uint64_t start = absl::GetCurrentTimeNanos();
  int res = zmalloc_get_allocator_fragmentation_step(page_utilization_threshold, &finfo);
  uint64_t duration = absl::GetCurrentTimeNanos() - start;
  VLOG(1) << "Reading memory usage took " << duration << " ns on bin " << finfo.bin - 1;

  if (res == 0) {
    // finished checking.
    last_check_time = time(nullptr);

    if (finfo.committed != finfo.committed_golden) {
      LOG_FIRST_N(ERROR, 100) << "committed memory computed incorrectly: " << finfo.committed
                              << " vs " << finfo.committed_golden;
    }

    const double waste_threshold = GetFlag(FLAGS_mem_defrag_waste_threshold);
    if (finfo.wasted > size_t(finfo.committed * waste_threshold)) {
      VLOG(1) << "memory fragmentation issue found: " << finfo.wasted << " " << finfo.committed;
      return NotSkipped;
    }
    return NotEnoughFragmentation;
  }
  return CheckInProgress;
}

std::optional<CollectedPageStats> EngineShard::DoDefrag(PageUsage* page_usage) {
  // --------------------------------------------------------------------------
  // NOTE: This task is running with exclusive access to the shard.
  // i.e. - Since we are using shared nothing access here, and all access
  // are done using fibers, This fiber is run only when no other fiber in the
  // context of the controlling thread will access this shard!
  // --------------------------------------------------------------------------

  // TODO: enable tiered storage on non-default db slice
  DbSlice& slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_->shard_id());

  // If we moved to an invalid db, skip as long as it's not the last one
  while (!slice.IsDbValid(defrag_state_.dbid) && defrag_state_.dbid + 1 < slice.db_array_size())
    defrag_state_.dbid++;

  // If we found no valid db, we finished traversing and start from scratch next time
  if (!slice.IsDbValid(defrag_state_.dbid)) {
    defrag_state_.ResetScanState();
    return std::nullopt;
  }

  DCHECK(slice.IsDbValid(defrag_state_.dbid));
  auto [prime_table, _unused_expire] = slice.GetTables(defrag_state_.dbid);
  PrimeTable::Cursor cur{defrag_state_.cursor};
  uint64_t reallocations = 0;
  uint64_t attempts = 0;

  DbTable* db_table = slice.GetDBTable(defrag_state_.dbid);
  do {
    cur = prime_table->Traverse(cur, [&](PrimeIterator it) {
      // for each value check whether we should move it because it
      // seats on underutilized page of memory, and if so, do it.
      const ssize_t original_size = it->second.MallocUsed();
      const bool did = it->second.DefragIfNeeded(page_usage);
      attempts++;
      if (did) {
        reallocations++;
        if (const ssize_t delta = it->second.MallocUsed() - original_size; delta != 0) {
          db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
        }
      }
    });
  } while (!page_usage->QuotaDepleted() && cur && namespaces);
  const uint64_t used_cycles = page_usage->UsedQuotaCycles();
  const uint64_t usec = base::CycleClock::ToUsec(used_cycles);

  defrag_state_.UpdateScanState(cur.token());

  page_usage->ExtendQuota(50);
  const auto [quota_depleted, objects_moved] = shard_search_indices_->Defragment(page_usage);
  reallocations += objects_moved;

  stats_.defrag_realloc_total += reallocations;
  stats_.defrag_task_invocation_total++;
  stats_.defrag_attempt_total += attempts;

  const char* cursor_state =
      defrag_state_.cursor == kCursorDoneState ? "at the end" : "in progress";
  if (reallocations > 0) {
    VLOG(2) << absl::StrFormat(
        "shard %u: successfully defragmented %lu times in %lu cycles (%lu usec), "
        "cursor is %s",
        slice.shard_id(), reallocations, used_cycles, usec, cursor_state);
  } else {
    VLOG(2) << absl::StrFormat(
        "shard %u: ran defragmentation for %lu cycles (%lu usec), cursor at %s, "
        "but no locations for defragmentation were found",
        slice.shard_id(), used_cycles, usec, cursor_state);
  }

  return page_usage->CollectedStats();
}

// the memory defragmentation task is as follow:
//  1. Check if memory usage is high enough
//  2. Check if diff between commited and used memory is high enough
//  3. if all the above pass -> scan this shard and try to find whether we can move pointer to
//  underutilized pages values
//     if the cursor returned from scan is not in done state, schedule the task to run at high
//     priority.
//     otherwise lower the task priority so that it would not use the CPU when not required
uint32_t EngineShard::DefragTask() {
  using enum DefragTaskState::SkipReason;

  constexpr uint32_t kRunAtLowPriority = 0u;
  if (!namespaces) {
    return kRunAtLowPriority;
  }

  if (auto check_result = defrag_state_.CheckRequired(); check_result == NotSkipped) {
    VLOG(2) << shard_id_ << ": need to run defrag memory cursor state: " << defrag_state_.cursor;
    static const float threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
    // TODO (abhijat): implement move ctor for PageUsage so this object can be moved into the task.
    PageUsage page_usage{CollectPageStats::NO, threshold,
                         CycleQuota{CycleQuota::kDefaultDefragQuota}};
    if (DoDefrag(&page_usage)) {
      // we didn't finish the scan
      return ProactorBase::kOnIdleMaxLevel;
    }
  } else {
    std::string_view reason;
    switch (check_result) {
      case MemoryTooLow:
        // Don't track stats for configuration which is not going to change
        reason = "memory too low";
        break;
      case MemoryBelowThreshold:
        reason = "rss below threshold";
        stats_.defrag_skipped_mem_under_threshold++;
        break;
      case CheckWithinInterval:
        reason = "defrag check ran too soon";
        stats_.defrag_skipped_within_check_interval++;
        break;
      case NotEnoughFragmentation:
        reason = "not enough fragmentation to defrag";
        stats_.defrag_skipped_not_enough_fragmentation++;
        break;
      case CheckInProgress:
        reason = "check is in progress";
        break;
      default:
        DCHECK(false) << "unexpected result";
    }
    VLOG(2) << shard_id_ << " skipped defragmentation task: " << reason;
  }
  return 6;  // priority.
}

EngineShard::EngineShard(util::ProactorBase* pb, mi_heap_t* heap)
    : txq_([](const Transaction* t) { return t->txid(); }),
      queue_(kQueueLen, 1, 1),
      queue2_(kQueueLen / 2, 2, 2),
      shard_id_(pb->GetPoolIndex()),
      mi_resource_(heap) {
  queue_.Start(absl::StrCat("shard_queue_", shard_id()));
  queue2_.Start(absl::StrCat("l2_queue_", shard_id()));
}

void EngineShard::Shutdown() {
  DVLOG(1) << "EngineShard::Shutdown";

  queue_.Shutdown();
  queue2_.Shutdown();
  DCHECK(!fiber_heartbeat_periodic_.IsJoinable());
  DCHECK(!fiber_shard_handler_periodic_.IsJoinable());
}

void EngineShard::StopPeriodicFiber() {
  ProactorBase::me()->RemoveOnIdleTask(defrag_task_id_);
  ProactorBase::me()->RemoveOnIdleTask(huffman_check_task_id_);

  fiber_heartbeat_periodic_done_.Notify();
  if (fiber_heartbeat_periodic_.IsJoinable()) {
    fiber_heartbeat_periodic_.Join();
  }
  fiber_shard_handler_periodic_done_.Notify();
  if (fiber_shard_handler_periodic_.IsJoinable()) {
    fiber_shard_handler_periodic_.Join();
  }
}

static void RunFPeriodically(std::function<void()> f, std::chrono::milliseconds period_ms,
                             std::string_view error_msg, util::fb2::Done* waiter) {
  int64_t last_heartbeat_ms = INT64_MAX;

  while (true) {
    if (waiter->WaitFor(period_ms)) {
      VLOG(2) << "finished running engine shard periodic task";
      return;
    }

    int64_t now_ms = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;
    if (now_ms - 5 * period_ms.count() > last_heartbeat_ms) {
      VLOG(1) << "This " << error_msg << " step took " << now_ms - last_heartbeat_ms << "ms";
    }
    f();
    last_heartbeat_ms = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;
  }
}

void EngineShard::StartPeriodicHeartbeatFiber(util::ProactorBase* pb) {
  auto cycle_ms = GetPeriodicCycleMs();
  if (!cycle_ms) {
    return;
  }
  auto heartbeat = [this]() { Heartbeat(); };

  eviction_state_.rss_eviction_enabled = GetFlag(FLAGS_enable_heartbeat_rss_eviction);
  std::chrono::milliseconds period_ms(*cycle_ms);

  fb2::Fiber::Opts fb_opts{.priority = absl::GetFlag(FLAGS_background_heartbeat)
                                           ? fb2::FiberPriority::BACKGROUND
                                           : fb2::FiberPriority::NORMAL,
                           .name = absl::StrCat("heartbeat_periodic", pb->GetPoolIndex())};
  fiber_heartbeat_periodic_ = fb2::Fiber(fb_opts, [this, period_ms, heartbeat]() mutable {
    RunFPeriodically(heartbeat, period_ms, "heartbeat", &fiber_heartbeat_periodic_done_);
  });
  defrag_task_id_ = pb->AddOnIdleTask([this]() { return DefragTask(); });
}

void EngineShard::StartPeriodicShardHandlerFiber(util::ProactorBase* pb,
                                                 std::function<void()> shard_handler) {
  auto clock_cycle_ms = GetPeriodicCycleMs();
  if (!clock_cycle_ms) {
    return;
  }

  // Minimum 100ms
  std::chrono::milliseconds period_ms(std::max(100u, *clock_cycle_ms));
  fiber_shard_handler_periodic_ = MakeFiber(
      [this, index = pb->GetPoolIndex(), period_ms, handler = std::move(shard_handler)]() mutable {
        ThisFiber::SetName(absl::StrCat("shard_handler_periodic", index));
        RunFPeriodically(std::move(handler), period_ms, "shard handler",
                         &fiber_shard_handler_periodic_done_);
      });
}

void EngineShard::InitThreadLocal(ProactorBase* pb) {
  CHECK(shard_ == nullptr) << pb->GetPoolIndex();

  mi_heap_t* data_heap = ServerState::tlocal()->data_heap();
  void* ptr = mi_heap_malloc_aligned(data_heap, sizeof(EngineShard), alignof(EngineShard));
  shard_ = new (ptr) EngineShard(pb, data_heap);

  CompactObj::InitThreadLocal(shard_->memory_resource());
  SmallString::InitThreadLocal(data_heap);
  InitTLStatelessAllocMR(shard_->memory_resource());

  shard_->shard_search_indices_ = std::make_unique<ShardDocIndices>();
}

void EngineShard::InitTieredStorage(ProactorBase* pb, size_t max_file_size) {
  if (string backing_prefix = GetFlag(FLAGS_tiered_prefix); !backing_prefix.empty()) {
    LOG_IF(FATAL, pb->GetKind() != ProactorBase::IOURING)
        << "Only ioring based backing storage is supported. Exiting...";

    // TODO: enable tiered storage on non-default namespace
    DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());
    auto* shard = EngineShard::tlocal();
    shard->tiered_storage_ = make_unique<TieredStorage>(max_file_size, &db_slice);
    error_code ec = shard->tiered_storage_->Open(backing_prefix);
    CHECK(!ec) << ec.message();
  }
}

void EngineShard::DestroyThreadLocal() {
  if (!shard_)
    return;

  uint32_t shard_id = shard_->shard_id();
  mi_heap_t* tlh = shard_->mi_resource_.heap();

  shard_->Shutdown();

  detail::InternedString::ResetPool();
  shard_->~EngineShard();
  CleanupStatelessAllocMR();

  mi_free(shard_);
  shard_ = nullptr;
  CompactObj::InitThreadLocal(nullptr);

  mi_heap_delete(tlh);
  VLOG(1) << "Shard reset " << shard_id;
}

// Is called by Transaction::ExecuteAsync in order to run transaction tasks.
// Only runs in its own thread.
void EngineShard::PollExecution(const char* context, Transaction* trans) {
  DVLOG(2) << "PollExecution " << context << " " << (trans ? trans->DebugId() : "") << " "
           << txq_.size() << " " << (continuation_trans_ ? continuation_trans_->DebugId() : "");

  ShardId sid = shard_id();
  stats_.poll_execution_total++;

  // If any of the following flags are present, we are guaranteed to run in this function:
  // 1. AWAKED_Q -> Blocking transactions are executed immediately after waking up, they don't
  // occupy a place in txq and have highest priority
  // 2. WAS_SUSPENDED -> Suspended transactions are run to clean up and finalize blocking keys
  // 3. OUT_OF_ORDER -> Transactions without conflicting keys can run earlier than their position in
  // txq is reached
  uint16_t flags = Transaction::AWAKED_Q | Transaction::WAS_SUSPENDED | Transaction::OUT_OF_ORDER;
  auto [trans_mask, disarmed] =
      trans ? trans->DisarmInShardWhen(sid, flags) : make_pair(uint16_t(0), false);

  if (trans && trans_mask == 0)  // If not armed, it means that this poll task expired
    return;

  if (trans_mask & Transaction::AWAKED_Q) {
    CHECK(trans->GetNamespace().GetBlockingController(shard_id_)->HasAwakedTransaction());
    CHECK(continuation_trans_ == nullptr)
        << continuation_trans_->DebugId() << " when polling " << trans->DebugId()
        << "cont_mask: " << continuation_trans_->DEBUG_GetLocalMask(sid) << " vs "
        << trans->DEBUG_GetLocalMask(sid);

    // Commands like BRPOPLPUSH don't conclude immediately
    if (!trans->RunInShard(this, true)) {
      // execution is blocked while HasAwakedTransaction() returns true, so no need to set
      // continuation_trans_. Moreover, setting it for wakened multi-hop transactions may lead to
      // inconcistency, see BLMoveSimultaneously test.
      // continuation_trans_ = trans;
      return;
    }

    trans = nullptr;  // Avoid handling the caller below
  }

  bool update_stats = false;
  ++poll_concurrent_factor_;

  auto run = [this, &update_stats](Transaction* tx, bool allow_removal) -> bool /* concluding */ {
    update_stats = true;
    return tx->RunInShard(this, allow_removal);
  };

  // Check the currently running transaction, we have to handle it first until it concludes
  if (continuation_trans_) {
    bool is_self = continuation_trans_ == trans;
    if (is_self)
      trans = nullptr;

    if ((is_self && disarmed) || continuation_trans_->DisarmInShard(sid)) {
      if (bool concludes = run(continuation_trans_, true); concludes) {
        continuation_trans_ = nullptr;
        continuation_debug_id_.clear();
      } else {
        continuation_debug_id_ = continuation_trans_->DebugId(sid);
      }
    }
  }

  // Progress on the transaction queue if no transaction is running currently.
  Transaction* head = nullptr;

  while (continuation_trans_ == nullptr && !txq_.Empty()) {
    head = get<Transaction*>(txq_.Front());

    // Break if there are any awakened transactions, as we must give way to them
    // before continuing to handle regular transactions from the queue.
    if (head->GetNamespace().GetBlockingController(shard_id_) &&
        head->GetNamespace().GetBlockingController(shard_id_)->HasAwakedTransaction())
      break;

    VLOG(2) << "Considering head " << head->DebugId()
            << " isarmed: " << head->DEBUG_IsArmedInShard(sid);

    // If the transaction isn't armed yet, it will be handled by a successive poll
    bool should_run = (head == trans && disarmed) || head->DisarmInShard(sid);
    if (!should_run)
      break;

    // Avoid processing the caller transaction below if we found it in the queue,
    // because it most likely won't have enough time to arm itself again.
    if (head == trans)
      trans = nullptr;

    TxId txid = head->txid();

    // Update commited_txid before running, because RunInShard might block on i/o.
    // This way scheduling transactions won't see an understated value.
    DCHECK_LT(committed_txid_, txid);  //  strictly increasing when processed via txq
    committed_txid_ = txid;

    DCHECK(!continuation_trans_);  // while() check above ensures this.
    if (bool concludes = run(head, true); !concludes) {
      DCHECK_EQ(head->DEBUG_GetTxqPosInShard(sid), TxQueue::kEnd) << head->DebugId(sid);
      continuation_trans_ = head;
      continuation_debug_id_ = head->DebugId(sid);
    }
  }

  // If we disarmed, but didn't find ourselves in the loop, run now.
  if (trans && disarmed) {
    // if WAS_SUSPENDED is true but not AWAKED_Q, it means the transaction was awaked
    // in another thread and this one just follows along.
    DCHECK(trans_mask & (Transaction::OUT_OF_ORDER | Transaction::WAS_SUSPENDED));
    CHECK(trans != continuation_trans_);

    bool is_ooo = trans_mask & Transaction::OUT_OF_ORDER;

    // For OOO transactions that are still in the queue, we can not remove them unless
    // they conclude.
    bool concludes = run(trans, !is_ooo);
    if (is_ooo && concludes) {
      stats_.tx_ooo_total++;
    }

    // If the transaction concluded, it must remove itself from the tx queue.
    // Otherwise it is required to stay there to keep the relative order.
    if (!concludes && is_ooo) {
      LOG_IF(DFATAL, trans->DEBUG_GetTxqPosInShard(sid) == TxQueue::kEnd);
    }
  }
  --poll_concurrent_factor_;
  if (update_stats) {
    CacheStats();
  }
}

void EngineShard::RemoveContTx(Transaction* tx) {
  if (continuation_trans_ == tx) {
    continuation_trans_ = nullptr;
    continuation_debug_id_.clear();
  }
}

void EngineShard::Heartbeat() {
  DVLOG(3) << " Hearbeat";
  DCHECK(namespaces);

  CacheStats();

  // TODO: iterate over all namespaces
  DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());

  // Skip heartbeat if global transaction is in process.
  // This is determined by attempting to check if shard lock can be acquired.
  const bool can_acquire_global_lock = shard_lock()->Check(IntentLock::Mode::EXCLUSIVE);

  if (db_slice.WillBlockOnJournalWrite() || !can_acquire_global_lock) {
    uint64_t now = absl::GetCurrentTimeNanos();

    uint64_t elapsed_ms = (now - stalled_start_ns_) / 1000000;

    if (stalled_start_ns_ && elapsed_ms > 1000) {
      LOG_EVERY_T(WARNING, 5) << "Stalled heartbeat() fiber for " << elapsed_ms / 1000
                              << " seconds";
    }
    stalled_start_ns_ = now;
    return;
  }
  stalled_start_ns_ = 0;

  thread_local bool check_huffman = (shard_id_ == 0);  // run it only on shard 0.
  if (check_huffman) {
    auto* ptr = db_slice.GetDBTable(0);
    if (ptr) {
      size_t key_usage = ptr->stats.memory_usage_by_type[OBJ_KEY];
      size_t obj_usage = ptr->stats.obj_memory_usage;

#ifdef NDEBUG
#define MB_THRESHOLD (50 * 1024 * 1024)
#else
#define MB_THRESHOLD (5 * 1024 * 1024)
#endif

      if (key_usage > MB_THRESHOLD && key_usage > obj_usage / 8) {
        VLOG(1) << "Scheduling huffman check task, key usage: " << key_usage
                << ", obj usage: " << obj_usage;

        check_huffman = false;  // trigger only once.

        // launch the task
        huffman_check_task_id_ =
            ProactorBase::me()->AddOnIdleTask([task = HuffmanCheckTask{}]() mutable {
              if (!shard_ || !namespaces) {
                return -1;
              }

              DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_->shard_id());
              return task.Run(&db_slice);
            });
      }
    }
  }

  if (!IsReplica()) {  // Never run expiry/evictions on replica.
    RetireExpiredAndEvict();
  }

  if (tiered_storage_ && tiered_storage_->ShouldOffload()) {
    VLOG(1) << "Running Offloading, memory=" << db_slice.memory_budget()
            << ", cool memory: " << tiered_storage_->CoolMemoryUsage();

    for (unsigned i = 0; i < db_slice.db_array_size(); ++i) {
      if (!db_slice.IsDbValid(i))
        continue;
      tiered_storage_->RunOffloading(i);
    }
  }
}

void EngineShard::RetireExpiredAndEvict() {
  // Disable flush journal changes to prevent preemtion
  journal::DisableFlushGuard journal_flush_guard(journal_);

  // TODO: iterate over all namespaces
  DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());
  constexpr double kTtlDeleteLimit = 200;

  uint32_t traversed = GetMovingSum6(TTL_TRAVERSE);
  uint32_t deleted = GetMovingSum6(TTL_DELETE);
  unsigned ttl_delete_target = 5;

  if (deleted > 10) {
    // deleted should be <= traversed.
    // hence we map our delete/traversed ratio into a range [0, kTtlDeleteLimit).
    // The higher ttl_delete_target the more likely we have lots of expired items that need
    // to be deleted.
    ttl_delete_target = unsigned(kTtlDeleteLimit * double(deleted) / (double(traversed) + 10));
  }

  DbContext db_cntx;
  db_cntx.time_now_ms = GetCurrentTimeMs();

  size_t deleted_bytes = 0;
  size_t eviction_goal = GetFlag(FLAGS_enable_heartbeat_eviction) ? CalculateEvictionBytes() : 0;

  for (unsigned i = 0; i < db_slice.db_array_size(); ++i) {
    if (!db_slice.IsDbValid(i))
      continue;

    db_cntx.db_index = i;
    auto [pt, _unused_expt] = db_slice.GetTables(i);
    uint64_t expire_count = db_slice.GetDBTable(i)->stats.expire_count;
    if (expire_count > 0) {
      // Scale traversal count to compensate for TTL key dilution in the prime table.
      // Since we now scan the prime table (not a dedicated expire table), most entries
      // may not have TTLs. We need more bucket traversals to check the same number of
      // TTL keys, but cap to avoid excessive work when TTL keys are extremely sparse.
      unsigned db_ttl_delete_target = ttl_delete_target;

      if (pt->size() >= expire_count * 2) {
        unsigned ratio = std::min(pt->size() / expire_count, 7UL);
        db_ttl_delete_target = ttl_delete_target * ratio;
      }
      DbSlice::DeleteExpiredStats stats = db_slice.DeleteExpiredStep(db_cntx, db_ttl_delete_target);

      deleted_bytes += stats.deleted_bytes;
      eviction_goal -= std::min(eviction_goal, size_t(stats.deleted_bytes));
      counter_[TTL_TRAVERSE].IncBy(stats.traversed);
      counter_[TTL_DELETE].IncBy(stats.deleted);
      stats_.total_heartbeat_expired_keys += stats.deleted;
      stats_.total_heartbeat_expired_bytes += stats.deleted_bytes;
      ++stats_.total_heartbeat_expired_calls;
      VLOG(2) << "Heartbeat expired " << stats.deleted << " keys with total bytes "
              << stats.deleted_bytes << " with total expire flow calls "
              << stats_.total_heartbeat_expired_calls;
    }

    if (eviction_goal) {
      uint32_t starting_segment_id = rand() % pt->GetSegmentCount();
      auto [evicted_items, evicted_bytes] =
          db_slice.FreeMemWithEvictionStepAtomic(i, db_cntx, starting_segment_id, eviction_goal);

      VLOG(2) << "Heartbeat eviction: Expected to evict " << eviction_goal
              << " bytes. Actually evicted " << evicted_items << " items, " << evicted_bytes
              << " bytes. Max eviction per heartbeat: "
              << GetFlag(FLAGS_max_eviction_per_heartbeat);

      deleted_bytes += evicted_bytes;
      eviction_goal -= std::min(eviction_goal, evicted_bytes);
    }
  }

  // Track deleted bytes only if we expect to lower memory
  if (eviction_state_.track_deleted_bytes) {
    eviction_state_.deleted_bytes_at_prev_eviction = deleted_bytes;
  }
}

// Adjust deleted bytes w.r.t shard used memory. If we increase shard used
// memory in current heartbeat we can invalidate deleted_bytes. Otherwise we adjust deleted
// bytes by diff.
void EngineShard::EvictionTaskState::AdjustDeletedBytes(size_t shard_used_memory) {
  if (shard_used_memory >= shard_used_memory_at_prev_eviction) {
    deleted_bytes_at_prev_eviction = 0;
  } else {
    deleted_bytes_at_prev_eviction = std::min(
        deleted_bytes_at_prev_eviction, shard_used_memory_at_prev_eviction - shard_used_memory);
  }
}

// Check if adding value of previous deleted bytes will be higher than rss memory budget and
// limit if needed.
void EngineShard::EvictionTaskState::LimitAccumulatedDeletedBytes(
    size_t shard_rss_over_memory_budget) {
  const size_t next_acc_deleted_bytes =
      acc_deleted_bytes_during_eviction + deleted_bytes_at_prev_eviction;
  acc_deleted_bytes_during_eviction = shard_rss_over_memory_budget > next_acc_deleted_bytes
                                          ? next_acc_deleted_bytes
                                          : shard_rss_over_memory_budget;
}

// Once the rss memory is lowered we can start also decreasing accumulated total bytes.
void EngineShard::EvictionTaskState::AdjustAccumulatedDeletedBytes(size_t global_used_rss_memory) {
  if (global_used_rss_memory < global_rss_memory_at_prev_eviction) {
    auto decrease_delete_bytes_before_rss_update =
        std::min(acc_deleted_bytes_during_eviction,
                 (global_rss_memory_at_prev_eviction - global_used_rss_memory) / shard_set->size());
    VLOG(2) << "deleted_bytes_before_rss_update: " << acc_deleted_bytes_during_eviction
            << " decrease_delete_bytes_before_rss_update: "
            << decrease_delete_bytes_before_rss_update;
    acc_deleted_bytes_during_eviction -= decrease_delete_bytes_before_rss_update;
  }
  LOG_IF(DFATAL, global_used_rss_memory < (acc_deleted_bytes_during_eviction * shard_set->size()))
      << "RSS eviction underflow "
      << "global_used_rss_memory: " << global_used_rss_memory
      << " total_deleted_bytes_on_eviction: " << acc_deleted_bytes_during_eviction;
}

size_t EngineShard::CalculateEvictionBytes() {
  const size_t shards_count = shard_set->size();
  const double eviction_memory_budget_threshold = GetFlag(FLAGS_eviction_memory_budget_threshold);

  // Calculate threshold for both used_memory and rss_memory
  const size_t limit = max_memory_limit.load(memory_order_relaxed);
  const size_t shard_memory_budget_threshold =
      size_t(limit * eviction_memory_budget_threshold) / shards_count;

  const size_t global_used_memory = used_mem_current.load(memory_order_relaxed);

  // Calculate how many bytes we need to evict on this shard
  size_t goal_bytes =
      CalculateHowManyBytesToEvictOnShard(limit, global_used_memory, shard_memory_budget_threshold);

  VLOG_IF(2, goal_bytes > 0) << "Used memory goal bytes: " << goal_bytes
                             << ", used memory: " << global_used_memory
                             << ", memory limit: " << max_memory_limit;

  // Check for `enable_heartbeat_rss_eviction` flag since it dynamic. And reset
  // state if flag has changed.
  bool rss_eviction_enabled_flag = GetFlag(FLAGS_enable_heartbeat_rss_eviction);
  if (eviction_state_.rss_eviction_enabled != rss_eviction_enabled_flag) {
    eviction_state_.Reset(rss_eviction_enabled_flag);
  }
  if (eviction_state_.rss_eviction_enabled) {
    const size_t global_used_rss_memory = rss_mem_current.load(memory_order_relaxed);
    const size_t rss_memory_threshold_start = limit * (1. - eviction_memory_budget_threshold);
    const size_t shard_used_memory = UsedMemory();

    // Adjust previous deleted bytes
    eviction_state_.AdjustDeletedBytes(shard_used_memory);

    // Calculate memory budget that is higher than rss_memory_threshold_start. This is our limit
    // for accumulated_deleted_bytes.
    const size_t shard_rss_over_memory_budget =
        global_used_rss_memory > rss_memory_threshold_start
            ? (global_used_rss_memory - rss_memory_threshold_start) / shards_count
            : 0;
    eviction_state_.LimitAccumulatedDeletedBytes(shard_rss_over_memory_budget);

    // Once the rss memory is lowered we can start also decreasing accumulated total bytes.
    eviction_state_.AdjustAccumulatedDeletedBytes(global_used_rss_memory);

    // Update rss/used memory for this heartbeat
    eviction_state_.global_rss_memory_at_prev_eviction = global_used_rss_memory;
    eviction_state_.shard_used_memory_at_prev_eviction = shard_used_memory;

    // If we underflow use limit as used_memory
    size_t used_rss_memory_with_deleted_bytes = std::min(
        global_used_rss_memory - eviction_state_.acc_deleted_bytes_during_eviction * shards_count,
        limit);

    // Try to evict more bytes if we are close to the rss memory limit
    size_t rss_goal_bytes = CalculateHowManyBytesToEvictOnShard(
        limit, used_rss_memory_with_deleted_bytes, shard_memory_budget_threshold);

    // RSS evictions starts so we should start tracking deleted_bytes
    if (rss_goal_bytes) {
      eviction_state_.track_deleted_bytes = true;
    } else {
      // There is no RSS eviction goal and we have cleared tracked deleted bytes
      if (!eviction_state_.acc_deleted_bytes_during_eviction) {
        eviction_state_.track_deleted_bytes = false;
      }
    }

    VLOG_IF(2, rss_goal_bytes > 0)
        << "Rss memory goal bytes: " << rss_goal_bytes
        << ", rss used memory: " << global_used_rss_memory << ", rss memory limit: " << limit
        << ", accumulated_deleted_bytes_during_eviction: "
        << eviction_state_.acc_deleted_bytes_during_eviction;

    goal_bytes = std::max(goal_bytes, rss_goal_bytes);
  }

  return goal_bytes;
}

void EngineShard::CacheStats() {
  uint64_t now = fb2::ProactorBase::GetMonotonicTimeNs();
  if (last_mem_params_.updated_at + 1000000 > now)  // 1ms
    return;

  size_t used_mem = UsedMemory();
  DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());

  // Reflect local memory change on global value
  size_t delta = used_mem - last_mem_params_.used_mem;  // negative value wraps safely
  size_t current = used_mem_current.fetch_add(delta, memory_order_relaxed) + delta;
  ssize_t free_mem = max_memory_limit.load(memory_order_relaxed) - current;

  // Estimate bytes per object, excluding table memory
  size_t entries = db_slice.entries_count();
  size_t table_memory =
      db_slice.table_memory() + (tiered_storage_ ? tiered_storage_->CoolMemoryUsage() : 0);
  size_t obj_memory = table_memory <= used_mem ? used_mem - table_memory : 0;
  size_t bytes_per_obj = entries > 0 ? obj_memory / entries : 0;

  VLOG_EVERY_N(1, 500) << "Entries count " << entries << " "
                       << "obj_memory: " << obj_memory << ", bytes_per_obj: " << bytes_per_obj;

  db_slice.UpdateMemoryParams(free_mem / shard_set->size(), bytes_per_obj);
  last_mem_params_ = {now, used_mem};
}

size_t EngineShard::UsedMemory() const {
  return mi_resource_.used() + zmalloc_used_memory_tl + SmallString::UsedThreadLocal() +
         search_indices()->GetUsedMemory();
}

bool EngineShard::ShouldThrottleForTiering() const {
  // Throttle if the tiered storage is busy offloading (at least 30% of allowed capacity)
  return tiered_storage_ && tiered_storage_->WriteDepthUsage() > 0.3 &&
         tiered_storage_->ShouldOffload();
}

void EngineShard::FinalizeMulti(Transaction* tx) {
  if (continuation_trans_ == tx) {
    continuation_trans_ = nullptr;
  }

  // Wake only if no tx queue head is currently running
  auto* bc = tx->GetNamespace().GetBlockingController(shard_id());
  if (bc && continuation_trans_ == nullptr)
    bc->NotifyPending();

  PollExecution("unlockmulti", nullptr);
}

EngineShard::TxQueueInfo EngineShard::AnalyzeTxQueue() const {
  const TxQueue* queue = txq();

  ShardId sid = shard_id();
  TxQueueInfo info;

  if (queue->Empty())
    return info;

  auto cur = queue->Head();
  info.tx_total = queue->size();
  unsigned max_db_id = 0;

  auto& db_slice = namespaces->GetDefaultNamespace().GetCurrentDbSlice();

  {
    auto value = queue->At(cur);
    Transaction* trx = std::get<Transaction*>(value);
    info.head.debug_id_info = trx->DebugId(sid);
  }

  do {
    auto value = queue->At(cur);
    Transaction* trx = std::get<Transaction*>(value);
    // find maximum index of databases used by transactions
    if (trx->GetDbIndex() > max_db_id) {
      max_db_id = trx->GetDbIndex();
    }

    bool is_armed = trx->DEBUG_IsArmedInShard(sid);
    DVLOG(1) << "Inspecting " << trx->DebugId() << " is_armed " << is_armed;
    if (is_armed) {
      info.tx_armed++;

      if (trx->IsGlobal() || (trx->IsMulti() && trx->GetMultiMode() == Transaction::GLOBAL)) {
        info.tx_global++;
      } else {
        const DbTable* table = db_slice.GetDBTable(trx->GetDbIndex());
        bool can_run = !HasContendedLocks(sid, trx, table);
        if (can_run) {
          info.tx_runnable++;
        }
      }
    }
    cur = queue->Next(cur);
  } while (cur != queue->Head());

  // Analyze locks
  for (unsigned i = 0; i <= max_db_id; ++i) {
    const DbTable* table = db_slice.GetDBTable(i);
    if (table == nullptr)
      continue;

    info.total_locks += table->trans_locks.Size();
    for (const auto& [key, lock] : table->trans_locks) {
      if (lock.IsContended()) {
        info.contended_locks++;
        if (lock.ContentionScore() > info.max_contention_score) {
          info.max_contention_score = lock.ContentionScore();
          info.max_contention_lock = key;
        }
      }
    }
  }

  return info;
}

size_t EngineShard::CompactTable(double threshold, DbIndex db_idx) {
  DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());
  auto& prime = db_slice.GetDBTable(db_idx)->prime;
  size_t total_seg_merged = 0;

  while (true) {
    bool merged_any = false;
    // Prompt GetSegmentCount() each iteration to handle directory resizes across preemptions
    for (size_t seg_id = 0; seg_id < prime.GetSegmentCount(); seg_id = prime.NextSeg(seg_id)) {
      if (SliceSnapshot::IsSnaphotInProgress()) {
        return total_seg_merged;
      }
      // Fetch segment pointer fresh each iteration
      auto* seg = prime.GetSegment(seg_id);

      unsigned buddy_id = prime.FindBuddyId(seg_id);
      if (buddy_id == seg_id)
        continue;

      if (seg_id > buddy_id)
        continue;

      auto* buddy = prime.GetSegment(buddy_id);

      const size_t combined = seg->SlowSize() + buddy->SlowSize();
      const size_t max_size = threshold * seg->capacity();

      if (combined > max_size)
        continue;

      if (prime.Merge(seg_id, buddy_id)) {
        ++total_seg_merged;
        merged_any = true;
      }

      // Yield after merge (don't hold pointers across yield)
      util::ThisFiber::Yield();
    }

    if (!merged_any)
      break;
  }

  return total_seg_merged;
}

}  // namespace dfly


================================================
FILE: src/server/engine_shard.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "core/intent_lock.h"
#include "core/mi_memory_resource.h"
#include "core/page_usage/page_usage_stats.h"
#include "core/task_queue.h"
#include "core/tx_queue.h"
#include "server/common_types.h"
#include "util/sliding_counter.h"

typedef char* sds;

namespace dfly {

class EngineShardSet;
class TieredStorage;
class ShardDocIndices;

class EngineShard {
  friend class EngineShardSet;

 public:
  struct Stats {
    uint64_t defrag_attempt_total = 0;
    uint64_t defrag_realloc_total = 0;
    uint64_t defrag_task_invocation_total = 0;
    uint64_t defrag_skipped_mem_under_threshold = 0;
    uint64_t defrag_skipped_within_check_interval = 0;
    uint64_t defrag_skipped_not_enough_fragmentation = 0;
    uint64_t poll_execution_total = 0;

    // number of optimistic executions - that were run as part of the scheduling.
    uint64_t tx_optimistic_total = 0;
    uint64_t tx_ooo_total = 0;

    // Number of ScheduleBatchInShard calls.
    uint64_t tx_batch_schedule_calls_total = 0;

    // Number of transactions scheduled via ScheduleBatchInShard.
    uint64_t tx_batch_scheduled_items_total = 0;

    uint64_t total_heartbeat_expired_keys = 0;
    uint64_t total_heartbeat_expired_bytes = 0;
    uint64_t total_heartbeat_expired_calls = 0;

    // cluster stats
    uint64_t total_migrated_keys = 0;

    // how many huffman tables were built successfully in the background
    uint32_t huffman_tables_built = 0;

    // Stream access pattern metrics (per-command, not per-entry).
    uint64_t stream_sequential_accesses = 0;  // head/tail: XADD, XREAD recent, XTRIM, etc.
    uint64_t stream_random_accesses = 0;      // arbitrary-ID lookups: XRANGE partial, XDEL, XCLAIM
    uint64_t stream_fetch_all_accesses = 0;   // full stream scan from beginning

    Stats& operator+=(const Stats&);
  };

  // Sets up a new EngineShard in the thread.
  // If update_db_time is true, initializes periodic time update for its db_slice.
  static void InitThreadLocal(util::ProactorBase* pb);

  // Must be called after all InitThreadLocal() have finished
  void InitTieredStorage(util::ProactorBase* pb, size_t max_file_size);

  static void DestroyThreadLocal();

  static EngineShard* tlocal() {
    return shard_;
  }

  bool IsMyThread() const {
    return this == shard_;
  }

  ShardId shard_id() const {
    return shard_id_;
  }

  PMR_NS::memory_resource* memory_resource() {
    return &mi_resource_;
  }

  TaskQueue* GetFiberQueue() {
    return &queue_;
  }

  TaskQueue* GetSecondaryQueue() {
    return &queue2_;
  }

  // Processes TxQueue, blocked transactions or any other execution state related to that
  // shard. Tries executing the passed transaction if possible (does not guarantee though).
  void PollExecution(const char* context, Transaction* trans);

  // Returns transaction queue.
  TxQueue* txq() {
    return &txq_;
  }

  const TxQueue* txq() const {
    return &txq_;
  }

  TxId committed_txid() const {
    return committed_txid_;
  }

  // Signals whether shard-wide lock is active.
  // Transactions that conflict with shard locks must subscribe into pending queue.
  IntentLock* shard_lock() {
    return &shard_lock_;
  }

  // Remove current continuation trans if its equal to tx.
  void RemoveContTx(Transaction* tx);

  const Stats& stats() const {
    return stats_;
  }

  Stats& stats() {
    return stats_;
  }

  // Calculate memory used by shard by summing multiple sources
  size_t UsedMemory() const;

  TieredStorage* tiered_storage() {
    return tiered_storage_.get();
  }

  ShardDocIndices* search_indices() const {
    return shard_search_indices_.get();
  }

  // Moving average counters.
  enum MovingCnt : uint8_t { TTL_TRAVERSE, TTL_DELETE, COUNTER_TOTAL };

  // Returns moving sum over the last 6 seconds.
  uint32_t GetMovingSum6(MovingCnt type) const {
    return counter_[unsigned(type)].SumTail();
  }

  bool journal() const {
    return journal_;
  }

  void set_journal(bool enable) {
    journal_ = enable;
  }

  void SetReplica(bool replica) {
    is_replica_ = replica;
  }

  bool IsReplica() const {
    return is_replica_;
  }

  const Transaction* GetContTx() const {
    return continuation_trans_;
  }

  void StopPeriodicFiber();

  struct TxQueueItem {
    std::string debug_id_info;
  };

  struct TxQueueInfo {
    // Armed - those that the coordinator has armed with callbacks and wants them to run.
    // Runnable - those that could run (they own the locks) but probably can not run due
    // to head of line blocking in the transaction queue i.e. there is a transaction that
    // either is not armed or not runnable that is blocking the runnable transactions.
    // tx_total is the size of the transaction queue.
    unsigned tx_armed = 0, tx_total = 0, tx_runnable = 0, tx_global = 0;

    // total_locks - total number of the transaction locks in the shard.
    unsigned total_locks = 0;

    // contended_locks - number of locks that are contended by more than one transaction.
    unsigned contended_locks = 0;

    // The score of the lock with maximum contention (see IntentLock::ContetionScore for details).
    unsigned max_contention_score = 0;

    // the lock fingerprint with maximum contention score.
    uint64_t max_contention_lock;

    // We can use a vector to hold debug info for all items in the txqueue
    TxQueueItem head;

    std::string Format() const;
  };

  TxQueueInfo AnalyzeTxQueue() const;

  // Returns true if revelant write operations should throttle to wait for tiering to catch up.
  // The estimate is based on memory usage crossing tiering redline and the write depth being at
  // least 50% of allowed max, providing at least some guarantee of progress.
  bool ShouldThrottleForTiering() const;

  void FinalizeMulti(Transaction* tx);

  // Scan the shard with the cursor and apply defragmentation for database entries.
  // Returns collected page stats if defragmentation was performed.
  std::optional<CollectedPageStats> DoDefrag(PageUsage* page_usage);

  uint64_t GetDefragCursor() const {
    return defrag_state_.cursor;
  }

  // Return total segments merged.
  size_t CompactTable(double threshold, DbIndex db_idx);

 private:
  struct DefragTaskState {
    size_t dbid = 0u;
    uint64_t cursor = 0u;
    time_t last_check_time = 0;
    float page_utilization_threshold = 0.8;

    enum class SkipReason : uint8_t {
      MemoryTooLow,
      MemoryBelowThreshold,
      CheckWithinInterval,
      NotEnoughFragmentation,
      CheckInProgress,
      NotSkipped,
    };

    // check the current threshold and return a reason if we skip the defragmentation
    SkipReason CheckRequired();

    void UpdateScanState(uint64_t cursor_val);

    void ResetScanState();
  };

  struct EvictionTaskState {
    void Reset(bool rss_eviction_enabled_flag) {
      rss_eviction_enabled = rss_eviction_enabled_flag;
      shard_used_memory_at_prev_eviction = global_rss_memory_at_prev_eviction =
          acc_deleted_bytes_during_eviction = deleted_bytes_at_prev_eviction = 0;
    }
    void AdjustDeletedBytes(size_t shard_used_memory);
    void LimitAccumulatedDeletedBytes(size_t shard_rss_over_memory_budget);
    void AdjustAccumulatedDeletedBytes(size_t global_used_rss_memory);
    bool rss_eviction_enabled = true;
    bool track_deleted_bytes = false;
    size_t acc_deleted_bytes_during_eviction = 0;  // Accumulated deleted bytes during eviction
    size_t deleted_bytes_at_prev_eviction = 0;     // Bytes deleted in previous eviction
    size_t shard_used_memory_at_prev_eviction = 0;
    size_t global_rss_memory_at_prev_eviction = 0;
  };

  EngineShard(util::ProactorBase* pb, mi_heap_t* heap);

  // blocks the calling fiber.
  void Shutdown();  // called before destructing EngineShard.

  void StartPeriodicHeartbeatFiber(util::ProactorBase* pb);
  void StartPeriodicShardHandlerFiber(util::ProactorBase* pb, std::function<void()> shard_handler);

  void Heartbeat();
  void RetireExpiredAndEvict();

  /* Calculates the number of bytes to evict based on memory and rss memory usage. */
  size_t CalculateEvictionBytes();

  void CacheStats();

  // We are running a task that checks whether we need to
  // do memory de-fragmentation here, this task only run
  // when there are available CPU time.
  // --------------------------------------------------------------------------
  // NOTE: This task is running with exclusive access to the shard.
  // i.e. - Since we are using shared noting access here, and all access
  // are done using fibers, This fiber is run only when no other fiber in the
  // context of the controlling thread will access this shard!
  // --------------------------------------------------------------------------
  uint32_t DefragTask();

  TxQueue txq_;
  TaskQueue queue_, queue2_;

  ShardId shard_id_;
  Stats stats_;

  // Become passive if replica: don't automatially evict expired items.
  bool is_replica_ = false;
  bool journal_ = false;

  // Precise tracking of used memory by persistent shard local values and structures
  MiMemoryResource mi_resource_;

  struct {
    uint64_t updated_at = 0;  // from GetMonotonicTimeNs
    size_t used_mem = 0;
  } last_mem_params_;

  // Logical ts used to order distributed transactions.
  TxId committed_txid_ = 0;
  Transaction* continuation_trans_ = nullptr;
  std::string continuation_debug_id_;
  unsigned poll_concurrent_factor_ = 0;

  IntentLock shard_lock_;

  uint32_t defrag_task_id_ = UINT32_MAX, huffman_check_task_id_ = UINT32_MAX;
  EvictionTaskState eviction_state_;  // Used on eviction fiber
  util::fb2::Fiber fiber_heartbeat_periodic_;
  util::fb2::Done fiber_heartbeat_periodic_done_;

  util::fb2::Fiber fiber_shard_handler_periodic_;
  util::fb2::Done fiber_shard_handler_periodic_done_;

  DefragTaskState defrag_state_;
  std::unique_ptr<TieredStorage> tiered_storage_;
  // TODO: Move indices to Namespace
  std::unique_ptr<ShardDocIndices> shard_search_indices_;
  uint64_t stalled_start_ns_ = 0;
  using Counter = util::SlidingCounter<7>;

  Counter counter_[COUNTER_TOTAL];

  static __thread EngineShard* shard_;
};

}  // namespace dfly


================================================
FILE: src/server/engine_shard_set.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/engine_shard_set.h"

#include <sys/statvfs.h>

#include <filesystem>

#include "base/flags.h"
#include "base/logging.h"
#include "server/common.h"
#include "server/db_slice.h"
#include "server/namespaces.h"
#include "server/tiered_storage.h"
#include "strings/human_readable.h"

using namespace std;

ABSL_FLAG(bool, cache_mode, false,
          "If true, the backend behaves like a cache, "
          "by evicting entries when getting close to maxmemory limit");

ABSL_FLAG(strings::MemoryBytesFlag, tiered_max_file_size, strings::MemoryBytesFlag{},
          "Limit on maximum file size that is used by the database for tiered storage. "
          "0 - means the program will automatically determine its maximum file size. "
          "default: 0");

ABSL_DECLARE_FLAG(string, tiered_prefix);

namespace dfly {

using namespace tiering::literals;

using namespace util;
using absl::GetFlag;
using strings::HumanReadableNumBytes;

namespace {

uint64_t GetFsLimit() {
  std::filesystem::path file_path(GetFlag(FLAGS_tiered_prefix));
  std::string dir_name_str = file_path.parent_path().string();

  if (dir_name_str.empty())
    dir_name_str = ".";

  struct statvfs stat;
  if (statvfs(dir_name_str.c_str(), &stat) == 0) {
    uint64_t limit = stat.f_frsize * stat.f_blocks;
    return limit;
  }
  LOG(WARNING) << "Error getting filesystem information " << errno;
  return 0;
}

size_t GetTieredFileLimit(size_t threads) {
  string file_prefix = GetFlag(FLAGS_tiered_prefix);
  if (file_prefix.empty())
    return 0;

  size_t max_shard_file_size = 0;

  size_t max_file_size = absl::GetFlag(FLAGS_tiered_max_file_size).value;
  size_t max_file_size_limit = GetFsLimit();
  if (max_file_size == 0) {
    LOG(INFO) << "max_file_size has not been specified. Deciding myself....";
    max_file_size = (max_file_size_limit * 0.8);
  } else {
    if (max_file_size_limit < max_file_size) {
      LOG(WARNING) << "Got max file size " << HumanReadableNumBytes(max_file_size)
                   << ", however only " << HumanReadableNumBytes(max_file_size_limit)
                   << " disk space was found.";
    }
  }

  max_shard_file_size = max_file_size / threads;
  if (max_shard_file_size < 256_MB) {
    LOG(ERROR) << "Max tiering file size is too small. Setting: "
               << HumanReadableNumBytes(max_file_size) << " Required at least "
               << HumanReadableNumBytes(256_MB * threads) << ". Exiting..";
    exit(1);
  }
  LOG(INFO) << "Max file size is: " << HumanReadableNumBytes(max_file_size);

  return max_shard_file_size;
}

}  // namespace

/**


  _____                _               ____   _                      _  ____         _
 | ____| _ __    __ _ (_) _ __    ___ / ___| | |__    __ _  _ __  __| |/ ___|   ___ | |_
 |  _|  | '_ \  / _` || || '_ \  / _ \\___ \ | '_ \  / _` || '__|/ _` |\___ \  / _ \| __|
 | |___ | | | || (_| || || | | ||  __/ ___) || | | || (_| || |  | (_| | ___) ||  __/| |_
 |_____||_| |_| \__, ||_||_| |_| \___||____/ |_| |_| \__,_||_|   \__,_||____/  \___| \__|
                |___/

 */

EngineShardSet* shard_set = nullptr;

void EngineShardSet::Init(uint32_t sz, std::function<void()> shard_handler) {
  CHECK_EQ(0u, size());
  CHECK(namespaces == nullptr);

  shards_.reset(new EngineShard*[sz]);

  size_ = sz;
  size_t max_shard_file_size = GetTieredFileLimit(sz);
  pp_->AwaitFiberOnAll([this](uint32_t index, ProactorBase* pb) {
    if (index < size_) {
      InitThreadLocal(pb);
    }
  });

  // The order is important here. We must initialize namespaces after shards_.
  namespaces = new Namespaces();

  pp_->AwaitFiberOnAll([&](uint32_t index, ProactorBase* pb) {
    if (index < size_) {
      auto* shard = EngineShard::tlocal();
      shard->InitTieredStorage(pb, max_shard_file_size);

      // Must be last, as it accesses objects initialized above.
      // We can not move shard_handler because this code is called multiple times.
      shard->StartPeriodicHeartbeatFiber(pb);
      shard->StartPeriodicShardHandlerFiber(pb, shard_handler);
    }
  });
}

void EngineShardSet::PreShutdown() {
  RunBlockingInParallel([](EngineShard* shard) {
    shard->StopPeriodicFiber();

    // We must close tiered_storage before we destroy namespaces that own db slices.
    if (shard->tiered_storage()) {
      shard->tiered_storage()->Close();
    }
  });
}

void EngineShardSet::Shutdown() {
  // Calling Namespaces::Clear before destroying engine shards, because it accesses them
  // internally.
  namespaces->Clear();
  RunBlockingInParallel([](EngineShard*) { EngineShard::DestroyThreadLocal(); });

  delete namespaces;
  namespaces = nullptr;
}

void EngineShardSet::InitThreadLocal(ProactorBase* pb) {
  EngineShard::InitThreadLocal(pb);
  EngineShard* es = EngineShard::tlocal();
  shards_[es->shard_id()] = es;
}

void EngineShardSet::TEST_EnableCacheMode() {
  RunBlockingInParallel([](EngineShard* shard) {
    namespaces->GetDefaultNamespace().GetCurrentDbSlice().TEST_EnableCacheMode();
  });
}

}  // namespace dfly


================================================
FILE: src/server/engine_shard_set.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "server/engine_shard.h"
#include "util/proactor_pool.h"

namespace dfly {

class TieredStorage;
class ShardDocIndices;
class BlockingController;
class EngineShardSet;

class EngineShardSet {
 public:
  struct CachedStats {
    std::atomic_uint64_t used_memory;

    CachedStats() : used_memory(0) {
    }

    CachedStats(const CachedStats& o) : used_memory(o.used_memory.load()) {
    }
  };

  explicit EngineShardSet(util::ProactorPool* pp) : pp_(pp) {
  }

  uint32_t size() const {
    return size_;
  }

  util::ProactorPool* pool() {
    return pp_;
  }

  void Init(uint32_t size, std::function<void()> shard_handler);

  // Shutdown sequence:
  // - EngineShardSet.PreShutDown()
  // - Namespaces.Clear()
  // - EngineShardSet.Shutdown()
  void PreShutdown();
  void Shutdown();

  // Uses a shard queue to dispatch. Callback runs in a dedicated fiber.
  template <typename F> auto Await(ShardId sid, F&& f) {
    return shards_[sid]->GetFiberQueue()->Await(std::forward<F>(f));
  }

  // Uses a shard queue to dispatch. Callback runs in a dedicated fiber.
  template <typename F> auto Add(ShardId sid, F&& f) {
    assert(sid < size_);
    return shards_[sid]->GetFiberQueue()->Add(std::forward<F>(f));
  }

  template <typename F> auto AddL2(ShardId sid, F&& f) {
    return shards_[sid]->GetSecondaryQueue()->Add(std::forward<F>(f));
  }

  // Runs a brief function on all shards. Waits for it to complete.
  // `func` must not preempt.
  template <typename U> void RunBriefInParallel(U&& func) const {
    RunBriefInParallel(std::forward<U>(func), [](auto i) { return true; });
  }

  // Runs a brief function on selected shards. Waits for it to complete.
  // `func` must not preempt.
  template <typename U, typename P> void RunBriefInParallel(U&& func, P&& pred) const;

  // Runs a possibly blocking function on all shards. Waits for it to complete.
  template <typename U> void RunBlockingInParallel(U&& func) {
    RunBlockingInParallel(std::forward<U>(func), [](auto i) { return true; });
  }

  // Runs a possibly blocking function on selected shards. Waits for it to complete.
  template <typename U, typename P> void RunBlockingInParallel(U&& func, P&& pred);

  // Runs func on all shards via the same shard queue that's been used by transactions framework.
  // The functions running inside the shard queue run atomically (sequentially)
  // with respect each other on the same shard.
  template <typename U> void AwaitRunningOnShardQueue(U&& func) {
    util::fb2::BlockingCounter bc(size_);
    for (size_t i = 0; i < size_; ++i) {
      Add(i, [&func, bc]() mutable {
        func(EngineShard::tlocal());
        bc->Dec();
      });
    }

    bc->Wait();
  }

  // Used in tests
  void TEST_EnableCacheMode();

 private:
  void InitThreadLocal(util::ProactorBase* pb);
  util::ProactorPool* pp_;
  std::unique_ptr<EngineShard*[]> shards_;
  uint32_t size_ = 0;
};

template <typename U, typename P>
void EngineShardSet::RunBriefInParallel(U&& func, P&& pred) const {
  util::fb2::BlockingCounter bc{0};

  for (uint32_t i = 0; i < size(); ++i) {
    if (!pred(i))
      continue;

    bc->Add(1);
    util::ProactorBase* dest = pp_->at(i);
    dest->DispatchBrief([&func, bc]() mutable {
      func(EngineShard::tlocal());
      bc->Dec();
    });
  }
  bc->Wait();
}

template <typename U, typename P> void EngineShardSet::RunBlockingInParallel(U&& func, P&& pred) {
  util::fb2::BlockingCounter bc{0};
  static_assert(std::is_invocable_v<U, EngineShard*>,
                "Argument must be invocable EngineShard* as argument.");
  static_assert(std::is_void_v<std::invoke_result_t<U, EngineShard*>>,
                "Callable must not have a return value!");

  for (uint32_t i = 0; i < size(); ++i) {
    if (!pred(i))
      continue;

    bc->Add(1);
    util::ProactorBase* dest = pp_->at(i);

    // the "Dispatch" call spawns a fiber underneath.
    dest->Dispatch([&func, bc]() mutable {
      func(EngineShard::tlocal());
      bc->Dec();
    });
  }
  bc->Wait();
}

ShardId Shard(std::string_view v, ShardId shard_num);

// absl::GetCurrentTimeNanos is twice faster than clock_gettime(CLOCK_REALTIME) on my laptop
// and 4 times faster than on a VM. it takes 5-10ns to do a call.

extern uint64_t TEST_current_time_ms;

inline uint64_t GetCurrentTimeMs() {
  return TEST_current_time_ms ? TEST_current_time_ms : absl::GetCurrentTimeNanos() / 1000000;
}

inline uint64_t GetCurrentTimeNs() {
  return TEST_current_time_ms ? TEST_current_time_ms * 1000000 : absl::GetCurrentTimeNanos();
}

extern EngineShardSet* shard_set;

}  // namespace dfly


================================================
FILE: src/server/engine_shard_set_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/flags/reflection.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>
#include <absl/strings/strip.h>
#include <gmock/gmock.h>

#include <map>
#include <string>
#include <string_view>
#include <vector>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "server/main_service.h"
#include "server/test_utils.h"

ABSL_DECLARE_FLAG(std::string, shard_round_robin_prefix);

namespace dfly {
namespace {

using namespace std;
using testing::Contains;
using testing::Pair;

class RoundRobinSharderTest : public BaseFamilyTest {
 protected:
  RoundRobinSharderTest() {
    absl::SetFlag(&FLAGS_shard_round_robin_prefix, "RR:");
    SetTestFlag("cluster_mode", "emulated");
    ResetService();
  }
};

TEST_F(RoundRobinSharderTest, RoundRobinShard) {
  if (shard_set->size() < 2) {
    GTEST_SKIP() << "Can only test round robin with 2+ shards";
  }

  Run({"set", "{RR:key0}", "value"});
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 1)));  // shard 0 has 1 key
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(1, 0)));  // shard 1 has 0 keys

  Run({"set", "{RR:key1}", "value"});
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 1)));  // shard 0 has 1 key
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(1, 1)));  // shard 1 also has 1 key

  Run({"set", "{RR:key2}", "value"});
  if (shard_set->size() == 2) {
    EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 2)));
    EXPECT_THAT(GetShardKeyCount(), Contains(Pair(1, 1)));
  } else {
    EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 1)));
    EXPECT_THAT(GetShardKeyCount(), Contains(Pair(1, 1)));
    EXPECT_THAT(GetShardKeyCount(), Contains(Pair(2, 1)));
  }
}

}  // namespace
}  // namespace dfly


================================================
FILE: src/server/error.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/error.h"

#include <absl/strings/str_cat.h>

using namespace std;

namespace dfly {
namespace rdb {

class error_category : public std::error_category {
 public:
  const char* name() const noexcept final {
    return "dragonfly.rdbload";
  }

  string message(int ev) const final;

  error_condition default_error_condition(int ev) const noexcept final;

  bool equivalent(int ev, const error_condition& condition) const noexcept final {
    return condition.value() == ev && &condition.category() == this;
  }

  bool equivalent(const error_code& error, int ev) const noexcept final {
    return error.value() == ev && &error.category() == this;
  }
};

string error_category::message(int ev) const {
  switch (ev) {
    case errc::wrong_signature:
      return "Wrong signature while trying to load from rdb file";
    case errc::out_of_memory:
      return "Out of memory, or used memory is too high";
    case errc::incorrect_snapshot_id:
      return "Snapshot id mismatch";
    default:
      return absl::StrCat("Internal error when loading RDB file ", ev);
      break;
  }
}

error_condition error_category::default_error_condition(int ev) const noexcept {
  return error_condition{ev, *this};
}

static error_category rdb_category;

}  // namespace rdb

error_code RdbError(rdb::errc ev) {
  return error_code{static_cast<int>(ev), rdb::rdb_category};
}

}  // namespace dfly


================================================
FILE: src/server/error.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <atomic>
#include <system_error>

#include "facade/error.h"

namespace dfly {

using facade::kDbIndOutOfRangeErr;
using facade::kInvalidDbIndErr;
using facade::kInvalidIntErr;
using facade::kSyntaxErr;
using facade::kWrongTypeErr;

#ifndef RETURN_ON_ERR

#define RETURN_ON_ERR_T(T, x)                                          \
  do {                                                                 \
    std::error_code __ec = (x);                                        \
    if (__ec) {                                                        \
      DLOG(ERROR) << "Error while calling " #x ": " << __ec.message(); \
      return (T)(__ec);                                                \
    }                                                                  \
  } while (0)

#define RETURN_ON_ERR(x) RETURN_ON_ERR_T(std::error_code, x)

#define RETURN_ON_GENERIC_ERR(x)                                   \
  do {                                                             \
    if (x) {                                                       \
      DLOG(ERROR) << "Error while calling " #x ": " << x.Format(); \
      return x;                                                    \
    }                                                              \
  } while (0)

#endif  // RETURN_ON_ERR

#ifndef RETURN_ON_BAD_STATUS

#define RETURN_ON_BAD_STATUS(x)  \
  do {                           \
    OpStatus __s = (x).status(); \
    if (__s != OpStatus::OK) {   \
      return __s;                \
    }                            \
  } while (0)

#endif  // RETURN_ON_BAD_STATUS

#ifndef GET_OR_SEND_UNEXPECTED

#define GET_OR_SEND_UNEXPECTED(expr)        \
  ({                                        \
    auto expr_res = (expr);                 \
    if (!expr_res) {                        \
      builder->SendError(expr_res.error()); \
      return;                               \
    }                                       \
    std::move(expr_res).value();            \
  })

#endif  // GET_OR_SEND_UNEXPECTED

namespace rdb {

enum errc {
  wrong_signature = 1,
  bad_version = 2,
  feature_not_supported = 3,
  duplicate_key = 4,
  rdb_file_corrupted = 5,
  bad_checksum = 6,
  bad_db_index = 7,
  invalid_rdb_type = 8,
  invalid_encoding = 9,
  empty_key = 10,
  out_of_memory = 11,
  bad_json_string = 12,
  unsupported_operation = 13,
  value_expired = 14,  // applying to set and hmap
  incorrect_snapshot_id = 15,
};

}  // namespace rdb

std::error_code RdbError(rdb::errc ev);

}  // namespace dfly


================================================
FILE: src/server/execution_state.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/execution_state.h"

#include <absl/strings/str_cat.h>

#include "base/logging.h"

namespace dfly {

using namespace std;

GenericError::operator std::error_code() const {
  return ec_;
}

GenericError::operator bool() const {
  return bool(ec_) || !details_.empty();
}

std::string GenericError::Format() const {
  if (!ec_ && details_.empty())
    return "";

  if (details_.empty())
    return ec_.message();
  else if (!ec_)
    return details_;
  else
    return absl::StrCat(ec_.message(), ": ", details_);
}

ExecutionState::~ExecutionState() {
  DCHECK(!err_handler_fb_.IsJoinable());
  err_handler_fb_.JoinIfNeeded();
}

GenericError ExecutionState::GetError() const {
  std::lock_guard lk(err_mu_);
  return err_;
}

void ExecutionState::ReportCancelError() {
  ReportError(std::make_error_code(errc::operation_canceled), "ExecutionState cancelled");
}

void ExecutionState::Reset(ErrHandler handler) {
  util::fb2::Fiber fb;

  unique_lock lk{err_mu_};
  err_ = {};
  err_handler_ = std::move(handler);
  state_.store(State::RUN, std::memory_order_relaxed);
  fb.swap(err_handler_fb_);
  lk.unlock();
  fb.JoinIfNeeded();
}

GenericError ExecutionState::SwitchErrorHandler(ErrHandler handler) {
  std::lock_guard lk{err_mu_};
  if (!err_) {
    // No need to check for the error handler - it can't be running
    // if no error is set.
    err_handler_ = std::move(handler);
  }
  return err_;
}

void ExecutionState::JoinErrorHandler() {
  util::fb2::Fiber fb;
  unique_lock lk{err_mu_};
  fb.swap(err_handler_fb_);
  lk.unlock();
  fb.JoinIfNeeded();
}

GenericError ExecutionState::ReportErrorInternal(GenericError&& err) {
  if (IsCancelled()) {
    LOG_IF(INFO, err != errc::operation_canceled) << err.Format();
    return {};
  }
  lock_guard lk{err_mu_};
  if (err_)
    return err_;

  err_ = std::move(err);

  // This context is either new or was Reset, where the handler was joined
  CHECK(!err_handler_fb_.IsJoinable());

  LOG(WARNING) << "ReportError: " << err_.Format();

  // We can move err_handler_ because it should run at most once.
  if (err_handler_)
    err_handler_fb_ = util::fb2::Fiber("report_internal_error", std::move(err_handler_), err_);
  state_.store(State::ERROR, std::memory_order_relaxed);
  return err_;
}

}  // namespace dfly


================================================
FILE: src/server/execution_state.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <atomic>
#include <functional>
#include <mutex>
#include <string>
#include <system_error>

#include "facade/facade_types.h"
#include "util/fibers/fibers.h"
#include "util/fibers/synchronization.h"

namespace dfly {

// AggregateValue is a thread safe utility to store the first
// truthy value;
template <typename T> struct AggregateValue {
  bool operator=(T val) {
    std::lock_guard l{mu_};
    if (!bool(current_) && bool(val)) {
      current_ = val;
    }
    return bool(val);
  }

  T operator*() {
    std::lock_guard l{mu_};
    return current_;
  }

  operator bool() {
    return bool(**this);
  }

  // Move out of value without critical section. Safe only when no longer in use.
  T Destroy() && {
    return std::move(current_);
  }

 private:
  util::fb2::Mutex mu_{};
  T current_{};
};

// Thread safe utility to store the first non null error.
using AggregateError = AggregateValue<std::error_code>;

// Thread safe utility to store the first non OK status.
using AggregateStatus = AggregateValue<facade::OpStatus>;
static_assert(bool(facade::OpStatus::OK) == false,
              "Default initialization should be a falsy OK value");

// Error wrapper, that stores error_code and optional string message.
class GenericError {
 public:
  GenericError() = default;
  GenericError(std::error_code ec) : ec_{ec}, details_{} {
  }
  GenericError(std::string details) : ec_{}, details_{std::move(details)} {
  }
  GenericError(std::error_code ec, std::string details) : ec_{ec}, details_{std::move(details)} {
  }

  operator std::error_code() const;
  operator bool() const;

  std::string Format() const;  // Get string representation of error.

 private:
  std::error_code ec_;
  std::string details_;
};

// Thread safe utility to store the first non null generic error.
using AggregateGenericError = AggregateValue<GenericError>;

// ExecutionState is a thread-safe utility for managing error reporting and cancellation for complex
// tasks. There are 3 states: RUN, CANCELLED, ERROR RUN and CANCELLED are just a state without any
// actions When report an error, only the first is stored, the next ones will be ignored. Then a
// special error handler is run, if present, and the ExecutionState is ERROR. The error handler is
// run in a separate handler to free up the caller.
// If the state is CANCELLED all errors are ignored
//
// ReportCancelError() reporting an `errc::operation_canceled` error.
class ExecutionState {
 public:
  using ErrHandler = std::function<void(const GenericError&)>;

  ExecutionState() = default;
  ExecutionState(ErrHandler err_handler) : err_handler_{std::move(err_handler)} {
  }

  ~ExecutionState();

  // TODO Remove. This function was created to reduce size of the code that should be refactored
  // Cancel() method should be used instead of this function
  // Report a cancel error the context by submitting an `errc::operation_canceled` error.
  // If the state is CANCELLED does nothing
  void ReportCancelError();

  bool IsRunning() const {
    return state_.load(std::memory_order_relaxed) == State::RUN;
  }

  bool IsError() const {
    return state_.load(std::memory_order_relaxed) == State::ERROR;
  }

  bool IsCancelled() const {
    return state_.load(std::memory_order_relaxed) == State::CANCELLED;
  }

  void Cancel() {
    state_.store(State::CANCELLED, std::memory_order_relaxed);
  }

  GenericError GetError() const;

  // Report an error by submitting arguments for GenericError.
  // If this is the first error that occured, then the error handler is run
  // and the context state set to ERROR.
  // If the state is CANCELLED does nothing
  template <typename... T> GenericError ReportError(T&&... ts) {
    return ReportErrorInternal(GenericError{std::forward<T>(ts)...});
  }

  // Wait for error handler to stop, reset error and state, assign new error handler.
  void Reset(ErrHandler handler);

  // Atomically replace the error handler if no error is present, and return the
  // current stored error. This function can be used to transfer cleanup responsibility safely
  //
  // Beware, never do this manually in two steps. If you check the state,
  // set the error handler and initialize resources, then the new error handler
  // will never run if the context was cancelled between the first two steps.
  GenericError SwitchErrorHandler(ErrHandler handler);

  // If any error handler is running, wait for it to stop.
  void JoinErrorHandler();

 private:
  GenericError ReportErrorInternal(GenericError&& err);

  enum class State { RUN, CANCELLED, ERROR };
  std::atomic<State> state_{State::RUN};
  GenericError err_;
  ErrHandler err_handler_;
  util::fb2::Fiber err_handler_fb_;

  // We use regular mutexes to be able to call ReportError directly from I/O callbacks.
  mutable std::mutex err_mu_;  // protects err_ and err_handler_
};

}  // namespace dfly


================================================
FILE: src/server/family_utils.cc
================================================
#include "server/family_utils.h"

#include <absl/container/flat_hash_set.h>
#include <absl/strings/str_cat.h>
#include <xxhash.h>

#include "base/logging.h"

extern "C" {
#include "redis/listpack.h"
#include "redis/sds.h"
#include "redis/stream.h"
#include "redis/ziplist.h"
#include "redis/zmalloc.h"
}

namespace dfly {

using namespace std;

namespace {

struct ZiplistCbArgs {
  long count = 0;
  absl::flat_hash_set<string_view> fields;
  unsigned char** lp;
};

int ZiplistPairsEntryConvertAndValidate(unsigned char* p, unsigned int head_count, void* userdata) {
  unsigned char* str;
  unsigned int slen;
  long long vll;

  ZiplistCbArgs* data = (ZiplistCbArgs*)userdata;

  if (data->fields.empty()) {
    data->fields.reserve(head_count / 2);
  }

  if (!ziplistGet(p, &str, &slen, &vll))
    return 0;

  if (((data->count) & 1) == 0) {
    sds field = str ? sdsnewlen(str, slen) : sdsfromlonglong(vll);
    auto [_, inserted] = data->fields.emplace(field, sdslen(field));
    if (!inserted) {
      sdsfree(field);
      return 0;
    }
  }

  if (str) {
    *(data->lp) = lpAppend(*(data->lp), (unsigned char*)str, slen);
  } else {
    *(data->lp) = lpAppendInteger(*(data->lp), vll);
  }

  (data->count)++;
  return 1;
}

}  // namespace

string XXH3_Digest(std::string_view s) {
  uint64_t hash = XXH3_64bits(s.data(), s.size());
  return absl::StrCat(absl::Hex(hash, absl::kZeroPad16));
}

sds WrapSds(std::string_view s) {
  static thread_local sds tmp_sds = sdsempty();
  return tmp_sds = sdscpylen(tmp_sds, s.data(), s.length());
}

NonUniquePicksGenerator::NonUniquePicksGenerator(RandomPick max_range) : max_range_(max_range) {
  CHECK_GT(max_range, RandomPick(0));
}

RandomPick NonUniquePicksGenerator::Generate() {
  return absl::Uniform(bitgen_, 0u, max_range_);
}

UniquePicksGenerator::UniquePicksGenerator(std::uint32_t picks_count, RandomPick max_range)
    : remaining_picks_count_(picks_count), picked_indexes_(picks_count) {
  CHECK_GE(max_range, picks_count);
  current_random_limit_ = max_range - picks_count;
}

RandomPick UniquePicksGenerator::Generate() {
  DCHECK_GT(remaining_picks_count_, 0u);

  remaining_picks_count_--;

  const RandomPick max_index = current_random_limit_++;
  const RandomPick random_index = absl::Uniform(bitgen_, 0u, max_index + 1u);

  const bool random_index_is_picked = picked_indexes_.emplace(random_index).second;
  if (random_index_is_picked) {
    return random_index;
  }

  picked_indexes_.insert(max_index);
  return max_index;
}

streamConsumer* StreamCreateConsumer(streamCG* cg, string_view name, uint64_t now_ms, int flags) {
  DCHECK(cg);
  DCHECK(!name.empty());
  if (cg == NULL)
    return NULL;

  streamConsumer* consumer = (streamConsumer*)zmalloc(sizeof(*consumer));

  int success =
      raxTryInsert(cg->consumers, (unsigned char*)name.data(), name.size(), consumer, NULL);
  if (!success) {
    zfree(consumer);
    return NULL;
  }
  consumer->name = sdsnewlen(name.data(), name.size());
  consumer->pel = raxNew();
  consumer->seen_time = now_ms;
  consumer->active_time = -1;

  return consumer;
}

int ZiplistPairsConvertAndValidateIntegrity(const uint8_t* zl, size_t size, unsigned char** lp) {
  ZiplistCbArgs data;
  data.lp = lp;

  int ret = ziplistValidateIntegrity(const_cast<uint8_t*>(zl), size, 1,
                                     ZiplistPairsEntryConvertAndValidate, &data);

  if (data.count & 1)
    ret = 0;

  for (auto field : data.fields) {
    sdsfree((sds)field.data());
  }
  return ret;
}

}  // namespace dfly


================================================
FILE: src/server/family_utils.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_set.h>
#include <absl/random/random.h>

#include <cstdint>
#include <string>
#include <string_view>

#include "facade/facade_types.h"
#include "server/engine_shard.h"
#include "server/search/doc_index.h"
#include "server/table.h"

extern "C" {
#include "redis/sds.h"
}

typedef struct streamConsumer streamConsumer;
typedef struct streamCG streamCG;

namespace dfly {

// Compute XXH3 hash and return as 16-character hex string
std::string XXH3_Digest(std::string_view s);

template <typename DenseSet>
std::vector<long> ExpireElements(DenseSet* owner, facade::CmdArgList values, uint32_t ttl_sec);

// Copy str to thread local sds instance. Valid until next WrapSds call on thread
sds WrapSds(std::string_view str);

using RandomPick = uint32_t;

class PicksGenerator {
 public:
  virtual RandomPick Generate() = 0;
  virtual ~PicksGenerator() = default;
};

class NonUniquePicksGenerator : public PicksGenerator {
 public:
  /* The generated value will be within the closed-open interval [0, max_range) */
  NonUniquePicksGenerator(RandomPick max_range);

  RandomPick Generate() override;

 private:
  const RandomPick max_range_;
  absl::BitGen bitgen_{};
};

/*
 * Generates unique index in O(1).
 *
 * picks_count specifies the number of random indexes to be generated.
 * In other words, this is the number of times the Generate() function is called.
 *
 * The class uses Robert Floyd's sampling algorithm
 * https://dl.acm.org/doi/pdf/10.1145/30401.315746
 * */
class UniquePicksGenerator : public PicksGenerator {
 public:
  /* The generated value will be within the closed-open interval [0, max_range) */
  UniquePicksGenerator(uint32_t picks_count, RandomPick max_range);

  RandomPick Generate() override;

 private:
  RandomPick current_random_limit_;
  uint32_t remaining_picks_count_;
  absl::flat_hash_set<RandomPick> picked_indexes_;
  absl::BitGen bitgen_{};
};

streamConsumer* StreamCreateConsumer(streamCG* cg, std::string_view name, uint64_t now_ms,
                                     int flags);

/* Use these methods to add or remove documents from the indexes for generic commands when the key
 * being modified could potentially be of type HSET or JSON. */
void AddKeyToIndexesIfNeeded(std::string_view key, const DbContext& db_cntx, PrimeValue& pv,
                             EngineShard* shard);
void RemoveKeyFromIndexesIfNeeded(std::string_view key, const DbContext& db_cntx,
                                  const PrimeValue& pv, EngineShard* shard);

// Validate and convert field/value ziplist pairs into listpack.
// Returns 1 on success, 0 on integrity failure.
int ZiplistPairsConvertAndValidateIntegrity(const uint8_t* zl, size_t size, unsigned char** lp);

// Returns true if this key type could potentially be indexed.
// Or in other words, if the key is of type HSET or JSON.
bool IsIndexedKeyType(const PrimeValue& pv);

// Implementation
/******************************************************************/
template <typename DenseSet>
inline std::vector<long> ExpireElements(DenseSet* owner, facade::CmdArgList values,
                                        uint32_t ttl_sec) {
  std::vector<long> res;
  res.reserve(values.size());

  for (size_t i = 0; i < values.size(); i++) {
    std::string_view field = facade::ToSV(values[i]);
    auto it = owner->Find(field);
    if (it != owner->end()) {
      it.SetExpiryTime(ttl_sec);
      res.emplace_back(ttl_sec == 0 ? 0 : 1);
    } else {
      res.emplace_back(-2);
    }
  }

  return res;
}

inline void AddKeyToIndexesIfNeeded(std::string_view key, const DbContext& db_cntx, PrimeValue& pv,
                                    EngineShard* shard) {
  if (IsIndexedKeyType(pv)) {
    shard->search_indices()->AddDoc(key, db_cntx, &pv);
  }
}

inline void RemoveKeyFromIndexesIfNeeded(std::string_view key, const DbContext& db_cntx,
                                         const PrimeValue& pv, EngineShard* shard) {
  if (IsIndexedKeyType(pv)) {
    shard->search_indices()->RemoveDoc(key, db_cntx, pv);
  }
}

inline bool IsIndexedKeyType(const PrimeValue& pv) {
  return pv.ObjType() == OBJ_HASH || pv.ObjType() == OBJ_JSON;
}

}  // namespace dfly


================================================
FILE: src/server/generic_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/generic_family.h"

#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>

#include <optional>

#include "facade/cmd_arg_parser.h"
#include "facade/reply_builder.h"

extern "C" {
#include "redis/crc64.h"
}

#include "base/cycle_clock.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/glob_matcher.h"
#include "core/qlist.h"
#include "redis/rdb.h"
#include "server/acl/acl_commands_def.h"
#include "server/blocking_controller.h"
#include "server/cmd_support.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/hset_family.h"
#include "server/journal/journal.h"
#include "server/namespaces.h"
#include "server/rdb_extensions.h"
#include "server/rdb_load.h"
#include "server/rdb_save.h"
#include "server/search/doc_index.h"
#include "server/set_family.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"
#include "util/fibers/fibers.h"
#include "util/fibers/future.h"
#include "util/varz.h"

ABSL_FLAG(uint32_t, dbnum, 16, "Number of databases");
ABSL_FLAG(uint32_t, keys_output_limit, 8192, "Maximum number of keys output by keys command");
ABSL_FLAG(bool, unlink_experimental_async, true, "If true, runs unlink command asynchronously.");

namespace dfly {
using namespace std;
using namespace facade;

namespace {

constexpr uint32_t kMaxTtl = (1UL << 26);
constexpr size_t DUMP_FOOTER_SIZE = sizeof(uint64_t) + sizeof(uint16_t);  // version number and crc

std::optional<RdbVersion> GetRdbVersion(std::string_view msg, bool ignore_crc = false) {
  if (msg.size() <= DUMP_FOOTER_SIZE) {
    LOG(WARNING) << "got restore payload that is too short - " << msg.size();
    return std::nullopt;
  }

  // The footer looks like this: version (2 bytes) | crc64 (8 bytes)
  const std::uint8_t* footer =
      reinterpret_cast<const std::uint8_t*>(msg.data()) + (msg.size() - DUMP_FOOTER_SIZE);
  const RdbVersion version = (*(footer + 1) << 8 | (*footer));

  if (version > RDB_VERSION) {
    LOG(WARNING) << "got restore payload with illegal version - supporting version up to "
                 << RDB_VERSION << " got version " << version;
    return std::nullopt;
  }

  uint64_t expected_cs = absl::little_endian::Load64(footer + 2);  // skip the version

  if (!ignore_crc) {
    // Compute expected crc64 based on the actual data upto the expected crc64 field.
    uint64_t actual_cs =
        crc64(0, reinterpret_cast<const uint8_t*>(msg.data()), msg.size() - sizeof(uint64_t));

    if (actual_cs != expected_cs) {
      LOG(WARNING) << "CRC check failed for restore command, expecting: " << expected_cs << " got "
                   << actual_cs;
      return std::nullopt;
    }
  }

  return version;
}

template <typename It> int64_t GetExpireTime(const DbSlice& db_slice, const It& exp_it) {
  if (!IsValid(exp_it))
    return 0;

  return db_slice.ExpireTime(exp_it->second);
}

class InMemSource : public ::io::Source {
 public:
  explicit InMemSource(std::string_view buf) : buf_(buf) {
  }

  ::io::Result<size_t> ReadSome(const iovec* v, uint32_t len) final;

 protected:
  std::string_view buf_;
  off_t offs_ = 0;
};

::io::Result<size_t> InMemSource::ReadSome(const iovec* v, uint32_t len) {
  ssize_t read_total = 0;
  while (size_t(offs_) < buf_.size() && len > 0) {
    size_t read_sz = min<size_t>(buf_.size() - offs_, v->iov_len);
    memcpy(v->iov_base, buf_.data() + offs_, read_sz);
    read_total += read_sz;
    offs_ += read_sz;

    ++v;
    --len;
  }

  return read_total;
}

class RestoreArgs {
 private:
  static constexpr int64_t NO_EXPIRATION = 0;

  int64_t expiration_ = NO_EXPIRATION;
  bool abs_time_ = false;
  bool replace_ = false;  // if true, over-ride existing key
  bool sticky_ = false;

 public:
  RestoreArgs() = default;

  RestoreArgs(int64_t expiration, bool abs_time, bool replace)
      : expiration_(expiration), abs_time_(abs_time), replace_(replace) {
  }

  bool Replace() const {
    return replace_;
  }

  bool Sticky() const {
    return sticky_;
  }

  void SetSticky(bool sticky) {
    sticky_ = sticky;
  }

  uint64_t ExpirationTime() const {
    DCHECK_GE(expiration_, 0);
    return expiration_;
  }

  bool Expired() const {
    return expiration_ < 0;
  }

  bool HasExpiration() const {
    return expiration_ != NO_EXPIRATION;
  }

  [[nodiscard]] bool UpdateExpiration(int64_t now_msec);

  static OpResult<RestoreArgs> TryFrom(const CmdArgList& args);
};

class RdbRestoreValue : protected RdbLoaderBase {
 public:
  explicit RdbRestoreValue(RdbVersion rdb_version) {
    rdb_version_ = rdb_version;
  }

  OpResult<DbSlice::ItAndUpdater> Add(string_view key, string_view payload, const DbContext& cntx,
                                      const RestoreArgs& args, DbSlice* db_slice);

 private:
  std::optional<OpaqueObj> Parse(io::Source* source);
  int rdb_type_ = -1;
};

std::optional<RdbLoaderBase::OpaqueObj> RdbRestoreValue::Parse(io::Source* source) {
  src_ = source;
  if (pending_read_.remaining == 0) {
    io::Result<uint8_t> type_id = FetchType();
    if (type_id && rdbIsObjectTypeDF(type_id.value())) {
      rdb_type_ = *type_id;
    }
  }

  if (rdb_type_ == -1) {
    LOG(ERROR) << "failed to load type id from the input stream or type id is invalid";
    return std::nullopt;
  }

  OpaqueObj obj;
  error_code ec = ReadObj(rdb_type_, &obj);  // load the type from the input stream
  if (ec) {
    LOG(ERROR) << "failed to load data for type id " << rdb_type_;
    return std::nullopt;
  }

  return std::optional<OpaqueObj>(std::move(obj));
}

OpResult<DbSlice::ItAndUpdater> RdbRestoreValue::Add(string_view key, string_view data,
                                                     const DbContext& cntx, const RestoreArgs& args,
                                                     DbSlice* db_slice) {
  InMemSource data_src(data);
  PrimeValue pv;
  bool first_parse = true;
  do {
    auto opaque_res = Parse(&data_src);
    if (!opaque_res) {
      return OpStatus::INVALID_VALUE;
    }

    LoadConfig config;
    if (first_parse) {
      first_parse = false;
    } else {
      config.append = true;
    }
    if (pending_read_.remaining > 0) {
      config.chunked = true;
    }
    config.reserve = pending_read_.reserve;

    if (auto ec = FromOpaque(*opaque_res, config, &pv); ec) {
      // Handle value_expired gracefully - all fields expired during deserialize
      if (ec.value() == rdb::errc::value_expired) {
        return OpStatus::SKIPPED;
      }
      // we failed - report and exit
      LOG(WARNING) << "error while trying to read data: " << ec;
      return OpStatus::INVALID_VALUE;
    }
  } while (pending_read_.remaining > 0);

  auto res = db_slice->AddOrUpdate(cntx, key, std::move(pv), args.ExpirationTime());
  if (res) {
    res->it->first.SetSticky(args.Sticky());
    AddKeyToIndexesIfNeeded(key, cntx, res->it->second, db_slice->shard_owner());
  }
  return res;
}

[[nodiscard]] bool RestoreArgs::UpdateExpiration(int64_t now_msec) {
  if (HasExpiration()) {
    int64_t ttl = abs_time_ ? expiration_ - now_msec : expiration_;
    if (ttl > kMaxExpireDeadlineMs)
      ttl = kMaxExpireDeadlineMs;

    expiration_ = ttl < 0 ? -1 : ttl + now_msec;
  }
  return true;
}

// The structure that we are expecting is:
// args[0] == "key"
// args[1] == "ttl"
// args[2] == serialized value (list of chars that are used for the actual restore).
// args[3] .. args[n]: optional arguments that can be [REPLACE] [ABSTTL] [IDLETIME seconds]
//            [FREQ frequency], in any order
OpResult<RestoreArgs> RestoreArgs::TryFrom(const CmdArgList& args) {
  RestoreArgs out_args;
  string cur_arg{ArgS(args, 1)};  // extract ttl
  if (!absl::SimpleAtoi(cur_arg, &out_args.expiration_) || (out_args.expiration_ < 0)) {
    return OpStatus::INVALID_INT;
  }

  // the 3rd arg is the serialized value, so we are starting from one pass it
  // Note that all these are actually optional
  // note about the redis doc for this command: https://redis.io/commands/restore/
  // the IDLETIME and FREQ are not required, but to make this the same as in redis
  // we would parse them and ensure that they are correct, maybe later they will be used
  int64_t idle_time = 0;

  for (size_t i = 3; i < args.size(); ++i) {
    cur_arg = absl::AsciiStrToUpper(ArgS(args, i));
    bool additional = args.size() - i - 1 >= 1;
    if (cur_arg == "REPLACE") {
      out_args.replace_ = true;
    } else if (cur_arg == "ABSTTL") {
      out_args.abs_time_ = true;
    } else if (cur_arg == "STICK") {
      out_args.sticky_ = true;
    } else if (cur_arg == "IDLETIME" && additional) {
      ++i;
      cur_arg = ArgS(args, i);
      if (!absl::SimpleAtoi(cur_arg, &idle_time)) {
        return OpStatus::INVALID_INT;
      }
      if (idle_time < 0) {
        return OpStatus::SYNTAX_ERR;
      }
    } else if (cur_arg == "FREQ" && additional) {
      ++i;
      cur_arg = ArgS(args, i);
      int freq = 0;
      if (!absl::SimpleAtoi(cur_arg, &freq)) {
        return OpStatus::INVALID_INT;
      }
      if (freq < 0 || freq > 255) {
        return OpStatus::OUT_OF_RANGE;  // need to translate in this case
      }
    } else {
      LOG(WARNING) << "Got unknown command line option for restore '" << cur_arg << "'";
      return OpStatus::SYNTAX_ERR;
    }
  }
  return out_args;
}

OpResult<string> DumpToString(string_view key, const PrimeValue& pv, const OpArgs& op_args) {
  string str_res;

  if (pv.IsExternal() && !pv.IsCool()) {
    // TODO: consider moving blocking point to coordinator to avoid stalling shard queue
    auto res =
        ReadTieredString(op_args.db_cntx.db_index, key, pv, op_args.shard->tiered_storage()).Get();
    if (!res.has_value())
      return OpStatus::IO_ERROR;

    // TODO: allow saving string directly without proxy object
    str_res = RdbSerializerBase::DumpValue(PrimeValue{*res});
  } else {
    str_res = RdbSerializerBase::DumpValue(pv);
  }

  return {std::move(str_res)};
}

OpStatus OpPersist(const OpArgs& op_args, string_view key);

class Renamer {
 public:
  Renamer(Transaction* t, std::string_view src_key, std::string_view dest_key, unsigned shard_count,
          bool do_copy = false)
      : transaction_(t),
        src_key_(src_key),
        dest_key_(dest_key),
        src_sid_(Shard(src_key, shard_count)),
        dest_sid_(Shard(dest_key, shard_count)),
        do_copy_(do_copy) {
  }

  ErrorReply Rename(bool destination_should_not_exist);

 private:
  void FetchData();
  facade::OpStatus FinalizeRename();

  bool KeyExists(Transaction* t, EngineShard* shard, std::string_view key) const;
  void SerializeSrc(Transaction* t, EngineShard* shard);

  OpStatus DelSrc(Transaction* t, EngineShard* shard);
  OpStatus DeserializeDest(Transaction* t, EngineShard* shard);

  struct SerializedValue {
    std::string value;
    std::optional<RdbVersion> version;
    int64_t expire_ts;
    bool sticky;
  };

  Transaction* const transaction_;

  const std::string_view src_key_;
  const std::string_view dest_key_;
  const ShardId src_sid_;
  const ShardId dest_sid_;

  bool src_found_ = false;
  bool dest_found_ = false;
  bool do_copy_ = false;

  OpResult<SerializedValue> serialized_value_;
};

ErrorReply Renamer::Rename(bool destination_should_not_exist) {
  FetchData();

  if (!src_found_) {
    transaction_->Conclude();
    return OpStatus::KEY_NOTFOUND;
  }

  if (serialized_value_.status() != OpStatus::OK) {
    transaction_->Conclude();
    return serialized_value_.status();
  }

  if (!serialized_value_->version) {
    transaction_->Conclude();
    return ErrorReply{kInvalidDumpValueErr};
  }

  if (dest_found_ && destination_should_not_exist) {
    transaction_->Conclude();
    return OpStatus::KEY_EXISTS;
  }

  return FinalizeRename();
}

void Renamer::FetchData() {
  auto cb = [this](Transaction* t, EngineShard* shard) {
    auto args = t->GetShardArgs(shard->shard_id());
    DCHECK(1 == args.Size() || do_copy_);

    const ShardId shard_id = shard->shard_id();

    if (shard_id == src_sid_) {
      SerializeSrc(t, shard);
    }

    if (shard_id == dest_sid_) {
      dest_found_ = KeyExists(t, shard, dest_key_);
    }

    return OpStatus::OK;
  };

  transaction_->Execute(std::move(cb), false);
}

OpStatus Renamer::FinalizeRename() {
  OpStatus del_status = OpStatus::OK;
  OpStatus deserialize_status = OpStatus::OK;
  auto cb = [&](Transaction* t, EngineShard* shard) {
    const ShardId shard_id = shard->shard_id();

    if (!do_copy_ && shard_id == src_sid_) {
      del_status = DelSrc(t, shard);
    } else if (shard_id == dest_sid_) {
      deserialize_status = DeserializeDest(t, shard);
    }
    return OpStatus::OK;
  };

  transaction_->Execute(std::move(cb), true);

  LOG_IF(DFATAL,
         (deserialize_status != OpStatus::OK && deserialize_status != OpStatus::OUT_OF_MEMORY) ||
             del_status != OpStatus::OK)
      << "Error during rename command, deserialize_status: " << deserialize_status
      << " del_status: " << del_status;
  return deserialize_status != OpStatus::OK ? deserialize_status : del_status;
}

bool Renamer::KeyExists(Transaction* t, EngineShard* shard, std::string_view key) const {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto it = db_slice.FindReadOnly(t->GetDbContext(), key).it;
  return IsValid(it);
}

void Renamer::SerializeSrc(Transaction* t, EngineShard* shard) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto [it, exp_it] = db_slice.FindReadOnly(t->GetDbContext(), src_key_);

  src_found_ = IsValid(it);
  if (!src_found_) {
    return;
  }

  OpResult<string> res = DumpToString(src_key_, it->second, t->GetOpArgs(shard));
  if (res.ok()) {
    optional rdb_version = GetRdbVersion(*res);
    int64_t exp_time = it->first.GetExpireTime();
    serialized_value_ =
        SerializedValue{std::move(*res), rdb_version, exp_time, it->first.IsSticky()};
  } else {
    serialized_value_ = res.status();
  }
}

OpStatus Renamer::DelSrc(Transaction* t, EngineShard* shard) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto res = db_slice.FindMutable(t->GetDbContext(), src_key_);
  auto& it = res.it;

  CHECK(IsValid(it));

  DVLOG(1) << "Rename: removing the key '" << src_key_;

  db_slice.DelMutable(t->GetDbContext(), std::move(res));
  if (shard->journal()) {
    RecordJournal(t->GetOpArgs(shard), "DEL"sv, ArgSlice{src_key_}, 2);
  }

  return OpStatus::OK;
}

OpStatus Renamer::DeserializeDest(Transaction* t, EngineShard* shard) {
  DCHECK(serialized_value_);  // Verified in FetchData

  OpArgs op_args = t->GetOpArgs(shard);
  RestoreArgs restore_args{serialized_value_->expire_ts, true, true};

  if (!restore_args.UpdateExpiration(op_args.db_cntx.time_now_ms)) {
    return OpStatus::OUT_OF_RANGE;
  }

  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto dest_res = db_slice.FindMutable(op_args.db_cntx, dest_key_);

  if (dest_found_) {
    DVLOG(1) << "Rename: deleting the destiny key '" << dest_key_;
    db_slice.DelMutable(op_args.db_cntx, std::move(dest_res));
  }

  if (restore_args.Expired()) {
    VLOG(1) << "Rename: the new key '" << dest_key_ << "' already expired, will not save the value";

    if (dest_found_ && shard->journal()) {  // We need to delete old dest_key_ from replica
      RecordJournal(op_args, "DEL"sv, ArgSlice{dest_key_}, 2);
    }

    return OpStatus::OK;
  }

  restore_args.SetSticky(serialized_value_->sticky);

  RdbRestoreValue loader(serialized_value_->version.value());
  auto add_res =
      loader.Add(dest_key_, serialized_value_->value, op_args.db_cntx, restore_args, &db_slice);

  if (!add_res) {
    // SKIPPED means all fields expired during deserialize - treat as success
    if (add_res.status() == OpStatus::SKIPPED) {
      if (dest_found_ && shard->journal()) {
        RecordJournal(op_args, "DEL"sv, ArgSlice{dest_key_}, 2);
      }
      return OpStatus::OK;
    }
    return add_res.status();
  }

  LOG_IF(DFATAL, !add_res->is_new)
      << "Unexpected override for key " << dest_key_ << " " << dest_found_;
  auto bc = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
  if (bc) {
    bc->Awaken(t->GetDbIndex(), dest_key_);
  }

  if (shard->journal()) {
    auto expire_str = absl::StrCat(serialized_value_->expire_ts);

    absl::InlinedVector<std::string_view, 6> args(
        {dest_key_, expire_str, serialized_value_->value, "REPLACE"sv, "ABSTTL"sv});
    if (serialized_value_->sticky) {
      args.push_back("STICK"sv);
    }

    RecordJournal(op_args, "RESTORE"sv, args, 2);
  }

  return OpStatus::OK;
}

OpStatus OpPersist(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  auto res = db_slice.FindMutable(op_args.db_cntx, key);

  if (!IsValid(res.it)) {
    return OpStatus::KEY_NOTFOUND;
  } else {
    bool cleared = db_slice.RemoveExpire(op_args.db_cntx.db_index, res.it);
    return cleared ? OpStatus::OK : OpStatus::SKIPPED;
  }
}

OpResult<std::string> OpDump(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  auto [it, _] = db_slice.FindReadOnly(op_args.db_cntx, key);

  if (IsValid(it))
    return DumpToString(key, it->second, op_args);
  else
    return OpStatus::KEY_NOTFOUND;
}

OpStatus OpRestore(const OpArgs& op_args, std::string_view key, std::string_view payload,
                   RestoreArgs restore_args, RdbVersion rdb_version) {
  if (!restore_args.UpdateExpiration(op_args.db_cntx.time_now_ms)) {
    return OpStatus::OUT_OF_RANGE;
  }

  auto& db_slice = op_args.GetDbSlice();
  bool found_prev = false;

  // The redis impl (see cluster.c function restoreCommand), remove the old key if
  // the replace option is set, so lets do the same here
  {
    auto res = db_slice.FindMutable(op_args.db_cntx, key);
    if (IsValid(res.it)) {
      found_prev = true;
      if (restore_args.Replace()) {
        VLOG(1) << "restore command is running with replace, found old key '" << key
                << "' and removing it";
        db_slice.DelMutable(op_args.db_cntx, std::move(res));
      } else {
        // we are not allowed to replace it.
        return OpStatus::KEY_EXISTS;
      }
    }
  }

  if (restore_args.Expired()) {
    VLOG(1) << "the new key '" << key << "' already expired, will not save the value";
    return OpStatus::OK;
  }

  RdbRestoreValue loader(rdb_version);
  auto add_res = loader.Add(key, payload, op_args.db_cntx, restore_args, &db_slice);
  LOG_IF(DFATAL, add_res && !add_res->is_new)
      << "Unexpected override for key " << key << ", found previous " << found_prev
      << " override: " << restore_args.Replace()
      << ", type: " << ObjTypeToString(add_res->it->second.ObjType());

  return add_res.status();
}

bool ScanCb(const OpArgs& op_args, PrimeIterator prime_it, const ScanOpts& opts, StringVec* res) {
  auto& db_slice = op_args.GetDbSlice();

  DbSlice::Iterator it = DbSlice::Iterator::FromPrime(prime_it);
  if (prime_it->first.HasExpire()) {
    it = db_slice.ExpireIfNeeded(op_args.db_cntx, it).it;
    if (!IsValid(it))
      return false;
  }

  bool matches = !opts.type_filter || it->second.ObjType() == opts.type_filter;
  if (opts.mask.has_value()) {
    if (opts.mask == ScanOpts::Mask::Volatile) {
      matches &= it->first.HasExpire();
    } else if (opts.mask == ScanOpts::Mask::Permanent) {
      matches &= !it->first.HasExpire();
    } else if (opts.mask == ScanOpts::Mask::Accessed) {
      matches &= it->first.WasTouched();
    } else if (opts.mask == ScanOpts::Mask::Untouched) {
      matches &= !it->first.WasTouched();
    }
  }
  if (!matches)
    return false;

  if (opts.min_malloc_size > 0 && it->second.MallocUsed() < opts.min_malloc_size) {
    return false;
  }

  if (opts.bucket_id != UINT_MAX && opts.bucket_id != it.GetInnerIt().bucket_id()) {
    return false;
  }

  if (!opts.Matches(it.key())) {
    return false;
  }
  res->emplace_back(it.key());

  return true;
}

void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor, StringVec* vec) {
  auto& db_slice = op_args.GetDbSlice();
  DCHECK(db_slice.IsDbValid(op_args.db_cntx.db_index));

  // ScanCb can preempt due to journaling expired entries and we need to make sure that
  // we enter the callback in a timing when journaling will not cause preemption. Otherwise,
  // the bucket might change as we Traverse and yield.
  db_slice.GetLatch()->Wait();

  // Disable flush journal changes to prevent preemtion in traverse.
  journal::DisableFlushGuard journal_flush_guard(op_args.shard->journal());
  unsigned cnt = 0;

  VLOG(1) << "PrimeTable " << db_slice.shard_id() << "/" << op_args.db_cntx.db_index << " has "
          << db_slice.DbSize(op_args.db_cntx.db_index);

  PrimeTable::Cursor cur{*cursor};
  auto* prime_table = db_slice.GetTables(op_args.db_cntx.db_index).first;

  const auto start_cycles = base::CycleClock::Now();

  // Don't allow it to monopolize cpu time.
  // Approximately 30 microseconds.
  const uint64_t timeout_cycles = base::CycleClock::Frequency() >> 15;

  do {
    cur = prime_table->Traverse(
        cur, [&](PrimeIterator it) { cnt += ScanCb(op_args, it, scan_opts, vec); });
  } while (cur && cnt < scan_opts.limit &&
           (base::CycleClock::Now() - start_cycles) < timeout_cycles);

  VLOG(1) << "OpScan " << db_slice.shard_id() << " cursor: " << cur.token();
  *cursor = cur.token();
}

uint64_t ScanGeneric(uint64_t cursor, const ScanOpts& scan_opts, StringVec* keys,
                     ConnectionContext* cntx) {
  ShardId sid = cursor % 1024;

  EngineShardSet* ess = shard_set;
  unsigned shard_count = ess->size();
  constexpr uint64_t kMaxScanTimeMs = 50;

  // Dash table returns a cursor with its right byte empty. We will use it
  // for encoding shard index. For now scan has a limitation of 255 shards.
  CHECK_LT(shard_count, 1024u);

  if (sid >= shard_count) {  // protection
    return 0;
  }

  cursor >>= 10;
  DbContext db_cntx{cntx->ns, cntx->conn_state.db_index, GetCurrentTimeMs()};

  do {
    auto cb = [&] {
      OpArgs op_args{EngineShard::tlocal(), nullptr, db_cntx};
      OpScan(op_args, scan_opts, &cursor, keys);
    };

    // Avoid deadlocking, if called from shard queue script
    if (EngineShard::tlocal() && EngineShard::tlocal()->shard_id() == sid) {
      cb();
      util::ThisFiber::Yield();
    } else {
      ess->Await(sid, cb);
    }

    if (cursor == 0) {
      ++sid;
      if (unsigned(sid) == shard_count)
        break;
    }

    // Break after kMaxScanTimeMs.
    uint64_t time_now_ms = GetCurrentTimeMs();
    if (time_now_ms > db_cntx.time_now_ms + kMaxScanTimeMs) {
      break;
    }
  } while (keys->size() < scan_opts.limit);

  if (sid < shard_count) {
    cursor = (cursor << 10) | sid;
  } else {
    DCHECK_EQ(0u, cursor);
  }

  return cursor;
}

void OpScanAndDelete(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor,
                     uint32_t* deleted) {
  StringVec keys;
  OpScan(op_args, scan_opts, cursor, &keys);

  auto& db_slice = op_args.GetDbSlice();
  uint32_t count = 0;
  for (const auto& key : keys) {
    auto it = db_slice.FindMutable(op_args.db_cntx, key).it;
    if (!IsValid(it))
      continue;
    db_slice.Del(op_args.db_cntx, it);
    if (op_args.shard->journal()) {
      RecordDelete(op_args.db_cntx.db_index, key);
    }
    ++count;
  }
  *deleted += count;
}

uint64_t RmGeneric(uint64_t cursor, const ScanOpts& scan_opts, uint32_t* deleted,
                   ConnectionContext* cntx) {
  ShardId sid = cursor % 1024;

  EngineShardSet* ess = shard_set;
  unsigned shard_count = ess->size();
  constexpr uint64_t kMaxRmTimeMs = 100;

  CHECK_LT(shard_count, 1024u);

  if (sid >= shard_count) {
    return 0;
  }

  cursor >>= 10;
  DbContext db_cntx{cntx->ns, cntx->conn_state.db_index, GetCurrentTimeMs()};

  *deleted = 0;

  do {
    auto cb = [&] {
      OpArgs op_args{EngineShard::tlocal(), nullptr, db_cntx};
      OpScanAndDelete(op_args, scan_opts, &cursor, deleted);
    };

    if (EngineShard::tlocal() && EngineShard::tlocal()->shard_id() == sid) {
      cb();
      util::ThisFiber::Yield();
    } else {
      ess->Await(sid, cb);
    }

    if (cursor == 0) {
      ++sid;
      if (unsigned(sid) == shard_count)
        break;
    }

    uint64_t time_now_ms = GetCurrentTimeMs();
    if (time_now_ms > db_cntx.time_now_ms + kMaxRmTimeMs) {
      break;
    }
  } while (*deleted < scan_opts.limit);

  if (sid < shard_count) {
    cursor = (cursor << 10) | sid;
  } else {
    DCHECK_EQ(0u, cursor);
  }

  return cursor;
}

OpStatus OpExpire(const OpArgs& op_args, string_view key, const DbSlice::ExpireParams& params) {
  auto& db_slice = op_args.GetDbSlice();
  auto find_res = db_slice.FindMutable(op_args.db_cntx, key);
  if (!IsValid(find_res.it)) {
    return OpStatus::KEY_NOTFOUND;
  }

  find_res.post_updater.Run();
  auto res = db_slice.UpdateExpire(op_args.db_cntx, find_res.it, find_res.exp_it, params);

  // If the value was deleted, replicate as DEL.
  // Else, replicate as PEXPIREAT with exact time.
  if (op_args.shard->journal() && res.ok()) {
    if (res.value() == -1) {
      RecordJournal(op_args, "DEL"sv, ArgSlice{key});
    } else {
      auto time = absl::StrCat(res.value());
      // Note: Don't forget to change this when adding arguments to expire commands.
      RecordJournal(op_args, "PEXPIREAT"sv, ArgSlice{key, time});
    }
  }

  return res.status();
}

#ifdef WITH_COLLECTION_CMDS
OpResult<vector<long>> OpFieldExpire(const OpArgs& op_args, string_view key, uint32_t ttl_sec,
                                     CmdArgList values) {
  auto& db_slice = op_args.GetDbSlice();
  auto [it, expire_it, auto_updater, is_new] = db_slice.FindMutable(op_args.db_cntx, key);

  if (!IsValid(it) || (it->second.ObjType() != OBJ_SET && it->second.ObjType() != OBJ_HASH)) {
    std::vector<long> res(values.size(), -2);
    return res;
  }

  PrimeValue* pv = &it->second;
  if (pv->ObjType() == OBJ_SET) {
    return SetFamily::SetFieldsExpireTime(op_args, ttl_sec, values, pv);
  } else {
    return HSetFamily::SetFieldsExpireTime(op_args, ttl_sec, ExpireFlags::EXPIRE_ALWAYS, key,
                                           values, pv);
  }
}

// returns -2 if the key was not found, -3 if the field was not found,
// -1 if ttl on the field was not found.
OpResult<long> OpFieldTtl(Transaction* t, EngineShard* shard, string_view key, string_view field) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  const DbContext& db_cntx = t->GetDbContext();
  auto [it, expire_it] = db_slice.FindReadOnly(db_cntx, key);
  if (!IsValid(it))
    return -2;

  if (it->second.ObjType() != OBJ_SET && it->second.ObjType() != OBJ_HASH)
    return OpStatus::WRONG_TYPE;

  int32_t res = -1;
  if (it->second.ObjType() == OBJ_SET) {
    res = SetFamily::FieldExpireTime(db_cntx, it->second, field);
  } else {
    DCHECK_EQ(OBJ_HASH, it->second.ObjType());
    res = HSetFamily::FieldExpireTime(db_cntx, it->second, field);
  }
  return res <= 0 ? res : int32_t(res - MemberTimeSeconds(db_cntx.time_now_ms));
}
#else
OpResult<vector<long>> OpFieldExpire(const OpArgs& op_args, string_view key, uint32_t ttl_sec,
                                     CmdArgList values) {
  return OpStatus::SKIPPED;
}
OpResult<long> OpFieldTtl(Transaction* t, EngineShard* shard, string_view key, string_view field) {
  return OpStatus::SKIPPED;
}

#endif

OpResult<uint32_t> OpStick(const OpArgs& op_args, const ShardArgs& keys) {
  DVLOG(1) << "Stick: " << keys.Front();

  auto& db_slice = op_args.GetDbSlice();

  uint32_t res = 0;
  for (string_view key : keys) {
    auto find_res = db_slice.FindMutable(op_args.db_cntx, key);
    if (IsValid(find_res.it) && !find_res.it->first.IsSticky()) {
      find_res.it->first.SetSticky(true);
      ++res;
    }
  }

  return res;
}

OpResult<uint64_t> OpExpireTime(Transaction* t, EngineShard* shard, string_view key) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto [it, expire_it] = db_slice.FindReadOnly(t->GetDbContext(), key);
  if (!IsValid(it))
    return OpStatus::KEY_NOTFOUND;

  if (!it->first.HasExpire())
    return OpStatus::SKIPPED;

  int64_t ttl_ms = it->first.GetExpireTime();
  DCHECK_GT(ttl_ms, 0);  // Otherwise FindReadOnly would return null.
  return ttl_ms;
}

// OpMove touches multiple databases (op_args.db_idx, target_db), so it assumes it runs
// as a global transaction.
// TODO: Allow running OpMove without a global transaction.
OpStatus OpMove(const OpArgs& op_args, string_view key, DbIndex target_db) {
  auto& db_slice = op_args.GetDbSlice();

  // Fetch value at key in current db.
  auto from_res = db_slice.FindMutable(op_args.db_cntx, key);
  if (!IsValid(from_res.it))
    return OpStatus::KEY_NOTFOUND;

  // Ensure target database exists.
  db_slice.ActivateDb(target_db);

  // Fetch value at key in target db.
  DbContext target_cntx = op_args.db_cntx;
  target_cntx.db_index = target_db;
  auto to_res = db_slice.FindReadOnly(target_cntx, key);
  if (IsValid(to_res.it))
    return OpStatus::KEY_EXISTS;

  bool sticky = from_res.it->first.IsSticky();
  uint64_t exp_ts = from_res.it->first.GetExpireTime();
  from_res.post_updater.Run();
  PrimeValue from_obj = std::move(from_res.it->second);

  db_slice.Del(op_args.db_cntx, from_res.it);
  auto op_result = db_slice.AddNew(target_cntx, key, std::move(from_obj), exp_ts);
  RETURN_ON_BAD_STATUS(op_result);
  auto& add_res = *op_result;
  add_res.it->first.SetSticky(sticky);

  auto bc = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
  if (add_res.it->second.ObjType() == OBJ_LIST && bc) {
    bc->Awaken(target_db, key);
  }

  return OpStatus::OK;
}

OpResult<void> OpRen(const OpArgs& op_args, string_view from_key, string_view to_key,
                     bool destination_should_not_exist) {
  auto* es = op_args.shard;
  auto& db_slice = op_args.GetDbSlice();
  auto from_res = db_slice.FindMutable(op_args.db_cntx, from_key);
  if (!IsValid(from_res.it))
    return OpStatus::KEY_NOTFOUND;

  if (from_key == to_key)
    return destination_should_not_exist ? OpStatus::KEY_EXISTS : OpStatus::OK;

  bool is_prior_list = false;
  auto to_res = db_slice.FindMutable(op_args.db_cntx, to_key);
  if (IsValid(to_res.it)) {
    if (destination_should_not_exist)
      return OpStatus::KEY_EXISTS;

    RemoveKeyFromIndexesIfNeeded(to_key, op_args.db_cntx, to_res.it->second, op_args.shard);
    is_prior_list = (to_res.it->second.ObjType() == OBJ_LIST);
  }

  // Delete the "from" document from the search index before deleting from the database
  RemoveKeyFromIndexesIfNeeded(from_key, op_args.db_cntx, from_res.it->second, op_args.shard);

  bool sticky = from_res.it->first.IsSticky();
  uint64_t exp_ts = from_res.it->first.GetExpireTime();
  from_res.post_updater.ReduceHeapUsage();

  // we keep the value we want to move.
  PrimeValue from_obj = std::move(from_res.it->second);

  if (IsValid(to_res.it)) {
    to_res.post_updater.ReduceHeapUsage();
    to_res.it->second = std::move(from_obj);

    if (exp_ts) {
      db_slice.AddExpire(op_args.db_cntx.db_index, to_res.it, exp_ts);
    } else {
      db_slice.RemoveExpire(op_args.db_cntx.db_index, to_res.it);
    }

    to_res.it->first.SetSticky(sticky);
    to_res.post_updater.Run();

    db_slice.DelMutable(op_args.db_cntx, std::move(from_res));
  } else {
    // Here we first delete from_it because AddNew below could invalidate from_it.
    // On the other hand, AddNew does not rely on the iterators - this is why we keep
    // the value in `from_obj`.
    db_slice.DelMutable(op_args.db_cntx, std::move(from_res));
    auto op_result = db_slice.AddNew(op_args.db_cntx, to_key, std::move(from_obj), exp_ts);
    RETURN_ON_BAD_STATUS(op_result);
    to_res = std::move(*op_result);
    to_res.it->first.SetSticky(sticky);
  }

  AddKeyToIndexesIfNeeded(to_key, op_args.db_cntx, to_res.it->second, op_args.shard);

  auto bc = op_args.db_cntx.ns->GetBlockingController(es->shard_id());
  if (!is_prior_list && to_res.it->second.ObjType() == OBJ_LIST && bc) {
    bc->Awaken(op_args.db_cntx.db_index, to_key);
  }
  return OpStatus::OK;
}

OpResult<uint64_t> OpTtl(Transaction* t, EngineShard* shard, string_view key) {
  auto opExpireTimeResult = OpExpireTime(t, shard, key);

  if (opExpireTimeResult) {
    auto now = t->GetDbContext().time_now_ms;
    DCHECK_GT(now, 0u);

    int64_t ttl_ms = opExpireTimeResult.value() - now;
    DCHECK_GT(ttl_ms, 0);  // Otherwise FindReadOnly would return null.
    return ttl_ms;
  } else {
    return opExpireTimeResult;
  }
}

ErrorReply RenameGeneric(CmdArgList args, bool destination_should_not_exist, Transaction* tx) {
  string_view key[2] = {ArgS(args, 0), ArgS(args, 1)};

  if (tx->GetUniqueShardCnt() == 1) {
    tx->ReviveAutoJournal();  // Safe to use RENAME with single shard
    auto cb = [&](Transaction* t, EngineShard* shard) {
      return OpRen(t->GetOpArgs(shard), key[0], key[1], destination_should_not_exist);
    };
    OpResult<void> result = tx->ScheduleSingleHopT(std::move(cb));

    return result.status();
  }

  Renamer renamer{tx, key[0], key[1], shard_set->size()};
  return renamer.Rename(destination_should_not_exist);
}

void ExpireTimeGeneric(CmdArgList args, TimeUnit unit, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) { return OpExpireTime(t, shard, key); };
  OpResult<uint64_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    long ttl = (unit == TimeUnit::SEC) ? (result.value() + 500) / 1000 : result.value();
    cmd_cntx->SendLong(ttl);
    return;
  }

  switch (result.status()) {
    case OpStatus::KEY_NOTFOUND:
      cmd_cntx->SendLong(-2);
      break;
    default:
      LOG_IF(ERROR, result.status() != OpStatus::SKIPPED)
          << "Unexpected status " << result.status();
      cmd_cntx->SendLong(-1);
      break;
  }
}

void TtlGeneric(CmdArgList args, TimeUnit unit, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) { return OpTtl(t, shard, key); };
  OpResult<uint64_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    long ttl = (unit == TimeUnit::SEC) ? (result.value() + 500) / 1000 : result.value();
    cmd_cntx->SendLong(ttl);
    return;
  }

  switch (result.status()) {
    case OpStatus::KEY_NOTFOUND:
      cmd_cntx->SendLong(-2);
      break;
    default:
      LOG_IF(ERROR, result.status() != OpStatus::SKIPPED)
          << "Unexpected status " << result.status();
      cmd_cntx->SendLong(-1);
      break;
  }
}

io::Result<int32_t, string> ParseExpireOptionsOrReply(const CmdArgList args) {
  int32_t flags = ExpireFlags::EXPIRE_ALWAYS;
  for (auto& arg : args) {
    string arg_sv = absl::AsciiStrToUpper(ToSV(arg));
    if (arg_sv == "NX") {
      flags |= ExpireFlags::EXPIRE_NX;
    } else if (arg_sv == "XX") {
      flags |= ExpireFlags::EXPIRE_XX;
    } else if (arg_sv == "GT") {
      flags |= ExpireFlags::EXPIRE_GT;
    } else if (arg_sv == "LT") {
      flags |= ExpireFlags::EXPIRE_LT;
    } else {
      return nonstd::make_unexpected(absl::StrCat("Unsupported option: ", arg_sv));
    }
  }

  if ((flags & ExpireFlags::EXPIRE_NX) && (flags & ExpireFlags::EXPIRE_XX)) {
    return nonstd::make_unexpected("NX and XX options at the same time are not compatible");
  }
  if ((flags & ExpireFlags::EXPIRE_GT) && (flags & ExpireFlags::EXPIRE_LT)) {
    return nonstd::make_unexpected("GT and LT options at the same time are not compatible");
  }
  return flags;
}

}  // namespace

OpResult<uint32_t> GenericFamily::OpDel(const OpArgs& op_args, const ShardArgs& keys, bool async) {
  DVLOG(1) << "Del: " << keys.Front() << " async: " << async;
  auto& db_slice = op_args.GetDbSlice();

  uint32_t res = 0;

  for (string_view key : keys) {
    auto it = db_slice.FindMutable(op_args.db_cntx, key).it;  // post_updater will run immediately
    if (!IsValid(it))
      continue;

    db_slice.Del(op_args.db_cntx, it, nullptr, async);
    ++res;
  }

  return res;
}

static cmd::CmdR CmdDel(CmdArgList args, CommandContext* cmd_cntx) {
  bool async_unlink =
      cmd_cntx->cid()->name() == "UNLINK" && absl::GetFlag(FLAGS_unlink_experimental_async);

  std::atomic_uint32_t result = 0;
  auto cb = [&](Transaction* tx, EngineShard* es) {
    auto args = tx->GetShardArgs(es->shard_id());
    auto op_args = tx->GetOpArgs(es);
    auto res = GenericFamily::OpDel(op_args, args, async_unlink);
    result.fetch_add(res.value_or(0), memory_order_relaxed);
    return OpStatus::OK;
  };

  co_await cmd::SingleHop(cb);
  uint32_t del_cnt = result.load(memory_order_relaxed);

  auto* rb = cmd_cntx->rb();
  if (cmd_cntx->mc_command()) {
    MCRender mc_render{cmd_cntx->mc_command()->cmd_flags};
    rb->SendSimpleString(del_cnt ? mc_render.RenderDeleted() : mc_render.RenderNotFound());
  } else {
    rb->SendLong(del_cnt);
  }
  co_return std::nullopt;
}

void GenericFamily::Delex(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  // Parse optional condition
  enum class Condition : uint8_t { NONE, IFEQ, IFNE, IFDEQ, IFDNE };
  Condition cond = Condition::NONE;
  string_view compare_value;

  if (args.size() == 1) {
    // DELEX key - no condition, behaves like DEL
    cond = Condition::NONE;
  } else if (args.size() == 2) {
    // DELEX key <something> - invalid, needs both condition and value
    // TODO: include error type in error reply
    return cmd_cntx->SendError(facade::WrongNumArgsError("DELEX"), kSyntaxErrType);
  } else if (args.size() == 3) {
    string_view opt = ArgS(args, 1);
    compare_value = ArgS(args, 2);

    if (absl::EqualsIgnoreCase(opt, "IFEQ")) {
      cond = Condition::IFEQ;
    } else if (absl::EqualsIgnoreCase(opt, "IFNE")) {
      cond = Condition::IFNE;
    } else if (absl::EqualsIgnoreCase(opt, "IFDEQ")) {
      cond = Condition::IFDEQ;
    } else if (absl::EqualsIgnoreCase(opt, "IFDNE")) {
      cond = Condition::IFDNE;
    } else {
      return cmd_cntx->SendError(facade::UnknownSubCmd(opt, "DELEX"), kSyntaxErrType);
    }
  } else {
    // args.size() > 3
    return cmd_cntx->SendError(facade::WrongNumArgsError("DELEX"), kSyntaxErrType);
  }

  // If no condition, delegate to standard DEL
  if (cond == Condition::NONE) {
    CmdDel(args, cmd_cntx);
    return;
  }

  auto compare_str = [&](string_view val) {
    bool is_digest = (cond == Condition::IFDEQ || cond == Condition::IFDNE);

    if (is_digest) {
      string dig = XXH3_Digest(val);
      return (dig == compare_value) == (cond == Condition::IFDEQ);
    }
    return (val == compare_value) == (cond == Condition::IFEQ);
  };

  // Execute conditional delete
  auto cb = [key, compare_str](Transaction* tx, EngineShard* es) -> OpResult<uint32_t> {
    auto& db_slice = tx->GetDbSlice(es->shard_id());
    auto it_res = db_slice.FindMutable(tx->GetDbContext(), key, OBJ_STRING);

    // Key doesn't exist
    if (!it_res.ok()) {
      if (it_res.status() == OpStatus::KEY_NOTFOUND)
        return 0;
      return it_res.status();
    }

    // Get the value
    const PrimeValue& pv = it_res->it->second;
    // Check condition
    bool should_delete = false;

    if (pv.IsExternal()) {
      util::fb2::Future<io::Result<bool>> fut = ReadTiered<bool>(
          tx->GetDbIndex(), key, pv, [&](string_view val) { return compare_str(val); },
          es->tiered_storage());

      auto result = fut.Get();
      if (!result)
        // Tiered storage read failed - return generic I/O error
        return OpStatus::IO_ERROR;
      should_delete = *result;
    } else {
      should_delete = compare_str(pv.ToString());
    }

    // Delete if condition is met
    if (should_delete) {
      db_slice.DelMutable(tx->GetDbContext(), std::move(*it_res));
      return 1;
    }

    return 0;
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);

  if (result) {
    cmd_cntx->SendLong(*result);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void GenericFamily::Ping(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() > 1) {
    return cmd_cntx->SendError(facade::WrongNumArgsError("ping"), kSyntaxErrType);
  }

  string_view msg;

  // If a client in the subscribe state and in resp2 mode, it returns an array for some reason.
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (cmd_cntx->server_conn_cntx()->conn_state.subscribe_info && !rb->IsResp3()) {
    if (args.size() == 1) {
      msg = ArgS(args, 0);
    }

    auto replier = [msg = string(msg)](RedisReplyBuilder* rb) {
      string_view resp[2] = {"pong", msg};
      rb->SendBulkStrArr(resp);
    };
    return cmd_cntx->ReplyWith(std::move(replier));
  }

  if (args.size() == 0) {
    return cmd_cntx->SendSimpleString("PONG");
  }

  msg = ArgS(args, 0);
  DVLOG(2) << "Ping " << msg;

  auto replier = [msg = string(msg)](RedisReplyBuilder* rb) { rb->SendBulkString(msg); };
  return cmd_cntx->ReplyWith(std::move(replier));
}

void GenericFamily::Exists(CmdArgList args, CommandContext* cmd_cntx) {
  VLOG(1) << "Exists " << ArgS(args, 0);

  atomic_uint32_t result{0};

  auto cb = [&result](Transaction* t, EngineShard* shard) {
    ShardArgs args = t->GetShardArgs(shard->shard_id());
    auto res = OpExists(t->GetOpArgs(shard), args);
    result.fetch_add(res.value_or(0), memory_order_relaxed);

    return OpStatus::OK;
  };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  CHECK_EQ(OpStatus::OK, status);

  return cmd_cntx->SendLong(result.load(memory_order_acquire));
}

void GenericFamily::Persist(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) { return OpPersist(t->GetOpArgs(shard), key); };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  cmd_cntx->SendLong(status == OpStatus::OK);
}

void GenericFamily::Expire(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view sec = ArgS(args, 1);
  int64_t int_arg;

  if (!absl::SimpleAtoi(sec, &int_arg)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  int_arg = std::max<int64_t>(int_arg, -1);

  // silently cap the expire time to kMaxExpireDeadlineSec which is more than 8 years.
  if (int_arg > kMaxExpireDeadlineSec) {
    int_arg = kMaxExpireDeadlineSec;
  }

  auto expire_options = ParseExpireOptionsOrReply(args.subspan(2));
  if (!expire_options) {
    return cmd_cntx->SendError(expire_options.error());
  }
  DbSlice::ExpireParams params{.value = int_arg, .expire_options = expire_options.value()};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExpire(t->GetOpArgs(shard), key, params);
  };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  cmd_cntx->SendLong(status == OpStatus::OK);
}

void GenericFamily::ExpireAt(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view sec = ArgS(args, 1);
  int64_t int_arg;

  if (!absl::SimpleAtoi(sec, &int_arg)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  int_arg = std::max<int64_t>(int_arg, 0L);
  auto expire_options = ParseExpireOptionsOrReply(args.subspan(2));
  if (!expire_options) {
    return cmd_cntx->SendError(expire_options.error());
  }
  DbSlice::ExpireParams params{
      .value = int_arg, .absolute = true, .expire_options = expire_options.value()};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExpire(t->GetOpArgs(shard), key, params);
  };
  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  if (status == OpStatus::OUT_OF_RANGE) {
    return cmd_cntx->SendError(kExpiryOutOfRange);
  }

  cmd_cntx->SendLong(status == OpStatus::OK);
}

void GenericFamily::Keys(CmdArgList args, CommandContext* cmd_cntx) {
  string_view pattern(ArgS(args, 0));
  uint64_t cursor = 0;

  StringVec keys;

  ScanOpts scan_opts;
  if (pattern != "*") {
    scan_opts.matcher.reset(new GlobMatcher{pattern, true});
  }

  scan_opts.limit = 512;
  auto output_limit = absl::GetFlag(FLAGS_keys_output_limit);

  do {
    cursor = ScanGeneric(cursor, scan_opts, &keys, cmd_cntx->server_conn_cntx());
  } while (cursor != 0 && keys.size() < output_limit);

  auto replier = [keys = std::move(keys)](RedisReplyBuilder* rb) { rb->SendBulkStrArr(keys); };
  return cmd_cntx->ReplyWith(std::move(replier));
}

void GenericFamily::PexpireAt(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view msec = ArgS(args, 1);
  int64_t int_arg;

  if (!absl::SimpleAtoi(msec, &int_arg)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  int_arg = std::max<int64_t>(int_arg, 0L);
  auto expire_options = ParseExpireOptionsOrReply(args.subspan(2));
  if (!expire_options) {
    return cmd_cntx->SendError(expire_options.error());
  }
  DbSlice::ExpireParams params{.value = int_arg,
                               .unit = TimeUnit::MSEC,
                               .absolute = true,
                               .expire_options = expire_options.value()};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExpire(t->GetOpArgs(shard), key, params);
  };
  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  if (status == OpStatus::OUT_OF_RANGE) {
    return cmd_cntx->SendError(kExpiryOutOfRange);
  } else {
    cmd_cntx->SendLong(status == OpStatus::OK);
  }
}

void GenericFamily::Pexpire(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view msec = ArgS(args, 1);
  int64_t int_arg;

  if (!absl::SimpleAtoi(msec, &int_arg)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }
  int_arg = std::max<int64_t>(int_arg, -1);

  // to be more compatible with redis, we silently cap the expire time to kMaxExpireDeadlineSec
  if (int_arg > kMaxExpireDeadlineMs) {
    int_arg = kMaxExpireDeadlineMs;
  }

  auto expire_options = ParseExpireOptionsOrReply(args.subspan(2));
  if (!expire_options) {
    return cmd_cntx->SendError(expire_options.error());
  }
  DbSlice::ExpireParams params{
      .value = int_arg, .unit = TimeUnit::MSEC, .expire_options = expire_options.value()};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpExpire(t->GetOpArgs(shard), key, params);
  };
  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  if (status == OpStatus::OUT_OF_RANGE) {
    return cmd_cntx->SendError(kExpiryOutOfRange);
  }
  cmd_cntx->SendLong(status == OpStatus::OK);
}

void GenericFamily::Stick(CmdArgList args, CommandContext* cmd_cntx) {
  Transaction* transaction = cmd_cntx->tx();
  VLOG(1) << "Stick " << ArgS(args, 0);

  atomic_uint32_t result{0};

  auto cb = [&result](const Transaction* t, EngineShard* shard) {
    ShardArgs args = t->GetShardArgs(shard->shard_id());
    auto res = OpStick(t->GetOpArgs(shard), args);
    result.fetch_add(res.value_or(0), memory_order_relaxed);

    return OpStatus::OK;
  };

  OpStatus status = transaction->ScheduleSingleHop(std::move(cb));
  CHECK_EQ(OpStatus::OK, status);

  DVLOG(2) << "Stick ts " << transaction->txid();

  uint32_t match_cnt = result.load(memory_order_relaxed);
  cmd_cntx->SendLong(match_cnt);
}

struct SortEntryBase {
  string key;
  const string* bound_value = nullptr;
  vector<string> get_values;  // Stores fetched GET pattern values

  void BindValue(const std::string* value) {
    bound_value = value;
  }

  std::string_view ResultKey() const {
    if (bound_value) {
      return *bound_value;
    }
    return key;
  }
};

// Used to conditionally store double score
struct SortEntryScore : public SortEntryBase {
  double score;
};

// SortEntry stores all data required for sorting
template <bool ALPHA>
struct SortEntry
    // Store score only if we need it
    : public std::conditional_t<ALPHA, SortEntryBase, SortEntryScore> {
  bool Parse(string&& item) {
    if constexpr (!ALPHA) {
      if (!absl::SimpleAtod(item, &this->score)) {
        if (!item.empty()) {
          return false;
        }
        this->score = 0;
      }
      if (std::isnan(this->score)) {
        return false;
      }
    }
    this->key = std::move(item);
    return true;
  }

  bool Parse(int64_t item) {
    if constexpr (!ALPHA) {
      this->score = item;
    }
    this->key = absl::StrCat(item);
    return true;
  }

  static bool less(const SortEntry& l, const SortEntry& r) {
    if constexpr (!ALPHA) {
      if (l.score < r.score) {
        return true;
      } else if (r.score < l.score) {
        return false;
      }
      // to prevent unstrict order we compare values lexicographically
    }
    return l.key < r.key;
  }

  static bool greater(const SortEntry& l, const SortEntry& r) {
    return less(r, l);
  }
};

// std::variant of all possible vectors of SortEntries
using SortEntryList = std::variant<
    // Used when sorting by double values
    std::vector<SortEntry<false>>,
    // Used when sorting by string values
    std::vector<SortEntry<true>>>;

// Create SortEntryList based on runtime arguments
SortEntryList MakeSortEntryList(bool alpha) {
  if (alpha)
    return SortEntryList{std::vector<SortEntry<true>>{}};
  else
    return SortEntryList{std::vector<SortEntry<false>>{}};
}

// Iterate over container with generic function that accepts strings and ints
template <typename F> bool Iterate(const PrimeValue& pv, F&& func) {
  switch (pv.ObjType()) {
    case OBJ_LIST:
      return container_utils::IterateList(pv, func);
    case OBJ_SET:
      return container_utils::IterateSet(pv, func);
    case OBJ_ZSET:
      return container_utils::IterateSortedSet(
          pv, [&](container_utils::ContainerEntry ce, double) { return func(ce); });
    default:
      return false;
  }
}

// Create a SortEntryList from given key
OpResult<CompactObjType> OpFetchSortEntries(const OpArgs& op_args, std::string_view key,
                                            SortEntryList* dest) {
  using namespace container_utils;

  auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key).it;
  if (!IsValid(it)) {
    return OpStatus::KEY_NOTFOUND;
  }
  if (!IsContainer(it->second)) {
    return OpStatus::WRONG_TYPE;
  }

  bool success = std::visit(
      [&pv = it->second](auto& entries) {
        entries.reserve(pv.Size());
        return Iterate(pv, [&entries](const ContainerEntry& entry) {
          if (entry.IsString())
            return entries.emplace_back().Parse(entry.ToString());
          else
            return entries.emplace_back().Parse(entry.as_long());
        });
      },
      *dest);
  if (!success)
    return OpStatus::INVALID_NUMERIC_RESULT;

  return it->second.ObjType();
}

// Fetch container elements as strings (for BY pattern support)
OpResult<pair<vector<string>, CompactObjType>> OpFetchContainerElements(const OpArgs& op_args,
                                                                        std::string_view key) {
  using namespace container_utils;

  auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key).it;
  if (!IsValid(it)) {
    return OpStatus::KEY_NOTFOUND;
  }
  if (!IsContainer(it->second)) {
    return OpStatus::WRONG_TYPE;
  }

  vector<string> elements;
  elements.reserve(it->second.Size());

  Iterate(it->second, [&elements](const ContainerEntry& entry) {
    elements.emplace_back(entry.ToString());
    return true;
  });

  return std::make_pair(std::move(elements), it->second.ObjType());
}

// Fetch a string value from a key (for BY pattern lookups)
// TODO: does not support tiering.
string OpFetchStringValue(const OpArgs& op_args, std::string_view key) {
  auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key).it;
  if (!IsValid(it) || it->second.ObjType() != OBJ_STRING) {
    return {};  // Missing key defaults to empty string
  }

  return it->second.ToString();
}

template <typename IteratorBegin, typename IteratorEnd>
OpResult<uint32_t> OpStore(const OpArgs& op_args, std::string_view key, IteratorBegin&& start_it,
                           IteratorEnd&& end_it, bool has_get_patterns) {
  uint32_t len = 0;

  // If we are about to overwrite an existing indexed document (HASH/JSON),
  // remove it from search indices first to avoid duplicate entries.
  auto existing = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key).it;
  if (IsValid(existing)) {
    RemoveKeyFromIndexesIfNeeded(key, op_args.db_cntx, existing->second, op_args.shard);
  }

  QList* ql_v2 = CompactObj::AllocateMR<QList>();
  QList::Where where = QList::TAIL;
  for (auto it = start_it; it != end_it; ++it) {
    if (has_get_patterns) {
      // Store all GET pattern values for this entry
      for (const auto& value : it->get_values) {
        ql_v2->Push(value, where);
      }
    } else {
      // No GET patterns - store the element itself
      ql_v2->Push(it->ResultKey(), where);
    }
  }
  len = ql_v2->Size();

  PrimeValue pv;
  pv.InitRobj(OBJ_LIST, kEncodingQL2, ql_v2);

  // This would overwrite existing value if any with new list.
  auto op_res = op_args.GetDbSlice().AddOrUpdate(op_args.db_cntx, key, std::move(pv), 0);
  RETURN_ON_BAD_STATUS(op_res);

  return len;
}

struct SortParams {
  bool alpha = false;
  bool reversed = false;
  bool is_read_only = false;
  bool to_sort = true;

  optional<string_view> store_key;

  // first is offset, second is count
  optional<pair<uint32_t, uint32_t>> bounds;

  // These options are parsed but currently not fully supported or used by the visitor.
  optional<string_view> by_pattern;
  vector<string_view> get_patterns;
};

template <typename C>
auto GetSortRange(const C& entries, const optional<pair<uint32_t, uint32_t>>& bounds) {
  auto start_it = entries.begin();
  auto end_it = entries.end();
  if (bounds) {
    start_it += std::min<uint32_t>(bounds->first, entries.size());
    end_it = entries.begin() + std::min<uint32_t>(bounds->first + bounds->second, entries.size());
  }

  return std::make_pair(start_it, end_it);
};

// Generic GET pattern fetcher that abstracts element access and result storage.
// Handles pattern expansion, shard distribution, and parallel fetching.
// Special pattern "#" returns the element value itself.
// Uses "read uncommitted" isolation - fetches values across shards without transaction guarantees.
//
// Template parameters:
//   ElementContainer: Container type holding elements (e.g., vector<string>, vector<SortEntry>)
//   ElementAccessor: Callable that returns string_view for element at index: (size_t) ->
//   string_view ResultSetter: Callable that stores fetched value: (size_t elem_idx, size_t
//   pattern_idx, string value) -> void
template <typename ElementContainer, typename ElementAccessor, typename ResultSetter>
void FetchGetPatternValues(const SortParams& params, const DbContext& db_cntx,
                           const ElementContainer& elements, ElementAccessor get_element_key,
                           ResultSetter set_result) {
  if (params.get_patterns.empty())
    return;

  // Build a list of all external keys to fetch, organized by shard
  // Structure: keys_by_shard[shard_id] = [(elem_idx, pattern_idx, ext_key), ...]
  vector<vector<tuple<size_t, size_t, string>>> keys_by_shard(shard_set->size());

  // Build external keys for each element and pattern
  for (size_t elem_idx = 0; elem_idx < elements.size(); ++elem_idx) {
    for (size_t pattern_idx = 0; pattern_idx < params.get_patterns.size(); ++pattern_idx) {
      std::string_view pattern = params.get_patterns[pattern_idx];

      if (pattern == "#") {
        // Special pattern - return the element itself, no external fetch needed
        set_result(elem_idx, pattern_idx, string(get_element_key(elem_idx)));
        continue;
      }

      // Build external key by replacing '*' with the actual element value
      size_t star_pos = pattern.find('*');
      string ext_key;
      if (star_pos == std::string_view::npos) {
        // No asterisk - use pattern as literal key
        ext_key = string(pattern);
      } else {
        ext_key = absl::StrCat(pattern.substr(0, star_pos), get_element_key(elem_idx),
                               pattern.substr(star_pos + 1));
      }

      ShardId sid = Shard(ext_key, shard_set->size());
      keys_by_shard[sid].emplace_back(elem_idx, pattern_idx, std::move(ext_key));
    }
  }

  // Fetch all external keys in parallel across shards
  shard_set->RunBlockingInParallel([&](EngineShard* shard) {
    ShardId sid = shard->shard_id();
    for (const auto& [elem_idx, pattern_idx, ext_key] : keys_by_shard[sid]) {
      string value = OpFetchStringValue({shard, nullptr, db_cntx}, ext_key);
      set_result(elem_idx, pattern_idx, std::move(value));
    }
  });
}

// Fetches external keys referenced by GET patterns and fills the get_values in sort entries.
// For each entry, fetches values for all GET patterns. Special pattern "#" returns the element
// itself. Uses "read uncommitted" isolation - fetches values across shards without transaction
// guarantees.
template <bool ALPHA>
OpStatus PopulateGetPatternValues(const SortParams& params, const DbContext& db_cntx,
                                  std::vector<SortEntry<ALPHA>>* entries) {
  DCHECK(!params.get_patterns.empty());

  // Pre-allocate get_values for each entry
  for (auto& entry : *entries) {
    entry.get_values.resize(params.get_patterns.size());
  }

  // Use generic fetcher with lambdas to access ResultKey() and store in entry.get_values
  FetchGetPatternValues(
      params, db_cntx, *entries,
      [&](size_t idx) -> std::string_view { return (*entries)[idx].ResultKey(); },
      [&](size_t entry_idx, size_t pattern_idx, string value) {
        (*entries)[entry_idx].get_values[pattern_idx] = std::move(value);
      });

  return OpStatus::OK;
}

// Visitor to handle the actual sorting and reply generation
struct SortVisitor {
  const SortParams& params;
  CompactObjType result_type;
  CommandContext* cmd_cntx;
  vector<string> raw_elements;

  template <typename T> void operator()(T& entries) {
    using value_t = typename std::decay_t<decltype(entries)>::value_type;
    auto cmp = params.reversed ? &value_t::greater : &value_t::less;

    DCHECK(params.to_sort);

    DVLOG(2) << "Sorting " << entries.size() << " elements";

    // Sort logic
    if (params.bounds) {
      auto sort_it =
          entries.begin() +
          std::min<uint32_t>(params.bounds->first + params.bounds->second, entries.size());
      std::partial_sort(entries.begin(), sort_it, entries.end(), cmp);
    } else {
      std::sort(entries.begin(), entries.end(), cmp);
    }

    // Fetch GET pattern values if needed
    if (!params.get_patterns.empty()) {
      ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
      DbContext db_cntx{cntx->ns, cntx->db_index(), GetCurrentTimeMs()};
      PopulateGetPatternValues(params, db_cntx, &entries);
    }

    if (!params.store_key) {
      bool is_set = (result_type == OBJ_SET || result_type == OBJ_ZSET);
      bool has_get_patterns = !params.get_patterns.empty();
      auto replier = [entries = std::move(entries), bounds = params.bounds, is_set,
                      has_get_patterns,
                      raw_elements = std::move(raw_elements)](RedisReplyBuilder* rb) {
        DVLOG(2) << "Replying with sorted entries, count: " << entries.size();
        auto [start_it, end_it] = GetSortRange(entries, bounds);

        size_t num_entries = std::distance(start_it, end_it);
        size_t collection_size = has_get_patterns && !entries.empty()
                                     ? num_entries * entries.front().get_values.size()
                                     : num_entries;

        rb->StartCollection(collection_size, is_set ? CollectionType::SET : CollectionType::ARRAY);

        for (auto it = start_it; it != end_it; ++it) {
          if (has_get_patterns && !it->get_values.empty()) {
            // Send all GET pattern values for this entry
            for (const auto& value : it->get_values) {
              rb->SendBulkString(value);
            }
          } else {
            // No GET patterns - send the element itself
            rb->SendBulkString(it->ResultKey());
          }
        }
      };
      cmd_cntx->ReplyWith(std::move(replier));
    } else {
      std::string_view store_key_sv = params.store_key.value();
      ShardId dest_sid = Shard(store_key_sv, shard_set->size());
      OpResult<uint32_t> store_len;
      bool has_get_patterns = !params.get_patterns.empty();

      auto store_callback = [&](Transaction* t, EngineShard* shard) {
        ShardId shard_id = shard->shard_id();
        if (shard_id == dest_sid) {
          auto [start_it, end_it] = GetSortRange(entries, params.bounds);
          store_len =
              OpStore(t->GetOpArgs(shard), store_key_sv, start_it, end_it, has_get_patterns);
        }
        return OpStatus::OK;
      };
      cmd_cntx->tx()->Execute(std::move(store_callback), true);

      if (store_len) {
        cmd_cntx->SendLong(store_len.value());
      } else {
        cmd_cntx->SendError(store_len.status());
      }
    }
  }
};

// Fetches external keys referenced by a BY pattern and fills the sort entries. We deliberately
// perform "read uncommitted" lookups across arbitrary shards, so this helper does not preserve the
// enclosing transaction's isolation guarantees.
OpStatus PopulateSortEntriesFromByPattern(const SortParams& params,
                                          const vector<string>& raw_elements,
                                          const DbContext& db_cntx, SortEntryList* sorted_entries) {
  DCHECK(params.by_pattern);

  vector<vector<pair<size_t, string>>> keys_by_shard(shard_set->size());
  std::string_view pattern = *params.by_pattern;
  size_t star_pos = pattern.find('*');
  DCHECK_NE(star_pos, std::string_view::npos);
  for (size_t i = 0; i < raw_elements.size(); ++i) {
    string ext_key =
        absl::StrCat(pattern.substr(0, star_pos), raw_elements[i], pattern.substr(star_pos + 1));
    ShardId sid = Shard(ext_key, shard_set->size());
    keys_by_shard[sid].emplace_back(i, std::move(ext_key));
  }

  std::visit([&](auto& entries) { entries.resize(raw_elements.size()); }, *sorted_entries);
  atomic_bool parse_error{false};
  shard_set->RunBlockingInParallel([&](EngineShard* shard) {
    ShardId sid = shard->shard_id();
    bool success = std::visit(
        [&](auto& dest) {
          for (const auto& [idx, ext_key] : keys_by_shard[sid]) {
            string external_value = OpFetchStringValue({shard, nullptr, db_cntx}, ext_key);
            auto& entry = dest[idx];
            if (!entry.Parse(std::move(external_value)))
              return false;
            entry.BindValue(&raw_elements[idx]);
          }
          return true;
        },
        *sorted_entries);
    if (!success) {
      parse_error.store(true, memory_order_relaxed);
    }
  });

  if (parse_error.load(memory_order_relaxed)) {
    return OpStatus::INVALID_NUMERIC_RESULT;
  }

  return OpStatus::OK;
}

void SortGeneric(CmdArgList args, CommandContext* cmd_cntx, bool is_read_only) {
  CmdArgParser parser(args);
  std::string_view key = parser.Next();
  SortParams params;
  params.is_read_only = is_read_only;

  while (parser.HasNext()) {
    if (parser.Check("ALPHA")) {
      params.alpha = true;
    } else if (parser.Check("DESC")) {
      params.reversed = true;
    } else if (parser.Check("ASC")) {
      params.reversed = false;
    } else if (parser.Check("LIMIT")) {
      uint32_t offset = parser.Next<uint32_t>();
      uint32_t limit = parser.Next<uint32_t>();
      params.bounds = {offset, limit};
    } else if (!is_read_only && parser.Check("STORE", &params.store_key)) {
    } else if (parser.Check("BY", &params.by_pattern)) {
    } else if (parser.Check("GET")) {
      params.get_patterns.push_back(parser.Next());
    } else {
      LOG_EVERY_T(ERROR, 1) << "Unsupported option " << parser.Peek();
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  if (parser.HasError()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }

  // Validate BY pattern has exactly one '*'
  if (params.by_pattern) {
    size_t star_count = std::count(params.by_pattern->begin(), params.by_pattern->end(), '*');
    if (star_count == 0) {
      // "nosort" pattern - no '*' means skip sorting, preserve insertion order
      params.to_sort = false;
      params.by_pattern.reset();
    } else if (star_count != 1) {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  // Validate GET patterns: each pattern must be "#" or have at most 1 asterisk
  for (const auto& pattern : params.get_patterns) {
    if (pattern == "#") {
      continue;  // Special pattern, always valid
    }
    size_t star_count = std::count(pattern.begin(), pattern.end(), '*');
    if (star_count > 1) {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  // Asserting that if is_read_only as true, then store_key should not exist.
  DVLOG(1) << "is_read_only parameter: " << is_read_only
           << " and store_key parameter: " << bool(params.store_key);
  DCHECK(((is_read_only && !bool(params.store_key)) || !is_read_only));

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  DbContext db_cntx{cntx->ns, cntx->db_index(), GetCurrentTimeMs()};

  CompactObjType source_type = OBJ_STRING;  // undefined in this context

  // "BY nosort" or we need to sort by external keys - fetch unsorted first.
  bool fetch_unsorted = !params.to_sort || params.by_pattern;
  bool single_hop = !bool(params.store_key);
  vector<string> raw_elements;
  ShardId source_sid = Shard(key, shard_set->size());

  // The high level steps are:
  // 1. Fetch container elements (strings only, no parsing) if no sorting needed.
  // 2. If sorting needed, prepare SortEntryList and fetch external keys if BY pattern is used.
  // 3. Perform sorting and generate reply or store result if STORE option is used.
  // 4. If no sorting needed, reply with fetched raw elements (with LIMIT if any).
  if (fetch_unsorted) {
    // Step 1: Fetch container elements (strings only, no parsing)
    OpResult<pair<vector<string>, CompactObjType>> elem_result;

    auto fetch_cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == source_sid) {
        elem_result = OpFetchContainerElements(t->GetOpArgs(shard), key);
      }
      return OpStatus::OK;
    };

    cmd_cntx->tx()->Execute(std::move(fetch_cb), single_hop);

    // elem_result->first is empty both for missing/empty containers and for errors;
    // use elem_result's OpStatus to distinguish actual error cases (e.g. WRONG_TYPE).
    if (elem_result->first.empty()) {
      cmd_cntx->tx()->Conclude();
      if (elem_result == OpStatus::WRONG_TYPE)
        return cmd_cntx->SendError(elem_result.status());
      else
        return static_cast<RedisReplyBuilder*>(cmd_cntx->rb())->SendEmptyArray();
    }

    raw_elements.swap(elem_result->first);
    source_type = elem_result->second;
  }

  if (params.to_sort) {
    // Step 2 and 3: Prepare SortEntryList, fetch external keys if needed, perform sorting

    auto sorted_entries =
        MakeSortEntryList(params.alpha);  // Numeric or alpha depending on params.alpha
    OpStatus sort_status = OpStatus::OK;

    // Handle BY pattern with external key lookups
    if (params.by_pattern) {
      DCHECK(source_type == OBJ_SET || source_type == OBJ_ZSET || source_type == OBJ_LIST);
      sort_status =
          PopulateSortEntriesFromByPattern(params, raw_elements, db_cntx, &sorted_entries);
    } else {  // No BY pattern, sort directly on fetched elements
      OpResult<CompactObjType> fetch_result;
      auto fetch_cb = [&](Transaction* t, EngineShard* shard) {
        // in case of SORT option, we fetch only on the source shard
        if (shard->shard_id() == source_sid) {
          fetch_result = OpFetchSortEntries(t->GetOpArgs(shard), key, &sorted_entries);
        }
        return OpStatus::OK;
      };

      cmd_cntx->tx()->Execute(std::move(fetch_cb), single_hop);
      sort_status = fetch_result.status();
      source_type = *fetch_result;
    }

    if (sort_status != OpStatus::OK) {
      DVLOG(2) << "Sorting failed with status " << sort_status;
      cmd_cntx->tx()->Conclude();
      if (sort_status == OpStatus::WRONG_TYPE)
        return cmd_cntx->SendError(sort_status);
      if (sort_status == OpStatus::INVALID_NUMERIC_RESULT)
        return cmd_cntx->SendError("One or more scores can't be converted into double");
      return static_cast<RedisReplyBuilder*>(cmd_cntx->rb())->SendEmptyArray();
    }

    SortVisitor visitor{params, source_type, cmd_cntx, std::move(raw_elements)};
    std::visit(visitor, sorted_entries);
    return;
  }

  // No sorting required, just reply with fetched raw elements (with LIMIT if any)
  DVLOG(1) << "Replying with unsorted " << raw_elements.size() << " elements from key " << key;
  DCHECK(!raw_elements.empty());

  // Fetch GET pattern values if needed (for unsorted path)
  vector<vector<string>> get_values_per_element;
  if (!params.get_patterns.empty()) {
    // Pre-allocate storage for GET pattern values
    get_values_per_element.resize(raw_elements.size(), vector<string>(params.get_patterns.size()));

    // Use generic fetcher with lambdas to access raw_elements and store in get_values_per_element
    FetchGetPatternValues(
        params, db_cntx, raw_elements,
        [&](size_t idx) -> std::string_view { return raw_elements[idx]; },
        [&](size_t elem_idx, size_t pattern_idx, string value) {
          get_values_per_element[elem_idx][pattern_idx] = std::move(value);
        });
  }

  auto replier = [raw_elements = std::move(raw_elements), params, source_type,
                  get_values = std::move(get_values_per_element)](RedisReplyBuilder* rb) {
    auto [start_it, end_it] = GetSortRange(raw_elements, params.bounds);
    bool is_set = (source_type == OBJ_SET || source_type == OBJ_ZSET);
    size_t num_entries = std::distance(start_it, end_it);
    size_t collection_size =
        !get_values.empty() ? num_entries * get_values.front().size() : num_entries;

    rb->StartCollection(collection_size, is_set ? CollectionType::SET : CollectionType::ARRAY);

    size_t elem_idx = start_it - raw_elements.begin();
    for (auto it = start_it; it != end_it; ++it, ++elem_idx) {
      if (!get_values.empty() && !get_values[elem_idx].empty()) {
        for (const auto& value : get_values[elem_idx]) {
          rb->SendBulkString(value);
        }
      } else {
        rb->SendBulkString(*it);
      }
    }
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

void GenericFamily::Sort(CmdArgList args, CommandContext* cmd_cntx) {
  SortGeneric(args, cmd_cntx, false);
}

void GenericFamily::Sort_RO(CmdArgList args, CommandContext* cmd_cntx) {
  SortGeneric(args, cmd_cntx, true);
}

void GenericFamily::Restore(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  std::string_view serialized_value = ArgS(args, 2);

  auto rdb_version =
      GetRdbVersion(serialized_value, cmd_cntx->server_conn_cntx()->journal_emulated);
  if (!rdb_version) {
    return cmd_cntx->SendError(kInvalidDumpValueErr);
  }

  OpResult<RestoreArgs> restore_args = RestoreArgs::TryFrom(args);
  if (!restore_args) {
    if (restore_args.status() == OpStatus::OUT_OF_RANGE) {
      return cmd_cntx->SendError("Invalid IDLETIME value, must be >= 0");
    } else {
      return cmd_cntx->SendError(restore_args.status());
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRestore(t->GetOpArgs(shard), key, serialized_value, restore_args.value(),
                     rdb_version.value());
  };

  OpStatus result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  switch (result) {
    case OpStatus::OK:
      return cmd_cntx->SendOk();
    case OpStatus::KEY_EXISTS:
      return cmd_cntx->SendError("-BUSYKEY Target key name already exists.");
    case OpStatus::INVALID_VALUE:
      return cmd_cntx->SendError("Bad data format");
    default:
      return cmd_cntx->SendError(result);
  }
}

void GenericFamily::FieldExpire(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view ttl_str = parser.Next();
  uint32_t ttl_sec;
  if (!absl::SimpleAtoi(ttl_str, &ttl_sec) || ttl_sec == 0 || ttl_sec > kMaxTtl) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }
  CmdArgList fields = parser.Tail();

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpFieldExpire(t->GetOpArgs(shard), key, ttl_sec, fields);
  };

  OpResult<vector<long>> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    auto replier = [vec = std::move(result.value())](RedisReplyBuilder* rb) {
      rb->SendLongArr(absl::MakeConstSpan(vec));
    };
    cmd_cntx->ReplyWith(std::move(replier));
  } else {
    cmd_cntx->SendError(result.status());
  }
}

// Returns -2 if key not found, WRONG_TYPE if key is not a set or hash
// -1 if the field does not have associated TTL on it, and -3 if field is not found.
void GenericFamily::FieldTtl(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view field = ArgS(args, 1);

  auto cb = [&](Transaction* t, EngineShard* shard) { return OpFieldTtl(t, shard, key, field); };

  OpResult<long> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    cmd_cntx->SendLong(*result);
    return;
  }

  cmd_cntx->SendError(result.status());
}

void GenericFamily::Move(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view target_db_sv = ArgS(args, 1);
  int32_t target_db;
  if (!absl::SimpleAtoi(target_db_sv, &target_db)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  if (target_db < 0 || uint32_t(target_db) >= absl::GetFlag(FLAGS_dbnum)) {
    return cmd_cntx->SendError(kDbIndOutOfRangeErr);
  }

  if (target_db == cmd_cntx->tx()->GetDbIndex()) {
    return cmd_cntx->SendError("source and destination objects are the same");
  }

  OpStatus res = OpStatus::SKIPPED;
  ShardId target_shard = Shard(key, shard_set->size());
  auto cb = [&](Transaction* t, EngineShard* shard) {
    // MOVE runs as a global transaction and is therefore scheduled on every shard.
    if (target_shard == shard->shard_id()) {
      auto op_args = t->GetOpArgs(shard);
      res = OpMove(op_args, key, target_db);
      // MOVE runs as global command but we want to write the
      // command to only one journal.
      if (op_args.shard->journal()) {
        RecordJournal(op_args, "MOVE"sv, ArgSlice{key, target_db_sv});
      }
    }
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  // Exactly one shard will call OpMove.
  DCHECK(res != OpStatus::SKIPPED);
  cmd_cntx->SendLong(res == OpStatus::OK);
}

void GenericFamily::Rename(CmdArgList args, CommandContext* cmd_cntx) {
  auto reply = RenameGeneric(args, false, cmd_cntx->tx());
  cmd_cntx->SendError(reply);
}

void GenericFamily::RenameNx(CmdArgList args, CommandContext* cmd_cntx) {
  auto reply = RenameGeneric(args, true, cmd_cntx->tx());
  if (!reply.status) {
    return cmd_cntx->SendError(reply.ToSv(), reply.kind);
  }

  OpStatus st = reply.status.value();
  if (st == OpStatus::OK) {
    cmd_cntx->SendLong(1);
  } else if (st == OpStatus::KEY_EXISTS) {
    cmd_cntx->SendLong(0);
  } else {
    cmd_cntx->SendError(st);
  }
}

void GenericFamily::Copy(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  auto [k1, k2] = parser.Next<std::string_view, std::string_view>();
  bool replace = parser.Check("REPLACE");
  if (!parser.Finalize()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }

  if (k1 == k2) {
    return cmd_cntx->SendError("source and destination objects are the same");
  }

  Renamer renamer(cmd_cntx->tx(), k1, k2, shard_set->size(), true);
  auto reply = renamer.Rename(!replace);

  if (!reply.status) {
    return cmd_cntx->SendError(reply);
  }

  OpStatus st = reply.status.value();
  if (st == OpStatus::OK) {
    cmd_cntx->SendLong(1);
  } else if (st == OpStatus::KEY_EXISTS) {
    cmd_cntx->SendLong(0);
  } else if (st == OpStatus::KEY_NOTFOUND) {
    cmd_cntx->SendLong(0);
  } else {
    cmd_cntx->SendError(reply);
  }
}

void GenericFamily::ExpireTime(CmdArgList args, CommandContext* cmd_cntx) {
  ExpireTimeGeneric(args, TimeUnit::SEC, cmd_cntx);
}

void GenericFamily::PExpireTime(CmdArgList args, CommandContext* cmd_cntx) {
  ExpireTimeGeneric(args, TimeUnit::MSEC, cmd_cntx);
}

void GenericFamily::Ttl(CmdArgList args, CommandContext* cmd_cntx) {
  TtlGeneric(args, TimeUnit::SEC, cmd_cntx);
}

void GenericFamily::Pttl(CmdArgList args, CommandContext* cmd_cntx) {
  TtlGeneric(args, TimeUnit::MSEC, cmd_cntx);
}

void GenericFamily::Select(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  int64_t index;
  if (!absl::SimpleAtoi(key, &index)) {
    return cmd_cntx->SendError(kInvalidDbIndErr);
  }
  if (IsClusterEnabled() && index != 0) {
    return cmd_cntx->SendError("SELECT is not allowed in cluster mode");
  }
  if (index < 0 || index >= absl::GetFlag(FLAGS_dbnum)) {
    return cmd_cntx->SendError(kDbIndOutOfRangeErr);
  }
  auto* cntx = cmd_cntx->server_conn_cntx();
  if (cntx->conn_state.db_index == index) {
    // accept a noop.
    return cmd_cntx->SendOk();
  }

  // Only global/non-atomic multi transactions can change dbs safely,
  // locked-ahead transactions acquired keys ahead for a specific dbindex
  if (auto* tx = cmd_cntx->tx(); tx && tx->IsMulti()) {
    if (tx->GetMultiMode() == Transaction::LOCK_AHEAD)
      return cmd_cntx->SendError("SELECT is not allowed in regular EXEC/EVAL");
  }

  if (cntx->conn_state.exec_info.IsRunning()) {
    return cmd_cntx->SendError("SELECT is not allowed in a transaction");
  }

  cntx->conn_state.db_index = index;
  auto cb = [ns = cntx->ns, index](EngineShard* shard) {
    auto& db_slice = ns->GetDbSlice(shard->shard_id());
    db_slice.ActivateDb(index);
    return OpStatus::OK;
  };
  shard_set->RunBriefInParallel(std::move(cb));

  return cmd_cntx->SendOk();
}

void GenericFamily::Dump(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  DVLOG(1) << "Dumping before ::ScheduleSingleHopT " << key;
  auto cb = [&](Transaction* t, EngineShard* shard) { return OpDump(t->GetOpArgs(shard), key); };
  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    DVLOG(1) << "Dump " << cmd_cntx->tx()->DebugId() << ": " << key << ", dump size "
             << result.value().size();
    auto reply = [data = std::move(*result)](RedisReplyBuilder* rb) { rb->SendBulkString(data); };
    cmd_cntx->ReplyWith(std::move(reply));
  } else {
    static_cast<RedisReplyBuilder*>(cmd_cntx->rb())->SendNull();
  }
}

void GenericFamily::Type(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<CompactObjType> {
    auto& db_slice = t->GetDbSlice(shard->shard_id());
    auto it = db_slice.FindReadOnly(t->GetDbContext(), key).it;
    if (!it.is_done()) {
      return it->second.ObjType();
    } else {
      return OpStatus::KEY_NOTFOUND;
    }
  };
  OpResult<CompactObjType> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!result) {
    cmd_cntx->SendSimpleString("none");
  } else {
    cmd_cntx->SendSimpleString(ObjTypeToString(result.value()));
  }
}

void GenericFamily::Time(CmdArgList args, CommandContext* cmd_cntx) {
  uint64_t now_usec;
  if (cmd_cntx->tx()) {
    now_usec = cmd_cntx->tx()->GetDbContext().time_now_ms * 1000;
  } else {
    now_usec = absl::GetCurrentTimeNanos() / 1000;
  }
  DCHECK_GT(now_usec, 0u);

  auto replier = [now_usec](RedisReplyBuilder* rb) {
    rb->StartArray(2);
    rb->SendLong(now_usec / 1000000);
    rb->SendLong(now_usec % 1000000);
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

void GenericFamily::Echo(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto replier = [key = string(key)](RedisReplyBuilder* rb) { rb->SendBulkString(key); };
  cmd_cntx->ReplyWith(std::move(replier));
}

// SCAN cursor [MATCH <glob>] [TYPE <type>] [COUNT <count>] [BUCKET <bucket_id>]
// [ATTR <mask>] [MLCGE <len>]
void GenericFamily::Scan(CmdArgList args, CommandContext* cmd_cntx) {
  string_view token = ArgS(args, 0);
  uint64_t cursor = 0;
  if (!absl::SimpleAtoi(token, &cursor)) {
    if (absl::EqualsIgnoreCase(token, "HELP")) {
      auto replier = [](RedisReplyBuilder* rb) {
        string_view help_arr[] = {
            "SCAN cursor [MATCH <glob>] [TYPE <type>] [COUNT <count>] [ATTR <mask>] [MINMSZ "
            "<len>]",
            "    MATCH <glob> - pattern to match keys against",
            "    TYPE <type> - type of values to match",
            "    COUNT <count> - number of keys to return",
            "    ATTR <v|p|a|u> - filter by attributes: v - volatile (ttl), ",
            "    p - persistent (no ttl), a - accessed since creation, u - untouched",
            "    MINMSZ <len> - keeps keys with values, whose allocated size is greater or equal "
            "to",
            "        the specified length",
        };

        rb->SendSimpleStrArr(help_arr);
      };
      return cmd_cntx->ReplyWith(std::move(replier));
    }
    return cmd_cntx->SendError("invalid cursor");
  }

  OpResult<ScanOpts> ops = ScanOpts::TryFrom(args.subspan(1));
  if (!ops) {
    DVLOG(1) << "Scan invalid args - return " << ops << " to the user";
    return cmd_cntx->SendError(ops.status());
  }

  const ScanOpts& scan_op = ops.value();

  StringVec keys;
  cursor = ScanGeneric(cursor, scan_op, &keys, cmd_cntx->server_conn_cntx());

  auto replier = [cursor, keys = std::move(keys)](RedisReplyBuilder* builder) {
    RedisReplyBuilder::ArrayScope scope{builder, 2};
    builder->SendBulkString(absl::StrCat(cursor));
    builder->SendBulkStrArr(keys);
  };

  cmd_cntx->ReplyWith(std::move(replier));
}

void GenericFamily::Rm(CmdArgList args, CommandContext* cmd_cntx) {
  string_view token = ArgS(args, 0);
  uint64_t cursor = 0;
  if (!absl::SimpleAtoi(token, &cursor)) {
    if (absl::EqualsIgnoreCase(token, "HELP")) {
      auto replier = [](RedisReplyBuilder* rb) {
        string_view help_arr[] = {
            "RM cursor [MATCH <glob>] [TYPE <type>] [COUNT <count>]",
            "    MATCH <glob> - pattern to match keys against",
            "    TYPE <type> - type of values to match (string, list, set, zset, hash, stream)",
            "    COUNT <count> - number of keys to delete per call",
        };
        rb->SendSimpleStrArr(help_arr);
      };
      return cmd_cntx->ReplyWith(std::move(replier));
    }
    return cmd_cntx->SendError("invalid cursor", kSyntaxErrType);
  }

  OpResult<ScanOpts> ops = ScanOpts::TryFrom(args.subspan(1));
  if (!ops) {
    return cmd_cntx->SendError(ops.status());
  }

  uint32_t deleted = 0;
  cursor = RmGeneric(cursor, ops.value(), &deleted, cmd_cntx->server_conn_cntx());

  auto replier = [cursor, deleted](RedisReplyBuilder* rb) {
    RedisReplyBuilder::ArrayScope scope{rb, 2};
    rb->SendBulkString(absl::StrCat(cursor));
    rb->SendLong(deleted);
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

OpResult<uint32_t> GenericFamily::OpExists(const OpArgs& op_args, const ShardArgs& keys) {
  DVLOG(1) << "Exists: " << keys.Front();
  auto& db_slice = op_args.GetDbSlice();
  uint32_t res = 0;

  for (string_view key : keys) {
    auto find_res = db_slice.FindReadOnly(op_args.db_cntx, key);
    res += IsValid(find_res.it);
  }
  return res;
}

void GenericFamily::RandomKey(CmdArgList args, CommandContext* cmd_cntx) {
  const static size_t kMaxAttempts = 3;

  absl::BitGen bitgen;
  atomic_size_t candidates_counter{0};
  auto* cntx = cmd_cntx->server_conn_cntx();
  DbContext db_cntx{cntx->ns, cntx->conn_state.db_index, GetCurrentTimeMs()};
  ScanOpts scan_opts;
  scan_opts.limit = 3;  // number of entries per shard
  std::vector<StringVec> candidates_collection(shard_set->size());

  shard_set->RunBriefInParallel(
      [&](EngineShard* shard) {
        auto* prime_table =
            cntx->ns->GetDbSlice(shard->shard_id()).GetTables(db_cntx.db_index).first;
        if (prime_table->size() == 0) {
          return;
        }

        StringVec* candidates = &candidates_collection[shard->shard_id()];

        for (size_t i = 0; i <= kMaxAttempts; ++i) {
          if (!candidates->empty()) {
            break;
          }
          uint64_t cursor = 0;  // scans from the start of the shard after reaching kMaxAttemps
          if (i < kMaxAttempts) {
            cursor = prime_table->GetRandomCursor(&bitgen).token();
          }
          OpScan({shard, 0u, db_cntx}, scan_opts, &cursor, candidates);
        }

        candidates_counter.fetch_add(candidates->size(), memory_order_relaxed);
      },
      [&](ShardId) { return true; });

  auto candidates_count = candidates_counter.load(memory_order_relaxed);

  size_t random_idx = absl::Uniform<size_t>(bitgen, 0, candidates_count);
  for (auto& candidate : candidates_collection) {
    if (random_idx >= candidate.size()) {
      random_idx -= candidate.size();
    } else {
      auto replier = [key = std::move(candidate[random_idx])](RedisReplyBuilder* builder) {
        builder->SendBulkString(key);
      };
      return cmd_cntx->ReplyWith(std::move(replier));
    }
  }
  static_cast<RedisReplyBuilder*>(cmd_cntx->rb())->SendNull();
}

using CI = CommandId;

#define HFUNC(x) SetHandler(&GenericFamily::x)

namespace acl {

constexpr uint32_t kDel = KEYSPACE | WRITE | SLOW;
constexpr uint32_t kPing = FAST | CONNECTION;
constexpr uint32_t kEcho = FAST | CONNECTION;
constexpr uint32_t kExists = KEYSPACE | READ | FAST;
constexpr uint32_t kTouch = KEYSPACE | READ | FAST;
constexpr uint32_t kExpire = KEYSPACE | WRITE | FAST;
constexpr uint32_t kExpireAt = KEYSPACE | WRITE | FAST;
constexpr uint32_t kPersist = KEYSPACE | WRITE | FAST;
constexpr uint32_t kKeys = KEYSPACE | READ | SLOW | DANGEROUS;
constexpr uint32_t kPExpireAt = KEYSPACE | WRITE | FAST;
constexpr uint32_t kPExpire = KEYSPACE | WRITE | FAST;
constexpr uint32_t kRename = KEYSPACE | WRITE | SLOW;
constexpr uint32_t kCopy = KEYSPACE | WRITE | SLOW;
constexpr uint32_t kRenamNX = KEYSPACE | WRITE | FAST;
constexpr uint32_t kSelect = FAST | CONNECTION;
constexpr uint32_t kScan = KEYSPACE | READ | SLOW;
constexpr uint32_t kRm = KEYSPACE | WRITE | SLOW | DANGEROUS;
constexpr uint32_t kTTL = KEYSPACE | READ | FAST;
constexpr uint32_t kPTTL = KEYSPACE | READ | FAST;
constexpr uint32_t kFieldTtl = KEYSPACE | READ | FAST;
constexpr uint32_t kTime = FAST;
constexpr uint32_t kType = KEYSPACE | READ | FAST;
constexpr uint32_t kDump = KEYSPACE | READ | SLOW;
constexpr uint32_t kUnlink = KEYSPACE | WRITE | FAST;
constexpr uint32_t kStick = KEYSPACE | WRITE | FAST;
constexpr uint32_t kSort = WRITE | SET | SORTEDSET | LIST | SLOW | DANGEROUS;
constexpr uint32_t kSortRO = READ | SET | SORTEDSET | LIST | SLOW | DANGEROUS;
constexpr uint32_t kMove = KEYSPACE | WRITE | FAST;
constexpr uint32_t kRestore = KEYSPACE | WRITE | SLOW | DANGEROUS;
constexpr uint32_t kExpireTime = KEYSPACE | READ | FAST;
constexpr uint32_t kPExpireTime = KEYSPACE | READ | FAST;
constexpr uint32_t kFieldExpire = WRITE | HASH | SET | FAST;
}  // namespace acl

void GenericFamily::Register(CommandRegistry* registry) {
  constexpr auto kSelectOpts = CO::LOADING | CO::FAST;
  registry->StartFamily();
  *registry
      << CI{"DEL", CO::JOURNALED, -2, 1, -1, acl::kDel}.SetAsyncHandler(CmdDel)
      << CI{"DELEX", CO::JOURNALED | CO::FAST, -2, 1, 1, acl::kDel}.HFUNC(Delex)
      /* Redis compatibility:
       * We don't allow PING during loading since in Redis PING is used as
       * failure detection, and a loading server is considered to be
       * not available. */
      << CI{"PING", CO::FAST, -1, 0, 0, acl::kPing}.HFUNC(Ping)
      << CI{"ECHO", CO::LOADING | CO::FAST, 2, 0, 0, acl::kEcho}.HFUNC(Echo)
      << CI{"EXISTS", CO::READONLY | CO::FAST, -2, 1, -1, acl::kExists}.HFUNC(Exists)
      << CI{"TOUCH", CO::READONLY | CO::FAST, -2, 1, -1, acl::kTouch}.HFUNC(Exists)
      << CI{"EXPIRE", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -3, 1, 1, acl::kExpire}.HFUNC(
             Expire)
      << CI{"EXPIREAT", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -3, 1, 1, acl::kExpireAt}
             .HFUNC(ExpireAt)
      << CI{"PERSIST", CO::JOURNALED | CO::FAST, 2, 1, 1, acl::kPersist}.HFUNC(Persist)
      << CI{"KEYS", CO::READONLY, 2, 0, 0, acl::kKeys}.HFUNC(Keys)
      << CI{"PEXPIREAT", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -3, 1, 1, acl::kPExpireAt}
             .HFUNC(PexpireAt)
      << CI{"PEXPIRE", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -3, 1, 1, acl::kPExpire}
             .HFUNC(Pexpire)
      << CI{"FIELDEXPIRE", CO::JOURNALED | CO::FAST | CO::DENYOOM, -4, 1, 1, acl::kFieldExpire}
             .HFUNC(FieldExpire)
      << CI{"RENAME", CO::JOURNALED | CO::NO_AUTOJOURNAL, 3, 1, 2, acl::kRename}.HFUNC(Rename)
      << CI{"COPY", CO::JOURNALED | CO::NO_AUTOJOURNAL, -3, 1, 2, acl::kCopy}.HFUNC(Copy)
      << CI{"RENAMENX", CO::JOURNALED | CO::NO_AUTOJOURNAL, 3, 1, 2, acl::kRenamNX}.HFUNC(RenameNx)
      << CI{"SELECT", kSelectOpts, 2, 0, 0, acl::kSelect}.HFUNC(Select)
      << CI{"SCAN", CO::READONLY | CO::FAST | CO::LOADING, -2, 0, 0, acl::kScan}.HFUNC(Scan)
      << CI{"RM", CO::JOURNALED | CO::NO_AUTOJOURNAL, -2, 0, 0, acl::kRm}.HFUNC(Rm)
      << CI{"TTL", CO::READONLY | CO::FAST, 2, 1, 1, acl::kTTL}.HFUNC(Ttl)
      << CI{"PTTL", CO::READONLY | CO::FAST, 2, 1, 1, acl::kPTTL}.HFUNC(Pttl)
      << CI{"FIELDTTL", CO::READONLY | CO::FAST, 3, 1, 1, acl::kFieldTtl}.HFUNC(FieldTtl)
      << CI{"TIME", CO::LOADING | CO::FAST, 1, 0, 0, acl::kTime}.HFUNC(Time)
      << CI{"TYPE", CO::READONLY | CO::FAST | CO::LOADING, 2, 1, 1, acl::kType}.HFUNC(Type)
      << CI{"DUMP", CO::READONLY, 2, 1, 1, acl::kDump}.HFUNC(Dump)
      << CI{"UNLINK", CO::JOURNALED, -2, 1, -1, acl::kUnlink}.SetAsyncHandler(CmdDel)
      << CI{"STICK", CO::JOURNALED, -2, 1, -1, acl::kStick}.HFUNC(Stick)
      << CI{"SORT", CO::JOURNALED | CO::STORE_LAST_KEY, -2, 1, 1, acl::kSort}.HFUNC(Sort)
      << CI{"SORT_RO", CO::READONLY, -2, 1, 1, acl::kSortRO}.HFUNC(Sort_RO)
      << CI{"MOVE", CO::JOURNALED | CO::GLOBAL_TRANS | CO::NO_AUTOJOURNAL, 3, 1, 1, acl::kMove}
             .HFUNC(Move)
      << CI{"RESTORE", CO::JOURNALED, -4, 1, 1, acl::kRestore}.HFUNC(Restore)
      << CI{"RANDOMKEY", CO::READONLY, 1, 0, 0, 0}.HFUNC(RandomKey)
      << CI{"EXPIRETIME", CO::READONLY | CO::FAST, 2, 1, 1, acl::kExpireTime}.HFUNC(ExpireTime)
      << CI{"PEXPIRETIME", CO::READONLY | CO::FAST, 2, 1, 1, acl::kPExpireTime}.HFUNC(PExpireTime);
}

}  // namespace dfly


================================================
FILE: src/server/generic_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "base/flags.h"
#include "facade/facade_types.h"
#include "server/tx_base.h"

ABSL_DECLARE_FLAG(uint32_t, dbnum);

namespace dfly {

using facade::CmdArgList;
using facade::OpResult;

class GenericFamily {
 public:
  static void Register(CommandRegistry* registry);

  // Accessed by Service::Exec and Service::Watch as an utility.
  static OpResult<uint32_t> OpExists(const OpArgs& op_args, const ShardArgs& keys);
  static OpResult<uint32_t> OpDel(const OpArgs& op_args, const ShardArgs& keys, bool async);

 private:
  static void Delex(CmdArgList args, CommandContext* cmd_cntx);
  static void Ping(CmdArgList args, CommandContext* cmd_cntx);
  static void Exists(CmdArgList args, CommandContext* cmd_cntx);
  static void Expire(CmdArgList args, CommandContext* cmd_cntx);
  static void ExpireAt(CmdArgList args, CommandContext* cmd_cntx);
  static void Persist(CmdArgList args, CommandContext* cmd_cntx);
  static void Keys(CmdArgList args, CommandContext* cmd_cntx);
  static void PexpireAt(CmdArgList args, CommandContext* cmd_cntx);
  static void Pexpire(CmdArgList args, CommandContext* cmd_cntx);
  static void Stick(CmdArgList args, CommandContext* cmd_cntx);
  static void Sort(CmdArgList args, CommandContext* cmd_cntx);
  static void Sort_RO(CmdArgList args, CommandContext* cmd_cntx);
  static void Move(CmdArgList args, CommandContext* cmd_cntx);

  static void Rename(CmdArgList args, CommandContext* cmd_cntx);
  static void RenameNx(CmdArgList args, CommandContext* cmd_cntx);
  static void Copy(CmdArgList args, CommandContext* cmd_cntx);
  static void ExpireTime(CmdArgList args, CommandContext* cmd_cntx);
  static void PExpireTime(CmdArgList args, CommandContext* cmd_cntx);
  static void Ttl(CmdArgList args, CommandContext* cmd_cntx);
  static void Pttl(CmdArgList args, CommandContext* cmd_cntx);

  static void Echo(CmdArgList args, CommandContext* cmd_cntx);
  static void Select(CmdArgList args, CommandContext* cmd_cntx);
  static void Scan(CmdArgList args, CommandContext* cmd_cntx);
  static void Rm(CmdArgList args, CommandContext* cmd_cntx);
  static void Time(CmdArgList args, CommandContext* cmd_cntx);
  static void Type(CmdArgList args, CommandContext* cmd_cntx);
  static void Dump(CmdArgList args, CommandContext* cmd_cntx);
  static void Restore(CmdArgList args, CommandContext* cmd_cntx);
  static void RandomKey(CmdArgList args, CommandContext* cmd_cntx);
  static void FieldTtl(CmdArgList args, CommandContext* cmd_cntx);
  static void FieldExpire(CmdArgList args, CommandContext* cmd_cntx);
};

}  // namespace dfly


================================================
FILE: src/server/generic_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/generic_family.h"

extern "C" {
#include "redis/rdb.h"
}

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/test_utils.h"
#include "server/transaction.h"

using namespace testing;
using namespace std;
using namespace util;
using absl::StrCat;

namespace dfly {

class GenericFamilyTest : public BaseFamilyTest {};

TEST_F(GenericFamilyTest, Expire) {
  Run({"set", "key", "val"});

  // sideqik expiry limit
  auto resp = Run({"expire", "key", absl::StrCat(5 * 365 * 24 * 3600)});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"expire", "key", "1"});
  EXPECT_THAT(resp, IntArg(1));
  AdvanceTime(1000);
  resp = Run({"get", "key"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  Run({"set", "key", "val"});
  resp = Run({"pexpireat", "key", absl::StrCat(TEST_current_time_ms + 2000)});
  EXPECT_THAT(resp, IntArg(1));

  // override
  resp = Run({"pexpireat", "key", absl::StrCat(TEST_current_time_ms + 3000)});
  EXPECT_THAT(resp, IntArg(1));

  AdvanceTime(2999);
  resp = Run({"get", "key"});
  EXPECT_THAT(resp, "val");

  AdvanceTime(1);
  resp = Run({"get", "key"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // pexpire test
  Run({"set", "key", "val"});
  resp = Run({"pexpire", "key", absl::StrCat(2000)});
  EXPECT_THAT(resp, IntArg(1));

  // expire time override
  resp = Run({"pexpire", "key", absl::StrCat(3000)});
  EXPECT_THAT(resp, IntArg(1));

  AdvanceTime(2999);
  resp = Run({"get", "key"});
  EXPECT_THAT(resp, "val");

  AdvanceTime(1);
  resp = Run({"get", "key"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(GenericFamilyTest, ExpireOptions) {
  // NX and XX are mutually exclusive
  Run({"set", "key", "val"});
  auto resp = Run({"expire", "key", "3600", "NX", "XX"});
  ASSERT_THAT(resp, ErrArg("NX and XX options at the same time are not compatible"));

  // GT and LT are mutually exclusive
  resp = Run({"expire", "key", "3600", "GT", "LT"});
  ASSERT_THAT(resp, ErrArg("GT and LT options at the same time are not compatible"));

  // NX option should be added since there is no expiry
  resp = Run({"expire", "key", "3600", "NX"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 3600);

  // running again with NX option, should not change expiry
  resp = Run({"expire", "key", "42", "NX"});
  EXPECT_THAT(resp, IntArg(0));

  // given a key with no expiry
  Run({"set", "key2", "val"});
  resp = Run({"expire", "key2", "404", "XX"});
  // XX does not apply expiry since key has no existing expiry
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key2"});
  EXPECT_THAT(resp.GetInt(), -1);

  // GT does not apply since key has no "inf" expiry
  resp = Run({"expire", "key2", "404", "GT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key2"});
  EXPECT_THAT(resp.GetInt(), -1);

  // LT applies
  resp = Run({"expire", "key2", "404", "LT"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"ttl", "key2"});
  EXPECT_THAT(resp.GetInt(), 404);

  Run({"persist", "key"});

  // set expiry to 101
  resp = Run({"expire", "key", "101"});
  EXPECT_THAT(resp, IntArg(1));

  // GT should not apply expiry since new is not greater than the current one
  resp = Run({"expire", "key", "100", "GT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101);

  // GT should apply expiry since new is greater than the current one
  resp = Run({"expire", "key", "102", "GT"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 102);

  // GT should not apply since expiry is smaller than current
  resp = Run({"expire", "key", "101", "GT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 102);

  // LT should apply new expiry is smaller than current
  resp = Run({"expire", "key", "101", "LT"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101);

  resp = Run({"expire", "key", "102", "LT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101);

  // NX with GT, first sets expiry, updates only to larger values
  Run({"persist", "key"});
  Run({"expire", "key", "5", "NX", "GT"});
  EXPECT_THAT(Run({"ttl", "key"}), IntArg(5));

  Run({"expire", "key", "3", "NX", "GT"});
  EXPECT_THAT(Run({"ttl", "key"}), IntArg(5));

  Run({"expire", "key", "7", "NX", "GT"});
  EXPECT_THAT(Run({"ttl", "key"}), IntArg(7));
}

TEST_F(GenericFamilyTest, ExpireAtOptions) {
  auto test_time_ms = TEST_current_time_ms;
  auto time_s = (test_time_ms + 500) / 1000;
  auto test_time_s = time_s;

  Run({"set", "key", "val"});
  // NX and XX are mutually exclusive
  auto resp = Run({"expireat", "key", "3600", "NX", "XX"});
  ASSERT_THAT(resp, ErrArg("NX and XX options at the same time are not compatible"));

  // GT and LT are mutually exclusive
  resp = Run({"expireat", "key", "3600", "GT", "LT"});
  ASSERT_THAT(resp, ErrArg("GT and LT options at the same time are not compatible"));

  // NX option should be added since there is no expiry
  test_time_s = time_s + 5;
  resp = Run({"expireat", "key", absl::StrCat(test_time_s), "NX"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_s, CheckedInt({"EXPIRETIME", "key"}));

  // running again with NX option, should not change expiry
  test_time_s = time_s + 9;
  resp = Run({"expireat", "key", absl::StrCat(test_time_s), "NX"});
  EXPECT_THAT(resp, IntArg(0));

  // NX option with expired time is not accepted and so it doesn't delete the value
  resp = Run({"expireat", "key", absl::StrCat(TEST_current_time_ms / 1000 - 10), "NX"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_THAT(Run({"exists", "key"}), IntArg(1));

  // given a key with no expiry
  Run({"set", "key2", "val"});
  test_time_s = time_s + 9;
  resp = Run({"expireat", "key2", absl::StrCat(test_time_s), "XX"});
  // XX does not apply expiry since key has no existing expiry
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key2"});
  EXPECT_THAT(resp.GetInt(), -1);

  // set expiry to 101
  test_time_s = time_s + 101;
  resp = Run({"expireat", "key", absl::StrCat(test_time_s)});
  EXPECT_THAT(resp, IntArg(1));

  // GT should not apply expiry since new is not greater than the current one
  auto less_test_time_s = time_s + 99;
  resp = Run({"expireat", "key", absl::StrCat(less_test_time_s), "GT"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(test_time_s, CheckedInt({"EXPIRETIME", "key"}));

  // GT should apply expiry since new is greater than the current one
  test_time_s = time_s + 105;
  resp = Run({"expireat", "key", absl::StrCat(test_time_s), "GT"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_s, CheckedInt({"EXPIRETIME", "key"}));

  // LT should apply new expiry is smaller than current
  test_time_s = time_s + 101;
  resp = Run({"expireat", "key", absl::StrCat(test_time_s), "LT"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_s, CheckedInt({"EXPIRETIME", "key"}));

  // LT should not apply expiry since new is not lesser than the current one
  auto gt_test_time_s = time_s + 102;
  resp = Run({"expireat", "key", absl::StrCat(gt_test_time_s), "LT"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(test_time_s, CheckedInt({"EXPIRETIME", "key"}));
}

TEST_F(GenericFamilyTest, PExpireOptions) {
  // NX and XX are mutually exclusive
  Run({"set", "key", "val"});
  auto resp = Run({"pexpire", "key", "3600", "NX", "XX"});
  ASSERT_THAT(resp, ErrArg("NX and XX options at the same time are not compatible"));

  // GT and LT are mutually exclusive
  resp = Run({"pexpire", "key", "3600", "GT", "LT"});
  ASSERT_THAT(resp, ErrArg("GT and LT options at the same time are not compatible"));

  // NX option should be added since there is no expiry
  resp = Run({"pexpire", "key", "3600000", "NX"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 3600000);

  // running again with NX option, should not change expiry
  resp = Run({"pexpire", "key", "42", "NX"});
  EXPECT_THAT(resp, IntArg(0));

  // given a key with no expiry
  Run({"set", "key2", "val"});
  resp = Run({"pexpire", "key2", "404", "XX"});
  // XX does not apply expiry since key has no existing expiry
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"pttl", "key2"});
  EXPECT_THAT(resp.GetInt(), -1);

  // set expiry to 101
  resp = Run({"pexpire", "key", "101000"});
  EXPECT_THAT(resp, IntArg(1));

  // GT should not apply expiry since new is not greater than the current one
  resp = Run({"pexpire", "key", "100000", "GT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101000);

  // GT should apply expiry since new is greater than the current one
  resp = Run({"pexpire", "key", "102000", "GT"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 102000);

  // GT should not apply since expiry is smaller than current
  resp = Run({"pexpire", "key", "101000", "GT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 102000);

  // LT should apply new expiry is smaller than current
  resp = Run({"pexpire", "key", "101000", "LT"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101000);

  // LT should not apply since expiry is greater than current
  resp = Run({"pexpire", "key", "102000", "LT"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"pttl", "key"});
  EXPECT_THAT(resp.GetInt(), 101000);
}

TEST_F(GenericFamilyTest, PExpireAtOptions) {
  auto test_time_ms = TEST_current_time_ms;
  Run({"set", "key", "val"});
  // NX and XX are mutually exclusive
  auto resp = Run({"pexpireat", "key", "3600", "NX", "XX"});
  ASSERT_THAT(resp, ErrArg("NX and XX options at the same time are not compatible"));

  // GT and LT are mutually exclusive
  resp = Run({"pexpireat", "key", "3600", "GT", "LT"});
  ASSERT_THAT(resp, ErrArg("GT and LT options at the same time are not compatible"));

  // NX option should be added since there is no expiry
  test_time_ms = TEST_current_time_ms + 3600;
  resp = Run({"pexpireat", "key", absl::StrCat(test_time_ms), "NX"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_ms, CheckedInt({"PEXPIRETIME", "key"}));

  // running again with NX option, should not change expiry
  test_time_ms = TEST_current_time_ms + 42000;
  resp = Run({"pexpireat", "key", absl::StrCat(test_time_ms), "NX"});
  EXPECT_THAT(resp, IntArg(0));

  // given a key with no expiry
  Run({"set", "key2", "val"});
  test_time_ms = TEST_current_time_ms + 404;
  resp = Run({"pexpireat", "key2", absl::StrCat(test_time_ms), "XX"});
  // XX does not apply expiry since key has no existing expiry
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"ttl", "key2"});
  EXPECT_THAT(resp.GetInt(), -1);

  // set expiry to 101
  test_time_ms = TEST_current_time_ms + 101;
  resp = Run({"pexpireat", "key", absl::StrCat(test_time_ms)});
  EXPECT_THAT(resp, IntArg(1));

  // GT should not apply expiry since new is not greater than the current one
  auto less_test_time_ms = TEST_current_time_ms + 100;
  resp = Run({"pexpireat", "key", absl::StrCat(less_test_time_ms), "GT"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(test_time_ms, CheckedInt({"PEXPIRETIME", "key"}));

  // GT should apply expiry since new is greater than the current one
  test_time_ms = TEST_current_time_ms + 105;
  resp = Run({"pexpireat", "key", absl::StrCat(test_time_ms), "GT"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_ms, CheckedInt({"PEXPIRETIME", "key"}));

  // LT should apply new expiry is smaller than current
  test_time_ms = TEST_current_time_ms + 101;
  resp = Run({"pexpireat", "key", absl::StrCat(test_time_ms), "LT"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(test_time_ms, CheckedInt({"PEXPIRETIME", "key"}));

  // LT should not apply expiry since new is not lesser than the current one
  auto gt_test_time_ms = TEST_current_time_ms + 102;
  resp = Run({"pexpireat", "key", absl::StrCat(gt_test_time_ms), "LT"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(test_time_ms, CheckedInt({"PEXPIRETIME", "key"}));
}

TEST_F(GenericFamilyTest, Del) {
  for (size_t i = 0; i < 1000; ++i) {
    Run({"set", StrCat("foo", i), "1"});
    Run({"set", StrCat("bar", i), "1"});
  }

  ASSERT_EQ(2000, CheckedInt({"dbsize"}));

  auto exist_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      int64_t resp = CheckedInt({"exists", StrCat("foo", i), StrCat("bar", i)});
      ASSERT_TRUE(2 == resp || resp == 0) << resp << " " << i;
    }
  });

  auto del_fb = pp_->at(2)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      auto resp = CheckedInt({"del", StrCat("foo", i), StrCat("bar", i)});
      ASSERT_EQ(2, resp);
    }
  });

  exist_fb.Join();
  del_fb.Join();

  Run({"setex", "k1", "10", "bar"});
  Run({"del", "k1"});
}

TEST_F(GenericFamilyTest, TTL) {
  EXPECT_EQ(-2, CheckedInt({"ttl", "foo"}));
  EXPECT_EQ(-2, CheckedInt({"pttl", "foo"}));
  Run({"set", "foo", "bar"});
  EXPECT_EQ(-1, CheckedInt({"ttl", "foo"}));
  EXPECT_EQ(-1, CheckedInt({"pttl", "foo"}));
}

TEST_F(GenericFamilyTest, Exists) {
  Run({"mset", "x", "0", "y", "1"});
  auto resp = Run({"exists", "x", "y", "x"});
  EXPECT_THAT(resp, IntArg(3));
}

TEST_F(GenericFamilyTest, Touch) {
  RespExpr resp;

  Run({"mset", "x", "0", "y", "1"});
  resp = Run({"touch", "x", "y", "x"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"touch", "z", "x", "w"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(GenericFamilyTest, Rename) {
  RespExpr resp;
  string b_val(32, 'b');
  string x_val(32, 'x');

  resp = Run({"mset", "x", x_val, "b", b_val});
  ASSERT_EQ(resp, "OK");
  ASSERT_EQ(2, last_cmd_dbg_info_.shards_count);

  resp = Run({"rename", "z", "b"});
  ASSERT_THAT(resp, ErrArg("no such key"));

  resp = Run({"rename", "x", "b"});
  ASSERT_EQ(resp, "OK");

  int64_t val = CheckedInt({"get", "x"});
  ASSERT_EQ(kint64min, val);  // does not exist

  ASSERT_EQ(x_val, Run({"get", "b"}));  // swapped.

  EXPECT_EQ(CheckedInt({"exists", "x", "b"}), 1);

  const char* keys[2] = {"b", "x"};
  auto ren_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 200; ++i) {
      int j = i % 2;
      auto resp = Run({"rename", keys[j], keys[1 - j]});
      ASSERT_EQ(resp, "OK");
    }
  });

  auto exist_fb = pp_->at(2)->LaunchFiber([&] {
    for (size_t i = 0; i < 300; ++i) {
      int64_t resp = CheckedInt({"exists", "x", "b"});
      ASSERT_EQ(1, resp);
    }
  });

  exist_fb.Join();
  ren_fb.Join();
}

TEST_F(GenericFamilyTest, RenameList) {
  for (string_view dest : {"b", "y", "z"}) {
    EXPECT_EQ(1, CheckedInt({"lpush", "x", "elem"}));
    Metrics metrics = GetMetrics();

    size_t list_usage = metrics.db_stats[0].memory_usage_by_type[OBJ_LIST];
    size_t string_usage = metrics.db_stats[0].memory_usage_by_type[OBJ_STRING];
    ASSERT_GT(list_usage, 0);
    ASSERT_EQ(string_usage, 0);

    auto resp = Run({"rename", "x", dest});
    ASSERT_EQ(resp, "OK");
    if (dest == "b") {
      ASSERT_EQ(2, last_cmd_dbg_info_.shards_count);
    } else {
      ASSERT_EQ(1, last_cmd_dbg_info_.shards_count);
    }

    metrics = GetMetrics();
    size_t list_usage_after = metrics.db_stats[0].memory_usage_by_type[OBJ_LIST];
    string_usage = metrics.db_stats[0].memory_usage_by_type[OBJ_STRING];
    ASSERT_EQ(list_usage_after, list_usage);
    ASSERT_EQ(string_usage, 0);

    EXPECT_EQ(0, CheckedInt({"del", "x"}));
    EXPECT_EQ(1, CheckedInt({"del", dest}));
  }
}

TEST_F(GenericFamilyTest, RenameBinary) {
  const char kKey1[] = "\x01\x02\x03\x04";
  const char kKey2[] = "\x05\x06\x07\x08";

  Run({"set", kKey1, "bar"});
  Run({"rename", kKey1, kKey2});
  EXPECT_THAT(Run({"get", kKey1}), ArgType(RespExpr::NIL));
  EXPECT_EQ(Run({"get", kKey2}), "bar");
}

TEST_F(GenericFamilyTest, RenameNx) {
  // Set two keys
  string b_val(32, 'b');
  string x_val(32, 'x');
  Run({"mset", "x", x_val, "b", b_val});

  ASSERT_THAT(Run({"renamenx", "z", "b"}), ErrArg("no such key"));
  ASSERT_THAT(Run({"renamenx", "x", "b"}), IntArg(0));  // b already exists
  ASSERT_THAT(Run({"renamenx", "x", "y"}), IntArg(1));
  ASSERT_EQ(Run({"get", "y"}), x_val);
  ASSERT_THAT(Run({"renamenx", "y", "y"}), IntArg(0));
}

TEST_F(GenericFamilyTest, RenameSameName) {
  const char kKey[] = "key";

  ASSERT_THAT(Run({"rename", kKey, kKey}), ErrArg("no such key"));

  ASSERT_EQ(Run({"set", kKey, "value"}), "OK");
  EXPECT_EQ(Run({"rename", kKey, kKey}), "OK");
}

TEST_F(GenericFamilyTest, RenameSameShard) {
  num_threads_ = 1;
  ResetService();

  ASSERT_EQ(Run({"set", "x", "value"}), "OK");
  ASSERT_EQ(Run({"set", "y", "value"}), "OK");
  EXPECT_EQ(Run({"rename", "x", "y"}), "OK");
}

TEST_F(GenericFamilyTest, Stick) {
  // check stick returns zero on non-existent keys
  ASSERT_THAT(Run({"stick", "a", "b"}), IntArg(0));

  for (auto key : {"a", "b", "c", "d"}) {
    Run({"set", key, "."});
  }

  // check stick is applied only once
  ASSERT_THAT(Run({"stick", "a", "b"}), IntArg(2));
  ASSERT_THAT(Run({"stick", "a", "b"}), IntArg(0));
  ASSERT_THAT(Run({"stick", "a", "c"}), IntArg(1));
  ASSERT_THAT(Run({"stick", "b", "d"}), IntArg(1));
  ASSERT_THAT(Run({"stick", "c", "d"}), IntArg(0));

  // check stickyness persists during writes
  Run({"set", "a", "new"});
  ASSERT_THAT(Run({"stick", "a"}), IntArg(0));
  Run({"append", "a", "-value"});
  ASSERT_THAT(Run({"stick", "a"}), IntArg(0));

  // check rename persists stickyness
  Run({"rename", "a", "k"});
  ASSERT_THAT(Run({"stick", "k"}), IntArg(0));

  // check rename persists stickyness on multiple shards
  Run({"del", "b"});
  string b_val(32, 'b');
  string x_val(32, 'x');
  Run({"mset", "b", b_val, "x", x_val});
  ASSERT_EQ(2, last_cmd_dbg_info_.shards_count);
  Run({"stick", "x"});
  Run({"rename", "x", "b"});
  ASSERT_THAT(Run({"stick", "b"}), IntArg(0));
}

TEST_F(GenericFamilyTest, Move) {
  // Check MOVE returns 0 on non-existent keys
  ASSERT_THAT(Run({"move", "a", "1"}), IntArg(0));

  // Check MOVE catches non-existent database indices
  ASSERT_THAT(Run({"move", "a", "-1"}), ArgType(RespExpr::ERROR));
  ASSERT_THAT(Run({"move", "a", "100500"}), ArgType(RespExpr::ERROR));

  // Check MOVE moves value & expiry & stickyness
  Run({"set", "a", "test"});
  Run({"expire", "a", "1000"});
  Run({"stick", "a"});
  ASSERT_THAT(Run({"move", "a", "1"}), IntArg(1));
  Run({"select", "1"});
  ASSERT_THAT(Run({"get", "a"}), "test");
  ASSERT_THAT(Run({"ttl", "a"}), testing::Not(IntArg(-1)));
  ASSERT_THAT(Run({"stick", "a"}), IntArg(0));

  // Check MOVE doesn't move if key exists
  Run({"select", "1"});
  Run({"set", "a", "test"});
  Run({"select", "0"});
  Run({"set", "a", "another test"});
  ASSERT_THAT(Run({"move", "a", "1"}), IntArg(0));  // exists from test case above
  Run({"select", "1"});
  ASSERT_THAT(Run({"get", "a"}), "test");

  // Check MOVE awakes blocking operations
  auto fb_blpop = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    Run({"select", "1"});
    auto resp = Run({"blpop", "l", "0"});
    ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
    EXPECT_THAT(resp.GetVec(), ElementsAre("l", "TestItem"));
  });

  WaitUntilLocked(1, "l");

  pp_->at(1)->Await([&] {
    Run({"select", "0"});
    Run({"lpush", "l", "TestItem"});
    Run({"move", "l", "1"});
  });

  fb_blpop.Join();
}

using testing::AnyOf;
using testing::Each;
using testing::StartsWith;

TEST_F(GenericFamilyTest, Scan) {
  for (unsigned i = 0; i < 10; ++i)
    Run({"set", absl::StrCat("key", i), "bar"});

  for (unsigned i = 0; i < 10; ++i)
    Run({"set", absl::StrCat("str", i), "bar"});

  for (unsigned i = 0; i < 10; ++i)
    Run({"sadd", absl::StrCat("set", i), "bar"});

  for (unsigned i = 0; i < 10; ++i)
    Run({"zadd", absl::StrCat("zset", i), "0", "bar"});

  auto resp = Run({"scan", "0", "count", "20", "type", "string"});
  EXPECT_THAT(resp, ArrLen(2));
  auto vec = StrArray(resp.GetVec()[1]);
  EXPECT_GT(vec.size(), 10);
  EXPECT_THAT(vec, Each(AnyOf(StartsWith("str"), StartsWith("key"))));

  resp = Run({"scan", "0", "count", "20", "match", "zset*"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(10, vec.size());
  EXPECT_THAT(vec, Each(StartsWith("zset")));

  Run({"flushdb"});

  Run({"set", "", "foo"});
  Run({"set", "bar", "1"});
  resp = Run({"keys", "*"});
  EXPECT_THAT(resp, RespArray(ElementsAre("bar", "")));
  resp = Run({"keys", ""});
  EXPECT_EQ(resp, "");
}

TEST_F(GenericFamilyTest, ScanWithAttr) {
  Run({"set", "hello", "world"});
  Run({"set", "foo", "bar"});

  Run({"expire", "hello", "1000"});

  auto resp = Run({"scan", "0", "attr", "v"});
  auto vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(1, vec.size());
  EXPECT_EQ(vec[0], "hello");

  resp = Run({"scan", "0", "attr", "p"});
  vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(1, vec.size());
  EXPECT_EQ(vec[0], "foo");

  // before run get "foo", scan with a attr should return "hello", because set "hello" expire before
  resp = Run({"scan", "0", "attr", "a"});
  vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(1, vec.size());
  EXPECT_EQ(vec[0], "hello");

  // before run get "foo", scan with a attr should return "foo"
  resp = Run({"scan", "0", "attr", "u"});
  vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(1, vec.size());
  EXPECT_EQ(vec[0], "foo");

  ASSERT_THAT(Run({"get", "foo"}), "bar");

  // after run get "foo", scan with a attr should return "foo" and "hello"
  resp = Run({"scan", "0", "attr", "a"});
  vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(2, vec.size());

  // after run get "foo", scan with a attr should return empty set
  resp = Run({"scan", "0", "attr", "u"});
  vec = StrArray(resp.GetVec()[1]);
  ASSERT_EQ(0, vec.size());
}

TEST_F(GenericFamilyTest, ScanMallocSize) {
  Run({"set", "k1", string(1000, 'a')});
  Run({"set", "k2", string(500, 'b')});
  Run({"set", "k3", string(15, 'c')});

  auto resp = Run({"scan", "0", "MINMSZ", "15"});
  EXPECT_THAT(resp.GetVec()[1], RespArray(UnorderedElementsAre("k1", "k2")));
  resp = Run({"scan", "0", "MINMSZ", "500"});
  EXPECT_THAT(resp.GetVec()[1], RespArray(UnorderedElementsAre("k1")));
}

TEST_F(GenericFamilyTest, Sort) {
  // Test list sort with params
  Run({"del", "list-1"});
  Run({"lpush", "list-1", "3.5", "1.2", "10.1", "2.20", "200"});
  // numeric
  ASSERT_THAT(Run({"sort", "list-1"}).GetVec(), ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  // string
  ASSERT_THAT(Run({"sort", "list-1", "ALPHA"}).GetVec(),
              ElementsAre("1.2", "10.1", "2.20", "200", "3.5"));
  // desc numeric
  ASSERT_THAT(Run({"sort", "list-1", "DESC"}).GetVec(),
              ElementsAre("200", "10.1", "3.5", "2.20", "1.2"));
  // desc strig
  ASSERT_THAT(Run({"sort", "list-1", "DESC", "ALPHA"}).GetVec(),
              ElementsAre("3.5", "200", "2.20", "10.1", "1.2"));
  // limits
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "0", "5"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "0", "10"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "2", "2"}).GetVec(), ElementsAre("3.5", "10.1"));
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "1", "1"}), "2.20");
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "4", "2"}), "200");
  ASSERT_THAT(Run({"sort", "list-1", "LIMIT", "5", "2"}), ArrLen(0));
  // limits desc
  ASSERT_THAT(Run({"sort", "list-1", "DESC", "LIMIT", "0", "5"}).GetVec(),
              ElementsAre("200", "10.1", "3.5", "2.20", "1.2"));
  ASSERT_THAT(Run({"sort", "list-1", "DESC", "LIMIT", "2", "2"}).GetVec(),
              ElementsAre("3.5", "2.20"));
  ASSERT_THAT(Run({"sort", "list-1", "DESC", "LIMIT", "1", "1"}), "10.1");
  ASSERT_THAT(Run({"sort", "list-1", "DESC", "LIMIT", "5", "2"}), ArrLen(0));

  // Test set sort
  Run({"del", "set-1"});
  Run({"sadd", "set-1", "5.3", "4.4", "60", "99.9", "100", "9"});
  ASSERT_THAT(Run({"sort", "set-1"}).GetVec(), ElementsAre("4.4", "5.3", "9", "60", "99.9", "100"));
  ASSERT_THAT(Run({"sort", "set-1", "ALPHA"}).GetVec(),
              ElementsAre("100", "4.4", "5.3", "60", "9", "99.9"));
  ASSERT_THAT(Run({"sort", "set-1", "DESC"}).GetVec(),
              ElementsAre("100", "99.9", "60", "9", "5.3", "4.4"));
  ASSERT_THAT(Run({"sort", "set-1", "DESC", "ALPHA"}).GetVec(),
              ElementsAre("99.9", "9", "60", "5.3", "4.4", "100"));

  // Test intset sort
  Run({"del", "intset-1"});
  Run({"sadd", "intset-1", "5", "4", "3", "2", "1"});
  ASSERT_THAT(Run({"sort", "intset-1"}).GetVec(), ElementsAre("1", "2", "3", "4", "5"));

  // Test sorted set sort
  Run({"del", "zset-1"});
  Run({"zadd", "zset-1", "0", "3.3", "0", "30.1", "0", "8.2"});
  ASSERT_THAT(Run({"sort", "zset-1"}).GetVec(), ElementsAre("3.3", "8.2", "30.1"));
  ASSERT_THAT(Run({"sort", "zset-1", "ALPHA"}).GetVec(), ElementsAre("3.3", "30.1", "8.2"));
  ASSERT_THAT(Run({"sort", "zset-1", "DESC"}).GetVec(), ElementsAre("30.1", "8.2", "3.3"));
  ASSERT_THAT(Run({"sort", "zset-1", "DESC", "ALPHA"}).GetVec(), ElementsAre("8.2", "30.1", "3.3"));

  // Test sort with non existent key
  Run({"del", "list-2"});
  ASSERT_THAT(Run({"sort", "list-2"}), ArrLen(0));

  // Test not convertible to double
  Run({"lpush", "list-2", "NOTADOUBLE"});
  ASSERT_THAT(Run({"sort", "list-2"}), ErrArg("One or more scores can't be converted into double"));

  Run({"set", "foo", "bar"});
  ASSERT_THAT(Run({"sort", "foo"}), ErrArg("WRONGTYPE "));

  Run({"rpush", "list-3", ""});
  ASSERT_THAT(Run({"sort", "list-3"}), "");

  Run({"rpush", "list-3", "2", "0", "", "-0.14", "0.12", "-0", "-123123", "7654"});
  ASSERT_THAT(Run({"sort", "list-3"}).GetVec(),
              ElementsAre("-123123", "-0.14", "", "", "-0", "0", "0.12", "2", "7654"));

  Run({"rpush", "NANvalue", "nan"});
  ASSERT_THAT(Run({"sort", "NANvalue"}),
              ErrArg("One or more scores can't be converted into double"));
}

TEST_F(GenericFamilyTest, SortBug3636) {
  Run({"RPUSH", "foo", "1.100000023841858", "1.100000023841858", "1.100000023841858", "-15710",
       "1.100000023841858", "1.100000023841858", "1.100000023841858", "-15710", "-15710",
       "1.100000023841858", "-15710", "-15710", "-15710", "-15710", "1.100000023841858", "-15710",
       "-15710"});
  auto resp = Run({"SORT", "foo", "desc", "alpha"});
  ASSERT_THAT(resp, ArrLen(17));
}

TEST_F(GenericFamilyTest, SortStore) {
  // Test list sort with params
  Run({"del", "list-1"});
  Run({"del", "list-2"});
  Run({"lpush", "list-1", "3.5", "1.2", "10.1", "2.20", "200"});
  // numeric
  auto resp = Run({"sort", "list-1", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));

  // string
  resp = Run({"sort", "list-1", "ALPHA", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "10.1", "2.20", "200", "3.5"));

  // desc numeric
  resp = Run({"sort", "list-1", "DESC", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("200", "10.1", "3.5", "2.20", "1.2"));

  // desc string
  resp = Run({"sort", "list-1", "ALPHA", "DESC", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("3.5", "200", "2.20", "10.1", "1.2"));

  // limits
  resp = Run({"sort", "list-1", "LIMIT", "0", "5", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  resp = Run({"sort", "list-1", "LIMIT", "0", "10", "store", "list-2"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  resp = Run({"sort", "list-1", "LIMIT", "2", "2", "store", "list-2"});
  EXPECT_EQ(2, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(), ElementsAre("3.5", "10.1"));
  resp = Run({"sort", "list-1", "LIMIT", "1", "1", "store", "list-2"});
  EXPECT_EQ(1, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}), "2.20");
  resp = Run({"sort", "list-1", "LIMIT", "4", "2", "store", "list-2"});
  EXPECT_EQ(1, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}), "200");
  resp = Run({"sort", "list-1", "LIMIT", "5", "2", "store", "list-2"});
  EXPECT_EQ(0, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}), ArrLen(0));

  // Test set sort
  Run({"del", "set-1"});
  Run({"del", "list-3"});
  Run({"sadd", "set-1", "5.3", "4.4", "60", "99.9", "100", "9"});
  resp = Run({"sort", "set-1", "store", "list-3"});
  EXPECT_EQ(6, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-3", "0", "-1"}).GetVec(),
              ElementsAre("4.4", "5.3", "9", "60", "99.9", "100"));

  // Test sorted set sort
  Run({"del", "zset-1"});
  Run({"del", "list-4"});
  Run({"zadd", "zset-1", "0", "3.3", "0", "30.1", "0", "8.2"});
  resp = Run({"sort", "zset-1", "store", "list-4"});
  EXPECT_EQ(3, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-4", "0", "-1"}).GetVec(), ElementsAre("3.3", "8.2", "30.1"));

  // Same key overwrite.
  Run({"del", "list-1"});
  Run({"del", "list-2"});
  Run({"lpush", "list-1", "3.5", "1.2", "10.1", "2.20", "200"});
  resp = Run({"sort", "list-1", "store", "list-1"});
  EXPECT_EQ(5, resp.GetInt());
  ASSERT_THAT(Run({"lrange", "list-1", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));

  // Check that the keys should not expire after some time.
  Run({"del", "list-1"});
  Run({"del", "list-2"});
  Run({"lpush", "list-1", "3.5", "1.2", "10.1", "2.20", "200"});
  Run({"sort", "list-1", "store", "list-2"});
  AdvanceTime(5000);
  ASSERT_THAT(Run({"lrange", "list-2", "0", "-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
}

TEST_F(GenericFamilyTest, SortStoreResetsExpiry) {
  // SORT set STORE dest, where dest has an expiry — dest expiry must be cleared.
  Run({"del", "src", "dest"});
  Run({"sadd", "src", "3", "1", "2"});
  Run({"sadd", "dest", "old"});
  Run({"expire", "dest", "100"});
  EXPECT_GT(Run({"ttl", "dest"}).GetInt(), 0);

  auto resp = Run({"sort", "src", "store", "dest"});
  EXPECT_EQ(3, resp.GetInt());
  // Destination must have no expiry after SORT STORE overwrites it.
  EXPECT_EQ(-1, Run({"ttl", "dest"}).GetInt());
  ASSERT_THAT(Run({"lrange", "dest", "0", "-1"}).GetVec(), ElementsAre("1", "2", "3"));

  // SORT src STORE src (same key), src has an expiry — must not crash and must clear expiry.
  Run({"del", "myset"});
  Run({"sadd", "myset", "c", "a", "b"});
  Run({"expire", "myset", "100"});
  EXPECT_GT(Run({"ttl", "myset"}).GetInt(), 0);

  resp = Run({"sort", "myset", "ALPHA", "store", "myset"});
  EXPECT_EQ(3, resp.GetInt());
  EXPECT_EQ(-1, Run({"ttl", "myset"}).GetInt());
  ASSERT_THAT(Run({"lrange", "myset", "0", "-1"}).GetVec(), ElementsAre("a", "b", "c"));
}

TEST_F(GenericFamilyTest, Sort_RO) {
  // Test list sort with params
  Run({"del", "list-1"});
  Run({"lpush", "list-1", "3.5", "1.2", "10.1", "2.20", "200"});
  // numeric
  ASSERT_THAT(Run({"sort_ro", "list-1"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  // string
  ASSERT_THAT(Run({"sort_ro", "list-1", "ALPHA"}).GetVec(),
              ElementsAre("1.2", "10.1", "2.20", "200", "3.5"));
  // desc numeric
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC"}).GetVec(),
              ElementsAre("200", "10.1", "3.5", "2.20", "1.2"));
  // desc strig
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC", "ALPHA"}).GetVec(),
              ElementsAre("3.5", "200", "2.20", "10.1", "1.2"));
  // limits
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "0", "5"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "0", "10"}).GetVec(),
              ElementsAre("1.2", "2.20", "3.5", "10.1", "200"));
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "2", "2"}).GetVec(), ElementsAre("3.5", "10.1"));
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "1", "1"}), "2.20");
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "4", "2"}), "200");
  ASSERT_THAT(Run({"sort_ro", "list-1", "LIMIT", "5", "2"}), ArrLen(0));
  // limits desc
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC", "LIMIT", "0", "5"}).GetVec(),
              ElementsAre("200", "10.1", "3.5", "2.20", "1.2"));
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC", "LIMIT", "2", "2"}).GetVec(),
              ElementsAre("3.5", "2.20"));
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC", "LIMIT", "1", "1"}), "10.1");
  ASSERT_THAT(Run({"sort_ro", "list-1", "DESC", "LIMIT", "5", "2"}), ArrLen(0));

  // Test set sort
  Run({"del", "set-1"});
  Run({"sadd", "set-1", "5.3", "4.4", "60", "99.9", "100", "9"});
  ASSERT_THAT(Run({"sort_ro", "set-1"}).GetVec(),
              ElementsAre("4.4", "5.3", "9", "60", "99.9", "100"));
  ASSERT_THAT(Run({"sort_ro", "set-1", "ALPHA"}).GetVec(),
              ElementsAre("100", "4.4", "5.3", "60", "9", "99.9"));
  ASSERT_THAT(Run({"sort_ro", "set-1", "DESC"}).GetVec(),
              ElementsAre("100", "99.9", "60", "9", "5.3", "4.4"));
  ASSERT_THAT(Run({"sort_ro", "set-1", "DESC", "ALPHA"}).GetVec(),
              ElementsAre("99.9", "9", "60", "5.3", "4.4", "100"));

  // Test intset sort
  Run({"del", "intset-1"});
  Run({"sadd", "intset-1", "5", "4", "3", "2", "1"});
  ASSERT_THAT(Run({"sort_ro", "intset-1"}).GetVec(), ElementsAre("1", "2", "3", "4", "5"));

  // Test sorted set sort
  Run({"del", "zset-1"});
  Run({"zadd", "zset-1", "0", "3.3", "0", "30.1", "0", "8.2"});
  ASSERT_THAT(Run({"sort_ro", "zset-1"}).GetVec(), ElementsAre("3.3", "8.2", "30.1"));
  ASSERT_THAT(Run({"sort_ro", "zset-1", "ALPHA"}).GetVec(), ElementsAre("3.3", "30.1", "8.2"));
  ASSERT_THAT(Run({"sort_ro", "zset-1", "DESC"}).GetVec(), ElementsAre("30.1", "8.2", "3.3"));
  ASSERT_THAT(Run({"sort_ro", "zset-1", "DESC", "ALPHA"}).GetVec(),
              ElementsAre("8.2", "30.1", "3.3"));

  // Test sort with non existent key
  Run({"del", "list-2"});
  ASSERT_THAT(Run({"sort_ro", "list-2"}), ArrLen(0));

  // Test not convertible to double
  Run({"lpush", "list-2", "NOTADOUBLE"});
  ASSERT_THAT(Run({"sort_ro", "list-2"}),
              ErrArg("One or more scores can't be converted into double"));

  Run({"set", "foo", "bar"});
  ASSERT_THAT(Run({"sort_ro", "foo"}), ErrArg("WRONGTYPE "));

  Run({"rpush", "list-3", ""});
  ASSERT_THAT(Run({"sort_ro", "list-3"}), "");

  Run({"rpush", "list-3", "2", "0", "", "-0.14", "0.12", "-0", "-123123", "7654"});
  ASSERT_THAT(Run({"sort_ro", "list-3"}).GetVec(),
              ElementsAre("-123123", "-0.14", "", "", "-0", "0", "0.12", "2", "7654"));

  Run({"rpush", "NANvalue", "nan"});
  ASSERT_THAT(Run({"sort_ro", "NANvalue"}),
              ErrArg("One or more scores can't be converted into double"));

  // Test store option should not work
  ASSERT_THAT(Run({"sort_ro", "list-1", "store", "list-2"}), ErrArg("syntax error"));
}

TEST_F(GenericFamilyTest, SortROBug3636) {
  Run({"RPUSH", "foo", "1.100000023841858", "1.100000023841858", "1.100000023841858", "-15710",
       "1.100000023841858", "1.100000023841858", "1.100000023841858", "-15710", "-15710",
       "1.100000023841858", "-15710", "-15710", "-15710", "-15710", "1.100000023841858", "-15710",
       "-15710"});
  auto resp = Run({"SORT_RO", "foo", "desc", "alpha"});
  ASSERT_THAT(resp, ArrLen(17));
}

TEST_F(GenericFamilyTest, TimeNoKeys) {
  auto resp = Run({"time"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0], ArgType(RespExpr::INT64));
  EXPECT_THAT(resp.GetVec()[1], ArgType(RespExpr::INT64));

  // Check that time is the same inside a transaction.
  Run({"multi"});
  Run({"time"});
  usleep(2000);
  Run({"time"});
  resp = Run({"exec"});

  EXPECT_THAT(resp, RespArray(ElementsAre(RespArray(ElementsAre(Not(IntArg(0)), _)),
                                          RespArray(ElementsAre(Not(IntArg(0)), _)))));

  for (int i = 0; i < 2; ++i) {
    int64_t val0 = get<int64_t>(resp.GetVec()[0].GetVec()[i].u);
    int64_t val1 = get<int64_t>(resp.GetVec()[1].GetVec()[i].u);
    EXPECT_EQ(val0, val1);
  }
}

TEST_F(GenericFamilyTest, TimeWithKeys) {
  auto resp = Run({"time"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0], ArgType(RespExpr::INT64));
  EXPECT_THAT(resp.GetVec()[1], ArgType(RespExpr::INT64));

  // Check that time is the same inside a transaction.
  Run({"multi"});
  Run({"time"});
  usleep(2000);
  Run({"time"});
  Run({"get", "x"});
  resp = Run({"exec"});

  EXPECT_THAT(resp, RespArray(ElementsAre(RespArray(ElementsAre(Not(IntArg(0)), _)),
                                          RespArray(ElementsAre(Not(IntArg(0)), _)), _)));

  for (int i = 0; i < 2; ++i) {
    int64_t val0 = get<int64_t>(resp.GetVec()[0].GetVec()[i].u);
    int64_t val1 = get<int64_t>(resp.GetVec()[1].GetVec()[i].u);
    EXPECT_EQ(val0, val1);
  }
}

TEST_F(GenericFamilyTest, Persist) {
  auto resp = Run({"set", "mykey", "somevalue"});
  EXPECT_EQ(resp, "OK");
  // Key without expiration time - return 0
  EXPECT_EQ(0, CheckedInt({"persist", "mykey"}));
  EXPECT_EQ(-1, CheckedInt({"TTL", "mykey"}));
  // set expiration time and try again
  resp = Run({"EXPIRE", "mykey", "10"});
  EXPECT_EQ(10, CheckedInt({"TTL", "mykey"}));
  EXPECT_EQ(1, CheckedInt({"persist", "mykey"}));
  EXPECT_EQ(-1, CheckedInt({"TTL", "mykey"}));
  // persist on key that does not exist should also return 0
  EXPECT_EQ(0, CheckedInt({"persist", "keythatdoesnotexist"}));
}

TEST_F(GenericFamilyTest, Dump) {
  ASSERT_EQ(RDB_SER_VERSION, 9);
  uint8_t EXPECTED_STRING_DUMP[13] = {0x00, 0xc0, 0x13, 0x09, 0x00, 0x23, 0x13,
                                      0x6f, 0x4d, 0x68, 0xf6, 0x35, 0x6e};
  uint8_t EXPECTED_HASH_DUMP[] = {0x10, 0xc,  0xc,  0x0,  0x0, 0x0,  0x2,  0x0,
                                  0x13, 0x1,  0xc4, 0xd2, 0x2, 0xff, 0x9,  0x0,
                                  0x68, 0x4d, 0x73, 0xa4, 0xf, 0x23, 0x4f, 0xc7};

  uint8_t EXPECTED_LIST_DUMP[] = {0x12, 0x01, 0x02, '\t', '\t', 0x00, 0x00, 0x00,
                                  0x01, 0x00, 0x14, 0x01, 0xff, '\t', 0x00, 0xfb,
                                  0xbd, 0x36, 0xf8, 0xb4, 't',  '%',  ';'};

  // Check string dump
  auto resp = Run({"set", "z", "19"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"dump", "z"});
  auto dump = resp.GetBuf();
  ASSERT_EQ(ToSV(dump), ToSV(EXPECTED_STRING_DUMP));

  // Check list dump
  EXPECT_EQ(1, CheckedInt({"rpush", "l", "20"}));
  resp = Run({"dump", "l"});
  dump = resp.GetBuf();
  ASSERT_EQ(ToSV(dump), ToSV(EXPECTED_LIST_DUMP)) << absl::CHexEscape(resp.GetString());

  // Check for hash dump
  EXPECT_EQ(1, CheckedInt({"hset", "z2", "19", "1234"}));
  resp = Run({"dump", "z2"});
  dump = resp.GetBuf();
  ASSERT_EQ(ToSV(dump), ToSV(EXPECTED_HASH_DUMP));

  // Check that when running with none existing key we're getting nil
  resp = Run({"dump", "foo"});
  EXPECT_EQ(resp.type, RespExpr::NIL);
}

TEST_F(GenericFamilyTest, Restore) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  using std::chrono::seconds;
  using std::chrono::system_clock;

  // redis 6 with RDB_VERSION 9
  uint8_t STRING_DUMP_REDIS[] = {0x00, 0xc1, 0xd2, 0x04, 0x09, 0x00, 0xd0,
                                 0x75, 0x59, 0x6d, 0x10, 0x04, 0x3f, 0x5c};
  auto resp = Run({"set", "exiting-key", "1234"});
  EXPECT_EQ(resp, "OK");

  // try to restore into existing key - this should fail. We should get BUSYKEY error
  ASSERT_THAT(Run({"restore", "exiting-key", "0", ToSV(STRING_DUMP_REDIS)}),
              ErrArg("BUSYKEY Target key name already exists."));

  // Try restore while setting expiration into the past
  // note that value for expiration is just some valid unix time stamp from the pass
  resp = Run(
      {"restore", "exiting-key", "1665476212900", ToSV(STRING_DUMP_REDIS), "ABSTTL", "REPLACE"});
  ASSERT_EQ(resp, "OK");
  resp = Run({"get", "exiting-key"});
  EXPECT_EQ(resp.type, RespExpr::NIL);  // it was deleted as a result of restore action

  // Test for string that we can successfully load the dumped data and read it back
  resp = Run({"restore", "new-key", "0", ToSV(STRING_DUMP_REDIS)});
  EXPECT_EQ(resp, "OK");
  resp = Run({"get", "new-key"});
  EXPECT_EQ("1234", resp);
  resp = Run({"dump", "new-key"});
  auto dump = resp.GetBuf();
  ASSERT_EQ(ToSV(dump), ToSV(STRING_DUMP_REDIS));

  // test for list
  EXPECT_EQ(1, CheckedInt({"rpush", "orig-list", "20"}));
  resp = Run({"dump", "orig-list"});
  dump = resp.GetBuf();
  resp = Run({"restore", "new-list", "10", ToSV(dump)});
  EXPECT_EQ(resp, "OK");
  resp = Run({"lpop", "new-list"});
  EXPECT_EQ("20", resp);

  // run with hash type
  EXPECT_EQ(1, CheckedInt({"hset", "orig-hash", "123", "45678"}));
  resp = Run({"dump", "orig-hash"});
  dump = resp.GetBuf();
  resp = Run({"restore", "new-hash", "1", ToSV(dump)});
  EXPECT_EQ(resp, "OK");
  EXPECT_EQ(1, CheckedInt({"hexists", "new-hash", "123"}));

  // test with replace and no TTL
  resp = Run({"set", "string-key", "hello world"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"dump", "string-key"});
  dump = resp.GetBuf();
  // this will change the value from "hello world" to "1234"
  resp = Run({"restore", "string-key", "7000", ToSV(STRING_DUMP_REDIS), "REPLACE"});
  resp = Run({"get", "string-key"});
  EXPECT_EQ("1234", resp);
  // check TTL validity
  EXPECT_EQ(CheckedInt({"pttl", "string-key"}), 7000);

  // Make check about ttl with abs time, restoring back to "hello world"
  resp = Run({"restore", "string-key", absl::StrCat(TEST_current_time_ms + 2000), ToSV(dump),
              "ABSTTL", "REPLACE"});
  resp = Run({"get", "string-key"});
  EXPECT_EQ("hello world", resp);
  EXPECT_EQ(CheckedInt({"pttl", "string-key"}), 2000);

  // Last but not least - just make sure that we are good without TTL as well
  resp = Run({"restore", "string-key", "0", ToSV(STRING_DUMP_REDIS), "REPLACE"});
  resp = Run({"get", "string-key"});
  EXPECT_EQ("1234", resp);
  EXPECT_EQ(CheckedInt({"ttl", "string-key"}), -1);

  // The following set was created in Redis 7 with rdb version 11 and it's listpack encoded.
  // We should be able to read it and convert it to our own format DenseSet or HT
  // sadd myset "acme"
  // dump myset
  uint8_t SET_LISTPACK_DUMP[] = {0x14, 0x0D, 0x0D, 0x00, 0x00, 0x00, 0x01, 0x00, 0x84,
                                 0x61, 0x63, 0x6D, 0x65, 0x05, 0xff, 0x0b, 0x00, 0xc1,
                                 0x37, 0x5c, 0xe5, 0xe2, 0xc0, 0xdd, 0x27};
  resp = Run({"restore", "listpack-set", "0", ToSV(SET_LISTPACK_DUMP)});
  resp = Run({"sismember", "listpack-set", "acme"});
  EXPECT_EQ(true, resp.GetInt().has_value());
  EXPECT_EQ(1, resp.GetInt());

  // The following zset was created in Redis 7 with rdb version 11 and it's listpack encoded.
  // zadd my-zset 1 "elon"
  // dump my-zset
  uint8_t ZSET_LISTPACK_DUMP[] = {0x11, 0x0f, 0x0f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x84,
                                  0x65, 0x6c, 0x6f, 0x6e, 0x05, 0x01, 0x01, 0xff, 0x0b,
                                  0x00, 0xc8, 0x01, 0x2c, 0xad, 0xd9, 0xa3, 0x99, 0x5e};

  resp = Run({"restore", "my-zset", "0", ToSV(ZSET_LISTPACK_DUMP)});
  EXPECT_EQ(resp.GetString(), "OK");
  resp = Run({"zrange", "my-zset", "0", "-1"});
  EXPECT_EQ("elon", resp.GetString());

  // corrupt the dump file but keep the crc correct.
  ZSET_LISTPACK_DUMP[0] = 0x12;
  uint8_t crc64[8] = {0x4e, 0xa3, 0x4c, 0x89, 0xc4, 0x8b, 0xd9, 0xe4};
  memcpy(ZSET_LISTPACK_DUMP + 19, crc64, 8);
  resp = Run({"restore", "invalid", "0", ToSV(ZSET_LISTPACK_DUMP)});
  EXPECT_THAT(resp, ErrArg("ERR Bad data format"));
}

TEST_F(GenericFamilyTest, Info) {
  InitWithDbFilename();  // Needed for `save`

  auto get_rdb_changes_since_last_save = [](const string& str) -> size_t {
    const string matcher = "rdb_changes_since_last_success_save:";
    const auto pos = str.find(matcher) + matcher.size();
    const auto sub = str.substr(pos, 1);
    return atoi(sub.c_str());
  };

  EXPECT_EQ(Run({"set", "k", "1"}), "OK");
  auto resp = Run({"info", "persistence"});
  EXPECT_EQ(1, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_EQ(Run({"set", "k", "1"}), "OK");
  resp = Run({"info", "persistence"});
  EXPECT_EQ(2, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_EQ(Run({"set", "k2", "2"}), "OK");
  resp = Run({"info", "persistence"});
  EXPECT_EQ(3, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_EQ(Run({"save"}), "OK");
  resp = Run({"info", "persistence"});
  EXPECT_EQ(0, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_EQ(Run({"set", "k2", "2"}), "OK");
  resp = Run({"info", "persistence"});
  EXPECT_EQ(1, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_EQ(Run({"bgsave"}), "OK");
  bool cond = WaitUntilCondition(
      [&]() {
        resp = Run({"info", "persistence"});
        return get_rdb_changes_since_last_save(resp.GetString()) == 0;
      },
      500ms);
  EXPECT_TRUE(cond);

  EXPECT_EQ(Run({"set", "k3", "3"}), "OK");
  resp = Run({"info", "persistence"});
  EXPECT_EQ(1, get_rdb_changes_since_last_save(resp.GetString()));

  EXPECT_THAT(Run({"del", "k3"}), IntArg(1));
  resp = Run({"info", "persistence"});
  EXPECT_EQ(2, get_rdb_changes_since_last_save(resp.GetString()));
}

TEST_F(GenericFamilyTest, FieldTtl) {
  TEST_current_time_ms = kMemberExpiryBase * 1000;  // to reset to test time.
  EXPECT_THAT(Run({"saddex", "key", "1", "val1"}), IntArg(1));
  EXPECT_THAT(Run({"saddex", "key", "2", "val2"}), IntArg(1));
  EXPECT_THAT(Run({"sadd", "key", "val3"}), IntArg(1));

  EXPECT_EQ(-2, CheckedInt({"fieldttl", "nokey", "val1"}));  // key not found
  EXPECT_EQ(-3, CheckedInt({"fieldttl", "key", "bar"}));     // field not found
  EXPECT_EQ(1, CheckedInt({"fieldttl", "key", "val1"}));
  EXPECT_EQ(2, CheckedInt({"fieldttl", "key", "val2"}));
  EXPECT_EQ(-1, CheckedInt({"fieldttl", "key", "val3"}));

  AdvanceTime(1100);
  EXPECT_EQ(-3, CheckedInt({"fieldttl", "key", "val1"}));
  EXPECT_EQ(1, CheckedInt({"fieldttl", "key", "val2"}));

  Run({"set", "str", "val"});
  EXPECT_THAT(Run({"fieldttl", "str", "bar"}), ErrArg("wrong"));

  EXPECT_EQ(2, CheckedInt({"HSETEX", "k2", "1", "f1", "v1", "f2", "v2"}));
  EXPECT_EQ(1, CheckedInt({"HSET", "k2", "f3", "v3"}));

  EXPECT_EQ(1, CheckedInt({"fieldttl", "k2", "f1"}));
  EXPECT_EQ(-1, CheckedInt({"fieldttl", "k2", "f3"}));
  EXPECT_EQ(-3, CheckedInt({"fieldttl", "k2", "f4"}));
}

TEST_F(GenericFamilyTest, RandomKey) {
  auto resp = Run({"randomkey"});
  EXPECT_EQ(resp.type, RespExpr::NIL);

  resp = Run({"set", "k1", "1"});
  EXPECT_EQ(Run({"randomkey"}), "k1");
}

TEST_F(GenericFamilyTest, JsonType) {
  auto resp = Run({"json.set", "json", "$", R"({"example":"value"})"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"type", "json"});
  EXPECT_EQ(resp, "ReJSON-RL") << "For the Redis GUI the register of the JSON type is important. "
                                  "See https://github.com/dragonflydb/dragonfly/issues/3386";

  // Test json type lowercase works for the SCAN commmand
  resp = Run({"scan", "0", "type", "rejson-rl"});
  EXPECT_THAT(resp, ArrLen(2));
  auto vec = StrArray(resp.GetVec()[1]);
  ASSERT_THAT(vec, ElementsAre("json"));
}

TEST_F(GenericFamilyTest, FieldExpireSet) {
  Run({"SADD", "key", "a", "b", "c"});
  AdvanceTime(2'000);
  EXPECT_THAT(Run({"FIELDEXPIRE", "key", "10", "a", "b", "c"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
  EXPECT_EQ(10, CheckedInt({"fieldttl", "key", "a"}));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"SMEMBERS", "key"}), RespArray(ElementsAre()));
}

TEST_F(GenericFamilyTest, FieldExpireHset) {
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(CheckedInt({"HSET", "key", absl::StrCat("k", i), "v"}), 1);
  }
  AdvanceTime(2'000);
  EXPECT_THAT(Run({"FIELDEXPIRE", "key", "10", "k0", "k1", "k2"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
  EXPECT_EQ(10, CheckedInt({"fieldttl", "key", "k0"}));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HGETALL", "key"}), RespArray(ElementsAre()));
}

TEST_F(GenericFamilyTest, FieldExpireNoSuchField) {
  EXPECT_EQ(CheckedInt({"SADD", "key", "a"}), 1);
  EXPECT_EQ(CheckedInt({"HSET", "key2", "k0", "v0"}), 1);
  EXPECT_THAT(Run({"FIELDEXPIRE", "key", "10", "a", "b"}),
              RespArray(ElementsAre(IntArg(1), IntArg(-2))));
  EXPECT_THAT(Run({"FIELDEXPIRE", "key2", "10", "k0", "b"}),
              RespArray(ElementsAre(IntArg(1), IntArg(-2))));
}

TEST_F(GenericFamilyTest, FieldExpireNoSuchKey) {
  EXPECT_THAT(Run({"FIELDEXPIRE", "key", "10", "a", "b"}),
              RespArray(ElementsAre(IntArg(-2), IntArg(-2))));
}

TEST_F(GenericFamilyTest, ExpireTime) {
  EXPECT_EQ(-2, CheckedInt({"EXPIRETIME", "foo"}));
  EXPECT_EQ(-2, CheckedInt({"PEXPIRETIME", "foo"}));
  Run({"set", "foo", "bar"});
  EXPECT_EQ(-1, CheckedInt({"EXPIRETIME", "foo"}));
  EXPECT_EQ(-1, CheckedInt({"PEXPIRETIME", "foo"}));

  // set expiry
  uint64_t expire_time_in_ms = TEST_current_time_ms + 5000;
  uint64_t expire_time_in_seconds = (expire_time_in_ms + 500) / 1000;
  Run({"pexpireat", "foo", absl::StrCat(expire_time_in_ms)});
  EXPECT_EQ(expire_time_in_seconds, CheckedInt({"EXPIRETIME", "foo"}));
  EXPECT_EQ(expire_time_in_ms, CheckedInt({"PEXPIRETIME", "foo"}));
}

TEST_F(GenericFamilyTest, RestoreOOM) {
  max_memory_limit = 20000000;
  Run({"set", "src", string(5000, 'x')});
  auto resp = Run({"dump", "src"});

  string dump = resp.GetString();

  // Let Dragonfly propagate max_memory_limit to shards. It does not have to be precise,
  // the loop should have enough time for the internal processes to progress.
  usleep(10000);
  unsigned i = 0;
  for (; i < 10000; ++i) {
    resp = Run({"restore", absl::StrCat("dst", i), "0", dump});
    if (resp != "OK")
      break;
  }
  ASSERT_LT(i, 10000);
  EXPECT_THAT(resp, ErrArg("Out of memory"));
}

TEST_F(GenericFamilyTest, Bug4466) {
  auto resp = Run({"SCAN", "9223372036854775808"});  // an invalid cursor should not crash us.
  EXPECT_THAT(resp, RespElementsAre("0", RespElementsAre()));
}

TEST_F(GenericFamilyTest, Unlink) {
  for (unsigned i = 0; i < 1000; ++i) {
    unsigned start = i * 10;
    vector<string> cmd = {"SADD", "s1"};
    for (unsigned j = 0; j < 10; ++j) {
      cmd.push_back(absl::StrCat("f", start + j));
    }
    auto resp = Run(absl::MakeSpan(cmd));
    ASSERT_THAT(resp, IntArg(10));
    cmd[1] = "s2";
    resp = Run(absl::MakeSpan(cmd));
    ASSERT_THAT(resp, IntArg(10));
  }
  auto resp = Run({"unlink", "s1", "s2"});
  EXPECT_THAT(resp, IntArg(2));
}

TEST_F(GenericFamilyTest, Copy) {
  RespExpr resp;
  string b_val(32, 'b');
  string x_val(32, 'x');

  resp = Run({"mset", "x", x_val, "b", b_val});
  ASSERT_EQ(resp, "OK");
  ASSERT_EQ(2, last_cmd_dbg_info_.shards_count);

  resp = Run({"COPY", "z", "b"});
  ASSERT_THAT(resp, IntArg(0));

  resp = Run({"COPY", "b", "c"});
  ASSERT_THAT(resp, IntArg(1));
  ASSERT_EQ(b_val, Run({"get", "c"}));

  resp = Run({"COPY", "x", "b", "REPLACE"});
  ASSERT_THAT(resp, IntArg(1));

  ASSERT_EQ(x_val, Run({"get", "x"}));
  ASSERT_EQ(x_val, Run({"get", "b"}));
  EXPECT_EQ(CheckedInt({"exists", "x", "b"}), 2);

  const char* keys[2] = {"b", "x"};
  auto ren_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 200; ++i) {
      int j = i % 2;
      auto resp = Run({"COPY", keys[j], keys[1 - j], "REPLACE"});
      ASSERT_THAT(resp, IntArg(1));
    }
  });

  auto exist_fb = pp_->at(2)->LaunchFiber([&] {
    for (size_t i = 0; i < 300; ++i) {
      int64_t resp = CheckedInt({"exists", "x", "b"});
      ASSERT_EQ(2, resp);
    }
  });

  exist_fb.Join();
  ren_fb.Join();
}

TEST_F(GenericFamilyTest, CopyNonString) {
  EXPECT_EQ(1, CheckedInt({"lpush", "x", "elem"}));
  auto resp = Run({"COPY", "x", "b"});
  ASSERT_THAT(resp, IntArg(1));
  ASSERT_EQ(2, last_cmd_dbg_info_.shards_count);

  EXPECT_EQ(1, CheckedInt({"del", "x"}));
  EXPECT_EQ(1, CheckedInt({"del", "b"}));
}

TEST_F(GenericFamilyTest, CopyBinary) {
  const char kKey1[] = "\x01\x02\x03\x04";
  const char kKey2[] = "\x05\x06\x07\x08";

  Run({"set", kKey1, "bar"});
  Run({"COPY", kKey1, kKey2});
  EXPECT_EQ(Run({"get", kKey1}), "bar");
  EXPECT_EQ(Run({"get", kKey2}), "bar");
}

TEST_F(GenericFamilyTest, CopyTTL) {
  Run({"setex", "k1", "10", "bar"});

  ASSERT_THAT(Run({"COPY", "k1", "k2"}), IntArg(1));
  EXPECT_THAT(Run({"ttl", "k2"}), 10);
}

TEST_F(GenericFamilyTest, CopySameName) {
  ASSERT_THAT(Run({"COPY", "k1", "k1"}), ErrArg("source and destination objects are the same"));

  ASSERT_EQ(Run({"set", "k1", "v"}), "OK");
  ASSERT_THAT(Run({"COPY", "k1", "k1"}), ErrArg("source and destination objects are the same"));
}

TEST_F(GenericFamilyTest, CopyToDB) {
  // we don't support DB arg for now
  ASSERT_THAT(Run({"COPY", "k1", "k1", "DB", "SOME_DB"}), ErrArg("syntax error"));
}

TEST_F(GenericFamilyTest, CopyKeyExists) {
  Run({"set", "source", "value1"});
  Run({"set", "destination", "value2"});

  ASSERT_THAT(Run({"COPY", "source", "destination"}), IntArg(0));

  EXPECT_EQ(Run({"get", "destination"}), "value2");
  EXPECT_EQ(Run({"get", "source"}), "value1");

  ASSERT_THAT(Run({"COPY", "source", "destination", "REPLACE"}), IntArg(1));
  EXPECT_EQ(Run({"get", "destination"}), "value1");
}

TEST_F(GenericFamilyTest, HashFieldExpiryDuringDeserialize) {
  Run({"HSETEX", "src", "1", "field1", "value1"});

  // Advance time past field TTL - now field is expired
  AdvanceTime(2000);

  Run({"RENAME", "src", "dst"});
}

TEST_F(GenericFamilyTest, SortNegativeLimit) {
  Run({"lpush", "list-neg", "1", "2", "3", "4", "5"});

  // Negative offset
  auto resp = Run({"sort", "list-neg", "LIMIT", "-1", "2"});
  ASSERT_THAT(resp, ErrArg("value is not an integer"));

  // Negative limit
  resp = Run({"sort", "list-neg", "LIMIT", "0", "-1"});
  ASSERT_THAT(resp, ErrArg("value is not an integer"));

  // Both negative
  resp = Run({"sort", "list-neg", "LIMIT", "-1", "-1"});
  ASSERT_THAT(resp, ErrArg("value is not an integer"));
}

TEST_F(GenericFamilyTest, SortBy) {
  Run({"del", "list-1"});
  Run({"lpush", "list-1", "1", "2", "3"});
  Run({"set", "w_1", "30"});
  Run({"set", "w_2", "20"});
  Run({"set", "w_3", "10"});

  // standard sort
  auto resp = Run({"sort", "list-1", "BY", "w_*"});
  ASSERT_THAT(resp, RespElementsAre("3", "2", "1"));

  // desc
  ASSERT_THAT(Run({"sort", "list-1", "BY", "w_*", "DESC"}), RespElementsAre("1", "2", "3"));

  // alpha
  Run({"set", "s_1", "c"});
  Run({"set", "s_2", "b"});
  Run({"set", "s_3", "a"});
  ASSERT_THAT(Run({"sort", "list-1", "BY", "s_*", "ALPHA"}), RespElementsAre("3", "2", "1"));

  // nosort, lpush reverses order, so 3, 2, 1 is insertion order (or close to it)
  ASSERT_THAT(Run({"sort", "list-1", "BY", "nosort"}), RespElementsAre("3", "2", "1"));

  // missing keys -> 0
  Run({"del", "w_1"});
  ASSERT_THAT(Run({"sort", "list-1", "BY", "w_*"}), RespElementsAre("1", "3", "2"));  // 0, 10, 20

  // BY pattern with LIMIT - test pagination works correctly
  Run({"set", "w_1", "30"});  // restore w_1
  // Sorted order: 3 (w_3=10), 2 (w_2=20), 1 (w_1=30). LIMIT 1 2 skips first, returns next 2
  ASSERT_THAT(Run({"sort", "list-1", "BY", "w_*", "LIMIT", "1", "2"}), RespElementsAre("2", "1"));
  // multiple asterisks should result in syntax error
  ASSERT_THAT(Run({"sort", "list-1", "BY", "w_*_*"}), ErrArg("syntax error"));
}

TEST_F(GenericFamilyTest, SortGet) {
  // Setup test data
  Run({"del", "mylist"});
  Run({"lpush", "mylist", "1", "2", "3"});
  Run({"set", "obj_1", "first"});
  Run({"set", "obj_2", "second"});
  Run({"set", "obj_3", "third"});
  Run({"set", "weight_1", "30"});
  Run({"set", "weight_2", "20"});
  Run({"set", "weight_3", "10"});

  // Test 1: Basic GET with single pattern (sorted numerically: 1,2,3)
  auto resp = Run({"sort", "mylist", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("first", "second", "third"));

  // Test 2: GET with special # pattern (returns element itself, sorted: 1,2,3)
  resp = Run({"sort", "mylist", "GET", "#"});
  ASSERT_THAT(resp, RespElementsAre("1", "2", "3"));

  // Test 3: Multiple GET patterns
  resp = Run({"sort", "mylist", "GET", "#", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("1", "first", "2", "second", "3", "third"));

  // Test 4: GET with BY pattern (sorted by weight: 3(10), 2(20), 1(30))
  resp = Run({"sort", "mylist", "BY", "weight_*", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("third", "second", "first"));

  // Test 5: Multiple GET patterns with BY
  resp = Run({"sort", "mylist", "BY", "weight_*", "GET", "#", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("3", "third", "2", "second", "1", "first"));

  // Test 6: GET with missing keys (should return empty strings, sorted: 1,2,3)
  Run({"del", "obj_2"});
  resp = Run({"sort", "mylist", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("first", "", "third"));

  // Restore obj_2 for further tests
  Run({"set", "obj_2", "second"});

  // Test 7: GET with DESC (sorted DESC: 3,2,1)
  resp = Run({"sort", "mylist", "DESC", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("third", "second", "first"));

  // Test 8: GET with ALPHA
  Run({"del", "strlist"});
  Run({"lpush", "strlist", "c", "b", "a"});
  Run({"set", "obj_a", "alpha"});
  Run({"set", "obj_b", "beta"});
  Run({"set", "obj_c", "gamma"});
  resp = Run({"sort", "strlist", "ALPHA", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("alpha", "beta", "gamma"));

  // Test 9: GET with LIMIT
  resp = Run({"sort", "mylist", "GET", "#", "GET", "obj_*", "LIMIT", "1", "2"});
  ASSERT_THAT(resp, RespElementsAre("2", "second", "3", "third"));

  // Test 10: GET with STORE
  resp = Run({"sort", "mylist", "GET", "#", "GET", "obj_*", "STORE", "result"});
  ASSERT_THAT(resp, IntArg(6));  // 3 elements * 2 GET patterns = 6 stored values
  resp = Run({"lrange", "result", "0", "-1"});
  ASSERT_THAT(resp, RespElementsAre("1", "first", "2", "second", "3", "third"));

  // Test 11: GET with BY nosort
  resp = Run({"sort", "mylist", "BY", "nosort", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("third", "second", "first"));  // insertion order

  // Test 12: GET pattern validation (multiple asterisks should error)
  ASSERT_THAT(Run({"sort", "mylist", "GET", "obj_*_*"}), ErrArg("syntax error"));

  // Test 13: GET with empty list
  Run({"del", "emptylist"});
  Run({"lpush", "emptylist", "placeholder"});
  Run({"lpop", "emptylist"});
  resp = Run({"sort", "emptylist", "GET", "obj_*"});
  ASSERT_THAT(resp, ArrLen(0));

  // Test 14: GET with literal pattern (no asterisk)
  Run({"set", "fixed_key", "fixed_value"});
  resp = Run({"sort", "mylist", "GET", "fixed_key"});
  ASSERT_THAT(resp, RespElementsAre("fixed_value", "fixed_value", "fixed_value"));

  // Test 15: SORT_RO with GET
  resp = Run({"sort_ro", "mylist", "GET", "#", "GET", "obj_*"});
  ASSERT_THAT(resp, RespElementsAre("1", "first", "2", "second", "3", "third"));
}

TEST_F(GenericFamilyTest, Delex) {
  // DELEX without condition behaves like DEL
  Run({"set", "key1", "value1"});
  EXPECT_EQ(1, CheckedInt({"delex", "key1"}));
  EXPECT_THAT(Run({"get", "key1"}), ArgType(RespExpr::NIL));

  // DELEX on non-existent key returns 0
  EXPECT_EQ(0, CheckedInt({"delex", "nonexistent"}));

  // DELEX IFEQ deletes when values match
  Run({"set", "key2", "value2"});
  EXPECT_EQ(1, CheckedInt({"delex", "key2", "IFEQ", "value2"}));
  EXPECT_THAT(Run({"get", "key2"}), ArgType(RespExpr::NIL));

  // DELEX IFEQ does not delete when values differ
  Run({"set", "key3", "value3"});
  EXPECT_EQ(0, CheckedInt({"delex", "key3", "IFEQ", "wrongvalue"}));
  EXPECT_EQ(Run({"get", "key3"}), "value3");

  // DELEX IFNE deletes when values differ
  Run({"set", "key4", "value4"});
  EXPECT_EQ(1, CheckedInt({"delex", "key4", "IFNE", "differentvalue"}));
  EXPECT_THAT(Run({"get", "key4"}), ArgType(RespExpr::NIL));

  // DELEX IFNE does not delete when values match
  Run({"set", "key5", "value5"});
  EXPECT_EQ(0, CheckedInt({"delex", "key5", "IFNE", "value5"}));
  EXPECT_EQ(Run({"get", "key5"}), "value5");

  // DELEX IFDEQ tests - get digest first and use it
  Run({"set", "key6", "value6"});
  auto digest = Run({"digest", "key6"});
  string_view digest_str = ToSV(digest.GetBuf());
  EXPECT_EQ(1, CheckedInt({"delex", "key6", "IFDEQ", string(digest_str)}));
  EXPECT_THAT(Run({"get", "key6"}), ArgType(RespExpr::NIL));

  // DELEX IFDEQ does not delete when digests differ
  Run({"set", "key7", "value7"});
  EXPECT_EQ(0, CheckedInt({"delex", "key7", "IFDEQ", "0000000000000000"}));
  EXPECT_EQ(Run({"get", "key7"}), "value7");

  // DELEX IFDNE deletes when digests differ
  Run({"set", "key8", "value8"});
  EXPECT_EQ(1, CheckedInt({"delex", "key8", "IFDNE", "0000000000000000"}));
  EXPECT_THAT(Run({"get", "key8"}), ArgType(RespExpr::NIL));

  // DELEX IFDNE does not delete when digests match
  Run({"set", "key9", "value9"});
  auto digest9 = Run({"digest", "key9"});
  string_view digest9_str = ToSV(digest9.GetBuf());
  EXPECT_EQ(0, CheckedInt({"delex", "key9", "IFDNE", string(digest9_str)}));
  EXPECT_EQ(Run({"get", "key9"}), "value9");

  Run({"lpush", "list1", "item"});
  EXPECT_THAT(Run({"delex", "list1", "IFEQ", "item"}), ErrArg("WRONGTYPE"));

  // DELEX with invalid option returns syntax error
  Run({"set", "key10", "value10"});
  EXPECT_THAT(Run({"delex", "key10", "INVALID", "value"}), ErrArg("Unknown subcommand"));

  // DELEX with too many arguments returns error
  EXPECT_THAT(Run({"delex", "key", "IFEQ", "val", "extra"}), ErrArg("wrong number of arguments"));

  EXPECT_THAT(Run({"delex", "key11", "randomarg"}), ErrArg("wrong number of arguments"));
  EXPECT_THAT(Run({"delex", "key12", "IFEQ"}), ErrArg("wrong number of arguments"));
  EXPECT_THAT(Run({"delex", "key13", "xyz"}), ErrArg("wrong number of arguments"));
}

TEST_F(GenericFamilyTest, Rm) {
  // Basic: RM 0 on empty db returns [0, 0]
  auto resp = Run({"rm", "0"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0], "0");
  EXPECT_THAT(resp.GetVec()[1], IntArg(0));

  // With MATCH arg — still parses OK
  resp = Run({"rm", "0", "match", "foo*"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[1], IntArg(0));

  // With TYPE arg — still parses OK
  resp = Run({"rm", "0", "type", "string"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[1], IntArg(0));

  // With COUNT arg — still parses OK
  resp = Run({"rm", "0", "match", "foo*", "count", "100"});
  ASSERT_THAT(resp, ArrLen(2));

  // Invalid cursor → error
  resp = Run({"rm", "notanumber"});
  EXPECT_THAT(resp, ErrArg("invalid cursor"));

  // Invalid options → syntax error
  resp = Run({"rm", "0", "badopt"});
  EXPECT_THAT(resp, ErrArg("syntax"));
}

TEST_F(GenericFamilyTest, RmDeletesMatchingKeys) {
  for (int i = 0; i < 10; ++i)
    Run({"set", absl::StrCat("foo", i), "val"});
  for (int i = 0; i < 5; ++i)
    Run({"set", absl::StrCat("bar", i), "val"});

  // Delete all foo* keys by iterating until cursor returns 0
  uint32_t total_deleted = 0;
  uint64_t cursor = 0;
  do {
    auto resp = Run({"rm", absl::StrCat(cursor), "match", "foo*", "count", "100"});
    ASSERT_THAT(resp, ArrLen(2));
    ASSERT_TRUE(absl::SimpleAtoi(resp.GetVec()[0].GetString(), &cursor));
    total_deleted += resp.GetVec()[1].GetInt().value();
  } while (cursor != 0);

  EXPECT_EQ(total_deleted, 10u);

  // foo* keys are gone, bar* keys remain
  EXPECT_EQ(Run({"exists", "foo0"}), 0);
  EXPECT_EQ(Run({"exists", "bar0"}), 1);
  EXPECT_EQ(Run({"dbsize"}), 5);
}

}  // namespace dfly


================================================
FILE: src/server/geo_family.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/ascii.h>

extern "C" {
#include "redis/geo.h"
#include "redis/geohash.h"
#include "redis/geohash_helper.h"
#include "redis/redis_aux.h"
#include "redis/util.h"
#include "redis/zmalloc.h"
}

#include "base/logging.h"
#include "core/sorted_map.h"
#include "facade/cmd_arg_parser.h"
#include "facade/error.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/transaction.h"
#include "server/zset_family.h"

namespace dfly {

using namespace std;
using namespace facade;
using absl::SimpleAtoi;
namespace {

using CI = CommandId;

enum Errors {
  INVALID_LONG_LAT = CmdArgParser::ErrorType::CUSTOM_ERROR,
  INVALID_UNIT = INVALID_LONG_LAT + 1,
};

const char kNxXxErr[] = "XX and NX options at the same time are not compatible";
const char kFromMemberLonglatErr[] =
    "FROMMEMBER and FROMLONLAT options at the same time are not compatible";
const char kByRadiusBoxErr[] = "BYRADIUS and BYBOX options at the same time are not compatible";
const char kAscDescErr[] = "ASC and DESC options at the same time are not compatible";
const char kStoreTypeErr[] = "STORE and STOREDIST options at the same time are not compatible";
const char kStoreCompatRadErr[] =
    "STORE option in GEORADIUS is not compatible with WITHDIST, WITHHASH and WITHCOORDS options";
const char kStoreCompatByMemberErr[] =
    "STORE option in GEORADIUSBYMEMBER is not compatible with WITHDIST, WITHHASH and WITHCOORDS "
    "options";
const char kMemberNotFound[] = "could not decode requested zset member";
const char kInvalidUnit[] = "unsupported unit provided. please use M, KM, FT, MI";
const char kCountError[] = "ERR COUNT must be > 0";
constexpr string_view kGeoAlphabet = "0123456789bcdefghjkmnpqrstuvwxyz"sv;

enum class Type {
  FROMMEMBER,
  FROMLONLAT,
  BYRADIUS,
  BYBOX,
  ASC,
  DESC,
  COUNT,
  WITHCOORD,
  WITHDIST,
  WITHHASH,

  STORE,
  STOREDIST
};

using MScoreResponse = std::vector<std::optional<double>>;

using ScoredMember = std::pair<std::string, double>;
using ScoredArray = std::vector<ScoredMember>;
using ScoredMemberView = std::pair<double, std::string_view>;
using ScoredMemberSpan = absl::Span<const ScoredMemberView>;

struct GeoPoint {
  double longitude;
  double latitude;
  double dist;
  double score;
  std::string member;
  GeoPoint() : longitude(0.0), latitude(0.0), dist(0.0), score(0.0){};
  GeoPoint(double _longitude, double _latitude, double _dist, double _score,
           const std::string& _member)
      : longitude(_longitude), latitude(_latitude), dist(_dist), score(_score), member(_member){};
};
using GeoArray = std::vector<GeoPoint>;

enum class Sorting { kUnsorted, kAsc, kDesc, kError };
enum class GeoStoreType { kNoStore, kStoreHash, kStoreDist, kError };
struct GeoSearchOpts {
  double conversion = 0;
  uint64_t count = std::numeric_limits<uint64_t>::max();
  Sorting sorting = Sorting::kUnsorted;
  bool any = 0;
  bool withdist = 0;
  bool withcoord = 0;
  bool withhash = 0;
  GeoStoreType store = GeoStoreType::kNoStore;
  string_view store_key;

  bool HasWithStatement() const {
    return withdist || withcoord || withhash;
  }
};

bool ValidateLongLat(double longitude, double latitude) {
  return !(longitude < GEO_LONG_MIN || longitude > GEO_LONG_MAX || latitude < GEO_LAT_MIN ||
           latitude > GEO_LAT_MAX);
}

void ParseLongLat(CmdArgParser* parser, double lonlat[2]) {
  std::tie(lonlat[0], lonlat[1]) = parser->Next<double, double>();

  if (!ValidateLongLat(lonlat[0], lonlat[1])) {
    parser->Report(Errors::INVALID_LONG_LAT);
  }
}

bool ParseLongLat(string_view lon, string_view lat, std::pair<double, double>* res) {
  if (!ParseDouble(lon, &res->first))
    return false;

  if (!ParseDouble(lat, &res->second))
    return false;

  return ValidateLongLat(res->first, res->second);
}

bool ScoreToLongLat(const std::optional<double>& val, double* xy) {
  if (!val.has_value())
    return false;

  double score = *val;

  GeoHashBits hash = {.bits = (uint64_t)score, .step = GEO_STEP_MAX};

  return geohashDecodeToLongLatType(hash, xy) == 1;
}

bool ToAsciiGeoHash(const std::optional<double>& val, array<char, 12>* buf) {
  if (!val.has_value())
    return false;

  double score = *val;

  GeoHashBits hash = {.bits = (uint64_t)score, .step = GEO_STEP_MAX};

  double xy[2];
  if (!geohashDecodeToLongLatType(hash, xy)) {
    return false;
  }

  /* Re-encode */
  GeoHashRange r[2];
  r[0].min = -180;
  r[0].max = 180;
  r[1].min = -90;
  r[1].max = 90;

  geohashEncode(&r[0], &r[1], xy[0], xy[1], 26, &hash);

  for (int i = 0; i < 11; i++) {
    int idx;
    if (i == 10) {
      /* We have just 52 bits, but the API used to output
       * an 11 bytes geohash. For compatibility we assume
       * zero. */
      idx = 0;
    } else {
      idx = (hash.bits >> (52 - ((i + 1) * 5))) % kGeoAlphabet.size();
    }
    (*buf)[i] = kGeoAlphabet[idx];
  }
  (*buf)[11] = '\0';

  return true;
}

double ExtractUnit(CmdArgParser* parser) {
  auto unit = parser->TryMapNext("M", 1.0, "KM", 1000.0, "FT", 0.3048, "MI", 1609.34);
  if (!unit)
    parser->Report(Errors::INVALID_UNIT);
  return unit.value_or(-1);
}

double ExtractUnit(std::string_view arg) {
  const string unit = absl::AsciiStrToUpper(arg);
  if (unit == "M") {
    return 1;
  } else if (unit == "KM") {
    return 1000;
  } else if (unit == "FT") {
    return 0.3048;
  } else if (unit == "MI") {
    return 1609.34;
  } else {
    return -1;
  }
}

bool HandleGeoParserFinalize(const GeoShape& shape, CmdArgParser* parser,
                             CommandContext* cmd_cntx) {
  if (parser->Finalize()) {
    return false;
  }

  auto error = parser->TakeError();
  switch (error.type) {
    case Errors::INVALID_LONG_LAT: {
      string err =
          absl::StrCat("-ERR invalid longitude,latitude pair ", shape.xy[0], ",", shape.xy[1]);
      cmd_cntx->SendError(err, kSyntaxErrType);
      break;
    }
    case Errors::INVALID_UNIT:
      cmd_cntx->SendError("Unsupported unit provided. please use M, KM, FT, MI", kSyntaxErrType);
      break;
    default:
      cmd_cntx->SendError(error.MakeReply());
      break;
  }

  return true;
}

void CmdGeoAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  ZSetFamily::ZParams zparams;
  size_t i = 1;
  for (; i < args.size(); ++i) {
    string cur_arg = absl::AsciiStrToUpper(ArgS(args, i));

    if (cur_arg == "XX") {
      zparams.flags |= ZADD_IN_XX;  // update only
    } else if (cur_arg == "NX") {
      zparams.flags |= ZADD_IN_NX;  // add new only.
    } else if (cur_arg == "CH") {
      zparams.ch = true;
    } else {
      break;
    }
  }

  auto* builder = cmd_cntx->rb();
  args.remove_prefix(i);
  if (args.empty() || args.size() % 3 != 0) {
    builder->SendError(kSyntaxErr);
    return;
  }

  if ((zparams.flags & ZADD_IN_NX) && (zparams.flags & ZADD_IN_XX)) {
    builder->SendError(kNxXxErr);
    return;
  }

  absl::InlinedVector<ScoredMemberView, 4> members;
  for (i = 0; i < args.size(); i += 3) {
    string_view longitude = ArgS(args, i);
    string_view latitude = ArgS(args, i + 1);
    string_view member = ArgS(args, i + 2);

    pair<double, double> longlat;

    if (!ParseLongLat(longitude, latitude, &longlat)) {
      string err = absl::StrCat("-ERR invalid longitude,latitude pair ", longitude, ",", latitude,
                                ",", member);

      return builder->SendError(err, kSyntaxErrType);
    }

    /* Turn the coordinates into the score of the element. */
    GeoHashBits hash;
    geohashEncodeWGS84(longlat.first, longlat.second, GEO_STEP_MAX, &hash);
    GeoHashFix52Bits bits = geohashAlign52Bits(hash);

    members.emplace_back(bits, member);
  }
  DCHECK(cmd_cntx->tx());

  absl::Span memb_sp{members.data(), members.size()};
  ZSetFamily::ZAddGeneric(key, zparams, memb_sp, cmd_cntx);
}

void CmdGeoHash(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  OpResult<MScoreResponse> result = ZSetFamily::ZGetMembers(args, cmd_cntx->tx(), rb);

  if (result.status() == OpStatus::WRONG_TYPE) {
    return rb->SendError(kWrongTypeErr);
  }

  RedisReplyBuilder::ArrayScope scope{rb, result->size()};
  array<char, 12> buf;
  for (const auto& p : result.value()) {
    if (ToAsciiGeoHash(p, &buf)) {
      rb->SendBulkString(string_view{buf.data(), buf.size() - 1});
    } else {
      rb->SendNull();
    }
  }
}

void CmdGeoPos(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  OpResult<MScoreResponse> result = ZSetFamily::ZGetMembers(args, cmd_cntx->tx(), rb);

  if (result.status() != OpStatus::OK) {
    return rb->SendError(result.status());
  }

  RedisReplyBuilder::ArrayScope scope{rb, result->size()};
  double xy[2];
  for (const auto& p : result.value()) {
    if (ScoreToLongLat(p, xy)) {
      rb->StartArray(2);
      rb->SendDouble(xy[0]);
      rb->SendDouble(xy[1]);
    } else {
      rb->SendNull();
    }
  }
}

void CmdGeoDist(CmdArgList args, CommandContext* cmd_cntx) {
  double distance_multiplier = 1;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (args.size() == 4) {
    string_view unit = ArgS(args, 3);
    distance_multiplier = ExtractUnit(unit);
    args.remove_suffix(1);
    if (distance_multiplier < 0) {
      return rb->SendError(kInvalidUnit);
    }
  } else if (args.size() != 3) {
    return rb->SendError(kSyntaxErr);
  }

  OpResult<MScoreResponse> result = ZSetFamily::ZGetMembers(args, cmd_cntx->tx(), rb);

  if (result.status() != OpStatus::OK) {
    return rb->SendError(result.status());
  }

  const MScoreResponse& arr = result.value();

  if (arr.size() != 2) {
    return rb->SendError(kSyntaxErr);
  }

  double xyxy[4];  // 2 pairs of score holding 2 locations
  for (size_t i = 0; i < arr.size(); i++) {
    if (!ScoreToLongLat(arr[i], xyxy + (i * 2))) {
      return rb->SendNull();
    }
  }

  return rb->SendDouble(geohashGetDistance(xyxy[0], xyxy[1], xyxy[2], xyxy[3]) /
                        distance_multiplier);
}

namespace {
std::vector<ZSetFamily::ZRangeSpec> GetGeoRangeSpec(const GeoHashRadius& n) {
  array<GeoHashBits, 9> neighbors;
  unsigned int last_processed = 0;

  neighbors[0] = n.hash;
  neighbors[1] = n.neighbors.north;
  neighbors[2] = n.neighbors.south;
  neighbors[3] = n.neighbors.east;
  neighbors[4] = n.neighbors.west;
  neighbors[5] = n.neighbors.north_east;
  neighbors[6] = n.neighbors.north_west;
  neighbors[7] = n.neighbors.south_east;
  neighbors[8] = n.neighbors.south_west;

  // Get range_specs for neighbors (*and* our own hashbox)
  std::vector<ZSetFamily::ZRangeSpec> range_specs;
  for (unsigned int i = 0; i < neighbors.size(); i++) {
    if (HASHISZERO(neighbors[i])) {
      continue;
    }

    // When a huge Radius (in the 5000 km range or more) is used,
    // adjacent neighbors can be the same, leading to duplicated
    // elements. Skip every range which is the same as the one
    // processed previously.
    if (last_processed && neighbors[i].bits == neighbors[last_processed].bits &&
        neighbors[i].step == neighbors[last_processed].step) {
      continue;
    }

    GeoHashFix52Bits min, max;
    scoresOfGeoHashBox(neighbors[i], &min, &max);

    ZSetFamily::ScoreInterval si;
    si.first = ZSetFamily::Bound{static_cast<double>(min), false};
    si.second = ZSetFamily::Bound{static_cast<double>(max), true};

    ZSetFamily::RangeParams range_params;
    range_params.interval_type = ZSetFamily::RangeParams::IntervalType::SCORE;
    range_params.with_scores = true;
    range_specs.emplace_back(si, range_params);

    last_processed = i;
  }
  return range_specs;
}

void SortIfNeeded(GeoArray* ga, Sorting sorting, uint64_t count) {
  if (sorting == Sorting::kUnsorted) {
    if (count && ga->size() > count) {
      ga->resize(count);
    }
    return;
  }

  auto comparator = [&](const GeoPoint& a, const GeoPoint& b) {
    if (sorting == Sorting::kAsc) {
      return a.dist < b.dist;
    } else {
      DCHECK(sorting == Sorting::kDesc);
      return a.dist > b.dist;
    }
  };

  if (count > 0) {
    count = std::min(count, static_cast<uint64_t>(ga->size()));
    std::partial_sort(ga->begin(), ga->begin() + count, ga->end(), comparator);
    ga->resize(count);
  } else {
    std::sort(ga->begin(), ga->end(), comparator);
  }
}

void GeoSearchStoreGeneric(Transaction* tx, facade::SinkReplyBuilder* builder,
                           const GeoShape& shape_ref, string_view key, string_view member,
                           const GeoSearchOpts& geo_ops) {
  GeoShape* shape = &(const_cast<GeoShape&>(shape_ref));
  auto* rb = static_cast<RedisReplyBuilder*>(builder);

  ShardId from_shard = Shard(key, shard_set->size());

  if (!member.empty()) {
    // get shape.xy from member
    OpResult<double> member_score;
    auto cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == from_shard) {
        member_score = ZSetFamily::OpScore(t->GetOpArgs(shard), key, member);
      }
      return OpStatus::OK;
    };
    tx->Execute(std::move(cb), false);
    auto member_sts = member_score.status();
    if (member_sts != OpStatus::OK) {
      tx->Conclude();
      switch (member_sts) {
        case OpStatus::WRONG_TYPE:
          return builder->SendError(kWrongTypeErr);
        case OpStatus::KEY_NOTFOUND:
          return rb->StartArray(0);
        case OpStatus::MEMBER_NOTFOUND:
          return builder->SendError(kMemberNotFound);
        default:
          return builder->SendError(member_sts);
      }
    }
    ScoreToLongLat(*member_score, shape->xy);
  } else {
    // verify key is valid
    OpResult<void> result;
    auto cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == from_shard) {
        result = ZSetFamily::OpKeyExisted(t->GetOpArgs(shard), key);
      }
      return OpStatus::OK;
    };
    tx->Execute(std::move(cb), false);
    auto result_sts = result.status();
    if (result_sts != OpStatus::OK) {
      tx->Conclude();
      switch (result_sts) {
        case OpStatus::WRONG_TYPE:
          return builder->SendError(kWrongTypeErr);
        case OpStatus::KEY_NOTFOUND:
          return rb->StartArray(0);
        default:
          return builder->SendError(result_sts);
      }
    }
  }
  DCHECK(shape->xy[0] >= -180.0 && shape->xy[0] <= 180.0);
  DCHECK(shape->xy[1] >= -90.0 && shape->xy[1] <= 90.0);

  // query
  GeoHashRadius georadius = geohashCalculateAreasByShapeWGS84(shape);
  GeoArray ga;
  auto range_specs = GetGeoRangeSpec(georadius);
  // get all the matching members and add them to the potential result list
  vector<OpResult<vector<ScoredArray>>> result_arrays;
  auto cb = [&](Transaction* t, EngineShard* shard) {
    auto res_it = ZSetFamily::OpRanges(range_specs, t->GetOpArgs(shard), key);
    if (res_it) {
      result_arrays.emplace_back(res_it);
    }
    return OpStatus::OK;
  };

  tx->Execute(std::move(cb), geo_ops.store == GeoStoreType::kNoStore);

  // filter potential result list
  double xy[2];
  double distance;
  unsigned long limit = geo_ops.any ? geo_ops.count : 0;
  for (auto& result_array : result_arrays) {
    for (auto& arr : *result_array) {
      for (auto& p : arr) {
        if (geoWithinShape(shape, p.second, xy, &distance) == 0) {
          ga.emplace_back(xy[0], xy[1], distance, p.second, p.first);
          if (limit > 0 && ga.size() >= limit)
            break;
        }
      }
    }
  }

  // sort and trim by count
  SortIfNeeded(&ga, geo_ops.sorting, geo_ops.count);

  if (geo_ops.store == GeoStoreType::kNoStore) {
    // case 1: read mode
    // case 2: write mode, kNoStore
    // generate reply array withdist, withcoords, withhash
    int record_size = 1;
    if (geo_ops.withdist) {
      record_size++;
    }
    if (geo_ops.withhash) {
      record_size++;
    }
    if (geo_ops.withcoord) {
      record_size++;
    }

    RedisReplyBuilder::ArrayScope scope{rb, ga.size()};
    for (const auto& p : ga) {
      // [member, dist, x, y, hash]
      if (geo_ops.HasWithStatement()) {
        rb->StartArray(record_size);
      }
      rb->SendBulkString(p.member);
      if (geo_ops.withdist) {
        rb->SendDouble(p.dist / geo_ops.conversion);
      }
      if (geo_ops.withhash) {
        rb->SendDouble(p.score);
      }
      if (geo_ops.withcoord) {
        rb->StartArray(2);
        rb->SendDouble(p.longitude);
        rb->SendDouble(p.latitude);
      }
    }
  } else {
    // case 3: write mode, !kNoStore
    DCHECK(geo_ops.store == GeoStoreType::kStoreDist || geo_ops.store == GeoStoreType::kStoreHash);
    ShardId dest_shard = Shard(geo_ops.store_key, shard_set->size());
    DVLOG(1) << "store shard:" << dest_shard << ", key " << geo_ops.store_key;

    OpResult<ZSetFamily::AddResult> add_result;
    vector<ScoredMemberView> smvec;
    for (const auto& p : ga) {
      if (geo_ops.store == GeoStoreType::kStoreDist) {
        smvec.emplace_back(p.dist / geo_ops.conversion, p.member);
      } else {
        DCHECK(geo_ops.store == GeoStoreType::kStoreHash);
        smvec.emplace_back(p.score, p.member);
      }
    }

    auto store_cb = [&](Transaction* t, EngineShard* shard) {
      if (shard->shard_id() == dest_shard) {
        ZSetFamily::ZParams zparams;
        zparams.override = true;
        add_result = ZSetFamily::OpAdd(t->GetOpArgs(shard), zparams, geo_ops.store_key,
                                       ScoredMemberSpan{smvec})
                         .value();
      }
      return OpStatus::OK;
    };

    tx->Execute(std::move(store_cb), true);

    rb->SendLong(smvec.size());
  }
}

}  // namespace

void CmdGeoSearch(CmdArgList args, CommandContext* cmd_cntx) {
  GeoShape shape = {};
  GeoSearchOpts geo_ops;
  string_view member;

  // FROMMEMBER or FROMLONLAT is set
  int from_set = 0;
  // BYRADIUS or BYBOX is set
  int by_set = 0;
  auto* builder = cmd_cntx->rb();

  CmdArgParser parser(args);
  string_view key = parser.Next();

  while (parser.HasNext()) {
    auto type = parser.MapNext(
        "FROMMEMBER", Type::FROMMEMBER, "FROMLONLAT", Type::FROMLONLAT, "BYRADIUS", Type::BYRADIUS,
        "BYBOX", Type::BYBOX, "ASC", Type::ASC, "DESC", Type::DESC, "COUNT", Type::COUNT,
        "WITHCOORD", Type::WITHCOORD, "WITHDIST", Type::WITHDIST, "WITHHASH", Type::WITHHASH);

    switch (type) {
      case Type::FROMMEMBER:
        ++from_set;
        member = parser.Next();
        break;
      case Type::FROMLONLAT: {
        ++from_set;
        ParseLongLat(&parser, shape.xy);
        break;
      }
      case Type::BYRADIUS:
        ++by_set;
        shape.t.radius = parser.Next<double>();
        shape.conversion = ExtractUnit(&parser);
        geo_ops.conversion = shape.conversion;
        shape.type = CIRCULAR_TYPE;
        break;
      case Type::BYBOX: {
        ++by_set;
        std::tie(shape.t.r.width, shape.t.r.height) = parser.Next<double, double>();
        shape.conversion = ExtractUnit(&parser);
        geo_ops.conversion = shape.conversion;
        shape.type = RECTANGLE_TYPE;
        break;
      }
      case Type::ASC:
        geo_ops.sorting = geo_ops.sorting == Sorting::kUnsorted ? Sorting::kAsc : Sorting::kError;
        break;
      case Type::DESC:
        geo_ops.sorting = geo_ops.sorting == Sorting::kUnsorted ? Sorting::kDesc : Sorting::kError;
        break;
      case Type::COUNT:
        geo_ops.count = parser.Next<uint64_t>();
        geo_ops.any = parser.Check("ANY");
        break;
      case Type::WITHCOORD:
        geo_ops.withcoord = true;
        break;
      case Type::WITHDIST:
        geo_ops.withdist = true;
        break;
      case Type::WITHHASH:
        geo_ops.withhash = true;
        break;
      default:
        return builder->SendError(kSyntaxErr);
    }
  }

  if (HandleGeoParserFinalize(shape, &parser, cmd_cntx)) {
    return;
  }

  // check mandatory options
  if (from_set == 0 || by_set == 0) {
    return builder->SendError(kSyntaxErr);
  } else if (from_set > 1) {
    return builder->SendError(kFromMemberLonglatErr);
  } else if (by_set > 1) {
    return builder->SendError(kByRadiusBoxErr);
  } else if (geo_ops.sorting == Sorting::kError) {
    return builder->SendError(kAscDescErr);
  } else if (geo_ops.count == 0) {
    return builder->SendError(kCountError);
  }

  geo_ops.count = (geo_ops.count == UINT64_MAX) ? 0 : geo_ops.count;
  GeoSearchStoreGeneric(cmd_cntx->tx(), builder, shape, key, member, geo_ops);
}

void GeoRadiusByMemberGeneric(CmdArgList args, CommandContext* cmd_cntx, bool read_only) {
  GeoShape shape = {};
  GeoSearchOpts geo_ops;
  // parse arguments
  string_view key = ArgS(args, 0);
  // member to latlong, set shape.xy
  string_view member = ArgS(args, 1);

  auto* builder = cmd_cntx->rb();
  if (!ParseDouble(ArgS(args, 2), &shape.t.radius)) {
    return builder->SendError(kInvalidFloatErr);
  }
  string_view unit = ArgS(args, 3);
  shape.conversion = ExtractUnit(unit);
  geo_ops.conversion = shape.conversion;
  if (shape.conversion == -1) {
    return builder->SendError("unsupported unit provided. please use M, KM, FT, MI");
  }
  shape.type = CIRCULAR_TYPE;

  for (size_t i = 4; i < args.size(); ++i) {
    string cur_arg = absl::AsciiStrToUpper(ArgS(args, i));

    if (cur_arg == "ASC") {
      if (geo_ops.sorting != Sorting::kUnsorted) {
        return builder->SendError(kAscDescErr);
      }
      geo_ops.sorting = Sorting::kAsc;
    } else if (cur_arg == "DESC") {
      if (geo_ops.sorting != Sorting::kUnsorted) {
        return builder->SendError(kAscDescErr);
      }
      geo_ops.sorting = Sorting::kDesc;
    } else if (cur_arg == "COUNT") {
      if (i + 1 < args.size() && absl::SimpleAtoi(ArgS(args, i + 1), &geo_ops.count)) {
        i++;
        if (geo_ops.count == 0) {
          return builder->SendError(kCountError);
        }
      } else {
        return builder->SendError(kSyntaxErr);
      }
      if (i + 1 < args.size() && ArgS(args, i + 1) == "ANY") {
        geo_ops.any = true;
        i++;
      }
    } else if (cur_arg == "WITHCOORD") {
      geo_ops.withcoord = true;
    } else if (cur_arg == "WITHDIST") {
      geo_ops.withdist = true;
    } else if (cur_arg == "WITHHASH") {
      geo_ops.withhash = true;
    } else if (cur_arg == "STORE" && !read_only) {
      if (geo_ops.store != GeoStoreType::kNoStore) {
        return builder->SendError(kStoreTypeErr);
      }
      if (i + 1 < args.size()) {
        geo_ops.store_key = ArgS(args, i + 1);
        geo_ops.store = GeoStoreType::kStoreHash;
        i++;
      } else {
        return builder->SendError(kSyntaxErr);
      }
    } else if (cur_arg == "STOREDIST" && !read_only) {
      if (geo_ops.store != GeoStoreType::kNoStore) {
        return builder->SendError(kStoreTypeErr);
      }
      if (i + 1 < args.size()) {
        geo_ops.store_key = ArgS(args, i + 1);
        geo_ops.store = GeoStoreType::kStoreDist;
        i++;
      } else {
        return builder->SendError(kSyntaxErr);
      }
    } else {
      return builder->SendError(kSyntaxErr);
    }
  }

  if ((geo_ops.withcoord || geo_ops.withdist || geo_ops.withhash) &&
      geo_ops.store != GeoStoreType::kNoStore) {
    return builder->SendError(kStoreCompatByMemberErr);
  }

  geo_ops.count = (geo_ops.count == UINT64_MAX) ? 0 : geo_ops.count;
  GeoSearchStoreGeneric(cmd_cntx->tx(), builder, shape, key, member, geo_ops);
}

void GeoRadiusGeneric(CmdArgList args, CommandContext* cmd_cntx, bool read_only) {
  GeoShape shape = {};
  GeoSearchOpts geo_ops;

  auto* builder = cmd_cntx->rb();

  CmdArgParser parser(args);

  string_view key = parser.Next();
  ParseLongLat(&parser, shape.xy);
  shape.t.radius = parser.Next<double>();
  shape.conversion = ExtractUnit(&parser);
  geo_ops.conversion = shape.conversion;
  shape.type = CIRCULAR_TYPE;

  while (parser.HasNext()) {
    // try and parse for only RO options first
    auto type =
        parser.TryMapNext("ASC", Type::ASC, "DESC", Type::DESC, "COUNT", Type::COUNT, "WITHCOORD",
                          Type::WITHCOORD, "WITHDIST", Type::WITHDIST, "WITHHASH", Type::WITHHASH);
    // if writing variant and there there was a mapping failure test for write variant arguments
    if (!type && !read_only) {
      type = parser.MapNext("STORE", Type::STORE, "STOREDIST", Type::STOREDIST);
    }

    // could not map the argument to an argument for RO or write GEORADIUS
    if (!type) {
      return builder->SendError("syntax error", kSyntaxErrType);
    }

    switch (*type) {
      case Type::STORE:
        geo_ops.store_key = parser.Next();
        geo_ops.store = geo_ops.store == GeoStoreType::kNoStore ? GeoStoreType::kStoreHash
                                                                : GeoStoreType::kError;
        break;
      case Type::STOREDIST:
        geo_ops.store_key = parser.Next();
        geo_ops.store = geo_ops.store == GeoStoreType::kNoStore ? GeoStoreType::kStoreDist
                                                                : GeoStoreType::kError;
        break;
      case Type::ASC:
        geo_ops.sorting = geo_ops.sorting == Sorting::kUnsorted ? Sorting::kAsc : Sorting::kError;
        break;
      case Type::DESC:
        geo_ops.sorting = geo_ops.sorting == Sorting::kUnsorted ? Sorting::kDesc : Sorting::kError;
        break;
      case Type::COUNT:
        geo_ops.count = parser.Next<uint64_t>();
        geo_ops.any = parser.Check("ANY");
        break;
      case Type::WITHCOORD:
        geo_ops.withcoord = true;
        break;
      case Type::WITHDIST:
        geo_ops.withdist = true;
        break;
      case Type::WITHHASH:
        geo_ops.withhash = true;
        break;
      default:
        // If MapNext failed, it means an unknown option was provided or
        // an option requiring an argument was missing its argument.
        // The parser has already recorded the error.
        DCHECK(parser.HasError());
        break;
    }
  }

  if (HandleGeoParserFinalize(shape, &parser, cmd_cntx)) {
    return;
  }

  if (geo_ops.sorting == Sorting::kError) {
    return builder->SendError(kAscDescErr);
  } else if (geo_ops.store == GeoStoreType::kError) {
    return builder->SendError(kStoreTypeErr);
  } else if (geo_ops.count == 0) {
    return builder->SendError(kCountError);
  }

  if ((geo_ops.withcoord || geo_ops.withdist || geo_ops.withhash) &&
      geo_ops.store != GeoStoreType::kNoStore) {
    return builder->SendError(kStoreCompatRadErr);
  }

  geo_ops.count = (geo_ops.count == UINT64_MAX) ? 0 : geo_ops.count;
  GeoSearchStoreGeneric(cmd_cntx->tx(), builder, shape, key, "", geo_ops);
}

void CmdGeoRadiusByMember(CmdArgList args, CommandContext* cmd_cntx) {
  GeoRadiusByMemberGeneric(args, cmd_cntx, false);
}

void CmdGeoRadiusByMemberRO(CmdArgList args, CommandContext* cmd_cntx) {
  GeoRadiusByMemberGeneric(args, cmd_cntx, true);
}

void CmdGeoRadius(CmdArgList args, CommandContext* cmd_cntx) {
  GeoRadiusGeneric(args, cmd_cntx, false);
}

void CmdGeoRadiusRO(CmdArgList args, CommandContext* cmd_cntx) {
  GeoRadiusGeneric(args, cmd_cntx, true);
}

}  // namespace

#define HFUNC(x) SetHandler(&Cmd##x)

void RegisterGeoFamily(CommandRegistry* registry) {
  registry->StartFamily(acl::GEO);
  *registry << CI{"GEOADD", CO::JOURNALED | CO::DENYOOM, -5, 1, 1}.HFUNC(GeoAdd)
            << CI{"GEOHASH", CO::READONLY, -2, 1, 1}.HFUNC(GeoHash)
            << CI{"GEOPOS", CO::READONLY, -2, 1, 1}.HFUNC(GeoPos)
            << CI{"GEODIST", CO::READONLY, -4, 1, 1}.HFUNC(GeoDist)
            << CI{"GEOSEARCH", CO::READONLY, -7, 1, 1}.HFUNC(GeoSearch)
            << CI{"GEORADIUSBYMEMBER", CO::JOURNALED | CO::STORE_LAST_KEY, -5, 1, 1}.HFUNC(
                   GeoRadiusByMember)
            << CI{"GEORADIUSBYMEMBER_RO", CO::READONLY, -5, 1, 1}.HFUNC(GeoRadiusByMemberRO)
            << CI{"GEORADIUS", CO::JOURNALED | CO::STORE_LAST_KEY, -6, 1, 1}.HFUNC(GeoRadius)
            << CI{"GEORADIUS_RO", CO::READONLY, -6, 1, 1}.HFUNC(GeoRadiusRO);
}

}  // namespace dfly


================================================
FILE: src/server/geo_family_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;

namespace dfly {

class GeoFamilyTest : public BaseFamilyTest {
 protected:
};

TEST_F(GeoFamilyTest, GeoAdd) {
  EXPECT_EQ(2, CheckedInt({"geoadd", "Sicily", "13.361389", "38.115556", "Palermo", "15.087269",
                           "37.502669", "Catania"}));
  EXPECT_EQ(0, CheckedInt({"geoadd", "Sicily", "13.361389", "38.115556", "Palermo", "15.087269",
                           "37.502669", "Catania"}));
  auto resp = Run({"geohash", "Sicily", "Palermo", "Catania"});
  EXPECT_THAT(resp, RespArray(ElementsAre("sqc8b49rny0", "sqdtr74hyu0")));
}

TEST_F(GeoFamilyTest, GeoAddOptions) {
  EXPECT_EQ(2, CheckedInt({"geoadd", "Sicily", "13.361389", "38.115556", "Palermo", "15.087269",
                           "37.502669", "Catania"}));

  // add 1 + update 1 + XX
  EXPECT_EQ(0, CheckedInt({"geoadd", "Sicily", "XX", "15.361389", "38.115556", "Palermo",
                           "15.554167", "38.193611", "Messina"}));
  auto resp = Run({"geopos", "Sicily", "Palermo", "Messina"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre(RespArray(ElementsAre("15.361389219760895", "38.1155563954963")),
                                  ArgType(RespExpr::NIL))));

  // add 1 + update 1 + NX
  EXPECT_EQ(1, CheckedInt({"geoadd", "Sicily", "NX", "18.361389", "38.115556", "Palermo", "15.2875",
                           "37.069167", "Syracuse"}));
  resp = Run({"geopos", "Sicily", "Palermo", "Syracuse"});
  EXPECT_THAT(resp, RespArray(ElementsAre(
                        RespArray(ElementsAre("15.361389219760895", "38.1155563954963")),
                        RespArray(ElementsAre("15.287499725818634", "37.06916773705567")))));

  // add 1 + update 1 CH
  EXPECT_EQ(2, CheckedInt({"geoadd", "Sicily", "CH", "18.361389", "38.115556", "Palermo",
                           "12.434167", "37.798056", "Marsala"}));
  resp = Run({"geopos", "Sicily", "Palermo", "Marsala"});
  EXPECT_THAT(resp, RespArray(ElementsAre(
                        RespArray(ElementsAre("18.361386358737946", "38.1155563954963")),
                        RespArray(ElementsAre("12.43416577577591", "37.7980572230775")))));

  // update 1 + CH + XX
  EXPECT_EQ(1, CheckedInt({"geoadd", "Sicily", "CH", "XX", "10.361389", "38.115556", "Palermo"}));
  resp = Run({"geopos", "Sicily", "Palermo"});
  EXPECT_THAT(resp, RespArray(ElementsAre(DoubleArg(10.361389), DoubleArg(38.115556))));

  // add 1 + CH + NX
  EXPECT_EQ(1, CheckedInt({"geoadd", "Sicily", "CH", "NX", "14.25", "37.066667", "Gela"}));
  resp = Run({"geopos", "Sicily", "Gela"});
  EXPECT_THAT(resp, RespArray(ElementsAre(DoubleArg(14.25), DoubleArg(37.066667))));

  // add 1 + XX + NX
  resp = Run({"geoadd", "Sicily", "XX", "NX", "14.75", "36.933333", "Ragusa"});
  EXPECT_THAT(resp, ErrArg("XX and NX options at the same time are not compatible"));

  // incorrect number of args
  resp = Run({"geoadd", "Sicily", "14.75", "36.933333", "Ragusa", "10.23"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(GeoFamilyTest, GeoPos) {
  EXPECT_EQ(1, CheckedInt({"geoadd", "Sicily", "13.361389", "38.115556", "Palermo"}));
  auto resp = Run({"geopos", "Sicily", "Palermo", "NonExisting"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre(RespArray(ElementsAre("13.361389338970184", "38.1155563954963")),
                                  ArgType(RespExpr::NIL))));
}

TEST_F(GeoFamilyTest, GeoPosWrongType) {
  Run({"set", "x", "value"});
  EXPECT_THAT(Run({"geopos", "x", "Sicily", "Palermo"}), ErrArg("WRONGTYPE"));
}

TEST_F(GeoFamilyTest, GeoDist) {
  EXPECT_EQ(2, CheckedInt({"geoadd", "Sicily", "13.361389", "38.115556", "Palermo", "15.087269",
                           "37.502669", "Catania"}));
  auto resp = Run({"geodist", "Sicily", "Palermo", "Catania"});
  // Haswell+ CPUs use FMA instructions, yielding higher precision that breaks exact string
  // matching. DoubleArg handles parsing safely and applies standard floating-point tolerance.
  EXPECT_THAT(resp, DoubleArg(166274.15156960033));

  resp = Run({"geodist", "Sicily", "Palermo", "Catania", "km"});
  EXPECT_THAT(resp, DoubleArg(166.27415156960032));

  resp = Run({"geodist", "Sicily", "Palermo", "Catania", "MI"});
  EXPECT_THAT(resp, DoubleArg(103.31822459492733));

  resp = Run({"geodist", "Sicily", "Palermo", "Catania", "FT"});
  EXPECT_THAT(resp, DoubleArg(545518.8699790037));

  resp = Run({"geodist", "Sicily", "Foo", "Bar"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(GeoFamilyTest, GeoSearch) {
  EXPECT_EQ(10, CheckedInt({"geoadd",  "Europe",    "13.4050", "52.5200", "Berlin",   "3.7038",
                            "40.4168", "Madrid",    "9.1427",  "38.7369", "Lisbon",   "2.3522",
                            "48.8566", "Paris",     "16.3738", "48.2082", "Vienna",   "4.8952",
                            "52.3702", "Amsterdam", "10.7522", "59.9139", "Oslo",     "23.7275",
                            "37.9838", "Athens",    "19.0402", "47.4979", "Budapest", "6.2603",
                            "53.3498", "Dublin"}));

  auto resp = Run({"GEOSEARCH", "Europe", "FROMLONLAT", "13.4050", "52.5200", "BYRADIUS", "500",
                   "KM", "WITHCOORD", "WITHDIST", "WITHHASH"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378), "3673983950397063",
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))),
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293), "3678981558208417",
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))))));

  resp = Run({"GEOSEARCH", "invalid_key", "FROMMEMBER", "Madrid", "BYRADIUS", "700", "KM",
              "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEOSEARCH", "Europe", "FROMMEMBER", "invalid_member", "BYRADIUS", "700", "KM",
              "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp, ErrArg("could not decode requested zset member"));

  resp = Run({"GEOSEARCH", "America", "FROMLONLAT", "13.4050", "52.5200", "BYBOX", "1000", "1000",
              "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEOSEARCH", "Europe", "FROMLONLAT", "130.4050", "52.5200", "BYBOX", "10", "10", "KM",
              "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEOSEARCH", "Europe", "FROMLONLAT", "13.4050", "52.5200", "BYBOX", "1000", "1000",
              "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Vienna", DoubleArg(523.6926930553866),
                                RespArray(ElementsAre(DoubleArg(16.3738), DoubleArg(48.2082))))),
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))),
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))))));

  resp = Run({"GEOSEARCH", "Europe", "FROMLONLAT", "13.4050", "52.5200", "BYRADIUS", "500", "KM",
              "COUNT", "3", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))),
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))))));

  resp = Run({"GEOSEARCH", "Europe", "FROMLONLAT", "13.4050", "52.5200", "BYRADIUS", "500", "KM",
              "DESC", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))),
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))))));

  resp = Run({"GEOSEARCH", "Europe", "FROMMEMBER", "Madrid", "BYRADIUS", "700", "KM", "WITHCOORD",
              "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          // Use DoubleArg to tolerate floating-point precision differences on Haswell+ CPUs (e.g.,
          // 0 becoming 5.7e-15 due to FMA).
          RespArray(ElementsAre("Madrid", DoubleArg(0),
                                RespArray(ElementsAre(DoubleArg(3.7038), DoubleArg(40.4168))))),
          RespArray(ElementsAre("Lisbon", DoubleArg(502.20769462704106),
                                RespArray(ElementsAre(DoubleArg(9.1427), DoubleArg(38.7369))))))));

  resp = Run({"GEOSEARCH", "Europe", "FROMMEMBER", "Madrid", "BYRADIUS", "700", "KM"});
  EXPECT_THAT(resp, RespArray(ElementsAre("Madrid", "Lisbon")));
}

TEST_F(GeoFamilyTest, GeoRadiusByMember) {
  EXPECT_EQ(10, CheckedInt({"geoadd",  "Europe",    "13.4050", "52.5200", "Berlin",   "3.7038",
                            "40.4168", "Madrid",    "9.1427",  "38.7369", "Lisbon",   "2.3522",
                            "48.8566", "Paris",     "16.3738", "48.2082", "Vienna",   "4.8952",
                            "52.3702", "Amsterdam", "10.7522", "59.9139", "Oslo",     "23.7275",
                            "37.9838", "Athens",    "19.0402", "47.4979", "Budapest", "6.2603",
                            "53.3498", "Dublin"}));

  auto resp = Run({"GEORADIUSBYMEMBER", "invalid_key", "Madrid", "900", "KM"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEORADIUSBYMEMBER", "invalid_key", "Madrid", "900", "KM", "STORE", "store_key"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEORADIUSBYMEMBER", "Europe", "invalid_mem", "900", "KM", "STORE", "store_key"});
  EXPECT_THAT(resp, ErrArg("could not decode requested zset member"));

  resp = Run({"GEORADIUSBYMEMBER", "Europe", "Madrid", "700", "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Madrid", DoubleArg(0),
                                RespArray(ElementsAre(DoubleArg(3.703801), DoubleArg(40.416799))))),
          RespArray(
              ElementsAre("Lisbon", DoubleArg(502.207695),
                          RespArray(ElementsAre(DoubleArg(9.142698), DoubleArg(38.736900))))))));

  EXPECT_EQ(
      2, CheckedInt({"GEORADIUSBYMEMBER", "Europe", "Madrid", "700", "KM", "STORE", "store_key"}));
  resp = Run({"ZRANGE", "store_key", "0", "-1"});
  EXPECT_THAT(resp, RespArray(ElementsAre("Madrid", "Lisbon")));
  resp = Run({"ZRANGE", "store_key", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("Madrid", "3471766229222696", "Lisbon", "3473121093062745")));

  EXPECT_EQ(2, CheckedInt({"GEORADIUSBYMEMBER", "Europe", "Madrid", "700", "KM", "STOREDIST",
                           "store_dist_key"}));
  resp = Run({"ZRANGE", "store_dist_key", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("Madrid", DoubleArg(0), "Lisbon", DoubleArg(502.207695))));

  resp = Run(
      {"GEORADIUSBYMEMBER", "Europe", "Madrid", "900", "KM", "STORE", "store_key", "WITHCOORD"});
  EXPECT_THAT(resp, ErrArg("ERR STORE option in GEORADIUSBYMEMBER is not compatible with WITHDIST, "
                           "WITHHASH and WITHCOORDS options"));

  // Do not remove this test case, it's not redundant.
  // It's different from the one above because the arguments have
  // different permutation which our code did not handle.
  auto err =
      "ERR STORE option in GEORADIUSBYMEMBER is not compatible with WITHDIST, WITHHASH and WITHCOORDS options"sv;
  resp = Run("GEORADIUSBYMEMBER Sicily Agrigento 100 km WITHHASH store tmp");
  EXPECT_THAT(resp, ErrArg(err));

  resp = Run("GEOADD t 13.361389 38.115556 a 13.3619 38.1159 b 13.3608 38.1152 c");
  resp = Run("GEOSEARCH t FROMLONLAT 13.361389 38.115556 BYRADIUS 1 KM COUNT 0");
  EXPECT_THAT(resp, ErrArg("ERR COUNT must be > 0"));
}

TEST_F(GeoFamilyTest, GeoRadiusByMemberRO) {
  EXPECT_EQ(10, CheckedInt({"geoadd",  "Europe",    "13.4050", "52.5200", "Berlin",   "3.7038",
                            "40.4168", "Madrid",    "9.1427",  "38.7369", "Lisbon",   "2.3522",
                            "48.8566", "Paris",     "16.3738", "48.2082", "Vienna",   "4.8952",
                            "52.3702", "Amsterdam", "10.7522", "59.9139", "Oslo",     "23.7275",
                            "37.9838", "Athens",    "19.0402", "47.4979", "Budapest", "6.2603",
                            "53.3498", "Dublin"}));

  auto resp =
      Run({"GEORADIUSBYMEMBER_RO", "Europe", "Madrid", "700", "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Madrid", DoubleArg(0),
                                RespArray(ElementsAre(DoubleArg(3.703801), DoubleArg(40.416799))))),
          RespArray(
              ElementsAre("Lisbon", DoubleArg(502.207695),
                          RespArray(ElementsAre(DoubleArg(9.142698), DoubleArg(38.736900))))))));

  // GEORADIUSBYMEMBER_RO should not accept arguments for storing (writing data)
  resp =
      Run({"GEORADIUSBYMEMBER_RO", "Europe", "Madrid", "700", "KM", "STOREDIST", "store_dist_key"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"GEORADIUSBYMEMBER_RO", "Europe", "Madrid", "700", "KM", "STORE", "store_key"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(GeoFamilyTest, GeoRadius) {
  EXPECT_EQ(10, CheckedInt({"geoadd",  "Europe",    "13.4050", "52.5200", "Berlin",   "3.7038",
                            "40.4168", "Madrid",    "9.1427",  "38.7369", "Lisbon",   "2.3522",
                            "48.8566", "Paris",     "16.3738", "48.2082", "Vienna",   "4.8952",
                            "52.3702", "Amsterdam", "10.7522", "59.9139", "Oslo",     "23.7275",
                            "37.9838", "Athens",    "19.0402", "47.4979", "Budapest", "6.2603",
                            "53.3498", "Dublin"}));

  auto resp = Run({"GEORADIUS", "invalid_key", "16.3738", "48.2082", "900", "KM"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEORADIUS", "America", "13.4050", "52.5200", "500", "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEORADIUS", "Europe", "130.4050", "52.5200", "10", "KM", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"GEORADIUS", "Europe", "13.4050", "52.5200", "500", "KM", "COUNT", "3", "WITHCOORD",
              "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))),
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))))));

  resp = Run(
      {"GEORADIUS", "Europe", "13.4050", "52.5200", "500", "KM", "DESC", "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))),
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))))));

  EXPECT_EQ(2, CheckedInt({"GEORADIUS", "Europe", "3.7038", "40.4168", "700", "KM", "STORE",
                           "store_key"}));
  resp = Run({"ZRANGE", "store_key", "0", "-1"});

  EXPECT_THAT(resp, RespArray(ElementsAre("Madrid", "Lisbon")));
  resp = Run({"ZRANGE", "store_key", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("Madrid", "3471766229222696", "Lisbon", "3473121093062745")));

  EXPECT_EQ(2, CheckedInt({"GEORADIUS", "Europe", "3.7038", "40.4168", "700", "KM", "STOREDIST",
                           "store_dist_key"}));
  resp = Run({"ZRANGE", "store_dist_key", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("Madrid", DoubleArg(0), "Lisbon", DoubleArg(502.207694))));

  // Test with STORE and other options
  resp = Run({"GEORADIUS", "key:poq6moq\\r", "111.38360132204588", "-71.17374967857494",
              "69.77510489600115", "ft", "key", "WITHDIST", "COUNT", "key", "WITHCOORD", "count",
              "WITHHASH", "STORE"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  Run("GEOADD Sicily 13.361389 38.115556 Palermo 15.087269 37.502669 Catania");
  resp = Run("GEORADIUS SICILY 15 37 200 KM COUNT 0");
  EXPECT_THAT(resp, ErrArg("ERR COUNT must be > 0"));

  Run("GEOADD Sicily 13.583333 37.316667 Agrigento");
  resp = Run("GEORADIUSBYMEMBER Sicily Agrigento 100 km COUNT 0");
  EXPECT_THAT(resp, ErrArg("ERR COUNT must be > 0"));

  resp = Run("GEORADIUS Sicily 15 37 200 km COUNT 1");
  EXPECT_THAT(resp, "Agrigento");

  auto err =
      "ERR STORE option in GEORADIUS is not compatible with WITHDIST, WITHHASH and WITHCOORDS options"sv;
  resp = Run("GEORADIUS Sicily 15 37 200 km WITHDIST STORE result");
  EXPECT_THAT(resp, ErrArg(err));
}

TEST_F(GeoFamilyTest, GeoRadiusRO) {
  EXPECT_EQ(10, CheckedInt({"geoadd",  "Europe",    "13.4050", "52.5200", "Berlin",   "3.7038",
                            "40.4168", "Madrid",    "9.1427",  "38.7369", "Lisbon",   "2.3522",
                            "48.8566", "Paris",     "16.3738", "48.2082", "Vienna",   "4.8952",
                            "52.3702", "Amsterdam", "10.7522", "59.9139", "Oslo",     "23.7275",
                            "37.9838", "Athens",    "19.0402", "47.4979", "Budapest", "6.2603",
                            "53.3498", "Dublin"}));

  // GEORADIUS_RO should not accept arguments for storing (writing data)
  auto resp =
      Run({"GEORADIUS_RO", "Europe", "13.4050", "52.5200", "900", "KM", "STORE_DIST", "store_key"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"GEORADIUS_RO", "Europe", "13.4050", "52.5200", "900", "KM", "STORE", "store_key"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"GEORADIUS_RO", "Europe", "13.4050", "52.5200", "500", "KM", "COUNT", "3",
              "WITHCOORD", "WITHDIST"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          RespArray(ElementsAre("Berlin", DoubleArg(0.00017343178521311378),
                                RespArray(ElementsAre(DoubleArg(13.4050), DoubleArg(52.5200))))),
          RespArray(ElementsAre("Dublin", DoubleArg(487.5619030644293),
                                RespArray(ElementsAre(DoubleArg(6.2603), DoubleArg(53.3498))))))));
}

TEST_F(GeoFamilyTest, GeoRadiusByMemberUb) {
  Run({"GEOADD", "geo", "-118.2437", "34.0522", "972"});
  Run({"GEOADD", "geo", "-73.935242", "40.730610", "973"});
  Run({"GEOADD", "geo", "-122.4194", "37.7749", "971"});

  auto resp = Run({"GEORADIUSBYMEMBER", "geo", "971", "200", "mi", "WITHCOORD", "WITHDIST", "COUNT",
                   "40", "ASC"});
  // Use DoubleArg(0) to tolerate tiny floating-point residuals (e.g. 5e-15) on AVX/FMA builds.
  EXPECT_THAT(resp, RespArray(ElementsAre(
                        "971", DoubleArg(0),
                        RespArray(ElementsAre("-122.41940170526505", "37.77490001056578")))));
}

}  // namespace dfly


================================================
FILE: src/server/hll_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

extern "C" {
#include "redis/hyperloglog.h"
}

#include "base/logging.h"
#include "base/stl_util.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/transaction.h"

namespace dfly {

using namespace std;
using namespace facade;

namespace {

template <typename T>
void HandleOpValueResult(const OpResult<T>& result, SinkReplyBuilder* builder) {
  static_assert(std::is_integral<T>::value,
                "we are only handling types that are integral types in the return types from "
                "here");
  if (result) {
    builder->SendLong(result.value());
  } else {
    switch (result.status()) {
      case OpStatus::WRONG_TYPE:
        builder->SendError(kWrongTypeErr);
        break;
      case OpStatus::OUT_OF_MEMORY:
        builder->SendError(kOutOfMemory);
        break;
      case OpStatus::INVALID_VALUE:
        builder->SendError(kInvalidHllError);
        break;
      case OpStatus::CORRUPTED_HLL:
        builder->SendError(facade::StatusToMsg(OpStatus::CORRUPTED_HLL));
        break;
      default:
        builder->SendLong(0);  // in case we don't have the value we should just send 0
        break;
    }
  }
}

HllBufferPtr StringToHllPtr(string_view hll) {
  return {.hll = (unsigned char*)hll.data(), .size = hll.size()};
}

bool ConvertToDenseIfNeeded(string* hll) {
  int hll_validity = isValidHLL(StringToHllPtr(*hll));
  if (hll_validity == HLL_VALID_SPARSE) {
    string new_hll;
    new_hll.resize(getDenseHllSize());
    int result = convertSparseToDenseHll(StringToHllPtr(*hll), StringToHllPtr(new_hll));
    if (result != 0) {
      // Conversion failed - HLL data is corrupted
      return false;
    }
    *hll = std::move(new_hll);
    return true;
  }
  return hll_validity == HLL_VALID_DENSE;
}

OpResult<int> AddToHll(const OpArgs& op_args, string_view key, CmdArgList values) {
  auto& db_slice = op_args.GetDbSlice();

  string hll;

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_STRING);
  RETURN_ON_BAD_STATUS(op_res);
  auto& res = *op_res;
  if (res.is_new) {
    hll.resize(getSparseHllInitSize());
    initSparseHll(StringToHllPtr(hll));
  } else {
    res.it->second.GetString(&hll);
  }
  if (isValidHLL(StringToHllPtr(hll)) == HLL_INVALID) {
    return OpStatus::INVALID_VALUE;
  }

  int updated = 0;
  bool is_sparse = isValidHLL(StringToHllPtr(hll)) == HLL_VALID_SPARSE;
  sds hll_sds;
  if (is_sparse) {
    hll_sds = sdsnewlen(hll.data(), hll.size());
  }

  for (const auto& value : values) {
    int added;
    if (is_sparse) {
      // Inserting to sparse hll might extend it.
      // We can't use std::string with sds
      // `promoted` will be assigned 1 if sparse hll was promoted to dense
      int promoted = 0;
      added = pfadd_sparse(&hll_sds, (unsigned char*)value.data(), value.size(), &promoted);
      if (promoted == 1) {
        is_sparse = false;
        hll = string{hll_sds, sdslen(hll_sds)};
        sdsfree(hll_sds);
        DCHECK_EQ(isValidHLL(StringToHllPtr(hll)), HLL_VALID_DENSE);
      }
    } else {
      added = pfadd_dense(StringToHllPtr(hll), (unsigned char*)value.data(), value.size());
    }
    if (added < 0) {
      return OpStatus::INVALID_VALUE;
    }
    updated += added;
  }

  if (is_sparse) {
    hll = string{hll_sds, sdslen(hll_sds)};
    sdsfree(hll_sds);
  }
  res.it->second.SetString(hll);
  return std::min(updated, 1);
}

void PFAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return AddToHll(t->GetOpArgs(shard), key, args);
  };

  OpResult<int> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  HandleOpValueResult(res, cmd_cntx->rb());
}

OpResult<int64_t> CountHllsSingle(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();

  auto it = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STRING);
  if (it.ok()) {
    string hll;
    string_view hll_view = it.value()->second.GetSlice(&hll);

    switch (isValidHLL(StringToHllPtr(hll_view))) {
      case HLL_VALID_DENSE:
        break;
      case HLL_VALID_SPARSE:
        // Even in the case of a read - we still want to convert the hll to dense format, as it
        // could originate in Redis (like in replication or rdb load).
        hll = hll_view;
        if (!ConvertToDenseIfNeeded(&hll)) {
          return OpStatus::CORRUPTED_HLL;
        }
        hll_view = hll;
        break;
      case HLL_INVALID:
      default:
        return OpStatus::INVALID_VALUE;
    }

    return pfcountSingle(StringToHllPtr(hll_view));
  } else if (it.status() == OpStatus::WRONG_TYPE) {
    return it.status();
  } else {
    // Non existing keys count as 0.
    return 0;
  }
}

OpResult<vector<string>> ReadValues(const OpArgs& op_args, const ShardArgs& keys) {
  try {
    vector<string> values;
    for (string_view key : keys) {
      auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_STRING);
      if (it.ok()) {
        string hll;
        it.value()->second.GetString(&hll);
        if (!ConvertToDenseIfNeeded(&hll)) {
          return OpStatus::CORRUPTED_HLL;
        }
        values.push_back(std::move(hll));
      } else if (it.status() == OpStatus::WRONG_TYPE) {
        return OpStatus::WRONG_TYPE;
      }
    }
    return values;
  } catch (const std::bad_alloc&) {
    return OpStatus::OUT_OF_MEMORY;
  }
}

vector<HllBufferPtr> ConvertShardVector(const vector<vector<string>>& hlls) {
  vector<HllBufferPtr> ptrs;
  ptrs.reserve(hlls.size());
  for (auto& shard_hlls : hlls) {
    for (auto& hll : shard_hlls) {
      ptrs.push_back(StringToHllPtr(hll));
    }
  }
  return ptrs;
}

OpResult<int64_t> PFCountMulti(CmdArgList args, CommandContext* cmd_cntx) {
  vector<vector<string>> hlls;
  hlls.resize(shard_set->size());

  atomic<OpStatus> error_status{OpStatus::OK};
  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardId sid = shard->shard_id();
    ShardArgs shard_args = t->GetShardArgs(shard->shard_id());
    auto result = ReadValues(t->GetOpArgs(shard), shard_args);
    if (result.ok()) {
      hlls[sid] = std::move(result.value());
    } else {
      error_status.store(result.status(), memory_order_relaxed);
    }
    return OpStatus::OK;
  };

  OpStatus cb_status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (cb_status != OpStatus::OK) {
    return cb_status;
  }

  OpStatus stored_error = error_status.load(memory_order_relaxed);
  if (stored_error != OpStatus::OK) {
    return stored_error;
  }

  vector<HllBufferPtr> ptrs = ConvertShardVector(hlls);
  int64_t pf_count = pfcountMulti(ptrs.data(), ptrs.size());
  if (pf_count < 0) {
    return OpStatus::INVALID_VALUE;
  } else {
    return pf_count;
  }
}

void PFCount(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() == 1) {
    string_view key = ArgS(args, 0);
    auto cb = [&](Transaction* t, EngineShard* shard) {
      return CountHllsSingle(t->GetOpArgs(shard), key);
    };

    OpResult<int64_t> res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
    HandleOpValueResult(res, cmd_cntx->rb());
  } else {
    HandleOpValueResult(PFCountMulti(args, cmd_cntx), cmd_cntx->rb());
  }
}

OpResult<int> PFMergeInternal(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) {
  vector<vector<string>> hlls;
  hlls.resize(shard_set->size());

  atomic<OpStatus> error_status{OpStatus::OK};
  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardId sid = shard->shard_id();
    ShardArgs shard_args = t->GetShardArgs(shard->shard_id());
    auto result = ReadValues(t->GetOpArgs(shard), shard_args);
    if (result.ok()) {
      hlls[sid] = std::move(result.value());
    } else {
      error_status.store(result.status(), memory_order_relaxed);
    }
    return OpStatus::OK;
  };

  tx->Execute(std::move(cb), false);

  OpStatus stored_error = error_status.load(memory_order_relaxed);
  if (stored_error != OpStatus::OK) {
    tx->Conclude();
    return stored_error;
  }

  vector<HllBufferPtr> ptrs = ConvertShardVector(hlls);

  string hll;
  hll.resize(getDenseHllSize());
  createDenseHll(StringToHllPtr(hll));
  int result = pfmerge(ptrs.data(), ptrs.size(), StringToHllPtr(hll));

  auto set_cb = [&](Transaction* t, EngineShard* shard) {
    string_view key = ArgS(args, 0);
    const OpArgs& op_args = t->GetOpArgs(shard);
    auto& db_slice = op_args.GetDbSlice();
    auto op_res = db_slice.AddOrFind(t->GetDbContext(), key, OBJ_STRING);
    RETURN_ON_BAD_STATUS(op_res);
    auto& res = *op_res;
    res.it->second.SetString(hll);

    if (op_args.shard->journal()) {
      RecordJournal(op_args, "SET", ArgSlice{key, hll});
    }

    return OpStatus::OK;
  };
  tx->Execute(std::move(set_cb), true);

  return result;
}

void PFMerge(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  OpResult<int> result = PFMergeInternal(args, cmd_cntx->tx(), rb);
  if (result.ok()) {
    if (result.value() == 0) {
      rb->SendOk();
    } else {
      rb->SendError(kInvalidHllError);
    }
  } else {
    HandleOpValueResult(result, rb);
  }
}

}  // namespace

void RegisterHllFamily(CommandRegistry* registry) {
  using CI = CommandId;
  registry->StartFamily(acl::HYPERLOGLOG);
  *registry << CI{"PFADD", CO::FAST | CO::JOURNALED, -3, 1, 1}.SetHandler(PFAdd)
            << CI{"PFCOUNT", CO::READONLY, -2, 1, -1}.SetHandler(PFCount)
            << CI{"PFMERGE", CO::JOURNALED | CO::NO_AUTOJOURNAL, -2, 1, -1}.SetHandler(PFMerge);
}

}  // namespace dfly


================================================
FILE: src/server/hll_family_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/error.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;
using namespace facade;

namespace dfly {

class HllFamilyTest : public BaseFamilyTest {
 protected:
  std::string GenerateUniqueValue(int index) {
    return "Value_{" + std::to_string(index) + "}";
  }
};

TEST_F(HllFamilyTest, Simple) {
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 1);
}

TEST_F(HllFamilyTest, Promote) {
  int unique_values = 20000;
  // Sparse hll is promoted to dense at the 1660th+- insertion
  // This value varies if any parameter in hyperloglog.c changes.
  int promote_i = 1660;
  // Keep consistent with hyperloglog.c
  int kHllSparseMaxBytes = 3000;
  int kHllDenseSize = 12304;
  for (int i = 0; i < unique_values; ++i) {
    std::string newkey = GenerateUniqueValue(i);
    Run({"pfadd", "key", newkey});
    if (i < promote_i) {
      EXPECT_LT(CheckedInt({"strlen", "key"}), kHllSparseMaxBytes + 1);
    } else {
      EXPECT_EQ(CheckedInt({"strlen", "key"}), kHllDenseSize);
    }
  }
  // HyperLogLog computations come with a
  // margin of error, with a standard error rate of 0.81%.
  // Set it to 5% so this test won't fail unless something went wrong badly.
  EXPECT_LT(std::abs(CheckedInt({"pfcount", "key"}) - unique_values * 1.0) / unique_values, 0.05);
}

TEST_F(HllFamilyTest, MultipleValues) {
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 3);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1", "2", "3"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 3);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 3);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "2"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 3);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "3"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 3);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "3", "4"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 4);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "5"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 5);
  EXPECT_EQ(CheckedInt({"pfadd", "key", "1", "2", "3", "4", "5"}), 0);
  EXPECT_EQ(CheckedInt({"pfcount", "key"}), 5);
}

TEST_F(HllFamilyTest, MultipleValues_random) {
  int insertions = 20000;
  int unique_values = 0;
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> dis(1, 20);
  // cumulated pfadd result
  for (int i = 0; i < insertions; ++i) {
    // Number of values to insert
    int num_values = dis(gen);
    unique_values += num_values;

    // Prepare the command
    std::vector<std::string> values;
    values.reserve(num_values + 2);
    values.push_back("pfadd");
    values.push_back("key");

    // Generate and add unique values to the command
    for (int j = 0; j < num_values; ++j) {
      values.push_back(GenerateUniqueValue(i * 20 + j));
    }

    std::vector<std::string_view> commandViews;
    for (const auto& val : values) {
      commandViews.push_back(val);
    }
    Run(commandViews);
  }
  // HyperLogLog computations come with a
  // margin of error, with a standard error rate of 0.81%.
  // Set it to 5% so this test won't fail unless something went wrong badly.
  EXPECT_LT(std::abs(CheckedInt({"pfcount", "key"}) - unique_values * 1.0) / unique_values, 0.05);
}

TEST_F(HllFamilyTest, AddInvalid) {
  EXPECT_EQ(Run({"set", "key", "..."}), "OK");
  EXPECT_THAT(Run({"pfadd", "key", "1"}), ErrArg(kInvalidHllError));
  EXPECT_THAT(Run({"pfcount", "key"}), ErrArg(kInvalidHllError));
}

TEST_F(HllFamilyTest, OtherType) {
  Run({"zadd", "key", "1", "a"});
  EXPECT_THAT(Run({"pfadd", "key", "1"}),
              ErrArg("Operation against a key holding the wrong kind of value"));
  EXPECT_THAT(Run({"pfcount", "key"}),
              ErrArg("Operation against a key holding the wrong kind of value"));
}

TEST_F(HllFamilyTest, CountEmpty) {
  EXPECT_EQ(CheckedInt({"pfcount", "nonexisting"}), 0);
}

TEST_F(HllFamilyTest, CountInvalid) {
  EXPECT_EQ(Run({"set", "key", "..."}), "OK");
  EXPECT_THAT(Run({"pfcount", "key"}), ErrArg(kInvalidHllError));
}

TEST_F(HllFamilyTest, CountMultiple) {
  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key1"}), 3);

  EXPECT_EQ(CheckedInt({"pfadd", "key2", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key2"}), 3);

  EXPECT_EQ(CheckedInt({"pfadd", "key3", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 2);

  EXPECT_EQ(CheckedInt({"pfadd", "key4", "4", "5"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", "key4"}), 2);

  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key4"}), 5);

  EXPECT_EQ(CheckedInt({"pfcount", "non-existing-key1", "non-existing-key2"}), 0);

  EXPECT_EQ(CheckedInt({"pfcount", "key1", "non-existing-key"}), 3);

  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key2"}), 3);
  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key3"}), 3);
  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key2", "key3"}), 3);
  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key2", "key3", "key4"}), 5);
  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key2", "key3", "key4", "non-existing"}), 5);
  EXPECT_EQ(CheckedInt({"pfcount", "key1", "key4"}), 5);
}

TEST_F(HllFamilyTest, CountMultipleWithWrongType) {
  EXPECT_EQ(Run({"set", "key1", "value1"}), "OK");
  EXPECT_EQ(CheckedInt({"pfadd", "key", "value"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "list1 element1", "data"}), 1);

  EXPECT_THAT(Run({"pfcount", "key1", "key", "list1 element1"}),
              ErrArg("INVALIDOBJ Corrupted HLL object detected."));
}

TEST_F(HllFamilyTest, MergeToNew) {
  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key2", "4", "5"}), 1);
  EXPECT_EQ(Run({"pfmerge", "key3", "key1", "key2"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 5);
}

TEST_F(HllFamilyTest, MergeToExisting) {
  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key2", "4", "5"}), 1);
  EXPECT_EQ(Run({"pfmerge", "key3", "key2", "key1"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 5);
  EXPECT_EQ(Run({"pfmerge", "key3", "key3"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 5);
  EXPECT_EQ(Run({"pfmerge", "key3"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 5);
  EXPECT_EQ(CheckedInt({"pfadd", "key4", "4", "5", "6"}), 1);
  EXPECT_EQ(Run({"pfmerge", "key3", "key4"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 6);
}

TEST_F(HllFamilyTest, MergeNonExisting) {
  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(Run({"pfmerge", "key3", "key1", "key2"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 3);
}

TEST_F(HllFamilyTest, MergeOverlapping) {
  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key2", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key3", "1", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key4", "2", "3"}), 1);
  EXPECT_EQ(CheckedInt({"pfadd", "key5", "3"}), 1);
  EXPECT_EQ(Run({"pfmerge", "key6", "key1", "key2", "key3", "key4", "key5"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key6"}), 3);
}

TEST_F(HllFamilyTest, MergeInvalid) {
  Run({"exists", "key1", "key4"});
  ASSERT_EQ(GetDebugInfo().shards_count, 2);  // ensure 2 shards

  EXPECT_EQ(CheckedInt({"pfadd", "key1", "1", "2", "3"}), 1);
  EXPECT_EQ(Run({"set", "key4", "..."}), "OK");
  EXPECT_THAT(Run({"pfmerge", "key1", "key4"}),
              ErrArg("INVALIDOBJ Corrupted HLL object detected."));
  EXPECT_EQ(CheckedInt({"pfcount", "key1"}), 3);
}

TEST_F(HllFamilyTest, MergeWithInvalidHllFormat) {
  EXPECT_EQ(CheckedInt({"pfadd", "complex@key \"weird!field\" \"value\\nwith\\tescape sequences\"",
                        "some_element"}),
            1);
  EXPECT_EQ(CheckedInt({"append", "complex@key \"weird!field\" \"value\\nwith\\tescape sequences\"",
                        "corrupt_data"}),
            33);
  EXPECT_EQ(CheckedInt({"pfadd", "\"key with \\\"quotes\\\"\" \"value with \\\\backslashes\\\\\"",
                        "element1"}),
            1);
  EXPECT_THAT(Run({"pfmerge", "result_key",
                   "complex@key \"weird!field\" \"value\\nwith\\tescape sequences\"",
                   "\"key with \\\"quotes\\\"\" \"value with \\\\backslashes\\\\\""}),
              ErrArg("INVALIDOBJ Corrupted HLL object detected."));
}

}  // namespace dfly


================================================
FILE: src/server/hset_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/hset_family.h"

#include <absl/strings/ascii.h>

extern "C" {
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/util.h"
#include "redis/zmalloc.h"
}

#include "base/logging.h"
#include "core/detail/listpack_wrap.h"
#include "core/overloaded.h"
#include "core/string_map.h"
#include "facade/cmd_arg_parser.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/tiered_storage.h"
#include "server/tiering/decoders.h"
#include "server/tiering/serialized_map.h"
#include "server/transaction.h"

using namespace std;

namespace dfly {

using namespace facade;
using absl::SimpleAtoi;

namespace {

using IncrByParam = std::variant<double, int64_t>;
using OptStr = std::optional<std::string>;
enum GetAllMode : uint8_t { FIELDS = 1, VALUES = 2 };

bool IsGoodForListpack(CmdArgList args, const uint8_t* lp) {
  size_t sum = 0;
  for (auto s : args) {
    if (s.size() > server.max_map_field_len)
      return false;
    sum += s.size();
  }

  return lpBytes(const_cast<uint8_t*>(lp)) + sum < server.max_listpack_map_bytes;
}

using container_utils::GetStringMap;

// Generic wrapper for multiple underlying map <string, string> types
// holding a variant of:
// 1. Listpack
// 2. StringMap
// 3. SerializedMap (tiered)
struct HMapWrap {
 private:
  template <typename F> decltype(auto) VisitRef(F f) const {  // Cast T* to T&
    return std::visit(Overloaded{[&f](auto* s) { return f(*s); }, f}, impl_);
  }

  template <typename F> decltype(auto) VisitMut(F& f) {
    auto serialized_bust = [&](tiering::SerializedMap* s) {
      ABSL_UNREACHABLE();                          // Serialized maps should never be mutable
      return f(static_cast<StringMap*>(nullptr));  // purely for same return type
    };
    return std::visit(Overloaded{f, serialized_bust}, impl_);
  }

 public:
  HMapWrap(const PrimeValue& pv, DbContext db_cntx) {
    DCHECK(!pv.IsExternal() || pv.IsCool());
    if (pv.Encoding() == kEncodingListPack)
      impl_ = detail::ListpackWrap{static_cast<uint8_t*>(pv.RObjPtr())};
    else
      impl_ = GetStringMap(pv, db_cntx);
  }

  explicit HMapWrap(tiering::SerializedMap* sm) : impl_{sm} {
  }

  size_t Length() const {
    Overloaded ov{
        [](StringMap* s) { return s->UpperBoundSize(); },
        [](const detail::ListpackWrap& lw) { return lw.size(); },
        [](tiering::SerializedMap* s) { return s->size(); },
    };
    return visit(ov, impl_);
  }

  auto Find(std::string_view key) const {
    using RT = optional<pair<string_view, string_view>>;
    return VisitRef([key](auto& h) -> RT {
      if (auto it = h.Find(key); it != h.end())
        return *it;
      return std::nullopt;
    });
  }

  auto Range() const {
    auto f = [](auto p) -> pair<string_view, string_view> { return p; };  // implicit conversion
    using IT = base::it::CompoundIterator<decltype(f), detail::ListpackWrap::Iterator,
                                          StringMap::iterator, tiering::SerializedMap::Iterator>;
    auto cb = [f](auto& h) -> std::pair<IT, IT> {
      return {{f, h.begin()}, {std::nullopt, h.end()}};
    };
    return base::it::Range(VisitRef(cb));
  }

  bool Erase(std::string_view key) {
    Overloaded ov{[key](StringMap* s) { return s->Erase(key); },
                  [key](detail::ListpackWrap& lw) { return lw.Delete(key); }};
    return VisitMut(ov);
  }

  void AddOrUpdate(std::string_view key, std::string_view value) {
    Overloaded ov{[&](StringMap* sm) { sm->AddOrUpdate(key, value, UINT32_MAX, true); },
                  [&](detail::ListpackWrap& lw) { lw.Insert(key, value, false); }};
    VisitMut(ov);
  }

  void Launder(PrimeValue& pv) {
    Overloaded ov{
        [](StringMap* s) {},
        [&](detail::ListpackWrap& lw) { pv.SetRObjPtr(lw.GetPointer()); },
    };
    VisitMut(ov);
  }

  template <typename T> optional<T> Get() const {
    if (holds_alternative<T>(impl_))
      return get<T>(impl_);
    return nullopt;
  }

 private:
  variant<StringMap*, tiering::SerializedMap*, detail::ListpackWrap> impl_;
};  // namespace dfly

// Delete if length is zero
void DeleteHw(HMapWrap& hw, const OpArgs& op_args, std::string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  if (auto del_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_HASH); del_it) {
    del_it->post_updater.Run();
    db_slice.Del(op_args.db_cntx, del_it->it);
    if (op_args.shard->journal()) {
      RecordJournal(op_args, "DEL"sv, {key});
    }
  }
}

auto KeyAndArgs(Transaction* t, EngineShard* es) {
  return std::make_pair(t->GetShardArgs(es->shard_id()).Front(), t->GetOpArgs(es));
}

// A wrappable callback returns a OpResult<T> or the future version of it for tiered values.
// Because the top-level value needs to be an OpResult, the variant is wrapped as an OpResult again.
// However, we can take the "result" out of the bare value and keep it only on the top-level.
template <typename T> using CbVariant = std::variant<T, ::util::fb2::Future<OpResult<T>>>;

// Unwrap possibly future result to a regular one
template <typename T> OpResult<T> Unwrap(OpResult<CbVariant<T>> result) {
  if (!result.ok())
    return result.status();

  Overloaded ov{
      [](T res) -> OpResult<T> { return res; },
      [](util::fb2::Future<OpResult<T>> fut) -> OpResult<T> { return fut.Get(); },
  };
  return visit(ov, std::move(result).value());
}

// Execute callback on generic HMapWrap, possibly on offloaded value and waiting for result
template <typename F, typename T = typename std::invoke_result_t<F, HMapWrap>::Type>
OpResult<T> ExecuteRO(Transaction* tx, F&& f) {
  auto shard_cb = [f = std::forward<F>(f)](Transaction* t,
                                           EngineShard* es) -> OpResult<CbVariant<T>> {
    // Fetch value of hash type
    auto [key, op_args] = KeyAndArgs(t, es);
    auto it_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_HASH);
    RETURN_ON_BAD_STATUS(it_res);
    auto& pv = (*it_res)->second;

    // Enqueue read for future values
    if (pv.IsExternal() && !pv.IsCool()) {
      using D = tiering::SerializedMapDecoder;
      util::fb2::Future<OpResult<T>> fut;
      auto read_cb = [fut, f = std::move(f)](io::Result<D*> res) mutable {
        HMapWrap hw{res.value()->Get()};
        fut.Resolve(f(hw));
      };

      es->tiered_storage()->Read(op_args.db_cntx.db_index, key, pv.GetExternalSlice(), D{},
                                 std::move(read_cb));
      return CbVariant<T>{std::move(fut)};
    }

    HMapWrap hw{pv, op_args.db_cntx};
    auto res = f(hw);

    if (hw.Length() == 0)  // Expirations might have emptied it
      DeleteHw(hw, op_args, key);

    // Move result into variant or keep error status
    RETURN_ON_BAD_STATUS(res);
    return CbVariant<T>{std::move(res).value()};
  };

  return Unwrap(tx->ScheduleSingleHopT(std::move(shard_cb)));
}

// Wrap write handler
template <typename F> auto WrapW(F&& f) {
  using RT = std::invoke_result_t<F, HMapWrap&>;
  return [f = std::forward<F>(f)](Transaction* t, EngineShard* es) -> RT {
    auto [key, op_args] = KeyAndArgs(t, es);

    auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_HASH);
    RETURN_ON_BAD_STATUS(it_res);
    auto& pv = it_res->it->second;

    // Remove document before modification
    op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, pv);

    HMapWrap hw{pv, op_args.db_cntx};
    auto res = f(hw);
    hw.Launder(pv);

    // Run post updater
    it_res->post_updater.Run();

    if (hw.Length() == 0)
      DeleteHw(hw, op_args, key);
    else
      op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, &pv);

    return res;
  };
}

size_t EstimateListpackMinBytes(CmdArgList members) {
  size_t bytes = 0;
  for (const auto& member : members) {
    bytes += (member.size() + 1);  // string + at least 1 byte for string header.
  }
  return bytes;
}

OpStatus IncrementValue(optional<string_view> prev_val, IncrByParam* param) {
  if (holds_alternative<double>(*param)) {
    double incr = get<double>(*param);
    double value = 0;

    if (prev_val) {
      if (!ParseDouble(*prev_val, &value)) {
        return OpStatus::INVALID_VALUE;
      }
    }
    value += incr;
    if (isnan(value) || isinf(value)) {
      return OpStatus::NAN_OR_INF_DURING_INCR;
    }

    param->emplace<double>(value);

    return OpStatus::OK;
  }

  // integer increment
  long long old_val = 0;
  if (prev_val) {
    if (!string2ll(prev_val->data(), prev_val->size(), &old_val)) {
      return OpStatus::INVALID_VALUE;
    }
  }

  int64_t incr = get<int64_t>(*param);
  if ((incr < 0 && old_val < 0 && incr < (LLONG_MIN - old_val)) ||
      (incr > 0 && old_val > 0 && incr > (LLONG_MAX - old_val))) {
    return OpStatus::OUT_OF_RANGE;
  }

  int64_t new_val = old_val + incr;
  param->emplace<int64_t>(new_val);

  return OpStatus::OK;
}

OpStatus OpIncrBy(const OpArgs& op_args, string_view key, string_view field, IncrByParam* param) {
  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_HASH);
  RETURN_ON_BAD_STATUS(op_res);

  auto& add_res = *op_res;
  PrimeValue& pv = add_res.it->second;
  if (add_res.is_new) {
    pv.InitRobj(OBJ_HASH, kEncodingListPack, lpNew(0));
  } else {
    op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, add_res.it->second);

    if (pv.Encoding() == kEncodingListPack) {
      uint8_t* lp = (uint8_t*)pv.RObjPtr();
      size_t lpb = lpBytes(lp);

      if (lpb >= server.max_listpack_map_bytes) {
        StringMap* sm = HSetFamily::ConvertToStrMap(lp);
        pv.InitRobj(OBJ_HASH, kEncodingStrMap2, sm);
      }
    }
  }

  HMapWrap hw{pv, op_args.db_cntx};
  optional<string_view> res;
  if (!add_res.is_new) {
    if (auto it = hw.Find(field); it)
      res = it->second;
  }

  if (OpStatus status = IncrementValue(res, param); status != OpStatus::OK)
    return status;

  if (holds_alternative<double>(*param)) {
    double new_val = get<double>(*param);
    char buf[128];
    char* str = RedisReplyBuilder::FormatDouble(new_val, buf, sizeof(buf));
    hw.AddOrUpdate(field, str);
  } else {  // integer increment
    int64_t new_val = get<int64_t>(*param);
    absl::AlphaNum an(new_val);
    hw.AddOrUpdate(field, an.Piece());
  }

  hw.Launder(pv);
  op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, &pv);

  return OpStatus::OK;
}

OpResult<StringVec> OpScan(const HMapWrap& hw, uint64_t* cursor, const ScanOpts& scan_op) {
  /* We set the max number of iterations to ten times the specified
   * COUNT, so if the hash table is in a pathological state (very
   * sparsely populated) we avoid to block too much time at the cost
   * of returning no or very few elements. (taken from redis code at db.c line 904 */
  constexpr size_t INTERATION_FACTOR = 10;

  StringVec res;
  // If NOVALUES, we expect 1 element per match (key). Otherwise, 2 elements (key + value).
  uint32_t count = scan_op.limit * (scan_op.novalues ? 1 : 2);

  if (auto lw = hw.Get<detail::ListpackWrap>(); lw) {
    // TODO: Optimize unnecessary value reads from iterator
    for (const auto [key, value] : *lw) {
      if (scan_op.Matches(key)) {
        res.emplace_back(key);
        if (!scan_op.novalues) {
          res.emplace_back(value);
        }
      }
    }
    *cursor = 0;
  } else {
    StringMap* sm = *hw.Get<StringMap*>();

    long max_iterations = count * INTERATION_FACTOR;

    // note about this lambda - don't capture here! it should be convertible to C function!
    auto scanCb = [&](const void* obj) {
      sds val = (sds)obj;
      size_t len = sdslen(val);
      if (scan_op.Matches(string_view(val, len))) {
        res.emplace_back(val, len);
        if (!scan_op.novalues) {
          val = StringMap::GetValue(val);
          res.emplace_back(val, sdslen(val));
        }
      }
    };

    do {
      *cursor = sm->Scan(*cursor, scanCb);
    } while (*cursor && max_iterations-- && res.size() < count);
  }

  return res;
}

OpResult<vector<OptStr>> OpHMGet(const HMapWrap& hw, CmdArgList fields) {
  DCHECK(!fields.empty());

  std::vector<OptStr> result(fields.size());
  if (auto lw = hw.Get<detail::ListpackWrap>(); lw) {
    absl::flat_hash_map<string_view, absl::InlinedVector<size_t, 3>> reverse;
    reverse.reserve(fields.size() + 1);
    for (size_t i = 0; i < fields.size(); ++i) {
      reverse[ArgS(fields, i)].push_back(i);  // map fields to their index.
    }

    for (const auto [key, value] : *lw) {
      if (auto it = reverse.find(key); it != reverse.end()) {
        for (size_t index : it->second) {
          DCHECK_LT(index, result.size());
          result[index].emplace(value);
        }
      }
    }
  } else {
    StringMap* sm = *hw.Get<StringMap*>();
    for (size_t i = 0; i < fields.size(); ++i) {
      if (auto it = sm->Find(fields[i]); it != sm->end()) {
        result[i].emplace(it->second, sdslen(it->second));
      }
    }
  }

  return result;
}

struct OpSetParams {
  bool skip_if_exists = false;
  uint32_t ttl = UINT32_MAX;
  bool keepttl = false;
};

OpResult<uint32_t> OpSet(const OpArgs& op_args, string_view key, CmdArgList values,
                         const OpSetParams& op_sp = OpSetParams{}) {
  DCHECK(!values.empty() && 0 == values.size() % 2);
  VLOG(2) << "OpSet(" << key << ")";

  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_HASH);
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  uint8_t* lp = nullptr;
  auto& it = add_res.it;
  PrimeValue& pv = it->second;

  if (add_res.is_new) {
    if (op_sp.ttl == UINT32_MAX) {
      lp = lpNew(0);
      pv.InitRobj(OBJ_HASH, kEncodingListPack, lp);
    } else {
      pv.InitRobj(OBJ_HASH, kEncodingStrMap2, CompactObj::AllocateMR<StringMap>());
    }
  } else {
    op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, it->second);
  }

  if (pv.Encoding() == kEncodingListPack) {
    lp = (uint8_t*)pv.RObjPtr();

    if (op_sp.ttl != UINT32_MAX || !IsGoodForListpack(values, lp)) {
      StringMap* sm = HSetFamily::ConvertToStrMap(lp);
      pv.InitRobj(OBJ_HASH, kEncodingStrMap2, sm);
      lp = nullptr;
    }
  }

  unsigned created = 0;

  if (lp) {
    size_t malloc_reserved = zmalloc_size(lp);
    size_t min_sz = EstimateListpackMinBytes(values);
    if (min_sz > malloc_reserved) {
      lp = (uint8_t*)zrealloc(lp, min_sz);
    }
    detail::ListpackWrap lw{lp};
    for (size_t i = 0; i < values.size(); i += 2) {
      created += lw.Insert(values[i], values[i + 1], op_sp.skip_if_exists);
    }
    pv.SetRObjPtr(lw.GetPointer());
  } else {
    DCHECK_EQ(kEncodingStrMap2, pv.Encoding());  // Dictionary
    StringMap* sm = GetStringMap(pv, op_args.db_cntx);
    sm->Reserve(values.size() / 2);
    bool added;

    for (size_t i = 0; i < values.size(); i += 2) {
      string_view field = values[i];
      string_view value = values[i + 1];
      if (op_sp.skip_if_exists)
        added = sm->AddOrSkip(field, value, op_sp.ttl);
      else
        added = sm->AddOrUpdate(field, value, op_sp.ttl, op_sp.keepttl);

      created += unsigned(added);
    }
  }

  op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, &pv);

  if (auto* ts = op_args.shard->tiered_storage(); ts) {
    StashPrimeValue(op_args.db_cntx.db_index, key, &pv, ts, nullptr);
  }

  return created;
}

void HGetGeneric(CmdArgList args, uint8_t getall_mask, CommandContext* cmd_cntx) {
  auto cb = [getall_mask](const HMapWrap& hw) -> OpResult<vector<string>> {
    vector<string> res;
    bool keyval = (getall_mask == (FIELDS | VALUES));
    res.reserve(hw.Length() * (keyval ? 2 : 1));

    for (const auto& [key, value] : hw.Range()) {
      if (getall_mask & FIELDS)
        res.emplace_back(key);
      if (getall_mask & VALUES)
        res.emplace_back(value);
    }

    return res;
  };

  OpResult<vector<string>> result = ExecuteRO(cmd_cntx->tx(), cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  switch (result.status()) {
    case OpStatus::OK:
    case OpStatus::KEY_NOTFOUND: {
      bool is_map = (getall_mask == (VALUES | FIELDS));
      return rb->SendBulkStrArr(*result, is_map ? CollectionType::MAP : CollectionType::ARRAY);
    }
    default:
      return cmd_cntx->SendError(result.status());
  };
}

OpResult<vector<long>> OpHExpire(const OpArgs& op_args, string_view key, uint32_t ttl_sec,
                                 ExpireFlags flags, CmdArgList values) {
  auto& db_slice = op_args.GetDbSlice();
  auto op_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_HASH);
  RETURN_ON_BAD_STATUS(op_res);

  PrimeValue* pv = &((*op_res).it->second);
  auto res = HSetFamily::SetFieldsExpireTime(op_args, ttl_sec, flags, key, values, pv);

  // If it is a hash which became empty after expiring fields, we must delete the key safely.
  // We use DelMutable which consumes the iterator/updater to prevent the crash.
  if (pv->Encoding() == kEncodingStrMap2) {
    auto* sm = static_cast<StringMap*>(pv->RObjPtr());
    if (sm->UpperBoundSize() == 0) {
      db_slice.DelMutable(op_args.db_cntx, std::move(*op_res));
    }
  }

  return res;
}

// HSETEX key [NX] [KEEPTTL] tll_sec field value field value ...
void HSetEx(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};

  string_view key = parser.Next();
  OpSetParams op_sp;

  const auto option_already_set = [&cmd_cntx] {
    return cmd_cntx->SendError(WrongNumArgsError(cmd_cntx->cid()->name()), kSyntaxErrType);
  };

  while (true) {
    if (parser.Check("NX")) {
      if (op_sp.skip_if_exists) {
        return option_already_set();
      }
      op_sp.skip_if_exists = true;
    } else if (parser.Check("KEEPTTL")) {
      if (op_sp.keepttl) {
        return option_already_set();
      }
      op_sp.keepttl = true;
    } else {
      break;
    }
  }

  op_sp.ttl = parser.Next<uint32_t>();
  auto* rb = cmd_cntx->rb();
  if (parser.HasError()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }

  constexpr uint32_t kMaxTtl = (1UL << 26);
  if (op_sp.ttl == 0 || op_sp.ttl > kMaxTtl) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  CmdArgList fields = parser.Tail();

  if (fields.size() % 2 != 0) {
    return cmd_cntx->SendError(facade::WrongNumArgsError(cmd_cntx->cid()->name()), kSyntaxErrType);
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpSet(t->GetOpArgs(shard), key, fields, op_sp);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    rb->SendLong(*result);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

struct HSetReplies {
  void Send(OpResult<uint32_t> result) const {
    switch (result.status()) {
      case OpStatus::OK:
      case OpStatus::KEY_NOTFOUND:
        return cmd_cntx->SendLong(result.value_or(0));
      default:
        return cmd_cntx->SendError(result.status());
    };
  }

  CommandContext* cmd_cntx;
};

void CmdHDel(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [&](HMapWrap& hw) -> OpResult<uint32_t> {
    unsigned deleted = 0;
    for (string_view s : args.subspan(1))
      deleted += hw.Erase(s);
    return deleted;
  };
  HSetReplies{cmd_cntx}.Send(cmd_cntx->tx()->ScheduleSingleHopT(WrapW(cb)));
}

void CmdHExpire(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  using MinMaxTtl = FInt<0, (1 << 26)>;
  auto [key, ttl_sec] = parser.Next<string_view, MinMaxTtl>();

  ExpireFlags flags = parser
                          .TryMapNext("NX", ExpireFlags::EXPIRE_NX, "XX", ExpireFlags::EXPIRE_XX,
                                      "GT", ExpireFlags::EXPIRE_GT, "LT", ExpireFlags::EXPIRE_LT)
                          .value_or(ExpireFlags::EXPIRE_ALWAYS);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (parser.HasError()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }
  if (!parser.Check("FIELDS"sv)) {
    return cmd_cntx->SendError("Mandatory argument FIELDS is missing or not at the right position",
                               kSyntaxErrType);
  }

  uint32_t numFields = parser.Next<uint32_t>();

  CmdArgList fields = parser.Tail();
  if (fields.size() != numFields) {
    return rb->SendError("The `numfields` parameter must match the number of arguments",
                         kSyntaxErrType);
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpHExpire(t->GetOpArgs(shard), key, ttl_sec, flags, fields);
  };
  OpResult<vector<long>> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  switch (result.status()) {
    case OpStatus::OK:
      return rb->SendLongArr(absl::MakeConstSpan(result.value()));
    case OpStatus::KEY_NOTFOUND:
      return rb->SendLongArr(absl::MakeConstSpan(vector<long>(numFields, -2)));
    default:
      return cmd_cntx->SendError(result.status());
  };
}

OpResult<vector<long>> OpHTtl(Transaction* t, EngineShard* shard, string_view key,
                              CmdArgList fields) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  const DbContext& db_cntx = t->GetDbContext();
  auto it_res = db_slice.FindReadOnly(db_cntx, key, OBJ_HASH);
  RETURN_ON_BAD_STATUS(it_res);

  const PrimeValue& pv = (*it_res)->second;
  vector<long> res;
  res.reserve(fields.size());

  for (auto field : fields) {
    int32_t exp_time = HSetFamily::FieldExpireTime(db_cntx, pv, field);
    if (exp_time <= 0) {
      // -3 from FieldExpireTime means field not found -> HTTL returns -2
      // -1 means no expiry -> stays -1
      res.push_back(exp_time == -3 ? -2 : exp_time);
    } else {
      res.push_back(int32_t(exp_time - MemberTimeSeconds(db_cntx.time_now_ms)));
    }
  }

  return res;
}

void CmdHTtl(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (parser.HasError()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }
  if (!parser.Check("FIELDS"sv)) {
    return cmd_cntx->SendError("Mandatory argument FIELDS is missing or not at the right position",
                               kSyntaxErrType);
  }

  uint32_t numFields = parser.Next<uint32_t>();

  CmdArgList fields = parser.Tail();
  if (fields.size() != numFields) {
    return rb->SendError("The `numfields` parameter must match the number of arguments",
                         kSyntaxErrType);
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&](Transaction* t, EngineShard* shard) { return OpHTtl(t, shard, key, fields); };
  OpResult<vector<long>> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  switch (result.status()) {
    case OpStatus::OK:
      return rb->SendLongArr(absl::MakeConstSpan(result.value()));
    case OpStatus::KEY_NOTFOUND:
      return rb->SendLongArr(absl::MakeConstSpan(vector<long>(numFields, -2)));
    default:
      return cmd_cntx->SendError(result.status());
  };
}

void CmdHGet(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [field = args[1]](const HMapWrap& hw) -> OpResult<string> {
    if (auto it = hw.Find(field); it)
      return string{it->second};
    return OpStatus::KEY_NOTFOUND;
  };

  OpResult<string> result = ExecuteRO(cmd_cntx->tx(), cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  switch (result.status()) {
    case OpStatus::OK:
      return rb->SendBulkString(*result);
    case OpStatus::KEY_NOTFOUND:
      return rb->SendNull();
    default:
      return cmd_cntx->SendError(result.status());
  };
}

void CmdHMGet(CmdArgList args, CommandContext* cmd_cntx) {
  auto fields = args.subspan(1);
  auto cb = [fields](const HMapWrap& hw) { return OpHMGet(hw, fields); };

  OpResult<vector<OptStr>> result = ExecuteRO(cmd_cntx->tx(), cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  switch (result.status()) {
    case OpStatus::OK:
    case OpStatus::KEY_NOTFOUND: {
      RedisReplyBuilder::ArrayScope scope{rb, fields.size()};
      for (size_t i = 0; i < fields.size(); i++) {
        if (result.ok() && (*result)[i].has_value())
          rb->SendBulkString(*(*result)[i]);
        else
          rb->SendNull();
      }
    } break;
    default:
      cmd_cntx->SendError(result.status());
  };
}

void CmdHStrLen(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [field = ArgS(args, 1)](const HMapWrap& hw) -> OpResult<uint32_t> {
    if (auto it = hw.Find(field); it)
      return it->second.length();
    return OpStatus::KEY_NOTFOUND;
  };
  HSetReplies{cmd_cntx}.Send(ExecuteRO(cmd_cntx->tx(), cb));
}

void CmdHLen(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [](const HMapWrap& hw) -> OpResult<uint32_t> { return hw.Length(); };
  HSetReplies{cmd_cntx}.Send(ExecuteRO(cmd_cntx->tx(), cb));
}

void CmdHExists(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [field = args[1]](const HMapWrap& hw) -> OpResult<uint32_t> {
    return hw.Find(field) ? 1 : 0;
  };
  HSetReplies{cmd_cntx}.Send(ExecuteRO(cmd_cntx->tx(), cb));
}

void CmdHIncrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view field = ArgS(args, 1);
  string_view incrs = ArgS(args, 2);
  int64_t ival = 0;

  if (!absl::SimpleAtoi(incrs, &ival)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  IncrByParam param{ival};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpIncrBy(t->GetOpArgs(shard), key, field, &param);
  };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  if (status == OpStatus::OK) {
    cmd_cntx->SendLong(get<int64_t>(param));
  } else {
    switch (status) {
      case OpStatus::INVALID_VALUE:
        cmd_cntx->SendError("hash value is not an integer");
        break;
      case OpStatus::OUT_OF_RANGE:
        cmd_cntx->SendError(kIncrOverflow);
        break;
      default:
        cmd_cntx->SendError(status);
        break;
    }
  }
}

void CmdHIncrByFloat(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view field = ArgS(args, 1);
  string_view incrs = ArgS(args, 2);
  double dval = 0;

  if (!absl::SimpleAtod(incrs, &dval)) {
    return cmd_cntx->SendError(kInvalidFloatErr);
  }

  if (isnan(dval) || isinf(dval)) {
    return cmd_cntx->SendError(kNanOrInfDuringIncr);
  }

  IncrByParam param{dval};

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpIncrBy(t->GetOpArgs(shard), key, field, &param);
  };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  if (status == OpStatus::OK) {
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    rb->SendDouble(get<double>(param));
  } else {
    switch (status) {
      case OpStatus::INVALID_VALUE:
        cmd_cntx->SendError("hash value is not a float");
        break;
      default:
        cmd_cntx->SendError(status);
        break;
    }
  }
}

void CmdHKeys(CmdArgList args, CommandContext* cmd_cntx) {
  HGetGeneric(args, FIELDS, cmd_cntx);
}

void CmdHVals(CmdArgList args, CommandContext* cmd_cntx) {
  HGetGeneric(args, VALUES, cmd_cntx);
}

void CmdHGetAll(CmdArgList args, CommandContext* cmd_cntx) {
  HGetGeneric(args, GetAllMode::FIELDS | GetAllMode::VALUES, cmd_cntx);
}

void CmdHScan(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view token = ArgS(args, 1);
  uint64_t cursor = 0;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!absl::SimpleAtoi(token, &cursor)) {
    return rb->SendError("invalid cursor");
  }

  // HSCAN key cursor [MATCH pattern] [COUNT count] [NOVALUES]
  if (args.size() > 7) {
    DVLOG(1) << "got " << args.size() << " this is more than it should be";
    return rb->SendError(kSyntaxErr);
  }

  OpResult<ScanOpts> ops = ScanOpts::TryFrom(args.subspan(2), true);
  if (!ops) {
    DVLOG(1) << "HScan invalid args - return " << ops << " to the user";
    return cmd_cntx->SendError(ops.status());
  }

  const ScanOpts& scan_op = ops.value();
  auto cb = [&](const HMapWrap& hw) { return OpScan(hw, &cursor, scan_op); };

  OpResult<StringVec> result = ExecuteRO(cmd_cntx->tx(), cb);
  switch (result.status()) {
    case OpStatus::KEY_NOTFOUND:
      cursor = 0;
      [[fallthrough]];
    case OpStatus::OK: {
      RedisReplyBuilder::ArrayScope scope{rb, 2};
      rb->SendBulkString(absl::StrCat(cursor));
      rb->SendBulkStrArr(*result);
      break;
    }
    default:
      cmd_cntx->SendError(result.status());
  }
}

void CmdHSet(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  string_view cmd{cmd_cntx->cid()->name()};
  auto* rb = cmd_cntx->rb();
  if (args.size() % 2 != 1) {
    return rb->SendError(facade::WrongNumArgsError(cmd), kSyntaxErrType);
  }

  args.remove_prefix(1);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpSet(t->GetOpArgs(shard), key, args);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result && cmd == "HSET") {
    rb->SendLong(*result);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdHSetNx(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpSet(t->GetOpArgs(shard), key, args.subspan(1), OpSetParams{.skip_if_exists = true});
  };
  HSetReplies{cmd_cntx}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

void StrVecEmplaceBack(StringVec& str_vec, const listpackEntry& lp) {
  if (lp.sval) {
    str_vec.emplace_back(reinterpret_cast<char*>(lp.sval), lp.slen);
    return;
  }
  str_vec.emplace_back(absl::StrCat(lp.lval));
}

void CmdHRandField(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (args.size() > 3) {
    DVLOG(1) << "Wrong number of command arguments: " << args.size();
    return rb->SendError(kSyntaxErr);
  }

  string_view key = ArgS(args, 0);
  int32_t count;
  bool with_values = false;

  if ((args.size() > 1) && (!SimpleAtoi(ArgS(args, 1), &count))) {
    return rb->SendError("count value is not an integer", kSyntaxErrType);
  }

  if (args.size() == 3) {
    string arg = absl::AsciiStrToUpper(ArgS(args, 2));
    if (arg != "WITHVALUES")
      return rb->SendError(kSyntaxErr);
    else
      with_values = true;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<StringVec> {
    auto& db_slice = t->GetDbSlice(shard->shard_id());
    DbContext db_context = t->GetDbContext();
    auto it_res = db_slice.FindReadOnly(db_context, key, OBJ_HASH);

    if (!it_res)
      return it_res.status();

    const PrimeValue& pv = it_res.value()->second;
    StringVec str_vec;

    if (pv.Encoding() == kEncodingStrMap2) {
      StringMap* string_map = GetStringMap(pv, db_context);

      if (args.size() == 1) {
        auto opt_pair = string_map->RandomPair();
        if (opt_pair.has_value()) {
          auto [key, value] = *opt_pair;
          str_vec.emplace_back(key, sdslen(key));
        }
      } else {
        size_t actual_count =
            (count >= 0) ? std::min(size_t(count), string_map->UpperBoundSize()) : abs(count);
        std::vector<sds> keys, vals;
        if (count >= 0) {
          string_map->RandomPairsUnique(actual_count, keys, vals, with_values);
        } else {
          string_map->RandomPairs(actual_count, keys, vals, with_values);
        }
        for (size_t i = 0; i < actual_count; ++i) {
          str_vec.emplace_back(keys[i], sdslen(keys[i]));
          if (with_values) {
            str_vec.emplace_back(vals[i], sdslen(vals[i]));
          }
        }
      }

      if (string_map->Empty()) {  // Can happen if we use a TTL on hash members.
        auto res_it = db_slice.FindMutable(db_context, key, OBJ_HASH);
        if (res_it) {
          db_slice.DelMutable(db_context, std::move(*res_it));
        }
        return facade::OpStatus::KEY_NOTFOUND;
      }
    } else if (pv.Encoding() == kEncodingListPack) {
      uint8_t* lp = (uint8_t*)pv.RObjPtr();
      size_t lplen = lpLength(lp);
      CHECK(lplen > 0 && lplen % 2 == 0);
      size_t hlen = lplen / 2;
      if (args.size() == 1) {
        listpackEntry key;
        lpRandomPair(lp, hlen, &key, NULL);
        StrVecEmplaceBack(str_vec, key);
      } else {
        size_t actual_count = (count >= 0) ? std::min(size_t(count), hlen) : abs(count);
        std::unique_ptr<listpackEntry[]> keys = nullptr, vals = nullptr;
        keys = std::make_unique<listpackEntry[]>(actual_count);
        if (with_values)
          vals = std::make_unique<listpackEntry[]>(actual_count);

        // count has been specified.
        if (count >= 0)
          // always returns unique entries.
          lpRandomPairsUnique(lp, actual_count, keys.get(), vals.get());
        else
          // allows non-unique entries.
          lpRandomPairs(lp, actual_count, keys.get(), vals.get());

        for (size_t i = 0; i < actual_count; ++i) {
          StrVecEmplaceBack(str_vec, keys[i]);
          if (with_values) {
            StrVecEmplaceBack(str_vec, vals[i]);
          }
        }
      }
    } else {
      LOG(FATAL) << "Invalid encoding " << pv.Encoding();
    }
    return str_vec;
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    if (result->size() == 1 && args.size() == 1)
      rb->SendBulkString(result->front());
    else if (with_values) {
      const auto result_size = result->size();
      DCHECK(result_size % 2 == 0)
          << "unexpected size of strings " << result_size << ", expected pairs";
      SinkReplyBuilder::ReplyScope scope{rb};
      const bool is_resp3 = rb->IsResp3();
      rb->StartArray(is_resp3 ? result_size / 2 : result_size);
      for (size_t i = 0; i < result_size; i += 2) {
        if (is_resp3)
          rb->StartArray(2);
        rb->SendBulkString((*result)[i]);
        rb->SendBulkString((*result)[i + 1]);
      }
    } else
      rb->SendBulkStrArr(*result, CollectionType::ARRAY);
  } else if (result.status() == OpStatus::KEY_NOTFOUND) {
    if (args.size() == 1)
      rb->SendNull();
    else
      rb->SendEmptyArray();
  } else {
    cmd_cntx->SendError(result.status());
  }
}

}  // namespace

using CI = CommandId;

#define HFUNC(x) SetHandler(&Cmd##x)

void HSetFamily::Register(CommandRegistry* registry) {
  registry->StartFamily(acl::HASH);
  *registry << CI{"HDEL", CO::FAST | CO::JOURNALED, -3, 1, 1}.HFUNC(HDel)
            << CI{"HLEN", CO::FAST | CO::READONLY, 2, 1, 1}.HFUNC(HLen)
            << CI{"HEXISTS", CO::FAST | CO::READONLY, 3, 1, 1}.HFUNC(HExists)
            << CI{"HGET", CO::FAST | CO::READONLY, 3, 1, 1}.HFUNC(HGet)
            << CI{"HGETALL", CO::FAST | CO::READONLY, 2, 1, 1}.HFUNC(HGetAll)
            << CI{"HMGET", CO::FAST | CO::READONLY, -3, 1, 1}.HFUNC(HMGet)
            << CI{"HMSET", CO::JOURNALED | CO::FAST | CO::DENYOOM, -4, 1, 1}.HFUNC(HSet)
            << CI{"HINCRBY", CO::JOURNALED | CO::DENYOOM | CO::FAST, 4, 1, 1}.HFUNC(HIncrBy)
            << CI{"HINCRBYFLOAT", CO::JOURNALED | CO::DENYOOM | CO::FAST, 4, 1, 1}.HFUNC(
                   HIncrByFloat)
            << CI{"HKEYS", CO::READONLY, 2, 1, 1}.HFUNC(HKeys)
            << CI{"HEXPIRE", CO::JOURNALED | CO::FAST | CO::DENYOOM, -5, 1, 1}.HFUNC(HExpire)
            << CI{"HTTL", CO::READONLY | CO::FAST, -4, 1, 1}.HFUNC(HTtl)
            << CI{"HRANDFIELD", CO::READONLY, -2, 1, 1}.HFUNC(HRandField)
            << CI{"HSCAN", CO::READONLY, -3, 1, 1}.HFUNC(HScan)
            << CI{"HSET", CO::JOURNALED | CO::FAST | CO::DENYOOM, -4, 1, 1}.HFUNC(HSet)
            << CI{"HSETEX", CO::JOURNALED | CO::FAST | CO::DENYOOM, -5, 1, 1}.SetHandler(HSetEx)
            << CI{"HSETNX", CO::JOURNALED | CO::DENYOOM | CO::FAST, 4, 1, 1}.HFUNC(HSetNx)
            << CI{"HSTRLEN", CO::READONLY | CO::FAST, 3, 1, 1}.HFUNC(HStrLen)
            << CI{"HVALS", CO::READONLY, 2, 1, 1}.HFUNC(HVals);
}

auto HSetFamily::LoadZiplistBlob(std::string_view blob, PrimeValue* pv) -> LoadBlobResult {
  unsigned char* lp = lpNew(blob.size());
  if (!ZiplistPairsConvertAndValidateIntegrity((const uint8_t*)blob.data(), blob.size(), &lp)) {
    LOG(ERROR) << "Hash ziplist integrity check failed.";
    zfree(lp);
    return LoadBlobResult::kCorrupted;
  }

  if (lpLength(lp) == 0) {
    lpFree(lp);
    return LoadBlobResult::kEmpty;
  }

  if (lpBytes(lp) > server.max_listpack_map_bytes) {
    StringMap* sm = ConvertToStrMap(lp);
    lpFree(lp);
    pv->InitRobj(OBJ_HASH, kEncodingStrMap2, sm);
  } else {
    lp = lpShrinkToFit(lp);
    pv->InitRobj(OBJ_HASH, kEncodingListPack, lp);
  }

  return LoadBlobResult::kSuccess;
}

auto HSetFamily::LoadListpackBlob(std::string_view blob, PrimeValue* pv) -> LoadBlobResult {
  if (!lpValidateIntegrity((uint8_t*)blob.data(), blob.size(), 0, nullptr, nullptr)) {
    LOG(ERROR) << "Hash listpack integrity check failed.";
    return LoadBlobResult::kCorrupted;
  }

  unsigned char* lp = lpNew(blob.size());
  std::memcpy(lp, blob.data(), blob.size());

  if (lpLength(lp) == 0) {
    lpFree(lp);
    return LoadBlobResult::kEmpty;
  }

  if (lpBytes(lp) > server.max_listpack_map_bytes) {
    StringMap* sm = ConvertToStrMap(lp);
    lpFree(lp);
    pv->InitRobj(OBJ_HASH, kEncodingStrMap2, sm);
  } else {
    lp = lpShrinkToFit(lp);
    pv->InitRobj(OBJ_HASH, kEncodingListPack, lp);
  }

  return LoadBlobResult::kSuccess;
}

StringMap* HSetFamily::ConvertToStrMap(uint8_t* lp) {
  StringMap* sm = CompactObj::AllocateMR<StringMap>();

  detail::ListpackWrap lw{lp};
  sm->Reserve(lw.size());
  for (const auto [key, value] : lw)
    LOG_IF(ERROR, !sm->AddOrUpdate(key, value)) << "Internal error: duplicate key " << key;
  return sm;
}

// returns -1 if no expiry is associated with the field, -3 if no field is found.
int32_t HSetFamily::FieldExpireTime(const DbContext& db_context, const PrimeValue& pv,
                                    std::string_view field) {
  DCHECK_EQ(OBJ_HASH, pv.ObjType());

  if (pv.Encoding() == kEncodingListPack) {
    detail::ListpackWrap lw{static_cast<uint8_t*>(pv.RObjPtr())};
    return lw.Find(field) == lw.end() ? -3 : -1;
  } else {
    StringMap* string_map = (StringMap*)pv.RObjPtr();
    string_map->set_time(MemberTimeSeconds(db_context.time_now_ms));
    auto it = string_map->Find(field);
    if (it == string_map->end())
      return -3;
    return it.HasExpiry() ? it.ExpiryTime() : -1;
  }
}

// returns vector of results for each field in values:
// -2 if the provided key does not exist.
// 0 if the specified NX | XX | GT | LT condition has not been met.
// 1 if the expiration time was set/updated.
// 2 when HEXPIRE/HPEXPIRE is called with 0 seconds and the field is deleted.
static std::vector<long> UpdateTTL(facade::CmdArgList values, uint32_t ttl_sec, ExpireFlags flags,
                                   StringMap* owner) {
  std::vector<long> res;
  res.reserve(values.size());

  for (size_t i = 0; i < values.size(); i++) {
    std::string_view field = facade::ToSV(values[i]);
    auto it = owner->Find(field);
    if (it != owner->end()) {
      switch (flags) {
        case ExpireFlags::EXPIRE_NX:
          if (it.HasExpiry()) {
            res.emplace_back(0);
            continue;
          }
          break;
        case ExpireFlags::EXPIRE_XX:
          if (!it.HasExpiry()) {
            res.emplace_back(0);
            continue;
          }
          break;
        case ExpireFlags::EXPIRE_GT:
          if (it.ExpiryTime() - owner->time_now() >= ttl_sec) {
            res.emplace_back(0);
            continue;
          }
          break;
        case ExpireFlags::EXPIRE_LT:
          if (it.ExpiryTime() - owner->time_now() <= ttl_sec) {
            res.emplace_back(0);
            continue;
          }
          break;
        case ExpireFlags::EXPIRE_ALWAYS:
          break;
      }
      if (ttl_sec == 0) {
        owner->Erase(field);
        res.emplace_back(2);
      } else {
        it.SetExpiryTime(ttl_sec);
        res.emplace_back(1);
      }
    } else {
      res.emplace_back(-2);
    }
  }

  return res;
}

vector<long> HSetFamily::SetFieldsExpireTime(const OpArgs& op_args, uint32_t ttl_sec,
                                             ExpireFlags flags, string_view key, CmdArgList values,
                                             PrimeValue* pv) {
  DCHECK_EQ(OBJ_HASH, pv->ObjType());
  op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, *pv);

  if (pv->Encoding() == kEncodingListPack) {
    // a valid result can never be a listpack, since it doesnt keep ttl
    uint8_t* lp = (uint8_t*)pv->RObjPtr();
    StringMap* sm = HSetFamily::ConvertToStrMap(lp);
    pv->InitRobj(OBJ_HASH, kEncodingStrMap2, sm);
  }

  // This needs to be explicitly fetched again since the pv might have changed.
  StringMap* sm = container_utils::GetStringMap(*pv, op_args.db_cntx);
  vector<long> res = UpdateTTL(values, ttl_sec, flags, sm);
  op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, pv);
  return res;
}

}  // namespace dfly


================================================
FILE: src/server/hset_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>

#include "facade/op_status.h"
#include "server/common.h"
#include "server/table.h"
namespace dfly {

class StringMap;

using facade::OpResult;
using facade::OpStatus;

class HSetFamily {
 public:
  static void Register(CommandRegistry* registry);

  static LoadBlobResult LoadZiplistBlob(std::string_view blob, PrimeValue* pv);
  static LoadBlobResult LoadListpackBlob(std::string_view blob, PrimeValue* pv);

  // Does not free lp.
  static StringMap* ConvertToStrMap(uint8_t* lp);

  static int32_t FieldExpireTime(const DbContext& db_context, const PrimeValue& pv,
                                 std::string_view field);

  static std::vector<long> SetFieldsExpireTime(const OpArgs& op_args, uint32_t ttl_sec,
                                               ExpireFlags flags, std::string_view key,
                                               CmdArgList values, PrimeValue* pv);
};

}  // namespace dfly


================================================
FILE: src/server/hset_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/hset_family.h"

#include <absl/cleanup/cleanup.h>

#include <tuple>

extern "C" {
#include "redis/listpack.h"
#include "redis/sds.h"
}

#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;
using namespace facade;

namespace dfly {

class HSetFamilyTest : public BaseFamilyTest {
 protected:
};

class HestFamilyTestProtocolVersioned : public HSetFamilyTest,
                                        public ::testing::WithParamInterface<string> {
 protected:
};

INSTANTIATE_TEST_SUITE_P(HestFamilyTestProtocolVersioned, HestFamilyTestProtocolVersioned,
                         ::testing::Values("2", "3"));

TEST_F(HSetFamilyTest, Basic) {
  auto resp = Run({"hset", "x", "a"});
  EXPECT_THAT(resp, ErrArg("wrong number"));

  EXPECT_THAT(Run({"HSET", "hs", "key1", "val1", "key2"}), ErrArg("wrong number"));

  EXPECT_EQ(1, CheckedInt({"hset", "x", "a", "b"}));
  EXPECT_EQ(1, CheckedInt({"hlen", "x"}));

  EXPECT_EQ(1, CheckedInt({"hexists", "x", "a"}));
  EXPECT_EQ(0, CheckedInt({"hexists", "x", "b"}));
  EXPECT_EQ(0, CheckedInt({"hexists", "y", "a"}));

  EXPECT_EQ(0, CheckedInt({"hset", "x", "a", "b"}));
  EXPECT_EQ(0, CheckedInt({"hset", "x", "a", "c"}));
  EXPECT_EQ(0, CheckedInt({"hset", "x", "a", ""}));

  EXPECT_EQ(2, CheckedInt({"hset", "y", "a", "c", "d", "e"}));
  EXPECT_EQ(2, CheckedInt({"hdel", "y", "a", "d"}));

  EXPECT_THAT(Run({"hdel", "nokey", "a"}), IntArg(0));
}

TEST_F(HSetFamilyTest, HSet) {
  // Simulate HSET on mirror map
  {
    absl::flat_hash_map<string, string> mirror;  // mirror

    // Generate HSET commands and check how many new entries were added
    absl::InsecureBitGen gen{};
    while (mirror.size() < 600) {
      vector<string> cmd = {"HSET", "hash"};
      size_t new_values = 0;
      for (int i = 0; i < 20; i++) {
        string key = GetRandomHex(gen, 3);
        string value = GetRandomHex(gen, 20, 10);
        new_values += mirror.contains(key) ? 0 : 1;
        mirror[key] = value;

        cmd.emplace_back(key);
        cmd.emplace_back(value);
      }

      EXPECT_THAT(Run(cmd), IntArg(new_values));
    }

    // Verify consistency
    EXPECT_THAT(Run({"HLEN", "hash"}), IntArg(mirror.size()));
    for (const auto& [key, value] : mirror)
      EXPECT_EQ(Run({"HGET", "hash", key}), mirror[key]);
  }

  // HSet with same key twice
  Run({"HSET", "hash", "key1", "value1", "key1", "value2"});
  EXPECT_EQ(Run({"HGET", "hash", "key1"}), "value2");

  // Wrong value cases
  EXPECT_THAT(Run({"HSET", "key"}), ErrArg("wrong number of arguments"));
  EXPECT_THAT(Run({"HSET", "key", "key"}), ErrArg("wrong number of arguments"));
  EXPECT_THAT(Run({"HSET", "key", "key", "value", "key2"}), ErrArg("wrong number of arguments"));
}

TEST_F(HSetFamilyTest, HSetNX) {
  // Should create new field
  EXPECT_THAT(Run({"HSETNX", "hash", "key1", "value1"}), IntArg(1));
  EXPECT_EQ(Run({"HGET", "hash", "key1"}), "value1");

  // Should not overwrite
  EXPECT_THAT(Run({"HSETNX", "hash", "key1", "value2"}), IntArg(0));
  EXPECT_EQ(Run({"HGET", "hash", "key1"}), "value1");

  // Wrong value cases
  EXPECT_THAT(Run({"HSETNX", "key"}), ErrArg("wrong number of arguments"));
  EXPECT_THAT(Run({"HSET", "key", "key"}), ErrArg("wrong number of arguments"));
}

// Listpack handles integers separately, so create a mix of different types
TEST_F(HSetFamilyTest, MixedTypes) {
  absl::flat_hash_set<string> str_keys, int_keys;
  for (int i = 0; i < 100; i++) {
    auto key1 = absl::StrCat("s", i);
    auto key2 = absl::StrCat("i", i);
    Run({"HSET", "hash", key1, "VALUE", key2, "123456"});
    str_keys.emplace(key1);
    int_keys.emplace(key2);
  }

  for (string_view key : str_keys)
    EXPECT_EQ(Run({{"HGET", "hash", key}}), "VALUE");

  for (string_view key : int_keys) {
    EXPECT_EQ(Run({{"HGET", "hash", key}}), "123456");
    EXPECT_EQ(CheckedInt({"hincrby", "hash", key, "1"}), 123456 + 1);
  }
}

TEST_P(HestFamilyTestProtocolVersioned, Get) {
  auto resp = Run({"hello", GetParam()});
  EXPECT_THAT(resp.GetVec()[6], "proto");
  EXPECT_THAT(resp.GetVec()[7], IntArg(atoi(GetParam().c_str())));

  resp = Run({"hset", "x", "a", "1", "b", "2", "c", "3"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"hmget", "unkwn", "a", "c"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));

  resp = Run({"hkeys", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("a", "b", "c"));

  resp = Run({"hvals", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("1", "2", "3"));

  resp = Run({"hmget", "x", "a", "c", "d"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("1", "3", ArgType(RespExpr::NIL)));

  resp = Run({"hmget", "x", "a", "c", "d", "d", "c", "a"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("1", "3", ArgType(RespExpr::NIL), ArgType(RespExpr::NIL), "3", "1"));

  resp = Run({"hgetall", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "b", "2", "c", "3"));
}

TEST_F(HSetFamilyTest, HIncrBy) {
  int total = 10;
  // Check new field is created
  EXPECT_EQ(CheckedInt({"hincrby", "key", "field", "10"}), 10);
  EXPECT_EQ(Run({"hget", "key", "field"}), "10");
  // Simulate multiple additions
  for (int i = -100; i < 100; i += 7) {
    total += i;
    EXPECT_EQ(CheckedInt({"hincrby", "key", "field", to_string(i)}), total);
  }

  // Overflow
  Run({"hset", "key", "field2", to_string(numeric_limits<int64_t>::max() - 1)});
  EXPECT_THAT(Run({"hincrby", "key", "field2", "2"}), ErrArg("would overflow"));

  // Error case
  Run({"hset", "key", "a", " 1"});
  auto resp = Run({"hincrby", "key", "a", "10"});
  EXPECT_THAT(resp, ErrArg("hash value is not an integer"));
}

TEST_F(HSetFamilyTest, HIncrRespected) {
  Run({"hset", "key", "a", "1"});
  EXPECT_EQ(11, CheckedInt({"hincrby", "key", "a", "10"}));
  EXPECT_EQ(11, CheckedInt({"hget", "key", "a"}));
}

TEST_F(HSetFamilyTest, HIncrCmdsPreserveTtl) {
  Run({"hsetex", "key", "5", "a", "1"});
  EXPECT_EQ(5, CheckedInt({"fieldttl", "key", "a"}));
  EXPECT_EQ(2, CheckedInt({"hincrby", "key", "a", "1"}));
  EXPECT_EQ(5, CheckedInt({"fieldttl", "key", "a"}));

  // If the field has already expired by the time hincrby runs, the TTL is default
  AdvanceTime(5 * 1000);
  EXPECT_EQ(1, CheckedInt({"hincrby", "key", "a", "1"}));
  EXPECT_EQ(-1, CheckedInt({"fieldttl", "key", "a"}));

  Run({"hsetex", "key", "5", "fl", "1.1"});
  EXPECT_EQ(5, CheckedInt({"fieldttl", "key", "fl"}));
  EXPECT_EQ("2.2", Run({"hincrbyfloat", "key", "fl", "1.1"}));
}

TEST_F(HSetFamilyTest, HScan) {
  auto resp = Run("hscan non-existing-key 100 count 5");
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::ARRAY)));
  EXPECT_EQ(ToSV(resp.GetVec()[0].GetBuf()), "0");
  EXPECT_EQ(StrArray(resp.GetVec()[1]).size(), 0);

  for (int i = 0; i < 10; i++) {
    Run({"HSET", "myhash", absl::StrCat("Field-", i), absl::StrCat("Value-", i)});
  }

  // Note that even though this limit by 4, it would return more because
  // all fields are on listpack
  resp = Run({"hscan", "myhash", "0", "count", "4"});
  EXPECT_THAT(resp, ArrLen(2));
  auto vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(vec.size(), 20);
  EXPECT_THAT(vec, Each(AnyOf(StartsWith("Field"), StartsWith("Value"))));

  // Now run with filter on the results - we are expecting to not getting
  // any result at this point
  resp = Run({"hscan", "myhash", "0", "match", "*x*"});  // nothing should match this
  EXPECT_THAT(resp, ArrLen(2));
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(vec.size(), 0);

  // now we will do a positive match - anything that has 1 on it
  resp = Run({"hscan", "myhash", "0", "match", "*1*"});
  EXPECT_THAT(resp, ArrLen(2));
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(vec.size(), 2);  // key/value = 2

  // Test with large hash to see that count limit the number of entries
  for (int i = 0; i < 200; i++) {
    Run({"HSET", "largehash", absl::StrCat("KeyNum-", i), absl::StrCat("KeyValue-", i)});
  }
  resp = Run({"hscan", "largehash", "0", "count", "20"});
  EXPECT_THAT(resp, ArrLen(2));
  vec = StrArray(resp.GetVec()[1]);

  // See https://redis.io/commands/scan/ --> "The COUNT option", for why this cannot be exact
  EXPECT_GE(vec.size(), 40);  // This should be larger than (20 * 2) and less than about 50
  EXPECT_LT(vec.size(), 60);

  // Test NOVALUES option on 'myhash' (which has 10 items)
  resp = Run({"hscan", "myhash", "0", "NOVALUES"});
  EXPECT_THAT(resp, ArrLen(2));
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(vec.size(), 10);
  EXPECT_THAT(vec, Each(StartsWith("Field")));  // Should contain "Field-X", but never "Value-X"
}

// Verifies that the NOVALUES flag functions correctly when combined with other arguments
// like MATCH and COUNT, ensuring values are suppressed even during filtered or limited scans.
TEST_F(HSetFamilyTest, HScan_NoValuesCombinations) {
  Run({"HSET", "h_combos", "user:1", "v1", "user:2", "v2", "admin:1", "v3"});

  // case 1: MATCH + NOVALUES
  // We want only keys starting with "user*", and NO values.
  auto resp = Run({"HSCAN", "h_combos", "0", "MATCH", "user:*", "NOVALUES"});
  ASSERT_THAT(resp, ArrLen(2));
  auto vec = StrArray(resp.GetVec()[1]);

  // Should find: "user:1", "user:2" (2 items)
  // Should NOT find: "admin:1" (filtered out)
  // Should NOT find: "v1", "v2" (values suppressed)
  EXPECT_EQ(vec.size(), 2);
  EXPECT_THAT(vec, UnorderedElementsAre("user:1", "user:2"));

  // case 2: COUNT + NOVALUES
  // Populate a larger hash to force scanning behavior, verify no values and only key present
  for (int i = 0; i < 50; ++i) {
    Run({"HSET", "h_large", absl::StrCat("k", i), "v"});
  }
  resp = Run({"HSCAN", "h_large", "0", "COUNT", "10", "NOVALUES"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_GT(vec.size(), 0);
  EXPECT_THAT(vec, Not(Contains("v")));
  EXPECT_THAT(vec, Each(StartsWith("k")));
}

TEST_F(HSetFamilyTest, HScanLpMatchBug) {
  Run({"HSET", "key", "1", "2"});
  auto resp = Run({"hscan", "key", "0", "match", "1"});
  EXPECT_THAT(resp, ArrLen(2));
}

TEST_F(HSetFamilyTest, HincrbyFloat) {
  Run({"hincrbyfloat", "k", "a", "1.5"});
  EXPECT_EQ(Run({"hget", "k", "a"}), "1.5");

  Run({"hincrbyfloat", "k", "a", "1.5"});
  EXPECT_EQ(Run({"hget", "k", "a"}), "3");

  for (size_t i = 0; i < 500; ++i) {
    Run({"hincrbyfloat", "k", absl::StrCat("v", i), "1.5"});
  }

  for (size_t i = 0; i < 500; ++i) {
    EXPECT_EQ(Run({"hget", "k", absl::StrCat("v", i)}), "1.5");
  }
}

TEST_F(HSetFamilyTest, HincrbyFloatCornerCases) {
  Run({"hset", "k", "mhv", "-1.8E+308", "phv", "1.8E+308", "nd", "-+-inf", "+inf", "+inf", "nan",
       "nan", "-inf", "-inf"});
  // we don't support long doubles, so in all next cases we should return errors
  EXPECT_THAT(Run({"hincrbyfloat", "k", "mhv", "-1"}), ErrArg("ERR hash value is not a float"));
  EXPECT_THAT(Run({"hincrbyfloat", "k", "phv", "1"}), ErrArg("ERR hash value is not a float"));
  EXPECT_THAT(Run({"hincrbyfloat", "k", "nd", "1"}), ErrArg("ERR hash value is not a float"));
  EXPECT_THAT(Run({"hincrbyfloat", "k", "+inf", "1"}),
              ErrArg("increment would produce NaN or Infinity"));
  EXPECT_THAT(Run({"hincrbyfloat", "k", "nan", "1"}), ErrArg("ERR hash value is not a float"));
  EXPECT_THAT(Run({"hincrbyfloat", "k", "-inf", "1"}),
              ErrArg("increment would produce NaN or Infinity"));
}

TEST_F(HSetFamilyTest, HRandFloat) {
  Run({"HSET", "k", "1", "2"});

  EXPECT_EQ(Run({"hrandfield", "k"}), "1");

  for (size_t i = 0; i < 500; ++i) {
    Run({"hincrbyfloat", "k", absl::StrCat("v", i), "1.1"});
  }

  Run({"hrandfield", "k"});
}

TEST_F(HSetFamilyTest, HRandField) {
  // exercise Redis' listpack encoding
  Run({"HSET", "k", "a", "0", "b", "1", "c", "2"});

  EXPECT_THAT(Run({"hrandfield", "k"}), AnyOf("a", "b", "c"));

  EXPECT_THAT(Run({"hrandfield", "k", "2"}).GetVec(), IsSubsetOf({"a", "b", "c"}));

  EXPECT_THAT(Run({"hrandfield", "k", "3"}).GetVec(), UnorderedElementsAre("a", "b", "c"));

  EXPECT_THAT(Run({"hrandfield", "k", "4"}).GetVec(), UnorderedElementsAre("a", "b", "c"));

  auto resp = Run({"hrandfield", "k", "4", "withvalues"});
  EXPECT_THAT(resp, ArrLen(6));
  auto vec = resp.GetVec();

  std::vector<RespExpr> k, v;
  for (unsigned int i = 0; i < vec.size(); ++i) {
    if (i % 2 == 1)
      v.push_back(vec[i]);
    else
      k.push_back(vec[i]);
  }

  EXPECT_THAT(v, UnorderedElementsAre("0", "1", "2"));
  EXPECT_THAT(k, UnorderedElementsAre("a", "b", "c"));

  resp = Run({"hrandfield", "k", "-4", "withvalues"});
  EXPECT_THAT(resp, ArrLen(8));
  vec = resp.GetVec();
  k.clear();
  v.clear();
  for (unsigned int i = 0; i < vec.size(); ++i) {
    if (i % 2 == 0) {
      if (vec[i] == "a")
        EXPECT_EQ(vec[i + 1], "0");
      else if (vec[i] == "b")
        EXPECT_EQ(vec[i + 1], "1");
      else if (vec[i] == "c")
        EXPECT_EQ(vec[i + 1], "2");
      else
        ADD_FAILURE();
    }
  }

  // exercise Dragonfly's string map encoding
  int num_entries = 500;
  for (int i = 0; i < num_entries; i++) {
    Run({"HSET", "largehash", std::to_string(i), std::to_string(i * 10)});
  }

  resp = Run({"hrandfield", "largehash"});
  EXPECT_LE(stoi(resp.GetString()), num_entries - 1);
  EXPECT_GE(stoi(resp.GetString()), 0);

  resp = Run({"hrandfield", "largehash", std::to_string(num_entries / 2)});
  vec = resp.GetVec();
  std::vector<std::string> string_vec;
  for (auto v : vec) {
    string_vec.push_back(v.GetString());
  }

  sort(string_vec.begin(), string_vec.end());
  auto it = std::unique(string_vec.begin(), string_vec.end());
  bool is_unique = (it == string_vec.end());
  EXPECT_TRUE(is_unique);

  for (const auto& str : string_vec) {
    EXPECT_LE(stoi(str), num_entries - 1);
    EXPECT_GE(stoi(str), 0);
  }

  resp = Run({"hrandfield", "largehash", std::to_string(num_entries * -1 - 1)});
  EXPECT_THAT(resp, ArrLen(num_entries + 1));
  vec = resp.GetVec();

  string_vec.clear();
  for (auto v : vec) {
    string_vec.push_back(v.GetString());
    int i = stoi(v.GetString());
    EXPECT_LE(i, num_entries - 1);
    EXPECT_GE(i, 0);
  }

  sort(string_vec.begin(), string_vec.end());
  it = std::unique(string_vec.begin(), string_vec.end());
  is_unique = (it == string_vec.end());
  EXPECT_FALSE(is_unique);

  resp = Run({"hrandfield", "largehash", std::to_string(num_entries * -1 - 1), "withvalues"});
  EXPECT_THAT(resp, ArrLen((num_entries + 1) * 2));
  vec = resp.GetVec();

  string_vec.clear();
  for (unsigned int i = 0; i < vec.size(); ++i) {
    if (i % 2 == 0) {
      int k = stoi(vec[i].GetString());
      EXPECT_LE(k, num_entries - 1);
      EXPECT_GE(k, 0);
      int v = stoi(vec[i + 1].GetString());
      EXPECT_EQ(v, k * 10);
      string_vec.push_back(vec[i].GetString());
    }
  }

  sort(string_vec.begin(), string_vec.end());
  it = std::unique(string_vec.begin(), string_vec.end());
  is_unique = (it == string_vec.end());
  EXPECT_FALSE(is_unique);
}

TEST_F(HSetFamilyTest, HSetEx) {
  TEST_current_time_ms = kMemberExpiryBase * 1000;  // to reset to test time.

  auto resp = Run({"HSETEX", "k", "1", "f", "v"});
  EXPECT_THAT(resp, IntArg(1));

  AdvanceTime(500);
  EXPECT_THAT(Run({"HGET", "k", "f"}), "v");

  AdvanceTime(500);
  EXPECT_THAT(Run({"HGET", "k", "f"}), ArgType(RespExpr::NIL));

  const std::string_view long_time = "100"sv;

  resp = Run({"HSETEX", "k", long_time, "field1", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"HSETEX", "k", long_time, "field1", "new_value"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"HGET", "k", "field1"});
  EXPECT_THAT(resp, "new_value");  // HSETEX without NX option; value was replaced by new_value

  resp = Run({"HSETEX", "k", long_time, "field2", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"HSETEX", "k", "NX", long_time, "field2", "new_value"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"HGET", "k", "field2"});
  EXPECT_THAT(resp, "value");  // HSETEX with NX option; value was NOT replaced by new_value

  const std::string_view short_time = "1"sv;

  resp = Run({"HSETEX", "k", long_time, "field3", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"HSETEX", "k", short_time, "field3", "value"});
  EXPECT_THAT(resp, IntArg(0));

  AdvanceTime(1000);
  resp = Run({"HGET", "k", "field3"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
  // HSETEX without NX option; old expiration time was replaced by a new one

  resp = Run({"HSETEX", "k", long_time, "field4", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"HSETEX", "k", "NX", short_time, "field4", "value"});
  EXPECT_THAT(resp, IntArg(0));

  AdvanceTime(1100);
  resp = Run({"HGET", "k", "field4"});
  EXPECT_THAT(resp,
              "value");  // HSETEX with NX option; old expiration time was NOT replaced by a new one

  // KEEPTTL related asserts
  EXPECT_THAT(Run({"HSETEX", "k", long_time, "kttlfield", "value"}), IntArg(1));
  EXPECT_EQ(Run({"HGET", "k", "kttlfield"}), "value");
  EXPECT_EQ(CheckedInt({"FIELDTTL", "k", "kttlfield"}), 100);

  // KEEPTTL resets value of kttlfield, but preserves its TTL. afield is added with TTL=1
  EXPECT_THAT(Run({"HSETEX", "k", "KEEPTTL", "1", "kttlfield", "resetvalue", "afield", "aval"}),
              IntArg(1));
  EXPECT_EQ(CheckedInt({"FIELDTTL", "k", "kttlfield"}), 100);
  EXPECT_EQ(Run({"FIELDTTL", "k", "afield"}).GetInt(), 1);
  EXPECT_EQ(Run({"HGET", "k", "afield"}), "aval");
  // make afield expire
  AdvanceTime(1000);
  EXPECT_THAT(Run({"HGET", "k", "afield"}), ArgType(RespExpr::NIL));

  // kttlfield is still present although with updated value
  EXPECT_EQ(Run({"HGET", "k", "kttlfield"}), "resetvalue");
  EXPECT_EQ(Run({"FIELDTTL", "k", "kttlfield"}).GetInt(), 99);

  // If NX is supplied, with or without KEEPTTL neither expiry nor value is updated
  EXPECT_THAT(Run({"HSETEX", "k", "NX", "KEEPTTL", "1", "kttlfield", "value"}), IntArg(0));

  // No updates
  EXPECT_EQ(Run({"HGET", "k", "kttlfield"}), "resetvalue");
  EXPECT_EQ(Run({"FIELDTTL", "k", "kttlfield"}).GetInt(), 99);

  EXPECT_THAT(Run({"HSETEX", "k", "NX", "1", "kttlfield", "value"}), IntArg(0));
  // No updates
  EXPECT_EQ(Run({"HGET", "k", "kttlfield"}), "resetvalue");
  EXPECT_EQ(Run({"FIELDTTL", "k", "kttlfield"}).GetInt(), 99);

  // Invalid TTL handling
  EXPECT_THAT(Run({"HSETEX", "k", "NX", "zero", "kttlfield", "value"}),
              ErrArg("ERR value is not an integer or out of range"));

  // Exercise the code path where a field is added without TTL, but then we set a new expiration AND
  // provide KEEPTTL. Since there was no old expiry, the new TTL should be applied.
  EXPECT_EQ(Run({"HSET", "k", "nottl", "val"}), 1);
  EXPECT_EQ(Run({"HSETEX", "k", "KEEPTTL", long_time, "nottl", "newval"}), 0);
  EXPECT_EQ(Run({"FIELDTTL", "k", "nottl"}).GetInt(), 100);

  EXPECT_THAT(Run({"HSETEX", "k", "NX", "KEEPTTL", "NX", "1", "v", "v2"}),
              ErrArg("ERR wrong number of arguments for 'hsetex' command"));
  EXPECT_THAT(Run({"HSETEX", "k", "KEEPTTL", "KEEPTTL", "1", "v", "v2"}),
              ErrArg("ERR wrong number of arguments for 'hsetex' command"));
}

TEST_F(HSetFamilyTest, TriggerConvertToStrMap) {
  const int kElements = 200;
  // Enough for IsGoodForListpack to become false
  for (size_t i = 0; i < kElements; i++) {
    auto k = absl::StrCat(100500700u + i);
    Run({"HSET", "hk", k, "100500700"});
  }
  EXPECT_THAT(Run({"HLEN", "hk"}), IntArg(kElements));
}

TEST_F(HSetFamilyTest, Issue1140) {
  Run({"HSET", "CaseKey", "Foo", "Bar"});

  EXPECT_EQ("Bar", Run({"HGET", "CaseKey", "Foo"}));
}

TEST_F(HSetFamilyTest, Issue2102) {
  // Set key with element that will expire after 1s
  EXPECT_EQ(CheckedInt({"HSETEX", "key", "10", "k1", "v1"}), 1);
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HGETALL", "key"}), RespArray(ElementsAre()));
}

TEST_F(HSetFamilyTest, HExpire) {
  EXPECT_EQ(CheckedInt({"HSET", "key", "k0", "v0", "k1", "v1", "k2", "v2"}), 3);
  EXPECT_THAT(Run({"HEXPIRE", "key", "10", "FIELDS", "3", "k0", "k1", "k2"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HGETALL", "key"}), RespArray(ElementsAre()));

  EXPECT_EQ(CheckedInt({"HSETEX", "key2", "60", "k0", "v0", "k1", "v2"}), 2);
  EXPECT_THAT(Run({"HEXPIRE", "key2", "10", "FIELDS", "2", "k0", "k1"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1))));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HGETALL", "key2"}), RespArray(ElementsAre()));

  EXPECT_EQ(CheckedInt({"HSET", "key3", "k0", "v0", "k1", "v1", "k2", "v2", "k3", "v3", "k4", "v4",
                        "k5", "v5"}),
            6);
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "XX", "FIELDS", "1", "k0"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "NX", "FIELDS", "1", "k0"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "NX", "FIELDS", "1", "k0"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "XX", "FIELDS", "1", "k0"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "NX", "FIELDS", "3", "k1", "k2", "k3"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1), IntArg(1))));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "8", "GT", "FIELDS", "1", "k2"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "12", "GT", "FIELDS", "1", "k2"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "8", "LT", "FIELDS", "1", "k3"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "12", "LT", "FIELDS", "1", "k3"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "GT", "FIELDS", "1", "k4"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "LT", "FIELDS", "1", "k5"}), IntArg(1));
  AdvanceTime(8'000);
  EXPECT_THAT(
      Run({"HGETALL", "key3"}),
      RespArray(UnorderedElementsAre("k0", "v0", "k1", "v1", "k2", "v2", "k4", "v4", "k5", "v5")));
  AdvanceTime(2'000);
  EXPECT_THAT(Run({"HGETALL", "key3"}), RespArray(UnorderedElementsAre("k2", "v2", "k4", "v4")));
  AdvanceTime(2'000);
  EXPECT_THAT(Run({"HGETALL", "key3"}), RespArray(ElementsAre("k4", "v4")));

  EXPECT_THAT(Run({"HEXPIRE", "key3", "10", "FIELDS", "1", "k4"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key3", "0", "XX", "FIELDS", "1", "k4"}), IntArg(2));
  EXPECT_THAT(Run({"HGETALL", "key3"}), RespArray(ElementsAre()));

  EXPECT_EQ(
      CheckedInt({"HSET", "key4", "k0", "v0", "k1", "v1", "k2", "v2", "k3", "v3", "k4", "v4"}), 5);
  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "NX", "FIELDS", "2", "k0", "k1"}),
              RespElementsAre(IntArg(2), IntArg(2)));
  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "LT", "FIELDS", "2", "k2", "k3"}),
              RespElementsAre(IntArg(2), IntArg(2)));

  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "XX", "FIELDS", "1", "k4"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key4", "10", "NX", "FIELDS", "1", "k4"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "NX", "FIELDS", "1", "k4"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "GT", "FIELDS", "1", "k4"}), IntArg(0));
  EXPECT_THAT(Run({"HEXPIRE", "key4", "0", "FIELDS", "1", "k4"}), IntArg(2));
  EXPECT_THAT(Run({"HGETALL", "key4"}), RespArray(ElementsAre()));
}

TEST_F(HSetFamilyTest, HExpireNoExpireEarly) {
  EXPECT_EQ(CheckedInt({"HSET", "key", "k0", "v0", "k1", "v1"}), 2);
  EXPECT_THAT(Run({"HEXPIRE", "key", "10", "FIELDS", "2", "k0", "k1"}),
              RespArray(ElementsAre(IntArg(1), IntArg(1))));
  AdvanceTime(9'000);
  EXPECT_THAT(Run({"HGETALL", "key"}), RespArray(UnorderedElementsAre("k0", "v0", "k1", "v1")));
}

TEST_F(HSetFamilyTest, HExpireNoSuchField) {
  EXPECT_EQ(CheckedInt({"HSET", "key", "k0", "v0"}), 1);
  EXPECT_THAT(Run({"HEXPIRE", "key", "10", "FIELDS", "2", "k0", "k1"}),
              RespArray(ElementsAre(IntArg(1), IntArg(-2))));
}

TEST_F(HSetFamilyTest, HExpireNoSuchKey) {
  EXPECT_THAT(Run({"HEXPIRE", "key", "10", "FIELDS", "2", "k0", "k1"}),
              RespArray(ElementsAre(IntArg(-2), IntArg(-2))));
}

TEST_F(HSetFamilyTest, HExpireNoAddNew) {
  Run({"HEXPIRE", "key", "10", "FIELDS", "1", "k0"});
  EXPECT_THAT(Run({"HGETALL", "key"}), RespArray(ElementsAre()));
}

TEST_F(HSetFamilyTest, HExpireWithNullChar) {
  string val_with_null("test\0test", 9);
  Run({"HSET", "hash", "field", val_with_null});
  string expected_val("test\0test", 9);
  EXPECT_EQ(ToSV(Run({"HGET", "hash", "field"}).GetBuf()), expected_val);
  Run({"HEXPIRE", "hash", "15", "FIELDS", "1", "field"});
  EXPECT_EQ(ToSV(Run({"HGET", "hash", "field"}).GetBuf()), expected_val);
}

TEST_F(HSetFamilyTest, HTtl) {
  // Non-existent key returns -2 for all fields
  EXPECT_THAT(Run({"HTTL", "nokey", "FIELDS", "2", "f1", "f2"}),
              RespArray(ElementsAre(IntArg(-2), IntArg(-2))));

  // Fields without TTL return -1, non-existent fields return -2
  EXPECT_EQ(CheckedInt({"HSET", "key", "k0", "v0", "k1", "v1"}), 2);
  EXPECT_THAT(Run({"HTTL", "key", "FIELDS", "3", "k0", "k1", "nosuch"}),
              RespArray(ElementsAre(IntArg(-1), IntArg(-1), IntArg(-2))));

  // Set expiry and verify TTL
  EXPECT_THAT(Run({"HEXPIRE", "key", "10", "FIELDS", "1", "k0"}), IntArg(1));
  EXPECT_THAT(Run({"HTTL", "key", "FIELDS", "2", "k0", "k1"}),
              RespArray(ElementsAre(IntArg(10), IntArg(-1))));

  // Advance time and verify TTL decreases
  AdvanceTime(3000);
  EXPECT_THAT(Run({"HTTL", "key", "FIELDS", "1", "k0"}), IntArg(7));

  // Wrong type
  Run({"SET", "strkey", "val"});
  EXPECT_THAT(Run({"HTTL", "strkey", "FIELDS", "1", "f"}), ErrArg("WRONGTYPE"));

  // Syntax errors
  EXPECT_THAT(Run({"HTTL", "key", "1", "k0"}), ErrArg("Mandatory argument FIELDS"));
  EXPECT_THAT(Run({"HTTL", "key", "FIELDS", "2", "k0"}), ErrArg("numfields"));
}

TEST_F(HSetFamilyTest, RandomFieldAllExpired) {
  for (int i = 0; i < 10; ++i) {
    EXPECT_EQ(CheckedInt({"HSETEX", "key", "10", absl::StrCat("k", i), "v"}), 1);
  }
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HRANDFIELD", "key"}), ArgType(RespExpr::NIL));
}

TEST_F(HSetFamilyTest, RandomField1NotExpired) {
  for (int i = 0; i < 10; ++i) {
    EXPECT_EQ(CheckedInt({"HSETEX", "key", "10", absl::StrCat("k", i), "v"}), 1);
  }
  EXPECT_EQ(CheckedInt({"HSET", "key", "keep", "v"}), 1);

  AdvanceTime(10'000);
  EXPECT_THAT(Run({"HRANDFIELD", "key"}), "keep");
}

TEST_F(HSetFamilyTest, EmptyHashBug) {
  EXPECT_THAT(Run({"HSET", "foo", "a_field", "a_value"}), IntArg(1));
  EXPECT_THAT(Run({"HSETEX", "foo", "1", "b_field", "b_value"}), IntArg(1));
  EXPECT_THAT(Run({"HDEL", "foo", "a_field"}), IntArg(1));

  AdvanceTime(4000);

  EXPECT_THAT(Run({"HGETALL", "foo"}), RespArray(ElementsAre()));
  EXPECT_THAT(Run({"EXISTS", "foo"}), IntArg(0));
}

TEST_F(HSetFamilyTest, ScanAfterExpireSet) {
  EXPECT_THAT(Run({"HSET", "aset", "afield", "avalue"}), IntArg(1));
  EXPECT_THAT(Run({"HEXPIRE", "aset", "1", "FIELDS", "1", "afield"}), IntArg(1));

  const auto resp = Run({"HSCAN", "aset", "0", "count", "100"});
  EXPECT_THAT(resp, ArrLen(2));

  const auto vec = StrArray(resp.GetVec()[1]);
  EXPECT_EQ(vec.size(), 2);

  EXPECT_THAT(vec, Contains("afield").Times(1));
  EXPECT_THAT(vec, Contains("avalue").Times(1));
}

TEST_F(HSetFamilyTest, KeyRemovedWhenEmpty) {
  auto test_cmd = [&](const std::function<void()>& f, const std::string_view tag) {
    EXPECT_THAT(Run({"HSET", "a", "afield", "avalue"}), IntArg(1));
    EXPECT_THAT(Run({"HEXPIRE", "a", "1", "FIELDS", "1", "afield"}), IntArg(1));
    AdvanceTime(1000);

    EXPECT_THAT(Run({"EXISTS", "a"}), IntArg(1));
    f();
    EXPECT_THAT(Run({"EXISTS", "a"}), IntArg(0)) << "failed when testing " << tag;
  };

  test_cmd([&] { EXPECT_THAT(Run({"HGET", "a", "afield"}), ArgType(RespExpr::NIL)); }, "HGET");
  test_cmd([&] { EXPECT_THAT(Run({"HGETALL", "a"}), RespArray(ElementsAre())); }, "HGETALL");
  test_cmd([&] { EXPECT_THAT(Run({"HDEL", "a", "afield"}), IntArg(0)); }, "HDEL");
  test_cmd([&] { EXPECT_THAT(Run({"HSCAN", "a", "0"}).GetVec()[0], "0"); }, "HSCAN");
  test_cmd([&] { EXPECT_THAT(Run({"HMGET", "a", "afield"}), ArgType(RespExpr::NIL)); }, "HMGET");
  test_cmd([&] { EXPECT_THAT(Run({"HEXISTS", "a", "afield"}), IntArg(0)); }, "HEXISTS");
  test_cmd([&] { EXPECT_THAT(Run({"HSTRLEN", "a", "afield"}), IntArg(0)); }, "HSTRLEN");
}

TEST_F(HSetFamilyTest, HRandFieldRespFormat) {
  absl::flat_hash_map<std::string, std::string> expected{
      {"a", "1"},
      {"b", "2"},
      {"c", "3"},
  };
  Run({"HELLO", "3"});
  EXPECT_THAT(Run({"HSET", "key", "a", "1", "b", "2", "c", "3"}), IntArg(3));
  auto resp = Run({"HRANDFIELD", "key", "3", "WITHVALUES"});
  EXPECT_THAT(resp, ArrLen(3));
  for (const auto& v : resp.GetVec()) {
    EXPECT_THAT(v, ArrLen(2));
    const auto& kv = v.GetVec();
    EXPECT_THAT(kv[0], AnyOf("a", "b", "c"));
    EXPECT_THAT(kv[1], expected[kv[0].GetView()]);
  }

  Run({"HELLO", "2"});
  resp = Run({"HRANDFIELD", "key", "3", "WITHVALUES"});
  EXPECT_THAT(resp, ArrLen(6));
  const auto& vec = resp.GetVec();
  for (size_t i = 0; i < vec.size(); i += 2) {
    EXPECT_THAT(vec[i], AnyOf("a", "b", "c"));
    EXPECT_THAT(vec[i + 1], expected[vec[i].GetView()]);
  }
}

// Make sure no "Zombie Key": HEXPIRE with TTL 0 must delete the key
// if the hash becomes empty. If the key remains (zombie), saving the RDB or running
// commands like EXISTS against it may lead to crashes or other incorrect behavior.
TEST_F(HSetFamilyTest, HExpireZeroTTL_DeletesKey) {
  constexpr auto kRdbFile = "zombie_test.rdb";
  auto cleanup = absl::MakeCleanup([kRdbFile] { std::ignore = remove(kRdbFile); });
  Run({"HSET", "zombie", "f", "v"});
  auto resp = Run({"HEXPIRE", "zombie", "0", "FIELDS", "1", "f"});
  EXPECT_THAT(resp, IntArg(2));
  EXPECT_EQ(0, CheckedInt({"EXISTS", "zombie"}));
  EXPECT_EQ(Run({"SAVE", "RDB", kRdbFile}), "OK");
}

// HINCRBYFLOAT with NaN on a non-existing key must not create a zombie empty hash.
// Before the fix, the key was left in the DB with an empty listpack, causing HRANDFIELD
// to crash with CHECK(lplen > 0 && lplen % 2 == 0).
TEST_F(HSetFamilyTest, HIncrByFloatNaNDoesNotCreateKey) {
  EXPECT_THAT(Run({"HINCRBYFLOAT", "key", "field", "nan"}),
              ErrArg("increment would produce NaN or Infinity"));
  EXPECT_EQ(0, CheckedInt({"EXISTS", "key"}));
  EXPECT_THAT(Run({"HRANDFIELD", "key"}), ArgType(RespExpr::NIL));
}

}  // namespace dfly


================================================
FILE: src/server/http_api.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/http_api.h"

#include "base/logging.h"
#include "core/flatbuffers.h"
#include "facade/conn_context.h"
#include "facade/reply_capture.h"
#include "server/conn_context.h"
#include "server/main_service.h"
#include "util/http/http_common.h"

namespace dfly {
using namespace util;
using namespace std;
namespace h2 = boost::beast::http;
namespace payload = facade::payload;
namespace {

bool IsVectorOfStrings(flexbuffers::Reference req) {
  if (!req.IsVector()) {
    return false;
  }

  auto vec = req.AsVector();
  if (vec.size() == 0) {
    return false;
  }

  for (size_t i = 0; i < vec.size(); ++i) {
    if (!vec[i].IsString()) {
      return false;
    }
  }
  return true;
}

// Escape a string so that it is legal to print it in JSON text.
std::string JsonEscape(string_view input) {
  auto hex_digit = [](unsigned c) -> char {
    DCHECK_LT(c, 0xFu);
    return c < 10 ? c + '0' : c - 10 + 'a';
  };

  string out;
  out.reserve(input.size() + 2);
  out.push_back('\"');

  auto p = input.begin();
  auto e = input.end();

  while (p < e) {
    uint8_t c = *p;
    if (c == '\\' || c == '\"') {
      out.push_back('\\');
      out.push_back(*p++);
    } else if (c <= 0x1f) {
      switch (c) {
        case '\b':
          out.append("\\b");
          p++;
          break;
        case '\f':
          out.append("\\f");
          p++;
          break;
        case '\n':
          out.append("\\n");
          p++;
          break;
        case '\r':
          out.append("\\r");
          p++;
          break;
        case '\t':
          out.append("\\t");
          p++;
          break;
        default:
          // this condition captures non readable chars with value < 32,
          // so size = 1 byte (e.g control chars).
          out.append("\\u00");
          out.push_back(hex_digit((c & 0xf0) >> 4));
          out.push_back(hex_digit(c & 0xf));
          p++;
      }
    } else {
      out.push_back(*p++);
    }
  }

  out.push_back('\"');
  return out;
}

struct CaptureVisitor {
  CaptureVisitor() {
    str = R"({"result":)";
  }

  void operator()(monostate) {
  }

  void operator()(long v) {
    absl::StrAppend(&str, v);
  }

  void operator()(double v) {
    absl::StrAppend(&str, v);
  }

  void operator()(const payload::SimpleString& ss) {
    absl::StrAppend(&str, "\"", ss, "\"");
  }

  void operator()(const payload::BulkString& bs) {
    absl::StrAppend(&str, JsonEscape(bs));
  }

  void operator()(payload::Null) {
    absl::StrAppend(&str, "null");
  }

  void operator()(const payload::Error& err) {
    str = absl::StrCat(R"({"error": ")", err->first, "\"");
  }

  void operator()(facade::OpStatus status) {
    absl::StrAppend(&str, "\"", facade::StatusToMsg(status), "\"");
  }

  void operator()(unique_ptr<payload::CollectionPayload> cp) {
    if (!cp) {
      absl::StrAppend(&str, "null");
      return;
    }
    if (cp->len == 0 && cp->type == facade::CollectionType::ARRAY) {
      absl::StrAppend(&str, "[]");
      return;
    }
    absl::StrAppend(&str, "[");
    bool append_delimiter = false;
    for (auto& pl : cp->arr) {
      if (append_delimiter) {
        absl::StrAppend(&str, ",");
      }
      append_delimiter = true;
      visit(*this, std::move(pl));
    }
    absl::StrAppend(&str, "]");
  }
  string str;
};

}  // namespace

void HttpAPI(const http::QueryArgs& args, HttpRequest&& req, Service* service,
             HttpContext* http_cntx) {
  auto& body = req.body();

  flexbuffers::Builder fbb;
  flatbuffers::Parser parser;
  flexbuffers::Reference doc;
  bool success = parser.ParseFlexBuffer(body.c_str(), nullptr, &fbb);
  if (success) {
    fbb.Finish();
    doc = flexbuffers::GetRoot(fbb.GetBuffer());
    if (!IsVectorOfStrings(doc)) {
      success = false;
    }
  }

  // TODO: to add a content-type/json check.
  if (!success) {
    VLOG(1) << "Invalid body " << body;
    auto response = http::MakeStringResponse(h2::status::bad_request);
    http::SetMime(http::kTextMime, &response);
    response.body() = "Failed to parse json\r\n";
    http_cntx->Invoke(std::move(response));
    return;
  }

  flexbuffers::Vector vec = doc.AsVector();

  facade::ConnectionContext* context = (facade::ConnectionContext*)http_cntx->user_data();
  DCHECK(context);

  facade::CapturingReplyBuilder reply_builder;

  // TODO: to finish this.

  CommandContext cmd_cntx;

  cmd_cntx.Init(&reply_builder, context);
  for (size_t i = 0; i < vec.size(); ++i) {
    cmd_cntx.PushArg(vec[i].AsString().c_str());
  }
  service->DispatchCommand(facade::ParsedArgs{cmd_cntx}, &cmd_cntx,
                           facade::AsyncPreference::ONLY_SYNC);
  facade::CapturingReplyBuilder::Payload payload = reply_builder.Take();

  auto response = http::MakeStringResponse();
  http::SetMime(http::kJsonMime, &response);

  CaptureVisitor visitor;
  std::visit(visitor, std::move(payload));
  visitor.str.append("}\r\n");
  response.body() = visitor.str;
  http_cntx->Invoke(std::move(response));
}

}  // namespace dfly


================================================
FILE: src/server/http_api.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "util/http/http_handler.h"

namespace dfly {
class Service;
using HttpRequest = util::HttpListenerBase::RequestType;

/**
 * @brief The main handler function for dispatching commands via HTTP.
 *
 * @param args - query arguments. currently not used.
 * @param req  - full http request including the body that should consist of a json array
 *               representing a Dragonfly command. aka `["set", "foo", "bar"]`
 * @param service - a pointer to dfly::Service* object.
 * @param http_cntxt - a pointer to the http context object which provide dragonfly context
 *                     information via user_data() and allows to reply with HTTP responses.
 */
void HttpAPI(const util::http::QueryArgs& args, HttpRequest&& req, Service* service,
             util::HttpContext* http_cntxt);

}  // namespace dfly


================================================
FILE: src/server/journal/CMakeLists.txt
================================================
SET(DF_JOURNAL_SRCS
    journal/cmd_serializer.cc journal/tx_executor.cc namespaces.cc
    journal/journal.cc journal/types.cc journal/journal_slice.cc
    journal/serializer.cc journal/executor.cc journal/streamer.cc
    PARENT_SCOPE)


================================================
FILE: src/server/journal/cmd_serializer.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/cmd_serializer.h"

#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard.h"
#include "server/journal/serializer.h"
#include "server/rdb_save.h"
#include "server/tiered_storage.h"

namespace dfly {

namespace {
using namespace std;

class CommandAggregator {
 public:
  using WriteCmdCallback = std::function<void(absl::Span<const string_view>)>;

  CommandAggregator(string_view key, WriteCmdCallback cb, size_t max_agg_bytes)
      : key_(key), cb_(std::move(cb)), max_aggragation_bytes_(max_agg_bytes) {
  }

  ~CommandAggregator() {
    CommitPending();
  }

  enum class CommitMode : uint8_t { kAuto, kNoCommit };

  // Returns whether CommitPending() was called
  bool AddArg(string arg, CommitMode commit_mode = CommitMode::kAuto) {
    agg_bytes_ += arg.size();
    members_.push_back(std::move(arg));

    if (commit_mode != CommitMode::kNoCommit && agg_bytes_ >= max_aggragation_bytes_) {
      CommitPending();
      return true;
    }

    return false;
  }

 private:
  void CommitPending() {
    if (members_.empty()) {
      return;
    }

    args_.clear();
    args_.reserve(members_.size() + 1);
    args_.push_back(key_);
    for (string_view member : members_) {
      args_.push_back(member);
    }
    cb_(args_);
    members_.clear();
  }

  string_view key_;
  WriteCmdCallback cb_;
  vector<string> members_;
  absl::InlinedVector<string_view, 5> args_;
  size_t agg_bytes_ = 0;
  size_t max_aggragation_bytes_;
};

}  // namespace

CmdSerializer::CmdSerializer(DbSlice* db_slice, FlushSerialized cb,
                             size_t max_serialization_buffer_size)
    : db_slice_(db_slice),
      cb_(std::move(cb)),
      max_serialization_buffer_size_(max_serialization_buffer_size) {
  serializer_ = std::make_unique<RdbSerializer>(GetDefaultCompressionMode());
}

size_t CmdSerializer::SerializeEntry(string_view key, const PrimeKey& pk, const PrimeValue& pv,
                                     uint64_t expire_ms) {
  // We send RESTORE commands objects we don't support breaking.
  bool use_restore_serialization = true;
  size_t commands = 1;
  switch (pv.ObjType()) {
    case OBJ_SET:
      commands = SerializeSet(key, pv);
      use_restore_serialization = false;
      break;
    case OBJ_ZSET:
      commands = SerializeZSet(key, pv);
      use_restore_serialization = false;
      break;
    case OBJ_HASH:
      commands = SerializeHash(key, pv);
      use_restore_serialization = false;
      break;
    case OBJ_LIST:
      commands = SerializeList(key, pv);
      use_restore_serialization = false;
      break;
    case OBJ_STRING:
      commands = SerializeString(key, pv, expire_ms);
      use_restore_serialization = false;
      // reset expire_ms to skip it in SerializeExpireIfNeeded
      expire_ms = 0;
      break;
    case OBJ_STREAM:
    case OBJ_JSON:
    case OBJ_SBF:
    default:
      // These types are unsupported wrt splitting huge values to multiple commands, so we send
      // them as a RESTORE command.
      break;
  }

  if (use_restore_serialization) {
    // RESTORE sets STICK and EXPIRE as part of the command.
    SerializeRestore(key, pk, pv, expire_ms);
  } else {
    SerializeStickIfNeeded(key, pk);
    SerializeExpireIfNeeded(key, expire_ms);
  }
  return commands;
}

size_t CmdSerializer::SerializeDelayedEntries(bool force,
                                              absl::flat_hash_set<std::string>* tiered_keys) {
  size_t serialized = 0;
  for (auto it = delayed_entries_.begin(); it != delayed_entries_.end();) {
    auto& entry = it->second;
    // Skip unresolved entries unless force is true
    if (!force && !entry->value.IsResolved()) {
      ++it;
      continue;
    }

    // If tiered_keys filter is provided, only serialize matching keys
    // Compare the string key from the map with the keys in tiered_keys set
    if (tiered_keys && !tiered_keys->contains(it->first)) {
      ++it;
      continue;
    }

    // Get the value from the future (blocks if not resolved and force=true)
    auto res = entry->value.Get();
    if (!res.has_value()) {
      LOG(ERROR) << "Failed to read delayed entry for key " << entry->key.ToString();
      it++;
      continue;
    }

    // Serialize the entry and remove it from delayed_entries_
    PrimeValue pv{*res};
    serialized += SerializeEntry(entry->key.ToString(), entry->key, pv, entry->expire);
    delayed_entries_.erase(it++);
  }
  return serialized;
}

void CmdSerializer::SerializeCommand(string_view cmd, absl::Span<const string_view> args) {
  journal::Entry entry(0,                     // txid
                       journal::Op::COMMAND,  // single command
                       0,                     // db index
                       0,                     // slot-id, but it is ignored at this level
                       journal::Entry::Payload(cmd, ArgSlice(args)));

  // Serialize into a string
  io::StringSink cmd_sink;
  JournalWriter writer{&cmd_sink};
  writer.Write(entry);

  cb_(std::move(cmd_sink).str());
}

void CmdSerializer::SerializeStickIfNeeded(string_view key, const PrimeKey& pk) {
  if (!pk.IsSticky()) {
    return;
  }

  SerializeCommand("STICK", {key});
}

void CmdSerializer::SerializeExpireIfNeeded(string_view key, uint64_t expire_ms) {
  if (expire_ms == 0) {
    return;
  }

  SerializeCommand("PEXPIREAT", {key, absl::StrCat(expire_ms)});
}

size_t CmdSerializer::SerializeSet(string_view key, const PrimeValue& pv) {
  CommandAggregator aggregator(
      key, [&](absl::Span<const string_view> args) { SerializeCommand("SADD", args); },
      max_serialization_buffer_size_);

  size_t commands = 0;
  container_utils::IterateSet(pv, [&](container_utils::ContainerEntry ce) {
    commands += aggregator.AddArg(ce.ToString());
    return true;
  });
  return commands;
}

size_t CmdSerializer::SerializeZSet(string_view key, const PrimeValue& pv) {
  CommandAggregator aggregator(
      key, [&](absl::Span<const string_view> args) { SerializeCommand("ZADD", args); },
      max_serialization_buffer_size_);

  size_t commands = 0;
  container_utils::IterateSortedSet(
      pv,
      [&](container_utils::ContainerEntry ce, double score) {
        aggregator.AddArg(absl::StrCat(score), CommandAggregator::CommitMode::kNoCommit);
        commands += aggregator.AddArg(ce.ToString());
        return true;
      },
      /*start=*/0, /*end=*/SIZE_MAX, /*reverse=*/false, /*use_score=*/true);
  return commands;
}

size_t CmdSerializer::SerializeHash(string_view key, const PrimeValue& pv) {
  CommandAggregator aggregator(
      key, [&](absl::Span<const string_view> args) { SerializeCommand("HSET", args); },
      max_serialization_buffer_size_);

  size_t commands = 0;
  container_utils::IterateMap(
      pv, [&](container_utils::ContainerEntry k, container_utils::ContainerEntry v) {
        aggregator.AddArg(k.ToString(), CommandAggregator::CommitMode::kNoCommit);
        commands += aggregator.AddArg(v.ToString());
        return true;
      });
  return commands;
}

size_t CmdSerializer::SerializeList(string_view key, const PrimeValue& pv) {
  CommandAggregator aggregator(
      key, [&](absl::Span<const string_view> args) { SerializeCommand("RPUSH", args); },
      max_serialization_buffer_size_);

  size_t commands = 0;
  container_utils::IterateList(pv, [&](container_utils::ContainerEntry ce) {
    commands += aggregator.AddArg(ce.ToString());
    return true;
  });
  return commands;
}

size_t CmdSerializer::SerializeString(string_view key, const PrimeValue& pv, uint64_t expire_ms) {
  string str;
  if (pv.IsExternal()) {
    if (pv.IsCool()) {
      pv.GetCool().record->value.GetString(&str);
    } else {
      SerializeExternal(key, pv, expire_ms);
      return 0;
    }
  } else {
    pv.GetString(&str);
  }

  if (expire_ms) {
    std::string expire_ms_str = to_string(expire_ms);
    std::string_view args[] = {key, string_view(str), "PXAT", string_view(expire_ms_str)};
    SerializeCommand("SET", args);
  } else {
    std::string_view args[] = {key, string_view(str)};
    SerializeCommand("SET", args);
  }

  return 1;
}

void CmdSerializer::SerializeRestore(string_view key, const PrimeKey& pk, const PrimeValue& pv,
                                     uint64_t expire_ms) {
  absl::InlinedVector<string_view, 5> args;
  args.push_back(key);

  string expire_str = absl::StrCat(expire_ms);
  args.push_back(expire_str);

  // TODO we already ignore CRC in the load rdb code during migration, we need to provide ignore_crc
  // = true when we are sure that all shards ignore crc during migration process
  std::string value_dump = RdbSerializerBase::DumpValue(serializer_.get(), pv, false);
  args.push_back(value_dump);

  args.push_back("ABSTTL");  // Means expire string is since epoch

  if (pk.IsSticky()) {
    args.push_back("STICK");
  }

  SerializeCommand("RESTORE", args);
}

void CmdSerializer::SerializeExternal(std::string_view key, const PrimeValue& pv,
                                      time_t expire_time) {
  // In cluster mode, db_id is always 0
  constexpr DbIndex kClusterDbId = 0;
  auto future = ReadTieredString(kClusterDbId, key, pv, EngineShard::tlocal()->tiered_storage());
  PrimeKey prime_key{key};
  uint32_t mc_flags = pv.HasFlag() ? db_slice_->GetMCFlag(kClusterDbId, prime_key) : 0;
  auto entry = std::make_unique<TieredDelayedEntry>(kClusterDbId, std::move(prime_key),
                                                    std::move(future), expire_time, mc_flags);
  delayed_entries_.emplace(key, std::move(entry));
}

}  // namespace dfly


================================================
FILE: src/server/journal/cmd_serializer.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <string>
#include <string_view>

#include "server/table.h"
#include "server/tiered_storage.h"
#include "server/tx_base.h"

namespace dfly {

class RdbSerializer;

// CmdSerializer serializes DB entries (key+value) into command(s) in RESP format string.
// Small entries are serialized as RESTORE commands, while bigger ones (see
// serialization_max_chunk_size) are split into multiple commands (like rpush, hset, etc).
// Expiration and stickiness are also serialized into commands.
class CmdSerializer {
 public:
  using FlushSerialized = std::function<void(std::string)>;

  explicit CmdSerializer(DbSlice* db_slice, FlushSerialized cb,
                         size_t max_serialization_buffer_size);

  // Returns how many commands we broke this entry into (like multiple HSETs etc)
  size_t SerializeEntry(std::string_view key, const PrimeKey& pk, const PrimeValue& pv,
                        uint64_t expire_ms);

  // Serialize delayed entries. If force is true, blocks until all are resolved.
  // If force is false, only serializes entries whose futures are already resolved.
  // If tiered_keys is provided, only serializes entries whose keys are in the set.
  size_t SerializeDelayedEntries(bool force, absl::flat_hash_set<std::string>* tiered_keys);

 private:
  void SerializeCommand(std::string_view cmd, absl::Span<const std::string_view> args);
  void SerializeStickIfNeeded(std::string_view key, const PrimeKey& pk);
  void SerializeExpireIfNeeded(std::string_view key, uint64_t expire_ms);

  size_t SerializeSet(std::string_view key, const PrimeValue& pv);
  size_t SerializeZSet(std::string_view key, const PrimeValue& pv);
  size_t SerializeHash(std::string_view key, const PrimeValue& pv);
  size_t SerializeList(std::string_view key, const PrimeValue& pv);
  size_t SerializeString(std::string_view key, const PrimeValue& pv, uint64_t expire_ms);
  void SerializeRestore(std::string_view key, const PrimeKey& pk, const PrimeValue& pv,
                        uint64_t expire_ms);
  void SerializeExternal(std::string_view key, const PrimeValue& pv, time_t expire_time);

  DbSlice* db_slice_;
  FlushSerialized cb_;
  size_t max_serialization_buffer_size_;
  std::unique_ptr<RdbSerializer> serializer_;
  absl::flat_hash_map<std::string, std::unique_ptr<TieredDelayedEntry>> delayed_entries_;
};

}  // namespace dfly


================================================
FILE: src/server/journal/executor.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/executor.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>

#include <algorithm>
#include <memory>

#include "base/logging.h"
#include "facade/reply_capture.h"
#include "facade/service_interface.h"
#include "server/main_service.h"
#include "server/namespaces.h"

using namespace std;

namespace dfly {

namespace {
// Build a CmdData from parts passed to absl::StrCat.
template <typename... Ts> void BuildFromParts(cmn::BackedArguments* dest, Ts... parts) {
  vector<string> raw_parts{absl::StrCat(std::forward<Ts>(parts))...};

  dest->Assign(raw_parts.begin(), raw_parts.end(), raw_parts.size());
}

}  // namespace

JournalExecutor::JournalExecutor(Service* service)
    : service_{service},
      reply_builder_{new facade::CapturingReplyBuilder{facade::ReplyMode::NONE}},
      conn_context_{nullptr, acl::UserCredentials{}} {
  conn_context_.is_replicating = true;
  conn_context_.journal_emulated = true;
  conn_context_.skip_acl_validation = true;
  conn_context_.ns = &namespaces->GetDefaultNamespace();
}

JournalExecutor::~JournalExecutor() {
}

facade::DispatchResult JournalExecutor::Execute(DbIndex dbid, journal::ParsedEntry::CmdData& cmd) {
  SelectDb(dbid);
  CommandContext cntx_cmd;
  cntx_cmd.Init(reply_builder_.get(), &conn_context_);

  // TODO: we should improve interfaces in callers (replica and rdb_load) so that we pass
  // CommandContext directly and avoid this swap.
  cntx_cmd.SwapArgs(cmd);
  return Execute(&cntx_cmd);
}

void JournalExecutor::FlushAll() {
  CommandContext cmd;
  cmd.Init(reply_builder_.get(), &conn_context_);
  BuildFromParts(&cmd, "FLUSHALL");
  std::ignore = Execute(&cmd);
}

void JournalExecutor::FlushSlots(const cluster::SlotRange& slot_range) {
  CommandContext cmd;
  cmd.Init(reply_builder_.get(), &conn_context_);
  BuildFromParts(&cmd, "DFLYCLUSTER", "FLUSHSLOTS", slot_range.start, slot_range.end);
  std::ignore = Execute(&cmd);
}

facade::DispatchResult JournalExecutor::Execute(CommandContext* cmd_cntx) {
  return service_->DispatchCommand(facade::ParsedArgs{*cmd_cntx}, cmd_cntx,
                                   facade::AsyncPreference::ONLY_SYNC);
}

void JournalExecutor::SelectDb(DbIndex dbid) {
  if (ensured_dbs_.size() <= dbid)
    ensured_dbs_.resize(dbid + 1);

  if (!ensured_dbs_[dbid]) {
    CommandContext cmd;

    cmd.Init(reply_builder_.get(), &conn_context_);
    BuildFromParts(&cmd, "SELECT", dbid);
    std::ignore = Execute(&cmd);
    ensured_dbs_[dbid] = true;

    // TODO: This is a temporary fix for #4146.
    // For some reason without this the replication breaks in regtests.
    auto cb = [](EngineShard* shard) { return OpStatus::OK; };
    shard_set->RunBriefInParallel(std::move(cb));
  } else {
    conn_context_.conn_state.db_index = dbid;
  }
}

}  // namespace dfly


================================================
FILE: src/server/journal/executor.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include "facade/service_interface.h"
#include "server/cluster/cluster_defs.h"
#include "server/conn_context.h"
#include "server/journal/types.h"

namespace facade {
class CapturingReplyBuilder;
}  // namespace facade

namespace dfly {

class Service;

// JournalExecutor allows executing journal entries.
class JournalExecutor {
 public:
  explicit JournalExecutor(Service* service);
  ~JournalExecutor();

  JournalExecutor(JournalExecutor&&) = delete;

  // Returns the result of Service::DispatchCommand
  facade::DispatchResult Execute(DbIndex dbid, journal::ParsedEntry::CmdData& cmd);

  void FlushAll();  // Execute FLUSHALL.
  void FlushSlots(const cluster::SlotRange& slot_range);

  ConnectionContext* connection_context() {
    return &conn_context_;
  }

 private:
  facade::DispatchResult Execute(CommandContext* cmd_cntx);

  // Select database. Ensure it exists if accessed for first time.
  void SelectDb(DbIndex dbid);

  Service* service_;
  std::unique_ptr<facade::CapturingReplyBuilder> reply_builder_;
  ConnectionContext conn_context_;

  std::vector<bool> ensured_dbs_;
};

}  // namespace dfly


================================================
FILE: src/server/journal/journal.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/journal.h"

#include "base/logging.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal_slice.h"

namespace dfly {
namespace journal {

using namespace std;
using namespace util;

namespace {

// Active only in shard threads.
thread_local JournalSlice journal_slice;

}  // namespace

void StartInThread() {
  journal_slice.Init();

  EngineShard* shard = EngineShard::tlocal();
  shard->set_journal(true);
}

void StartInThreadAtLsn(LSN lsn) {
  StartInThread();
  journal_slice.ResetRingBuffer();
  journal_slice.SetStartingLSN(lsn);
}

error_code Close() {
  VLOG(1) << "Journal::Close";

  auto close_cb = [&](auto* shard) {
    journal_slice.ResetRingBuffer();
    shard->set_journal(false);
  };

  shard_set->RunBriefInParallel(close_cb);

  return {};
}

bool HasRegisteredCallbacks() {
  return journal_slice.HasRegisteredCallbacks();
}

bool IsLSNInBuffer(LSN lsn) {
  return journal_slice.IsLSNInBuffer(lsn);
}

std::string_view GetEntry(LSN lsn) {
  return journal_slice.GetEntry(lsn);
}

uint32_t RegisterConsumer(JournalConsumerInterface* consumer) {
  return journal_slice.RegisterOnChange(consumer);
}

void UnregisterConsumer(uint32_t id) {
  journal_slice.UnregisterOnChange(id);
}

LSN GetLsn() {
  return journal_slice.cur_lsn();
}

void RecordEntry(TxId txid, Op opcode, DbIndex dbid, std::optional<SlotId> slot,
                 Entry::Payload payload) {
  journal_slice.AddLogRecord(Entry{txid, opcode, dbid, slot, std::move(payload)});
}

void SetFlushMode(bool allow_flush) {
  journal_slice.SetFlushMode(allow_flush);
}

size_t LsnBufferSize() {
  return journal_slice.GetRingBufferSize();
}

size_t LsnBufferBytes() {
  return journal_slice.GetRingBufferBytes();
}

size_t thread_local DisableFlushGuard::counter_ = 0;

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/journal/journal.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once
#include "server/journal/types.h"
#include "util/fibers/detail/fiber_interface.h"

namespace dfly {

namespace journal {

void StartInThread();

// Starts the journal at specified LSN
// Also drops the (resets) the partial sync buffers
void StartInThreadAtLsn(LSN lsn);

std::error_code Close();

//******* The following functions must be called in the context of the owning shard *********//

bool HasRegisteredCallbacks();

bool IsLSNInBuffer(LSN lsn);

std::string_view GetEntry(LSN lsn);

LSN GetLsn();
uint32_t RegisterConsumer(JournalConsumerInterface* consumer);
void UnregisterConsumer(uint32_t id);

void RecordEntry(TxId txid, Op opcode, DbIndex dbid, std::optional<SlotId> slot,
                 Entry::Payload payload);

size_t LsnBufferSize();
size_t LsnBufferBytes();

void SetFlushMode(bool allow_flush);

class DisableFlushGuard {
 public:
  explicit DisableFlushGuard(bool j) : journal_(j) {
    if (journal_ && counter_ == 0) {
      SetFlushMode(false);
    }
    util::fb2::detail::EnterFiberAtomicSection();
    ++counter_;
  }

  ~DisableFlushGuard() {
    util::fb2::detail::LeaveFiberAtomicSection();
    --counter_;
    if (journal_ && counter_ == 0) {
      SetFlushMode(true);  // Restore the state on destruction
    }
  }

  DisableFlushGuard(const DisableFlushGuard&) = delete;
  DisableFlushGuard& operator=(const DisableFlushGuard&) = delete;

 private:
  bool journal_;
  static size_t thread_local counter_;
};

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/journal/journal_slice.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/journal_slice.h"

#include <absl/container/inlined_vector.h>
#include <absl/flags/flag.h>
#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>
#include <fcntl.h>

#include <filesystem>

#include "base/function2.hpp"
#include "base/logging.h"
#include "server/journal/serializer.h"
#include "util/fibers/fibers.h"

ABSL_FLAG(uint32_t, shard_repl_backlog_len, 8192,
          "The length of the circular replication log per shard");

namespace dfly {
namespace journal {
using namespace std;
using namespace util;

JournalSlice::JournalSlice() {
}

JournalSlice::~JournalSlice() {
}

void JournalSlice::Init() {
  // calling this function multiple times is allowed and it's a no-op.
  if (ring_buffer_.capacity() > 0)
    return;

  ring_buffer_.set_capacity(absl::GetFlag(FLAGS_shard_repl_backlog_len));
  ring_buffer_bytes_ = ring_buffer_.capacity() * sizeof(JournalItem);
}

bool JournalSlice::IsLSNInBuffer(LSN lsn) const {
  DCHECK(ring_buffer_.capacity() > 0);

  if (ring_buffer_.empty()) {
    return false;
  }

  if (ring_buffer_.size() == 1) {
    return ring_buffer_.front().lsn == lsn;
  }

  return ring_buffer_.front().lsn <= lsn && lsn <= ring_buffer_.back().lsn;
}

std::string_view JournalSlice::GetEntry(LSN lsn) const {
  DCHECK(ring_buffer_.capacity() > 0 && IsLSNInBuffer(lsn));

  auto start = ring_buffer_.front().lsn;
  DCHECK(ring_buffer_[lsn - start].lsn == lsn);
  return ring_buffer_[lsn - start].data;
}

void JournalSlice::SetFlushMode(bool allow_flush) {
  DCHECK(allow_flush != enable_journal_flush_);
  enable_journal_flush_ = allow_flush;
  if (allow_flush) {
    // This lock is never blocking because it contends with UnregisterOnChange, which is cpu only.
    // Hence this lock prevents the UnregisterOnChange to start running in the middle of
    // SetFlushMode.
    std::shared_lock lk(cb_mu_);
    for (auto k_v : journal_consumers_arr_) {
      k_v.second->ThrottleIfNeeded();
    }
  }
}

void JournalSlice::AddLogRecord(const Entry& entry) {
  DCHECK(ring_buffer_.capacity() > 0);

  JournalChangeItem item;

  {
    FiberAtomicGuard fg;
    item.journal_item.lsn = lsn_++;

    // only used by RestoreStreamer
    item.cmd = entry.payload.cmd;
    item.slot = entry.slot;

    io::StringSink sink;
    JournalWriter writer{&sink};
    writer.Write(entry);

    std::move(sink).str().swap(item.journal_item.data);

    if (item.journal_item.data.size() > 32) {
      // for non-SSO strings capacity should not be much higher than size.
      DCHECK_LE(item.journal_item.data.capacity(), item.journal_item.data.size() * 2);
    }
    VLOG(2) << "Writing item [" << item.journal_item.lsn << "]: " << entry.ToString();
  }

  CallOnChange(&item);
}

void JournalSlice::CallOnChange(JournalChangeItem* change_item) {
  // This lock is never blocking because it contends with UnregisterOnChange, which is cpu only.
  // Hence this lock prevents the UnregisterOnChange to start running in the middle of CallOnChange.
  // CallOnChange is atomic if JournalSlice::SetFlushMode(false) is called before.
  std::shared_lock lk(cb_mu_);
  for (auto k_v : journal_consumers_arr_) {
    k_v.second->ConsumeJournalChange(*change_item);
  }
  auto& item = change_item->journal_item;

  // We preserve order here. After ConsumeJournalChange there can reordering
  if (ring_buffer_.size() == ring_buffer_.capacity()) {
    const size_t bytes_removed = ring_buffer_.front().data.capacity();
    DCHECK_GE(ring_buffer_bytes_, bytes_removed);
    ring_buffer_bytes_ -= bytes_removed;
  }
  if (!ring_buffer_.empty()) {
    DCHECK(item.lsn == ring_buffer_.back().lsn + 1);
  }
  ring_buffer_.push_back(std::move(item));
  auto& data = ring_buffer_.back().data;

  // Small strings assignment keep the existing capacity intact due to SSO.
  // Shrink strings in this case to prevent excessive memory usage.
  if (data.size() < 32 && data.capacity() > 64) {
    data.shrink_to_fit();
  }
  ring_buffer_bytes_ += data.capacity();

  if (enable_journal_flush_) {
    for (auto k_v : journal_consumers_arr_) {
      k_v.second->ThrottleIfNeeded();
    }
  }
}

uint32_t JournalSlice::RegisterOnChange(JournalConsumerInterface* consumer) {
  // mutex lock isn't needed due to iterators are not invalidated
  uint32_t id = next_cb_id_++;
  journal_consumers_arr_.emplace_back(id, consumer);
  return id;
}

void JournalSlice::UnregisterOnChange(uint32_t id) {
  // we need to wait until callback is finished before remove it
  lock_guard lk(cb_mu_);
  auto it = find_if(journal_consumers_arr_.begin(), journal_consumers_arr_.end(),
                    [id](const auto& e) { return e.first == id; });
  CHECK(it != journal_consumers_arr_.end());
  journal_consumers_arr_.erase(it);
}

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/journal/journal_slice.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <boost/circular_buffer.hpp>
#include <optional>
#include <shared_mutex>
#include <string_view>

#include "server/journal/types.h"
#include "util/fibers/synchronization.h"

namespace dfly {
namespace journal {

// Journal slice is present for both shards and io threads.
class JournalSlice {
 public:
  JournalSlice();
  ~JournalSlice();

  void Init();

  // This is always the LSN of the *next* journal entry.
  LSN cur_lsn() const {
    return lsn_;
  }

  std::error_code status() const {
    return status_ec_;
  }

  void AddLogRecord(const Entry& entry);

  // Register a callback that will be called every time a new entry is
  // added to the journal.
  // The callback receives the entry and a boolean that indicates whether
  // awaiting (to apply backpressure) is allowed.
  uint32_t RegisterOnChange(JournalConsumerInterface* consumer);
  void UnregisterOnChange(uint32_t);

  bool HasRegisteredCallbacks() const {
    return !journal_consumers_arr_.empty();
  }

  /// Returns whether the journal entry with this LSN is available
  /// from the buffer.
  bool IsLSNInBuffer(LSN lsn) const;
  std::string_view GetEntry(LSN lsn) const;
  // SetFlushMode with allow_flush=false is used to disable preemptions during
  // subsequent calls to AddLogRecord.
  // SetFlushMode with allow_flush=true flushes all log records aggregated
  // since the last call with allow_flush=false. This call may preempt.
  // The caller must ensure that no preemptions occur between the initial call
  // with allow_flush=false and the subsequent call with allow_flush=true.
  void SetFlushMode(bool allow_flush);

  size_t GetRingBufferSize() const {
    return ring_buffer_.size();
  }

  size_t GetRingBufferBytes() const {
    return ring_buffer_bytes_;
  }

  void ResetRingBuffer() {
    ring_buffer_.clear();
    ring_buffer_bytes_ = ring_buffer_.capacity() * sizeof(JournalItem);
  }

  void SetStartingLSN(LSN lsn) {
    lsn_ = lsn;
  }

 private:
  void CallOnChange(JournalChangeItem* item);
  boost::circular_buffer<JournalItem> ring_buffer_;

  mutable util::fb2::SharedMutex cb_mu_;  // to prevent removing callback during call
  std::list<std::pair<uint32_t, JournalConsumerInterface*>> journal_consumers_arr_;

  LSN lsn_ = 1;

  uint32_t next_cb_id_ = 1;
  std::error_code status_ec_;
  bool enable_journal_flush_ = true;

  size_t ring_buffer_bytes_ = 0;
};

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/journal/journal_test.cc
================================================
#include <boost/circular_buffer.hpp>
#include <random>
#include <string>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "server/common.h"
#include "server/journal/pending_buf.h"
#include "server/journal/serializer.h"
#include "server/journal/types.h"
#include "server/serializer_commons.h"
#include "util/fibers/fibers.h"

using namespace testing;
using namespace std;
using namespace util;

namespace dfly {
namespace journal {
template <typename T> string ConCat(const T& list) {
  string res;
  for (auto arg : list) {
    res += string_view{arg.data(), arg.size()};
    res += ' ';
  }
  return res;
}

template <> string ConCat(const CmdArgList& list) {
  string res;
  for (auto arg : list) {
    res += facade::ToSV(arg);
    res += ' ';
  }
  return res;
}

struct EntryPayloadVisitor {
  void operator()(const Entry::Payload& p) {
    out->append(p.cmd).append(" ");
    *out += visit([](const auto& args) { return ConCat(args); }, p.args);
  }

  string* out;
};

// Extract payload from entry in string form.
std::string ExtractPayload(ParsedEntry& entry) {
  std::string out = ConCat(entry.cmd);

  if (!out.empty())
    out.pop_back();

  return out;
}

std::string ExtractPayload(Entry& entry) {
  std::string out;
  EntryPayloadVisitor visitor{&out};
  visitor(entry.payload);

  if (!out.empty())
    out.pop_back();

  return out;
}

// Mock non-owned types with underlying storage.
using StoredSlices = vector<vector<string_view>>;
using StoredLists = vector<pair<vector<string>, CmdArgVec>>;

template <typename... Ss> ArgSlice StoreSlice(StoredSlices* vec, Ss... strings) {
  vec->emplace_back(initializer_list<string_view>{strings...});
  return ArgSlice{vec->back().data(), vec->back().size()};
}

template <typename... Ss> CmdArgList StoreList(StoredLists* vec, Ss... strings) {
  vector<string> stored_strings{strings...};
  CmdArgVec out;
  for (auto& s : stored_strings) {
    out.emplace_back(s.data(), s.size());
  }

  vec->emplace_back(std::move(stored_strings), std::move(out));
  auto& arg_vec = vec->back().second;
  return CmdArgList{arg_vec.data(), arg_vec.size()};
}

// Test serializing and de-serializing entries.
TEST(Journal, WriteRead) {
  StoredSlices slices{};
  StoredLists lists{};

  auto slice = [v = &slices](auto... ss) { return StoreSlice(v, ss...); };
  auto list = [v = &lists](auto... ss) { return StoreList(v, ss...); };
  using Payload = Entry::Payload;

  std::vector<Entry> test_entries = {
      {0, Op::COMMAND, 0, nullopt, Payload("MSET", slice("A", "1", "B", "2"))},
      {0, Op::COMMAND, 0, nullopt, Payload("MSET", slice("C", "3"))},
      {1, Op::COMMAND, 0, nullopt, Payload("DEL", list("A", "B"))},
      {2, Op::COMMAND, 1, nullopt, Payload("LPUSH", list("l", "v1", "v2"))},
      {3, Op::COMMAND, 0, nullopt, Payload("MSET", slice("D", "4"))},
      {4, Op::COMMAND, 1, nullopt, Payload("DEL", list("l1"))},
      {5, Op::COMMAND, 2, nullopt, Payload("DEL", list("E", "2"))}};

  // Write all entries to a buffer.
  base::IoBuf buf;
  io::BufSink sink{&buf};

  JournalWriter writer{&sink};
  for (const auto& entry : test_entries) {
    writer.Write(entry);
  }

  // Read them back.
  io::BufSource source{&buf};
  JournalReader reader{&source, 0};

  ParsedEntry res;
  for (unsigned i = 0; i < test_entries.size(); i++) {
    auto& expected = test_entries[i];

    auto ec = reader.ReadEntry(&res);
    ASSERT_FALSE(ec);

    ASSERT_EQ(expected.opcode, res.opcode);
    ASSERT_EQ(expected.txid, res.txid);
    ASSERT_EQ(expected.dbid, res.dbid);
    ASSERT_EQ(ExtractPayload(expected), ExtractPayload(res));
  }
}

TEST(Journal, PendingBuf) {
  PendingBuf pbuf;

  ASSERT_TRUE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), 0);

  pbuf.Push("one");
  pbuf.Push(" smallllllllllllllllllllllllllllllll");
  pbuf.Push(" test");

  ASSERT_FALSE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), 44);

  {
    auto& sending_buf = pbuf.PrepareSendingBuf();
    ASSERT_EQ(sending_buf.buf.size(), 3);
    ASSERT_EQ(sending_buf.mem_size, 44);

    ASSERT_EQ(sending_buf.buf[0], "one");
    ASSERT_EQ(sending_buf.buf[1], " smallllllllllllllllllllllllllllllll");
    ASSERT_EQ(sending_buf.buf[2], " test");
  }

  const size_t string_num = PendingBuf::Buf::kMaxBufSize + 1000;
  std::vector<std::string> test_data;
  test_data.reserve(string_num);

  absl::InsecureBitGen gen;

  for (size_t i = 0; i < string_num; ++i) {
    auto str = GetRandomHex(gen, 10, 90);
    test_data.push_back(str);
    pbuf.Push(std::move(str));
  }

  const size_t test_data_size =
      std::accumulate(test_data.begin(), test_data.end(), 0,
                      [](size_t size, const auto& s) { return s.size() + size; });

  ASSERT_FALSE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), 44 + test_data_size);

  pbuf.Pop();

  ASSERT_FALSE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), test_data_size);

  {
    auto& sending_buf = pbuf.PrepareSendingBuf();

    const size_t send_buf_size =
        std::accumulate(test_data.begin(), test_data.begin() + PendingBuf::Buf::kMaxBufSize, 0,
                        [](size_t size, const auto& s) { return s.size() + size; });

    ASSERT_EQ(sending_buf.buf.size(), PendingBuf::Buf::kMaxBufSize);
    ASSERT_EQ(sending_buf.mem_size, send_buf_size);

    for (size_t i = 0; i < sending_buf.buf.size(); ++i) {
      ASSERT_EQ(sending_buf.buf[i], test_data[i]);
    }
  }

  pbuf.Pop();

  test_data.erase(test_data.begin(), test_data.begin() + PendingBuf::Buf::kMaxBufSize);

  const size_t last_buf_size =
      std::accumulate(test_data.begin(), test_data.end(), 0,
                      [](size_t size, const auto& s) { return s.size() + size; });

  ASSERT_FALSE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), last_buf_size);

  {
    auto& sending_buf = pbuf.PrepareSendingBuf();

    ASSERT_EQ(sending_buf.buf.size(), 1000);
    ASSERT_EQ(sending_buf.mem_size, last_buf_size);

    for (size_t i = 0; i < sending_buf.buf.size(); ++i) {
      ASSERT_EQ(sending_buf.buf[i], test_data[i]);
    }
  }

  pbuf.Pop();

  ASSERT_TRUE(pbuf.Empty());
  ASSERT_EQ(pbuf.Size(), 0);
}

TEST(Journal, CircularMemory) {
  boost::circular_buffer<string> ring_buffer(1024);
  for (int i = 0; i < 2000; ++i) {
    ring_buffer.push_back(string(512, 'a'));
  }

  size_t cap = 0;
  for (size_t i = 0; i < ring_buffer.size(); ++i) {
    cap += ring_buffer[i].capacity();
  }
  LOG(INFO) << "Total capacity: " << cap;
  for (size_t i = 0; i < 2000; ++i) {
    ring_buffer.push_back(string(16, 'a'));
  }
  cap = 0;
  for (size_t i = 0; i < ring_buffer.size(); ++i) {
    cap += ring_buffer[i].capacity();
  }
  LOG(INFO) << "Total capacity after push: " << cap;

  string tmp(1 << 16, 'x');
  tmp = string(4, 'a');
  LOG(INFO) << "Tmp string capacity: " << tmp.capacity();
  tmp = string(32, 'a');
  LOG(INFO) << "Tmp string capacity: " << tmp.capacity();
}

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/journal/pending_buf.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/inlined_vector.h>

#include <cassert>
#include <deque>
#include <numeric>

namespace dfly {

class PendingBuf {
 public:
  struct Buf {
    size_t mem_size = 0;
    absl::InlinedVector<std::string, 8> buf;

#ifdef UIO_MAXIOV
    static constexpr size_t kMaxBufSize = UIO_MAXIOV;
#else
    static constexpr size_t kMaxBufSize = 1024;
#endif
  };

  PendingBuf() : bufs_(1) {
  }

  bool Empty() const {
    return std::all_of(bufs_.begin(), bufs_.end(), [](const auto& b) { return b.buf.empty(); });
  }

  void Push(std::string str) {
    assert(!bufs_.empty());
    if (bufs_.back().buf.size() == Buf::kMaxBufSize) {
      bufs_.emplace_back();
    }
    auto& front_buf = bufs_.back();
    front_buf.mem_size += str.size();
    front_buf.buf.push_back(std::move(str));
  }

  // should be called to get the next buffer for sending
  const Buf& PrepareSendingBuf() {
    // Adding to the buffer ensures that future `Push()`es will not modify the in-flight buffer
    if (bufs_.size() == 1) {
      bufs_.emplace_back();
    }
    return bufs_.front();
  }

  size_t FrontBufSize() const {
    return bufs_.front().mem_size;
  }

  // should be called when the buf from PrepareSendingBuf() method was sent
  void Pop() {
    assert(bufs_.size() >= 2);
    bufs_.pop_front();
  }

  size_t Size() const {
    return std::accumulate(bufs_.begin(), bufs_.end(), 0,
                           [](size_t s, const auto& b) { return s + b.mem_size; });
  }

 private:
  std::deque<Buf> bufs_;
};

}  // namespace dfly


================================================
FILE: src/server/journal/serializer.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/serializer.h"

#include <system_error>

#include "base/logging.h"
#include "glog/logging.h"
#include "io/io.h"
#include "io/io_buf.h"
#include "server/error.h"
#include "server/journal/types.h"
#include "server/main_service.h"
#include "server/serializer_commons.h"
#include "server/transaction.h"

using namespace std;

namespace dfly {

JournalWriter::JournalWriter(io::Sink* sink) : sink_{sink} {
}

void JournalWriter::Write(uint64_t v) {
  uint8_t buf[10];
  unsigned len = WritePackedUInt(v, buf);
  sink_->Write(io::Bytes{buf}.first(len));
}

void JournalWriter::Write(std::string_view sv) {
  Write(sv.size());
  if (!sv.empty())  // arguments can be empty strings
    sink_->Write(io::Buffer(sv));
}

void JournalWriter::Write(const journal::Entry::Payload& payload) {
  if (payload.cmd.empty())
    return;

  size_t num_elems = 0, size = 0;
  for (string_view str : base::it::Wrap(cmn::kToSV, payload.args)) {
    num_elems++;
    size += str.size();
  };

  Write(1 + num_elems);

  size_t cmd_size = payload.cmd.size() + size;
  Write(cmd_size);
  Write(payload.cmd);

  for (string_view str : base::it::Wrap(cmn::kToSV, payload.args))
    this->Write(str);
}

void JournalWriter::Write(const journal::Entry& entry) {
  // Check if entry has a new db index and we need to emit a SELECT entry.
  if (entry.opcode != journal::Op::SELECT && entry.opcode != journal::Op::LSN &&
      entry.opcode != journal::Op::PING && (!cur_dbid_ || entry.dbid != *cur_dbid_)) {
    Write(journal::Entry{journal::Op::SELECT, entry.dbid, entry.slot});
    cur_dbid_ = entry.dbid;
  }

  VLOG(1) << "Writing entry " << entry.ToString();

  Write(uint8_t(entry.opcode));

  switch (entry.opcode) {
    case journal::Op::SELECT:
      return Write(entry.dbid);
    case journal::Op::LSN:
      return Write(entry.lsn);
    case journal::Op::PING:
      return;
    case journal::Op::COMMAND:
      Write(entry.txid);
      Write(1u);  // deprecated field, kept for backward compatibility.
      Write(entry.payload);
      break;
    default:
      LOG(FATAL) << "Unknown journal opcode: " << static_cast<int>(entry.opcode);
      break;
  };
}

JournalReader::JournalReader(io::Source* source, DbIndex dbid)
    : source_{source}, buf_{4096}, dbid_{dbid} {
}

void JournalReader::SetSource(io::Source* source) {
  CHECK_EQ(buf_.InputLen(), 0ULL);
  source_ = source;
}

std::error_code JournalReader::EnsureRead(size_t num) {
  // Check if we already have enough.
  if (buf_.InputLen() >= num)
    return {};

  uint64_t remainder = num - buf_.InputLen();
  buf_.EnsureCapacity(remainder);

  // Try reading at least how much we need, but possibly more
  uint64_t read;
  SET_OR_RETURN(source_->ReadAtLeast(buf_.AppendBuffer(), remainder), read);

  // Happens on end of stream (for example, a too-small string buffer or a closed socket)
  if (read < remainder) {
    return make_error_code(errc::io_error);
  }

  buf_.CommitWrite(read);
  return {};
}

template <typename UT> io::Result<UT> JournalReader::ReadUInt() {
  // Determine type and number of following bytes.
  if (auto ec = EnsureRead(1); ec)
    return make_unexpected(ec);
  PackedUIntMeta meta{buf_.InputBuffer()[0]};
  buf_.ConsumeInput(1);

  if (auto ec = EnsureRead(meta.ByteSize()); ec)
    return make_unexpected(ec);

  // Read and check intenger.
  uint64_t res;
  SET_OR_UNEXPECT(ReadPackedUInt(meta, buf_.InputBuffer()), res);
  buf_.ConsumeInput(meta.ByteSize());

  if (res > std::numeric_limits<UT>::max())
    return make_unexpected(make_error_code(errc::result_out_of_range));
  return static_cast<UT>(res);
}

template io::Result<uint8_t> JournalReader::ReadUInt<uint8_t>();
template io::Result<uint16_t> JournalReader::ReadUInt<uint16_t>();
template io::Result<uint32_t> JournalReader::ReadUInt<uint32_t>();
template io::Result<uint64_t> JournalReader::ReadUInt<uint64_t>();

std::error_code JournalReader::ReadString(io::MutableBytes buffer) {
  size_t size = buffer.size();
  uint64_t available = std::min(size, buf_.InputLen());
  uint64_t remainder = 0;

  if (available < size) {
    remainder = size - available;
  }

  buf_.ReadAndConsume(available, buffer.data());

  // If remainder of string is bigger than threshold - read and populate directly
  // output buffer otherwise use intermediate io_buf.
  bool is_short_remainder = remainder < (buf_.Capacity() / 2);

  auto remainder_buf_pos = buffer.data() + available;

  if (remainder) {
    if (is_short_remainder) {
      if (auto ec = EnsureRead(remainder); ec)
        return ec;
      buf_.ReadAndConsume(remainder, remainder_buf_pos);
    } else {
      uint64_t read;
      SET_OR_RETURN(source_->Read({remainder_buf_pos, remainder}), read);
      if (read < remainder) {
        return make_error_code(errc::io_error);
      }
    }
  }

  return {};
}

std::error_code JournalReader::ReadCommand(journal::ParsedEntry::CmdData* data) {
  size_t num_strings = 0;
  SET_OR_RETURN(ReadUInt<uint64_t>(), num_strings);

  size_t cmd_size = 0;
  SET_OR_RETURN(ReadUInt<uint64_t>(), cmd_size);

  data->Reserve(num_strings, cmd_size + num_strings /* +\0 char*/);

  // Read all strings consecutively.
  for (size_t i = 0; i < num_strings; ++i) {
    size_t size = 0;
    SET_OR_RETURN(ReadUInt<uint64_t>(), size);
    if (size > cmd_size) {  // corrupted entry
      return make_error_code(errc::io_error);
    }
    data->PushArg(size);
    uint8_t* ptr = reinterpret_cast<uint8_t*>(data->data(i));
    if (auto ec = ReadString({ptr, size}); ec)
      return ec;

    ptr[size] = '\0';  // null terminate

    cmd_size -= size;
  }

  return {};
}

std::error_code JournalReader::ReadEntry(journal::ParsedEntry* dest) {
  uint8_t int_op;
  SET_OR_RETURN(ReadUInt<uint8_t>(), int_op);
  journal::Op opcode = static_cast<journal::Op>(int_op);

  if (opcode == journal::Op::SELECT) {
    SET_OR_RETURN(ReadUInt<uint16_t>(), dbid_);
    return ReadEntry(dest);
  }

  dest->dbid = dbid_;
  dest->opcode = opcode;
  dest->cmd.clear();
  if (opcode == journal::Op::PING) {
    return {};
  }

  if (opcode == journal::Op::LSN) {
    SET_OR_RETURN(ReadUInt<uint64_t>(), dest->lsn);
    return {};
  }

  SET_OR_RETURN(ReadUInt<uint64_t>(), dest->txid);
  [[maybe_unused]] uint32_t unused;

  SET_OR_RETURN(ReadUInt<uint32_t>(), unused);

  VLOG(1) << "Read entry " << dest->ToString();

  return ReadCommand(&dest->cmd);
}

}  // namespace dfly


================================================
FILE: src/server/journal/serializer.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>
#include <string>

#include "io/io.h"
#include "io/io_buf.h"
#include "server/journal/types.h"

namespace dfly {

// JournalWriter serializes journal entries to a sink.
// It automatically keeps track of the current database index.
class JournalWriter {
 public:
  JournalWriter(io::Sink* sink);

  // Write single entry to sink.
  void Write(const journal::Entry& entry);
  void Write(uint64_t v);  // Write packed unsigned integer.

 private:
  void Write(std::string_view sv);  // Write string.
  void Write(const journal::Entry::Payload& payload);

 private:
  io::Sink* sink_;
  std::optional<DbIndex> cur_dbid_{};
};

// JournalReader allows deserializing journal entries from a source.
// Like the writer, it automatically keeps track of the database index.
struct JournalReader {
 public:
  // Initialize start database index.
  JournalReader(io::Source* source, DbIndex dbid);

  // Overwrite current source and ensure there is no leftover from previous.
  void SetSource(io::Source* source);

  // Try reading entry from source.
  std::error_code ReadEntry(journal::ParsedEntry* dest);

 private:
  // Read from source until buffer contains at least num bytes.
  std::error_code EnsureRead(size_t num);

  // Read unsigned integer in packed encoding.
  template <typename UT> io::Result<UT> ReadUInt();

  // Reads exactly buffer.size() bytes and copies them to buffer.
  std::error_code ReadString(io::MutableBytes buffer);

  // Read argument array into string buffer.
  std::error_code ReadCommand(journal::ParsedEntry::CmdData* entry);

 private:
  io::Source* source_;
  base::IoBuf buf_;
  DbIndex dbid_;
};

}  // namespace dfly


================================================
FILE: src/server/journal/streamer.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/streamer.h"

#include <absl/functional/bind_front.h>
#include <sys/socket.h>

#include <chrono>

#ifdef __linux__
#include <netinet/tcp.h>
#endif

#include "base/flags.h"
#include "base/logging.h"
#include "server/db_slice.h"
#include "server/engine_shard.h"
#include "server/journal/cmd_serializer.h"
#include "server/journal/serializer.h"
#include "server/rdb_save.h"
#include "server/server_state.h"
#include "util/fibers/synchronization.h"

using namespace facade;

ABSL_FLAG(uint32_t, replication_timeout, 30000,
          "Time in milliseconds to wait for the replication writes being stuck.");

ABSL_FLAG(uint32_t, replication_stream_output_limit, 1_MB,
          "Time to wait for the replication output buffer go below the throttle limit");

ABSL_FLAG(uint32_t, migration_buckets_serialization_threshold, 10,
          "The Number of buckets to serialize on each iteration before yielding");
ABSL_FLAG(uint32_t, migration_buckets_sleep_usec, 500,
          "Sleep time in microseconds after each time we reach "
          "migration_buckets_serialization_threshold");

ABSL_FLAG(float, migration_buckets_cpu_budget, 0.2,
          "How much CPU budget to use for migration buckets serialization");

ABSL_FLAG(uint32_t, replication_dispatch_threshold, 1500,
          "Number of bytes to aggregate before replication");

namespace dfly {
using namespace util;
using namespace journal;
using namespace std;
namespace {

iovec IoVec(io::Bytes src) {
  return iovec{const_cast<uint8_t*>(src.data()), src.size()};
}

uint32_t replication_stream_output_limit_cached = 64_KB;
uint32_t migration_buckets_serialization_threshold_cached = 100;
uint32_t migration_buckets_sleep_usec_cached = 100;
uint32_t replication_dispatch_threshold = 1500;
uint32_t stalled_writer_base_period_ms = 10;

void LogTcpSocketDiagnostics(util::FiberSocketBase* dest) {
  if (!dest) {
    return;
  }

#ifdef __linux__
  // On Linux, we can get TCP diagnostics using getsockopt.
  int sockfd = dest->native_handle();
  if (sockfd < 0) {
    return;
  }

  struct tcp_info info;
  socklen_t info_len = sizeof(info);
  if (getsockopt(sockfd, IPPROTO_TCP, TCP_INFO, &info, &info_len) == 0) {
    LOG_EVERY_T(INFO, 1) << "TCP socket diagnostics - "
                         << "state: " << static_cast<int>(info.tcpi_state)
                         << ", ca_state: " << static_cast<int>(info.tcpi_ca_state)
                         << ", retransmits: " << static_cast<int>(info.tcpi_retransmits)
                         << ", probes: " << static_cast<int>(info.tcpi_probes)
                         << ", backoff: " << static_cast<int>(info.tcpi_backoff)
                         << ", options: " << static_cast<int>(info.tcpi_options)
                         << ", snd_wscale: " << static_cast<int>(info.tcpi_snd_wscale)
                         << ", rcv_wscale: " << static_cast<int>(info.tcpi_rcv_wscale)
                         << ", rto: " << info.tcpi_rto << ", ato: " << info.tcpi_ato
                         << ", snd_mss: " << info.tcpi_snd_mss << ", rcv_mss: " << info.tcpi_rcv_mss
                         << ", unacked: " << info.tcpi_unacked << ", sacked: " << info.tcpi_sacked
                         << ", lost: " << info.tcpi_lost << ", retrans: " << info.tcpi_retrans
                         << ", fackets: " << info.tcpi_fackets
                         << ", last_data_sent: " << info.tcpi_last_data_sent
                         << ", last_ack_sent: " << info.tcpi_last_ack_sent
                         << ", last_data_recv: " << info.tcpi_last_data_recv
                         << ", last_ack_recv: " << info.tcpi_last_ack_recv
                         << ", pmtu: " << info.tcpi_pmtu
                         << ", rcv_ssthresh: " << info.tcpi_rcv_ssthresh
                         << ", rtt: " << info.tcpi_rtt << ", rttvar: " << info.tcpi_rttvar
                         << ", snd_ssthresh: " << info.tcpi_snd_ssthresh
                         << ", snd_cwnd: " << info.tcpi_snd_cwnd << ", advmss: " << info.tcpi_advmss
                         << ", reordering: " << info.tcpi_reordering
                         << ", rcv_rtt: " << info.tcpi_rcv_rtt
                         << ", rcv_space: " << info.tcpi_rcv_space
                         << ", total_retrans: " << info.tcpi_total_retrans;
  } else {
    LOG_EVERY_T(INFO, 1) << "Failed to get TCP socket info: " << strerror(errno);
  }
#endif
}

}  // namespace

JournalStreamer::JournalStreamer(ExecutionState* cntx, JournalStreamer::Config config)
    : cntx_(cntx), config_(config) {
  // cache the flag to avoid accessing it later.
  replication_stream_output_limit_cached = absl::GetFlag(FLAGS_replication_stream_output_limit);
  migration_buckets_sleep_usec_cached = absl::GetFlag(FLAGS_migration_buckets_sleep_usec);
  replication_dispatch_threshold = absl::GetFlag(FLAGS_replication_dispatch_threshold);
  last_async_write_time_ = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;
}

JournalStreamer::~JournalStreamer() {
  if (!cntx_->IsError()) {
    DCHECK_EQ(in_flight_bytes_, 0u);
  }
  VLOG(1) << "~JournalStreamer";
}

void JournalStreamer::ConsumeJournalChange(const JournalChangeItem& item) {
  if (!ShouldWrite(item)) {
    return;
  }

  DCHECK_GT(item.journal_item.lsn, last_lsn_writen_);
  Write(item.journal_item.data);
  time_t now = time(nullptr);
  last_lsn_writen_ = item.journal_item.lsn;
  // TODO: to chain it to the previous Write call.
  if (config_.should_sent_lsn && now - last_lsn_time_ > 3) {
    last_lsn_time_ = now;
    io::StringSink sink;
    JournalWriter writer(&sink);
    writer.Write(Entry{journal::Op::LSN, last_lsn_writen_});
    Write(std::move(sink).str());
  }
}

void JournalStreamer::Start(util::FiberSocketBase* dest) {
  CHECK(dest_ == nullptr && dest != nullptr);
  dest_ = dest;
  // For partial sync we first catch up from journal replication buffer and only then register.
  if (config_.start_partial_sync_at == 0) {
    journal_cb_id_ = journal::RegisterConsumer(this);
  }
  StartStalledDataWriterFiber();
}

bool JournalStreamer::Cancel() {
  VLOG(1) << "JournalStreamer::Cancel " << cntx_->IsCancelled();
  waker_.notifyAll();
  bool res = false;
  if (journal_cb_id_) {
    auto cb_id = journal_cb_id_;
    journal_cb_id_ = 0;  // Reset to prevent double unregistration in another fiber
    journal::UnregisterConsumer(cb_id);
    res = true;
  }
  StopStalledDataWriterFiber();
  WaitForInflightToComplete(false);
  return res;
}

size_t JournalStreamer::UsedBytes() const {
  return pending_buf_.Size();
}

std::string JournalStreamer::FormatInternalState() const {
  return absl::StrCat(
      "pending_buf_size:", pending_buf_.Size(), " in_flight_bytes:", in_flight_bytes_,
      " total_sent:", total_sent_, " throttle_count:", throttle_count_,
      " total_throttle_wait_usec:", total_throttle_wait_usec_,
      " throttle_waiters:", throttle_waiters_, " last_async_write_time_ms:", last_async_write_time_,
      " last_lsn_time_s:", last_lsn_time_, " last_lsn_writen_:", last_lsn_writen_);
}

void JournalStreamer::Write(std::string str) {
  DCHECK(!str.empty());
  DVLOG(3) << "Writing " << str.size() << " bytes";

  pending_buf_.Push(std::move(str));
  AsyncWrite(false);
}

void JournalStreamer::StartStalledDataWriterFiber() {
  if (config_.init_from_stable_sync && !stalled_data_writer_.IsJoinable()) {
    auto pb = fb2::ProactorBase::me();
    std::chrono::milliseconds period_us(stalled_writer_base_period_ms);
    stalled_data_writer_ = MakeFiber([this, index = pb->GetPoolIndex(), period_us]() mutable {
      ThisFiber::SetName(absl::StrCat("fiber_periodic_journal_writer_", index));
      this->StalledDataWriterFiber(period_us, &stalled_data_writer_done_);
    });
  }
}

bool JournalStreamer::MaybePartialStreamLSNs() {
  // Same algorithm as SwitchIncrementalFb. The only difference is that we don't sent
  // the old LSN"s via a snapshot but rather as journal changes.
  if (config_.start_partial_sync_at > 0) {
    LSN lsn = config_.start_partial_sync_at;
    DCHECK_LE(lsn, journal::GetLsn()) << "The replica tried to sync from the future.";

    LOG(INFO) << "Starting partial sync from lsn: " << lsn;
    // The replica sends the LSN of the next entry is wants to receive.
    while (cntx_->IsRunning() && journal::IsLSNInBuffer(lsn)) {
      JournalChangeItem item;
      item.journal_item.data = journal::GetEntry(lsn);
      item.journal_item.lsn = lsn;
      ConsumeJournalChange(item);
      lsn++;
    }

    if (!cntx_->IsRunning()) {
      return false;
    }

    if (journal::GetLsn() != lsn) {
      // We stopped but we didn't manage to send the whole stream.
      cntx_->ReportError(
          std::make_error_code(errc::state_not_recoverable),
          absl::StrCat("Partial sync was unsuccessful because entry #", lsn,
                       " was dropped from the buffer. Current lsn=", journal::GetLsn()));
      return false;
    }

    // We are done, register back to the journal so we don't miss any changes
    journal_cb_id_ = journal::RegisterConsumer(this);

    LOG(INFO) << "Last LSN sent in partial sync was " << (lsn - 1);
    // flush pending
    if (pending_buf_.Size() != 0) {
      AsyncWrite(true);
    }
  }
  return true;
}

void JournalStreamer::StalledDataWriterFiber(std::chrono::milliseconds period_ms,
                                             util::fb2::Done* waiter) {
  if (!MaybePartialStreamLSNs()) {
    // Either context got cancelled, or partial sync failed because the lsn's stalled.
    return;
  }

  while (cntx_->IsRunning()) {
    if (waiter->WaitFor(period_ms)) {
      if (!cntx_->IsRunning()) {
        return;
      }
    }

    // We don't want to force async write to replicate if last data
    // was written recent. Data needs to be stalled for period_ms duration.
    if (!pending_buf_.Size() || in_flight_bytes_ > 0 ||
        ((last_async_write_time_ + period_ms.count()) >
         (fb2::ProactorBase::GetMonotonicTimeNs() / 1000000))) {
      continue;
    }

    AsyncWrite(true);
  }
}

void JournalStreamer::AsyncWrite(bool force_send) {
  // Stable sync or RestoreStreamer replication can't write data until
  // previous AsyncWriter finished.
  if (in_flight_bytes_ > 0) {
    return;
  }

  // Writing in stable sync and outside of fiber needs to check
  // threshold before writing data.
  if (config_.init_from_stable_sync && !force_send &&
      pending_buf_.FrontBufSize() < replication_dispatch_threshold) {
    return;
  }

  const auto& cur_buf = pending_buf_.PrepareSendingBuf();

  in_flight_bytes_ = cur_buf.mem_size;
  total_sent_ += in_flight_bytes_;
  last_async_write_time_ = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;

  const auto v_size = cur_buf.buf.size();
  absl::InlinedVector<iovec, 8> v(v_size);

  for (size_t i = 0; i < v_size; ++i) {
    const auto* uptr = reinterpret_cast<const uint8_t*>(cur_buf.buf[i].data());
    v[i] = IoVec(io::Bytes(uptr, cur_buf.buf[i].size()));
  }

  dest_->AsyncWrite(v.data(), v.size(),
                    [this, len = in_flight_bytes_](std::error_code ec) { OnCompletion(ec, len); });
}

void JournalStreamer::OnCompletion(std::error_code ec, size_t len) {
  DCHECK_EQ(in_flight_bytes_, len);

  DVLOG(3) << "Completing " << in_flight_bytes_;
  in_flight_bytes_ = 0;
  pending_buf_.Pop();
  if (cntx_->IsRunning()) {
    if (ec) {
      // Enhanced error logging with socket diagnostics for master disconnects
      LOG_EVERY_T(INFO, 1) << "JournalStreamer write error: " << ec.message()
                           << " (code: " << ec.value() << ", category: " << ec.category().name()
                           << ")";

      LogTcpSocketDiagnostics(dest_);

      cntx_->ReportError(ec);
    } else if (!pending_buf_.Empty()) {
      AsyncWrite(false);
    }
  }

  // notify ThrottleIfNeeded or WaitForInflightToComplete that waits
  // for all the completions to finish.
  // ThrottleIfNeeded can run from multiple fibers in the journal thread.
  // For example, from Heartbeat calling TriggerJournalWriteToSink to flush potential
  // expiration deletions and there are other cases as well.
  waker_.notifyAll();
}

void JournalStreamer::ThrottleIfNeeded() {
  if (!cntx_->IsRunning() || !IsStalled())
    return;

  ++throttle_count_;
  ++throttle_waiters_;

  const auto start = chrono::steady_clock::now();
  const auto next = start + chrono::milliseconds(absl::GetFlag(FLAGS_replication_timeout));
  auto log_start = start;
  size_t inflight_start = in_flight_bytes_;
  size_t sent_start = total_sent_;

  // Please note that ThrottleIfNeeded is unfair. Specifically with several producers pushing data
  // to this JournalStreamer, one of them may be stalled and the other will be able to
  // progress indefinitely. The stalled producer will be woken up only to verify again that the
  // other one succeeded to push data before it.
  // We currently do not solve this problem, but at least we will be more verbose about it.
  std::cv_status status = waker_.await_until(
      [&] {
        bool finished = !IsStalled() || !cntx_->IsRunning();
        if (finished)
          return finished;

        // Log every second that we are stalled and for how long.
        auto current = chrono::steady_clock::now();
        if (current - log_start > 1000ms) {
          log_start = current;
          LOG(WARNING) << "Waiting for "
                       << chrono::duration_cast<chrono::milliseconds>(current - start).count()
                       << "ms " << ThisFiber::GetName();
        }

        return false;
      },
      next);

  --throttle_waiters_;
  total_throttle_wait_usec_ +=
      chrono::duration_cast<chrono::microseconds>(chrono::steady_clock::now() - start).count();
  if (status == std::cv_status::timeout) {
    LOG(WARNING) << "Stream timed out, inflight bytes/sent start: " << inflight_start << "/"
                 << sent_start << ", end: " << in_flight_bytes_ << "/" << total_sent_;
    cntx_->ReportError("JournalStreamer write operation timeout");
  }
}

void JournalStreamer::WaitForInflightToComplete(bool with_timeout) {
  const auto start = chrono::steady_clock::now();
  const auto max_timeout = start + chrono::milliseconds(absl::GetFlag(FLAGS_replication_timeout));
  while (in_flight_bytes_) {
    auto next = chrono::steady_clock::now() + 1s;
    std::cv_status status =
        waker_.await_until([this] { return this->in_flight_bytes_ == 0; }, next);
    LOG_IF(WARNING, status == std::cv_status::timeout)
        << "Waiting for inflight bytes " << in_flight_bytes_;

    if (next >= max_timeout) {
      if (with_timeout) {
        cntx_->ReportError("JournalStreamer write operation timeout");
        break;
      } else {
        LOG(WARNING) << "WaitForInflightToComplete timed out with " << in_flight_bytes_
                     << " inflight bytes remaining";
      }
    }
  }
}

void JournalStreamer::StopStalledDataWriterFiber() {
  if (config_.init_from_stable_sync && stalled_data_writer_.IsJoinable()) {
    stalled_data_writer_done_.Notify();
    if (stalled_data_writer_.IsJoinable()) {
      stalled_data_writer_.Join();
    }
  }
}

bool JournalStreamer::IsStalled() const {
  return pending_buf_.Size() >= replication_stream_output_limit_cached;
}

RestoreStreamer::RestoreStreamer(DbSlice* slice, cluster::SlotSet slots, ExecutionState* cntx)
    : JournalStreamer(cntx, {}), db_slice_(slice), my_slots_(std::move(slots)) {
  DCHECK(slice != nullptr);
  migration_buckets_serialization_threshold_cached =
      absl::GetFlag(FLAGS_migration_buckets_serialization_threshold);
  db_array_ = slice->databases();  // Inc ref to make sure DB isn't deleted while we use it

  cmd_serializer_ = std::make_unique<CmdSerializer>(
      db_slice_,
      [&](std::string s) {
        Write(std::move(s));
        ThrottleIfNeeded();
      },
      ServerState::tlocal()->serialization_max_chunk_size);
}

void RestoreStreamer::Start(util::FiberSocketBase* dest) {
  if (!cntx_->IsRunning())
    return;

  VLOG(1) << "RestoreStreamer start";
  auto db_cb = absl::bind_front(&RestoreStreamer::OnDbChange, this);
  snapshot_version_ = db_slice_->RegisterOnChange(std::move(db_cb));

  JournalStreamer::Start(dest);
}

void RestoreStreamer::Run() {
  VLOG(1) << "RestoreStreamer run";

  PrimeTable::Cursor cursor;
  uint64_t last_yield = 0;

  // Explicitly copy table smart pointer to keep reference count up (flushall drops it)
  boost::intrusive_ptr<DbTable> table = db_array_.front();
  PrimeTable* pt = &table->prime;

  do {
    if (!cntx_->IsRunning())
      return;

    // If someone else throtles due to huge pending_buf_, give it priority.
    // Apparently, continue goes through the loop by checking the condition below, so we check
    // cursor here as well.
    // In addition if bucket writing was too intensive on CPU and we are overloaded.
    // Note that we account for CPU time from OnDbChange and here as well (inside WriteBucket).
    // But we only throttle here, so if we migrated lots of slots during mutations, we
    // won't progress here but if we have not, then this fiber will progress withing the
    // CPU budget we defined for it.
    bool should_stall =
        throttle_waiters_ > 0 ||
        (pending_buf_.Size() >= replication_stream_output_limit_cached / 3) ||
        cpu_aggregator_.IsOverloaded(absl::GetFlag(FLAGS_migration_buckets_cpu_budget));
    if (cursor && should_stall) {
      ThisFiber::SleepFor(300us);

      // We have a design bug in RealTimeAggregator that resets it measurements only when
      // the next sample is taken. So we add this sample to ensure cpu_aggregator_
      // refreshes its state.
      base::CpuTimeGuard guard(&cpu_aggregator_);
      stats_.iter_skips++;
      continue;
    }

    cursor = pt->TraverseBuckets(cursor, [&](PrimeTable::bucket_iterator it) {
      if (!cntx_->IsRunning())  // Could be cancelled any time as Traverse may preempt
        return;

      db_slice_->FlushChangeToEarlierCallbacks(0 /*db_id always 0 for cluster*/,
                                               DbSlice::Iterator::FromPrime(it), snapshot_version_);

      if (!cntx_->IsRunning())  // Could have been cancelled in above call too
        return;

      // Do not progress if we are stalled.
      ThrottleIfNeeded();

      std::lock_guard guard(big_value_mu_);

      {
        // Locking this never preempts. See snapshot.cc for why we need it.
        auto* blocking_counter = db_slice_->GetLatch();
        lock_guard blocking_counter_guard(*blocking_counter);

        stats_.buckets_loop += WriteBucket(it, false);
      }

      // We could have delayed entries that are watiting so we want to flush them
      cmd_serializer_->SerializeDelayedEntries(false, nullptr);
    });

    // TODO: FLAGS_migration_buckets_cpu_budget should eventually be a single configurable
    // setting that controls how agressive we are with migration pace.
    // Once we gain confidence with FLAGS_migration_buckets_cpu_budget we should retire
    // migration_buckets_serialization_threshold and migration_buckets_sleep_usec.
    if (++last_yield >= migration_buckets_serialization_threshold_cached) {
      ThisFiber::SleepFor(chrono::microseconds(migration_buckets_sleep_usec_cached));
      last_yield = 0;
    }
  } while (cursor);

  // Force serialize of all delayed entries.
  {
    std::lock_guard guard(big_value_mu_);
    cmd_serializer_->SerializeDelayedEntries(true, nullptr);
  }

  VLOG(1) << "RestoreStreamer finished loop of " << my_slots_.ToSlotRanges().ToString()
          << ", shard " << db_slice_->shard_id() << ". Buckets looped " << stats_.buckets_loop;
}

void RestoreStreamer::SendFinalize(long attempt) {
  VLOG(1) << "RestoreStreamer LSN of " << my_slots_.ToSlotRanges().ToString() << ", shard "
          << db_slice_->shard_id() << " attempt " << attempt << " with " << stats_.commands
          << " commands. Buckets looped " << stats_.buckets_loop << ", buckets on_db_update "
          << stats_.buckets_on_db_update << ", buckets skipped " << stats_.buckets_skipped
          << ", buckets written " << stats_.buckets_written << ". Keys skipped "
          << stats_.keys_skipped << ", keys written " << stats_.keys_written
          << " throttle count: " << throttle_count_
          << ", throttle on db update: " << stats_.throttle_on_db_update
          << ", throttle usec on db update: " << stats_.throttle_usec_on_db_update
          << ", iter_skips: " << stats_.iter_skips;

  // Drain all pending journal data before sending the finalize marker.
  // At this point client pause is active, so no new entries can arrive.
  WaitForInflightToComplete(true);

  journal::Entry entry(journal::Op::LSN, attempt);

  io::StringSink sink;
  JournalWriter writer{&sink};
  writer.Write(entry);
  Write(std::move(sink).str());

  // DFLYMIGRATE ACK command has a timeout so we want to send it only when LSN is ready to be sent
  ThrottleIfNeeded();
}

RestoreStreamer::~RestoreStreamer() {
}

bool RestoreStreamer::Cancel() {
  auto sver = snapshot_version_;
  snapshot_version_ = 0;  // to prevent double cancel in another fiber
  cntx_->Cancel();
  if (sver != 0) {
    db_slice_->UnregisterOnChange(sver);
  }
  bool res = JournalStreamer::Cancel();
  LOG_IF(WARNING, res != (sver != 0)) << "Journal and DBSlice unregister state mismatch in "
                                         "RestoreStreamer Cancel. DBSlice unregister state: "
                                      << (sver != 0) << ", Journal unregister state: " << res;
  return res && (sver != 0);
}

bool RestoreStreamer::ShouldWrite(const journal::JournalChangeItem& item) const {
  if (item.cmd == "FLUSHALL" || item.cmd == "FLUSHDB") {
    // On FLUSH* we restart the migration
    CHECK(dest_ != nullptr);
    cntx_->ReportError("FLUSH command during migration");
    std::ignore = dest_->Shutdown(SHUT_RDWR);
    return false;
  }

  if (!item.slot.has_value()) {
    return false;
  }

  return ShouldWrite(*item.slot);
}

bool RestoreStreamer::ShouldWrite(std::string_view key) const {
  return ShouldWrite(KeySlot(key));
}

bool RestoreStreamer::ShouldWrite(SlotId slot_id) const {
  return my_slots_.Contains(slot_id);
}

bool RestoreStreamer::WriteBucket(PrimeTable::bucket_iterator it, bool on_db_change_cb) {
  auto& shard_stats = EngineShard::tlocal()->stats();
  bool written = false;
  absl::flat_hash_set<string> tiered_keys;
  string key_buffer;  // we can reuse it

  // Only track tiered keys when needed and flush delayed entries
  // 1. When we have tiered storage
  // 2. We're called from a OnDbChange callback
  //
  // We need to track all keys in bucket with tiering. Even if they are not set as external. There
  // is situation when we request externalization of key and key is read - marking it as not
  // external but not yet flushed. When OnDbChange callback is called we need to flush it and than
  // write journal changes - so we cannot realy on IsExternal flag and need to track all keys.
  const bool track_tiered_keys =
      on_db_change_cb && EngineShard::tlocal()->tiered_storage() != nullptr;

  if (!it.is_done() && it.GetVersion() < snapshot_version_) {
    base::CpuTimeGuard guard(&cpu_aggregator_);
    stats_.buckets_written++;
    it.SetVersion(snapshot_version_);
    for (it.AdvanceIfNotOccupied(); !it.is_done(); ++it) {
      const auto& pv = it->second;
      string_view key = it->first.GetSlice(&key_buffer);
      if (ShouldWrite(key)) {
        ++stats_.keys_written;
        ++shard_stats.total_migrated_keys;
        uint64_t expire = it->first.GetExpireTime();
        // Track tiered keys that will need delayed entry flushing
        if (track_tiered_keys) {
          tiered_keys.emplace(key);
        }
        WriteEntry(key, it->first, pv, expire);
        written = true;
      } else {
        stats_.keys_skipped++;
      }
    }
  } else {
    // Bucket already serialized, but we may still need to track tiered keys
    // for force-flushing their delayed entries
    if (track_tiered_keys) {
      for (it.AdvanceIfNotOccupied(); !it.is_done(); ++it) {
        string_view key = it->first.GetSlice(&key_buffer);
        if (ShouldWrite(key)) {
          tiered_keys.emplace(key);
        }
      }
    }
    stats_.buckets_skipped++;
  }

  // Force serialized entries for keys that are tiered and were updated during migration.
  // Unfortunately we cannot be selective here and need to flush all delayed entreis that we
  // collected while traversing bucket.
  // TODO: change interface so we forcefully flush only single entry.
  if (tiered_keys.size()) {
    cmd_serializer_->SerializeDelayedEntries(true, &tiered_keys);
  }

  // we don't need throttle here, because we throttle after every entry written

  return written;
}

// Ordering invariant (PIT mode, slot migration):
//   Same as SliceSnapshot::OnDbChange — for any key K the baseline must be sent before any
//   journal entry that mutates K. RestoreStreamer always uses PIT mode (snapshot_version_ != 0)
//   and serializes-before-mutate via CVCUponInsert (inserts) or WriteBucket (updates).
//   big_value_mu_ prevents interleaving with the traversal fiber's WriteBucket.
void RestoreStreamer::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
  std::lock_guard guard(big_value_mu_);
  DCHECK_EQ(db_index, 0) << "Restore migration only allowed in cluster mode in db0";

  PrimeTable* table = db_slice_->GetTables(0).first;
  uint64_t throttle_start = throttle_count_;
  uint64_t throttle_usec_start = total_throttle_wait_usec_;
  if (const PrimeTable::bucket_iterator* bit = req.update()) {
    if (snapshot_version_ == 0) {
      // If snapshot_version_ is 0, it means that Cancel() was called and we shouldn't proceed.
      return;
    }
    stats_.buckets_on_db_update += WriteBucket(*bit, true);
  } else {
    string_view key = get<string_view>(req.change);
    table->CVCUponInsert(snapshot_version_, key, [&](PrimeTable::bucket_iterator it) {
      if (snapshot_version_ != 0) {  // we need this check because lambda can be called several
                                     // times and we can preempt in WriteBucket
        DCHECK_LT(it.GetVersion(), snapshot_version_);
        stats_.buckets_on_db_update += WriteBucket(it, true);
      }
    });
  }
  stats_.throttle_on_db_update += throttle_count_ - throttle_start;
  stats_.throttle_usec_on_db_update += total_throttle_wait_usec_ - throttle_usec_start;
}

void RestoreStreamer::WriteEntry(string_view key, const PrimeKey& pk, const PrimeValue& pv,
                                 uint64_t expire_ms) {
  stats_.commands += cmd_serializer_->SerializeEntry(key, pk, pv, expire_ms);
}

}  // namespace dfly


================================================
FILE: src/server/journal/streamer.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "base/cycle_clock.h"
#include "server/cluster/slot_set.h"
#include "server/common_types.h"
#include "server/execution_state.h"
#include "server/journal/journal.h"
#include "server/journal/pending_buf.h"
#include "server/synchronization.h"
#include "util/fiber_socket_base.h"

namespace dfly {

// Buffered single-shard journal streamer that listens for journal changes with a
// journal listener and writes them to a destination sink in a separate fiber.
class JournalStreamer : public journal::JournalConsumerInterface {
 public:
  struct Config {
    bool should_sent_lsn = false;
    bool init_from_stable_sync = false;
    LSN start_partial_sync_at = 0;
  };

  JournalStreamer(ExecutionState* cntx, Config config);

  virtual ~JournalStreamer();

  // Self referential.
  JournalStreamer(const JournalStreamer& other) = delete;
  JournalStreamer(JournalStreamer&& other) = delete;

  // Register journal listener and start writer in fiber.
  virtual void Start(util::FiberSocketBase* dest);

  void ConsumeJournalChange(const journal::JournalChangeItem& item);

  // Must be called on context cancellation for unblocking
  // and manual cleanup. If it unregistered a listener, returns true.
  virtual bool Cancel();

  size_t UsedBytes() const;

  // For debugging purposes. Return string with formatted internal state.
  std::string FormatInternalState() const;

 protected:
  // TODO: we copy the string on each write because JournalItem may be passed to multiple
  // streamers so we can not move it. However, if we would either wrap JournalItem in shared_ptr
  // or wrap JournalItem::data in shared_ptr, we can avoid the cost of copying strings.
  // Also, for small strings it's more peformant to copy to the intermediate buffer than
  // to issue an io operation.
  void Write(std::string str);

  // Blocks the if the consumer if not keeping up.
  void ThrottleIfNeeded() final;

  virtual bool ShouldWrite(const journal::JournalChangeItem& item) const {
    return cntx_->IsRunning();
  }

  void WaitForInflightToComplete(bool with_timeout);

  size_t inflight_bytes() const {
    return in_flight_bytes_;
  }

  util::FiberSocketBase* dest_ = nullptr;
  ExecutionState* cntx_;
  uint64_t throttle_count_ = 0;
  uint64_t total_throttle_wait_usec_ = 0;
  uint32_t throttle_waiters_ = 0;

  PendingBuf pending_buf_;

 private:
  // Return true if all lsn's from config_.start_partial_sync_at were sent (or if started from 0).
  // Return false if not all lsn's were sent (stalled) in time. Cancels the context with error.
  bool MaybePartialStreamLSNs();

  void AsyncWrite(bool force_send);
  void OnCompletion(std::error_code ec, size_t len);

  bool IsStalled() const;

  util::fb2::Fiber stalled_data_writer_;
  util::fb2::Done stalled_data_writer_done_;
  void StartStalledDataWriterFiber();
  void StopStalledDataWriterFiber();
  void StalledDataWriterFiber(std::chrono::milliseconds period_ms, util::fb2::Done* waiter);

  const Config config_;
  // If we are replication in stable sync we can aggregate data before sending
  size_t in_flight_bytes_ = 0, total_sent_ = 0;
  // Last time that send data in milliseconds
  uint64_t last_async_write_time_ = 0;
  time_t last_lsn_time_ = 0;
  LSN last_lsn_writen_ = 0;
  util::fb2::EventCount waker_;
  uint32_t journal_cb_id_{0};
};

class CmdSerializer;

// Serializes existing DB as RESTORE commands, and sends updates as regular commands.
// Only handles relevant slots, while ignoring all others.
class RestoreStreamer : public JournalStreamer {
 public:
  RestoreStreamer(DbSlice* slice, cluster::SlotSet slots, ExecutionState* cntx);
  ~RestoreStreamer() override;

  void Start(util::FiberSocketBase* dest) override;

  void Run();

  // Cancel() must be called if Start() is called
  bool Cancel() override;

  void SendFinalize(long attempt);

 private:
  void OnDbChange(DbIndex db_index, const ChangeReq& req);
  bool ShouldWrite(const journal::JournalChangeItem& item) const override;
  bool ShouldWrite(std::string_view key) const;
  bool ShouldWrite(SlotId slot_id) const;

  // Returns true if any entry was actually written
  bool WriteBucket(PrimeTable::bucket_iterator it, bool on_db_change);

  void WriteEntry(std::string_view key, const PrimeKey& pk, const PrimeValue& pv,
                  uint64_t expire_ms);

  struct Stats {
    uint64_t buckets_skipped = 0;
    uint64_t buckets_written = 0;
    uint64_t buckets_loop = 0;
    uint64_t buckets_on_db_update = 0;
    uint64_t throttle_on_db_update = 0;
    uint64_t throttle_usec_on_db_update = 0;
    uint64_t keys_written = 0;
    uint64_t keys_skipped = 0;
    uint64_t commands = 0;
    uint64_t iter_skips = 0;
  };

  DbSlice* db_slice_;
  DbTableArray db_array_;
  uint64_t snapshot_version_ = 0;
  cluster::SlotSet my_slots_;

  std::unique_ptr<CmdSerializer> cmd_serializer_;

  ThreadLocalMutex big_value_mu_;
  Stats stats_;
  base::RealTimeAggregator cpu_aggregator_;
};

}  // namespace dfly


================================================
FILE: src/server/journal/tx_executor.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "tx_executor.h"

#include <absl/strings/match.h>

#include "base/logging.h"
#include "server/execution_state.h"
#include "server/journal/serializer.h"

using namespace std;
using namespace facade;

namespace dfly {

bool MultiShardExecution::InsertTxToSharedMap(TxId txid, uint32_t shard_cnt) {
  std::unique_lock lk(map_mu);
  auto [it, was_insert] = tx_sync_execution.emplace(txid, shard_cnt);
  lk.unlock();

  VLOG(2) << "txid: " << txid << " unique_shard_cnt_: " << shard_cnt
          << " was_insert: " << was_insert;
  it->second.block->Dec();

  return was_insert;
}

MultiShardExecution::TxExecutionSync& MultiShardExecution::Find(TxId txid) {
  std::lock_guard lk(map_mu);
  VLOG(2) << "Execute txid: " << txid;
  auto it = tx_sync_execution.find(txid);
  DCHECK(it != tx_sync_execution.end());
  return it->second;
}

void MultiShardExecution::Erase(TxId txid) {
  std::lock_guard lg{map_mu};
  tx_sync_execution.erase(txid);
}

void MultiShardExecution::CancelAllBlockingEntities() {
  lock_guard lk{map_mu};
  for (auto& tx_data : tx_sync_execution) {
    tx_data.second.barrier.Cancel();
    tx_data.second.block->Cancel();
  }
}

void TransactionData::AddEntry(journal::ParsedEntry&& entry) {
  opcode = entry.opcode;

  switch (entry.opcode) {
    case journal::Op::LSN:
      lsn = entry.lsn;
      return;
    case journal::Op::PING:
      return;
    case journal::Op::EXPIRED:
    case journal::Op::COMMAND:
      command = std::move(entry.cmd);
      dbid = entry.dbid;
      txid = entry.txid;
      return;
    default:
      DCHECK(false) << "Unsupported opcode";
  }
}

bool TransactionData::IsGlobalCmd() const {
  if (command.empty()) {
    return false;
  }

  string_view front = command.Front();

  if (absl::EqualsIgnoreCase(front, "FLUSHDB"sv) || absl::EqualsIgnoreCase(front, "FLUSHALL"sv))
    return true;

  if (command.size() > 1 && absl::EqualsIgnoreCase(front, "DFLYCLUSTER"sv) &&
      absl::EqualsIgnoreCase(command[1], "FLUSHSLOTS"sv)) {
    return true;
  }

  return false;
}

bool TransactionReader::NextTxData(JournalReader* reader, ExecutionState* cntx,
                                   TransactionData* dest) {
  if (!cntx->IsRunning()) {
    return false;
  }
  journal::ParsedEntry entry;
  if (auto ec = reader->ReadEntry(&entry); ec) {
    cntx->ReportError(ec);
    return false;
  }

  // When LSN opcode is sent master does not increase journal lsn.
  if (lsn_.has_value() && entry.opcode != journal::Op::LSN) {
    ++*lsn_;
    VLOG(2) << "read lsn: " << *lsn_;
  }

  dest->command.clear();
  dest->AddEntry(std::move(entry));

  if (lsn_.has_value() && dest->opcode == journal::Op::LSN) {
    DCHECK_NE(dest->lsn, 0u);
    LOG_IF_EVERY_N(WARNING, dest->lsn != *lsn_, 10000)
        << "master lsn:" << dest->lsn << " replica lsn" << *lsn_;
    DCHECK_EQ(dest->lsn, *lsn_);
  }
  return true;
}

}  // namespace dfly


================================================
FILE: src/server/journal/tx_executor.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <unordered_map>

#include "server/execution_state.h"
#include "server/journal/types.h"
#include "util/fibers/synchronization.h"

namespace dfly {

struct JournalReader;

// Coordinator for multi shard execution.
class MultiShardExecution {
 public:
  struct TxExecutionSync {
    util::fb2::Barrier barrier;
    std::atomic_uint32_t counter;
    util::fb2::BlockingCounter block;

    explicit TxExecutionSync(uint32_t counter)
        : barrier(counter), counter(counter), block(counter) {
    }
  };

  bool InsertTxToSharedMap(TxId txid, uint32_t shard_cnt);
  TxExecutionSync& Find(TxId txid);
  void Erase(TxId txid);
  void CancelAllBlockingEntities();

 private:
  util::fb2::Mutex map_mu;
  std::unordered_map<TxId, TxExecutionSync> tx_sync_execution;
};

// This class holds the commands of transaction in single shard.
// Once all commands were received, the transaction can be executed.
struct TransactionData {
  // Update the data from ParsedEntry
  void AddEntry(journal::ParsedEntry&& entry);

  bool IsGlobalCmd() const;

  TxId txid{0};
  DbIndex dbid{0};
  journal::ParsedEntry::CmdData command;

  journal::Op opcode;
  uint64_t lsn = 0;
};

// Utility for reading TransactionData from a journal reader.
// The journal stream can contain interleaved data for multiple multi transactions,
// expiries and out of order executed transactions that need to be grouped on the replica side.
struct TransactionReader {
  TransactionReader(std::optional<uint64_t> lsn = std::nullopt) : lsn_(lsn) {
  }

  bool NextTxData(JournalReader* reader, ExecutionState* cntx, TransactionData* dest);

 private:
  std::optional<uint64_t> lsn_ = 0;
};

}  // namespace dfly


================================================
FILE: src/server/journal/types.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/journal/types.h"

#include <absl/strings/str_join.h>

namespace dfly::journal {

using namespace std;

void AppendPrefix(string_view cmd, string* dest) {
  absl::StrAppend(dest, ", cmd='");
  absl::StrAppend(dest, cmd);
  absl::StrAppend(dest, "', args=[");
}

void AppendSuffix(string* dest) {
  if (dest->back() == ',')
    dest->pop_back();
  absl::StrAppend(dest, "]");
}

string Entry::ToString() const {
  string rv = absl::StrCat("{op=", opcode, ", dbid=", dbid);

  if (HasPayload()) {
    AppendPrefix(payload.cmd, &rv);
    for (string_view arg : base::it::Wrap(cmn::kToSV, payload.args))
      absl::StrAppend(&rv, "'", cmn::ToSV(arg), "',");
    AppendSuffix(&rv);
  } else {
    absl::StrAppend(&rv, ", empty");
  }

  rv += "}";
  return rv;
}

string ParsedEntry::ToString() const {
  string rv = absl::StrCat("{op=", opcode, ", dbid=", dbid, ", cmd='");
  for (string_view arg : cmd) {
    absl::StrAppend(&rv, arg, " ");
  }
  rv.pop_back();
  rv += "'}";
  return rv;
}

}  // namespace dfly::journal


================================================
FILE: src/server/journal/types.h
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <optional>
#include <string>
#include <variant>

#include "common/backed_args.h"
#include "server/common_types.h"
#include "server/table.h"

namespace dfly {
namespace journal {

enum class Op : uint8_t { SELECT = 6, EXPIRED = 9 /* sunset*/, COMMAND = 10, PING = 13, LSN = 15 };

struct EntryBase {
  TxId txid;
  Op opcode;
  DbIndex dbid;
  std::optional<SlotId> slot;
  LSN lsn{0};
};

// This struct represents a single journal entry.
// Those are either control instructions or commands.
struct Entry : public EntryBase {
  // Payload represents a non-owning view into a command executed on the shard.
  struct Payload {
    std::string_view cmd;
    std::variant<ShardArgs,  // Shard parts.
                 ArgSlice>   // Parts of a full command.
        args;

    Payload() = default;

    Payload(std::string_view c, const ShardArgs& a) : cmd(c), args(a) {
    }
    Payload(std::string_view c, ArgSlice a) : cmd(c), args(a) {
    }
  };

  Entry(TxId txid, Op opcode, DbIndex dbid, std::optional<SlotId> slot_id, Payload pl)
      : EntryBase{txid, opcode, dbid, slot_id}, payload{std::move(pl)} {
  }

  Entry(journal::Op opcode, DbIndex dbid, std::optional<SlotId> slot_id)
      : EntryBase{0, opcode, dbid, slot_id, 0} {
  }

  Entry(journal::Op opcode, LSN lsn) : EntryBase{0, opcode, 0, std::nullopt, lsn} {
  }

  Entry(TxId txid, journal::Op opcode, DbIndex dbid, std::optional<SlotId> slot_id)
      : EntryBase{txid, opcode, dbid, slot_id, 0} {
  }

  bool HasPayload() const {
    return !payload.cmd.empty();
  }

  std::string ToString() const;

  Payload payload;
};

struct ParsedEntry : public EntryBase {
  using CmdData = cmn::BackedArguments;
  CmdData cmd;

  ParsedEntry(const ParsedEntry&) = delete;
  ParsedEntry() = default;

  std::string ToString() const;
};

struct JournalItem {
  LSN lsn;
  std::string data;
};

struct JournalChangeItem {
  JournalItem journal_item;

  std::string_view cmd;
  std::optional<SlotId> slot;
};

struct JournalConsumerInterface {
  virtual ~JournalConsumerInterface() = default;

  // Receives a journal change for serializing
  virtual void ConsumeJournalChange(const JournalChangeItem& item) = 0;
  // Waits for writing the serialized data
  virtual void ThrottleIfNeeded() = 0;
};

}  // namespace journal
}  // namespace dfly


================================================
FILE: src/server/json_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include <type_traits>

#include "absl/cleanup/cleanup.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/flatbuffers.h"
#include "core/json/json_object.h"
#include "core/json/path.h"
#include "core/mi_memory_resource.h"
#include "facade/cmd_arg_parser.h"
#include "facade/op_status.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/detail/wrapped_json_path.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/execution_state.h"
#include "server/journal/journal.h"
#include "server/search/doc_index.h"
#include "server/sharding.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"

// clang-format off
#include <jsoncons_ext/jsonpatch/jsonpatch.hpp>
#include <jsoncons_ext/jsonpointer/jsonpointer.hpp>
#include <jsoncons_ext/mergepatch/mergepatch.hpp>
// clang-format on

ABSL_DECLARE_FLAG(bool, jsonpathv2);

namespace dfly {

using namespace std;
using namespace jsoncons;
using facade::CmdArgParser;
using facade::kSyntaxErrType;
using facade::RedisReplyBuilder;
using facade::SinkReplyBuilder;

using JsonExpression = jsonpath::jsonpath_expression<JsonType>;
using CI = CommandId;

namespace {

struct JsonAutoUpdaterOptions {
  bool disable_indexing = false;  // If true, the key will not be removed or added to the indexes
  bool update_on_delete = false;  // If true, SetJsonSize will be called on destruction
};

/* Helper class which must be initialized before any mutate operations on json.
  It will track the memory usage of the json object and update the size in the CompactObj.
  It also contains indexes updates, post update operations on the iterator. */
class JsonAutoUpdater {
 public:
  JsonAutoUpdater(const OpArgs& op_args, string_view key, DbSlice::ItAndUpdater it,
                  JsonAutoUpdaterOptions options = {})
      : op_args_(op_args), key_(key), it_(std::move(it)), options_(options) {
    if (!options_.disable_indexing) {
      op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, it.it->second);
    }

    /* We need to initialize start memory usage after RemoveDoc because internally RemoveDoc has
    static cache that can allocate/deallocate memory. Because of this, we will
    overestimate/underestimate memory usage for json object. */
    start_size_ = GetMemoryUsage();
  }

  JsonAutoUpdater(const JsonAutoUpdater&) = delete;
  JsonAutoUpdater& operator=(const JsonAutoUpdater&) = delete;

  JsonAutoUpdater(JsonAutoUpdater&&) = default;
  JsonAutoUpdater& operator=(JsonAutoUpdater&&) = delete;

  void SetJsonSize() {
    set_size_was_called_ = true;

    ShrinkJsonIfNeeded();

    const size_t current = GetMemoryUsage();
    int64_t diff = static_cast<int64_t>(current) - static_cast<int64_t>(start_size_);

    GetPrimeValue().SetJsonSize(diff);

    // Under any flow we must not end up with this special value.
    DCHECK(GetPrimeValue().MallocUsed() != 0);
  }

  void AddDocToIndexes() {
    op_args_.shard->search_indices()->AddDoc(key_, op_args_.db_cntx, &GetPrimeValue());
  }

  ~JsonAutoUpdater() {
    if (was_released_) {
      return;  // Skip all cleanup if iterator was released
    }

    if (options_.update_on_delete && !set_size_was_called_) {
      SetJsonSize();
    } else if (!set_size_was_called_) {
      LOG(WARNING) << "JsonAutoUpdater destructor called without SetJsonSize() being called. This "
                      "may lead to memory tracking issues.";
    }

    it_.post_updater.Run();

    /* We need to call AddDoc after SetJsonSize because internally AddDoc has static cache that can
    allocate/deallocate memory. Because of this, we will overestimate/underestimate memory usage for
    json object. */
    if (!options_.disable_indexing) {
      AddDocToIndexes();
    }
  }

  PrimeValue& GetPrimeValue() {
    return it_.it->second;
  }

  JsonType* GetJson() {
    return GetPrimeValue().GetJson();
  }

  const DbSlice::Iterator& GetIterator() const {
    return it_.it;
  }

  // Releases ownership of the iterator. After calling this, the destructor becomes a noop.
  // Used when we need to delete the entry manually (e.g., on error paths for newly created keys).
  DbSlice::ItAndUpdater Release() {
    was_released_ = true;
    return std::move(it_);
  }

 private:
  size_t GetMemoryUsage() const {
    return static_cast<MiMemoryResource*>(CompactObj::memory_resource())->used();
  }

  /* Shrinks the json object to fit its current size.
     Sometimes after mutating the json object, it may have more capacity than needed.
     This method will reduce the capacity to fit the current size. */
  void ShrinkJsonIfNeeded() {
    auto json = GetJson();
    if (json->size() * 2 < json->capacity()) {
      json->shrink_to_fit();
    }
  }

  const OpArgs& op_args_;
  string_view key_;
  DbSlice::ItAndUpdater it_;
  JsonAutoUpdaterOptions options_;

  // Used to track the memory usage of the json object
  size_t start_size_{0};
  bool set_size_was_called_{false};
  bool was_released_{false};
};

template <typename T> using ParseResult = io::Result<T, std::string>;

ParseResult<JsonExpression> ParseJsonPathAsExpression(std::string_view path) {
  std::error_code ec;
  JsonExpression res = MakeJsonPathExpr(path, ec);
  if (ec)
    return nonstd::make_unexpected(kSyntaxErr);
  return res;
}

ParseResult<WrappedJsonPath> ParseJsonPath(StringOrView path, JsonPathType path_type) {
  if (absl::GetFlag(FLAGS_jsonpathv2)) {
    auto path_result = json::ParsePath(path.view());
    if (!path_result) {
      VLOG(1) << "Invalid Json path: " << path << ' ' << path_result.error();
      return nonstd::make_unexpected(kSyntaxErr);
    }
    return WrappedJsonPath{std::move(path_result).value(), std::move(path), path_type};
  }

  auto expr_result = ParseJsonPathAsExpression(path.view());
  if (!expr_result) {
    VLOG(1) << "Invalid Json path: " << path << ' ' << expr_result.error();
    return nonstd::make_unexpected(kSyntaxErr);
  }
  return WrappedJsonPath{std::move(expr_result).value(), std::move(path), path_type};
}

ParseResult<WrappedJsonPath> ParseJsonPathV1(std::string_view path) {
  if (path.empty() || path == WrappedJsonPath::kV1PathRootElement) {
    return ParseJsonPath(StringOrView::FromView(WrappedJsonPath::kV2PathRootElement),
                         JsonPathType::kLegacy);
  }

  std::string v2_path = absl::StrCat(
      WrappedJsonPath::kV2PathRootElement, path.front() != '.' && path.front() != '[' ? "." : "",
      path);  // Convert to V2 path; TODO(path.front() != all kinds of symbols)
  return ParseJsonPath(StringOrView::FromString(std::move(v2_path)), JsonPathType::kLegacy);
}

ParseResult<WrappedJsonPath> ParseJsonPathV2(std::string_view path) {
  return ParseJsonPath(StringOrView::FromView(path), JsonPathType::kV2);
}

bool IsJsonPathV2(std::string_view path) {
  return !path.empty() && path.front() == '$';
}

ParseResult<WrappedJsonPath> ParseJsonPath(std::string_view path) {
  return IsJsonPathV2(path) ? ParseJsonPathV2(path) : ParseJsonPathV1(path);
}

namespace reply_generic {

template <typename I> void Send(I begin, I end, CommandContext* cmd_cntx);

inline RedisReplyBuilder* RB(CommandContext* cmd_cntx) {
  return static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
}

void Send(bool value, CommandContext* cmd_cntx) {
  RB(cmd_cntx)->SendBulkString(value ? "true"sv : "false"sv);
}

void Send(long value, CommandContext* cmd_cntx) {
  RB(cmd_cntx)->SendLong(value);
}

void Send(size_t value, CommandContext* cmd_cntx) {
  RB(cmd_cntx)->SendLong(value);
}

void Send(double value, CommandContext* cmd_cntx) {
  RB(cmd_cntx)->SendDouble(value);
}

void Send(const std::string& value, CommandContext* cmd_cntx) {
  RB(cmd_cntx)->SendBulkString(value);
}

void Send(const std::vector<std::string>& vec, CommandContext* cmd_cntx) {
  Send(vec.begin(), vec.end(), cmd_cntx);
}

template <typename Allocator>
void Send(const JsonWithAllocator<Allocator>& value, CommandContext* cmd_cntx) {
  auto* rb = RB(cmd_cntx);
  if (value.is_double()) {
    Send(value.as_double(), cmd_cntx);
  } else if (value.is_number()) {
    Send(value.template as_integer<long>(), cmd_cntx);
  } else if (value.is_bool()) {
    rb->SendSimpleString(value.as_bool() ? "true" : "false");
  } else if (value.is_null()) {
    rb->SendNull();
  } else if (value.is_string()) {
    rb->SendBulkString(value.as_string_view());
  } else if (value.is_object()) {
    rb->StartArray(value.size() + 1);
    rb->SendSimpleString("{");
    for (const auto& item : value.object_range()) {
      rb->StartArray(2);
      rb->SendBulkString(item.key());
      Send(item.value(), cmd_cntx);
    }
  } else if (value.is_array()) {
    if (rb->IsResp3()) {
      rb->StartArray(value.size());
      for (const auto& item : value.array_range()) {
        Send(item, cmd_cntx);
      }
    } else {
      rb->StartArray(value.size() + 1);
      rb->SendSimpleString("[");
      for (const auto& item : value.array_range()) {
        Send(item, cmd_cntx);
      }
    }
  }
}

template <typename T> void Send(const std::optional<T>& opt, CommandContext* cmd_cntx) {
  if (opt.has_value()) {
    Send(opt.value(), cmd_cntx);
  } else {
    RB(cmd_cntx)->SendNull();
  }
}

template <typename I> void Send(I begin, I end, CommandContext* cmd_cntx) {
  RedisReplyBuilder* rb = RB(cmd_cntx);
  RedisReplyBuilder::ReplyScope scope{rb};
  if (begin == end) {
    rb->SendEmptyArray();
  } else {
    if constexpr (is_same_v<decltype(*begin), const string>) {
      rb->SendBulkStrArr(cmn::OwnedArgSlice{begin, end});
    } else {
      rb->StartArray(end - begin);
      for (auto i = begin; i != end; ++i) {
        Send(*i, cmd_cntx);
      }
    }
  }
}

template <typename T> void Send(const JsonCallbackResult<T>& result, CommandContext* cmd_cntx) {
  RedisReplyBuilder* rb = RB(cmd_cntx);
  if (result.ShouldSendNil())
    return rb->SendNull();
  if (result.ShouldSendWrongType())
    return cmd_cntx->SendError(OpStatus::WRONG_JSON_TYPE);

  if (result.IsV1()) {
    /* The specified path was restricted (JSON legacy mode), then the result consists only of a
     * single value */
    if (rb->IsResp3()) {
      rb->StartArray(1);
    }
    Send(result.AsV1(), cmd_cntx);
  } else {
    /* The specified path was enhanced (starts with '$'), then the result is an array of multiple
     * values */
    const auto& arr = result.AsV2();
    if (rb->IsResp3()) {
      rb->StartArray(arr.size());
      for (const auto& item : arr) {
        // For JSON.TYPE (std::string), preserve nested array behavior for compatibility
        if constexpr (std::is_same_v<T, std::string>) {
          rb->StartArray(1);
        }
        Send(item, cmd_cntx);
      }
    } else {
      Send(arr.begin(), arr.end(), cmd_cntx);
    }
  }
}

template <typename T> void Send(const OpResult<T>& result, CommandContext* cmd_cntx) {
  if (result) {
    RedisReplyBuilder::ReplyScope scope{cmd_cntx->rb()};
    Send(result.value(), cmd_cntx);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void SendJsonString(const OpResult<string>& result, CommandContext* cmd_cntx) {
  if (result) {
    RedisReplyBuilder::ReplyScope scope{cmd_cntx->rb()};
    RedisReplyBuilder* rb = RB(cmd_cntx);
    const string& json_str = result.value();
    if (rb->IsResp3()) {
      if (const std::optional<TmpJson> parsed_json = JsonFromString(json_str)) {
        Send(parsed_json.value(), cmd_cntx);
        return;
      }
    }
    Send(json_str, cmd_cntx);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

}  // namespace reply_generic

using OptSize = optional<size_t>;
using SavingOrder = CallbackResultOptions::SavingOrder;
using OnEmpty = CallbackResultOptions::OnEmpty;

struct JsonGetParams {
  std::optional<std::string> indent;
  std::optional<std::string> new_line;
  std::optional<std::string> space;
  bool no_escape = false;  // Flag for NOESCAPE option
  std::vector<std::pair<std::string_view, WrappedJsonPath>> paths;
};

std::optional<JsonGetParams> ParseJsonGetParams(CmdArgParser* parser, SinkReplyBuilder* builder) {
  JsonGetParams parsed_args;
  while (parser->HasNext()) {
    if (parser->Check("NOESCAPE")) {
      parsed_args.no_escape = true;
    } else if (parser->Check("SPACE")) {
      parsed_args.space = parser->Next();
    } else if (parser->Check("NEWLINE")) {
      parsed_args.new_line = parser->Next();
    } else if (parser->Check("INDENT")) {
      parsed_args.indent = parser->Next();
    } else {
      std::string_view path_str = parser->Next();

      auto json_path = ParseJsonPath(path_str);
      if (!json_path) {
        builder->SendError(json_path.error());
        return std::nullopt;
      }

      parsed_args.paths.emplace_back(path_str, std::move(json_path).value());
    }
  }
  return parsed_args;
}

// This method makes a comparison of json considering their types
// For example, 3 != 3.0 because json_type::int64_value != json_type::double_value
bool JsonAreEquals(const JsonType& lhs, const JsonType& rhs) {
  if (lhs.type() != rhs.type()) {
    return false;
  }
  switch (lhs.type()) {
    case json_type::array_value: {
      if (lhs.size() != rhs.size()) {
        return false;
      }

      auto rhs_array = rhs.array_range();
      for (auto l_it = lhs.array_range().begin(), r_it = rhs_array.begin(); r_it != rhs_array.end();
           ++r_it, ++l_it) {
        if (!JsonAreEquals(*l_it, *r_it)) {
          return false;
        }
      }
      return true;
    }

    case json_type::object_value: {
      if (lhs.size() != rhs.size()) {
        return false;
      }
      return std::all_of(
          lhs.object_range().begin(), lhs.object_range().end(), [&](const auto& l_it) {
            auto r_it = rhs.find(l_it.key());
            return r_it != rhs.object_range().end() && JsonAreEquals(l_it.value(), r_it->value());
          });
    }

    default:
      return lhs == rhs;
  }
}

/* Converts a JSONPath to a JSONPointer.
   E.g. $[a][b][0] -> /a/b/0.
   V1 JSONPath is not supported. */
std::optional<std::string> ConvertJsonPathToJsonPointer(string_view json_path) {
  auto parsed_path = json::ParsePath(json_path);

  if (!parsed_path) {
    VLOG(2) << "Error during conversion of JSONPath to JSONPointer: " << json_path
            << ". Invalid JSONPath.";
    return std::nullopt;
  }

  std::string pointer;
  const auto& path = parsed_path.value();
  for (const auto& node : path) {
    const auto& type = node.type();
    if (type == json::SegmentType::IDENTIFIER) {
      absl::StrAppend(&pointer, "/"sv, node.identifier());
    } else if (type == json::SegmentType::INDEX) {
      const auto& index = node.index();

      if (index.first != index.second) {
        VLOG(2) << "Error during conversion of JSONPath to JSONPointer: " << json_path
                << ". Index range is not supported.";
        return std::nullopt;
      }

      absl::StrAppend(&pointer, "/"sv, node.index().first);
    } else {
      VLOG(2) << "Error during conversion of JSONPath to JSONPointer: " << json_path
              << ". Unsupported segment type.";
      return std::nullopt;
    }
  }

  return pointer;
}

/* Use this method on the shard thread

   If you do memory tracking, make sure to initialize it before calling this method, and reset the
   result before invoking SetJsonSize. Note that even after calling std::move on an optional, it may
   still hold the JSON value, which can lead to incorrect memory tracking. */
std::optional<JsonType> ShardJsonFromString(std::string_view input) {
  return ParseJsonUsingShardHeap(input);
}

OpStatus SetFullJson(const OpArgs& op_args, string_view key, string_view json_str) {
  // We check the type of the object later, because we allow here OBJ_JSON and OBJ_STRING
  auto it_res = op_args.GetDbSlice().AddOrFind(op_args.db_cntx, key, std::nullopt);
  RETURN_ON_BAD_STATUS(it_res);

  auto type = it_res->it->second.ObjType();
  if (type == OBJ_JSON) {
    // If it json we need to remove the old json object from the indexes
    op_args.shard->search_indices()->RemoveDoc(key, op_args.db_cntx, it_res->it->second);
  } else if (type != OBJ_STRING) {
    // The object is not a JSON object and not a string, so we cannot set a full JSON value
    return OpStatus::WRONG_TYPE;
  }

  const bool is_new_key = it_res->is_new;

  // AddOrFind for Add case has type == OBJ_STRING.
  // We either added a new key (is_new_key is true) or found a pre-existing (string).
  // For both cases we must reset the object before we set up the JsonAutoUpdater.
  // *note* that ShardJsonFromString is called twice. This *parses and allocates* the
  // same JSON object twice and might impact performance of large json strings.
  if (type != OBJ_JSON) {
    if (!ShardJsonFromString(json_str)) {
      if (is_new_key) {
        // Delete the key if it was created during this operation to avoid
        // an orphan (leftover empty key).
        auto& db_slice = op_args.GetDbSlice();
        db_slice.DelMutable(op_args.db_cntx, std::move(*it_res));
      }
      VLOG(1) << "got invalid JSON string '" << json_str << "' cannot be saved";
      return OpStatus::INVALID_JSON;
    }
    it_res->it->second.Reset();
  }

  JsonAutoUpdater updater(op_args, key, *std::move(it_res),
                          {.disable_indexing = true, .update_on_delete = false});

  {
    std::optional<JsonType> parsed_json = ShardJsonFromString(json_str);
    if (!parsed_json) {
      VLOG(1) << "got invalid JSON string '" << json_str << "' cannot be saved";
      if (type == OBJ_JSON) {
        // We need to add the document to the indexes, because we removed it before
        op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, &updater.GetPrimeValue());
      }
      if (is_new_key) {
        auto& db_slice = op_args.GetDbSlice();
        db_slice.DelMutable(op_args.db_cntx, updater.Release());
      }
      return OpStatus::INVALID_JSON;
    }

    op_args.GetDbSlice().RemoveExpire(op_args.db_cntx.db_index, updater.GetIterator());

    if (JsonEnconding() == kEncodingJsonFlat) {
      flexbuffers::Builder fbb;
      json::FromJsonType(*parsed_json, &fbb);
      fbb.Finish();
      const auto& buf = fbb.GetBuffer();
      updater.GetPrimeValue().SetJson(buf.data(), buf.size());
    } else {
      updater.GetPrimeValue().SetJson(std::move(*parsed_json));
    }

    // We should reset parsed_json before setting the size of the json, because
    // std::optional still holds the value and it will be deallocated
  }
  updater.SetJsonSize();

  // We need to manually run add document here
  op_args.shard->search_indices()->AddDoc(key, op_args.db_cntx, &updater.GetPrimeValue());

  return OpStatus::OK;
}

/* Sets a partial JSON value at the specified path.
   True means that the value was set, false means that the value was not set. */
OpResult<bool> SetPartialJson(const OpArgs& op_args, string_view key,
                              const WrappedJsonPath& json_path, string_view json_str,
                              bool is_nx_condition, bool is_xx_condition) {
  auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_JSON);
  RETURN_ON_BAD_STATUS(it_res);

  JsonAutoUpdater updater(op_args, key, *std::move(it_res));

  /* This method would use copy for parsed_json and not move!
     The reason being, that we are applying this multiple times for each match we found.
     So for example if we have an array that this expression will match each entry in it then the
     assign here is called N times. */
  std::optional<JsonType> parsed_json = ShardJsonFromString(json_str);
  if (!parsed_json) {
    VLOG(1) << "got invalid JSON string '" << json_str << "' cannot be saved";
    return OpStatus::INVALID_JSON;
  }

  bool path_exists = false;
  bool value_was_set = false;

  // If the path exists, this callback will be called
  auto mutate_cb = [&](std::optional<std::string_view>, JsonType* val) -> MutateCallbackResult<> {
    path_exists = true;
    if (!is_nx_condition) {
      value_was_set = true;
      *val = JsonType(parsed_json.value(), StatelessAllocator<char>{});
    }
    return {};
  };

  auto mutate_res = json_path.ExecuteMutateCallback<Nothing>(
      updater.GetJson(), mutate_cb, CallbackResultOptions::DefaultMutateOptions());

  // Set a new value if the path doesn't exist and the xx condition is not set.
  if (mutate_res && !path_exists && !is_xx_condition) {
    auto pointer = ConvertJsonPathToJsonPointer(json_path.Path());
    if (!pointer) {
      return OpStatus::SYNTAX_ERR;
    }

    std::error_code ec;
    jsoncons::jsonpointer::add(*updater.GetJson(), pointer.value(), std::move(parsed_json).value(),
                               ec);
    if (ec) {
      VLOG(1) << "Failed to add a JSON value to the following path: " << json_str
              << " with the error: " << ec.message();
      return OpStatus::SYNTAX_ERR;
    }

    value_was_set = true;
  }

  if (value_was_set) {
    // We should do reset before setting the size of the json, because
    // std::optional still holds the value and it will be deallocated
    parsed_json.reset();
    updater.SetJsonSize();
  }

  return value_was_set;
}

size_t NormalizeNegativeIndex(int index, size_t size) {
  if (index >= 0) {
    return index;
  }

  if (static_cast<size_t>(-index) > size) {
    return 0;
  }
  return size + index;
}

auto GetJsonArrayIterator(JsonType* val, size_t index) {
  return std::next(val->array_range().begin(), static_cast<ptrdiff_t>(index));
}

auto GetJsonArrayIterator(const JsonType& val, size_t index) {
  return std::next(val.array_range().begin(), static_cast<ptrdiff_t>(index));
}

string JsonTypeToName(const JsonType& val) {
  using namespace std::string_literals;

  if (val.is_null()) {
    return "null"s;
  } else if (val.is_bool()) {
    return "boolean"s;
  } else if (val.is_string()) {
    return "string"s;
  } else if (val.is_int64() || val.is_uint64()) {
    return "integer"s;
  } else if (val.is_number()) {
    return "number"s;
  } else if (val.is_object()) {
    return "object"s;
  } else if (val.is_array()) {
    return "array"s;
  }

  return std::string{};
}

// Returns the index of the next right bracket
OptSize GetNextIndex(string_view str) {
  size_t current_idx = 0;
  while (current_idx + 1 < str.size()) {
    // ignore escaped character after the backslash (e.g. \').
    if (str[current_idx] == '\\') {
      current_idx += 2;
    } else if (str[current_idx] == '\'' && str[current_idx + 1] == ']') {
      return current_idx;
    } else {
      current_idx++;
    }
  }

  return nullopt;
}

// Encodes special characters when appending token to JSONPointer
struct JsonPointerFormatter {
  void operator()(std::string* out, string_view token) const {
    for (size_t i = 0; i < token.size(); i++) {
      char ch = token[i];
      if (ch == '~') {
        out->append("~0");
      } else if (ch == '/') {
        out->append("~1");
      } else if (ch == '\\') {
        // backslash for encoded another character should remove.
        if (i + 1 < token.size() && token[i + 1] == '\\') {
          out->append(1, '\\');
          i++;
        }
      } else {
        out->append(1, ch);
      }
    }
  }
};

// Returns the JsonPointer of a JsonPath
// e.g. $[a][b][0] -> /a/b/0
string ConvertToJsonPointer(string_view json_path) {
  if (json_path.empty() || json_path[0] != '$') {
    LOG(FATAL) << "Unexpected JSONPath syntax: " << json_path;
  }

  // remove prefix
  json_path.remove_prefix(1);

  // except the supplied string is compatible with JSONPath syntax.
  // Each item in the string is a left bracket followed by
  // numeric or '<key>' and then a right bracket.
  vector<string_view> parts;
  bool invalid_syntax = false;
  while (!json_path.empty()) {
    bool is_array = false;
    bool is_object = false;

    // check string size is sufficient enough for at least one item.
    if (2 >= json_path.size()) {
      invalid_syntax = true;
      break;
    }

    if (json_path[0] == '[') {
      if (json_path[1] == '\'') {
        is_object = true;
        json_path.remove_prefix(2);
      } else if (isdigit(json_path[1])) {
        is_array = true;
        json_path.remove_prefix(1);
      } else {
        invalid_syntax = true;
        break;
      }
    } else {
      invalid_syntax = true;
      break;
    }

    if (is_array) {
      size_t end_val_idx = json_path.find(']');
      if (end_val_idx == string::npos) {
        invalid_syntax = true;
        break;
      }

      parts.emplace_back(json_path.substr(0, end_val_idx));
      json_path.remove_prefix(end_val_idx + 1);
    } else if (is_object) {
      OptSize end_val_idx = GetNextIndex(json_path);
      if (!end_val_idx) {
        invalid_syntax = true;
        break;
      }

      parts.emplace_back(json_path.substr(0, *end_val_idx));
      json_path.remove_prefix(*end_val_idx + 2);
    } else {
      invalid_syntax = true;
      break;
    }
  }

  if (invalid_syntax) {
    LOG(FATAL) << "Unexpected JSONPath syntax: " << json_path;
  }

  string result{"/"};  // initialize with a leading slash
  result += absl::StrJoin(parts, "/", JsonPointerFormatter());
  return result;
}

size_t CountJsonFields(const JsonType& j) {
  size_t res = 0;
  json_type type = j.type();
  if (type == json_type::array_value) {
    res += j.size();
    for (const auto& item : j.array_range()) {
      if (item.type() == json_type::array_value || item.type() == json_type::object_value) {
        res += CountJsonFields(item);
      }
    }

  } else if (type == json_type::object_value) {
    res += j.size();
    for (const auto& item : j.object_range()) {
      if (item.value().type() == json_type::array_value ||
          item.value().type() == json_type::object_value) {
        res += CountJsonFields(item.value());
      }
    }

  } else {
    res += 1;
  }

  return res;
}

struct ReadOnlyOperationOptions {
  bool return_nil_if_key_not_found = false;
  CallbackResultOptions cb_result_options = CallbackResultOptions::DefaultReadOnlyOptions();
};

template <typename T>
OpResult<JsonCallbackResult<T>> JsonReadOnlyOperation(const OpArgs& op_args, std::string_view key,
                                                      const WrappedJsonPath& json_path,
                                                      JsonPathReadOnlyCallback<T> cb,
                                                      ReadOnlyOperationOptions options = {}) {
  auto it_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_JSON);

  if (!it_res) {
    if (options.return_nil_if_key_not_found && it_res == OpStatus::KEY_NOTFOUND) {
      return JsonCallbackResult<T>{{CallbackResultOptions::OnEmpty::kSendNil,
                                    options.cb_result_options.saving_order,
                                    JsonPathType::kLegacy}};  // set legacy mode to return nil
    }
    return it_res.status();
  }

  JsonType* json_val = it_res.value()->second.GetJson();
  DCHECK(json_val) << "should have a valid JSON object for key " << key;

  return json_path.ExecuteReadOnlyCallback<T>(json_val, cb, options.cb_result_options);
}

template <typename T>
OpResult<JsonCallbackResult<optional<T>>> JsonMutateOperation(
    const OpArgs& op_args, std::string_view key, const WrappedJsonPath& json_path,
    JsonPathMutateCallback<T> cb,
    CallbackResultOptions cb_result_options = CallbackResultOptions::DefaultMutateOptions()) {
  auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_JSON);
  RETURN_ON_BAD_STATUS(it_res);

  JsonAutoUpdater updater(op_args, key, *std::move(it_res));

  auto mutate_res = json_path.ExecuteMutateCallback(updater.GetJson(), cb, cb_result_options);

  updater.SetJsonSize();

  return mutate_res;
}

bool LegacyModeIsEnabled(const std::vector<std::pair<std::string_view, WrappedJsonPath>>& paths) {
  return std::all_of(paths.begin(), paths.end(),
                     [](auto& parsed_path) { return parsed_path.second.IsLegacyModePath(); });
}

OpResult<std::string> OpJsonGet(const OpArgs& op_args, string_view key,
                                const JsonGetParams& params) {
  // We don't use OBJ_JSON here because we want to support both JSON and STRING types.
  // If the key is not OBJ_JSON and not OBJ_STRING, we return WRONG_TYPE.
  auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key).it;
  if (!IsValid(it))
    return OpStatus::KEY_NOTFOUND;

  const JsonType* json_ptr = nullptr;
  JsonType json;
  if (it->second.ObjType() == OBJ_JSON) {
    json_ptr = it->second.GetJson();
  } else if (it->second.ObjType() == OBJ_STRING) {
    string tmp;
    it->second.GetString(&tmp);
    auto parsed_json = ShardJsonFromString(tmp);
    if (!parsed_json) {
      return OpStatus::WRONG_TYPE;
    }
    json.swap(*parsed_json);
    json_ptr = &json;
  } else {
    return OpStatus::WRONG_TYPE;
  }

  const auto& paths = params.paths;
  const JsonType& json_entry = *json_ptr;

  if (paths.empty()) {
    // this implicitly means that we're using . which
    // means we just brings all values
    return json_entry.to_string();
  }

  json_options options;
  options.spaces_around_comma(spaces_option::no_spaces)
      .spaces_around_colon(spaces_option::no_spaces)
      .object_array_line_splits(line_split_kind::multi_line)
      .indent_size(0)
      .new_line_chars("");

  if (params.indent) {
    options.indent_size(1);
    options.indent_chars(params.indent.value());
  }

  if (params.new_line) {
    options.new_line_chars(params.new_line.value());
  }

  if (params.space) {
    options.after_key_chars(params.space.value());
  }

  auto cb = [](std::string_view, const JsonType& val) { return val; };

  const bool legacy_mode_is_enabled = LegacyModeIsEnabled(paths);
  CallbackResultOptions cb_options = CallbackResultOptions::DefaultReadOnlyOptions();
  cb_options.path_type = legacy_mode_is_enabled ? JsonPathType::kLegacy : JsonPathType::kV2;

  auto eval_wrapped = [&](const WrappedJsonPath& json_path) -> std::optional<JsonType> {
    auto eval_result = json_path.ExecuteReadOnlyCallback<JsonType>(&json_entry, cb, cb_options);

    DCHECK(legacy_mode_is_enabled == eval_result.IsV1());

    if (eval_result.IsV1()) {
      if (eval_result.Empty())
        return nullopt;
      return eval_result.AsV1();
    }

    return JsonType{eval_result.AsV2()};
  };

  JsonType out{
      jsoncons::json_object_arg};  // see https://github.com/danielaparker/jsoncons/issues/482
  if (paths.size() == 1) {
    auto eval_result = eval_wrapped(paths[0].second);
    if (!eval_result) {
      return OpStatus::INVALID_JSON_PATH;
    }
    out = std::move(eval_result).value();  // TODO(Print not existing path to the user)
  } else {
    for (const auto& [path_str, path] : paths) {
      auto eval_result = eval_wrapped(path);
      if (legacy_mode_is_enabled && !eval_result) {
        return OpStatus::INVALID_JSON_PATH;
      }
      out[path_str] = std::move(eval_result).value();  // TODO(Print not existing path to the user)
    }
  }

  jsoncons::json_printable jp(out, options, jsoncons::indenting::indent);
  std::stringstream ss;
  jp.dump(ss);
  return ss.str();
}

auto OpType(const OpArgs& op_args, string_view key, const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> std::string {
    return JsonTypeToName(val);
  };
  return JsonReadOnlyOperation<std::string>(op_args, key, json_path, std::move(cb), {true});
}

OpResult<JsonCallbackResult<OptSize>> OpStrLen(const OpArgs& op_args, string_view key,
                                               const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> OptSize {
    if (val.is_string()) {
      return val.as_string_view().size();
    } else {
      return nullopt;
    }
  };
  return JsonReadOnlyOperation<OptSize>(
      op_args, key, json_path, std::move(cb),
      {json_path.IsLegacyModePath(),
       CallbackResultOptions::DefaultReadOnlyOptions(SavingOrder::kSaveFirst)});
}

OpResult<JsonCallbackResult<OptSize>> OpObjLen(const OpArgs& op_args, string_view key,
                                               const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> optional<size_t> {
    if (val.is_object()) {
      return val.size();
    } else {
      return nullopt;
    }
  };
  return JsonReadOnlyOperation<OptSize>(
      op_args, key, json_path, std::move(cb),
      {json_path.IsLegacyModePath(),
       CallbackResultOptions::DefaultReadOnlyOptions(SavingOrder::kSaveFirst)});
}

OpResult<JsonCallbackResult<OptSize>> OpArrLen(const OpArgs& op_args, string_view key,
                                               const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> OptSize {
    if (val.is_array()) {
      return val.size();
    } else {
      return std::nullopt;
    }
  };
  return JsonReadOnlyOperation<OptSize>(
      op_args, key, json_path, std::move(cb),
      {true, CallbackResultOptions::DefaultReadOnlyOptions(SavingOrder::kSaveFirst)});
}

template <typename T>
auto OpToggle(const OpArgs& op_args, string_view key,
              const WrappedJsonPath& json_path) {  // TODO(change the output type for enhanced path)
  auto cb = [](std::optional<std::string_view>,
               JsonType* val) -> MutateCallbackResult<std::optional<T>> {
    if (val->is_bool()) {
      bool next_val = val->as_bool() ^ true;
      *val = next_val;
      return {false, next_val};
    }
    return {};
  };
  return JsonMutateOperation<std::optional<T>>(op_args, key, json_path, std::move(cb));
}

template <typename T>
auto ExecuteToggle(string_view key, const WrappedJsonPath& json_path, CommandContext* cmd_cntx) {
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpToggle<T>(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

enum ArithmeticOpType : uint8_t { OP_ADD, OP_MULTIPLY };

void BinOpApply(double num, bool num_is_double, ArithmeticOpType op, JsonType* val,
                bool* overflow) {
  double result = 0;
  switch (op) {
    case OP_ADD:
      result = val->as<double>() + num;
      break;
    case OP_MULTIPLY:
      result = val->as<double>() * num;
      break;
  }

  if (isinf(result)) {
    *overflow = true;
    return;
  }

  if (val->is_double() || num_is_double) {
    *val = result;
  } else {
    *val = static_cast<uint64_t>(result);
  }
  *overflow = false;
}

// Tmp solution with struct CallbackResult, because MutateCallbackResult<std::optional<JsonType>>
// does not compile
struct DoubleArithmeticCallbackResult {
  explicit DoubleArithmeticCallbackResult(bool legacy_mode_is_enabled_)
      : legacy_mode_is_enabled(legacy_mode_is_enabled_) {
    if (!legacy_mode_is_enabled) {
      json_value.emplace(jsoncons::json_array_arg);
    }
  }

  void AddValue(JsonType val) {
    if (legacy_mode_is_enabled) {
      json_value = std::move(val);
    } else {
      json_value->emplace_back(std::move(val));
    }
  }

  void AddEmptyValue() {
    if (!legacy_mode_is_enabled) {
      json_value->emplace_back(JsonType::null());
    }
  }

  std::optional<JsonType> json_value;
  bool legacy_mode_is_enabled;
};

OpResult<string> OpDoubleArithmetic(const OpArgs& op_args, string_view key,
                                    const WrappedJsonPath& json_path, string_view num,
                                    ArithmeticOpType op_type) {
  bool has_fractional_part = num.find('.') != string::npos;
  double double_value = 0;

  if (!ParseDouble(num, &double_value)) {
    VLOG(2) << "Failed to parse number as double: " << num;
    return OpStatus::WRONG_TYPE;
  }

  bool is_result_overflow = false;

  DoubleArithmeticCallbackResult result{json_path.IsLegacyModePath()};
  auto cb = [&](std::optional<std::string_view>, JsonType* val) -> MutateCallbackResult<> {
    if (val->is_number()) {
      bool res = false;
      BinOpApply(double_value, has_fractional_part, op_type, val, &res);
      if (res) {
        is_result_overflow = true;
      } else {
        result.AddValue(*val);
        return {};
      }
    }
    result.AddEmptyValue();
    return {};
  };

  auto res = JsonMutateOperation<Nothing>(op_args, key, json_path, std::move(cb));

  if (is_result_overflow)
    return OpStatus::INVALID_NUMERIC_RESULT;

  RETURN_ON_BAD_STATUS(res);

  if (!result.json_value) {
    return OpStatus::WRONG_JSON_TYPE;
  }
  return result.json_value->as_string();
}

// Deletes items specified by the expression/path.
OpResult<long> OpDel(const OpArgs& op_args, string_view key, string_view path,
                     const WrappedJsonPath& json_path) {
  if (json_path.RefersToRootElement()) {
    auto& db_slice = op_args.GetDbSlice();
    auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_JSON);

    // For JSON.DEL, if key doesn't exist, return 0 instead of error
    if (res_it.status() == OpStatus::KEY_NOTFOUND) {
      return 0;
    }

    RETURN_ON_BAD_STATUS(res_it);

    if (IsValid(res_it->it)) {
      db_slice.DelMutable(op_args.db_cntx, std::move(*res_it));
      return 1;
    }
    return 0;
  }

  // FindMutable because we need to run the AutoUpdater at the end which will account
  // the deltas calculated from the MemoryTracker
  auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_JSON);
  if (!it_res) {
    return 0;
  }

  if (json_path.HoldsJsonPath()) {
    JsonAutoUpdater updater(op_args, key, *std::move(it_res),
                            {.disable_indexing = false, .update_on_delete = true});
    const json::Path& path = json_path.AsJsonPath();
    long deletions = json::DeletePath(path, updater.GetJson());
    return deletions;
  }

  // Allocates memory for the deletion_items.
  // So we need to initialize JsonAutoUpdater after this callback
  vector<string> deletion_items;
  auto cb = [&deletion_items](string_view path, const JsonType& val) -> Nothing {
    deletion_items.emplace_back(path);
    return {};
  };

  auto res = json_path.ExecuteReadOnlyCallback<Nothing>(
      it_res->it->second.GetJson(), cb, CallbackResultOptions::DefaultReadOnlyOptions());
  if (deletion_items.empty()) {
    return 0;
  }

  long total_deletions = 0;
  JsonType patch(jsoncons::json_array_arg, {});
  reverse(deletion_items.begin(), deletion_items.end());  // deletion should finish at root keys.
  for (const auto& item : deletion_items) {
    string pointer = ConvertToJsonPointer(item);
    total_deletions++;
    JsonType patch_item(jsoncons::json_object_arg, {{"op", "remove"}, {"path", pointer}});
    patch.emplace_back(patch_item);
  }

  JsonAutoUpdater updater(op_args, key, *std::move(it_res));

  std::error_code ec;
  jsoncons::jsonpatch::apply_patch(*updater.GetJson(), patch, ec);
  if (ec) {
    VLOG(1) << "Failed to apply patch on json with error: " << ec.message();
    return 0;
  }

  updater.SetJsonSize();

  return total_deletions;
}

// Returns a vector of string vectors,
// keys within the same object are stored in the same string vector.
auto OpObjKeys(const OpArgs& op_args, string_view key, const WrappedJsonPath& json_path) {
  auto cb = [](const string_view& path, const JsonType& val) {
    // Aligned with ElastiCache flavor.
    DVLOG(2) << "path: " << path << " val: " << val.to_string();

    StringVec vec;
    if (val.is_object()) {
      for (const auto& member : val.object_range()) {
        vec.emplace_back(member.key());
      }
    }
    return vec;
  };
  return JsonReadOnlyOperation<StringVec>(
      op_args, key, json_path, std::move(cb),
      {json_path.IsLegacyModePath(),
       CallbackResultOptions::DefaultReadOnlyOptions(SavingOrder::kSaveFirst)});
}

OpResult<JsonCallbackResult<OptSize>> OpStrAppend(const OpArgs& op_args, string_view key,
                                                  const WrappedJsonPath& path, string_view value) {
  auto cb = [&](optional<string_view>, JsonType* val) -> MutateCallbackResult<size_t> {
    if (!val->is_string())
      return {};

    string new_val = absl::StrCat(val->as_string_view(), value);
    size_t len = new_val.size();
    *val = std::move(new_val);
    return {false, len};  // do not delete, new value len
  };
  return JsonMutateOperation<size_t>(op_args, key, path, std::move(cb));
}

// Returns the numbers of values cleared.
// Clears containers(arrays or objects) and zeroing numbers.
OpResult<long> OpClear(const OpArgs& op_args, string_view key, const WrappedJsonPath& path) {
  long clear_items = 0;

  auto cb = [&clear_items](std::optional<std::string_view>,
                           JsonType* val) -> MutateCallbackResult<> {
    if (!(val->is_object() || val->is_array() || val->is_number())) {
      return {};
    }

    if (val->is_object()) {
      val->erase(val->object_range().begin(), val->object_range().end());
    } else if (val->is_array()) {
      val->erase(val->array_range().begin(), val->array_range().end());
    } else if (val->is_number()) {
      *val = 0;
    }

    clear_items += 1;
    return {};
  };

  auto res = JsonMutateOperation<Nothing>(op_args, key, path, std::move(cb));
  RETURN_ON_BAD_STATUS(res);
  return clear_items;
}

// Returns string vector that represents the pop out values.
auto OpArrPop(const OpArgs& op_args, string_view key, WrappedJsonPath& path, int index) {
  auto cb = [index](std::optional<std::string_view>,
                    JsonType* val) -> MutateCallbackResult<std::string> {
    if (!val->is_array() || val->empty()) {
      return {};
    }

    size_t array_size = val->size();
    size_t removal_index = std::min(NormalizeNegativeIndex(index, array_size), array_size - 1);

    auto it = GetJsonArrayIterator(val, removal_index);
    string str;
    error_code ec;
    it->dump(str, {}, ec);
    if (ec) {
      LOG(ERROR) << "Failed to dump JSON to string with the error: " << ec.message();
      return {};
    }

    val->erase(it);
    return {false, std::move(str)};
  };
  return JsonMutateOperation<std::string>(op_args, key, path, std::move(cb),
                                          CallbackResultOptions{OnEmpty::kSendNil});
}

// Returns numeric vector that represents the new length of the array at each path.
auto OpArrTrim(const OpArgs& op_args, string_view key, const WrappedJsonPath& path, int start_index,
               int stop_index) {
  auto cb = [&](optional<string_view>, JsonType* val) -> MutateCallbackResult<size_t> {
    if (!val->is_array()) {
      return {};
    }

    if (val->empty()) {
      return {false, 0};
    }

    size_t array_size = val->size();

    size_t trim_start_index = NormalizeNegativeIndex(start_index, array_size);
    size_t trim_end_index = NormalizeNegativeIndex(stop_index, array_size);

    if (trim_start_index >= array_size || trim_start_index > trim_end_index) {
      val->erase(val->array_range().begin(), val->array_range().end());
      return {false, 0};
    }

    trim_end_index = std::min(trim_end_index, array_size);

    auto trim_start_it = GetJsonArrayIterator(val, trim_start_index);
    auto trim_end_it = val->array_range().end();
    if (trim_end_index < val->size()) {
      trim_end_it = GetJsonArrayIterator(val, trim_end_index + 1);
    }

    *val = jsoncons::json_array<JsonType>(trim_start_it, trim_end_it);
    return {false, val->size()};
  };
  return JsonMutateOperation<size_t>(op_args, key, path, std::move(cb));
}

// Returns numeric vector that represents the new length of the array at each path.
OpResult<JsonCallbackResult<OptSize>> OpArrInsert(const OpArgs& op_args, string_view key,
                                                  const WrappedJsonPath& json_path, int index,
                                                  const vector<string_view>& new_values) {
  vector<JsonType> parsed_values;
  parsed_values.reserve(new_values.size());

  for (const auto& nv : new_values) {
    optional<JsonType> v = ShardJsonFromString(nv);
    if (!v) {
      return OpStatus::SYNTAX_ERR;
    }

    parsed_values.emplace_back(std::move(*v));
  }

  bool out_of_boundaries_encountered = false;

  // Insert user-supplied value into the supplied index that should be valid.
  // If at least one index isn't valid within an array in the json doc, the operation is discarded.
  // Negative indexes start from the end of the array.
  auto cb = [&](std::optional<std::string_view>, JsonType* val) -> MutateCallbackResult<size_t> {
    if (out_of_boundaries_encountered || !val->is_array()) {
      return {};
    }

    size_t array_size = val->size();
    size_t insert_before_index;

    if (index < 0) {
      if (static_cast<size_t>(-index) > array_size) {
        out_of_boundaries_encountered = true;
        return {};
      }
      insert_before_index = array_size + index;
    } else {
      if (static_cast<size_t>(index) > val->size()) {
        out_of_boundaries_encountered = true;
        return {};
      }
      insert_before_index = index;
    }

    auto it = GetJsonArrayIterator(val, insert_before_index);
    for (auto& new_val : parsed_values) {
      it = val->insert(it, new_val);
      it++;
    }
    return {false, val->size()};
  };

  auto res = JsonMutateOperation<size_t>(op_args, key, json_path, std::move(cb));
  if (out_of_boundaries_encountered) {
    return OpStatus::OUT_OF_RANGE;
  }
  return res;
}

OpResult<JsonCallbackResult<optional<optional<unsigned long>>>> OpArrAppend(
    const OpArgs& op_args, string_view key, const WrappedJsonPath& path,
    const vector<string_view>& append_values) {
  vector<JsonType> parsed_values;
  parsed_values.reserve(append_values.size());

  for (const auto& v : append_values) {
    optional<JsonType> parsed = ShardJsonFromString(v);
    if (!parsed) {
      return OpStatus::SYNTAX_ERR;
    }
    parsed_values.emplace_back(std::move(*parsed));
  }

  auto cb = [&](std::optional<std::string_view>,
                JsonType* val) -> MutateCallbackResult<std::optional<std::size_t>> {
    if (!val->is_array()) {
      return {};
    }
    for (auto& new_val : parsed_values) {
      val->emplace_back(new_val);
    }
    return {false, val->size()};
  };
  return JsonMutateOperation<std::optional<std::size_t>>(op_args, key, path, std::move(cb));
}

// Returns a numeric vector representing each JSON value first index of the JSON scalar.
// An index value of -1 represents unfound in the array.
// JSON scalar has types of string, boolean, null, and number.
OpResult<JsonCallbackResult<optional<long>>> OpArrIndex(const OpArgs& op_args, string_view key,
                                                        const WrappedJsonPath& json_path,
                                                        string_view search_val, int start_index,
                                                        int end_index) {
  const optional<JsonType> search_value_json = ShardJsonFromString(search_val);
  if (!search_value_json) {
    return OpStatus::SYNTAX_ERR;
  }

  auto cb = [&](const string_view&, const JsonType& val) -> std::optional<long> {
    if (!val.is_array()) {
      return std::nullopt;
    }

    if (val.empty()) {
      return -1;
    }

    size_t array_size = val.size();

    if (start_index < 0 && static_cast<size_t>(-start_index) > array_size) {
      return -1;
    }

    size_t pos_start_index = NormalizeNegativeIndex(start_index, array_size);
    size_t pos_end_index =
        end_index == 0 ? array_size : NormalizeNegativeIndex(end_index, array_size);

    if (pos_start_index >= array_size && pos_end_index < array_size) {
      return -1;
    }

    pos_start_index = std::min(pos_start_index, array_size - 1);
    pos_end_index = std::min(pos_end_index, array_size - 1);

    if (pos_start_index > pos_end_index) {
      return -1;
    }

    size_t pos = -1;
    auto it = GetJsonArrayIterator(val, pos_start_index);
    while (it != val.array_range().end()) {
      if (JsonAreEquals(search_value_json, *it)) {
        pos = pos_start_index;
        break;
      }

      if (pos_start_index == pos_end_index) {
        break;
      }

      ++it;
      pos_start_index++;
    }

    return pos;
  };

  return JsonReadOnlyOperation<std::optional<long>>(
      op_args, key, json_path, std::move(cb),
      {false, CallbackResultOptions{CallbackResultOptions::OnEmpty::kSendWrongType}});
}

// Returns string vector that represents the query result of each supplied key.
std::vector<std::optional<std::string>> OpJsonMGet(const WrappedJsonPath& json_path,
                                                   const Transaction* t, EngineShard* shard) {
  ShardArgs args = t->GetShardArgs(shard->shard_id());
  DCHECK(!args.Empty());
  std::vector<std::optional<std::string>> response(args.Size());

  auto& db_slice = t->GetDbSlice(shard->shard_id());
  unsigned index = 0;
  for (string_view key : args) {
    auto it_res = db_slice.FindReadOnly(t->GetDbContext(), key, OBJ_JSON);
    auto& dest = response[index++];
    if (!it_res.ok())
      continue;

    JsonType* json_val = it_res.value()->second.GetJson();
    DCHECK(json_val) << "should have a valid JSON object for key " << key;

    auto cb = [](std::string_view, const JsonType& val) { return val; };

    auto eval_wrapped = [&json_val,
                         &cb](const WrappedJsonPath& json_path) -> std::optional<JsonType> {
      auto eval_result = json_path.ExecuteReadOnlyCallback<JsonType>(
          json_val, std::move(cb), CallbackResultOptions::DefaultReadOnlyOptions());

      if (eval_result.IsV1()) {
        if (eval_result.Empty())
          return nullopt;
        return eval_result.AsV1();
      }

      return JsonType{eval_result.AsV2()};
    };

    auto eval_result = eval_wrapped(json_path);

    if (!eval_result) {
      continue;
    }

    std::string str;
    std::error_code ec;
    eval_result->dump(str, {}, ec);
    if (ec) {
      VLOG(1) << "Failed to dump JSON array to string with the error: " << ec.message();
    }

    dest = std::move(str);
  }

  return response;
}

// Returns numeric vector that represents the number of fields of JSON value at each path.
auto OpFields(const OpArgs& op_args, string_view key, const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> std::optional<std::size_t> {
    return CountJsonFields(val);
  };
  return JsonReadOnlyOperation<std::optional<std::size_t>>(op_args, key, json_path, std::move(cb));
}

// Returns numeric vector that represents the memory size in bytes of JSON value at each path.
auto OpMemory(const OpArgs& op_args, string_view key, const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) -> std::optional<std::size_t> {
    return ComputeMemorySize(val);
  };
  return JsonReadOnlyOperation<std::optional<std::size_t>>(
      op_args, key, json_path, std::move(cb),
      ReadOnlyOperationOptions{false, CallbackResultOptions::DefaultReadOnlyOptions()});
}

// Returns json vector that represents the result of the json query. A shard local
// heap allocated JSON cannot be copied and then destroyed on another shard because we use stateless
// allocators which forward all requests to thread local memory resource. This resource is
// initialized by the engine shard, and it is possible that the coordinator thread may not have this
// resource initialized. So the value is first copied to the std allocator-backed type TmpJson.
OpResult<JsonCallbackResult<TmpJson>> OpResp(const OpArgs& op_args, string_view key,
                                             const WrappedJsonPath& json_path) {
  auto cb = [](const string_view&, const JsonType& val) {
    string s;
    val.dump(s);
    return JsonFromString(s);
  };
  return JsonReadOnlyOperation<TmpJson>(op_args, key, json_path, std::move(cb));
}

// Returns boolean that represents the result of the operation.
OpResult<bool> OpSet(const OpArgs& op_args, string_view key, string_view path,
                     const WrappedJsonPath& json_path, std::string_view json_str,
                     bool is_nx_condition, bool is_xx_condition) {
  // The whole key should be replaced.
  // NOTE: unlike in Redis, we are overriding the value when the path is "$"
  // this is regardless of the current key type. In redis if the key exists
  // and its not JSON, it would return an error.
  if (json_path.RefersToRootElement()) {
    if (is_nx_condition || is_xx_condition) {
      auto it_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_JSON);
      bool key_exists = (it_res.status() != OpStatus::KEY_NOTFOUND);
      if (is_nx_condition && key_exists) {
        return false;
      }

      if (is_xx_condition && !key_exists) {
        return false;
      }
    }

    OpStatus result = SetFullJson(op_args, key, json_str);
    if (result == OpStatus::OK) {
      return true;
    }
    return result;
  }

  return SetPartialJson(op_args, key, json_path, json_str, is_nx_condition, is_xx_condition);
}

OpResult<bool> OpSet(const OpArgs& op_args, string_view key, string_view path,
                     std::string_view json_str, bool is_nx_condition, bool is_xx_condition) {
  auto res_json_path = ParseJsonPath(path);
  if (!res_json_path) {
    return OpStatus::SYNTAX_ERR;  // TODO(Return initial error)
  }
  return OpSet(op_args, key, path, res_json_path.value(), json_str, is_nx_condition,
               is_xx_condition);
}

OpStatus OpMSet(const OpArgs& op_args, const ShardArgs& args) {
  DCHECK_EQ(args.Size() % 3, 0u);

  OpStatus result = OpStatus::OK;
  size_t stored = 0;
  for (auto it = args.begin(); it != args.end();) {
    string_view key = *(it++);
    string_view path = *(it++);
    string_view value = *(it++);
    if (auto res = OpSet(op_args, key, path, value, false, false); !res.ok()) {
      result = res.status();
      break;
    }

    stored++;
  }

  // Replicate custom journal, see OpMSet
  if (auto journal = op_args.shard->journal(); journal) {
    if (stored * 3 == args.Size()) {
      RecordJournal(op_args, "JSON.MSET", args, op_args.tx->GetUniqueShardCnt());
      DCHECK_EQ(result, OpStatus::OK);
      return result;
    }

    string_view cmd = stored == 0 ? "PING" : "JSON.MSET";
    vector<string_view> store_args(args.begin(), args.end());
    store_args.resize(stored * 3);
    RecordJournal(op_args, cmd, store_args, op_args.tx->GetUniqueShardCnt());
  }

  return result;
}

// Note that currently OpMerge works only with jsoncons and json::Path support has not been
// implemented yet.
OpStatus OpMerge(const OpArgs& op_args, string_view key, string_view path,
                 const WrappedJsonPath& json_path, std::string_view json_str) {
  auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_JSON);
  OpStatus res_status = it_res.status();

  if (res_status == OpStatus::OK) {
    JsonAutoUpdater updater(op_args, key, *std::move(it_res));

    std::optional<JsonType> parsed_json = ShardJsonFromString(json_str);
    if (!parsed_json) {
      VLOG(1) << "got invalid JSON string '" << json_str << "' cannot be saved";
      return OpStatus::INVALID_JSON;
    }

    auto cb = [&](std::optional<std::string_view> cur_path,
                  JsonType* val) -> MutateCallbackResult<> {
      string_view strpath = cur_path ? *cur_path : string_view{};
      DVLOG(2) << "Handling " << strpath << " " << val->to_string();

      // https://datatracker.ietf.org/doc/html/rfc7386#section-2
      try {
        mergepatch::apply_merge_patch(*val, *parsed_json);
      } catch (const std::exception& e) {
        LOG_EVERY_T(ERROR, 1) << "Exception in OpMerge: " << e.what() << " with obj: " << *val
                              << " and patch: " << *parsed_json << ", path: " << strpath;
      }

      return {};
    };

    auto opts = CallbackResultOptions::DefaultMutateOptions();
    auto res = json_path.ExecuteMutateCallback<Nothing>(updater.GetJson(), cb, opts);
    parsed_json.reset();
    updater.SetJsonSize();

    res_status = res.status();
  }

  if (res_status != OpStatus::KEY_NOTFOUND)
    return res_status;

  if (json_path.RefersToRootElement()) {
    return OpSet(op_args, key, path, json_path, json_str, false, false).status();
  }
  return OpStatus::SYNTAX_ERR;
}

void CmdSet(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto [key, path, json_str] = parser.Next<string_view, string_view, string_view>();
  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto res = parser.TryMapNext("NX", 1, "XX", 2);
  bool is_xx_condition = (res == 2), is_nx_condition = (res == 1);

  if (parser.TakeError() || parser.HasNext())  // also clear the parser error dcheck
    return builder->SendError(kSyntaxErr);

  auto cb = [&, &key = key, &path = path, &json_str = json_str](Transaction* t,
                                                                EngineShard* shard) {
    return OpSet(t->GetOpArgs(shard), key, path, json_path, json_str, is_nx_condition,
                 is_xx_condition);
  };

  OpResult<bool> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result) {
    if (*result) {
      builder->SendOk();
    } else {
      builder->SendNull();
    }
  } else {
    cmd_cntx->SendError(result.status());
  }
}

// JSON.MSET key path value [key path value ...]
void CmdMSet(CmdArgList args, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 3u);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (args.size() % 3 != 0) {
    return builder->SendError(facade::WrongNumArgsError("json.mset"));
  }

  AggregateStatus status;
  auto cb = [&status](Transaction* t, EngineShard* shard) {
    auto op_args = t->GetOpArgs(shard);
    ShardArgs args = t->GetShardArgs(shard->shard_id());
    if (auto result = OpMSet(op_args, args); result != OpStatus::OK)
      status = result;
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(cb);

  if (*status != OpStatus::OK)
    return cmd_cntx->SendError(*status);
  builder->SendOk();
}

// JSON.MERGE key path value
// Based on https://datatracker.ietf.org/doc/html/rfc7386 spec
void CmdMerge(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.Next();
  string_view value = parser.Next();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpMerge(t->GetOpArgs(shard), key, path, json_path, value);
  };

  OpStatus status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (status == OpStatus::OK)
    return builder->SendOk();
  cmd_cntx->SendError(status);
}

void CmdResp(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpResp(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdDebug(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view command = parser.Next();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (absl::EqualsIgnoreCase(command, "help")) {
    builder->StartArray(3);
    builder->SendBulkString(
        "JSON.DEBUG MEMORY <key> [path] - report memory size (bytes) of the JSON element. "
        "Path defaults to root if not provided.");
    builder->SendBulkString(
        "JSON.DEBUG FIELDS <key> [path] - report number of fields in the JSON element. "
        "Path defaults to root if not provided.");
    builder->SendBulkString("JSON.DEBUG HELP - print help message.");
    return;
  }

  if (absl::EqualsIgnoreCase(command, "memory")) {
    // JSON.DEBUG MEMORY
    string_view key = parser.Next();
    string_view path = parser.NextOrDefault();

    WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

    ShardId sid = Shard(key, shard_set->size());
    ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
    auto cb = [&]() {
      EngineShard* shard = EngineShard::tlocal();
      DbContext db_cntx{cntx->ns, cntx->conn_state.db_index};
      OpArgs op_args{shard, nullptr, db_cntx};
      return OpMemory(op_args, key, json_path);
    };

    auto result = shard_set->Await(sid, std::move(cb));
    reply_generic::Send(result, cmd_cntx);
    return;
  }

  if (absl::EqualsIgnoreCase(command, "fields")) {
    // JSON.DEBUG FIELDS
    string_view key = parser.Next();
    string_view path = parser.NextOrDefault();

    WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

    ShardId sid = Shard(key, shard_set->size());
    ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
    auto cb = [&]() {
      EngineShard* shard = EngineShard::tlocal();
      DbContext db_cntx{cntx->ns, cntx->conn_state.db_index};
      OpArgs op_args{shard, nullptr, db_cntx};
      return OpFields(op_args, key, json_path);
    };

    auto result = shard_set->Await(sid, std::move(cb));
    reply_generic::Send(result, cmd_cntx);
    return;
  }

  builder->SendError(facade::UnknownSubCmd(command, "JSON.DEBUG"), facade::kSyntaxErrType);
}

void CmdMGet(CmdArgList args, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 1U);

  string_view path = ArgS(args, args.size() - 1);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  unsigned shard_count = shard_set->size();
  std::vector<std::vector<std::optional<std::string>>> mget_resp(shard_count);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardId sid = shard->shard_id();
    mget_resp[sid] = OpJsonMGet(json_path, t, shard);
    return OpStatus::OK;
  };

  OpStatus result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  CHECK_EQ(OpStatus::OK, result);

  std::vector<std::optional<std::string>> results(args.size() - 1);
  for (ShardId sid = 0; sid < shard_count; ++sid) {
    if (!cmd_cntx->tx()->IsActive(sid))
      continue;

    std::vector<std::optional<std::string>>& res = mget_resp[sid];
    ShardArgs shard_args = cmd_cntx->tx()->GetShardArgs(sid);
    unsigned src_index = 0;
    for (auto it = shard_args.begin(); it != shard_args.end(); ++it, ++src_index) {
      if (!res[src_index])
        continue;

      uint32_t dst_indx = it.index();
      results[dst_indx] = std::move(res[src_index]);
    }
  }

  reply_generic::Send(results.begin(), results.end(), cmd_cntx);
}

void CmdArrIndex(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.Next();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  string_view search_value = parser.Next();

  int start_index = 0;
  if (parser.HasNext()) {
    if (!absl::SimpleAtoi(parser.Next(), &start_index)) {
      VLOG(1) << "Failed to convert the start index to numeric" << ArgS(args, 3);
      builder->SendError(kInvalidIntErr);
      return;
    }
  }

  int end_index = 0;
  if (parser.HasNext()) {
    if (!absl::SimpleAtoi(parser.Next(), &end_index)) {
      VLOG(1) << "Failed to convert the stop index to numeric" << ArgS(args, 4);
      builder->SendError(kInvalidIntErr);
      return;
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrIndex(t->GetOpArgs(shard), key, json_path, search_value, start_index, end_index);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdArrInsert(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);
  int index = -1;

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!absl::SimpleAtoi(ArgS(args, 2), &index)) {
    VLOG(1) << "Failed to convert the following value to numeric: " << ArgS(args, 2);
    builder->SendError(kInvalidIntErr);
    return;
  }

  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  vector<string_view> new_values;
  for (size_t i = 3; i < args.size(); i++) {
    new_values.emplace_back(ArgS(args, i));
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrInsert(t->GetOpArgs(shard), key, json_path, index, new_values);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdArrAppend(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  vector<string_view> append_values;
  for (size_t i = 2; i < args.size(); ++i) {
    append_values.emplace_back(ArgS(args, i));
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrAppend(t->GetOpArgs(shard), key, json_path, append_values);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdArrTrim(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);
  int start_index;
  int stop_index;

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!absl::SimpleAtoi(ArgS(args, 2), &start_index)) {
    VLOG(1) << "Failed to parse array start index";
    builder->SendError(kInvalidIntErr);
    return;
  }

  if (!absl::SimpleAtoi(ArgS(args, 3), &stop_index)) {
    VLOG(1) << "Failed to parse array stop index";
    builder->SendError(kInvalidIntErr);
    return;
  }

  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrTrim(t->GetOpArgs(shard), key, json_path, start_index, stop_index);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdArrPop(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();
  int index = parser.NextOrDefault<int>(-1);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrPop(t->GetOpArgs(shard), key, json_path, index);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdClear(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpClear(t->GetOpArgs(shard), key, json_path);
  };

  OpResult<long> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdStrAppend(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);
  string_view value = ArgS(args, 2);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  // We try parsing the value into json string object first.
  optional<TmpJson> parsed_json = JsonFromString(value);
  if (!parsed_json || !parsed_json->is_string()) {
    return builder->SendError("expected string value", kSyntaxErrType);
  };

  string_view json_string = parsed_json->as_string_view();
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpStrAppend(t->GetOpArgs(shard), key, json_path, json_string);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdObjKeys(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpObjKeys(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdDel(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpDel(t->GetOpArgs(shard), key, path, json_path);
  };

  OpResult<long> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdNumIncrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);
  string_view num = ArgS(args, 2);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpDoubleArithmetic(t->GetOpArgs(shard), key, json_path, num, OP_ADD);
  };

  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::SendJsonString(result, cmd_cntx);
}

void CmdNumMultBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view path = ArgS(args, 1);
  string_view num = ArgS(args, 2);

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpDoubleArithmetic(t->GetOpArgs(shard), key, json_path, num, OP_MULTIPLY);
  };

  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::SendJsonString(result, cmd_cntx);
}

void CmdToggle(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  if (json_path.IsLegacyModePath()) {
    ExecuteToggle<bool>(key, json_path, cmd_cntx);
  } else {
    ExecuteToggle<long>(key, json_path, cmd_cntx);
  }
}

void CmdType(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpType(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdArrLen(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpArrLen(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdObjLen(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpObjLen(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdStrLen(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();
  string_view path = parser.NextOrDefault();

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  WrappedJsonPath json_path = GET_OR_SEND_UNEXPECTED(ParseJsonPath(path));

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpStrLen(t->GetOpArgs(shard), key, json_path);
  };

  auto result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  reply_generic::Send(result, cmd_cntx);
}

void CmdGet(CmdArgList args, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 1U);

  facade::CmdArgParser parser{args};
  string_view key = parser.Next();
  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  auto params = ParseJsonGetParams(&parser, builder);
  if (!params) {
    return;  // ParseJsonGetParams should have already sent an error
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpJsonGet(t->GetOpArgs(shard), key, params.value());
  };

  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  auto* rb = static_cast<RedisReplyBuilder*>(builder);

  if (result == OpStatus::KEY_NOTFOUND) {
    rb->SendNull();  // Match Redis
  } else {
    reply_generic::Send(result, cmd_cntx);
  }
}

}  // namespace

#define HFUNC(x) SetHandler(&Cmd##x)

// Redis modules do not have acl categories, therefore they can not be used by default.
// However, we do not implement those as modules and therefore we can define our own
// sensible defaults.
// For now I introduced only the JSON category which will be the default.
// TODO: Add sensible defaults/categories to json commands

void RegisterJsonFamily(CommandRegistry* registry) {
  constexpr size_t kMsetFlags = CO::JOURNALED | CO::DENYOOM | CO::FAST | CO::NO_AUTOJOURNAL;
  registry->StartFamily();
  *registry << CI{"JSON.GET", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(Get);
  *registry << CI{"JSON.MGET", CO::READONLY | CO::FAST, -3, 1, -2, acl::JSON}.HFUNC(MGet);
  *registry << CI{"JSON.TYPE", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(Type);
  *registry << CI{"JSON.STRLEN", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(StrLen);
  *registry << CI{"JSON.OBJLEN", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(ObjLen);
  *registry << CI{"JSON.ARRLEN", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(ArrLen);
  *registry << CI{"JSON.TOGGLE", CO::JOURNALED | CO::FAST, 3, 1, 1, acl::JSON}.HFUNC(Toggle);
  *registry << CI{"JSON.NUMINCRBY", CO::JOURNALED | CO::FAST, 4, 1, 1, acl::JSON}.HFUNC(NumIncrBy);
  *registry << CI{"JSON.NUMMULTBY", CO::JOURNALED | CO::FAST, 4, 1, 1, acl::JSON}.HFUNC(NumMultBy);
  *registry << CI{"JSON.DEL", CO::JOURNALED, -2, 1, 1, acl::JSON}.HFUNC(Del);
  *registry << CI{"JSON.FORGET", CO::JOURNALED, -2, 1, 1, acl::JSON}.HFUNC(
      Del);  // An alias of JSON.DEL.
  *registry << CI{"JSON.OBJKEYS", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(ObjKeys);
  *registry << CI{"JSON.STRAPPEND", CO::JOURNALED | CO::DENYOOM | CO::FAST, 4, 1, 1, acl::JSON}
                   .HFUNC(StrAppend);
  *registry << CI{"JSON.CLEAR", CO::JOURNALED | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(Clear);
  *registry << CI{"JSON.ARRPOP", CO::JOURNALED | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(ArrPop);
  *registry << CI{"JSON.ARRTRIM", CO::JOURNALED | CO::FAST, 5, 1, 1, acl::JSON}.HFUNC(ArrTrim);
  *registry << CI{"JSON.ARRINSERT", CO::JOURNALED | CO::DENYOOM | CO::FAST, -4, 1, 1, acl::JSON}
                   .HFUNC(ArrInsert);
  *registry << CI{"JSON.ARRAPPEND", CO::JOURNALED | CO::DENYOOM | CO::FAST, -4, 1, 1, acl::JSON}
                   .HFUNC(ArrAppend);
  *registry << CI{"JSON.ARRINDEX", CO::READONLY | CO::FAST, -4, 1, 1, acl::JSON}.HFUNC(ArrIndex);
  *registry
      << CI{"JSON.DEBUG", CO::READONLY | CO::FAST, -2, 0, 0, acl::JSON}.HFUNC(Debug)
      << CI{"JSON.RESP", CO::READONLY | CO::FAST, -2, 1, 1, acl::JSON}.HFUNC(Resp)
      << CI{"JSON.SET", CO::JOURNALED | CO::DENYOOM | CO::FAST, -4, 1, 1, acl::JSON}.HFUNC(Set)
      << CI{"JSON.MSET", kMsetFlags, -4, 1, -1, acl::JSON}.HFUNC(MSet)
      << CI{"JSON.MERGE", CO::JOURNALED | CO::DENYOOM | CO::FAST, 4, 1, 1, acl::JSON}.HFUNC(Merge);
}

}  // namespace dfly


================================================
FILE: src/server/json_family_memory_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;

ABSL_DECLARE_FLAG(bool, jsonpathv2);

namespace dfly {

class JsonFamilyMemoryTest : public BaseFamilyTest {
 public:
  static MiMemoryResource* GetMemoryResource() {
    thread_local mi_heap_t* heap = mi_heap_new();
    thread_local MiMemoryResource memory_resource{heap};
    return &memory_resource;
  }

 protected:
  void SetUp() override {
    BaseFamilyTest::SetUp();
    // Make the core running the thread use the same resource as the rest of the test. Although
    // BaseFamilyTest initializes the heap on shards serving transactions, the core running the test
    // needs this initialized explicitly.
    InitTLStatelessAllocMR(GetMemoryResource());
    detail::InternedString::ResetPool();
  }

  auto GetJsonMemoryUsageFromDb(std::string_view key) {
    return Run({"MEMORY", "USAGE", key, "WITHOUTKEY"});
  }
};

// Single-thread fixture so all keys land on the same shard and share the same
// thread-local InternedBlobPool. Required to reproduce interned string sharing bugs.
class JsonFamilyMemoryTestSingleThread : public JsonFamilyMemoryTest {
 public:
  JsonFamilyMemoryTestSingleThread() {
    num_threads_ = 1;
  }
};

size_t GetMemoryUsage() {
  return JsonFamilyMemoryTest::GetMemoryResource()->used();
}

size_t GetJsonMemoryUsageFromString(std::string_view json_str, bool include_root = true) {
  size_t start = GetMemoryUsage();
  auto json = ParseJsonUsingShardHeap(json_str);
  if (!json) {
    return 0;
  }

  // The same behaviour as in CompactObj
  void* ptr =
      JsonFamilyMemoryTest::GetMemoryResource()->allocate(sizeof(JsonType), alignof(JsonType));
  JsonType* json_on_heap = new (ptr) JsonType(std::move(json).value());
  DCHECK(json_on_heap);

  size_t result = GetMemoryUsage() - start;
  if (!include_root)
    result -= mi_usable_size(ptr);

  // Free the memory
  json_on_heap->~JsonType();
  JsonFamilyMemoryTest::GetMemoryResource()->deallocate(json_on_heap, sizeof(JsonType),
                                                        alignof(JsonType));
  return result;
}

TEST_F(JsonFamilyMemoryTest, SimpleSet) {
  std::string_view big_json = R"({"a":"some big string asdkasdkasdfkkasjdkfjka"})";
  size_t start_size = GetJsonMemoryUsageFromString(big_json);

  auto resp = Run({"JSON.SET", "j1", "$", big_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));

  std::string_view small_json = R"({"a":" "})";
  size_t next_size = GetJsonMemoryUsageFromString(small_json);

  resp = Run({"JSON.SET", "j1", "$", small_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(next_size));

  // Again set big json
  resp = Run({"JSON.SET", "j1", "$", big_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));
}

TEST_F(JsonFamilyMemoryTest, PartialSet) {
  std::string_view start_json = R"({"a":"some text", "b":" "})";
  size_t start_size = GetJsonMemoryUsageFromString(start_json);

  auto resp = Run({"JSON.SET", "j1", "$", start_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));

  std::string_view json_after_set = R"({"a":"some text", "b":"some another text"})";
  size_t size_after_set = GetJsonMemoryUsageFromString(json_after_set);

  resp = Run({"JSON.SET", "j1", "$.b", "\"some another text\""});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(size_after_set));

  // Again set start json
  resp = Run({"JSON.SET", "j1", "$.b", "\" \""});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));
}

/* Tests how works memory usage after deleting json object in jsoncons */
TEST_F(JsonFamilyMemoryTest, JsonConsDelTest) {
  std::string_view start_json = R"({"a":"some text", "b":" "})";

  size_t start = GetMemoryUsage();

  auto json = ParseJsonUsingShardHeap(start_json);
  void* ptr = GetMemoryResource()->allocate(sizeof(JsonType), alignof(JsonType));
  JsonType* json_on_heap = new (ptr) JsonType(std::move(json).value());

  size_t memory_usage_before_erase = GetMemoryUsage() - start;

  json_on_heap->erase("a");
  /* To deallocate memory we should use shrink_to_fit */
  json_on_heap->shrink_to_fit();

  size_t memory_usage_after_erase = GetMemoryUsage() - start;

  EXPECT_GT(memory_usage_before_erase, memory_usage_after_erase);
  // b is interned, parsing it again will just reuse the same object and not use extra memory. to
  // force a realistic comparison use a new character.
  EXPECT_EQ(memory_usage_after_erase, GetJsonMemoryUsageFromString(R"({"x":" "})"));
}

TEST_F(JsonFamilyMemoryTest, SimpleDel) {
  std::string_view start_json = R"({"a":"some text", "b":" "})";
  size_t start_size = GetJsonMemoryUsageFromString(start_json);

  auto resp = Run({"JSON.SET", "j1", "$", start_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));
  // Use non-interned key to get accurate usage
  std::string_view json_after_del = R"({"k":" "})";
  size_t size_after_del = GetJsonMemoryUsageFromString(json_after_del);

  // Test that raw memory usage is correct
  resp = Run({"JSON.SET", "j2", "$", json_after_del});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j2");
  EXPECT_THAT(resp, IntArg(size_after_del));

  // Test that after deletion memory usage is correct
  resp = Run({"JSON.DEL", "j1", "$.a"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"JSON.GET", "j1"});
  EXPECT_EQ(resp, R"({"b":" "})");
  resp = GetJsonMemoryUsageFromDb("j1");

  /* We still expect the initial size here, because after deletion we do not call shrink_to_fit on
     the JSON object. As a result, the memory will not be deallocated. Check
     JsonFamilyMemoryTest::JsonConsDelTest for example. */
  const size_t size_after_delete = [start_size] {
    const detail::InternedString dropped("a");
    return start_size - dropped.MemUsed();
  }();
  EXPECT_THAT(resp, IntArg(size_after_delete));

  // Again set start json
  resp = Run({"JSON.SET", "j1", "$.a", "\"some text\""});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));
}

TEST_F(JsonFamilyMemoryTest, JsonShrinking) {
  std::string_view start_json = R"({"a":"some text","b":"some another text","c":" "})";
  size_t start_size = GetJsonMemoryUsageFromString(start_json);

  auto resp = Run({"JSON.SET", "j1", "$", start_json});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j1");
  EXPECT_THAT(resp, IntArg(start_size));

  // Change key but keep length so that interned key "c" does not throw off calculation
  std::string_view json_after_del = R"({"z":" "})";
  size_t size_after_del = GetJsonMemoryUsageFromString(json_after_del);

  // Test that raw memory usage is correct
  resp = Run({"JSON.SET", "j2", "$", json_after_del});
  EXPECT_EQ(resp, "OK");
  resp = GetJsonMemoryUsageFromDb("j2");
  EXPECT_THAT(resp, IntArg(size_after_del));

  // Test that after deletion memory usage decreases
  resp = Run({"JSON.DEL", "j1", "$.a"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"JSON.DEL", "j1", "$.b"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"JSON.GET", "j1"});
  EXPECT_EQ(resp, R"({"c":" "})");
  resp = GetJsonMemoryUsageFromDb("j1");
  // Now we expect the size to be smaller, because shrink_to_fit was called
  EXPECT_THAT(resp, IntArg(size_after_del));

  // Again set start json
  resp = Run({"JSON.SET", "j1", "$.a", "\"some text\""});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.SET", "j1", "$.b", "\"some another text\""});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j1"});
  EXPECT_EQ(resp, start_json);
  resp = GetJsonMemoryUsageFromDb("j1");

  // Jsoncons will allocate more memory for the new json that needed.
  // This is totally fine, because we will not call shrink_to_fit.
  // Different compilers may allocate different amounts, so check reasonable range
  auto final_size = get<int64_t>(resp.u);
  EXPECT_GT(final_size, start_size);      // Should be larger than initial
  EXPECT_LT(final_size, start_size * 2);  // But not unreasonably large
}

TEST_F(JsonFamilyMemoryTest, ShortKeyAccounting) {
  const std::string value(128, 'v');
  std::string json = "{";
  for (int i = 0; i < 512; ++i) {
    if (i)
      json += ",";
    json += absl::StrFormat(R"("k%d":"%s")", i, value);
  }
  json += "}";

  auto resp = Run({"JSON.SET", "j1", "$", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.DEBUG", "MEMORY", "j1"});

  const auto actual = get<int64_t>(resp.u);
  const auto expected = static_cast<int64_t>(GetJsonMemoryUsageFromString(json, false));

  EXPECT_LE(std::llabs(actual - expected), 64);
}

TEST_F(JsonFamilyMemoryTest, MergeMemoryTrackingCrash) {
  Run("JSON.SET key $ {\"x\":1}");

  auto resp = Run("JSON.MERGE key $ {\"y\":2}");
  ASSERT_THAT(resp, "OK");

  resp = Run("JSON.MERGE key $ null");
  ASSERT_THAT(resp, "OK");

  resp = Run("JSON.GET key");
  ASSERT_THAT(resp, "null");
}

TEST_F(JsonFamilyMemoryTestSingleThread, InternedStringSharedBlobAccounting) {
  // ref count for x = 1
  ASSERT_THAT(Run("json.set foo $ {\"x\":3}"), "OK");
  // ref count for x = 2
  ASSERT_THAT(Run("json.set bar $ {\"x\":5}"), "OK");
  Run("del foo");
  Run("json.merge bar $ null");
}

}  // namespace dfly


================================================
FILE: src/server/json_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include <absl/flags/flag.h>
#include <absl/strings/str_replace.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/error.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;

ABSL_DECLARE_FLAG(bool, jsonpathv2);

namespace dfly {

class JsonFamilyTest : public BaseFamilyTest {
 protected:
};

MATCHER_P(ElementsAreArraysMatcher, matchers, "") {
  const auto& vec = arg.GetVec();
  const size_t expected_size = std::tuple_size<decltype(matchers)>::value;

  if (vec.size() != expected_size) {
    *result_listener << "size mismatch: expected " << expected_size << " but got " << vec.size();
    return false;
  }

  bool result = true;
  size_t index = 0;

  auto check_matcher = [&](const auto& matcher) {
    if (!ExplainMatchResult(matcher, vec[index].GetVec(), result_listener)) {
      *result_listener << " at index " << index;
      result = false;
    }
    index++;
  };

  std::apply([&check_matcher](const auto&... matchers) { (check_matcher(matchers), ...); },
             matchers);

  return result;
}

template <typename... Matchers> auto ElementsAreArrays(Matchers&&... matchers) {
  return ElementsAreArraysMatcher(std::make_tuple(std::forward<Matchers>(matchers)...));
}

TEST_F(JsonFamilyTest, SetGetBasic) {
  string json = R"(
    {
       "store": {
        "book": [
         {
           "category": "Fantasy",
           "author": "J. K. Rowling",
           "title": "Harry Potter and the Philosopher's Stone",
           "isbn": 9780747532743,
           "price": 5.99
         }
       ]
      }
    }
)";

  string xml = R"(
    <?xml version="1.0" encoding="UTF-8" ?>
    <store>
      <book>
        <category>Fantasy</category>
        <author>J. K. Rowling</author>
        <title>Harry Potter and the Philosopher&#x27;s Stone</title>
        <isbn>9780747532743</isbn>
        <price>5.99</price>
      </book>
    </store>
)";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json", "$..*"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));

  resp = Run({"JSON.GET", "json", "$..book[0].price"});
  EXPECT_THAT(resp, ArgType(RespExpr::STRING));

  resp = Run({"JSON.GET", "json", "//*"});
  EXPECT_THAT(resp, ArgType(RespExpr::ERROR));

  resp = Run({"JSON.GET", "json", "//book[0]"});
  EXPECT_THAT(resp, ArgType(RespExpr::ERROR));

  resp = Run({"JSON.GET", "json", "store.book[0].category"});
  EXPECT_EQ(resp, "\"Fantasy\"");

  resp = Run({"JSON.GET", "json", ".store.book[0].category"});
  EXPECT_EQ(resp, "\"Fantasy\"");

  resp = Run({"SET", "xml", xml});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "xml", "$..*"});
  EXPECT_THAT(resp, ArgType(RespExpr::ERROR));
}

TEST_F(JsonFamilyTest, GetLegacy) {
  string json = R"({"name":"Leonard Cohen","lastSeen":1478476800,"loggedOut": true})";

  auto resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json"});  // V1 Response
  ASSERT_THAT(resp, "{\"lastSeen\":1478476800,\"loggedOut\":true,\"name\":\"Leonard Cohen\"}");

  resp = Run({"JSON.GET", "json", "."});  // V1 Response
  ASSERT_THAT(resp, "{\"lastSeen\":1478476800,\"loggedOut\":true,\"name\":\"Leonard Cohen\"}");

  resp = Run({"JSON.GET", "json", "$"});  // V2 Response
  ASSERT_THAT(resp, "[{\"lastSeen\":1478476800,\"loggedOut\":true,\"name\":\"Leonard Cohen\"}]");

  resp = Run({"JSON.GET", "json", ".name"});  // V1 Response
  ASSERT_THAT(resp, "\"Leonard Cohen\"");

  resp = Run({"JSON.GET", "json", "$.name"});  // V2 Response
  ASSERT_THAT(resp, "[\"Leonard Cohen\"]");

  resp = Run({"JSON.GET", "json", ".name", "$.lastSeen"});  // V2 Response
  ASSERT_THAT(resp, "{\"$.lastSeen\":[1478476800],\".name\":[\"Leonard Cohen\"]}");

  resp = Run({"JSON.GET", "json", ".name", ".lastSeen"});  // V1 Response
  ASSERT_THAT(resp, "{\".lastSeen\":1478476800,\".name\":\"Leonard Cohen\"}");

  resp = Run({"JSON.GET", "json", "$.name", "$.lastSeen"});  // V2 Response
  ASSERT_THAT(resp, "{\"$.lastSeen\":[1478476800],\"$.name\":[\"Leonard Cohen\"]}");

  json = R"(
    {"a":"first","b":{"field":"second"},"c":{"field":"third"}}
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json", "bar"});  // V1 Response
  ASSERT_THAT(resp, ErrArg("ERR invalid JSON path"));

  resp = Run({"JSON.GET", "json", ".", "bar"});  // V1 Response
  ASSERT_THAT(resp, ErrArg("ERR invalid JSON path"));

  resp = Run({"JSON.GET", "json", ".a", "bar", "foo", "third", "."});  // V1 Response
  ASSERT_THAT(resp, ErrArg("ERR invalid JSON path"));

  resp = Run({"JSON.GET", "json", "$.bar"});  // V2 Response
  ASSERT_THAT(resp, "[]");

  resp = Run({"JSON.GET", "json", "bar", "$.a"});  // V2 Response
  ASSERT_THAT(resp, R"({"$.a":["first"],"bar":[]})");

  resp = Run({"JSON.GET", "json", "$.bar"});  // V2 Response
  ASSERT_THAT(resp, "[]");
}

static const string PhonebookJson = R"(
    {
      "firstName":"John",
      "lastName":"Smith",
      "age":27,
      "weight":135.25,
      "isAlive":true,
      "address":{
          "street":"21 2nd Street",
          "city":"New York",
          "state":"NY",
          "zipcode":"10021-3100"
      },
      "phoneNumbers":[
          {
            "type":"home",
            "number":"212 555-1234"
          },
          {
            "type":"office",
            "number":"646 555-4567"
          }
      ],
      "children":[

      ],
      "spouse":null
    }
  )";

TEST_F(JsonFamilyTest, SetGetFromPhonebook) {
  auto resp = Run({"JSON.SET", "json", ".", PhonebookJson});
  ASSERT_THAT(resp, "OK");

  auto compact_json = jsoncons::json::parse(PhonebookJson).as_string();

  resp = Run({"JSON.GET", "json", "."});
  EXPECT_EQ(resp, compact_json);

  resp = Run({"JSON.GET", "json", "$"});
  EXPECT_EQ(resp, "[" + compact_json + "]");

  resp = Run({"JSON.GET", "json", "$.address.*"});
  EXPECT_EQ(resp, R"(["New York","NY","21 2nd Street","10021-3100"])");

  resp = Run({"JSON.GET", "json", "$.firstName", "$.age", "$.lastName"});
  EXPECT_EQ(resp, R"({"$.age":[27],"$.firstName":["John"],"$.lastName":["Smith"]})");

  resp = Run({"JSON.GET", "json", "$.spouse.*"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.GET", "json", "$.children.*"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.GET", "json", "$..phoneNumbers[1].*"});
  EXPECT_EQ(resp, R"(["646 555-4567","office"])");

  resp = Run({"JSON.GET", "json", "$.address.*", "INDENT", "indent", "NEWLINE", "newline"});
  EXPECT_EQ(
      resp,
      R"([newlineindent"New York",newlineindent"NY",newlineindent"21 2nd Street",newlineindent"10021-3100"newline])");

  resp = Run({"JSON.GET", "json", "$.address", "SPACE", "space"});
  EXPECT_EQ(
      resp,
      R"([{"city":space"New York","state":space"NY","street":space"21 2nd Street","zipcode":space"10021-3100"}])");

  resp = Run({"JSON.GET", "json", "$.firstName", "$.age", "$.lastName", "INDENT", "indent",
              "NEWLINE", "newline", "SPACE", "space"});
  EXPECT_EQ(
      resp,
      R"({newlineindent"$.age":space[newlineindentindent27newlineindent],newlineindent"$.firstName":space[newlineindentindent"John"newlineindent],newlineindent"$.lastName":space[newlineindentindent"Smith"newlineindent]newline})");

  resp =
      Run({"JSON.GET", "json", "$..phoneNumbers.*", "INDENT", "t", "NEWLINE", "s", "SPACE", "s"});
  EXPECT_EQ(
      resp,
      R"([st{stt"number":s"212 555-1234",stt"type":s"home"st},st{stt"number":s"646 555-4567",stt"type":s"office"st}s])");
}

TEST_F(JsonFamilyTest, GetBrackets) {
  string json = R"(
    {"a":"first", "b":{"a":"second"}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json", "$[\"a\"]"});
  ASSERT_THAT(resp, "[\"first\"]");

  resp = Run({"JSON.GET", "json", "$..[\"a\"]"});
  ASSERT_THAT(resp, R"(["first","second"])");

  resp = Run({"JSON.GET", "json", "$.b[\"a\"]"});
  ASSERT_THAT(resp, "[\"second\"]");

  resp = Run({"JSON.GET", "json", "[\"a\"]"});
  ASSERT_THAT(resp, "\"first\"");

  resp = Run({"JSON.GET", "json", "..[\"a\"]"});
  ASSERT_THAT(resp, "\"second\"");

  json = R"(
    ["first", ["second"]]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json", "$[0]"});
  ASSERT_THAT(resp, "[\"first\"]");

  resp = Run({"JSON.GET", "json", "$..[0]"});
  ASSERT_THAT(resp, R"(["first","second"])");

  resp = Run({"JSON.GET", "json", "[0]"});
  ASSERT_THAT(resp, "\"first\"");

  resp = Run({"JSON.GET", "json", "..[0]"});
  ASSERT_THAT(resp, "\"second\"");

  resp = Run({"JSON.GET", "json", "$[\"first\"]"});
  ASSERT_THAT(resp, "[]");

  json = R"(
    {"a":{"b":{"c":"first"}}, "b":{"b":{"c":"second"}}, "c":{"b":{"c":"third"}}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json", R"($["a"]['b']["c"])"});
  ASSERT_THAT(resp, "[\"first\"]");

  resp = Run({"JSON.GET", "json", R"($["a"].b['c'])"});
  ASSERT_THAT(resp, "[\"first\"]");

  resp = Run({"JSON.GET", "json", R"($..['b']["c"])"});
  ASSERT_THAT(resp, R"(["first","second","third"])");

  resp = Run({"JSON.GET", "json", R"($.c['b']["c"])"});
  ASSERT_THAT(resp, "[\"third\"]");
}

TEST_F(JsonFamilyTest, GetWithNoEscape) {
  string json = R"({"key": "value with special characters: \n \t \" \""})";
  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  // Test without NOESCAPE option
  resp = Run({"JSON.GET", "json", "."});
  EXPECT_EQ(resp, "{\"key\":\"value with special characters: \\n \\t \\\" \\\"\"}");

  // Test with NOESCAPE option
  resp = Run({"JSON.GET", "json", ".", "NOESCAPE"});
  EXPECT_EQ(resp, "{\"key\":\"value with special characters: \\n \\t \\\" \\\"\"}");  // No changes
}

TEST_F(JsonFamilyTest, Type) {
  string json = R"(
    [1, 2.3, "foo", true, null, {}, []]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TYPE", "json", "$[*]"});
  ASSERT_THAT(resp, RespArray(ElementsAre("integer", "number", "string", "boolean", "null",
                                          "object", "array")));

  resp = Run({"JSON.TYPE", "json", "$[10]"});
  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"JSON.TYPE", "not_exist_key", "$[10]"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, TypeLegacy) {
  string json = R"(
    {
      "firstName":"John",
      "lastName":"Smith",
      "age":27,
      "weight":135.25,
      "isAlive":true,
      "address":{"street":"21 2nd Street","city":"New York","state":"NY","zipcode":"10021-3100"},
      "phoneNumbers":[{"type":"home","number":"212 555-1234"},{"type":"office","number":"646 555-4567"}],
      "children":[],
      "spouse":null
    }
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TYPE", "json"});
  EXPECT_EQ(resp, "object");

  resp = Run({"JSON.TYPE", "json", ".children"});
  EXPECT_EQ(resp, "array");

  resp = Run({"JSON.TYPE", "json", ".firstName"});
  EXPECT_EQ(resp, "string");

  resp = Run({"JSON.TYPE", "json", ".age"});
  EXPECT_EQ(resp, "integer");

  resp = Run({"JSON.TYPE", "json", ".weight"});
  EXPECT_EQ(resp, "number");

  resp = Run({"JSON.TYPE", "json", ".isAlive"});
  EXPECT_EQ(resp, "boolean");

  resp = Run({"JSON.TYPE", "json", ".spouse"});
  EXPECT_EQ(resp, "null");

  resp = Run({"JSON.TYPE", "not_exist_key", ".some_field"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, StrLen) {
  string json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */

  resp = Run({"JSON.STRLEN", "json", "$.a.a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.STRLEN", "json", "$.a"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.STRLEN", "json", "$.a.*"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.STRLEN", "json", "$.c.b"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.STRLEN", "non_existent_key", "$.c.b"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  resp = Run({"JSON.STRLEN", "non_existent_key", "$"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  /*
  Test response from several possible values
  In JSON V2, the response is an array of all possible values
  */

  resp = Run({"JSON.STRLEN", "json", "$.c.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(2)));

  resp = Run({"JSON.STRLEN", "json", "$.d.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(ArgType(RespExpr::NIL), IntArg(1), ArgType(RespExpr::NIL)));
}

TEST_F(JsonFamilyTest, StrLenLegacy) {
  string json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */

  resp = Run({"JSON.STRLEN", "json"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.STRLEN", "json", ".a.a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.STRLEN", "json", ".a"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.STRLEN", "json", ".a.*"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.STRLEN", "json", ".c.b"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.STRLEN", "non_existent_key", ".c.b"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  /*
  Test response from several possible values
  In JSON legacy mode, the response contains only one value - the first string's length.
  */

  resp = Run({"JSON.STRLEN", "json", ".c.*"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.STRLEN", "json", ".d.*"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(JsonFamilyTest, ObjLen) {
  string json = R"(
    {"a":{}, "b":{"a":"a"}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":{"a":3,"b":4}}, "e":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */

  resp = Run({"JSON.OBJLEN", "json", "$.a"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.OBJLEN", "json", "$.a.*"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  resp = Run({"JSON.OBJLEN", "json", "$.b"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.OBJLEN", "json", "$.b.*"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.OBJLEN", "json", "$.c"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.OBJLEN", "json", "$.d"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.OBJLEN", "non_existent_key", "$.a"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  /*
  Test response from several possible values
  In JSON V2, the response is an array of all possible values
  */

  resp = Run({"JSON.OBJLEN", "json", "$.c.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.OBJLEN", "json", "$.d.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL), IntArg(2)));

  resp = Run({"JSON.OBJLEN", "json", "$.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(1), IntArg(2), IntArg(3), ArgType(RespExpr::NIL)));
}

TEST_F(JsonFamilyTest, ObjLenLegacy) {
  string json = R"(
    {"a":{}, "b":{"a":"a"}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":{"a":3,"b":4}}, "e":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */
  resp = Run({"JSON.STRLEN", "json"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.OBJLEN", "json", ".a"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.OBJLEN", "json", ".a.*"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.OBJLEN", "json", ".b"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.OBJLEN", "json", ".b.*"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.OBJLEN", "json", ".c"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.OBJLEN", "json", ".d"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.OBJLEN", "non_existent_key", ".a"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.OBJLEN", "json", ".none"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  /*
  Test response from several possible values
  In JSON legacy mode, the response contains only one value - the first object's length.
  */

  resp = Run({"JSON.OBJLEN", "json", ".c.*"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.OBJLEN", "json", ".d.*"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.OBJLEN", "json", ".*"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(JsonFamilyTest, ArrLen) {
  string json = R"(
    [[], ["a"], ["a", "b"], ["a", "b", "c"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRLEN", "json", "$[*]"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(1), IntArg(2), IntArg(3)));

  json = R"(
    [[], "a", ["a", "b"], ["a", "b", "c"], 4]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRLEN", "json", "$[*]"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(0), ArgType(RespExpr::NIL), IntArg(2), IntArg(3),
                                         ArgType(RespExpr::NIL)));

  resp = Run({"JSON.OBJLEN", "non_existent_key", "$[*]"});
  EXPECT_THAT(resp, ErrArg("no such key"));
}

TEST_F(JsonFamilyTest, ArrLenLegacy) {
  string json = R"(
    [[], ["a"], ["a", "b"], ["a", "b", "c"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRLEN", "json"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.ARRLEN", "json", "[*]"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRLEN", "json", "[3]"});
  EXPECT_THAT(resp, IntArg(3));

  json = R"(
    [[], "a", ["a", "b"], ["a", "b", "c"], 4]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRLEN", "json", "[*]"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRLEN", "json", "[1]"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.ARRLEN", "json", "[2]"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.OBJLEN", "non_existent_key", "[*]"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, Toggle) {
  string json = R"(
    {"a":true, "b":false, "c":1, "d":null, "e":"foo", "f":[], "g":{}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TOGGLE", "json", "$.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(1), ArgType(RespExpr::NIL), ArgType(RespExpr::NIL),
                          ArgType(RespExpr::NIL), ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json", "$.*"});
  EXPECT_EQ(resp, R"([false,true,1,null,"foo",[],{}])");

  resp = Run({"JSON.TOGGLE", "json", "$.*"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(IntArg(1), IntArg(0), ArgType(RespExpr::NIL), ArgType(RespExpr::NIL),
                          ArgType(RespExpr::NIL), ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json", "$.*"});
  EXPECT_EQ(resp, R"([true,false,1,null,"foo",[],{}])");
}

TEST_F(JsonFamilyTest, ToggleLegacy) {
  string json = R"(
    {"a":true, "b":false, "c":1, "d":null, "e":"foo", "f":[], "g":{}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TOGGLE", "json"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"JSON.TOGGLE", "json", ".*"});
  EXPECT_EQ(resp, "true");

  resp = Run({"JSON.TOGGLE", "json", ".*"});
  EXPECT_EQ(resp, "false");

  resp = Run({"JSON.GET", "json", "$.*"});
  EXPECT_EQ(R"([true,false,1,null,"foo",[],{}])", resp);

  resp = Run({"JSON.SET", "json", ".", "true"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TOGGLE", "json", "."});
  EXPECT_EQ(resp, "false");

  resp = Run({"JSON.TOGGLE", "json", "."});
  EXPECT_EQ(resp, "true");

  json = R"(
    {"isAvailable": false}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TOGGLE", "json", ".isAvailable"});
  EXPECT_EQ(resp, "true");

  resp = Run({"JSON.TOGGLE", "json", ".isAvailable"});
  EXPECT_EQ(resp, "false");
}

TEST_F(JsonFamilyTest, NumIncrBy) {
  string json = R"(
    {"e":1.5,"a":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a", "1.1"});
  EXPECT_EQ(resp, "[2.1]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.e", "1"});
  EXPECT_EQ(resp, "[2.5]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.e", "inf"});
  EXPECT_THAT(resp, ErrArg("ERR result is not a number"));

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.e", "1.7e308"});
  EXPECT_EQ(resp, "[1.7e+308]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.e", "1.7e308"});
  EXPECT_THAT(resp, ErrArg("ERR result is not a number"));

  resp = Run({"JSON.GET", "json", "$.*"});
  EXPECT_EQ(resp, R"([1,1.7e+308])");

  json = R"(
    {"a":[], "b":[1], "c":[1,2], "d":[1,2,3]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.d[*]", "10"});
  EXPECT_EQ(resp, "[11,12,13]");

  resp = Run({"JSON.GET", "json", "$.d[*]"});
  EXPECT_EQ(resp, "[11,12,13]");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a[*]", "1"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.b[*]", "1"});
  EXPECT_EQ(resp, "[2]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.c[*]", "1"});
  EXPECT_EQ(resp, "[2,3]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.d[*]", "1"});
  EXPECT_EQ(resp, "[2,3,4]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.d[2]", "1"});
  EXPECT_EQ(resp, "[5]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"b":[2],"c":[2,3],"d":[2,3,5]})");

  json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a.*", "1"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.b.*", "1"});
  EXPECT_EQ(resp, "[2]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.c.*", "1"});
  EXPECT_EQ(resp, "[2,3]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.d.*", "1"});
  EXPECT_EQ(resp, "[2,3,4]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":2},"c":{"a":2,"b":3},"d":{"a":2,"b":3,"c":4}})");

  json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"b"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a.*", "1"});
  EXPECT_EQ(resp, "[null]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.b.*", "1"});
  EXPECT_EQ(resp, "[null,2]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.c.*", "1"});
  EXPECT_EQ(resp, "[null,null]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.d.*", "1"});
  EXPECT_EQ(resp, "[2,null,4]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"a"},"b":{"a":"a","b":2},"c":{"a":"a","b":"b"},"d":{"a":2,"b":"b","c":4}})");
}

TEST_F(JsonFamilyTest, NumIncrByLegacy) {
  string json = R"(
    {"e":1.5,"a":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a", "1.1"});
  EXPECT_EQ(resp, "2.1");

  resp = Run({"JSON.NUMINCRBY", "json", ".e", "1"});
  EXPECT_EQ(resp, "2.5");

  resp = Run({"JSON.NUMINCRBY", "json", ".e", "inf"});
  EXPECT_THAT(resp, ErrArg("ERR result is not a number"));

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".e", "1.7e308"});
  EXPECT_EQ(resp, "1.7e+308");

  resp = Run({"JSON.NUMINCRBY", "json", ".e", "1.7e308"});
  EXPECT_THAT(resp, ErrArg("ERR result is not a number"));

  resp = Run({"JSON.GET", "json", "$.*"});
  EXPECT_EQ(resp, R"([1,1.7e+308])");

  json = R"(
    {"a":[], "b":[1], "c":[1,2], "d":[1,2,3]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".d[*]", "10"});
  EXPECT_EQ(resp, "13");

  resp = Run({"JSON.GET", "json", "$.d[*]"});
  EXPECT_EQ(resp, "[11,12,13]");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a[*]", "1"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMINCRBY", "json", ".b[*]", "1"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMINCRBY", "json", ".c[*]", "1"});
  EXPECT_EQ(resp, "3");

  resp = Run({"JSON.NUMINCRBY", "json", ".d[*]", "1"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.NUMINCRBY", "json", ".d[2]", "1"});
  EXPECT_EQ(resp, "5");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"b":[2],"c":[2,3],"d":[2,3,5]})");

  json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a.*", "1"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMINCRBY", "json", ".b.*", "1"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMINCRBY", "json", ".c.*", "1"});
  EXPECT_EQ(resp, "3");

  resp = Run({"JSON.NUMINCRBY", "json", ".d.*", "1"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":2},"c":{"a":2,"b":3},"d":{"a":2,"b":3,"c":4}})");

  json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"b"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a.*", "1"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMINCRBY", "json", ".b.*", "1"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMINCRBY", "json", ".c.*", "1"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMINCRBY", "json", ".d.*", "1"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"a"},"b":{"a":"a","b":2},"c":{"a":"a","b":"b"},"d":{"a":2,"b":"b","c":4}})");
}

TEST_F(JsonFamilyTest, NumMultBy) {
  string json = R"(
    {"a":[], "b":[1], "c":[1,2], "d":[1,2,3]}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", "$.d[*]", "2"});
  EXPECT_EQ(resp, "[2,4,6]");

  resp = Run({"JSON.GET", "json", "$.d[*]"});
  EXPECT_EQ(resp, R"([2,4,6])");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a[*]", "2"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.b[*]", "2"});
  EXPECT_EQ(resp, "[2]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.c[*]", "2"});
  EXPECT_EQ(resp, "[2,4]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.d[*]", "2"});
  EXPECT_EQ(resp, "[2,4,6]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"b":[2],"c":[2,4],"d":[2,4,6]})");

  json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a.*", "2"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.b.*", "2"});
  EXPECT_EQ(resp, "[2]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.c.*", "2"});
  EXPECT_EQ(resp, "[2,4]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.d.*", "2"});
  EXPECT_EQ(resp, "[2,4,6]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":2},"c":{"a":2,"b":4},"d":{"a":2,"b":4,"c":6}})");

  json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"b"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a.*", "2"});
  EXPECT_EQ(resp, "[null]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.b.*", "2"});
  EXPECT_EQ(resp, "[null,2]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.c.*", "2"});
  EXPECT_EQ(resp, "[null,null]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.d.*", "2"});
  EXPECT_EQ(resp, "[2,null,6]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"a"},"b":{"a":"a","b":2},"c":{"a":"a","b":"b"},"d":{"a":2,"b":"b","c":6}})");
}

TEST_F(JsonFamilyTest, NumMultByLegacy) {
  string json = R"(
    {"a":[], "b":[1], "c":[1,2], "d":[1,2,3]}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", ".d[*]", "2"});
  EXPECT_EQ(resp, "6");

  resp = Run({"JSON.GET", "json", "$.d[*]"});
  EXPECT_EQ(resp, R"([2,4,6])");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", ".a[*]", "2"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMMULTBY", "json", ".b[*]", "2"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMMULTBY", "json", ".c[*]", "2"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.NUMMULTBY", "json", ".d[*]", "2"});
  EXPECT_EQ(resp, "6");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"b":[2],"c":[2,4],"d":[2,4,6]})");

  json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", ".a.*", "2"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMMULTBY", "json", ".b.*", "2"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMMULTBY", "json", ".c.*", "2"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.NUMMULTBY", "json", ".d.*", "2"});
  EXPECT_EQ(resp, "6");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":2},"c":{"a":2,"b":4},"d":{"a":2,"b":4,"c":6}})");

  json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"b"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", ".a.*", "2"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMMULTBY", "json", ".b.*", "2"});
  EXPECT_EQ(resp, "2");

  resp = Run({"JSON.NUMMULTBY", "json", ".c.*", "2"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));

  resp = Run({"JSON.NUMMULTBY", "json", ".d.*", "2"});
  EXPECT_EQ(resp, "6");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"a"},"b":{"a":"a","b":2},"c":{"a":"a","b":"b"},"d":{"a":2,"b":"b","c":6}})");
}

TEST_F(JsonFamilyTest, NumericOperationsWithConversions) {
  auto resp = Run({"JSON.SET", "json", ".", R"({"a":2.0})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a", "1"});
  EXPECT_EQ(resp, "[3.0]");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a", "1.0"});
  EXPECT_EQ(resp, "[4.0]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a", "2"});
  EXPECT_EQ(resp, "[8.0]");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a", "2.0"});
  EXPECT_EQ(resp, "[16.0]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":16.0})");

  resp = Run({"JSON.SET", "json", ".", R"({"a":2})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", "$.a", "1"});
  EXPECT_EQ(resp, "[3]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":3})");  // Is still integer

  resp = Run({"JSON.NUMINCRBY", "json", "$.a", "1.0"});
  EXPECT_EQ(resp, "[4.0]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":4.0})");  // Is converted to double

  resp = Run({"JSON.SET", "json", ".", R"({"a":2})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", "$.a", "2"});
  EXPECT_EQ(resp, "[4]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":4})");  // Is still integer

  resp = Run({"JSON.NUMMULTBY", "json", "$.a", "2.0"});
  EXPECT_EQ(resp, "[8.0]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":8.0})");  // Is converted to double
}

TEST_F(JsonFamilyTest, NumericOperationsWithConversionsLegacy) {
  auto resp = Run({"JSON.SET", "json", ".", R"({"a":2.0})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a", "1"});
  EXPECT_EQ(resp, "3.0");

  resp = Run({"JSON.NUMINCRBY", "json", ".a", "1.0"});
  EXPECT_EQ(resp, "4.0");

  resp = Run({"JSON.NUMMULTBY", "json", ".a", "2"});
  EXPECT_EQ(resp, "8.0");

  resp = Run({"JSON.NUMMULTBY", "json", ".a", "2.0"});
  EXPECT_EQ(resp, "16.0");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":16.0})");

  resp = Run({"JSON.SET", "json", ".", R"({"a":2})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "json", ".a", "1"});
  EXPECT_EQ(resp, "3");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":3})");  // Is still integer

  resp = Run({"JSON.NUMINCRBY", "json", ".a", "1.0"});
  EXPECT_EQ(resp, "4.0");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":4.0})");  // Is converted to double

  resp = Run({"JSON.SET", "json", ".", R"({"a":2})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMMULTBY", "json", ".a", "2"});
  EXPECT_EQ(resp, "4");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":4})");  // Is still integer

  resp = Run({"JSON.NUMMULTBY", "json", ".a", "2.0"});
  EXPECT_EQ(resp, "8.0");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":8.0})");  // Is converted to double
}

TEST_F(JsonFamilyTest, NumericOperationsResp2Resp3) {
  // Test RESP2 behavior
  Run({"HELLO", "2"});

  auto resp = Run({"JSON.SET", "a", "$", "1"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "a", "$", "1"});
  EXPECT_EQ(resp, "[2]");  // Currently returns string "[2]"

  resp = Run({"JSON.TYPE", "a", "$"});
  EXPECT_EQ(resp, "integer");

  resp = Run({"JSON.TYPE", "a", "."});
  EXPECT_EQ(resp, "integer");

  resp = Run({"JSON.NUMMULTBY", "a", "$", "2"});
  EXPECT_EQ(resp, "[4]");  // Currently returns string "[4]"

  // Test RESP3 behavior
  Run({"HELLO", "3"});
  Run({"FLUSHALL"});

  resp = Run({"JSON.SET", "a", "$", "1"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.NUMINCRBY", "a", "$", "1"});
  // In RESP3, this should return a proper array with integer: 1) (integer) 2
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.TYPE", "a", "$"});
  EXPECT_THAT(resp, RespArray(ElementsAre("integer")));

  resp = Run({"JSON.TYPE", "a", "."});
  EXPECT_EQ(resp, "integer");

  resp = Run({"JSON.NUMMULTBY", "a", "$", "2"});
  // In RESP3, this should return a proper array with integer: 1) (integer) 4
  EXPECT_THAT(resp, IntArg(4));
}

TEST_F(JsonFamilyTest, Del) {
  string json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}, "e": [1,2,3,4,5]}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", "$.d.*"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":1},"c":{"a":1,"b":2},"d":{},"e":[1,2,3,4,5]})");

  resp = Run({"JSON.DEL", "json", "$.e[*]"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":1},"c":{"a":1,"b":2},"d":{},"e":[]})");

  resp = Run({"JSON.DEL", "json", "$..*"});

  // TODO: legacy jsoncons implementation returns, 8 but in practive it should return 5.
  // redis-stack returns 5 as well.
  // Once we drop jsoncons path, we can enforce here equality.
  EXPECT_GE(resp.GetInt(), 5);

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");

  resp = Run({"JSON.DEL", "json"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"GET", "json"});  // This is legal since the key was removed
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  json = R"(
    {"a":[{"b": [1,2,3]}], "b": [{"c": 2}], "c']":[1,2,3]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", "$.a[0].b[0]"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"GET", "json"});  // not a legal type
  EXPECT_THAT(resp, ErrArg("Operation against a key holding the wrong kind of value"));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[{"b":[2,3]}],"b":[{"c":2}],"c']":[1,2,3]})");

  resp = Run({"JSON.DEL", "json", "$.b[0].c"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[{"b":[2,3]}],"b":[{}],"c']":[1,2,3]})");

  resp = Run({"JSON.DEL", "json", "$.*"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");

  resp = Run({"JSON.SET", "json", "$", R"({"a": 1})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", "$"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  if (absl::GetFlag(FLAGS_jsonpathv2)) {
    // Test recursive delete with $..a path
    resp = Run({"JSON.SET", "doc2", "$",
                R"({"a": {"a": 2, "b": 3}, "b": ["a", "b"], "nested": {"b": [true, "a", "b"]}})"});
    ASSERT_THAT(resp, "OK");

    resp = Run({"JSON.GET", "doc2"});
    EXPECT_EQ(resp, R"({"a":{"a":2,"b":3},"b":["a","b"],"nested":{"b":[true,"a","b"]}})");

    // JSON.DEL with $..a should find and delete the key "a" at root level
    // but not string values "a" inside arrays
    resp = Run({"JSON.DEL", "doc2", "$..a"});
    EXPECT_THAT(resp, IntArg(1));

    resp = Run({"JSON.GET", "doc2"});
    EXPECT_EQ(resp, R"({"b":["a","b"],"nested":{"b":[true,"a","b"]}})");
  }
}

TEST_F(JsonFamilyTest, DelLegacy) {
  string json = R"(
    {"a":{}, "b":{"a":1}, "c":{"a":1, "b":2}, "d":{"a":1, "b":2, "c":3}, "e": [1,2,3,4,5]}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", ".d.*"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":1},"c":{"a":1,"b":2},"d":{},"e":[1,2,3,4,5]})");

  resp = Run({"JSON.DEL", "json", ".e[*]"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{},"b":{"a":1},"c":{"a":1,"b":2},"d":{},"e":[]})");

  resp = Run({"JSON.DEL", "json", "..*"});
  EXPECT_GE(resp.GetInt(), 5);

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");

  resp = Run({"JSON.DEL", "json"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"GET", "json"});  // This is legal since the key was removed
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  json = R"(
    {"a":[{"b": [1,2,3]}], "b": [{"c": 2}], "c']":[1,2,3]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", ".a[0].b[0]"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"GET", "json"});  // not a legal type
  EXPECT_THAT(resp, ErrArg("Operation against a key holding the wrong kind of value"));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[{"b":[2,3]}],"b":[{"c":2}],"c']":[1,2,3]})");

  resp = Run({"JSON.DEL", "json", ".b[0].c"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[{"b":[2,3]}],"b":[{}],"c']":[1,2,3]})");

  resp = Run({"JSON.DEL", "json", ".*"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");

  resp = Run({"JSON.SET", "json", ".", R"({"a": 1})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json", "."});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "json"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, ObjKeys) {
  string json = R"(
    {"a":{}, "b":{"a":"a"}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":{"a":3,"b":4}}, "e":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json", "$"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "b", "c", "d", "e"));

  resp = Run({"JSON.OBJKEYS", "json", "$.a"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  resp = Run({"JSON.OBJKEYS", "json", "$.b"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a"));

  resp = Run({"JSON.OBJKEYS", "json", "$.*"});
  EXPECT_THAT(resp, ElementsAreArrays(IsEmpty(), ElementsAre("a"), ElementsAre("a", "b"),
                                      ElementsAre("a", "b", "c"), IsEmpty()));

  resp = Run({"JSON.OBJKEYS", "json", "$.notfound"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  json = R"(
     {"a":[7], "inner": {"a": {"b": 2, "c": 1337}}}
   )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json", "$..a"});
  EXPECT_THAT(resp, ElementsAreArrays(IsEmpty(), ElementsAre("b", "c")));

  json = R"(
     {"a":{}, "b":{"c":{"d": {"e": 1337}}}}
   )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json", "$..*"});
  EXPECT_THAT(resp, ElementsAreArrays(IsEmpty(), ElementsAre("c"), ElementsAre("d"),
                                      ElementsAre("e"), IsEmpty()));
}

TEST_F(JsonFamilyTest, ObjKeysLegacy) {
  string json = R"(
    {"a":{}, "b":{"a":"a"}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":{"a":3,"b":4}}, "e":1}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "b", "c", "d", "e"));

  resp = Run({"JSON.OBJKEYS", "json", "."});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "b", "c", "d", "e"));

  resp = Run({"JSON.OBJKEYS", "json", ".a"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  resp = Run({"JSON.OBJKEYS", "json", ".b"});
  EXPECT_THAT(resp, "a");

  resp = Run({"JSON.OBJKEYS", "json", ".*"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  resp = Run({"JSON.OBJKEYS", "json", ".notfound"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  json = R"(
     {"a":[7], "inner": {"a": {"b": 2, "c": 1337}}}
   )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json", "..a"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());

  json = R"(
     {"a":{}, "b":{"c":{"d": {"e": 1337}}}}
   )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "json", "..*"});
  EXPECT_THAT(resp.GetVec(), IsEmpty());
}

TEST_F(JsonFamilyTest, StrAppend) {
  string json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */

  resp = Run({"JSON.STRAPPEND", "json", "$.a.a", "\"ab\""});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aab"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bb"},"d":{"a":1,"b":"b","c":3}})");

  const char kVal[] = "\"a\"";

  resp = Run({"JSON.STRAPPEND", "json", "$.a.*", kVal});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bb"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.c.b", kVal});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bba"},"d":{"a":1,"b":"b","c":3}})");

  /*
  Test response from several possible values
  In JSON V2, the response is an array of all possible values
  */

  resp = Run({"JSON.STRAPPEND", "json", "$.b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"a","b":"bba"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.c.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(4)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"aa","b":"bbaa"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.d.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(ArgType(RespExpr::NIL), IntArg(2), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"aa","b":"bbaa"},"d":{"a":1,"b":"ba","c":3}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":"aaa"}, "b":{"a":"aaa", "b":"aa", "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", "$.a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), IntArg(4)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{"a":"aa","b":"aaa","c":"aaaa"},"b":{"a":"aaa","b":"aa","c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(4), IntArg(3), IntArg(2)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{"a":"aa","b":"aaa","c":"aaaa"},"b":{"a":"aaaa","b":"aaa","c":"aa"}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":["aaaaa", "aaaaa"]}, "b":{"a":"aaa", "b":["aaaaa", "aaaaa"], "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", "$.a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":["aaaaa","aaaaa"]},"b":{"a":"aaa","b":["aaaaa","aaaaa"],"c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(4), ArgType(RespExpr::NIL), IntArg(2)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":["aaaaa","aaaaa"]},"b":{"a":"aaaa","b":["aaaaa","aaaaa"],"c":"aa"}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":{"c": "aaaaa"}}, "b":{"a":"aaa", "b":{"b": "aaaaa"}, "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", "$.a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":{"c":"aaaaa"}},"b":{"a":"aaa","b":{"b":"aaaaa"},"c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", "$.b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(4), ArgType(RespExpr::NIL), IntArg(2)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":{"c":"aaaaa"}},"b":{"a":"aaaa","b":{"b":"aaaaa"},"c":"aa"}})");

  json = R"(
    {"a":"foo", "inner": {"a": "bye"}, "inner1": {"a": 7}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", "$..a", "\"bar\""});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(6), IntArg(6), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":"foobar","inner":{"a":"byebar"},"inner1":{"a":7}})");
}

TEST_F(JsonFamilyTest, StrAppendLegacyMode) {
  string json = R"(
    {"a":{"a":"a"}, "b":{"a":"a", "b":1}, "c":{"a":"a", "b":"bb"}, "d":{"a":1, "b":"b", "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  /* Test simple response from only one value */

  resp = Run({"JSON.STRAPPEND", "json", ".a.a", "\"ab\""});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aab"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bb"},"d":{"a":1,"b":"b","c":3}})");

  const char kVal[] = "\"a\"";

  resp = Run({"JSON.STRAPPEND", "json", ".a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bb"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", ".c.b", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"a","b":1},"c":{"a":"a","b":"bba"},"d":{"a":1,"b":"b","c":3}})");

  /*
  Test response from several possible values
  In JSON legacy mode, the response contains only one value - the new length of the last updated
  string.
  */

  resp = Run({"JSON.STRAPPEND", "json", ".b.*", kVal});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"a","b":"bba"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", ".c.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"aa","b":"bbaa"},"d":{"a":1,"b":"b","c":3}})");

  resp = Run({"JSON.STRAPPEND", "json", ".d.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_THAT(
      resp,
      R"({"a":{"a":"aaba"},"b":{"a":"aa","b":1},"c":{"a":"aa","b":"bbaa"},"d":{"a":1,"b":"ba","c":3}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":"aaa"}, "b":{"a":"aaa", "b":"aa", "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", ".a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{"a":"aa","b":"aaa","c":"aaaa"},"b":{"a":"aaa","b":"aa","c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", ".b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":{"a":"aa","b":"aaa","c":"aaaa"},"b":{"a":"aaaa","b":"aaa","c":"aa"}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":["aaaaa", "aaaaa"]}, "b":{"a":"aaa", "b":["aaaaa", "aaaaa"], "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", ".a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":["aaaaa","aaaaa"]},"b":{"a":"aaa","b":["aaaaa","aaaaa"],"c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", ".b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":["aaaaa","aaaaa"]},"b":{"a":"aaaa","b":["aaaaa","aaaaa"],"c":"aa"}})");

  json = R"(
    {"a":{"a":"a", "b":"aa", "c":{"c": "aaaaa"}}, "b":{"a":"aaa", "b":{"b": "aaaaa"}, "c":"a"}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", ".a.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":{"c":"aaaaa"}},"b":{"a":"aaa","b":{"b":"aaaaa"},"c":"a"}})");

  resp = Run({"JSON.STRAPPEND", "json", ".b.*", kVal});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(
      resp,
      R"({"a":{"a":"aa","b":"aaa","c":{"c":"aaaaa"}},"b":{"a":"aaaa","b":{"b":"aaaaa"},"c":"aa"}})");

  json = R"(
    {"a":"foo", "inner": {"a": "bye"}, "inner1": {"a": 7}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "json", "..a", "\"bar\""});
  ASSERT_THAT(resp, ArgType(RespExpr::INT64));
  EXPECT_THAT(resp, IntArg(6));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":"foobar","inner":{"a":"byebar"},"inner1":{"a":7}})");
}

TEST_F(JsonFamilyTest, Clear) {
  string json = R"(
    [[], [0], [0,1], [0,1,2], 1, true, null, "d"]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.CLEAR", "json", "$[*]"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],[],[],[],0,true,null,"d"])");

  resp = Run({"JSON.CLEAR", "json", "$"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([])");

  json = R"(
    {"children": ["Yossi", "Rafi", "Benni", "Avraham", "Yehoshua", "Moshe"]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.CLEAR", "json", "$.children"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"children":[]})");

  resp = Run({"JSON.CLEAR", "json", "$"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");
}

TEST_F(JsonFamilyTest, ClearLegacy) {
  string json = R"(
    [[], [0], [0,1], [0,1,2], 1, true, null, "d"]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.CLEAR", "json", "[*]"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],[],[],[],0,true,null,"d"])");

  resp = Run({"JSON.CLEAR", "json", "."});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([])");

  json = R"(
    {"children": ["Yossi", "Rafi", "Benni", "Avraham", "Yehoshua", "Moshe"]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.CLEAR", "json", ".children"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"children":[]})");

  resp = Run({"JSON.CLEAR", "json", "."});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.CLEAR", "json"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({})");
}

TEST_F(JsonFamilyTest, ArrPop) {
  string json = R"(
    [[6,1,6], [7,2,7], [8,3,8]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "$[*]", "-2"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre("1", "2", "3"));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[6,6],[7,7],[8,8]])");

  json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "$[*]"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::NIL), R"("a")", R"("b")"));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],[],["a"]])");
}

TEST_F(JsonFamilyTest, ArrPopLegacy) {
  string json = R"(
    [[6,1,6], [7,2,7], [8,3,8]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "[*]", "-2"});
  EXPECT_EQ(resp, R"(3)");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[6,6],[7,7],[8,8]])");

  json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "."});
  EXPECT_EQ(resp, R"(["a","b"])");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],["a"]])");

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", ".", "0"});
  EXPECT_EQ(resp, "[]");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([["a"],["a","b"]])");

  resp = Run({"JSON.ARRPOP", "json"});
  EXPECT_EQ(resp, R"(["a","b"])");

  json = R"(
    {"a":"b"}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "."});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.SET", "json", ".", "[]"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "json", "."});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, ArrPopOutOfRange) {
  string json = R"(
    [0,1,2,3,4,5]
  )";

  auto resp = Run({"JSON.SET", "arr", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "arr", "$", "-55"});
  EXPECT_EQ(resp, "0");

  resp = Run({"JSON.SET", "arr", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "arr", "$", "55"});
  EXPECT_EQ(resp, "5");

  // Test legacy mode
  resp = Run({"JSON.SET", "arr", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "arr", ".", "-55"});
  EXPECT_EQ(resp, "0");

  resp = Run({"JSON.SET", "arr", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "arr", ".", "55"});
  EXPECT_EQ(resp, "5");
}

TEST_F(JsonFamilyTest, ArrTrim) {
  string json = R"(
    [[], ["a"], ["a", "b"], ["a", "b", "c"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "$[*]", "0", "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(1), IntArg(2), IntArg(2)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],["a"],["a","b"],["a","b"]])");

  json = R"(
    {"a":[], "nested": {"a": [1,4]}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "$..a", "0", "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(2)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"nested":{"a":[1,4]}})");

  json = R"(
    {"a":[1,2,3,2], "nested": {"a": false}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "$..a", "1", "2"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[2,3],"nested":{"a":false}})");

  json = R"(
    [1,2,3,4,5,6,7]
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "$", "2", "3"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([3,4])");
}

TEST_F(JsonFamilyTest, ArrTrimLegacy) {
  string json = R"(
    [[], ["a"], ["a", "b"], ["a", "b", "c"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "[*]", "0", "1"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([[],["a"],["a","b"],["a","b"]])");

  json = R"(
    {"a":[], "nested": {"a": [1,4]}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "..a", "0", "1"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[],"nested":{"a":[1,4]}})");

  json = R"(
    {"a":[1,2,3,2], "nested": {"a": false}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", "..a", "1", "2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[2,3],"nested":{"a":false}})");

  json = R"(
    [1,2,3,4,5,6,7]
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", ".", "2", "3"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([3,4])");

  json = R"(
    {"a":"b"}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "json", ".", "0", "0"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));
}

TEST_F(JsonFamilyTest, ArrTrimOutOfRange) {
  string arr = R"(
    [0,1,2,3,4]
  )";

  auto resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "-1", "3"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[]");

  resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "54", "55"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[]");

  resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "56", "55"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[]");

  resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "-55", "-55"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[0]");

  resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "-2", "-1"});
  EXPECT_THAT(resp, IntArg(2));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[3,4]");

  resp = Run({"JSON.SET", "arr", "$", arr});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "arr", "$", "-1", "-2"});
  EXPECT_THAT(resp, IntArg(0));
  EXPECT_EQ(Run({"JSON.GET", "arr"}), "[]");
}

TEST_F(JsonFamilyTest, ArrInsert) {
  string json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "json", "$[*]", "0", R"("a")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(2), IntArg(3)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([["a"],["a","a"],["a","a","b"]])");

  resp = Run({"JSON.ARRINSERT", "json", "$[*]", "-1", R"("b")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), IntArg(4)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([["b","a"],["a","b","a"],["a","a","b","b"]])");

  resp = Run({"JSON.ARRINSERT", "json", "$[*]", "1", R"("c")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(3), IntArg(4), IntArg(5)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([["b","c","a"],["a","c","b","a"],["a","c","a","b","b"]])");

  json = R"(
    {"a":{"b":"c"}, "b":[["a"], ["a", "b"]]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "json", "$.a", "0", R"("c")"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(JsonFamilyTest, ArrInsertLegacy) {
  string json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "json", "[*]", "0", R"("c")"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.ARRINSERT", "json", ".", "0", R"("c")"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"(["c",["c"],["c","a"],["c","a","b"]])");

  json = R"(
    {"a":{"b":"c"}, "b":[["a"], ["a", "b"]]}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "json", ".a", "0", R"("c")"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));
}

TEST_F(JsonFamilyTest, ArrInsertOutOfRange) {
  string json = R"(
    [0,1,2,3,4,5]
  )";

  auto resp = Run({"JSON.SET", "arr", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "arr", "$", "-55", "6"});
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.ARRINSERT", "arr", "$", "55", "6"});
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.ARRINSERT", "arr", ".", "-55", "6"});  // Legacy mode
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.ARRINSERT", "arr", ".", "55", "6"});  // Legacy mode
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.SET", "arr", ".", "[]"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINSERT", "arr", "$", "-1", "2"});
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.ARRINSERT", "arr", "$", "1", "2"});
  EXPECT_THAT(resp, ErrArg("index out of range"));

  resp = Run({"JSON.ARRINSERT", "arr", "$", "0", "2"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.GET", "arr"});
  EXPECT_EQ(resp, "[2]");
}

TEST_F(JsonFamilyTest, ArrAppend) {
  string json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRAPPEND", "json", "$[*]", R"("a")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(2), IntArg(3)));

  resp = Run({"JSON.ARRAPPEND", "json", "$[*]", R"("b")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), IntArg(4)));

  json = R"(
    {"a": [1], "nested": {"a": [1,2], "nested2": {"a": 42}}}
  )";
  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRAPPEND", "json", "$..a", "3"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(2), IntArg(3), ArgType(RespExpr::NIL)));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"a":[1,3],"nested":{"a":[1,2,3],"nested2":{"a":42}}})");
}

TEST_F(JsonFamilyTest, ArrAppendLegacy) {
  string json = R"(
    [[], ["a"], ["a", "b"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRAPPEND", "json", "[-1]", R"("c")"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.ARRAPPEND", "json", ".*", R"("c")"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"([["c"],["a","c"],["a","b","c","c"]])");

  json = R"(
    {"a":"b"}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRAPPEND", "json", ".", R"("c")"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));
}

TEST_F(JsonFamilyTest, ArrIndex) {
  string json = R"(
    [[], ["a"], ["a", "b"], ["a", "b", "c"]]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$[*]", R"("b")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(-1), IntArg(-1), IntArg(1), IntArg(1)));

  json = R"(
    {"a":["a","b","c","d"], "nested": {"a": ["c","d"]}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$..a", R"("b")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(-1)));

  json = R"(
    {"a":["a","b","c","d"], "nested": {"a": false}}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$..a", R"("b")"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), ArgType(RespExpr::NIL)));

  resp = Run(
      {"JSON.SET", "json", ".", R"({"key" : ["Alice", "Bob", "Carol", "David", "Eve", "Frank"]})"});
  ASSERT_EQ(resp, "OK");
  resp = Run({"JSON.ARRINDEX", "json", "$.key", R"("Bob")"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.ARRINDEX", "json", "$.key", R"("Bob")", "1", "2"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(JsonFamilyTest, ArrIndexLegacy) {
  string json = R"(
    {"children": ["John", "Jack", "Tom", "Bob", "Mike"]}
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", ".children", R"("Tom")"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.ARRINDEX", "json", ".children", R"("DoesNotExist")"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "json", ".children.[0].notexist", "3"});
  EXPECT_THAT(resp.type, RespExpr::ERROR);

  json = R"(
    {"a":"b"}
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", ".", R"("Tom")"});
  EXPECT_THAT(resp, ErrArg("wrong JSON type of path value"));
}

TEST_F(JsonFamilyTest, ArrIndexWithNumericValues) {
  string json = R"(
    [2, 3.0, 3]
  )";

  auto resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$", "3"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.ARRINDEX", "json", "$", "3.0"});
  EXPECT_THAT(resp, IntArg(1));

  json = R"(
    [[1, 2, 3], [1.0, 2.0, 3.0], 2.0, [1,2,3]]
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$", "[1,2,3]"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRINDEX", "json", "$", "[1.0,2.0,3.0]"});
  EXPECT_THAT(resp, IntArg(1));

  json = R"(
    [{"a":2},{"a":2.0},2.0]
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"a":2})"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"a":2.0})"});
  EXPECT_THAT(resp, IntArg(1));

  json = R"(
    [{"arr":[1,2,3],"number":2},{"arr":[1.0,2.0,3.0],"number":2.0},2]
  )";

  resp = Run({"JSON.SET", "json", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"arr":[1,2,3],"number":2})"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"arr":[1.0,2.0,3.0],"number":2.0})"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"arr":[1,2,3],"number":2.0})"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "json", "$", R"({"arr":[1.0,2.0,3.0],"number":2})"});
  EXPECT_THAT(resp, IntArg(-1));
}

TEST_F(JsonFamilyTest, ArrIndexWithNumericValuesLegacy) {
  string json = R"(
    [2, 3.0, 3]
  )";

  auto resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", ".", "3"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"JSON.ARRINDEX", "json", ".", "3.0"});
  EXPECT_THAT(resp, IntArg(1));

  json = R"(
    [{"arr":[1,2,3],"number":2},{"arr":[1.0,2.0,3.0],"number":2.0},2]
  )";

  resp = Run({"JSON.SET", "json", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "json", ".", R"({"arr":[1,2,3],"number":2})"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.ARRINDEX", "json", ".", R"({"arr":[1.0,2.0,3.0],"number":2.0})"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.ARRINDEX", "json", ".", R"({"arr":[1,2,3],"number":2.0})"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "json", ".", R"({"arr":[1.0,2.0,3.0],"number":2})"});
  EXPECT_THAT(resp, IntArg(-1));
}

TEST_F(JsonFamilyTest, ArrIndexOutOfRange) {
  auto resp = Run({"JSON.SET", "arr", ".", R"([1,1,1,1,1])"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-55", "-55"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-55", "-56"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-55", "-54"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-2"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-2", "-1"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "-2", "-3"});
  EXPECT_THAT(resp, IntArg(-1));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "55", "56"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "55", "54"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"JSON.ARRINDEX", "arr", "$", "1", "5", "4"});
  EXPECT_THAT(resp, IntArg(-1));
}

TEST_F(JsonFamilyTest, MGet) {
  string json[] = {
      R"(
    {"address":{"street":"14 Imber Street","city":"Petah-Tikva","country":"Israel","zipcode":"49511"}}
  )",
      R"(
    {"address":{"street":"Oranienburger Str. 27","city":"Berlin","country":"Germany","zipcode":"10117"}}
  )",
      R"(
    {"a":1, "b": 2, "nested": {"a": 3}, "c": null}
  )",
      R"(
    {"a":4, "b": 5, "nested": {"a": 6}, "c": null}
  )"};

  auto resp = Run({"JSON.SET", "json1", ".", json[0]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json2", ".", json[1]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.MGET", "json1", "??INNNNVALID??"});
  EXPECT_THAT(resp, ErrArg("ERR syntax error"));

  resp = Run({"JSON.MGET", "json1", "json2", "json3", "$.address.country"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(R"(["Israel"])", R"(["Germany"])", ArgType(RespExpr::NIL)));

  resp = Run({"JSON.SET", "json3", ".", json[2]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json4", ".", json[3]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.MGET", "json3", "json4", "$..a"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(R"([1,3])", R"([4,6])"));
}

TEST_F(JsonFamilyTest, MGetLegacy) {
  string json[] = {
      R"(
    {"address":{"street":"14 Imber Street","city":"Petah-Tikva","country":"Israel","zipcode":"49511"}}
  )",
      R"(
    {"address":{"street":"Oranienburger Str. 27","city":"Berlin","country":"Germany","zipcode":"10117"}}
  )",
      R"(
    {"a":1, "b": 2, "nested": {"a": 3}, "c": null}
  )",
      R"(
    {"a":4, "b": 5, "nested": {"a": 6}, "c": null}
  )"};

  auto resp = Run({"JSON.SET", "json1", ".", json[0]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json2", ".", json[1]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.MGET", "json1", "json2", "json3", ".address.country"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(R"("Israel")", R"("Germany")", ArgType(RespExpr::NIL)));

  resp = Run({"JSON.MGET", "json1", "json2", ".[0]"});
  if (auto jsonpathv2 = absl::GetFlag(FLAGS_jsonpathv2); jsonpathv2) {
    ASSERT_EQ(RespExpr::ARRAY, resp.type);
    EXPECT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));
  } else {
    EXPECT_THAT(resp, ErrArg("ERR syntax error"));
  }

  resp = Run({"JSON.SET", "json3", ".", json[2]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json4", ".", json[3]});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.MGET", "json3", "json4", "..a"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(R"(3)", R"(6)"));
}

TEST_F(JsonFamilyTest, DebugHelp) {
  auto resp = Run({"JSON.DEBUG", "HELP"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_EQ(resp.GetVec().size(), 3);

  EXPECT_THAT(resp.GetVec()[0].GetString(), HasSubstr("MEMORY"));
  EXPECT_THAT(resp.GetVec()[1].GetString(), HasSubstr("FIELDS"));
  EXPECT_THAT(resp.GetVec()[2].GetString(), HasSubstr("HELP"));
}

TEST_F(JsonFamilyTest, DebugFields) {
  string json = R"(
    [1, 2.3, "foo", true, null, {}, [], {"a":1, "b":2}, [1,2,3]]
  )";

  auto resp = Run({"JSON.SET", "json1", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "fields", "json1", "$[*]"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(1), IntArg(1), IntArg(1), IntArg(1),
                                         IntArg(0), IntArg(0), IntArg(2), IntArg(3)));

  resp = Run({"JSON.DEBUG", "fields", "json1", "$"});
  EXPECT_THAT(resp, IntArg(14));

  json = R"(
    [[1,2,3, [4,5,6,[6,7,8]]], {"a": {"b": {"c": 1337}}}]
  )";

  resp = Run({"JSON.SET", "json1", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "fields", "json1", "$[*]"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(11), IntArg(3)));

  resp = Run({"JSON.DEBUG", "fields", "json1", "$"});
  EXPECT_THAT(resp, IntArg(16));

  json = R"({"a":1, "b":2, "c":{"k1":1,"k2":2}})";

  resp = Run({"JSON.SET", "obj_doc", "$", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "FIELDS", "obj_doc", "$.a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.DEBUG", "fields", "obj_doc", "$.a"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(JsonFamilyTest, DebugFieldsLegacy) {
  string json = R"(
    [1, 2.3, "foo", true, null, {}, [], {"a":1, "b":2}, [1,2,3]]
  )";

  auto resp = Run({"JSON.SET", "json1", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "fields", "json1", "[*]"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.DEBUG", "fields", "json1", "."});
  EXPECT_THAT(resp, IntArg(14));

  resp = Run({"JSON.DEBUG", "fields", "json1"});
  EXPECT_THAT(resp, IntArg(14));

  json = R"(
    [[1,2,3, [4,5,6,[6,7,8]]], {"a": {"b": {"c": 1337}}}]
  )";

  resp = Run({"JSON.SET", "json1", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "fields", "json1", "[*]"});
  EXPECT_THAT(resp, IntArg(3));

  resp = Run({"JSON.DEBUG", "fields", "json1", "."});
  EXPECT_THAT(resp, IntArg(16));

  json = R"({"a":1, "b":2, "c":{"k1":1,"k2":2}})";

  resp = Run({"JSON.SET", "obj_doc", ".", json});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.DEBUG", "FIELDS", "obj_doc", ".a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.DEBUG", "fields", "obj_doc", ".a"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(JsonFamilyTest, DebugMemory) {
  auto resp = Run({"JSON.SET", "json1", "$",
                   R"([1, 2.3, "foo", true, null, {}, [], {"a":1, "b":2}, [1,2,3]])"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.DEBUG", "memory", "json1", "$[*]"});
  EXPECT_EQ(resp.type, RespExpr::ARRAY);
  EXPECT_EQ(resp.GetVec().size(), 9);
  EXPECT_EQ(resp.GetVec()[0].GetInt(), 0);
  EXPECT_EQ(resp.GetVec()[1].GetInt(), 0);
  EXPECT_EQ(resp.GetVec()[2].GetInt(), 0);
  EXPECT_EQ(resp.GetVec()[3].GetInt(), 0);
  EXPECT_EQ(resp.GetVec()[4].GetInt(), 0);
  EXPECT_GE(resp.GetVec()[5].GetInt(), 0);
  EXPECT_GE(resp.GetVec()[6].GetInt(), 0);
  EXPECT_GT(resp.GetVec()[7].GetInt(), 0);
  EXPECT_GT(resp.GetVec()[8].GetInt(), 0);

  resp = Run({"JSON.DEBUG", "memory", "json1", "$"});
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "bigstr", "$",
              R"({"text":"This is a longer string that should definitely exceed SSO buffer"})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "memory", "bigstr", "$.text"});
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "obj_doc", "$", R"({"num":42, "obj":{"k1":1,"k2":2}})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "MEMORY", "obj_doc", "$.num"});
  EXPECT_EQ(resp.GetInt(), 0);
  resp = Run({"JSON.DEBUG", "memory", "obj_doc", "$.obj"});
  EXPECT_GT(resp.GetInt(), 0);
}

TEST_F(JsonFamilyTest, DebugMemoryLegacy) {
  auto resp = Run({"JSON.SET", "json1", "$",
                   R"([1, 2.3, "foo", true, null, {}, [], {"a":1, "b":2}, [1,2,3]])"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.DEBUG", "memory", "json1", "."});
  EXPECT_EQ(resp.type, RespExpr::INT64);
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.DEBUG", "memory", "json1"});
  EXPECT_EQ(resp.type, RespExpr::INT64);
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "primitives", "$", R"({"num":42, "bool":true, "null":null})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "memory", "primitives", ".num"});
  EXPECT_EQ(resp.GetInt(), 0);
  resp = Run({"JSON.DEBUG", "memory", "primitives", ".bool"});
  EXPECT_EQ(resp.GetInt(), 0);
  resp = Run({"JSON.DEBUG", "memory", "primitives", ".null"});
  EXPECT_EQ(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "obj_doc", "$",
              R"({"longstring":"This is a very long string that definitely exceeds SSO buffer"})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "MEMORY", "obj_doc", ".longstring"});
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "arr", "$", R"([1,2,3,4,5,6,7,8,9,10])"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "memory", "arr", "."});
  EXPECT_GT(resp.GetInt(), 0);

  resp = Run({"JSON.SET", "obj", "$", R"({"a":1, "b":2, "c":3})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.DEBUG", "memory", "obj", "."});
  EXPECT_GT(resp.GetInt(), 0);
}

TEST_F(JsonFamilyTest, Resp) {
  auto resp = Run({"JSON.SET", "json", ".", PhonebookJson});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.RESP", "json", "$"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);

  resp = Run({"JSON.RESP", "json", "$.address.*"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre("New York", "NY", "21 2nd Street", "10021-3100"));

  resp = Run({"JSON.RESP", "json", "$.isAlive"});
  EXPECT_THAT(resp, "true");

  resp = Run({"JSON.RESP", "json", "$.age"});
  EXPECT_THAT(resp, IntArg(27));

  resp = Run({"JSON.RESP", "json", "$.weight"});
  EXPECT_THAT(resp, "135.25");
}

TEST_F(JsonFamilyTest, RespLegacy) {
  auto resp = Run({"JSON.SET", "json", ".", PhonebookJson});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.RESP", "json"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);

  resp = Run({"JSON.RESP", "json", ".address.*"});
  EXPECT_THAT(resp, "10021-3100");

  resp = Run({"JSON.RESP", "json", ".isAlive"});
  EXPECT_THAT(resp, "true");

  resp = Run({"JSON.RESP", "json", ".age"});
  EXPECT_THAT(resp, IntArg(27));

  resp = Run({"JSON.RESP", "json", ".weight"});
  EXPECT_THAT(resp, "135.25");
}

TEST_F(JsonFamilyTest, Set) {
  string json = R"(
    {"a":{"a":1, "b":2, "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json1", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json1", "$.a.*", "0"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json1"});
  EXPECT_EQ(resp, R"({"a":{"a":0,"b":0,"c":0}})");

  json = R"(
    {"a": [1,2,3,4,5]}
  )";

  resp = Run({"JSON.SET", "json2", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json2", "$.a[*]", "0"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json2"});
  EXPECT_EQ(resp, R"({"a":[0,0,0,0,0]})");

  json = R"(
    {"a": 2}
  )";

  resp = Run({"JSON.SET", "json3", "$", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", "$.b", "8"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", "$.c", "[1,2,3]"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", "$.z", "3", "XX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.SET", "json3", "$.b", "4", "NX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.GET", "json3"});
  EXPECT_EQ(resp, R"({"a":2,"b":8,"c":[1,2,3]})");
}

TEST_F(JsonFamilyTest, SetLegacy) {
  string json = R"(
    {"a":{"a":1, "b":2, "c":3}}
  )";

  auto resp = Run({"JSON.SET", "json1", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json1", ".a.*", "0"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json1"});
  EXPECT_EQ(resp, R"({"a":{"a":0,"b":0,"c":0}})");

  json = R"(
    {"a": [1,2,3,4,5]}
  )";

  resp = Run({"JSON.SET", "json2", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json2", ".a[*]", "0"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json2"});
  EXPECT_EQ(resp, R"({"a":[0,0,0,0,0]})");

  json = R"(
    {"a": 2}
  )";

  resp = Run({"JSON.SET", "json3", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".b", "8"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".c", "[1,2,3]"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".z", "3", "XX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.SET", "json3", ".z", "3"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".z", "4", "XX"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".b", "4", "NX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.SET", "json3", ".b", "5"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json3", ".", "[]", "NX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"JSON.GET", "json3"});
  EXPECT_EQ(resp, R"({"a":2,"b":5,"c":[1,2,3],"z":4})");

  json = R"(
    {"foo": "bar"}
  )";

  resp = Run({"JSON.SET", "json4", ".", json});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json4", "foo", "\"baz\"", "XX"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json4", "foo2", "\"qaz\"", "NX"});
  EXPECT_THAT(resp, "OK");
}

TEST_F(JsonFamilyTest, MSet) {
  string json1 = R"({"a":{"a":1,"b":2,"c":3}})";
  string json2 = R"({"a":{"a":4,"b":5,"c":6}})";

  auto resp = Run({"JSON.MSET", "j1", "$"});
  EXPECT_THAT(resp, ErrArg("wrong number"));
  resp = Run({"JSON.MSET", "j1", "$", json1, "j3", "$"});
  EXPECT_THAT(resp, ErrArg("wrong number"));

  resp = Run({"JSON.MSET", "j1", "$", json1, "j2", "$", json2, "j3", "$", json1, "j4", "$", json2});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.MGET", "j1", "j2", "j3", "j4", "$"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("[" + json1 + "]", "[" + json2 + "]", "[" + json1 + "]",
                                         "[" + json2 + "]"));
}

TEST_F(JsonFamilyTest, MSetLegacy) {
  string json1 = R"({"a":{"a":1,"b":2,"c":3}})";
  string json2 = R"({"a":{"a":4,"b":5,"c":6}})";

  auto resp = Run({"JSON.MSET", "j1", "."});
  EXPECT_THAT(resp, ErrArg("wrong number"));
  resp = Run({"JSON.MSET", "j1", ".", json1, "j3", "."});
  EXPECT_THAT(resp, ErrArg("wrong number"));

  resp = Run({"JSON.MSET", "j1", ".", json1, "j2", ".", json2, "j3", ".", json1, "j4", ".", json2});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.MGET", "j1", "j2", "j3", "j4", "$"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("[" + json1 + "]", "[" + json2 + "]", "[" + json1 + "]",
                                         "[" + json2 + "]"));
}

TEST_F(JsonFamilyTest, Merge) {
  string json = R"(
  { "a": "b",
    "c": {
      "d": "e",
      "f": "g"
    }
  }
  )";

  auto resp = Run({"JSON.SET", "j1", "$", json});
  EXPECT_EQ(resp, "OK");

  string patch = R"(
    {
      "a":"z",
      "c": {
      "f": null
      }
    }
  )";

  resp = Run({"JSON.MERGE", "new", "$", patch});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.GET", "new"});
  EXPECT_EQ(resp, R"({"a":"z","c":{"f":null}})");

  resp = Run({"JSON.MERGE", "j1", "$", patch});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j1"});
  EXPECT_EQ(resp, R"({"a":"z","c":{"d":"e"}})");

  resp = Run({"JSON.SET", "foo", "$", R"("{"f1":1, "common":2}")"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.MERGE", "foo", "$", R"({"f2":2, "common":4})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "foo"});
  EXPECT_EQ(resp, R"({"common":4,"f2":2})");

  json = R"({
  "ans": {
    "x": {
      "y" : {
        "doubled": false,
        "answers": [
          "foo",
          "bar"
        ]
      }
    }
  }
  })";
  resp = Run({"JSON.SET", "j2", "$", json});
  ASSERT_EQ(resp, "OK");

  patch = R"(
    {"z": {
      "doubled": false,
      "answers": ["xxx",  "yyy"]
     },
     "y": { "doubled": true}
     })";

  resp = Run({"JSON.MERGE", "j2", "$.ans.x", patch});

  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j2"});
  EXPECT_EQ(resp, R"({"ans":{"x":{"y":{"answers":["foo","bar"],"doubled":true},)"
                  R"("z":{"answers":["xxx","yyy"],"doubled":false}}}})");

  // Test not existing entry
  resp = Run({"JSON.MERGE", "j3", "$", patch});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j3"});
  EXPECT_EQ(resp, R"({"y":{"doubled":true},"z":{"answers":["xxx","yyy"],"doubled":false}})");
}

TEST_F(JsonFamilyTest, MergeLegacy) {
  string json = R"(
  { "a": "b",
    "c": {
      "d": "e",
      "f": "g"
    }
  }
  )";

  auto resp = Run({"JSON.SET", "j1", "$", json});
  EXPECT_EQ(resp, "OK");

  string patch = R"(
    {
      "a":"z",
      "c": {
      "f": null
      }
    }
  )";

  resp = Run({"JSON.MERGE", "new", ".", patch});
  EXPECT_EQ(resp, "OK");

  resp = Run({"JSON.GET", "new"});
  EXPECT_EQ(resp, R"({"a":"z","c":{"f":null}})");

  resp = Run({"JSON.MERGE", "j1", ".", patch});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j1"});
  EXPECT_EQ(resp, R"({"a":"z","c":{"d":"e"}})");

  resp = Run({"JSON.SET", "foo", "$", R"("{"f1":1, "common":2}")"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.MERGE", "foo", ".", R"({"f2":2, "common":4})"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "foo"});
  EXPECT_EQ(resp, R"({"common":4,"f2":2})");

  json = R"({
  "ans": {
    "x": {
      "y" : {
        "doubled": false,
        "answers": [
          "foo",
          "bar"
        ]
      }
    }
  }
  })";
  resp = Run({"JSON.SET", "j2", "$", json});
  ASSERT_EQ(resp, "OK");

  patch = R"(
    {"z": {
      "doubled": false,
      "answers": ["xxx",  "yyy"]
     },
     "y": { "doubled": true}
     })";

  resp = Run({"JSON.MERGE", "j2", ".ans.x", patch});

  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j2"});
  EXPECT_EQ(resp, R"({"ans":{"x":{"y":{"answers":["foo","bar"],"doubled":true},)"
                  R"("z":{"answers":["xxx","yyy"],"doubled":false}}}})");

  // Test not existing entry
  resp = Run({"JSON.MERGE", "j3", ".", patch});
  EXPECT_EQ(resp, "OK");
  resp = Run({"JSON.GET", "j3"});
  EXPECT_EQ(resp, R"({"y":{"doubled":true},"z":{"answers":["xxx","yyy"],"doubled":false}})");
}

TEST_F(JsonFamilyTest, GetString) {
  string json = R"(
  { "a": "b",
    "c": {
      "d": "e",
      "f": "g"
    }
  }
  )";

  auto resp = Run({"SET", "json", json});
  EXPECT_THAT(resp, "OK");
  resp = Run({"JSON.GET", "json", "$.c"});
  EXPECT_EQ(resp, R"([{"d":"e","f":"g"}])");
  Run({"SET", "not_json", "not_json"});
  resp = Run({"JSON.GET", "not_json", "$.c"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE"));
}

TEST_F(JsonFamilyTest, MaxNestingJsonDepth) {
  auto generate_nested_json = [](int depth) -> std::string {
    std::string json = "{";
    for (int i = 0; i < depth - 1; ++i) {
      json += R"("key": {)";
    }
    json += R"("key": "value")";  // Innermost value
    for (int i = 0; i < depth - 1; ++i) {
      json += "}";
    }
    json += "}";
    return json;
  };

  // Generate JSON with maximum allowed depth (256)
  /* std::string valid_json = generate_nested_json(255);

  // Test with valid JSON at depth 256
  auto resp = Run({"JSON.SET", "valid_json",  ".", valid_json});
  EXPECT_THAT(resp, "OK"); */

  // Generate JSON exceeding maximum depth (257)
  std::string invalid_json = generate_nested_json(257);

  // Test with invalid JSON at depth 257
  auto resp = Run({"JSON.SET", "invalid_json", ".", invalid_json});
  EXPECT_THAT(resp, ErrArg("failed to parse JSON"));
}

TEST_F(JsonFamilyTest, SetNestedFields) {
  auto resp = Run({"JSON.SET", "json", "$", "{}"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "json", "$['field1']", "1"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"field1":1})");

  resp = Run({"JSON.SET", "json", "$['-field2']", "2"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "json"});
  EXPECT_EQ(resp, R"({"-field2":2,"field1":1})");
}

TEST_F(JsonFamilyTest, ArrPopWithFormatParameter) {
  auto resp = Run({"JSON.ARRPOP", "test_resp3", "FORMAT", "EXPAND", "$.a"});
  ASSERT_THAT(resp, ErrArg("value is not an integer or out of range"));
}

TEST_F(JsonFamilyTest, DepthLimitExceeded) {
  string deep_json =
      R"({"jdiqr":{"nro":{"uzuf":{"bq":{"yc":{"zodmw":{"zbbq":{"sf":{"oule":{"j":{"mjsss":{"tap":{"bh":{"f":{"zlwgu":{"s":{"kt":{"fnmo":{"hub":{"xj":{"jo":{"ofara":{"kx":{"uw":{"z":{"mwvk":{"jo":{"qqz":{"b":{"tbp":{"esx":{"g":{"p":{"tpzk":{"i":{"azq":{"ttcd":{"wl":{"zo":{"l":{"nsq":{"tulso":{"uk":{"imfzw":{"vlub":{"k":{"ypml":{"voack":{"sosd":{"f":{"x":{"usv":{"hnw":{"ax":{"e":{"ozi":{"doi":{"k":{"bz":{"vxhp":{"e":{"vnpv":{"rhs":{"j":{"esp":{"f":{"ykyvy":{"xvmhg":{"eks":{"oijy":{"sjk":{"a":{"sejgy":{"msd":{"acyo":{"yxss":{"slbf":{"ssuns":{"c":{"kv":{"i":{"y":{"ubqz":{"uam":{"igaq":{"jl":{"vy":{"zlu":{"gscx":{"mb":{"idca":{"k":{"twx":{"ngjs":{"k":{"xcx":{"sxc":{"ye":{"fty":{"pho":{"lrn":{"wmv":{"h":{"sfuk":{"ilwzy":{"nlofv":{"mpcms":{"bg":{"jykgm":{"x":{"nbe":{"ixbyh":{"tmus":{"nqulr":{"cqxdw":{"wwpi":{"kj":{"udb":{"oct":{"tqkv":{"r":{"zev":{"rsu":{"gs":{"pyzm":{"au":{"__leaf":42}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}})";

  auto resp = Run({"JSON.SET", "test", "$", deep_json});
  ASSERT_THAT(resp, ErrArg("ERR failed to parse JSON"));
}

TEST_F(JsonFamilyTest, JsonCommandsWorkingWithOtherTypesBug) {
  std::string_view wrong_type_err{kWrongTypeErr};
  wrong_type_err.remove_prefix(1);  // Remove the leading - character

  auto resp = Run({"HSET", "k1", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  // First bug: JSON.SET should return an error
  resp = Run({"JSON.SET", "k1", "$", R"({"a":"b"})"});
  ASSERT_THAT(resp, ErrArg(wrong_type_err));

  // Second bug: JSON.DEL should not delete the hash
  resp = Run({"HSET", "k2", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"JSON.DEL", "k2"});
  ASSERT_THAT(resp, ErrArg(wrong_type_err));

  resp = Run({"HGET", "k2", "field"});
  EXPECT_THAT(resp, "value");
}

TEST_F(JsonFamilyTest, ResetStringKeyWithSetGet) {
  auto resp = Run({"JSON.SET", "key", "$", R"({"a":"b"})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "key"});
  EXPECT_THAT(resp, R"({"a":"b"})");

  // Resetting the key with a string value
  resp = Run({"SET", "key", R"({"a":"b"})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"GET", "key"});
  EXPECT_THAT(resp, R"({"a":"b"})");

  // JSON.GET should still work after resetting the key with a string value
  resp = Run({"JSON.GET", "key"});
  EXPECT_THAT(resp, R"({"a":"b"})");

  // Resetting the key again with JSON.SET
  // This should not cause any issues
  resp = Run({"JSON.SET", "key", "$", R"({"a":"b"})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "key"});
  EXPECT_THAT(resp, R"({"a":"b"})");
}

TEST_F(JsonFamilyTest, DelNonExistingKey) {
  auto resp = Run({"EXISTS", "nonexisting_key"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.DEL", "nonexisting_key", "."});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.DEL", "nonexisting_key", "$"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"JSON.DEL", "nonexisting_key"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(JsonFamilyTest, JsonKeysWithDots) {
  auto resp = Run(
      {"JSON.SET", "OFFERS:DBX-AGG1611-IGN", "$",
       R"({"Gallery": {"Images": {"bdz1xjm.jpeg": "some_value", "bdz1xjm": "another_value"}}})"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "OFFERS:DBX-AGG1611-IGN", "$['Gallery']['Images']['bdz1xjm']"});
  EXPECT_THAT(resp, "[\"another_value\"]");

  resp = Run({"JSON.GET", "OFFERS:DBX-AGG1611-IGN", "$['Gallery']['Images']['bdz1xjm.jpeg']"});
  EXPECT_THAT(resp, "[\"some_value\"]");
}

TEST_F(JsonFamilyTest, JsonSetDeleteExpiryOfExistingKey) {
  auto resp = Run("SET key foo EX 1000");
  ASSERT_THAT(resp, "OK");
  resp = Run("JSON.SET key $ {}");
  ASSERT_THAT(resp, "OK");
  resp = Run("TTL key");
  ASSERT_THAT(resp, IntArg(-1));
  resp = Run("EXPIRE key 100");
  ASSERT_THAT(resp, IntArg(1));
  resp = Run("TTL key");
  EXPECT_THAT(resp.GetInt(), 100);
}

TEST_F(JsonFamilyTest, JsonIntPathTest) {
  auto resp = Run(
      R"(JSON.SET test:images $ {"images":[{"id":1,"sizes":{"1":"small.jpg","10":"medium.jpg","14":"large.jpg","8":"thumb.jpg"}}]})");
  ASSERT_THAT(resp, "OK");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes.10)");
  EXPECT_THAT(resp, "[\"medium.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes["10"])");
  EXPECT_THAT(resp, "[\"medium.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes['10'])");
  EXPECT_THAT(resp, "[\"medium.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0]["sizes"]["10"])");
  EXPECT_THAT(resp, "[\"medium.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes.8)");
  EXPECT_THAT(resp, "[\"thumb.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes.14)");
  EXPECT_THAT(resp, "[\"large.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes["8"])");
  EXPECT_THAT(resp, "[\"thumb.jpg\"]");
  resp = Run(R"(JSON.GET test:images $.images[0].sizes["14"])");
  EXPECT_THAT(resp, "[\"large.jpg\"]");
}

TEST_F(JsonFamilyTest, ARRLEN_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  string json = R"({"a":[1], "b":{"a":[1,2,3]}, "c":{"x":"not_a"}})";
  auto resp = Run({"JSON.SET", "doc", ".", json});
  ASSERT_THAT(resp, "OK");

  // In RESP3 mode, this should return [1, 3] (direct integers)
  // NOT [[1], [3]] (integers wrapped in arrays)
  resp = Run({"JSON.ARRLEN", "doc", "$..a"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_EQ(resp.GetVec().size(), 2);

  // The bug: each element is wrapped in array when it shouldn't be
  // Check that elements are NOT arrays themselves
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));  // Should be integer, not array
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));  // Should be integer, not array

  // Verify the actual values
  EXPECT_THAT(resp.GetVec()[0], IntArg(1));
  EXPECT_THAT(resp.GetVec()[1], IntArg(3));
}

TEST_F(JsonFamilyTest, ARRAPPEND_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"a":[1], "b":{"a":[1,2,3]}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRAPPEND", "doc", "$..a", "2"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(2));
  EXPECT_THAT(resp.GetVec()[1], IntArg(4));
}

TEST_F(JsonFamilyTest, ARRINDEX_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"a":["x","y"], "b":{"a":["y","z"]}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRINDEX", "doc", "$..a", R"("y")"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(1));
  EXPECT_THAT(resp.GetVec()[1], IntArg(0));
}

TEST_F(JsonFamilyTest, ARRPOP_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"a":[7], "b":{"a":[8]}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRPOP", "doc", "$..a"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
}

TEST_F(JsonFamilyTest, ARRTRIM_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"a":[1,2], "b":{"a":[3,4,5]}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.ARRTRIM", "doc", "$..a", "0", "0"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(1));
  EXPECT_THAT(resp.GetVec()[1], IntArg(1));
}

TEST_F(JsonFamilyTest, STRLEN_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"s":"hi", "b":{"s":"abc"}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.STRLEN", "doc", "$..s"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(2));
  EXPECT_THAT(resp.GetVec()[1], IntArg(3));
}

TEST_F(JsonFamilyTest, OBJLEN_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"o":{"k":1}, "b":{"o":{"k":1,"m":2}}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJLEN", "doc", "$..o"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(1));
  EXPECT_THAT(resp.GetVec()[1], IntArg(2));
}

TEST_F(JsonFamilyTest, OBJKEYS_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"o":{"k":1}, "b":{"o":{"k":1,"m":2}}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.OBJKEYS", "doc", "$..o"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  // Each element should be array of keys, not array wrapped again
  auto& el0 = resp.GetVec()[0];
  auto& el1 = resp.GetVec()[1];
  ASSERT_THAT(el0, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(el1, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(el0.GetVec(), ElementsAre("k"));
  // Order of keys in objects is not guaranteed, so check size only for the second
  EXPECT_EQ(el1.GetVec().size(), 2);
}

TEST_F(JsonFamilyTest, STRAPPEND_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"s":"a", "b":{"s":"zz"}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.STRAPPEND", "doc", "$..s", R"("b")"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(2));
  EXPECT_THAT(resp.GetVec()[1], IntArg(3));
}

TEST_F(JsonFamilyTest, TOGGLE_RESP3NestedArrayBug) {
  Run({"HELLO", "3"});

  auto resp = Run({"JSON.SET", "doc", ".", R"({"b":true, "x":{"b":false}})"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.TOGGLE", "doc", "$..b"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_EQ(resp.GetVec().size(), 2);
  EXPECT_THAT(resp.GetVec()[0], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[1], Not(ArgType(RespExpr::ARRAY)));
  EXPECT_THAT(resp.GetVec()[0], IntArg(0));
  EXPECT_THAT(resp.GetVec()[1], IntArg(1));
}

TEST_F(JsonFamilyTest, SetOverLargeStringKey) {
  // Create a key with a large string value (must be heap-allocated, >16 bytes).
  string large_value(16000, 'x');
  Run({"SET", "key", large_value});

  // Overwrite the string key with a small JSON using root path.
  // Without the fix, freeing the old string inside SetJson caused a negative
  // memory diff in JsonAutoUpdater::SetJsonSize while bytes_used was 0.
  auto resp = Run({"JSON.SET", "key", "$", "1"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"JSON.GET", "key"});
  EXPECT_EQ(resp, "1");
}

TEST_F(JsonFamilyTest, SetFullJsonInvalidOnNewKey) {
  // Try to set invalid JSON on a non-existent key
  auto resp = Run("JSON.SET newkey $ {invalid}");
  EXPECT_THAT(resp, ErrArg("failed to parse JSON"));

  // Verify the key was NOT created (proper cleanup)
  resp = Run("EXISTS newkey");
  EXPECT_THAT(resp, IntArg(0));
}

}  // namespace dfly


================================================
FILE: src/server/list_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
extern "C" {
#include "redis/sds.h"
}

#include <absl/functional/overload.h>
#include <absl/strings/numbers.h>

#include "base/flags.h"
#include "base/logging.h"
#include "core/detail/listpack.h"
#include "core/qlist.h"
#include "facade/cmd_arg_parser.h"
#include "server/acl/acl_commands_def.h"
#include "server/blocking_controller.h"
#include "server/cluster/cluster_defs.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/namespaces.h"
#include "server/transaction.h"

/**
 * The number of entries allowed per internal list node can be specified
 * as a fixed maximum size or a maximum number of elements.
 * For a fixed maximum size, use -5 through -1, meaning:
 * -5: max size: 64 Kb  <-- not recommended for normal workloads
 * -4: max size: 32 Kb  <-- not recommended
 * -3: max size: 16 Kb  <-- probably not recommended
 * -2: max size: 8 Kb   <-- good
 * -1: max size: 4 Kb   <-- good
 * Positive numbers mean store up to _exactly_ that number of elements
 * per list node.
 * The highest performing option is usually -2 (8 Kb size) or -1 (4 Kb size),
 * but if your use case is unique, adjust the settings as necessary.
 *
 */
ABSL_FLAG(int32_t, list_max_listpack_size, -2, "Maximum listpack size, default is 8kb");

/**
 * Lists may also be compressed.
 * Compress depth is the number of quicklist listpack nodes from *each* side of
 * the list to *exclude* from compression.  The head and tail of the list
 * are always uncompressed for fast push/pop operations.  Settings are:
 * 0: disable all list compression
 * 1: depth 1 means "don't start compressing until after 1 node into the list,
 *    going from either the head or tail"
 *    So: [head]->node->node->...->node->[tail]
 *    [head], [tail] will always be uncompressed; inner nodes will compress.
 * 2: [head]->[next]->node->node->...->node->[prev]->[tail]
 *    2 here means: don't compress head or head->next or tail->prev or tail,
 *    but compress all nodes between them.
 * 3: [head]->[next]->[next]->node->node->...->node->[prev]->[prev]->[tail]
 * etc.
 *
 */

ABSL_FLAG(int32_t, list_compress_depth, 0, "Compress depth of the list. Default is no compression");
ABSL_FLAG(unsigned, list_tiering_threshold, 0,
          "Tiering threshold for lists. Default - no tiering.");

namespace dfly {

using namespace std;

using namespace facade;
using absl::GetFlag;
using absl::Overload;
using time_point = Transaction::time_point;

namespace {

class ListWrapper {
  using LP = detail::ListPack;

  std::variant<QList*, LP> impl_;

  template <typename F> decltype(auto) VisitRef(F f) const {  // Cast T* to T&
    return std::visit(Overload{[&f](auto* s) { return f(*s); }, f}, impl_);
  }

  template <typename F> decltype(auto) VisitMut(F f) {  // Cast T* to T&
    return std::visit(Overload{[&f](auto* s) { return f(*s); }, f}, impl_);
  }

  static QList* PromoteToQLIfNeeded(LP lp, size_t additional_size) {
    size_t sz = lp.BytesSize();
    if (ShouldStoreAsListPack(sz + additional_size)) {
      return nullptr;
    }
    QList* ql = CompactObj::AllocateMR<QList>(GetFlag(FLAGS_list_max_listpack_size),
                                              GetFlag(FLAGS_list_compress_depth));
    if (GetFlag(FLAGS_list_tiering_threshold) > 0) {
      ql->SetTieringParams(
          QList::TieringParams{.node_depth_threshold = GetFlag(FLAGS_list_tiering_threshold)});
    }
    if (lp.Size() > 0) {
      ql->AppendListpack(lp.GetPointer());
    }
    return ql;
  }

  void PushInternal(string_view value, QList::Where where, QList& ql) {
    ql.Push(value, where);
  }

  void PushInternal(string_view value, QList::Where where, LP& lp) {
    if (QList* ql = PromoteToQLIfNeeded(lp, value.size()); ql) {
      if (lp.Size() == 0) {  // otherwise we already appended it in PromoteToQLIfNeeded.
        lpFree(lp.GetPointer());
      }
      ql->Push(value, where);
      impl_ = ql;
    } else {
      lp.Push(value, where);
    }
  }

  bool InsertInternal(string_view pivot, string_view elem, QList::InsertOpt insert_opt, QList& ql) {
    return ql.Insert(pivot, elem, insert_opt);
  }

  bool InsertInternal(string_view pivot, string_view elem, QList::InsertOpt insert_opt, LP& lp) {
    uint8_t* p = lp.Find(pivot);
    if (!p)
      return false;

    if (QList* ql = PromoteToQLIfNeeded(lp, elem.size()); ql) {
      DCHECK_GT(ql->Size(), 0u);  // otherwise we would not Find the pivot.
      impl_ = ql;
      return ql->Insert(pivot, elem, insert_opt);
    }

    lp.Insert(p, elem, insert_opt);
    return true;
  }

  bool ReplaceInternal(long index, string_view elem, QList& ql) {
    return ql.Replace(index, elem);
  }

  bool ReplaceInternal(long index, string_view elem, LP& lp) {
    uint8_t* p = lp.Seek(index);
    if (!p)
      return false;

    if (QList* ql = PromoteToQLIfNeeded(lp, elem.size()); ql) {
      DCHECK_GT(ql->Size(), 0u);  // otherwise we would not seek
      impl_ = ql;
      return ql->Replace(index, elem);
    }
    lp.Replace(p, elem);
    return true;
  }

 public:
  template <typename T> explicit ListWrapper(T t) : impl_(std::forward<T>(t)) {
  }

  size_t Size() const {
    return VisitRef([](auto& list) { return list.Size(); });
  }

  string Pop(QList::Where where) {
    return VisitMut([where](auto& list) { return list.Pop(where); });
  }

  void Push(string_view value, QList::Where where) {
    VisitMut([&](auto& list) { PushInternal(value, where, list); });
  }

  string First(QList::Where where) const {
    return visit(Overload{[&](QList* ql) {
                            auto it = ql->GetIterator(where);
                            CHECK(it.Valid());
                            return it.Get().to_string();
                          },
                          [&](const LP& lp) { return lp.First(where); }},
                 impl_);
  }

  std::optional<string> At(long index) const {
    return visit(Overload{[&](QList* ql) -> optional<string> {
                            auto it = ql->GetIterator(index);
                            if (!it.Valid())
                              return nullopt;
                            return it.Get().to_string();
                          },
                          [&](const LP& lp) { return lp.At(index); }},
                 impl_);
  }

  vector<uint32_t> Pos(string_view element, uint32_t rank, uint32_t count, uint32_t max_len,
                       QList::Where where) const;

  bool Insert(string_view pivot, string_view elem, QList::InsertOpt insert_opt) {
    return VisitMut([&](auto& list) { return InsertInternal(pivot, elem, insert_opt, list); });
  }

  unsigned Remove(string_view elem, unsigned count, QList::Where where);

  bool Replace(long index, string_view elem) {
    return VisitMut([&](auto& list) { return ReplaceInternal(index, elem, list); });
  }

  void Erase(long start, long count) {
    VisitMut([&](auto& list) { list.Erase(start, count); });
  }

  void Launder(PrimeValue* pv) {
    if (auto* lp = std::get_if<LP>(&impl_)) {
      pv->SetRObjPtr(lp->GetPointer());
    } else if (pv->Encoding() != kEncodingQL2) {
      // We promoted to QList but the PrimeValue is not updated.
      pv->SetRObjPtr(nullptr);
      auto* ql = std::get<QList*>(impl_);
      pv->InitRobj(OBJ_LIST, kEncodingQL2, ql);
    }
  }
};

vector<uint32_t> ListWrapper::Pos(string_view element, uint32_t rank, uint32_t count,
                                  uint32_t max_len, QList::Where where) const {
  DCHECK_GT(rank, 0u);

  if (auto* lp = std::get_if<LP>(&impl_)) {
    return lp->Pos(element, rank, count, max_len, where);
  }

  vector<uint32_t> matches;

  auto* ql = std::get<QList*>(impl_);
  auto it = ql->GetIterator(where);
  if (!it.Valid())
    return matches;

  unsigned index = 0;
  while (max_len == 0 || index < max_len) {
    if (it.Get() == element) {
      if (rank == 1) {
        auto k = (where == QList::HEAD) ? index : ql->Size() - index - 1;
        matches.push_back(k);
        if (count && matches.size() >= count)
          break;
      } else {
        rank--;
      }
    }
    index++;
    if (!it.Next())
      break;
  }
  return matches;
}

unsigned ListWrapper::Remove(string_view elem, unsigned count, QList::Where where) {
  // try parsing the element into an integer.
  int64_t ival;
  int is_int = lpStringToInt64(elem.data(), elem.size(), &ival);
  CollectionEntry collection_elem(elem.data(), elem.size());
  if (is_int) {
    collection_elem = CollectionEntry{ival};
  }

  if (auto* lp = std::get_if<LP>(&impl_)) {
    return lp->Remove(collection_elem, count, where);
  }

  auto* ql = std::get<QList*>(impl_);
  auto it = ql->GetIterator(where);
  auto is_match = [&](const QList::Entry& entry) {
    return is_int ? entry.is_int() && entry.ival() == ival : entry == elem;
  };

  unsigned removed = 0;
  while (it.Valid()) {
    QList::Entry entry = it.Get();
    if (is_match(entry)) {
      it = ql->Erase(it);
      removed++;
      if (count && removed == count)
        break;
    } else {
      it.Next();
    }
  }
  return removed;
}

ListWrapper GetLW(const PrimeValue& mv) {
  if (mv.Encoding() == kEncodingQL2) {
    return ListWrapper{static_cast<QList*>(mv.RObjPtr())};
  }
  return ListWrapper{detail::ListPack(static_cast<uint8_t*>(mv.RObjPtr()))};
}

enum class ListDir : uint8_t { LEFT, RIGHT };

QList::Where ToWhere(ListDir dir) {
  return dir == ListDir::LEFT ? QList::HEAD : QList::TAIL;
}

ListDir ParseDir(facade::CmdArgParser* parser) {
  return parser->MapNext("LEFT", ListDir::LEFT, "RIGHT", ListDir::RIGHT);
}

class BPopPusher {
 public:
  BPopPusher(string_view pop_key, string_view push_key, ListDir popdir, ListDir pushdir);

  // Returns WRONG_TYPE, OK.
  // If OK is returned then use result() to fetch the value.
  OpResult<string> Run(unsigned limit_ms, Transaction* tx, ConnectionContext* cntx);

 private:
  OpResult<string> RunSingle(time_point tp, Transaction* tx, ConnectionContext* cntx);
  OpResult<string> RunPair(time_point tp, Transaction* tx, ConnectionContext* cntx);

  string_view pop_key_, push_key_;
  ListDir popdir_, pushdir_;
};

// Called as a callback from BPopGeneric after we've determined which key to pop.
std::string OpBPop(Transaction* t, EngineShard* shard, std::string_view key, ListDir dir) {
  DVLOG(2) << "popping from " << key << " " << t->DebugId();

  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto it_res = db_slice.FindMutable(t->GetDbContext(), key, OBJ_LIST);

  CHECK(it_res) << t->DebugId() << " " << key;  // must exist and must be ok.

  auto it = it_res->it;
  std::string value;
  size_t len;

  ListWrapper lw = GetLW(it->second);
  QList::Where where = ToWhere(dir);
  value = lw.Pop(where);
  lw.Launder(&it->second);
  len = lw.Size();

  it_res->post_updater.Run();

  OpArgs op_args = t->GetOpArgs(shard);
  if (len == 0) {
    DVLOG(1) << "deleting key " << key << " " << t->DebugId();
    op_args.GetDbSlice().Del(op_args.db_cntx, it);
  }

  if (op_args.shard->journal()) {
    string command = dir == ListDir::LEFT ? "LPOP" : "RPOP";
    RecordJournal(op_args, command, ArgSlice{key}, 1);
  }

  return value;
}

ListWrapper CreateOrGet(const OpArgs& op_args, string_view key, bool create, PrimeValue* pv) {
  if (create) {
    auto blocking_controller = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
    if (blocking_controller) {
      blocking_controller->Awaken(op_args.db_cntx.db_index, key);
    }

    uint8_t* lp = lpNew(0);
    pv->InitRobj(OBJ_LIST, kEncodingListPack, lp);
    return ListWrapper{detail::ListPack(lp)};
  }

  return GetLW(*pv);
}

OpResult<string> OpMoveSingleShard(const OpArgs& op_args, string_view src, string_view dest,
                                   ListDir src_dir, ListDir dest_dir) {
  auto& db_slice = op_args.GetDbSlice();
  auto src_res = db_slice.FindMutable(op_args.db_cntx, src, OBJ_LIST);
  if (!src_res)
    return src_res.status();

  auto src_it = src_res->it;
  string val;
  ListWrapper srcql_v2 = GetLW(src_it->second);
  size_t prev_len = srcql_v2.Size();

  if (src == dest) {  // simple case.
    val = srcql_v2.Pop(ToWhere(src_dir));
    srcql_v2.Push(val, ToWhere(dest_dir));
    srcql_v2.Launder(&src_it->second);
    return val;
  }

  src_res->post_updater.Run();

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, dest, OBJ_LIST);
  RETURN_ON_BAD_STATUS(op_res);
  auto& dest_res = *op_res;

  // Insertion of dest could invalidate src_it. Find it again.
  src_res = db_slice.FindMutable(op_args.db_cntx, src, OBJ_LIST);
  src_it = src_res->it;

  ListWrapper dest_lw = CreateOrGet(op_args, dest, dest_res.is_new, &dest_res.it->second);

  val = srcql_v2.Pop(ToWhere(src_dir));
  srcql_v2.Launder(&src_it->second);

  dest_lw.Push(val, ToWhere(dest_dir));
  dest_lw.Launder(&dest_res.it->second);

  src_res->post_updater.Run();
  dest_res.post_updater.Run();

  if (prev_len == 1) {
    db_slice.Del(op_args.db_cntx, src_it);
  }

  return val;
}

// Read-only peek operation that determines whether the list exists and optionally
// returns the first from left/right value without popping it from the list.
OpResult<string> Peek(const OpArgs& op_args, string_view key, ListDir dir, bool fetch) {
  auto it_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res) {
    return it_res.status();
  }

  if (!fetch)
    return OpStatus::OK;

  const PrimeValue& pv = it_res.value()->second;
  DCHECK_GT(pv.Size(), 0u);  // should be not-empty.

  ListWrapper lw = GetLW(pv);
  return lw.First(ToWhere(dir));
}

OpResult<uint32_t> OpPush(const OpArgs& op_args, std::string_view key, ListDir dir,
                          bool skip_notexist, const facade::ArgRange& vals, bool journal_rewrite) {
  DbSlice::ItAndUpdater res;

  if (skip_notexist) {
    auto tmp_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_LIST);
    if (tmp_res == OpStatus::KEY_NOTFOUND)
      return 0;  // Redis returns 0 for nonexisting keys for the *PUSHX actions.
    RETURN_ON_BAD_STATUS(tmp_res);
    res = std::move(*tmp_res);
  } else {
    auto op_res = op_args.GetDbSlice().AddOrFind(op_args.db_cntx, key, OBJ_LIST);
    RETURN_ON_BAD_STATUS(op_res);
    res = std::move(*op_res);
  }

  size_t len = 0;
  DVLOG(1) << "OpPush " << key << " new_key " << res.is_new;
  ListWrapper lw = CreateOrGet(op_args, key, res.is_new, &res.it->second);

  QList::Where where = ToWhere(dir);
  for (string_view v : vals) {
    lw.Push(v, where);
  }
  lw.Launder(&res.it->second);
  len = lw.Size();

  if (journal_rewrite && op_args.shard->journal()) {
    string command = dir == ListDir::LEFT ? "LPUSH" : "RPUSH";
    vector<string_view> mapped(vals.Size() + 1);
    mapped[0] = key;
    std::copy(vals.begin(), vals.end(), mapped.begin() + 1);
    RecordJournal(op_args, command, mapped, 2);
  }

  return len;
}

OpResult<StringVec> OpPop(const OpArgs& op_args, string_view key, ListDir dir, uint32_t count,
                          bool return_results, bool journal_rewrite) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res)
    return it_res.status();

  if (count == 0)
    return StringVec{};

  auto it = it_res->it;
  size_t prev_len = 0;
  StringVec res;

  ListWrapper lw = GetLW(it->second);
  prev_len = lw.Size();

  if (prev_len < count) {
    count = prev_len;
  }

  if (return_results) {
    res.reserve(count);
  }

  QList::Where where = ToWhere(dir);
  for (unsigned i = 0; i < count; ++i) {
    string val = lw.Pop(where);
    if (return_results) {
      res.push_back(std::move(val));
    }
  }
  lw.Launder(&it->second);

  it_res->post_updater.Run();

  if (count == prev_len) {
    db_slice.Del(op_args.db_cntx, it);
  }

  if (op_args.shard->journal() && journal_rewrite) {
    string command = dir == ListDir::LEFT ? "LPOP" : "RPOP";
    RecordJournal(op_args, command, ArgSlice{key}, 2);
  }
  return res;
}

OpResult<string> MoveTwoShards(Transaction* trans, string_view src, string_view dest,
                               ListDir src_dir, ListDir dest_dir, bool conclude_on_error) {
  DCHECK_EQ(2u, trans->GetUniqueShardCnt());

  OpResult<string> find_res[2];
  OpResult<string> result;

  // Transaction is comprised of 2 hops:
  // 1 - check for entries existence, their types and if possible -
  //     read the value we may move from the source list.
  // 2.  If everything is ok, pop from source and push the peeked value into
  //     the destination.
  //
  auto cb = [&](Transaction* t, EngineShard* shard) {
    auto args = t->GetShardArgs(shard->shard_id());
    DCHECK_EQ(1u, args.Size());
    bool is_dest = args.Front() == dest;
    find_res[is_dest] = Peek(t->GetOpArgs(shard), args.Front(), src_dir, !is_dest);
    return OpStatus::OK;
  };

  trans->Execute(std::move(cb), false);

  if (!find_res[0] || find_res[1].status() == OpStatus::WRONG_TYPE) {
    result = find_res[0] ? find_res[1] : find_res[0];
    if (conclude_on_error)
      trans->Conclude();
  } else {
    // Everything is ok, lets proceed with the mutations.
    auto cb = [&](Transaction* t, EngineShard* shard) {
      auto args = t->GetShardArgs(shard->shard_id());
      auto key = args.Front();
      bool is_dest = (key == dest);
      OpArgs op_args = t->GetOpArgs(shard);

      if (is_dest) {
        string_view val{find_res[0].value()};
        DVLOG(1) << "Pushing value: " << val << " to list: " << dest;

        OpPush(op_args, key, dest_dir, false, ArgSlice{val}, true);

        // blocking_controller does not have to be set with non-blocking transactions.
        auto blocking_controller = t->GetNamespace().GetBlockingController(shard->shard_id());
        if (blocking_controller) {
          IndexSlice slice(0, 1);
          ShardArgs sa{absl::MakeSpan(&src, 1), absl::MakeSpan(&slice, 1)};

          // hack, again. since we hacked which queue we are waiting on (see RunPair)
          // we must clean-up src key here manually. See RunPair why we do this.
          // in short- we suspended on "src" on both shards.
          blocking_controller->RemovedWatched(sa, t);
        }
      } else {
        DVLOG(1) << "Popping value from list: " << key;
        OpPop(op_args, key, src_dir, 1, false, true);
      }

      return OpStatus::OK;
    };
    trans->Execute(std::move(cb), true);
    result = std::move(find_res[0].value());
  }

  return result;
}

OpResult<uint32_t> OpLen(const OpArgs& op_args, std::string_view key) {
  auto res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_LIST);
  if (!res)
    return res.status();

  ListWrapper lw = GetLW(res.value()->second);
  return lw.Size();
}

OpResult<string> OpIndex(const OpArgs& op_args, std::string_view key, long index) {
  auto res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_LIST);
  if (!res)
    return res.status();

  ListWrapper lw = GetLW(res.value()->second);
  optional elem = lw.At(index);
  if (!elem)
    return OpStatus::KEY_NOTFOUND;
  return std::move(*elem);
}

OpResult<vector<uint32_t>> OpPos(const OpArgs& op_args, string_view key, string_view element,
                                 int rank, uint32_t count, uint32_t max_len) {
  DCHECK(key.data() && element.data());
  DCHECK_NE(rank, 0);

  auto it_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res.ok())
    return it_res.status();

  const PrimeValue& pv = (*it_res)->second;
  ListWrapper lw = GetLW(pv);

  QList::Where where = QList::HEAD;
  if (rank < 0) {
    rank = -rank;
    where = QList::TAIL;
  }

  return lw.Pos(element, rank, count, max_len, where);
}

OpResult<int> OpInsert(const OpArgs& op_args, string_view key, string_view pivot, string_view elem,
                       QList::InsertOpt insert_opt) {
  DCHECK(key.data() && pivot.data() && elem.data());

  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res)
    return it_res.status();

  ListWrapper lw = GetLW(it_res->it->second);

  int res = -1;

  if (lw.Insert(pivot, elem, insert_opt)) {
    lw.Launder(&it_res->it->second);
    res = int(lw.Size());
  }

  return res;
}

OpResult<uint32_t> OpRem(const OpArgs& op_args, string_view key, string_view elem, long count) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res)
    return it_res.status();

  ListWrapper lw = GetLW(it_res->it->second);

  QList::Where where = QList::HEAD;
  if (count < 0) {
    count = -count;
    where = QList::TAIL;
  }

  unsigned removed = lw.Remove(elem, count, where);
  size_t len = lw.Size();
  lw.Launder(&it_res->it->second);
  it_res->post_updater.Run();

  if (len == 0) {
    db_slice.Del(op_args.db_cntx, it_res->it);
  }

  return removed;
}

OpStatus OpSet(const OpArgs& op_args, string_view key, string_view elem, long index) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res)
    return it_res.status();

  ListWrapper lw = GetLW(it_res->it->second);
  OpStatus status = OpStatus::OUT_OF_RANGE;
  if (lw.Replace(index, elem)) {
    lw.Launder(&it_res->it->second);
    status = OpStatus::OK;
  }
  return status;
}

OpStatus OpTrim(const OpArgs& op_args, string_view key, long start, long end) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_LIST);
  if (!it_res)
    return it_res.status();

  auto it = it_res->it;

  long llen = long(it->second.Size());

  /* convert negative indexes */
  if (start < 0)
    start = llen + start;
  if (end < 0)
    end = llen + end;
  if (start < 0)
    start = 0;

  long ltrim, rtrim;

  /* Invariant: start >= 0, so this test will be true when end < 0.
   * The range is empty when start > end or start >= length. */
  if (start > end || start >= llen) {
    /* Out of range start or start > end result in empty list */
    ltrim = llen;
    rtrim = 0;
  } else {
    if (end >= llen)
      end = llen - 1;
    ltrim = start;
    rtrim = llen - end - 1;
  }

  ListWrapper lw = GetLW(it->second);
  lw.Erase(0, ltrim);
  lw.Erase(-rtrim, rtrim);
  lw.Launder(&it->second);

  it_res->post_updater.Run();

  if (it->second.Size() == 0) {
    db_slice.Del(op_args.db_cntx, it);
  }
  return OpStatus::OK;
}

OpResult<StringVec> OpRange(const OpArgs& op_args, std::string_view key, long start, long end) {
  auto res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_LIST);
  if (!res)
    return res.status();

  const PrimeValue& pv = (*res)->second;
  long llen = pv.Size();

  /* convert negative indexes */
  if (start < 0)
    start = llen + start;
  if (end < 0)
    end = llen + end;
  if (start < 0)
    start = 0;

  /* Invariant: start >= 0, so this test will be true when end < 0.
   * The range is empty when start > end or start >= length. */
  if (start > end || start >= llen) {
    /* Out of range start or start > end result in empty list */
    return StringVec{};
  }

  StringVec str_vec;
  container_utils::IterateList(
      pv,
      [&str_vec](container_utils::ContainerEntry ce) {
        str_vec.emplace_back(ce.ToString());
        return true;
      },
      start, end);
  return str_vec;
}

void MoveGeneric(string_view src, string_view dest, ListDir src_dir, ListDir dest_dir,
                 Transaction* tx, SinkReplyBuilder* builder) {
  OpResult<string> result;

  if (tx->GetUniqueShardCnt() == 1) {
    auto cb = [&](Transaction* t, EngineShard* shard) {
      OpArgs op_args = t->GetOpArgs(shard);
      auto op_res = OpMoveSingleShard(op_args, src, dest, src_dir, dest_dir);
      if (op_res) {
        if (op_args.shard->journal()) {
          std::string_view cmd = src_dir == ListDir::LEFT ? "LPOP" : "RPOP";
          RecordJournal(op_args, cmd, ArgSlice{src}, 1);
          cmd = dest_dir == ListDir::LEFT ? "LPUSH" : "RPUSH";
          RecordJournal(op_args, cmd, ArgSlice{dest, op_res.value()}, 1);
        }
      }
      return op_res;
    };
    result = tx->ScheduleSingleHopT(std::move(cb));
  } else {
    result = MoveTwoShards(tx, src, dest, src_dir, dest_dir, true);
  }

  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  if (result) {
    return rb->SendBulkString(*result);
  }

  switch (result.status()) {
    case OpStatus::KEY_NOTFOUND:
      rb->SendNull();
      break;

    default:
      builder->SendError(result.status());
      break;
  }
}

void RPopLPush(CmdArgList args, CommandContext* cmd_cntx) {
  string_view src = ArgS(args, 0);
  string_view dest = ArgS(args, 1);

  MoveGeneric(src, dest, ListDir::RIGHT, ListDir::LEFT, cmd_cntx->tx(), cmd_cntx->rb());
}

void BRPopLPush(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  auto [src, dest] = parser.Next<string_view, string_view>();
  float timeout = parser.Next<float>();
  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  if (timeout < 0)
    return cmd_cntx->SendError("timeout is negative");

  BPopPusher bpop_pusher(src, dest, ListDir::RIGHT, ListDir::LEFT);
  OpResult<string> op_res =
      bpop_pusher.Run(unsigned(timeout * 1000), cmd_cntx->tx(), cmd_cntx->server_conn_cntx());

  if (op_res) {
    return builder->SendBulkString(*op_res);
  }

  switch (op_res.status()) {
    case OpStatus::CANCELLED:
    case OpStatus::TIMED_OUT:
      return builder->SendNull();
      break;

    default:
      return builder->SendError(op_res.status());
      break;
  }
}

void BLMove(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  auto [src, dest] = parser.Next<string_view, string_view>();
  ListDir src_dir = ParseDir(&parser);
  ListDir dest_dir = ParseDir(&parser);
  float timeout = parser.Next<float>();
  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  if (timeout < 0)
    return cmd_cntx->SendError("timeout is negative");

  BPopPusher bpop_pusher(src, dest, src_dir, dest_dir);
  OpResult<string> op_res =
      bpop_pusher.Run(unsigned(timeout * 1000), cmd_cntx->tx(), cmd_cntx->server_conn_cntx());

  if (op_res) {
    return builder->SendBulkString(*op_res);
  }

  switch (op_res.status()) {
    case OpStatus::CANCELLED:
    case OpStatus::TIMED_OUT:
      return builder->SendNull();
      break;

    default:
      return builder->SendError(op_res.status());
      break;
  }
}

BPopPusher::BPopPusher(string_view pop_key, string_view push_key, ListDir popdir, ListDir pushdir)
    : pop_key_(pop_key), push_key_(push_key), popdir_(popdir), pushdir_(pushdir) {
}

OpResult<string> BPopPusher::Run(unsigned limit_ms, Transaction* tx, ConnectionContext* cntx) {
  time_point tp =
      limit_ms ? chrono::steady_clock::now() + chrono::milliseconds(limit_ms) : time_point::max();

  if (tx->GetUniqueShardCnt() == 1) {
    return RunSingle(tp, tx, cntx);
  }

  return RunPair(tp, tx, cntx);
}

OpResult<string> BPopPusher::RunSingle(time_point tp, Transaction* tx, ConnectionContext* cntx) {
  OpResult<string> op_res;
  bool is_multi = tx->IsMulti();
  auto cb_move = [&](Transaction* t, EngineShard* shard) {
    OpArgs op_args = t->GetOpArgs(shard);
    op_res = OpMoveSingleShard(op_args, pop_key_, push_key_, popdir_, pushdir_);
    if (op_res) {
      if (op_args.shard->journal()) {
        std::string_view cmd = popdir_ == ListDir::LEFT ? "LPOP" : "RPOP";
        RecordJournal(op_args, cmd, ArgSlice{pop_key_}, 1);
        cmd = pushdir_ == ListDir::LEFT ? "LPUSH" : "RPUSH";
        RecordJournal(op_args, cmd, ArgSlice{push_key_, op_res.value()}, 1);
      }
    }
    return OpStatus::OK;
  };
  tx->Execute(cb_move, false);

  if (is_multi || op_res.status() != OpStatus::KEY_NOTFOUND) {
    if (op_res.status() == OpStatus::KEY_NOTFOUND) {
      op_res = OpStatus::TIMED_OUT;
    }
    tx->Conclude();
    return op_res;
  }

  const auto key_checker = [](EngineShard* owner, const DbContext& context, Transaction*,
                              std::string_view key) -> bool {
    return context.GetDbSlice(owner->shard_id()).FindReadOnly(context, key, OBJ_LIST).ok();
  };

  // Block
  auto status = tx->WaitOnWatch(tp, pop_key_, key_checker, &(cntx->blocked), &(cntx->paused));
  if (status != OpStatus::OK)
    return status;

  tx->Execute(cb_move, true);
  return op_res;
}

OpResult<string> BPopPusher::RunPair(time_point tp, Transaction* tx, ConnectionContext* cntx) {
  bool is_multi = tx->IsMulti();
  OpResult<string> op_res = MoveTwoShards(tx, pop_key_, push_key_, popdir_, pushdir_, false);

  if (is_multi || op_res.status() != OpStatus::KEY_NOTFOUND) {
    if (op_res.status() == OpStatus::KEY_NOTFOUND) {
      op_res = OpStatus::TIMED_OUT;
    }
    tx->Conclude();
    return op_res;
  }

  const auto key_checker = [](EngineShard* owner, const DbContext& context, Transaction*,
                              std::string_view key) -> bool {
    return context.GetDbSlice(owner->shard_id()).FindReadOnly(context, key, OBJ_LIST).ok();
  };

  // a hack: we watch in both shards for pop_key but only in the source shard it's relevant.
  // Therefore we follow the regular flow of watching the key but for the destination shard it
  // will never be triggerred.
  // This allows us to run Transaction::Execute on watched transactions in both shards.
  if (auto status = tx->WaitOnWatch(tp, pop_key_, key_checker, &cntx->blocked, &cntx->paused);
      status != OpStatus::OK)
    return status;

  return MoveTwoShards(tx, pop_key_, push_key_, popdir_, pushdir_, true);
}

void PushGeneric(ListDir dir, bool skip_notexists, CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpPush(t->GetOpArgs(shard), key, dir, skip_notexists, args.subspan(1), false);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    return cmd_cntx->SendLong(result.value());
  }

  return cmd_cntx->SendError(result.status());
}

void PopGeneric(ListDir dir, CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  string_view key = parser.Next();

  uint32_t count = 1;
  bool return_arr = false;
  if (parser.HasNext()) {
    count = parser.Next<uint32_t>();
    return_arr = true;
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpPop(t->GetOpArgs(shard), key, dir, count, true, false);
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  switch (result.status()) {
    case OpStatus::KEY_NOTFOUND:
      return rb->SendNull();
    case OpStatus::WRONG_TYPE:
      return cmd_cntx->SendError(kWrongTypeErr);
    default:;
  }

  if (return_arr) {
    rb->SendBulkStrArr(*result);
  } else {
    DCHECK_EQ(1u, result->size());
    rb->SendBulkString(result->front());
  }
}

void BPopGeneric(ListDir dir, CmdArgList args, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 2u);

  float timeout;
  auto timeout_str = ArgS(args, args.size() - 1);
  if (!absl::SimpleAtof(timeout_str, &timeout)) {
    return cmd_cntx->SendError("timeout is not a float or out of range");
  }
  if (timeout < 0) {
    return cmd_cntx->SendError("timeout is negative");
  }
  VLOG(1) << "BPop timeout(" << timeout << ")";

  std::string popped_value;
  auto cb = [dir, &popped_value](Transaction* t, EngineShard* shard, std::string_view key) {
    popped_value = OpBPop(t, shard, key, dir);
  };

  auto* cntx = cmd_cntx->server_conn_cntx();
  Transaction* tx = cmd_cntx->tx();
  OpResult<string> popped_key = container_utils::RunCbOnFirstNonEmptyBlocking(
      tx, OBJ_LIST, std::move(cb), unsigned(timeout * 1000), &cntx->blocked, &cntx->paused);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (popped_key) {
    DVLOG(1) << "BPop " << tx->DebugId() << " popped from key " << popped_key;  // key.
    std::string_view str_arr[2] = {*popped_key, popped_value};
    return rb->SendBulkStrArr(str_arr);
  }

  DVLOG(1) << "result for " << tx->DebugId() << " is " << popped_key.status();

  switch (popped_key.status()) {
    case OpStatus::WRONG_TYPE:
      return cmd_cntx->SendError(kWrongTypeErr);
    case OpStatus::CANCELLED:
    case OpStatus::TIMED_OUT:
      return rb->SendNullArray();
    case OpStatus::KEY_MOVED: {
      auto error = cluster::SlotOwnershipError(*tx->GetUniqueSlotId());
      CHECK(!error.status.has_value() || error.status.value() != facade::OpStatus::OK);
      return cmd_cntx->SendError(error);
    }
    default:
      LOG(ERROR) << "Unexpected error " << popped_key.status();
  }
  return rb->SendNullArray();
}

// Returns the first non-empty key found in the shard arguments along with its type validity.
// Returns a pair of (key, is_valid_type) where is_valid_type is true if the key exists
// and has the correct type (LIST). If a wrong type is found, returns that key with false.
// Returns nullopt if no suitable key is found.
optional<pair<string_view, bool>> GetFirstNonEmptyKeyFound(EngineShard* shard, Transaction* t) {
  ShardArgs keys = t->GetShardArgs(shard->shard_id());
  DCHECK(!keys.Empty());

  auto& db_slice = t->GetDbSlice(shard->shard_id());
  optional<pair<string_view, bool>> result;

  for (string_view key : keys) {
    auto res = db_slice.FindReadOnly(t->GetDbContext(), key, OBJ_LIST);
    if (res) {
      result = {key, true};
      break;
    }

    // If the key is not found, check if it's a wrong type error
    if (res.status() == OpStatus::WRONG_TYPE) {
      result = {key, false};
      break;
    }
  }

  return result;
}

void CmdLMPop(CmdArgList args, CommandContext* cmd_cntx) {
  auto* response_builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  CmdArgParser parser{args};
  parser.Skip(parser.Next<size_t>());  // skip numkeys and keys

  ListDir dir = parser.MapNext("LEFT", ListDir::LEFT, "RIGHT", ListDir::RIGHT);
  size_t pop_count = 1;
  if (parser.Check("COUNT"))
    pop_count = parser.Next<size_t>();

  if (!parser.Finalize())
    return cmd_cntx->SendError(parser.TakeError().MakeReply());

  // Create a vector to store first found key for each shard
  vector<optional<pair<string_view, bool>>> found_keys_per_shard(shard_set->size());

  auto cb = [&](Transaction* t, EngineShard* shard) {
    // Each shard writes results to its own space
    found_keys_per_shard[shard->shard_id()] = GetFirstNonEmptyKeyFound(shard, t);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(cb), false /* followed by another hop */);

  // Find the first existing key from command arguments
  optional<string_view> key_to_pop;
  bool found_wrong_type = false;
  size_t min_index = numeric_limits<size_t>::max();

  // Iterate over each shard to find the key with the smallest index
  for (ShardId sid = 0; sid < found_keys_per_shard.size(); ++sid) {
    if (!found_keys_per_shard[sid])
      continue;

    const auto& [found_key, is_valid_type] = *found_keys_per_shard[sid];
    ShardArgs shard_args = cmd_cntx->tx()->GetShardArgs(sid);

    for (auto it = shard_args.begin(); it != shard_args.end(); ++it) {
      if (found_key == *it && it.index() < min_index) {
        min_index = it.index();
        key_to_pop = found_key;
        found_wrong_type = !is_valid_type;
        break;
      }
    }
  }

  // Handle errors and empty cases first
  if (!key_to_pop || found_wrong_type) {
    cmd_cntx->tx()->Conclude();
    if (found_wrong_type) {
      response_builder->SendError(kWrongTypeErr);
    } else {
      response_builder->SendNull();
    }
    return;
  }

  // Pop values from the found key
  optional<ShardId> key_shard = Shard(*key_to_pop, shard_set->size());
  OpResult<StringVec> result;

  auto cb_pop = [dir, pop_count, key_shard, &result, key = *key_to_pop](Transaction* t,
                                                                        EngineShard* shard) {
    if (*key_shard == shard->shard_id()) {
      result = OpPop(t->GetOpArgs(shard), key, dir, pop_count, true, true);
    }
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(cb_pop), true);

  if (result) {
    response_builder->StartArray(2);
    response_builder->SendBulkString(*key_to_pop);
    response_builder->SendBulkStrArr(*result);
  } else {
    response_builder->SendNull();
  }
}

void CmdBLMPop(CmdArgList args, CommandContext* cmd_cntx) {
  auto* response_builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  CmdArgParser parser{args};
  float timeout = parser.Next<float>();
  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  if (timeout < 0)
    return cmd_cntx->SendError("timeout is negative");

  parser.Skip(parser.Next<size_t>());  // Skip numkeys and keys
  ListDir dir = parser.MapNext("LEFT", ListDir::LEFT, "RIGHT", ListDir::RIGHT);

  size_t pop_count = 1;
  if (parser.Check("COUNT"))
    pop_count = parser.Next<size_t>();

  if (!parser.Finalize())
    return cmd_cntx->SendError(parser.TakeError().MakeReply());

  OpResult<StringVec> result;
  auto cb = [&](Transaction* t, EngineShard* shard, string_view key) {
    result = OpPop(t->GetOpArgs(shard), key, dir, pop_count, true, true);
    return result.status();
  };

  ConnectionContext* conn_cntx = cmd_cntx->server_conn_cntx();
  OpResult<string> popped_key = container_utils::RunCbOnFirstNonEmptyBlocking(
      cmd_cntx->tx(), OBJ_LIST, std::move(cb), unsigned(timeout * 1000), &conn_cntx->blocked,
      &conn_cntx->paused);

  if (popped_key.ok()) {
    response_builder->StartArray(2);
    response_builder->SendBulkString(*popped_key);
    response_builder->SendBulkStrArr(*result);
  } else {
    response_builder->SendNull();
  }
}

void CmdLPush(CmdArgList args, CommandContext* cmd_cntx) {
  return PushGeneric(ListDir::LEFT, false, args, cmd_cntx);
}

void CmdLPushX(CmdArgList args, CommandContext* cmd_cntx) {
  return PushGeneric(ListDir::LEFT, true, args, cmd_cntx);
}

void CmdLPop(CmdArgList args, CommandContext* cmd_cntx) {
  return PopGeneric(ListDir::LEFT, args, cmd_cntx);
}

void CmdRPush(CmdArgList args, CommandContext* cmd_cntx) {
  return PushGeneric(ListDir::RIGHT, false, args, cmd_cntx);
}

void CmdRPushX(CmdArgList args, CommandContext* cmd_cntx) {
  return PushGeneric(ListDir::RIGHT, true, args, cmd_cntx);
}

void CmdRPop(CmdArgList args, CommandContext* cmd_cntx) {
  return PopGeneric(ListDir::RIGHT, args, cmd_cntx);
}

void CmdLLen(CmdArgList args, CommandContext* cmd_cntx) {
  auto key = ArgS(args, 0);
  auto cb = [&](Transaction* t, EngineShard* shard) { return OpLen(t->GetOpArgs(shard), key); };
  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    cmd_cntx->SendLong(result.value());
  } else if (result.status() == OpStatus::KEY_NOTFOUND) {
    cmd_cntx->SendLong(0);
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdLPos(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  auto [key, elem] = parser.Next<string_view, string_view>();

  int rank = 1;
  uint32_t count = 1;
  uint32_t max_len = 0;
  bool skip_count = true;

  while (parser.HasNext()) {
    if (parser.Check("RANK")) {
      rank = parser.Next<int>();
      continue;
    }

    if (parser.Check("COUNT")) {
      count = parser.Next<uint32_t>();
      skip_count = false;
      continue;
    }

    if (parser.Check("MAXLEN")) {
      max_len = parser.Next<uint32_t>();
      continue;
    }

    parser.Skip(1);
  }

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (rank == 0)
    return rb->SendError(kInvalidIntErr);

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&, &key = key, &elem = elem](Transaction* t, EngineShard* shard) {
    return OpPos(t->GetOpArgs(shard), key, elem, rank, count, max_len);
  };

  Transaction* trans = cmd_cntx->tx();
  auto result = trans->ScheduleSingleHopT(std::move(cb));

  if (result.status() == OpStatus::WRONG_TYPE) {
    return rb->SendError(result.status());
  } else if (result.status() == OpStatus::INVALID_VALUE) {
    return rb->SendError(result.status());
  }

  if (skip_count) {
    if (result->empty()) {
      rb->SendNull();
    } else {
      rb->SendLong((*result)[0]);
    }
  } else {
    rb->SendLongArr(absl::MakeConstSpan(result.value()));
  }
}

void CmdLIndex(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  std::string_view index_str = ArgS(args, 1);
  int32_t index;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (!absl::SimpleAtoi(index_str, &index)) {
    rb->SendError(kInvalidIntErr);
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpIndex(t->GetOpArgs(shard), key, index);
  };

  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    rb->SendBulkString(result.value());
  } else if (result.status() == OpStatus::WRONG_TYPE) {
    rb->SendError(result.status());
  } else {
    rb->SendNull();
  }
}

/* LINSERT <key> (BEFORE|AFTER) <pivot> <element> */
void CmdLInsert(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  string_view key = parser.Next();
  QList::InsertOpt ins_opt = parser.MapNext("AFTER", QList::AFTER, "BEFORE", QList::BEFORE);
  auto [pivot, elem] = parser.Next<string_view, string_view>();
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  DCHECK(pivot.data() && elem.data());

  auto cb = [&, &pivot = pivot, &elem = elem](Transaction* t, EngineShard* shard) {
    return OpInsert(t->GetOpArgs(shard), key, pivot, elem, ins_opt);
  };

  OpResult<int> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result || result == OpStatus::KEY_NOTFOUND) {
    return rb->SendLong(result.value_or(0));
  }

  rb->SendError(result.status());
}

void CmdLTrim(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view s_str = ArgS(args, 1);
  string_view e_str = ArgS(args, 2);
  int32_t start, end;

  if (!absl::SimpleAtoi(s_str, &start) || !absl::SimpleAtoi(e_str, &end)) {
    cmd_cntx->SendError(kInvalidIntErr);
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpTrim(t->GetOpArgs(shard), key, start, end);
  };
  OpStatus st = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (st == OpStatus::KEY_NOTFOUND)
    st = OpStatus::OK;
  cmd_cntx->SendError(st);
}

void CmdLRange(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  std::string_view s_str = ArgS(args, 1);
  std::string_view e_str = ArgS(args, 2);
  int32_t start, end;

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!absl::SimpleAtoi(s_str, &start) || !absl::SimpleAtoi(e_str, &end)) {
    rb->SendError(kInvalidIntErr);
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRange(t->GetOpArgs(shard), key, start, end);
  };

  auto res = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!res && res.status() != OpStatus::KEY_NOTFOUND) {
    return rb->SendError(res.status());
  }

  rb->SendBulkStrArr(*res);
}

// lrem key 5 foo, will remove foo elements from the list if exists at most 5 times.
void CmdLRem(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  std::string_view index_str = ArgS(args, 1);
  std::string_view elem = ArgS(args, 2);
  int32_t count;

  if (!absl::SimpleAtoi(index_str, &count)) {
    cmd_cntx->SendError(kInvalidIntErr);
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRem(t->GetOpArgs(shard), key, elem, count);
  };
  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result || result == OpStatus::KEY_NOTFOUND) {
    return cmd_cntx->SendLong(result.value_or(0));
  }
  cmd_cntx->SendError(result.status());
}

void CmdLSet(CmdArgList args, CommandContext* cmd_cntx) {
  std::string_view key = ArgS(args, 0);
  std::string_view index_str = ArgS(args, 1);
  std::string_view elem = ArgS(args, 2);
  int32_t count;

  if (!absl::SimpleAtoi(index_str, &count)) {
    cmd_cntx->SendError(kInvalidIntErr);
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpSet(t->GetOpArgs(shard), key, elem, count);
  };
  OpResult<void> result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (result) {
    cmd_cntx->rb()->SendOk();
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdBLPop(CmdArgList args, CommandContext* cmd_cntx) {
  BPopGeneric(ListDir::LEFT, args, cmd_cntx);
}

void CmdBRPop(CmdArgList args, CommandContext* cmd_cntx) {
  BPopGeneric(ListDir::RIGHT, args, cmd_cntx);
}

void CmdLMove(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  auto [src, dest] = parser.Next<string_view, string_view>();
  ListDir src_dir = ParseDir(&parser);
  ListDir dest_dir = ParseDir(&parser);

  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  MoveGeneric(src, dest, src_dir, dest_dir, cmd_cntx->tx(), cmd_cntx->rb());
}

}  // namespace

using CI = CommandId;

#define HFUNC(x) SetHandler(&Cmd##x)

void RegisterListFamily(CommandRegistry* registry) {
  registry->StartFamily(acl::LIST);
  *registry
      << CI{"LPUSH", CO::JOURNALED | CO::FAST | CO::DENYOOM, -3, 1, 1}.HFUNC(LPush)
      << CI{"LPUSHX", CO::JOURNALED | CO::FAST | CO::DENYOOM, -3, 1, 1}.HFUNC(LPushX)
      << CI{"LPOP", CO::JOURNALED | CO::FAST, -2, 1, 1}.HFUNC(LPop)
      << CI{"LMPOP", CO::JOURNALED | CO::VARIADIC_KEYS | CO::NO_AUTOJOURNAL, -4, 2, 2}.HFUNC(LMPop)
      << CI{"BLMPOP", CO::JOURNALED | CO::BLOCKING | CO::VARIADIC_KEYS | CO::NO_AUTOJOURNAL, -5, 3,
            3}
             .HFUNC(BLMPop)
      << CI{"RPUSH", CO::JOURNALED | CO::FAST | CO::DENYOOM, -3, 1, 1}.HFUNC(RPush)
      << CI{"RPUSHX", CO::JOURNALED | CO::FAST | CO::DENYOOM, -3, 1, 1}.HFUNC(RPushX)
      << CI{"RPOP", CO::JOURNALED | CO::FAST, -2, 1, 1}.HFUNC(RPop)
      << CI{"RPOPLPUSH", CO::JOURNALED | CO::NO_AUTOJOURNAL, 3, 1, 2}.SetHandler(RPopLPush)
      << CI{"BRPOPLPUSH", CO::JOURNALED | CO::NOSCRIPT | CO::BLOCKING | CO::NO_AUTOJOURNAL, 4, 1, 2}
             .SetHandler(BRPopLPush)
      << CI{"BLPOP", CO::JOURNALED | CO::NOSCRIPT | CO::BLOCKING | CO::NO_AUTOJOURNAL, -3, 1, -2}
             .HFUNC(BLPop)
      << CI{"BRPOP", CO::JOURNALED | CO::NOSCRIPT | CO::BLOCKING | CO::NO_AUTOJOURNAL, -3, 1, -2}
             .HFUNC(BRPop)
      << CI{"LLEN", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(LLen)
      << CI{"LPOS", CO::READONLY, -3, 1, 1}.HFUNC(LPos)
      << CI{"LINDEX", CO::READONLY, 3, 1, 1}.HFUNC(LIndex)
      << CI{"LINSERT", CO::JOURNALED | CO::DENYOOM, 5, 1, 1}.HFUNC(LInsert)
      << CI{"LRANGE", CO::READONLY, 4, 1, 1}.HFUNC(LRange)
      << CI{"LSET", CO::JOURNALED | CO::DENYOOM, 4, 1, 1}.HFUNC(LSet)
      << CI{"LTRIM", CO::JOURNALED, 4, 1, 1}.HFUNC(LTrim)
      << CI{"LREM", CO::JOURNALED, 4, 1, 1}.HFUNC(LRem)
      << CI{"LMOVE", CO::JOURNALED | CO::NO_AUTOJOURNAL, 5, 1, 2}.HFUNC(LMove)
      << CI{"BLMOVE", CO::JOURNALED | CO::NO_AUTOJOURNAL | CO::BLOCKING, 6, 1, 2}.SetHandler(
             BLMove);
}

}  // namespace dfly


================================================
FILE: src/server/list_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/strings/match.h>

#include <random>

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/blocking_controller.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/test_utils.h"
#include "server/transaction.h"
#include "util/fibers/fibers.h"

using namespace testing;
using namespace std;
using namespace util;
using absl::StrCat;

namespace dfly {

class ListFamilyTest : public BaseFamilyTest {
 protected:
  ListFamilyTest() {
    num_threads_ = 4;
  }

  static unsigned NumWatched() {
    atomic_uint32_t sum{0};

    auto ns = &namespaces->GetDefaultNamespace();
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      auto* bc = ns->GetBlockingController(es->shard_id());
      if (bc)
        sum.fetch_add(bc->NumWatched(0), memory_order_relaxed);
    });

    return sum.load();
  }

  static bool HasAwakened() {
    atomic_uint32_t sum{0};
    auto ns = &namespaces->GetDefaultNamespace();
    shard_set->RunBriefInParallel([&](EngineShard* es) {
      auto* bc = ns->GetBlockingController(es->shard_id());
      if (bc)
        sum.fetch_add(bc->HasAwakedTransaction(), memory_order_relaxed);
    });

    return sum.load() > 0;
  }
};

const char kKey1[] = "x";
const char kKey2[] = "b";
const char kKey3[] = "c";

TEST_F(ListFamilyTest, Basic) {
  auto resp = Run({"lpush", kKey1, "1"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"lpush", kKey2, "2"});
  ASSERT_THAT(resp, IntArg(1));
  resp = Run({"llen", kKey1});
  ASSERT_THAT(resp, IntArg(1));
}

TEST_F(ListFamilyTest, Expire) {
  auto resp = Run({"lpush", kKey1, "1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"expire", kKey1, "1"});
  EXPECT_THAT(resp, IntArg(1));

  AdvanceTime(1000);

  resp = Run({"lpush", kKey1, "1"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(ListFamilyTest, BLMPopNonblocking) {
  auto resp = Run({"lpush", kKey1, "1", "2", "3", "4"});
  EXPECT_THAT(resp, IntArg(4));

  resp = Run({"blmpop", "0.01", "2", kKey2, kKey1, "LEFT"});
  EXPECT_THAT(resp, RespElementsAre(kKey1, RespElementsAre("4")));

  resp = Run({"blmpop", "0.01", "2", kKey2, kKey1, "RIGHT", "COUNT", "2"});
  EXPECT_THAT(resp, RespElementsAre(kKey1, RespElementsAre("1", "2")));

  // If the count exceeds the size of the key's values (but the key is non-empty) then return all of
  // the key's values
  resp = Run({"blmpop", "0.01", "1", kKey1, "RIGHT", "COUNT", "10"});
  EXPECT_THAT(resp, RespElementsAre(kKey1, RespElementsAre("3")));
}

TEST_F(ListFamilyTest, BLMPopInvalidSyntax) {
  // Not enough arguments
  auto resp = Run({"blmpop", "0.1", "1", kKey1});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // Timeout is not a float
  resp = Run({"blmpop", "foo", "1", kKey1, "LEFT", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("value is not a valid float"));

  // Negative timeout
  resp = Run({"blmpop", "-0.01", "1", kKey1, "LEFT", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("timeout is negative"));

  // Zero keys
  resp = Run({"blmpop", "0.01", "0", "LEFT", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  // Number of keys is not uint
  resp = Run({"blmpop", "0.01", "aa", kKey1, "LEFT"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Missing LEFT/RIGHT
  resp = Run({"blmpop", "0.01", "1", kKey1, "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Wrong number of keys
  resp = Run({"blmpop", "0.01", "1", kKey1, kKey2, "LEFT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // COUNT without number
  resp = Run({"blmpop", "0.01", "1", kKey1, "LEFT", "COUNT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // COUNT is not uint
  resp = Run({"blmpop", "0.01", "1", kKey1, "LEFT", "COUNT", "boo"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Too many arguments
  resp = Run({"blmpop", "0.01", "1", "c", "LEFT", "COUNT", "2", "foo"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(ListFamilyTest, BLMPopBlocking) {
  // attempting to pop from empty key results in blocking and returns
  // null if no values are pushed to the key.
  RespExpr resp;
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp = Run({"blmpop", "0.1", "1", kKey1, "LEFT"});
  });
  ThisFiber::SleepFor(1ms);
  ASSERT_TRUE(IsLocked(0, kKey1));

  fb0.Join();
  ASSERT_FALSE(IsLocked(0, kKey1));
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // BLMPOP should not block if there is a non-empty key available
  resp = Run({"lpush", kKey1, "0"});
  EXPECT_THAT(resp, IntArg(1));

  auto fb1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp = Run({"blmpop", "0.1", "1", kKey1, "LEFT"});
  });
  ThisFiber::SleepFor(1ms);
  // shouldn't need to lock the key just pop immediately
  ASSERT_FALSE(IsLocked(0, kKey1));
  fb1.Join();

  // should block until a key is available and then immediately unblock
  auto fb2 = pp_->at(2)->LaunchFiber(Launch::dispatch, [&] {
    resp = Run({"blmpop", "0.1", "1", kKey1, "LEFT"});
  });

  // key should be locked while waiting
  WaitUntilLocked(0, kKey1);
  ASSERT_TRUE(IsLocked(0, kKey1));

  auto push_resp = Run({"lpush", kKey1, "1"});
  EXPECT_THAT(push_resp, IntArg(1));

  // key should be unlocked after being inserted to
  fb2.Join();
  ASSERT_FALSE(IsLocked(0, kKey1));
  EXPECT_THAT(resp, RespElementsAre(kKey1, RespElementsAre("1")));
}

TEST_F(ListFamilyTest, BLPopUnblocking) {
  auto resp = Run({"lpush", kKey1, "1"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"lpush", kKey2, "2"});
  ASSERT_THAT(resp, IntArg(1));

  resp = Run({"blpop", kKey1, kKey2});  // missing "0" delimiter.
  ASSERT_THAT(resp, ErrArg("timeout is not a float"));

  resp = Run({"blpop", kKey1, kKey2, "0"});
  ASSERT_EQ(2, GetDebugInfo().shards_count);
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre(kKey1, "1"));

  resp = Run({"blpop", kKey1, kKey2, "0"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre(kKey2, "2"));

  resp = Run({"set", "z", "1"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"blpop", "z", "0"});
  ASSERT_THAT(resp, ErrArg("WRONGTYPE "));

  ASSERT_FALSE(IsLocked(0, "x"));
  ASSERT_FALSE(IsLocked(0, "y"));
  ASSERT_FALSE(IsLocked(0, "z"));
}

TEST_F(ListFamilyTest, BLPopBlocking) {
  RespExpr resp0, resp1;

  // Run the fiber at creation.
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"blpop", "x", "0"});
    LOG(INFO) << "pop0";
  });

  ThisFiber::SleepFor(50us);
  auto fb1 = pp_->at(1)->LaunchFiber([&] {
    resp1 = Run({"blpop", "x", "0"});
    LOG(INFO) << "pop1";
  });
  ThisFiber::SleepFor(30us);

  RespExpr resp = pp_->at(1)->Await([&] { return Run("B1", {"lpush", "x", "2", "1"}); });
  ASSERT_THAT(resp, IntArg(2));

  fb0.Join();
  fb1.Join();

  // fb0 should start first and be the first transaction blocked. Therefore, it should pop '1'.
  // sometimes order is switched, need to think how to fix it.
  int64_t epoch0 = GetDebugInfo("IO0").clock;
  int64_t epoch1 = GetDebugInfo("IO1").clock;
  ASSERT_LT(epoch0, epoch1);
  ASSERT_THAT(resp0, ArrLen(2));
  EXPECT_THAT(resp0.GetVec(), ElementsAre("x", "1"));
  ASSERT_FALSE(IsLocked(0, "x"));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BLPopMultiple) {
  RespExpr resp0, resp1;

  resp0 = Run({"blpop", kKey1, kKey2, "0.01"});  // timeout
  EXPECT_THAT(resp0, ArgType(RespExpr::NIL_ARRAY));
  ASSERT_EQ(2, GetDebugInfo().shards_count);

  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey2));

  auto fb1 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"blpop", kKey1, kKey2, "0"});
  });

  pp_->at(1)->Await([&] { Run({"lpush", kKey1, "1", "2", "3"}); });
  fb1.Join();

  ASSERT_THAT(resp0, ArrLen(2));
  EXPECT_THAT(resp0.GetVec(), ElementsAre(kKey1, "3"));
  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey2));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BLPopTimeout) {
  RespExpr resp = Run({"blpop", kKey1, kKey2, kKey3, "0.01"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));
  EXPECT_EQ(3, GetDebugInfo().shards_count);
  ASSERT_FALSE(IsLocked(0, kKey1));

  // Under Multi
  resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");

  Run({"blpop", kKey1, "0"});
  resp = Run({"exec"});

  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));
  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BLPopTimeout2) {
  Run({"BLPOP", "blist1", "blist2", "0.1"});

  Run({"RPUSH", "blist2", "d"});
  Run({"RPUSH", "blist2", "hello"});

  auto resp = Run({"BLPOP", "blist1", "blist2", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("blist2", "d"));

  Run({"RPUSH", "blist1", "a"});
  Run({"DEL", "blist2"});
  Run({"RPUSH", "blist2", "d"});
  Run({"BLPOP", "blist1", "blist2", "1"});
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BLPopMultiPush) {
  Run({"exists", kKey1, kKey2, kKey3});
  ASSERT_EQ(3, GetDebugInfo().shards_count);
  RespExpr blpop_resp;
  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, kKey2, kKey3, "0"});
  });

  WaitUntilLocked(0, kKey1);

  auto p1_fb = pp_->at(1)->LaunchFiber([&] {
    for (unsigned i = 0; i < 100; ++i) {
      // a filler command to create scheduling queue.
      Run({"exists", kKey1, kKey2, kKey3});
    }
  });

  auto p2_fb = pp_->at(2)->LaunchFiber([&] {
    Run({"multi"});
    Run({"lpush", kKey3, "C"});
    Run({"exists", kKey2});
    Run({"lpush", kKey2, "B"});
    Run({"exists", kKey1});
    Run({"lpush", kKey1, "A"});
    Run({"exists", kKey1, kKey2, kKey3});
    auto resp = Run({"exec"});
    ASSERT_THAT(resp, ArrLen(6));
  });

  p1_fb.Join();
  p2_fb.Join();

  pop_fb.Join();

  // We can't determine what key was popped, so only check result presence.
  // It might not be first kKey3 "C" because of squashing and re-ordering.
  ASSERT_THAT(blpop_resp, ArrLen(2));
  ASSERT_THAT(Run({"exists", kKey1, kKey2, kKey3}), IntArg(2));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, WrongTypeDoesNotWake) {
  RespExpr blpop_resp;

  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, "0"});
  });

  WaitUntilLocked(0, kKey1);

  auto p1_fb = pp_->at(1)->LaunchFiber([&] {
    Run({"multi"});
    Run({"lpush", kKey1, "A"});
    Run({"set", kKey1, "foo"});

    auto resp = Run({"exec"});
    EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), "OK"));

    Run({"del", kKey1});
    Run({"lpush", kKey1, "B"});
  });

  p1_fb.Join();
  pop_fb.Join();
  ASSERT_THAT(blpop_resp, ArrLen(2));
  EXPECT_THAT(blpop_resp.GetVec(), ElementsAre(kKey1, "B"));
}

TEST_F(ListFamilyTest, BPopSameKeyTwice) {
  RespExpr blpop_resp;

  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, kKey2, kKey2, kKey1, "0"});
    EXPECT_EQ(0, NumWatched());
  });

  WaitUntilLocked(0, kKey1);

  pp_->at(1)->Await([&] { EXPECT_EQ(1, CheckedInt({"lpush", kKey1, "bar"})); });
  pop_fb.Join();

  ASSERT_THAT(blpop_resp, ArrLen(2));
  EXPECT_THAT(blpop_resp.GetVec(), ElementsAre(kKey1, "bar"));

  pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, kKey2, kKey2, kKey1, "0"});
  });

  WaitUntilLocked(0, kKey1);

  pp_->at(1)->Await([&] { EXPECT_EQ(1, CheckedInt({"lpush", kKey2, "bar"})); });
  pop_fb.Join();

  ASSERT_THAT(blpop_resp, ArrLen(2));
  EXPECT_THAT(blpop_resp.GetVec(), ElementsAre(kKey2, "bar"));
}

TEST_F(ListFamilyTest, BPopTwoKeysSameShard) {
  Run({"exists", "x", "y"});
  ASSERT_EQ(1, GetDebugInfo().shards_count);
  RespExpr blpop_resp;

  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", "x", "y", "0"});
    EXPECT_FALSE(IsLocked(0, "y"));
    ASSERT_EQ(0, NumWatched());
  });

  WaitUntilLocked(0, "x");

  pp_->at(1)->Await([&] { EXPECT_EQ(1, CheckedInt({"lpush", "x", "bar"})); });
  pop_fb.Join();

  ASSERT_THAT(blpop_resp, ArrLen(2));
  EXPECT_THAT(blpop_resp.GetVec(), ElementsAre("x", "bar"));
}

TEST_F(ListFamilyTest, BPopRename) {
  RespExpr blpop_resp;

  Run({"exists", kKey1, kKey2});
  ASSERT_EQ(2, GetDebugInfo().shards_count);

  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, "0"});
  });

  WaitUntilLocked(0, kKey1);

  pp_->at(1)->Await([&] {
    EXPECT_EQ(1, CheckedInt({"lpush", "a", "bar"}));
    Run({"rename", "a", kKey1});
  });
  pop_fb.Join();

  ASSERT_THAT(blpop_resp, ArrLen(2));
  EXPECT_THAT(blpop_resp.GetVec(), ElementsAre(kKey1, "bar"));
}

TEST_F(ListFamilyTest, BPopFlush) {
  RespExpr blpop_resp;
  auto pop_fb = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    blpop_resp = Run({"blpop", kKey1, "0"});
  });

  WaitUntilLocked(0, kKey1);

  pp_->at(1)->Await([&] {
    Run({"flushdb"});
    EXPECT_EQ(1, CheckedInt({"lpush", kKey1, "bar"}));
  });
  pop_fb.Join();
}

TEST_F(ListFamilyTest, LRem) {
  auto resp = Run({"rpush", kKey1, "a", "b", "a", "c"});
  ASSERT_THAT(resp, IntArg(4));
  resp = Run({"lrem", kKey1, "2", "a"});
  ASSERT_THAT(resp, IntArg(2));

  resp = Run({"lrange", kKey1, "0", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("b", "c"));

  Run({"set", "foo", "bar"});
  ASSERT_THAT(Run({"lrem", "foo", "0", "elem"}), ErrArg("WRONGTYPE"));
  ASSERT_THAT(Run({"lrem", "nexists", "0", "elem"}), IntArg(0));

  // Triggers QUICKLIST_NODE_CONTAINER_PLAIN coverage
  string val(10000, 'a');
  Run({"rpush", kKey2, val, "12345678"});

  ASSERT_THAT(Run({"lrem", kKey2, "1", "12345678"}), IntArg(1));
  ASSERT_THAT(Run({"lrem", kKey2, "1", val}), IntArg(1));

  ASSERT_THAT(Run({"lpush", kKey3, "bar", "bar", "foo"}), IntArg(3));
  ASSERT_THAT(Run({"lrem", kKey3, "-2", "bar"}), IntArg(2));
  resp = Run({"lrange", kKey3, "0", "-1"});
  ASSERT_EQ(resp, "foo");
}

TEST_F(ListFamilyTest, DumpRestorePlain) {
  const string kValue(10'000, '#');
  EXPECT_EQ(CheckedInt({"LPUSH", kKey1, kValue}), 1);
  auto buffer = Run({"DUMP", kKey1}).GetBuf();
  EXPECT_EQ(Run({"RESTORE", kKey2, "0", ToSV(buffer)}), "OK");
  EXPECT_EQ(CheckedInt({"LLEN", kKey2}), 1);
  EXPECT_EQ(Run({"LRANGE", kKey2, "0", "1"}), kValue);
}

TEST_F(ListFamilyTest, LTrim) {
  Run({"rpush", kKey1, "a", "b", "c", "d"});
  ASSERT_EQ(Run({"ltrim", kKey1, "-2", "-1"}), "OK");
  auto resp = Run({"lrange", kKey1, "0", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("c", "d"));
  ASSERT_EQ(Run({"ltrim", kKey1, "0", "0"}), "OK");
  ASSERT_EQ(Run({"lrange", kKey1, "0", "1"}), "c");
  Run({"set", "foo", "bar"});
  ASSERT_THAT(Run({"ltrim", "foo", "0", "1"}), ErrArg("WRONGTYPE"));
  ASSERT_EQ(Run({"ltrim", "nexists", "0", "1"}), "OK");
}

TEST_F(ListFamilyTest, LRange) {
  auto resp = Run({"lrange", kKey1, "0", "5"});
  ASSERT_THAT(resp, ArrLen(0));
  Run({"rpush", kKey1, "0", "1", "2"});
  resp = Run({"lrange", kKey1, "-2", "-1"});

  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "2"));
}

TEST_F(ListFamilyTest, Lset) {
  Run({"rpush", kKey1, "0", "1", "2"});
  ASSERT_EQ(Run({"lset", kKey1, "0", "bar"}), "OK");
  ASSERT_EQ(Run({"lpop", kKey1}), "bar");
  ASSERT_EQ(Run({"lset", kKey1, "-1", "foo"}), "OK");
  ASSERT_EQ(Run({"rpop", kKey1}), "foo");
  Run({"rpush", kKey2, "a"});
  ASSERT_THAT(Run({"lset", kKey2, "1", "foo"}), ErrArg("index out of range"));
}

TEST_F(ListFamilyTest, LPop) {
  Run({"rpush", "foo", "bar"});
  auto resp = Run({"lpop", "foo", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre()));
  resp = Run({"lpop", "bar", "0"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(ListFamilyTest, LPos) {
  auto resp = Run({"rpush", kKey1, "1", "a", "b", "1", "1", "a", "1"});
  ASSERT_THAT(resp, IntArg(7));

  ASSERT_THAT(Run({"lpos", kKey1, "1"}), IntArg(0));

  ASSERT_THAT(Run({"lpos", kKey1, "f"}), ArgType(RespExpr::NIL));
  ASSERT_THAT(Run({"lpos", kKey1, "1", "COUNT", "-1"}), ArgType(RespExpr::ERROR));
  ASSERT_THAT(Run({"lpos", kKey1, "1", "MAXLEN", "-1"}), ArgType(RespExpr::ERROR));
  ASSERT_THAT(Run({"lpos", kKey1, "1", "RANK", "0"}), ArgType(RespExpr::ERROR));

  resp = Run({"lpos", kKey1, "a", "RANK", "-1", "COUNT", "2"});
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(5), IntArg(1)));

  resp = Run({"lpos", kKey1, "1", "COUNT", "0"});
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(3), IntArg(4), IntArg(6)));

  resp = Run({"lpos", kKey1, "1", "COUNT", "0", "MAXLEN", "5"});
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(0), IntArg(3), IntArg(4)));
}

TEST_F(ListFamilyTest, RPopLPush) {
  // src and dest are diffrent keys
  auto resp = Run({"rpush", kKey1, "1", "a", "b", "1", "2", "3", "4"});
  ASSERT_THAT(resp, IntArg(7));

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "4");

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "3");

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "2");

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "1");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(3));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "a", "b"));

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(4));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "2", "3", "4"));

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "b");

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "a");

  resp = Run({"rpoplpush", kKey1, kKey2});
  ASSERT_THAT(resp, "1");

  ASSERT_THAT(Run({"lrange", kKey1, "0", "-1"}), ArrLen(0));
  EXPECT_THAT(Run({"exists", kKey1}), IntArg(0));
  ASSERT_THAT(Run({"rpoplpush", kKey1, kKey2}), ArgType(RespExpr::NIL));

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(7));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "a", "b", "1", "2", "3", "4"));

  // src and dest are the same key
  resp = Run({"rpush", kKey1, "1", "a", "b", "1", "2", "3", "4"});
  ASSERT_THAT(resp, IntArg(7));

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "4");

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "3");

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "2");

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "1");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(7));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "2", "3", "4", "1", "a", "b"));

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "b");

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "a");

  resp = Run({"rpoplpush", kKey1, kKey1});
  ASSERT_THAT(resp, "1");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(7));
  ASSERT_THAT(resp.GetVec(), ElementsAre("1", "a", "b", "1", "2", "3", "4"));
}

TEST_F(ListFamilyTest, LMove) {
  // src and dest are different keys
  auto resp = Run({"rpush", kKey1, "1", "2", "3", "4", "5"});
  ASSERT_THAT(resp, IntArg(5));

  resp = Run({"lmove", kKey1, kKey2, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "1");
  ASSERT_THAT(Run({"llen", kKey1}), IntArg(4));

  resp = Run({"lmove", kKey1, kKey2, "LEFT", "LEFT"});
  ASSERT_THAT(resp, "2");

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("2", "1"));

  resp = Run({"lmove", kKey1, kKey2, "RIGHT", "LEFT"});
  ASSERT_THAT(resp, "5");

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(3));
  ASSERT_THAT(resp.GetVec(), ElementsAre("5", "2", "1"));

  resp = Run({"lmove", kKey1, kKey2, "RIGHT", "RIGHT"});
  ASSERT_THAT(resp, "4");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_EQ(resp, "3");

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(4));
  ASSERT_THAT(resp.GetVec(), ElementsAre("5", "2", "1", "4"));

  resp = Run({"lmove", kKey1, kKey2, "RIGHT", "RIGHT"});
  ASSERT_THAT(resp, "3");

  ASSERT_THAT(Run({"lrange", kKey1, "0", "-1"}), ArrLen(0));
  EXPECT_THAT(Run({"exists", kKey1}), IntArg(0));
  ASSERT_THAT(Run({"lmove", kKey1, kKey2, "LEFT", "RIGHT"}), ArgType(RespExpr::NIL));
  ASSERT_THAT(Run({"lmove", kKey1, kKey2, "RIGHT", "RIGHT"}), ArgType(RespExpr::NIL));

  resp = Run({"lrange", kKey2, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(5));
  ASSERT_THAT(resp.GetVec(), ElementsAre("5", "2", "1", "4", "3"));

  // src and dest are the same key
  resp = Run({"rpush", kKey1, "1", "2", "3", "4", "5"});
  ASSERT_THAT(resp, IntArg(5));

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "1");

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "LEFT"});
  ASSERT_THAT(resp, "2");

  resp = Run({"lmove", kKey1, kKey1, "RIGHT", "LEFT"});
  ASSERT_THAT(resp, "1");

  resp = Run({"lmove", kKey1, kKey1, "RIGHT", "RIGHT"});
  ASSERT_THAT(resp, "5");

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "1");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(5));
  ASSERT_THAT(resp.GetVec(), ElementsAre("2", "3", "4", "5", "1"));

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "2");

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "3");

  resp = Run({"lmove", kKey1, kKey1, "RIGHT", "RIGHT"});
  ASSERT_THAT(resp, "3");

  resp = Run({"lmove", kKey1, kKey1, "LEFT", "RIGHT"});
  ASSERT_THAT(resp, "4");

  resp = Run({"lrange", kKey1, "0", "-1"});
  ASSERT_THAT(resp, ArrLen(5));
  ASSERT_THAT(resp.GetVec(), ElementsAre("5", "1", "2", "3", "4"));

  ASSERT_THAT(Run({"lmove", kKey1, kKey1, "LEFT", "R"}), ArgType(RespExpr::ERROR));
}

TEST_F(ListFamilyTest, TwoQueueBug451) {
  // The bug was that if 2 push operations where queued together in the tx queue,
  // and the first awoke pending blpop, then the PollExecution function would continue with the
  // second push before switching to blpop, which contradicts the spec.
  std::atomic_bool running{true};
  std::atomic_int it_cnt{0};

  auto pop_fiber = [&]() {
    auto id = "t-" + std::to_string(it_cnt.fetch_add(1));
    while (running.load()) {
      Run(id, {"blpop", "a", "0.1"});
    }
  };

  auto push_fiber = [&]() {
    auto id = "t-" + std::to_string(it_cnt.fetch_add(1));
    for (int i = 0; i < 300; i++) {
      Run(id, {"rpush", "a", "DATA"});
    }
    ThisFiber::SleepFor(50ms);
    running = false;
  };

  vector<Fiber> fbs;

  // more likely to reproduce the bug if we start pop_fiber first.
  for (int i = 0; i < 2; i++) {
    fbs.push_back(pp_->at(i)->LaunchFiber(pop_fiber));
  }

  for (int i = 0; i < 2; i++) {
    fbs.push_back(pp_->at(i)->LaunchFiber(push_fiber));
  }

  for (auto& f : fbs)
    f.Join();
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BRPopLPushSingleShard) {
  EXPECT_THAT(Run({"brpoplpush", "x", "y", "0.05"}), ArgType(RespExpr::NIL));
  ASSERT_EQ(0, NumWatched());

  EXPECT_THAT(Run({"lpush", "x", "val1"}), IntArg(1));
  EXPECT_EQ(Run({"brpoplpush", "x", "y", "0.01"}), "val1");
  ASSERT_EQ(1, GetDebugInfo().shards_count);

  EXPECT_THAT(Run({
                  "exists",
                  "x",
              }),
              IntArg(0));
  Run({"set", "x", "str"});
  EXPECT_THAT(Run({"brpoplpush", "y", "x", "0.01"}), ErrArg("wrong kind of value"));

  Run({"del", "x", "y"});
  Run({"multi"});
  Run({"brpoplpush", "y", "x", "0"});
  RespExpr resp = Run({"exec"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
  ASSERT_FALSE(IsLocked(0, "x"));
  ASSERT_FALSE(IsLocked(0, "y"));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BRPopLPushSingleShardBug2857) {
  Run({"lpush", "src", "val1"});
  RespExpr resp;
  auto blpop = [&]() { resp = Run("id", {"blpop", "dest", "4"}); };
  auto f = pp_->at(1)->LaunchFiber(Launch::dispatch, blpop);
  EXPECT_THAT(Run({"brpoplpush", "src", "dest", "1"}), "val1");
  f.Join();
  EXPECT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("dest", "val1"));

  // Timeout
  f = pp_->at(1)->LaunchFiber(Launch::dispatch, blpop);
  EXPECT_THAT(Run({"brpoplpush", "src", "dest", "1"}), ArgType(RespExpr::NIL));
  f.Join();
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));
}

TEST_F(ListFamilyTest, BRPopLPushSingleShardBug4569) {
  RespExpr resp;
  auto fb0 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] { resp = Run({"brpop", "x", "0"}); });
  WaitUntilLocked(0, "x");

  ASSERT_TRUE(IsLocked(0, "x"));
  Run({"lpush", "y", "val"});
  Run({"rpoplpush", "y", "x"});
  ASSERT_EQ(1, GetDebugInfo().shards_count);
  fb0.Join();
  EXPECT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("x", "val"));
  ASSERT_EQ(0, NumWatched());
  ASSERT_FALSE(IsLocked(0, "x"));
}

TEST_F(ListFamilyTest, BRPopLPushSingleShardBlocking) {
  RespExpr resp;

  // Run the fiber at creation.
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp = Run({"brpoplpush", "x", "y", "0"});
  });
  ThisFiber::SleepFor(30us);
  pp_->at(1)->Await([&] { Run("B1", {"lpush", "y", "2"}); });

  pp_->at(1)->Await([&] { Run("B1", {"lpush", "x", "1"}); });
  fb0.Join();
  ASSERT_EQ(resp, "1");
  ASSERT_FALSE(IsLocked(0, "x"));
  ASSERT_FALSE(IsLocked(0, "y"));
  ASSERT_EQ(0, NumWatched());
}

TEST_F(ListFamilyTest, BRPopContended) {
  RespExpr resp;
  atomic_bool done{false};
  constexpr auto kNumFibers = 4;

  // Run the fiber at creation.
  Fiber fb[kNumFibers];
  for (int i = 0; i < kNumFibers; i++) {
    fb[i] = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
      string id = StrCat("id", i);
      while (!done) {
        Run(id, {"brpop", "k0", "k1", "k2", "k3", "k4", "0.1"});
      };
    });
  }

  for (int i = 0; i < 500; i++) {
    string key = absl::StrCat("k", i % 3);
    Run({"lpush", key, "foo"});
  }

  done = true;
  for (int i = 0; i < kNumFibers; i++) {
    fb[i].Join();
  }
  ASSERT_EQ(0, NumWatched());
  ASSERT_FALSE(HasAwakened());
}

TEST_F(ListFamilyTest, BRPopLPushTwoShards) {
  RespExpr resp;
  EXPECT_THAT(Run({"brpoplpush", "x", "z", "0.05"}), ArgType(RespExpr::NIL));

  ASSERT_EQ(0, NumWatched());

  Run({"lpush", "x", "val"});
  EXPECT_EQ(Run({"brpoplpush", "x", "z", "0"}), "val");
  resp = Run({"lrange", "z", "0", "-1"});
  ASSERT_EQ(resp, "val");
  Run({"del", "z"});
  ASSERT_EQ(0, NumWatched());

  // Run the fiber at creation.
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp = Run({"brpoplpush", "x", "z", "0"});
  });

  ThisFiber::SleepFor(30us);
  RespExpr resp_push = pp_->at(1)->Await([&] { return Run("B1", {"lpush", "z", "val2"}); });
  ASSERT_THAT(resp_push, IntArg(1));

  resp_push = pp_->at(1)->Await([&] { return Run("B1", {"lpush", "x", "val1"}); });
  ASSERT_THAT(resp_push, IntArg(1));
  fb0.Join();

  // Result of brpoplpush above.
  ASSERT_EQ(resp, "val1");

  resp = Run({"lrange", "z", "0", "-1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("val1", "val2"));
  ASSERT_FALSE(IsLocked(0, "x"));
  ASSERT_FALSE(IsLocked(0, "z"));
  ASSERT_EQ(0, NumWatched());
  ASSERT_FALSE(HasAwakened());

  // TODO: there is a bug here.
  // we do not wake the dest shard, when source is awaked which prevents
  // the atomicity and causes the first bug as well.
}

TEST_F(ListFamilyTest, BLMove) {
  EXPECT_THAT(Run({"blmove", "x", "y", "right", "right", "0.05"}), ArgType(RespExpr::NIL));
  ASSERT_EQ(0, NumWatched());

  EXPECT_THAT(Run({"lpush", "x", "val1"}), IntArg(1));
  EXPECT_THAT(Run({"lpush", "y", "val2"}), IntArg(1));

  EXPECT_EQ(Run({"blmove", "x", "y", "right", "left", "0.01"}), "val1");
  auto resp = Run({"lrange", "y", "0", "-1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("val1", "val2"));
}

// Wake two BLMOVEs on the same shard simultaneously
TEST_F(ListFamilyTest, BLMoveSimultaneously) {
  EXPECT_EQ(Shard("src1", shard_set->size()),
            Shard("src10", shard_set->size()));  // wake on same shard
  EXPECT_NE(Shard("dest110", shard_set->size()),
            Shard("src1", shard_set->size()));  // Trigger MoveTwoShards

  auto f1 = pp_->at(1)->LaunchFiber([this]() {
    Run("c1", {"blmove", "src1", "dest110", "LEFT", "RIGHT", "0"});
  });
  auto f2 = pp_->at(1)->LaunchFiber([this]() {
    Run("c2", {"blmove", "src10", "dest110", "LEFT", "RIGHT", "0"});
  });

  ThisFiber::SleepFor(5ms);
  Run({"multi"});
  Run({"rpush", "src1", "v1"});
  Run({"rpush", "src10", "v2"});
  Run({"exec"});

  f1.Join();
  f2.Join();

  auto res = Run({"lrange", "dest110", "0", "-1"});
  EXPECT_THAT(res.GetVec(), UnorderedElementsAre("v1", "v2"));
}

// Move key five times in rings 0 -> 1 -> 2 ... -> 0
TEST_F(ListFamilyTest, BLMoveRings) {
  vector<fb2::Fiber> fibers;
  for (int j = 0; j < 5; j++) {
    for (int i = 0; i < 10; i++) {
      fibers.emplace_back(pp_->at(i % pp_->size())->LaunchFiber([i, j, this]() {
        auto key1 = to_string(i);
        auto key2 = to_string((i + 1) % 10);
        Run(key1 + to_string(j), {"blmove", key1, key2, "LEFT", "RIGHT", "0"});
      }));
    }
  }

  ThisFiber::SleepFor(5ms);

  Run({"lpush", "0", "v1"});
  for (auto& fiber : fibers)
    fiber.Join();

  for (int i = 1; i < 10; i++)
    EXPECT_THAT(Run({"llen", to_string(i)}), IntArg(0));
  EXPECT_EQ(Run({"lrange", "0", "0", "-1"}), "v1");
}

// Move in waves where each wave layer has a fixed set of "vertices" through which all values travel
TEST_F(ListFamilyTest, BLMoveWaves) {
  static constexpr int kFlow = 64;
  vector<int> wave_sizes = {1 /* 0:0 */, kFlow, kFlow / 2, kFlow / 4, kFlow / 8, kFlow / 3,
                            kFlow / 5,   1,     kFlow / 6, kFlow,     kFlow / 4, 1};

  vector<fb2::Fiber> fibers;
  for (size_t i = 1; i < wave_sizes.size(); i++) {
    for (size_t j = 0; j < kFlow; j++) {
      fibers.emplace_back(pp_->at(i % 3)->LaunchFiber([i, j, wave_sizes, this]() {
        auto src = to_string(i - 1) + ":" + to_string(j / (kFlow / wave_sizes[i - 1]));
        auto dest = to_string(i) + ":" + to_string(j / (kFlow / wave_sizes[i]));
        Run("c" + to_string(i * kFlow + j), {"blmove", src, dest, "LEFT", "RIGHT", "0"});
      }));
    }
  }

  vector<string> values(kFlow);
  for (size_t i = 0; i < kFlow; i++)
    values[i] = "v" + to_string(i);

  Run({"multi"});
  for (size_t i = 0; i < kFlow; i++)
    Run({"lpush", "0:0", values[i]});
  Run({"exec"});

  for (auto& fiber : fibers)
    fiber.Join();

  auto res = Run({"lrange", to_string(wave_sizes.size() - 1) + ":0", "0", "-1"});
  EXPECT_THAT(res.GetVec(), UnorderedElementsAreArray(values));
}

// Move value back and forth between two lists, verfiy that atomic lookup of states catches it only
// in one of two possible states
TEST_F(ListFamilyTest, BLMovePendulum) {
  GTEST_SKIP() << "Blocking commands don't respect transactional ordering after waking up";
  // Suppose BLMOVE A -> B is running, then MULTI LLEN A LLEN B EXEC will
  // 1. Run on shard B because it doesn't have "blocking" keys freely, so LLEN B = 0
  // 2. Will run on shard A after BLMOVE A removed itself from the "awakened" set, so LLEN A = 0
  // => we observe a theoretically impossible state and the execution order is not linearizable

  vector<fb2::Fiber> fibers;

  atomic_bool stopped = false;
  auto swing = [this, &stopped](int i, string src, string dest) {
    while (!stopped.load(std::memory_order_relaxed))
      Run(src + dest + to_string(i), {"blmove", src, dest, "LEFT", "RIGHT", "0"});
  };

  for (int i = 0; i < 3; i++)
    fibers.emplace_back(pp_->at(i % pp_->size())->LaunchFiber([=]() { swing(i, "A", "B"); }));

  for (int i = 0; i < 3; i++)
    fibers.emplace_back(pp_->at(i % pp_->size())->LaunchFiber([=]() { swing(i, "B", "A"); }));

  Run({"lpush", "A", "v"});
  ThisFiber::SleepFor(1ms);

  for (int i = 0; i < 100; i++) {
    Run({"multi"});
    Run({"llen", "A"});
    Run({"llen", "B"});
    auto res = Run({"EXEC"});
    int i1 = *res.GetVec()[0].GetInt();
    int i2 = *res.GetVec()[1].GetInt();
    ASSERT_EQ(i1 + i2, 1);
  }

  stopped = true;
  Run({"lpush", "A", "stop"});
  Run({"lpush", "B", "stop"});
  for (auto& fiber : fibers)
    fiber.Join();

  int i1 = *Run({"llen", "A"}).GetInt();
  int i2 = *Run({"llen", "B"}).GetInt();
  ASSERT_EQ(i1 + i2, 3);  // v, stop, stop
}

TEST_F(ListFamilyTest, LPushX) {
  // No push for 'lpushx' on nonexisting key.
  EXPECT_THAT(Run({"lpushx", kKey1, "val1"}), IntArg(0));
  EXPECT_THAT(Run({"llen", kKey1}), IntArg(0));

  EXPECT_THAT(Run({"lpush", kKey1, "val1"}), IntArg(1));
  EXPECT_THAT(Run({"lrange", kKey1, "0", "-1"}), "val1");

  EXPECT_THAT(Run({"lpushx", kKey1, "val2"}), IntArg(2));
  EXPECT_THAT(Run({"lrange", kKey1, "0", "-1"}).GetVec(), ElementsAre("val2", "val1"));
}

TEST_F(ListFamilyTest, RPushX) {
  // No push for 'rpushx' on nonexisting key.
  EXPECT_THAT(Run({"rpushx", kKey1, "val1"}), IntArg(0));
  EXPECT_THAT(Run({"llen", kKey1}), IntArg(0));

  EXPECT_THAT(Run({"rpush", kKey1, "val1"}), IntArg(1));
  EXPECT_THAT(Run({"lrange", kKey1, "0", "-1"}), "val1");

  EXPECT_THAT(Run({"rpushx", kKey1, "val2"}), IntArg(2));
  EXPECT_THAT(Run({"lrange", kKey1, "0", "-1"}).GetVec(), ElementsAre("val1", "val2"));
}

TEST_F(ListFamilyTest, LInsert) {
  // List not found.
  EXPECT_THAT(Run({"linsert", "notfound", "before", "foo", "bar"}), IntArg(0));

  // Key is not a list.
  Run({"set", "notalist", "x"});
  EXPECT_THAT(Run({"linsert", "notalist", "before", "foo", "bar"}),
              ErrArg("Operation against a key holding the wrong kind of value"));

  // Insert before.
  Run({"rpush", "mylist", "foo"});
  EXPECT_THAT(Run({"linsert", "mylist", "before", "foo", "bar"}), IntArg(2));
  auto resp = Run({"lrange", "mylist", "0", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("bar", "foo"));

  // Insert after.
  EXPECT_THAT(Run({"linsert", "mylist", "after", "foo", "car"}), IntArg(3));
  resp = Run({"lrange", "mylist", "0", "2"});
  ASSERT_THAT(resp, ArrLen(3));
  ASSERT_THAT(resp.GetVec(), ElementsAre("bar", "foo", "car"));

  // Insert before, pivot not found.
  EXPECT_THAT(Run({"linsert", "mylist", "before", "notfound", "x"}), IntArg(-1));

  // Insert after, pivot not found.
  EXPECT_THAT(Run({"linsert", "mylist", "after", "notfound", "x"}), IntArg(-1));

  // insert empty
  Run({"rpush", "k", "a"});
  Run({"linsert", "k", "before", "a", ""});
  resp = Run({"lpop", "k"});
  EXPECT_EQ(resp, "");
  resp = Run({"linsert", "k", "before", "", ""});
  EXPECT_THAT(resp, IntArg(-1));
}

TEST_F(ListFamilyTest, BLPopUnwakesInScript) {
  const string_view SCRIPT = R"(
    for i = 1, 1000 do
      redis.call('MGET', 'a', 'b', 'c', 'd')
      redis.call('LPUSH', 'l', tostring(i))
    end
  )";

  // Start blpop with without timeout
  auto f1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&]() {
    auto resp = Run("blpop", {"BLPOP", "l", "0"});
    // blpop should only be awakened after the script has completed, so the
    // last element added in the script should be returned.
    EXPECT_THAT(resp, ArgType(RespExpr::ARRAY));
    EXPECT_THAT(resp.GetVec(), ElementsAre("l", "1000"));
  });

  // Start long running script that intends to wake up blpop
  auto f2 = pp_->at(2)->LaunchFiber([&] {
    Run("script", {"EVAL", SCRIPT, "5", "a", "b", "c", "d", "l"});
  });

  // Run blpop that times out
  auto resp = Run({"blpop", "g", "0.01"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  f1.Join();
  f2.Join();
}

TEST_F(ListFamilyTest, OtherMultiWakesBLpop) {
  const string_view SCRIPT = R"(
    redis.call('LPUSH', 'l', 'bad')
    for i = 1, 1000 do
      redis.call('MGET', 'a', 'b', 'c', 'd')
    end
    redis.call('LPUSH', 'l', 'good')
  )";

  const string_view SCRIPT_SHORT = R"(
    redis.call('GET', KEYS[1])
  )";

  // Start BLPOP with infinite timeout
  auto f1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    auto resp = Run("blpop", {"BLPOP", "l", "0"});
    // blpop should only be awakened after the script has completed, so the
    // last element added in the script should be returned.
    EXPECT_THAT(resp, ArgType(RespExpr::ARRAY));
    EXPECT_THAT(resp.GetVec(), ElementsAre("l", "good"));
  });

  // Start long running script that accesses the list, but should wake up blpop only after it
  // finished
  auto f2 = pp_->at(2)->LaunchFiber(Launch::dispatch, [&] {
    Run("script", {"EVAL", SCRIPT, "5", "a", "b", "c", "d", "l"});
  });

  // Run quick multi transaction that concludes after one hop
  Run({"EVAL", SCRIPT_SHORT, "1", "y"});

  f1.Join();
  f2.Join();
}

TEST_F(ListFamilyTest, ContendExpire) {
  vector<fb2::Fiber> blpop_fibers;
  for (unsigned i = 0; i < num_threads_; ++i) {
    for (unsigned j = 0; j < 30; ++j) {
      blpop_fibers.emplace_back(pp_->at(i)->LaunchFiber(Launch::post, [&, i, j] {
        string keys[2] = {"key0", "key1"};
        thread_local unsigned cur = 0;
        for (unsigned n = 0; n < 30; n++) {
          string k = keys[cur];
          cur ^= 1;
          Run(StrCat("push", i, "_", j), {"lpush", k, "foo"});
          Run(StrCat("blpop", i, "_", j), {"blpop", keys[cur], "a", "0.001"});
        }
      }));
    }
  }

  for (auto& f : blpop_fibers) {
    f.Join();
  }
}

TEST_F(ListFamilyTest, LMPopInvalidSyntax) {
  // Not enough arguments
  auto resp = Run({"lmpop", "1", "a"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // Zero keys
  resp = Run({"lmpop", "0", "LEFT", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  // Number of keys is not uint
  resp = Run({"lmpop", "aa", "a", "LEFT"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Missing LEFT/RIGHT
  resp = Run({"lmpop", "1", "a", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Wrong number of keys
  resp = Run({"lmpop", "1", "a", "b", "LEFT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // COUNT without number
  resp = Run({"lmpop", "1", "a", "LEFT", "COUNT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // COUNT is not uint
  resp = Run({"lmpop", "1", "a", "LEFT", "COUNT", "boo"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Too many arguments
  resp = Run({"lmpop", "1", "c", "LEFT", "COUNT", "2", "foo"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(ListFamilyTest, LMPop) {
  // All lists are empty
  auto resp = Run({"lmpop", "1", "e", "LEFT"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // LEFT operation
  resp = Run({"lpush", "a", "a1", "a2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"lmpop", "1", "a", "LEFT"});
  EXPECT_THAT(resp, RespArray(ElementsAre("a", RespArray(ElementsAre("a2")))));

  // RIGHT operation
  resp = Run({"lpush", "b", "b1", "b2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"lmpop", "1", "b", "RIGHT"});
  EXPECT_THAT(resp, RespArray(ElementsAre("b", RespArray(ElementsAre("b1")))));

  // COUNT > 1
  resp = Run({"lpush", "c", "c1", "c2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"lmpop", "1", "c", "RIGHT", "COUNT", "2"});
  EXPECT_THAT(resp, RespArray(ElementsAre("c", RespArray(ElementsAre("c1", "c2")))));

  resp = Run({"llen", "c"});
  EXPECT_THAT(resp, IntArg(0));

  // COUNT > number of elements in list
  resp = Run({"lpush", "d", "d1", "d2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"lmpop", "1", "d", "RIGHT", "COUNT", "3"});
  EXPECT_THAT(resp, RespArray(ElementsAre("d", RespArray(ElementsAre("d1", "d2")))));

  resp = Run({"llen", "d"});
  EXPECT_THAT(resp, IntArg(0));

  // First non-empty list is not the first list
  resp = Run({"lpush", "x", "x1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"lpush", "y", "y1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"lmpop", "3", "empty", "x", "y", "RIGHT"});
  EXPECT_THAT(resp, RespArray(ElementsAre("x", RespArray(ElementsAre("x1")))));

  resp = Run({"llen", "x"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(ListFamilyTest, LMPopMultipleElements) {
  // Test removing multiple elements from left end
  Run({"rpush", "list1", "a", "b", "c", "d", "e"});
  auto resp = Run({"lmpop", "1", "list1", "LEFT", "COUNT", "3"});
  EXPECT_THAT(resp, RespArray(ElementsAre("list1", RespArray(ElementsAre("a", "b", "c")))));

  resp = Run({"lrange", "list1", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("d", "e"));

  // Test removing multiple elements from right end
  Run({"rpush", "list2", "v", "w", "x", "y", "z"});
  resp = Run({"lmpop", "1", "list2", "RIGHT", "COUNT", "2"});
  EXPECT_THAT(resp, RespArray(ElementsAre("list2", RespArray(ElementsAre("z", "y")))));

  resp = Run({"lrange", "list2", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("v", "w", "x"));
}

TEST_F(ListFamilyTest, LMPopMultipleLists) {
  // Test finding first non-empty list
  Run({"rpush", "list1", "a", "b"});
  Run({"rpush", "list2", "c", "d"});
  Run({"rpush", "list3", "e", "f"});

  // Pop from first non-empty list
  auto resp = Run({"lmpop", "3", "list1", "list2", "list3", "LEFT"});
  EXPECT_THAT(resp, RespArray(ElementsAre("list1", RespArray(ElementsAre("a")))));

  // Pop from second list after first becomes empty
  Run({"lmpop", "1", "list1", "LEFT"});  // Empty list1
  resp = Run({"lmpop", "3", "list1", "list2", "list3", "RIGHT", "COUNT", "2"});
  EXPECT_THAT(resp, RespArray(ElementsAre("list2", RespArray(ElementsAre("d", "c")))));

  // Verify third list remains untouched
  resp = Run({"lrange", "list3", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("e", "f"));
}

TEST_F(ListFamilyTest, LMPopEdgeCases) {
  // Test with empty list
  Run({"rpush", "empty_list", "a"});
  Run({"lpop", "empty_list"});
  auto resp = Run({"lmpop", "1", "empty_list", "LEFT"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // Test with non-existent list
  resp = Run({"lmpop", "1", "nonexistent", "LEFT"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // Test with wrong type key
  Run({"set", "string_key", "value"});
  resp = Run({"lmpop", "1", "string_key", "LEFT"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));

  // Test without COUNT parameter - should return 1 element by default
  Run({"rpush", "list", "a", "b"});
  resp = Run({"lmpop", "1", "list", "LEFT"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "list", RespArray(ElementsAre("a")))));  // Should return 1 element by default

  // Test with COUNT = 0 - should return error
  resp = Run({"lmpop", "1", "list", "LEFT", "COUNT", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre("list", RespArray(ElementsAre()))));

  // Test with negative COUNT - should return error
  resp = Run({"lmpop", "1", "list", "LEFT", "COUNT", "-1"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));
}

TEST_F(ListFamilyTest, LMPopDocExample) {
  // Try to pop from non-existing lists
  auto resp = Run({"LMPOP", "2", "non1", "non2", "LEFT", "COUNT", "10"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // Create first list and test basic pop
  resp = Run({"LPUSH", "mylist", "one", "two", "three", "four", "five"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"LMPOP", "1", "mylist", "LEFT"});
  EXPECT_THAT(resp, RespArray(ElementsAre("mylist", RespArray(ElementsAre("five")))));

  resp = Run({"LRANGE", "mylist", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("four", "three", "two", "one"));

  // Test RIGHT pop with COUNT
  resp = Run({"LMPOP", "1", "mylist", "RIGHT", "COUNT", "10"});
  EXPECT_THAT(resp, RespArray(ElementsAre("mylist",
                                          RespArray(ElementsAre("one", "two", "three", "four")))));

  // Create two lists and test multi-key pop
  resp = Run({"LPUSH", "mylist", "one", "two", "three", "four", "five"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"LPUSH", "mylist2", "a", "b", "c", "d", "e"});
  EXPECT_THAT(resp, IntArg(5));

  resp = Run({"LMPOP", "2", "mylist", "mylist2", "RIGHT", "COUNT", "3"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("mylist", RespArray(ElementsAre("one", "two", "three")))));

  resp = Run({"LRANGE", "mylist", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("five", "four"));

  resp = Run({"LMPOP", "2", "mylist", "mylist2", "RIGHT", "COUNT", "5"});
  EXPECT_THAT(resp, RespArray(ElementsAre("mylist", RespArray(ElementsAre("four", "five")))));

  resp = Run({"LMPOP", "2", "mylist", "mylist2", "RIGHT", "COUNT", "10"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("mylist2", RespArray(ElementsAre("a", "b", "c", "d", "e")))));

  // Verify both lists are now empty
  resp = Run({"EXISTS", "mylist", "mylist2"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(ListFamilyTest, LMPopWrongType) {
  // Setup: create a list and a hash
  Run({"lpush", "l1", "e1"});
  Run({"hset", "foo", "k1", "v1"});

  // Test: first key is wrong type
  auto resp = Run({"lmpop", "2", "foo", "l1", "left"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));

  // Test: second key is wrong type but first doesn't exist
  resp = Run({"lmpop", "2", "nonexistent", "foo", "left"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));

  // Test: second key is wrong type but first is a valid list
  resp = Run({"lmpop", "2", "l1", "foo", "left"});
  EXPECT_THAT(resp, RespArray(ElementsAre("l1", RespArray(ElementsAre("e1")))));
}

// Blocking command wakeup is complicated by running multi transaction at the same time
TEST_F(ListFamilyTest, AwakeMulti) {
  auto f1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run("CONSUMER", {"blmove", "src", "dest", "LEFT", "LEFT", "0"});
    };
  });
  auto f2 = pp_->at(1)->LaunchFiber([&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run("PROD", {"lpush", "src", "a"});
      ThisFiber::SleepFor(50us);
    };
  });

  auto f3 = pp_->at(2)->LaunchFiber([&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run({"multi"});
      for (unsigned j = 0; j < 8; ++j) {
        Run({"get", StrCat("key", j)});
      };
      Run({"exec"});
    };
  });

  f1.Join();
  f2.Join();
  f3.Join();
}

TEST_F(ListFamilyTest, PressureBLMove) {
#ifndef NDEBUG
  GTEST_SKIP() << "Requires release build to reproduce";
#endif

  auto consumer = [this](string_view id, string_view src, string_view dest) {
    for (unsigned i = 0; i < 1000; ++i) {
      Run(id, {"blmove", src, dest, "LEFT", "LEFT", "0"});
    };
  };
  auto producer = [this](string_view id, size_t delay, string_view src) {
    for (unsigned i = 0; i < 1000; ++i) {
      Run(id, {"lpush", src, "a"});
      ThisFiber::SleepFor(1us * delay);
    }
  };

  for (size_t delay : {1, 2, 5}) {
    LOG(INFO) << "Running with delay: " << delay;
    auto f1 = pp_->at(1)->LaunchFiber([=] { consumer("c1", "src", "dest"); });
    auto f2 = pp_->at(1)->LaunchFiber([=] { producer("p1", delay, "src"); });

    f1.Join();
    f2.Join();
  }
}

TEST_F(ListFamilyTest, AwakeDb1) {
  const char* kDbId = "1";

  auto f1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    Run("C", {"SELECT", kDbId});
    Run("C", {"brpoplpush", "x", "y", "0"});
    ASSERT_EQ(GetDebugInfo("C").shards_count, 1);
  });
  Run({"SELECT", kDbId});
  Run({"EVAL", "redis.call('LPUSH', KEYS[1], 'val'); return 1;", "1", "x"});
  f1.Join();
}

#pragma GCC diagnostic pop
}  // namespace dfly


================================================
FILE: src/server/main_service.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/main_service.h"

#include "absl/strings/str_split.h"
#include "facade/resp_expr.h"
#include "util/fibers/detail/fiber_interface.h"
#include "util/fibers/proactor_base.h"
#include "util/fibers/synchronization.h"

#ifdef __FreeBSD__
#include <pthread_np.h>
#elif defined(__linux__)
#include "util/fibers/uring_proactor.h"
#endif

extern "C" {
#include "redis/redis_aux.h"
}

#include <absl/cleanup/cleanup.h>
#include <absl/functional/bind_front.h>
#include <absl/strings/ascii.h>
#include <absl/strings/match.h>
#include <absl/strings/str_format.h>
#include <xxhash.h>

#include <csignal>
#include <filesystem>

#include "base/cycle_clock.h"
#include "base/flag_utils.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/search/vector_utils.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "facade/reply_capture.h"
#include "server/acl/acl_commands_def.h"
#include "server/acl/acl_family.h"
#include "server/acl/user_registry.h"
#include "server/acl/validator.h"
#include "server/channel_store.h"
#include "server/cluster/cluster_family.h"
#include "server/command_families.h"
#include "server/dflycmd.h"
#include "server/error.h"
#include "server/generic_family.h"
#include "server/hset_family.h"
#include "server/http_api.h"
#include "server/multi_command_squasher.h"
#include "server/namespaces.h"
#include "server/script_mgr.h"
#include "server/search/search_family.h"
#include "server/server_state.h"
#include "server/set_family.h"
#include "server/sharding.h"
#include "server/stream_family.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"
#include "server/version.h"
#include "server/zset_family.h"
#include "strings/human_readable.h"
#include "util/html/sorted_table.h"
#include "util/varz.h"

using namespace std;
using facade::ErrorReply;

ABSL_FLAG(int32_t, port, 6379,
          "Redis port. 0 disables the port, -1 will bind on a random available port.");

ABSL_FLAG(uint16_t, announce_port, 0,
          "Port that Dragonfly announces to cluster clients and replication master");

ABSL_FLAG(uint32_t, memcached_port, 0, "Memcached port");

ABSL_FLAG(uint32_t, num_shards, 0, "Number of database shards, 0 - to choose automatically");

ABSL_FLAG(bool, multi_exec_squash, true,
          "Whether multi exec will squash single shard commands to optimize performance");

ABSL_FLAG(bool, lua_resp2_legacy_float, false,
          "Return rounded down integers instead of floats for lua scripts with RESP2");
ABSL_FLAG(uint32_t, multi_eval_squash_buffer, 4096, "Max buffer for squashed commands per script");

ABSL_DECLARE_FLAG(bool, primary_port_http_enabled);
ABSL_FLAG(bool, admin_nopass, false,
          "If set, would enable open admin access to console on the assigned port, without "
          "authorization needed.");

ABSL_FLAG(bool, expose_http_api, false,
          "If set, will expose a POST /api handler for sending redis commands as json array.");

ABSL_FLAG(strings::MemoryBytesFlag, maxmemory, strings::MemoryBytesFlag{},
          "Limit on maximum-memory that is used by the database, until data starts to be evicted "
          "(according to eviction policy). With tiering, this value defines only the size in RAM, "
          "and not the whole dataset (RAM + SSD). "
          "Must be *at least* 256MiB per proactor thread. "
          "Can be any human‑readable bytes values (supports K/M/G/T/P/E with optional B, "
          "case‑insensitive, both 'GiB' & 'GB' possible). Examples: 300000000, 512MB, 2G, 1.25GiB. "
          "0 - value will be automatically defined based on the env (ex: machine's capacity). "
          "default: 0");

ABSL_RETIRED_FLAG(
    double, oom_deny_ratio, 1.1,
    "commands with flag denyoom will return OOM when the ratio between maxmemory and used "
    "memory is above this value");

ABSL_FLAG(uint32_t, shard_thread_busy_polling_usec, 0,
          "If non-zero, overrides the busy polling parameter for shard threads.");

ABSL_FLAG(string, huffman_table, "",
          "a comma separated map: domain1:code1,domain2:code2,... where "
          "domain can currently be only KEYS or STRINGS, code is a base64-encoded huffman table"
          " exported via "
          "DEBUG COMPRESSION EXPORT. if the flag is empty no huffman compression is applied.");

ABSL_FLAG(bool, jsonpathv2, true,
          "If true uses Dragonfly jsonpath implementation, "
          "otherwise uses legacy jsoncons implementation.");

ABSL_FLAG(uint32_t, scheduler_background_budget, 50'000, "Background fiber budget in nanoseconds");
ABSL_FLAG(uint32_t, scheduler_background_sleep_prob, 50,
          "Sleep probability of background fibers on reaching budget");
ABSL_FLAG(uint32_t, scheduler_background_warrant, 5,
          "Percentage of guaranteed cpu time for background fibers");

ABSL_FLAG(uint32_t, squash_stats_latency_lower_limit, 0,
          "If set, will not track latency stats below this threshold (usec). ");

namespace {

struct ShutdownWatchdog {
  util::fb2::Fiber watchdog_fb;
  util::fb2::Done watchdog_done;
  util::ProactorPool& pool;

  explicit ShutdownWatchdog(util::ProactorPool& pp);
  void Disarm();
};

ShutdownWatchdog::ShutdownWatchdog(util::ProactorPool& pp) : pool{pp} {
  watchdog_fb = pool.GetNextProactor()->LaunchFiber("shutdown_watchdog", [&] {
    if (!watchdog_done.WaitFor(20s)) {
      LOG(ERROR) << "Deadlock detected during shutdown";
      absl::SetFlag(&FLAGS_alsologtostderr, true);
      util::fb2::Mutex m;
      pool.AwaitFiberOnAll([&m](unsigned index, auto*) {
        util::ThisFiber::SetName(absl::StrFormat("print_stack_fib_%u", index));
        std::unique_lock lk(m);
        LOG(ERROR) << "Proactor " << index << ":\n";
        util::fb2::detail::FiberInterface::PrintAllFiberStackTraces();
      });
    }
  });
}

void ShutdownWatchdog::Disarm() {
  watchdog_done.Notify();
  watchdog_fb.JoinIfNeeded();
}

std::optional<ShutdownWatchdog> shutdown_watchdog = std::nullopt;

}  // namespace

namespace dfly {

#if defined(__linux__)
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
#include <sys/syscall.h>
#define gettid() syscall(SYS_gettid)
#endif

#elif defined(__FreeBSD__)

#define gettid() pthread_getthreadid_np()

#elif defined(__APPLE__)

inline unsigned gettid() {
  uint64_t tid;
  pthread_threadid_np(NULL, &tid);
  return tid;
}

#endif

using namespace util;
using absl::GetFlag;
using absl::StrCat;
using base::VarzValue;
using ::boost::intrusive_ptr;
using namespace facade;
namespace h2 = boost::beast::http;

namespace {

std::optional<VarzFunction> engine_varz;

constexpr size_t kMaxThreadSize = 1024;

// Unwatch all keys for a connection and unregister from DbSlices.
// Used by UNWATCH, DICARD and EXEC.
void UnwatchAllKeys(Namespace* ns, ConnectionState::ExecInfo* exec_info) {
  if (!exec_info->watched_keys.empty()) {
    auto cb = [&](EngineShard* shard) {
      ns->GetDbSlice(shard->shard_id())
          .UnregisterConnectionWatches(exec_info->watched_keys, &exec_info->watched_dirty);
    };
    shard_set->RunBriefInParallel(std::move(cb));
  }
  exec_info->ClearWatched();
}

void MultiCleanup(ConnectionContext* cntx) {
  auto& exec_info = cntx->conn_state.exec_info;
  if (auto* borrowed = exec_info.preborrowed_interpreter; borrowed) {
    ServerState::tlocal()->ReturnInterpreter(borrowed);
    exec_info.preborrowed_interpreter = nullptr;
  }
  UnwatchAllKeys(cntx->ns, &exec_info);
  exec_info.Clear();
}

void DeactivateMonitoring(ConnectionContext* server_ctx) {
  if (server_ctx->monitor) {
    // remove monitor on this connection
    server_ctx->ChangeMonitor(false /*start*/);
  }
}

// The format of the message that are sending is
// +"time of day" [db-number <lua|unix:path|connection info] "command" "arg1" .. "argM"
std::string CreateMonitorTimestamp() {
  timeval tv;

  gettimeofday(&tv, nullptr);
  return absl::StrCat(tv.tv_sec, ".", tv.tv_usec, absl::kZeroPad6);
}

auto CmdEntryToMonitorFormat(std::string_view str) -> std::string {
  // This code is based on Redis impl for it at sdscatrepr@sds.c
  std::string result = absl::StrCat("\"");

  for (auto c : str) {
    switch (c) {
      case '\\':
        absl::StrAppend(&result, "\\\\");
        break;
      case '"':
        absl::StrAppend(&result, "\\\"");
        break;
      case '\n':
        absl::StrAppend(&result, "\\n");
        break;
      case '\r':
        absl::StrAppend(&result, "\\r");
        break;
      case '\t':
        absl::StrAppend(&result, "\\t");
        break;
      case '\a':
        absl::StrAppend(&result, "\\a");
        break;
      case '\b':
        absl::StrAppend(&result, "\\b");
        break;
      default:
        if (isprint(c)) {
          result += c;
        } else {
          absl::StrAppendFormat(&result, "\\x%02x", c);
        }
        break;
    }
  }
  absl::StrAppend(&result, "\"");
  return result;
}

std::string MakeMonitorMessage(const ConnectionContext* cntx, const CommandId* cid,
                               CmdArgList tail_args) {
  std::string message = absl::StrCat(CreateMonitorTimestamp(), " [", cntx->conn_state.db_index);

  string endpoint;
  if (cntx->conn_state.script_info) {
    endpoint = "lua";
  } else if (const auto* conn = cntx->conn(); conn != nullptr) {
    endpoint = conn->RemoteEndpointStr();
  } else {
    endpoint = "REPLICATION:0";
  }
  absl::StrAppend(&message, " ", endpoint, "] ");

  absl::StrAppend(&message, "\"", cid->name(), "\"");

  if (cid->name() == "AUTH")
    return message;

  for (auto arg : tail_args)
    absl::StrAppend(&message, " ", CmdEntryToMonitorFormat(facade::ToSV(arg)));

  return message;
}

void DispatchMonitor(ConnectionContext* cntx, const CommandId* cid, CmdArgList tail_args) {
  auto cb = [msg = MakeMonitorMessage(cntx, cid, tail_args)](unsigned idx, util::ProactorBase*) {
    const auto& monitors = ServerState::tlocal()->Monitors().monitors();
    if (monitors.empty())
      return;

    VLOG(2) << "Sending command '" << msg << "' from " << ProactorBase::me()->GetPoolIndex()
            << " to " << monitors.size() << " monitors";
    for (auto monitor_conn : monitors)
      monitor_conn->SendMonitorMessageAsync(msg);
  };
  shard_set->pool()->DispatchBrief(std::move(cb));
}

class InterpreterReplier : public RedisReplyBuilder {
 public:
  explicit InterpreterReplier(ObjectExplorer* explr) : RedisReplyBuilder(nullptr), explr_(explr) {
  }

  void SendError(std::string_view str, std::string_view type) final;

  void SendBulkString(std::string_view str) final;
  void SendSimpleString(std::string_view str) final;

  void SendNullArray() final;
  void SendNull() final;
  void SendLong(long val) final;
  void SendDouble(double val) final;

  void StartCollection(unsigned len, CollectionType type) final;

 private:
  void PostItem();

  ObjectExplorer* explr_;
  vector<pair<unsigned, unsigned>> array_len_;
  unsigned num_elems_ = 0;
};

// Serialized result of script invocation to Redis protocol
class EvalSerializer : public ObjectExplorer {
 public:
  explicit EvalSerializer(RedisReplyBuilder* rb, bool float_as_int)
      : rb_(rb), float_as_int_(float_as_int) {
  }

  void OnBool(bool b) final {
    if (b) {
      rb_->SendLong(1);
    } else {
      rb_->SendNull();
    }
  }

  void OnString(string_view str) final {
    rb_->SendBulkString(str);
  }

  void OnDouble(double d) final {
    if (float_as_int_ || GetFlag(FLAGS_lua_resp2_legacy_float)) {
      const long val = d >= 0 ? static_cast<long>(floor(d)) : static_cast<long>(ceil(d));
      rb_->SendLong(val);
    } else {
      rb_->SendDouble(d);
    }
  }

  void OnInt(int64_t val) final {
    rb_->SendLong(val);
  }

  void OnArrayStart(unsigned len) final {
    rb_->StartArray(len);
  }

  void OnArrayEnd() final {
  }

  void OnMapStart(unsigned len) final {
    rb_->StartCollection(len, CollectionType::MAP);
  }

  void OnMapEnd() final {
  }

  void OnNil() final {
    rb_->SendNull();
  }

  void OnStatus(string_view str) {
    rb_->SendSimpleString(str);
  }

  void OnError(string_view str) {
    if (!str.empty() && str.front() != '-') {
      rb_->SendError(absl::StrCat("-", str));
    } else {
      rb_->SendError(str);
    }
  }

 private:
  RedisReplyBuilder* rb_;
  bool float_as_int_;
};

void InterpreterReplier::PostItem() {
  if (array_len_.empty()) {
    DCHECK_EQ(0u, num_elems_);
    ++num_elems_;
  } else {
    ++num_elems_;

    while (num_elems_ == array_len_.back().second) {
      num_elems_ = array_len_.back().first;
      explr_->OnArrayEnd();

      array_len_.pop_back();
      if (array_len_.empty())
        break;
    }
  }
}

void InterpreterReplier::SendError(string_view str, std::string_view type) {
  DCHECK(array_len_.empty());
  DVLOG(1) << "Lua/df_call error " << str;
  if (!str.empty() && str.front() != '-') {
    explr_->OnError(absl::StrCat("-ERR ", str));
  } else {
    explr_->OnError(str);
  }
}

void InterpreterReplier::SendSimpleString(string_view str) {
  if (array_len_.empty())
    explr_->OnStatus(str);
  else
    explr_->OnString(str);
  PostItem();
}

void InterpreterReplier::SendNullArray() {
  SendSimpleStrArr(ArgSlice{});
  PostItem();
}

void InterpreterReplier::SendNull() {
  explr_->OnNil();
  PostItem();
}

void InterpreterReplier::SendLong(long val) {
  explr_->OnInt(val);
  PostItem();
}

void InterpreterReplier::SendDouble(double val) {
  explr_->OnDouble(val);
  PostItem();
}

void InterpreterReplier::SendBulkString(string_view str) {
  explr_->OnString(str);
  PostItem();
}

void InterpreterReplier::StartCollection(unsigned len, CollectionType type) {
  if (type == CollectionType::MAP)
    len *= 2;
  explr_->OnArrayStart(len);

  if (len == 0) {
    explr_->OnArrayEnd();
    PostItem();
  } else {
    array_len_.emplace_back(num_elems_ + 1, len);
    num_elems_ = 0;
  }
}

bool IsSHA(string_view str) {
  return std::all_of(str.begin(), str.end(),
                     [](unsigned char c) { return absl::ascii_isxdigit(c); });
}

optional<ErrorReply> EvalValidator(CmdArgList args) {
  string_view num_keys_str = ArgS(args, 1);
  int32_t num_keys;

  if (!absl::SimpleAtoi(num_keys_str, &num_keys) || num_keys < 0)
    return ErrorReply{facade::kInvalidIntErr};

  if (unsigned(num_keys) > args.size() - 2)
    return ErrorReply{"Number of keys can't be greater than number of args", kSyntaxErrType};

  return nullopt;
}

enum class ExecScriptUse : uint8_t {
  NONE = 0,
  SCRIPT_LOAD = 1,
  SCRIPT_RUN = 2,
};

ExecScriptUse DetermineScriptPresense(const std::vector<StoredCmd>& body) {
  bool script_load = false;
  for (const auto& scmd : body) {
    if (scmd.Cid()->MultiControlKind() == CO::MultiControlKind::EVAL) {
      return ExecScriptUse::SCRIPT_RUN;
    }

    if ((scmd.Cid()->name() == "SCRIPT") && (absl::AsciiStrToUpper(scmd.FirstArg()) == "LOAD")) {
      script_load = true;
    }
  }

  if (script_load)
    return ExecScriptUse::SCRIPT_LOAD;

  return ExecScriptUse::NONE;
}

// Returns the multi mode for that transaction. Returns NOT_DETERMINED if no scheduling
// is required.
Transaction::MultiMode DeduceExecMode(ExecScriptUse state,
                                      const ConnectionState::ExecInfo& exec_info,
                                      const ScriptMgr& script_mgr) {
  // Check if script most LIKELY has global eval transactions
  bool contains_global = false;
  bool contains_admin_cmd = false;
  Transaction::MultiMode multi_mode = Transaction::LOCK_AHEAD;

  if (state == ExecScriptUse::SCRIPT_RUN) {
    contains_global = script_mgr.AreGlobalByDefault();
  }

  bool transactional = contains_global;
  if (!transactional) {
    for (const auto& scmd : exec_info.body) {
      // We can only tell if eval is transactional based on they keycount
      if (absl::StartsWith(scmd.Cid()->name(), "EVAL")) {
        CmdArgVec arg_vec{};
        auto args = scmd.Slice(&arg_vec);
        auto keys = DetermineKeys(scmd.Cid(), args);
        transactional |= (keys && keys.value().NumArgs() > 0);
      } else {
        transactional |= scmd.Cid()->IsTransactional();
      }
      contains_global |= scmd.Cid()->opt_mask() & CO::GLOBAL_TRANS;
      contains_admin_cmd |= scmd.Cid()->opt_mask() & CO::ADMIN;

      // We can't run no-key-transactional commands in lock-ahead mode currently,
      // because it means we have to schedule on all shards
      if (scmd.Cid()->opt_mask() & CO::NO_KEY_TRANSACTIONAL)
        contains_global = true;

      if (contains_global)
        break;
    }
  }

  // multi/exec contains commands like ping that do not affect db state.
  if (!transactional && exec_info.watched_keys.empty())
    return Transaction::NOT_DETERMINED;

  if (contains_admin_cmd) {
    multi_mode = Transaction::NON_ATOMIC;
  }
  // Atomic modes fall back to GLOBAL if they contain global commands.
  else if (contains_global && multi_mode == Transaction::LOCK_AHEAD) {
    multi_mode = Transaction::GLOBAL;
  }

  return multi_mode;
}

string CreateExecDescriptor(const std::vector<StoredCmd>& stored_cmds, unsigned num_uniq_shards) {
  string result;
  size_t max_len = std::min<size_t>(20u, stored_cmds.size());
  absl::StrAppend(&result, "EXEC/", num_uniq_shards, "/", max_len);

  return result;
}

string ConnectionLogContext(const facade::Connection* conn) {
  if (conn == nullptr) {
    return "(null-conn)";
  }
  return absl::StrCat("(", conn->RemoteEndpointStr(), ")");
}

string FailedCommandToString(std::string_view command, facade::CmdArgList args,
                             std::string_view reason) {
  string result;
  absl::StrAppend(&result, " ", command);

  if (command != "AUTH" && command != "ACL SETUSER") {
    for (auto arg : args) {
      absl::StrAppend(&result, " ", absl::CHexEscape(arg));
    }
  }

  absl::StrAppend(&result, " failed with reason: ", reason);

  return result;
}

thread_local uint32_t squash_stats_latency_lower_limit_cached;

void UpdateFromFlagsOnThread() {
  if (uint32_t poll = GetFlag(FLAGS_shard_thread_busy_polling_usec);
      poll > 0 && EngineShard::tlocal())
    ProactorBase::me()->SetBusyPollUsec(poll);
  squash_stats_latency_lower_limit_cached = GetFlag(FLAGS_squash_stats_latency_lower_limit);
}

std::vector<std::string> GetMutableFlagNames() {
  return base::GetFlagNames(FLAGS_shard_thread_busy_polling_usec,
                            FLAGS_squash_stats_latency_lower_limit);
}

void UpdateSchedulerFlagsOnThread() {
  using fb2::detail::Scheduler;
  auto* sched = util::fb2::detail::FiberScheduler();
  sched->UpdateConfig(&Scheduler::Config::budget_background_fib,
                      GetFlag(FLAGS_scheduler_background_budget));
  sched->UpdateConfig(&Scheduler::Config::background_sleep_prob,
                      GetFlag(FLAGS_scheduler_background_sleep_prob));
  sched->UpdateConfig(&Scheduler::Config::background_warrant_pct,
                      GetFlag(FLAGS_scheduler_background_warrant));
}

void SetHuffmanTable(const std::string& huffman_table) {
  if (huffman_table.empty())
    return;
  vector<string_view> parts = absl::StrSplit(huffman_table, ',');
  for (const auto& part : parts) {
    vector<string_view> kv = absl::StrSplit(part, ':');
    if (kv.size() != 2 || kv[0].empty() || kv[1].empty()) {
      LOG(ERROR) << "Invalid huffman table entry" << part;
      continue;
    }
    string domain_str = absl::AsciiStrToUpper(kv[0]);
    CompactObj::HuffmanDomain domain;

    if (domain_str == "KEYS") {
      domain = CompactObj::HUFF_KEYS;
    } else if (domain_str == "STRINGS") {
      domain = CompactObj::HUFF_STRING_VALUES;
    } else {
      LOG(ERROR) << "Unknown huffman domain: " << kv[0];
      continue;
    }

    string unescaped;
    if (!absl::Base64Unescape(kv[1], &unescaped)) {
      LOG(ERROR) << "Failed to decode base64 huffman table for domain " << kv[0] << " with value "
                 << kv[1];
      continue;
    }

    atomic_bool success = true;
    shard_set->RunBriefInParallel([&](auto* shard) {
      if (!CompactObj::InitHuffmanThreadLocal(domain, unescaped)) {
        success = false;
      }
    });
    LOG_IF(ERROR, !success) << "Failed to set huffman table for domain " << kv[0] << " with value "
                            << kv[1];
  }
}

string_view CommandOptName(CO::CommandOpt opt, bool enabled) {
  using namespace CO;
  if (!enabled) {
    if (opt == FAST)
      return "SLOW";
    return "";
  }

  switch (opt) {
    case JOURNALED:
      return "write";
    case READONLY:
      return "readonly";
    case DENYOOM:
      return "denyoom";
    case FAST:
      return "fast";
    case LOADING:
      return "loading";
    case DANGEROUS:
      return "dangerous";
    case ADMIN:
      return "admin";
    case NOSCRIPT:
      return "noscript";
    case BLOCKING:
      return "blocking";
    case HIDDEN:
    case GLOBAL_TRANS:
    case STORE_LAST_KEY:
    case VARIADIC_KEYS:
    case NO_AUTOJOURNAL:
    case NO_KEY_TRANSACTIONAL:
    case NO_KEY_TX_SPAN_ALL:
    case IDEMPOTENT:
      return "";
  }
  return "";
}

OpResult<void> OpTrackKeys(const OpArgs slice_args, const facade::Connection::WeakRef& conn_ref,
                           const ShardArgs& args) {
  if (conn_ref.IsExpired()) {
    DVLOG(2) << "Connection expired, exiting TrackKey function.";
    return OpStatus::OK;
  }

  DVLOG(2) << "Start tracking keys for client ID: " << conn_ref.GetClientId();

  // TODO: There is a bug here that we track all arguments instead of tracking only keys.
  auto& db_slice = slice_args.GetDbSlice();
  for (auto key : args)
    db_slice.TrackKey(conn_ref, key);

  return OpStatus::OK;
}

void TrackIfNeeded(CommandContext* cmd_cntx) {
  auto* cntx = cmd_cntx->server_conn_cntx();
  auto& info = cntx->conn_state.tracking_info_;

  if (!info.IsTrackingOn()) {
    return;
  }

  if (auto* tx = cmd_cntx->tx(); tx) {
    // Reset it, because in multi/exec the transaction pointer is the same and
    // we will end up triggerring the callback on the following commands. To avoid this
    // we reset it.
    tx->SetTrackingCallback({});
    if (cmd_cntx->cid()->IsReadOnly() && info.ShouldTrackKeys()) {
      auto conn = cntx->conn()->Borrow();
      tx->SetTrackingCallback([conn](Transaction* trans) {
        auto* shard = EngineShard::tlocal();
        OpTrackKeys(trans->GetOpArgs(shard), conn, trans->GetShardArgs(shard->shard_id()));
      });
    }
  }
}

// Check CLIENT PAUSE state and block if needed
void CheckPauseState(facade::Connection* conn, ConnectionContext* dfly_cntx, const CommandId* cid) {
  auto& etl = *ServerState::tlocal();
  if (etl.IsPaused() && !conn->IsPrivileged()) {
    bool is_write = cid->IsJournaled();
    is_write |= cid->name() == "PUBLISH" || cid->name() == "EVAL" || cid->name() == "EVALSHA";
    is_write |= cid->name() == "EXEC" && dfly_cntx->conn_state.exec_info.is_write;

    dfly_cntx->paused = true;
    etl.AwaitPauseState(is_write);
    dfly_cntx->paused = false;
  }
}

// Prepare transaction for DispatchCommand.
//
// Return value:
//   first  - newly created top-level transaction (or nullptr if none).
//   second - result: overall status of preparation.
pair<intrusive_ptr<Transaction>, OpStatus> PrepareTransaction(const CommandId* cid,
                                                              ArgSlice tail_args,
                                                              CommandContext* cmd_ctx) {
  auto* dfly_cntx = cmd_ctx->server_conn_cntx();
  bool init = false;
  intrusive_ptr<Transaction> res;
  if (dfly_cntx->transaction) {  // Existing transaction context (e.g., MULTI/EXEC or script)
    DCHECK(dfly_cntx->transaction->IsMulti());  // dispatching in multi
    if (cid->IsTransactional()) {
      dfly_cntx->transaction->MultiSwitchCmd(cid);
      init = true;
    }
  } else {
    if (cid->IsTransactional()) {
      res.reset(new Transaction{cid});
      init = !res->IsMulti();  // Multi command initialize themselves based on their mode
    }
    dfly_cntx->transaction = res.get();
  }

  cmd_ctx->SetupTx(cid, dfly_cntx->transaction);

  if (init) {
    DCHECK(cmd_ctx->tx());
    if (auto st =
            cmd_ctx->tx()->InitByArgs(dfly_cntx->ns, dfly_cntx->conn_state.db_index, tail_args);
        st != OpStatus::OK) {
      if (res) {
        dfly_cntx->transaction = nullptr;
      }
      return {nullptr, st};
    }

    if (res)  // new transaction
      dfly_cntx->last_command_debug.shards_count = cmd_ctx->tx()->GetUniqueShardCnt();
  }

  return {std::move(res), OpStatus::OK};
}

void StoreInMultiBlock(ConnectionContext* dfly_cntx, const CommandId* cid, ArgSlice tail_args) {
  // TODO: protect against aggregating huge transactions.
  auto& exec_info = dfly_cntx->conn_state.exec_info;
  const size_t old_size = exec_info.GetStoredCmdBytes();
  exec_info.AddStoredCmd(cid, tail_args);  // Deep copy of args.
  ServerState::tlocal()->stats.stored_cmd_bytes += exec_info.GetStoredCmdBytes() - old_size;
}

bool ShouldLogError(const CommandId& cid, string_view reason, CmdArgList tail_args) {
  if (absl::StartsWith(reason, "-BUSYGROUP"))
    return false;

  if (cid.name() != "CLIENT")
    return true;
  return tail_args.empty() || !absl::EqualsIgnoreCase(tail_args.front(), "maint_notifications");
}

}  // namespace

Service::Service(ProactorPool* pp)
    : pp_(*pp),
      acl_family_(&user_registry_, pp),
      server_family_(this),
      cluster_family_(&server_family_) {
  CHECK(pp);
  CHECK(shard_set == NULL);

#ifdef PRINT_STACKTRACES_ON_SIGNAL
  LOG(INFO) << "PRINT STACKTRACES REGISTERED";
  ProactorBase::RegisterSignal({SIGUSR1}, pp_.GetNextProactor(), [this](int signal) {
    LOG(INFO) << "Received " << strsignal(signal);
    base::SetVLogLevel("uring_proactor", 2);

    util::fb2::Mutex m;
    pp_.AwaitFiberOnAll([&m](unsigned index, util::ProactorBase* base) {
      util::fb2::LockGuard lk(m);
      util::fb2::detail::FiberInterface::PrintAllFiberStackTraces();
    });
  });
#endif

  CHECK(shard_set == nullptr);
  shard_set = new EngineShardSet(pp);

  // We support less than 1024 threads and we support less than 1024 shards.
  // For example, Scan uses 10 bits in cursor to encode shard id it currently traverses.
  CHECK_LT(pp->size(), kMaxThreadSize);
  RegisterCommands();

  exec_cid_ = FindCmd("EXEC");

  engine_varz.emplace("engine", [this] { return GetVarzStats(); });
}

Service::~Service() {
#ifdef PRINT_STACKTRACES_ON_SIGNAL
  ProactorBase::ClearSignal({SIGUSR1}, true);
#endif

  delete shard_set;
  shard_set = nullptr;
}

void RegisterMutableFlags(ConfigRegistry* reg, absl::Span<const std::string> names,
                          std::function<void()> f) {
  auto cb = [f](auto&&) {
    shard_set->pool()->AwaitBrief([f](unsigned tid, auto*) { f(); });
    return true;
  };
  for (std::string_view name : names)
    reg->RegisterMutable(name, cb);
}

void Service::Init(util::AcceptServer* acceptor, std::vector<facade::Listener*> listeners) {
  InitRedisTables();
  facade::Connection::Init(pp_.size());

#if defined(WITH_SEARCH)
  // Initialize SimSIMD runtime if needed (explicit, avoids implicit static initializers)
  dfly::search::InitSimSIMD();
#endif

  config_registry.RegisterMutable("dbfilename");
  config_registry.Register("dbnum");  // equivalent to databases in redis.
  config_registry.Register("dir");
  config_registry.RegisterMutable("enable_heartbeat_eviction");
  config_registry.RegisterMutable("enable_heartbeat_rss_eviction");
  config_registry.RegisterMutable("masterauth");
  config_registry.RegisterMutable("masteruser");
  config_registry.RegisterMutable("max_eviction_per_heartbeat");
  config_registry.RegisterMutable("max_segment_to_consider");
  config_registry.RegisterMutable("pipeline_squash");
  config_registry.RegisterMutable("lua_mem_gc_threshold");
  config_registry.RegisterMutable("background_debug_jobs");

  // Register ServerState flags
  RegisterMutableFlags(&config_registry, ServerState::GetMutableFlagNames(),
                       []() { ServerState::tlocal()->UpdateFromFlags(); });
  // Register Connection flags
  RegisterMutableFlags(&config_registry, facade::Connection::GetMutableFlagNames(),
                       []() { facade::Connection::UpdateFromFlags(); });
  // Register tiered storage flags
  RegisterMutableFlags(&config_registry, TieredStorage::GetMutableFlagNames(), []() {
    if (auto* es = EngineShard::tlocal(); es && es->tiered_storage()) {
      es->tiered_storage()->UpdateFromFlags();
    }
  });
  // Register main service flags
  RegisterMutableFlags(&config_registry, GetMutableFlagNames(),
                       []() { UpdateFromFlagsOnThread(); });
  // Register squsher flags
  RegisterMutableFlags(&config_registry, MultiCommandSquasher::GetMutableFlagNames(),
                       []() { MultiCommandSquasher::UpdateFromFlags(); });

  // Register scheduler flags
  RegisterMutableFlags(
      &config_registry,
      base::GetFlagNames(FLAGS_scheduler_background_budget, FLAGS_scheduler_background_sleep_prob,
                         FLAGS_scheduler_background_warrant),
      []() { UpdateSchedulerFlagsOnThread(); });

  config_registry.RegisterSetter<strings::MemoryBytesFlag>(
      "maxmemory", [](const strings::MemoryBytesFlag& flag) {
        // TODO: reduce code reliance on constant direct access of max_memory_limit
        max_memory_limit.store(flag.value, memory_order_relaxed);
      });

  config_registry.RegisterMutable("replica_partial_sync");
  config_registry.RegisterMutable("background_snapshotting");
  config_registry.RegisterMutable("replication_timeout");
  config_registry.RegisterMutable("migration_finalization_timeout_ms");
  config_registry.RegisterMutable("slot_migration_throttle_us");
  config_registry.RegisterMutable("table_growth_margin");
  config_registry.RegisterMutable("tcp_keepalive");
  config_registry.RegisterMutable("timeout");
  config_registry.RegisterMutable("send_timeout");
  config_registry.RegisterMutable("managed_service_info");
#ifdef WITH_SEARCH
  config_registry.RegisterMutable("MAXSEARCHRESULTS");
  config_registry.RegisterMutable("search_query_string_bytes");
#endif

  config_registry.RegisterMutable(
      "notify_keyspace_events", [pool = &pp_](const absl::CommandLineFlag& flag) {
        auto res = flag.TryGet<std::string>();
        if (!res.has_value() || (!res->empty() && !absl::EqualsIgnoreCase(*res, "EX"))) {
          return false;
        }

        pool->AwaitBrief([&res](unsigned, auto*) {
          auto* shard = EngineShard::tlocal();
          if (shard) {
            auto shard_id = shard->shard_id();
            auto& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id);
            db_slice.SetNotifyKeyspaceEvents(*res);
          }
        });

        return true;
      });

  config_registry.RegisterMutable("aclfile");
  config_registry.RegisterSetter<uint32_t>("acllog_max_len", [](uint32_t val) {
    shard_set->pool()->AwaitFiberOnAll(
        [val](auto index, auto* context) { ServerState::tlocal()->acl_log.SetTotalEntries(val); });
  });

  uint32_t shard_num = GetFlag(FLAGS_num_shards);
  if (shard_num == 0 || shard_num > pp_.size()) {
    LOG_IF(WARNING, shard_num > pp_.size())
        << "Requested num_shards (" << shard_num << ") is bigger than thread count (" << pp_.size()
        << "), using num_shards=" << pp_.size();
    shard_num = pp_.size();
  }

  // We assume that listeners.front() is the main_listener
  // see dfly_main RunEngine. In unit tests, listeners are empty.
  facade::Listener* main_listener = listeners.empty() ? nullptr : listeners.front();

  ChannelStore* cs = new ChannelStore{};
  // Must initialize before the shard_set because EngineShard::Init references ServerState.
  pp_.AwaitBrief([&](uint32_t index, ProactorBase* pb) {
    tl_facade_stats = new FacadeStats;
    ServerState::Init(index, shard_num, main_listener, &user_registry_);
    ServerState::tlocal()->UpdateChannelStore(cs);
  });

  const auto tcp_disabled = GetFlag(FLAGS_port) == 0u;
  // We assume that listeners.front() is the main_listener
  // see dfly_main RunEngine
  if (!tcp_disabled && main_listener) {
    acl_family_.Init(main_listener, &user_registry_);
  }

  // Initialize shard_set with a callback running once in a while in the shard threads.
  shard_set->Init(shard_num, [this] {
    server_family_.GetDflyCmd()->BreakStalledFlowsInShard();
    server_family_.UpdateMemoryGlobalStats();
  });
  // InitThreadLocals might block
  pp_.AwaitFiberOnAll(
      [&](uint32_t index, ProactorBase* pb) { sharding::InitThreadLocals(shard_set->size()); });
  Transaction::Init(shard_num);

  shard_set->pool()->AwaitBrief([](unsigned, auto*) {
    facade::Connection::UpdateFromFlags();
    UpdateFromFlagsOnThread();
    UpdateSchedulerFlagsOnThread();
  });
  SetHuffmanTable(GetFlag(FLAGS_huffman_table));

  // Requires that shard_set will be initialized before because server_family_.Init might
  // load the snapshot.
  server_family_.Init(acceptor, std::move(listeners));
}

void Service::Shutdown() {
  VLOG(1) << "Service::Shutdown";

  // We mark that we are shutting down. After this incoming requests will be
  // rejected.
  mu_.lock();
  global_state_ = GlobalState::SHUTTING_DOWN;
  mu_.unlock();

  pp_.AwaitFiberOnAll([](ProactorBase* pb) {
    ServerState::tlocal()->EnterLameDuck();
    facade::Connection::ShutdownThreadLocal();
  });

  config_registry.Reset();

  // to shutdown all the runtime components that depend on EngineShard
  cluster_family_.Shutdown();
  server_family_.Shutdown();

  shutdown_watchdog.emplace(pp_);

  engine_varz.reset();

  ChannelStore::Destroy();

  shard_set->PreShutdown();
  shard_set->Shutdown();

  Transaction::Shutdown();

  pp_.AwaitFiberOnAll([](ProactorBase* pb) {
#if defined(DFLY_USE_SSL)
    // Explicitly release OpenSSL thread-local state here.
    // This prevents a potential crash during thread exit where the allocator (e.g. mimalloc)
    // might tear down the thread's heap before OpenSSL tries to free its internal state.
    OPENSSL_thread_stop();
#endif
    ServerState::tlocal()->Destroy();
  });

  // wait for all the pending callbacks to stop.
  ThisFiber::SleepFor(10ms);
  facade::Connection::Shutdown();

  shutdown_watchdog->Disarm();
}

OpResult<KeyIndex> Service::FindKeys(const CommandId* cid, CmdArgList args) {
  // Sharded pub-sub acts as if it's sharded by its channel name (just for checks)
  if (cid->PubSubKind() == CO::PubSubKind::SHARDED) {
    // SPUBLISH has only one key, the rest is data
    if (cid->name() == registry_.RenamedOrOriginal("SPUBLISH"))
      return KeyIndex(0, 1);
    return {KeyIndex(0, args.size())};  // sub/unsub list of channels
  }

  return DetermineKeys(cid, args);
}

optional<ErrorReply> Service::CheckKeysOwnership(const CommandId& cid, CmdArgList args,
                                                 const ConnectionContext& dfly_cntx) {
  if (dfly_cntx.is_replicating) {
    // Always allow commands on the replication port, as it might be for future-owned keys.
    return nullopt;
  }

  if (cid.first_key_pos() == 0 && cid.PubSubKind() != CO::PubSubKind::SHARDED) {
    return nullopt;  // No key command.
  }

  OpResult<KeyIndex> key_index_res = FindKeys(&cid, args);

  if (!key_index_res) {
    return ErrorReply{key_index_res.status()};
  }

  const auto& key_index = *key_index_res;

  UniqueSlotChecker slot_checker;
  for (string_view key : key_index.Range(args)) {
    slot_checker.Add(key);
  }

  if (slot_checker.IsCrossSlot()) {
    return ErrorReply{kCrossSlotError};
  }

  optional<SlotId> keys_slot = slot_checker.GetUniqueSlotId();

  if (keys_slot.has_value()) {
    if (auto error = cluster::SlotOwnershipError(*keys_slot);
        !error.status.has_value() || error.status.value() != facade::OpStatus::OK) {
      return ErrorReply{std::move(error)};
    }
  }

  return nullopt;
}

// TODO(kostas) refactor. Almost 1-1 with CheckKeyOwnership() above.
std::optional<facade::ErrorReply> Service::TakenOverSlotError(const CommandId& cid, CmdArgList args,
                                                              const ConnectionContext& dfly_cntx) {
  if (cid.first_key_pos() == 0 && cid.PubSubKind() != CO::PubSubKind::SHARDED) {
    return nullopt;  // No key command.
  }

  OpResult<KeyIndex> key_index_res = FindKeys(&cid, args);

  if (!key_index_res) {
    return ErrorReply{key_index_res.status()};
  }

  const auto& key_index = *key_index_res;

  UniqueSlotChecker slot_checker;
  for (string_view key : key_index.Range(args)) {
    slot_checker.Add(key);
  }

  if (slot_checker.IsCrossSlot()) {
    return ErrorReply{kCrossSlotError};
  }

  optional<SlotId> keys_slot = slot_checker.GetUniqueSlotId();
  if (!keys_slot.has_value()) {
    return nullopt;
  }

  if (auto error = cluster::SlotOwnershipError(*keys_slot);
      !error.status.has_value() || error.status.value() != facade::OpStatus::OK) {
    return ErrorReply{std::move(error)};
  }
  const auto cluster_config = cluster::ClusterConfig::Current();
  if (!cluster_config)
    return facade::ErrorReply{facade::kClusterNotConfigured};

  // Moved regardless, we have been taken over
  cluster::ClusterNodeInfo redirect = cluster_config->GetMasterNodeForSlot(*keys_slot);
  return facade::ErrorReply{
      absl::StrCat("-MOVED ", *keys_slot, " ", redirect.ip, ":", redirect.port), "MOVED"};
}

// Return OK if all keys are allowed to be accessed: either declared in EVAL or
// transaction is running in global or non-atomic mode.
optional<ErrorReply> CheckKeysDeclared(const ConnectionState::ScriptInfo& eval_info,
                                       const CommandId* cid, CmdArgList args,
                                       Transaction::MultiMode multi_mode) {
  // We either scheduled on all shards or re-schedule for each operation,
  // so we are not restricted to any keys.
  if (multi_mode == Transaction::GLOBAL || multi_mode == Transaction::NON_ATOMIC)
    return nullopt;

  OpResult<KeyIndex> key_index_res = DetermineKeys(cid, args);
  if (!key_index_res)
    return ErrorReply{key_index_res.status()};

  // TODO: Switch to transaction internal locked keys once single hop multi transactions are merged
  // const auto& locked_keys = trans->GetMultiKeys();
  const auto& locked_tags = eval_info.lock_tags;
  for (string_view key : key_index_res->Range(args)) {
    if (!locked_tags.contains(LockTag{key})) {
      return ErrorReply(absl::StrCat(kUndeclaredKeyErr, ", key: ", key));
    }
  }

  return nullopt;
}

static optional<ErrorReply> VerifyConnectionAclStatus(const CommandId* cid,
                                                      const ConnectionContext* cntx,
                                                      string_view error_msg, ArgSlice tail_args) {
  if (!acl::IsUserAllowedToInvokeCommand(*cntx, *cid, tail_args)) {
    return ErrorReply(absl::StrCat("-NOPERM ", cntx->authed_username, " ", error_msg));
  }
  return nullopt;
}

bool ShouldDenyOnOOM(const CommandContext& cmd_cntx) {
  DCHECK_NE(cmd_cntx.start_time_ns, 0u);
  ServerState& etl = *ServerState::tlocal();
  if ((cmd_cntx.cid()->opt_mask() & CO::DENYOOM) && etl.is_master) {
    auto memory_stats = etl.GetMemoryUsage(cmd_cntx.start_time_ns);

    size_t limit = max_memory_limit.load(memory_order_relaxed);
    if (memory_stats.used_mem > limit ||
        (etl.rss_oom_deny_ratio > 0 && memory_stats.rss_mem > (limit * etl.rss_oom_deny_ratio))) {
      DLOG(WARNING) << "Out of memory, used " << memory_stats.used_mem << " ,rss "
                    << memory_stats.rss_mem << " ,limit " << limit;
      etl.stats.oom_error_cmd_cnt++;
      return true;
    }
  }
  return false;
}

std::optional<ErrorReply> Service::VerifyCommandState(const CommandId& cid, CmdArgList tail_args,
                                                      const ConnectionContext& dfly_cntx) {
  ServerState& etl = *ServerState::tlocal();

  // If there is no connection owner, it means the command it being called
  // from another command or used internally, therefore is always permitted.
  if (dfly_cntx.conn() != nullptr && !dfly_cntx.conn()->IsPrivileged() && cid.IsRestricted()) {
    VLOG(1) << "Non-admin attempt to execute " << cid.name() << " " << tail_args << " "
            << ConnectionLogContext(dfly_cntx.conn());
    return ErrorReply{"Cannot execute restricted command (admin only)", kRestrictDenied};
  }

  if (auto err = cid.Validate(tail_args); err)
    return err;

  // Check if the command is allowed to execute under this global state
  bool allowed_by_state = true;
  const GlobalState gstate = etl.gstate();
  switch (gstate) {
    case GlobalState::LOADING:
      allowed_by_state = dfly_cntx.journal_emulated || (cid.opt_mask() & CO::LOADING);
      break;
    case GlobalState::SHUTTING_DOWN:
      allowed_by_state = false;
      break;
    case GlobalState::TAKEN_OVER:
      // Only PING, admin commands, and all commands via admin connections are allowed
      // we prohibit even read commands, because read commands running in pipeline can take a while
      // to send all data to a client which leads to fail in takeover
      allowed_by_state =
          dfly_cntx.conn()->IsPrivileged() || (cid.opt_mask() & CO::ADMIN) || cid.name() == "PING";
      break;
    default:
      break;
  }

  if (!allowed_by_state) {
    VLOG(1) << "Command " << cid.name() << " not executed because global state is " << gstate;

    if (gstate == GlobalState::LOADING) {
      return ErrorReply(kLoadingErr);
    }

    if (gstate == GlobalState::TAKEN_OVER) {
      if (IsClusterEnabled()) {
        if (auto err = TakenOverSlotError(cid, tail_args, dfly_cntx); err) {
          return err;
        }
      }
      return ErrorReply(kLoadingErr);
    }

    return ErrorReply{StrCat("Can not execute during ", GlobalStateName(gstate))};
  }

  string_view cmd_name{cid.name()};

  if (dfly_cntx.req_auth && !dfly_cntx.authenticated) {
    if (cmd_name != "AUTH" && cmd_name != "QUIT" && cmd_name != "HELLO") {
      return ErrorReply{"-NOAUTH Authentication required.", facade::kNoAuthErrType};
    }
  }

  // only reset and quit are allow if this connection is used for monitoring
  if (dfly_cntx.monitor && (cmd_name != "RESET" && cmd_name != "QUIT"))
    return ErrorReply{"Replica can't interact with the keyspace"};

  bool is_write_cmd = cid.IsJournaled();
  bool is_trans_cmd = cid.MultiControlKind() == CO::MultiControlKind::EXEC;
  bool under_script = dfly_cntx.conn_state.script_info != nullptr;
  bool multi_active = dfly_cntx.conn_state.exec_info.IsCollecting() && !is_trans_cmd;

  if (!etl.is_master && is_write_cmd && !dfly_cntx.is_replicating)
    return ErrorReply{"-READONLY You can't write against a read only replica."};

  if (multi_active) {
    if (cmd_name == "WATCH" || cmd_name == "FLUSHALL" || cmd_name == "FLUSHDB" ||
        absl::EndsWith(cmd_name, "SUBSCRIBE"))
      return ErrorReply{absl::StrCat("'", cmd_name, "' not allowed inside a transaction")};
  }

  if (IsClusterEnabled()) {
    if (auto err = CheckKeysOwnership(cid, tail_args, dfly_cntx); err)
      return err;
  }

  if (under_script && (cid.opt_mask() & CO::NOSCRIPT))
    return ErrorReply{"This Redis command is not allowed from script"};

  if (under_script) {
    auto* tx = dfly_cntx.transaction;
    DCHECK(tx);
    // The following commands access shards arbitrarily without having keys, so they can only be run
    // non atomically or globally.
    Transaction::MultiMode mode = tx->GetMultiMode();
    bool shard_access = (cid.opt_mask()) & (CO::GLOBAL_TRANS | CO::NO_KEY_TRANSACTIONAL);
    if (shard_access && (mode != Transaction::GLOBAL && mode != Transaction::NON_ATOMIC))
      return ErrorReply("This Redis command is not allowed from script");

    if (cid.IsTransactional()) {
      auto err = CheckKeysDeclared(*dfly_cntx.conn_state.script_info, &cid, tail_args, mode);

      if (err.has_value()) {
        VLOG(1) << "CheckKeysDeclared failed with error " << err->ToSv() << " for command "
                << cid.name();
        return err;
      }
    }

    if (dfly_cntx.conn_state.script_info->read_only && is_write_cmd) {
      return ErrorReply{"Write commands are not allowed from read-only scripts"};
    }
  }

  return VerifyConnectionAclStatus(&cid, &dfly_cntx, "has no ACL permissions", tail_args);
}

DispatchResult Service::DispatchCommand(facade::ParsedArgs args, facade::ParsedCommand* parsed_cmd,
                                        facade::AsyncPreference async_pref) {
  DCHECK(!args.empty());
  DCHECK_NE(0u, shard_set->size()) << "Init was not called";

  // We must resolve the command ID (cid) before the guard block.
  // The following switch statement relies on the command's metadata
  // (e.g., SupportsAsync()) to evaluate execution preferences,
  // making this lookup a hard dependency for the logic below.
  string cmd = absl::AsciiStrToUpper(args.Front());
  const auto [cid, args_no_cmd] = registry_.FindExtended(cmd, args.Tail());
  if (cid == nullptr) {
    if (async_pref != AsyncPreference::ONLY_SYNC) {
      parsed_cmd->SetDeferredReply();
    }
    parsed_cmd->SendError(ReportUnknownCmd(cmd));
    return DispatchResult::ERROR;
  }

  // Determine if command should run async
  switch (async_pref) {
    case AsyncPreference::ONLY_SYNC:
      break;
    case AsyncPreference::ONLY_ASYNC:
      if (!cid->SupportsAsync())
        return DispatchResult::WOULD_BLOCK;
      [[fallthrough]];
    case AsyncPreference::PREFER_ASYNC:
      if (cid->SupportsAsync())
        parsed_cmd->SetDeferredReply();
      break;
  };

  CommandContext* cmd_cntx = static_cast<CommandContext*>(parsed_cmd);
  ConnectionContext* dfly_cntx = cmd_cntx->server_conn_cntx();

  if (dfly_cntx->async_dispatch && cid->IsBlocking()) {
    ++ServerState::tlocal()->stats.blocking_commands_in_pipelines;
    cmd_cntx->conn()->FlushReplies();
  }

  ArgSlice tail_args;
  if (cmd_cntx->IsDeferredReply()) {
    args_no_cmd.ToVec(&cmd_cntx->arg_slice_backing);  // Ensure lifetime
    tail_args = cmd_cntx->arg_slice_backing;
  } else {
    tail_args = args_no_cmd.ToSlice(&cmd_cntx->arg_slice_backing);
  }

  // Block on CLIENT PAUSE if needed
  if (auto* conn = cmd_cntx->conn(); conn /* replica context doesn't have an owner */) {
    if (VLOG_IS_ON(2)) {
      bool under_script = bool(dfly_cntx->conn_state.script_info);
      LOG(INFO) << "Got (" << conn->GetClientId() << "): " << (under_script ? "LUA " : "")
                << cid->name() << " " << tail_args << " in dbid=" << dfly_cntx->conn_state.db_index;
    }

    // Check pause state only if it is a top level transaction.
    if (dfly_cntx->transaction == nullptr)
      CheckPauseState(conn, dfly_cntx, cid);
  }

  // Verify command state
  if (auto err = VerifyCommandState(*cid, tail_args, *dfly_cntx); err) {
    LOG_IF(WARNING, dfly_cntx->replica_conn || !dfly_cntx->conn() /* no owner in replica context */)
        << "VerifyCommandState error: " << err->ToSv();
    if (auto& exec_info = dfly_cntx->conn_state.exec_info; exec_info.IsCollecting())
      exec_info.state = ConnectionState::ExecInfo::EXEC_ERROR;

    // We need to skip this because ACK's should not be replied to
    // Bonus points because this allows to continue replication with ACL users who got
    // their access revoked and reinstated

    if (cid->name() == "REPLCONF") {
      DCHECK_GE(args_no_cmd.size(), 1u);
      // We should not reply to REPLCONF ACKS.
      if (absl::EqualsIgnoreCase(args_no_cmd.Front(), "ACK")) {
        server_family_.GetDflyCmd()->OnClose(
            dfly_cntx->conn_state.replication_info.repl_session_id);
        return DispatchResult::ERROR;
      }
    }
    DCHECK(!err->status);
    cmd_cntx->SendError(*err);
    return DispatchResult::ERROR;
  }

  VLOG_IF(1, cid->opt_mask() & CO::CommandOpt::DANGEROUS)
      << "Executing dangerous command " << cid->name() << " "
      << ConnectionLogContext(dfly_cntx->conn());

  // If inside MULTI block, store command
  bool is_trans_cmd = cid->MultiControlKind() == CO::MultiControlKind::EXEC;
  if (dfly_cntx->conn_state.exec_info.IsCollecting() && !is_trans_cmd) {
    StoreInMultiBlock(dfly_cntx, cid, tail_args);
    cmd_cntx->SendSimpleString("QUEUED");
    return DispatchResult::OK;
  }

  auto [dispatched_tx, status] = PrepareTransaction(cid, tail_args, cmd_cntx);
  if (status != OpStatus::OK) {
    DCHECK(!dispatched_tx);
    cmd_cntx->SendError(StatusToMsg(status));
    return DispatchResult::ERROR;
  }

  DispatchResult res = InvokeCmd(tail_args, cmd_cntx);
  if (dispatched_tx) {
    DCHECK(dfly_cntx->transaction == dispatched_tx.get());
    dfly_cntx->transaction = nullptr;
  }

  if ((res != DispatchResult::OK) && (res != DispatchResult::OOM)) {
    cmd_cntx->SendError("Internal Error");
    dfly_cntx->conn()->MarkForClose();
  }

  return res;
}

class ReplyGuard {
 public:
  explicit ReplyGuard(const CommandContext& cmd_cntx) {
    const bool is_script = bool(cmd_cntx.server_conn_cntx()->conn_state.script_info);
    cid_name_ = cmd_cntx.cid()->name();
    const bool is_one_of = (cid_name_ == "REPLCONF" || cid_name_ == "DFLY");
    bool is_mcache = cmd_cntx.mc_command() != nullptr;
    const bool is_no_reply_memcache =
        (is_mcache && cmd_cntx.mc_command()->cmd_flags.no_reply) || cid_name_ == "QUIT";
    const bool should_dcheck = !is_one_of && !is_script && !is_no_reply_memcache;
    if (should_dcheck) {
      cmd_cntx_ = &cmd_cntx;
      replies_recorded_ = cmd_cntx.rb()->RepliesRecorded();
    }
  }

  ~ReplyGuard() {
    if (cmd_cntx_ && !cmd_cntx_->IsDeferredReply()) {
      auto* rb = cmd_cntx_->rb();
      DCHECK_GT(rb->RepliesRecorded(), replies_recorded_) << cid_name_ << " " << typeid(*rb).name();
    }
  }

 private:
  const CommandContext* cmd_cntx_ = nullptr;
  size_t replies_recorded_ = 0;
  std::string_view cid_name_;
};

DispatchResult Service::InvokeCmd(CmdArgList tail_args, CommandContext* cmd_cntx) {
  auto* cid = cmd_cntx->cid();
  DCHECK(cid);
  DCHECK(!cid->Validate(tail_args));

  cmd_cntx->start_time_ns = absl::GetCurrentTimeNanos();

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  auto* builder = cmd_cntx->rb();
  DCHECK(builder);
  DCHECK(cntx);

  if (ShouldDenyOnOOM(*cmd_cntx)) {
    cmd_cntx->SendError(ErrorReply{OpStatus::OUT_OF_MEMORY});
    return DispatchResult::OOM;
  }

  bool has_monitors = !ServerState::tlocal()->Monitors().Empty();
  if (cid->CanBeMonitored() && has_monitors) {
    DispatchMonitor(cntx, cid, tail_args);
  }

  ServerState::tlocal()->RecordCmd(cntx->has_main_or_memcache_listener);
  TrackIfNeeded(cmd_cntx);
  auto* tx = cmd_cntx->tx();

  // For EVAL[] and EXEC/DISCARD, clean up state.
  // We don't do it directly in commands to allow some introspection after execution (slowlog).
  absl::Cleanup mck_cleanup = [cntx, cid, mck = cid->MultiControlKind()]() {
    if (mck && *mck == CO::MultiControlKind::EXEC && cid->name() != "MULTI")
      MultiCleanup(cntx);
    else if (mck && *mck == CO::MultiControlKind::EVAL)
      cntx->conn_state.script_info.reset();
  };

#ifndef NDEBUG
  // Verifies that we reply to the client when needed.
  ReplyGuard reply_guard(*cmd_cntx);
#endif
  builder->ConsumeLastError();  // throw away last error
  try {
    cid->Invoke(tail_args, cmd_cntx);
  } catch (std::exception& e) {
    LOG(ERROR) << "Internal error, system probably unstable " << e.what();
    return DispatchResult::ERROR;
  }

  DispatchResult res = DispatchResult::OK;
  if (std::string reason = builder->ConsumeLastError(); !reason.empty()) {
    // Set flag if OOM reported
    if (reason == kOutOfMemory) {
      res = DispatchResult::OOM;
    }
    VLOG(2) << FailedCommandToString(cid->name(), tail_args, reason);
    if (ShouldLogError(*cid, reason, tail_args)) {
      LOG_EVERY_T(WARNING, 1) << FailedCommandToString(cid->name(), tail_args, reason);
    }
  }

  if (cntx->conn_state.tracking_info_.IsTrackingOn()) {
    if ((!tx && cid->name() != "MULTI") || (tx && !tx->IsMulti())) {
      // Each time we execute a command we need to increase the sequence number in
      // order to properly track clients when OPTIN is used.
      // We don't do this for `multi/exec` because it would break the
      // semantics, i.e, CACHING should stick for all commands following
      // the CLIENT CACHING ON within a multi/exec block
      cntx->conn_state.tracking_info_.IncrementSequenceNumber();
    }
  }

  cmd_cntx->RecordLatency(tail_args);

  if (tx && !cntx->conn_state.exec_info.IsRunning() && cntx->conn_state.script_info == nullptr) {
    cntx->last_command_debug.clock = tx->txid();
  }

  return res;
}

DispatchManyResult Service::DispatchManyCommands(std::function<facade::ParsedArgs()> arg_gen,
                                                 unsigned count, SinkReplyBuilder* builder,
                                                 facade::ConnectionContext* cntx) {
  ConnectionContext* dfly_cntx = static_cast<ConnectionContext*>(cntx);
  DCHECK(!dfly_cntx->conn_state.exec_info.IsRunning());
  DCHECK_EQ(builder->GetProtocol(), Protocol::REDIS);
  DCHECK_GT(count, 1u);

  auto* ss = dfly::ServerState::tlocal();
  // Don't even start when paused. We can only continue if DispatchTracker is aware of us running.
  if (ss->IsPaused())
    return {.processed = 0, .account_in_stats = false};

  vector<StoredCmd> stored_cmds;
  intrusive_ptr<Transaction> dist_trans;
  uint32_t dispatched = 0;
  MultiCommandSquasher::Stats stats;

  uint64_t start_cycles = base::CycleClock::Now();
  CommandContext dummy_cmd_cntx;
  dummy_cmd_cntx.Init(builder, dfly_cntx);

  auto perform_squash = [&] {
    if (stored_cmds.empty())
      return;

    if (!dist_trans) {
      dist_trans.reset(new Transaction{exec_cid_});
      dist_trans->StartMultiNonAtomic();
    } else {
      // Reset to original command id as it's changed during squashing
      dist_trans->MultiSwitchCmd(exec_cid_);
    }

    dfly_cntx->transaction = dist_trans.get();
    MultiCommandSquasher::Opts opts;
    opts.verify_commands = true;
    opts.max_squash_size = ss->max_squash_cmd_num;

    stats += MultiCommandSquasher::Execute(absl::MakeSpan(stored_cmds),
                                           static_cast<RedisReplyBuilder*>(builder), dfly_cntx,
                                           this, opts);
    dfly_cntx->transaction = nullptr;

    dispatched += stored_cmds.size();
    stored_cmds.clear();
  };

  for (unsigned i = 0; i < count; i++) {
    ParsedArgs args = arg_gen();
    string cmd = absl::AsciiStrToUpper(args.Front());
    const auto [cid, tail_args] = registry_.FindExtended(cmd, args.Tail());

    // MULTI...EXEC commands need to be collected into a single context, so squashing is not
    // possible
    const bool is_multi = dfly_cntx->conn_state.exec_info.IsCollecting() ||
                          (cid != nullptr && cid->MultiControlKind() == CO::MultiControlKind::EXEC);

    // Generally, executing any multi-transactions (including eval) is not possible because they
    // might request a stricter multi mode than non-atomic which is used for squashing.
    // TODO: By allowing promoting non-atomic multit transactions to lock-ahead for specific command
    // invocations, we can potentially execute multiple eval in parallel, which is very powerful
    // paired with shardlocal eval
    const bool is_eval = cid != nullptr && cid->MultiControlKind() == CO::MultiControlKind::EVAL;
    const bool is_blocking = cid != nullptr && cid->IsBlocking();

    if (!is_multi && !is_eval && !is_blocking && cid != nullptr) {
      stored_cmds.reserve(count);
      stored_cmds.emplace_back(cid, tail_args);  // Shallow copy
      continue;
    }

    // Squash accumulated commands
    perform_squash();

    // Stop accumulating when a pause is requested, fall back to regular dispatch
    if (ss->IsPaused())
      break;

    // Dispatch non squashed command only after all squshed commands were executed and replied
    DispatchCommand(args, &dummy_cmd_cntx, AsyncPreference::ONLY_SYNC);
    dispatched++;
  }

  perform_squash();

  if (dist_trans)
    dist_trans->UnlockMulti();

  uint64_t total_usec = base::CycleClock::ToUsec(base::CycleClock::Now() - start_cycles);
  bool account_in_stats = total_usec > squash_stats_latency_lower_limit_cached;
  if (account_in_stats) {
    auto* ss = ServerState::tlocal();
    ss->stats.multi_squash_exec_hop_usec += stats.hop_usec;
    ss->stats.multi_squash_exec_reply_usec += stats.reply_usec;
    ss->stats.multi_squash_hops += stats.hops;
    ss->stats.squashed_commands += stats.squashed_commands;
  } else {
    ss->stats.squash_stats_ignored++;
  }
  return {.processed = dispatched, .account_in_stats = account_in_stats};
}

DispatchResult Service::DispatchMC(facade::ParsedCommand* parsed_cmd,
                                   facade::AsyncPreference apref) {
  CommandContext* cmd_ctx = static_cast<CommandContext*>(parsed_cmd);
  const auto& cmd = *parsed_cmd->mc_command();

  auto* cntx = cmd_ctx->server_conn_cntx();
  DCHECK(cntx->transaction == nullptr);

  string_view cmd_name, cmd_opt;
  char buffer[absl::numbers_internal::kFastToBufferSize];

  switch (cmd.type) {
    case MemcacheParser::REPLACE:
      cmd_name = "SET";
      cmd_opt = "XX";
      break;
    case MemcacheParser::SET:
      cmd_name = "SET";
      break;
    case MemcacheParser::ADD:
      cmd_name = "SET";
      cmd_opt = "NX";
      break;
    case MemcacheParser::DELETE:
      cmd_name = "DEL";
      break;
    case MemcacheParser::INCR:
      cmd_name = "INCRBY";
      absl::numbers_internal::FastIntToBuffer(cmd.delta, buffer);
      cmd_opt = buffer;
      break;
    case MemcacheParser::DECR:
      cmd_name = "DECRBY";
      absl::numbers_internal::FastIntToBuffer(cmd.delta, buffer);
      cmd_opt = buffer;
      break;
    case MemcacheParser::APPEND:
      cmd_name = "APPEND";
      break;
    case MemcacheParser::PREPEND:
      cmd_name = "PREPEND";
      break;
    case MemcacheParser::GAT:
    case MemcacheParser::GATS:
      cmd_name = "GAT";
      break;
    case MemcacheParser::GET:
    case MemcacheParser::GETS:
      cmd_name = "MGET";
      break;
    case MemcacheParser::FLUSHALL:
      cmd_name = "FLUSHDB";
      break;
    case MemcacheParser::QUIT:
      cmd_name = "QUIT";
      break;
    case MemcacheParser::STATS:
      if (apref == AsyncPreference::ONLY_ASYNC)
        return DispatchResult::WOULD_BLOCK;
      server_family_.StatsMC(cmd.key(), cmd_ctx);
      return DispatchResult::OK;
    case MemcacheParser::VERSION:
      if (apref == AsyncPreference::ONLY_ASYNC)
        return DispatchResult::WOULD_BLOCK;
      cmd_ctx->SendSimpleString("VERSION 1.6.0 DF");
      return DispatchResult::OK;
    default:
      if (apref != AsyncPreference::ONLY_SYNC) {
        parsed_cmd->SetDeferredReply();
      }
      cmd_ctx->SendSimpleString("CLIENT_ERROR bad command line format");
      return DispatchResult::ERROR;
  }

  absl::InlinedVector<string_view, 8> args = {cmd_name};

  bool is_store = MemcacheParser::IsStoreCmd(cmd.type);
  bool is_read = !is_store && cmd.type < MemcacheParser::QUIT;
  if (!is_read) {
    if (!cmd.backed_args->empty())
      args.emplace_back(cmd.key());

    if (is_store)
      args.emplace_back(cmd.value());
    if (!cmd_opt.empty())
      args.emplace_back(cmd_opt);

    if (cmd.expire_ts && cmd_name == "SET") {
      args.emplace_back("EXAT");
      absl::numbers_internal::FastIntToBuffer(cmd.expire_ts, buffer);
      args.emplace_back(buffer);
    }
  } else {  // is_read
    args.insert(args.end(), cmd.backed_args->begin(), cmd.backed_args->end());
  }

  return DispatchCommand(ParsedArgs{args}, parsed_cmd, apref);
}

ErrorReply Service::ReportUnknownCmd(string_view cmd_name) {
  constexpr uint8_t kMaxUknownCommands = 64;
  constexpr uint8_t kMaxUknownCommandLength = 20;

  lock_guard lk(mu_);
  if (unknown_cmds_.size() <= kMaxUknownCommands && cmd_name.size() <= kMaxUknownCommandLength)
    unknown_cmds_[cmd_name]++;

  return ErrorReply{StrCat("unknown command `", cmd_name, "`"), "unknown_cmd"};
}

bool RequirePrivilegedAuth() {
  return !GetFlag(FLAGS_admin_nopass);
}

facade::ConnectionContext* Service::CreateContext(facade::Connection* owner) {
  auto cred = user_registry_.GetCredentials("default");
  ConnectionContext* res = new ConnectionContext{owner, std::move(cred)};
  res->ns = &namespaces->GetOrInsert("");

  if (owner->socket()->IsUDS()) {
    res->req_auth = false;
    res->skip_acl_validation = true;
  } else if (owner->IsPrivileged() && RequirePrivilegedAuth()) {
    res->req_auth = !GetPassword().empty();
  } else if (!owner->IsPrivileged()) {
    // Memcached protocol doesn't support authentication, so we don't require it
    if (owner->GetProtocol() == Protocol::MEMCACHE) {
      res->req_auth = false;
      res->authenticated = true;  // Automatically authenticated for Memcached protocol
    } else {
      res->req_auth = !user_registry_.AuthUser("default", "");
    }
  }

  // a bit of a hack. I set up breaker callback here for the owner.
  // Should work though it's confusing to have it here.
  owner->RegisterBreakHook([res](uint32_t) {
    if (res->transaction)
      res->transaction->CancelBlocking(nullptr);
  });

  return res;
}

facade::ParsedCommand* Service::AllocateParsedCommand() {
  return new CommandContext{};
}

const CommandId* Service::FindCmd(std::string_view cmd) const {
  return registry_.Find(registry_.RenamedOrOriginal(cmd));
}

bool Service::IsLocked(Namespace* ns, DbIndex db_index, std::string_view key) const {
  ShardId sid = Shard(key, shard_count());
  bool is_open = pp_.at(sid)->AwaitBrief([db_index, key, ns, sid] {
    return ns->GetDbSlice(sid).CheckLock(IntentLock::EXCLUSIVE, db_index, key);
  });
  return !is_open;
}

bool Service::IsShardSetLocked() const {
  std::atomic_uint res{0};

  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    bool unlocked = shard->shard_lock()->Check(IntentLock::SHARED);
    res.fetch_add(!unlocked, memory_order_relaxed);
  });

  return res.load() != 0;
}

absl::flat_hash_map<std::string, unsigned> Service::UknownCmdMap() const {
  lock_guard lk(mu_);
  return unknown_cmds_;
}

void Service::Quit(CmdArgList args, CommandContext* cmd_cntx) {
  if (cmd_cntx->rb()->GetProtocol() == Protocol::REDIS)
    cmd_cntx->rb()->SendOk();

  auto* cntx = cmd_cntx->server_conn_cntx();
  DeactivateMonitoring(cntx);
  cmd_cntx->conn()->MarkForClose();
}

void Service::Multi(CmdArgList args, CommandContext* cmd_cntx) {
  auto& conn_state = cmd_cntx->server_conn_cntx()->conn_state;
  if (conn_state.exec_info.IsCollecting()) {
    return cmd_cntx->SendError("MULTI calls can not be nested");
  }
  conn_state.exec_info.state = ConnectionState::ExecInfo::EXEC_COLLECT;
  // TODO: to protect against huge exec transactions.
  return cmd_cntx->rb()->SendOk();
}

void Service::Watch(CmdArgList args, CommandContext* cmd_cntx) {
  auto* cntx = cmd_cntx->server_conn_cntx();
  auto& exec_info = cntx->conn_state.exec_info;

  // Skip if EXEC will already fail due previous WATCH.
  if (exec_info.watched_dirty.load(memory_order_relaxed)) {
    return cmd_cntx->rb()->SendOk();
  }

  atomic_uint32_t keys_existed = 0;
  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardId shard_id = shard->shard_id();
    ShardArgs largs = t->GetShardArgs(shard_id);
    for (auto k : largs) {
      t->GetDbSlice(shard_id).RegisterWatchedKey(cntx->db_index(), k, &exec_info.watched_dirty);
    }

    auto res = GenericFamily::OpExists(t->GetOpArgs(shard), largs);
    keys_existed.fetch_add(res.value_or(0), memory_order_relaxed);
    return OpStatus::OK;
  };
  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  // Duplicate keys are stored to keep correct count.
  exec_info.watched_existed += keys_existed.load(memory_order_relaxed);
  for (string_view key : args) {
    exec_info.watched_keys.emplace_back(cntx->db_index(), key);
  }

  return cmd_cntx->rb()->SendOk();
}

void Service::Unwatch(CmdArgList args, CommandContext* cmd_cntx) {
  auto* cntx = cmd_cntx->server_conn_cntx();
  UnwatchAllKeys(cntx->ns, &cntx->conn_state.exec_info);
  return cmd_cntx->rb()->SendOk();
}

optional<CapturingReplyBuilder::Payload> Service::FlushEvalAsyncCmds(ConnectionContext* cntx,
                                                                     bool force) {
  auto& info = cntx->conn_state.script_info;
  auto* tx = cntx->transaction;
  size_t used_mem = info->async_cmds_heap_mem + info->async_cmds.size() * sizeof(StoredCmd);

  if ((info->async_cmds.empty() || !force) && used_mem < info->async_cmds_heap_limit)
    return nullopt;

  ++ServerState::tlocal()->stats.eval_squashed_flushes;

  auto* eval_cid = registry_.Find("EVAL");
  DCHECK(eval_cid);
  tx->MultiSwitchCmd(eval_cid);

  CapturingReplyBuilder crb{ReplyMode::ONLY_ERR};
  MultiCommandSquasher::Opts opts;
  opts.verify_commands = true;
  opts.error_abort = true;
  opts.max_squash_size = ServerState::tlocal()->max_squash_cmd_num;
  MultiCommandSquasher::Execute(absl::MakeSpan(info->async_cmds), &crb, cntx, this, opts);

  info->async_cmds_heap_mem = 0;
  info->async_cmds.clear();

  auto reply = crb.Take();
  return CapturingReplyBuilder::TryExtractError(reply) ? make_optional(std::move(reply)) : nullopt;
}

void Service::CallFromScript(Interpreter::CallArgs& ca, CommandContext* cmd_cntx) {
  auto* tx = cmd_cntx->tx();
  DCHECK(tx);
  auto* cntx = cmd_cntx->server_conn_cntx();
  auto& info = cntx->conn_state.script_info;
  info->stats.num_commands++;

  InterpreterReplier replier(ca.translator);
  optional<ErrorReply> findcmd_err;
  if (ca.async) {
    string cmd = absl::AsciiStrToUpper(ca.args[0]);

    // Full command verification happens during squashed execution
    if (auto* cid = registry_.Find(cmd); cid != nullptr) {
      auto reply_mode = ca.error_abort ? ReplyMode::ONLY_ERR : ReplyMode::NONE;
      info->async_cmds.emplace_back(cid, ca.args.subspan(1), reply_mode);
      info->async_cmds_heap_mem += info->async_cmds.back().UsedMemory();
    } else if (ca.error_abort) {  // If we don't abort on errors, we can ignore it completely
      findcmd_err = ReportUnknownCmd(ca.args[0]);
    }
  }

  if (auto err = FlushEvalAsyncCmds(cntx, !ca.async || findcmd_err.has_value()); err) {
    CapturingReplyBuilder::Apply(std::move(*err), &replier);  // forward error to lua
    *ca.requested_abort = true;
    return;
  }

  if (findcmd_err.has_value()) {
    auto* prev = cmd_cntx->SwapReplier(&replier);
    cmd_cntx->SendError(*findcmd_err);
    *ca.requested_abort |= ca.error_abort;
    cmd_cntx->SwapReplier(prev);
  }

  if (ca.async)
    return;

  auto* prev = cmd_cntx->SwapReplier(&replier);
  DispatchCommand(ParsedArgs{ca.args}, cmd_cntx, AsyncPreference::ONLY_SYNC);
  cmd_cntx->SwapReplier(prev);
}

void Service::Eval(CmdArgList args, CommandContext* cmd_cntx, bool read_only) {
  string_view body = ArgS(args, 0);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (body.empty()) {
    return rb->SendNull();
  }

  auto* cntx = cmd_cntx->server_conn_cntx();
  BorrowedInterpreter interpreter{cmd_cntx->tx(), &cntx->conn_state};
  auto res = server_family_.script_mgr()->Insert(body, interpreter);
  if (!res)
    return cmd_cntx->SendError(res.error().Format(), facade::kScriptErrType);

  string sha{std::move(res.value())};

  CallSHA(args, sha, interpreter, read_only, cmd_cntx);
}

void Service::EvalRo(CmdArgList args, CommandContext* cmd_cntx) {
  Eval(args, cmd_cntx, true);
}

void Service::EvalSha(CmdArgList args, CommandContext* cmd_cntx, bool read_only) {
  string sha = absl::AsciiStrToLower(ArgS(args, 0));
  auto* cntx = cmd_cntx->server_conn_cntx();
  BorrowedInterpreter interpreter{cmd_cntx->tx(), &cntx->conn_state};
  CallSHA(args, sha, interpreter, read_only, cmd_cntx);
}

void Service::EvalShaRo(CmdArgList args, CommandContext* cmd_cntx) {
  EvalSha(args, cmd_cntx, true);
}

void Service::CallSHA(CmdArgList args, string_view sha, Interpreter* interpreter, bool read_only,
                      CommandContext* cmd_cntx) {
  uint32_t num_keys;
  CHECK(absl::SimpleAtoi(ArgS(args, 1), &num_keys));  // we already validated this

  EvalArgs ev_args;
  ev_args.sha = sha;
  ev_args.keys = args.subspan(2, num_keys);
  ev_args.args = args.subspan(2 + num_keys);

  uint64_t start = absl::GetCurrentTimeNanos();
  EvalInternal(args, ev_args, interpreter, read_only, cmd_cntx);

  uint64_t end = absl::GetCurrentTimeNanos();
  ServerState::tlocal()->RecordCallLatency(sha, (end - start) / 1000);
}

void LoadScript(string_view sha, ScriptMgr* script_mgr, Interpreter* interpreter) {
  if (interpreter->Exists(sha))
    return;

  auto script_data = script_mgr->Find(sha);
  if (!script_data) {
    LOG(DFATAL) << "Script " << sha << " not found in script mgr";
    return;
  }

  string err;
  Interpreter::AddResult add_res = interpreter->AddFunction(sha, script_data->body, &err);
  if (add_res != Interpreter::ADD_OK) {
    LOG(DFATAL) << "Error adding " << sha << " to database, err " << err;
  }
}

// Determine multi mode based on script params.
Transaction::MultiMode DetermineMultiMode(ScriptMgr::ScriptParams params) {
  if (params.atomic && params.undeclared_keys)
    return Transaction::GLOBAL;
  else if (params.atomic)
    return Transaction::LOCK_AHEAD;
  else
    return Transaction::NON_ATOMIC;
}

// Starts multi transaction. Returns true if transaction was scheduled.
// Skips scheduling if multi mode requires declaring keys, but no keys were declared.
bool StartMulti(ConnectionContext* cntx, Transaction::MultiMode tx_mode, CmdArgList keys) {
  Transaction* tx = cntx->transaction;
  DCHECK(tx);
  Namespace* ns = cntx->ns;
  const DbIndex dbid = cntx->db_index();

  switch (tx_mode) {
    case Transaction::GLOBAL:
      tx->StartMultiGlobal(ns, dbid);
      return true;
    case Transaction::LOCK_AHEAD:
      if (keys.empty())
        return false;
      tx->StartMultiLockedAhead(ns, dbid, keys);
      return true;
    case Transaction::NON_ATOMIC:
      tx->StartMultiNonAtomic();
      return true;
    default:
      LOG(FATAL) << "Invalid mode";
  };

  return false;
}

// `multi_mode` is the deduced multi mode that is not yet set on the transaction
static bool CanRunSingleShardMulti(bool one_shard, Transaction::MultiMode multi_mode,
                                   const Transaction& tx) {
  if (tx.GetMultiMode() != Transaction::NOT_DETERMINED) {
    // We may be running EVAL under MULTI. Currently RunSingleShardMulti() will attempt to lock
    // keys, in which case will be already locked by MULTI. We could optimize this path as well
    // though.
    return false;
  }

  // If we have only a single shard, we can run a global command without hops
  if (shard_set->size() == 1 && multi_mode == Transaction::GLOBAL)
    return true;

  return one_shard && multi_mode == Transaction::LOCK_AHEAD;
}

void Service::EvalInternal(CmdArgList args, const EvalArgs& eval_args, Interpreter* interpreter,
                           bool read_only, CommandContext* cmd_cntx) {
  const static size_t kShaSize = 40;
  static_assert(sizeof(ConnectionState::ScriptInfo::Stats::sha) == kShaSize);

  // Sanitizing the input to avoid code injection.
  if (eval_args.sha.size() != kShaSize || !IsSHA(eval_args.sha)) {
    return cmd_cntx->SendError(facade::kScriptNotFound);
  }

  auto* ss = ServerState::tlocal();
  auto params = ss->GetScriptParams(eval_args.sha);
  if (!params) {
    return cmd_cntx->SendError(facade::kScriptNotFound);
  }

  LoadScript(eval_args.sha, server_family_.script_mgr(), interpreter);

  string error;
  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  DCHECK(!conn_cntx->conn_state.script_info);  // we should not call eval from the script.

  // TODO: to determine whether the script is RO by scanning all "redis.p?call" calls
  // and checking whether all invocations consist of RO commands.
  // we can do it once during script insertion into script mgr.
  auto& sinfo = conn_cntx->conn_state.script_info;
  sinfo = make_unique<ConnectionState::ScriptInfo>();
  sinfo->lock_tags.reserve(eval_args.keys.size());
  sinfo->read_only = read_only;
  memcpy(sinfo->stats.sha, eval_args.sha.data(), eval_args.sha.size());

  optional<ShardId> sid{nullopt};
  UniqueSlotChecker slot_checker;
  for (size_t i = 0; i < eval_args.keys.size(); ++i) {
    string_view key = ArgS(eval_args.keys, i);
    slot_checker.Add(key);
    sinfo->lock_tags.insert(LockTag(key));

    ShardId cur_sid = Shard(key, shard_count());
    if (i == 0) {
      sid = cur_sid;
    }
    if (sid.has_value() && *sid != cur_sid) {
      sid = nullopt;
    }
  }

  sinfo->async_cmds_heap_limit = GetFlag(FLAGS_multi_eval_squash_buffer);
  Transaction* tx = cmd_cntx->tx();
  CHECK(tx != nullptr);

  Interpreter::RunResult result;
  Transaction::MultiMode script_mode = DetermineMultiMode(*params);

  interpreter->SetGlobalArray("KEYS", eval_args.keys);
  interpreter->SetGlobalArray("ARGV", eval_args.args);

  // Reset cid to EVAL[] as the context is reused during command dispatch
  absl::Cleanup clean = [interpreter, cmd_cntx, cid = cmd_cntx->cid()]() {
    interpreter->ResetStack();
    cmd_cntx->SetupTx(cid, cmd_cntx->tx());
  };

  if (CanRunSingleShardMulti(sid.has_value(), script_mode, *tx)) {
    sinfo->stats.tx_shards = 1;
    // It might be that there are no declared keys, but there is only a single shard
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
    DCHECK(sid.has_value() || shard_set->size() == 1);
    ShardId real_sid = sid.value_or(ShardId(0));
#pragma GCC diagnostic pop

    // If script runs on a single shard, we run it remotely to save hops.
    interpreter->SetRedisFunc([cmd_cntx, this](Interpreter::CallArgs args) {
      // Disable squashing, as we're using the squashing mechanism to run remotely.
      args.async = false;
      CallFromScript(args, cmd_cntx);
    });

    ++ss->stats.eval_shardlocal_coordination_cnt;
    tx->PrepareSingleSquash(conn_cntx->ns, real_sid, conn_cntx->db_index(), eval_args.keys,
                            script_mode);

    tx->ScheduleSingleHop([&](Transaction*, EngineShard*) {
      boost::intrusive_ptr<Transaction> stub_tx =
          new Transaction{tx, real_sid, slot_checker.GetUniqueSlotId()};
      conn_cntx->transaction = stub_tx.get();

      result = interpreter->RunFunction(eval_args.sha, &error);

      conn_cntx->transaction = tx;
      return OpStatus::OK;
    });

    // Migration only makes sense if there are distinct shards
    if (sid.has_value() && *sid != ss->thread_index()) {
      VLOG(2) << "Migrating connection " << conn_cntx->conn() << " from "
              << ProactorBase::me()->GetPoolIndex() << " to " << real_sid;
      conn_cntx->conn()->RequestAsyncMigration(shard_set->pool()->at(real_sid), false);
    }
  } else {
    Transaction::MultiMode tx_mode = tx->GetMultiMode();
    bool scheduled = false;

    // Check if eval is already part of a running multi transaction
    if (tx_mode != Transaction::NOT_DETERMINED) {
      if (tx_mode > script_mode) {
        string err = StrCat(
            "Multi mode conflict when running eval in multi transaction. Multi mode is: ", tx_mode,
            " eval mode is: ", script_mode);
        return cmd_cntx->SendError(err);
      }
    } else {
      scheduled = StartMulti(conn_cntx, script_mode, eval_args.keys);
      sinfo->stats.tx_shards = tx->GetUniqueShardCnt();
    }

    ++ss->stats.eval_io_coordination_cnt;
    interpreter->SetRedisFunc(
        [cmd_cntx, this](Interpreter::CallArgs args) { CallFromScript(args, cmd_cntx); });

    result = interpreter->RunFunction(eval_args.sha, &error);

    if (auto err = FlushEvalAsyncCmds(conn_cntx, true); err) {
      auto err_ref = CapturingReplyBuilder::TryExtractError(*err);
      result = Interpreter::RUN_ERR;
      error = absl::StrCat(err_ref->first);
    }

    // Conclude the transaction.
    if (scheduled)
      tx->UnlockMulti();
  }

  sinfo->stats.tx_mode = script_mode;

  if (result == Interpreter::RUN_ERR) {
    string resp = StrCat("Error running script (call to ", eval_args.sha, "): ", error);
    server_family_.script_mgr()->OnScriptError(eval_args.sha, error);
    return cmd_cntx->SendError(resp, facade::kScriptErrType);
  }

  CHECK(result == Interpreter::RUN_OK);

  // TODO(vlad): Investigate if using ReplyScope here is possible with a different serialization
  // strategy due to currently SerializeResult destructuring a value while serializing
  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  SinkReplyBuilder::ReplyAggregator agg(builder);
  EvalSerializer ser{builder, params->float_as_int};
  if (!interpreter->IsResultSafe()) {
    builder->SendError("reached lua stack limit");
  } else {
    interpreter->SerializeResult(&ser);
  }
}

void Service::Discard(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto* cntx = cmd_cntx->server_conn_cntx();
  if (!cntx->conn_state.exec_info.IsCollecting()) {
    return rb->SendError("DISCARD without MULTI");
  }

  rb->SendOk();
}

// Return true if non of the connections watched keys expired.
bool CheckWatchedKeyExpiry(ConnectionContext* cntx, const CommandId* exists_cid,
                           const CommandId* exec_cid) {
  auto& exec_info = cntx->conn_state.exec_info;
  auto* tx = cntx->transaction;

  CmdArgVec str_list(exec_info.watched_keys.size());
  for (size_t i = 0; i < str_list.size(); i++) {
    auto& [db, s] = exec_info.watched_keys[i];
    str_list[i] = MutableSlice{s.data(), s.size()};
  }

  atomic_uint32_t watch_exist_count{0};
  auto cb = [&watch_exist_count](Transaction* t, EngineShard* shard) {
    ShardArgs args = t->GetShardArgs(shard->shard_id());
    auto res = GenericFamily::OpExists(t->GetOpArgs(shard), args);
    watch_exist_count.fetch_add(res.value_or(0), memory_order_relaxed);

    return OpStatus::OK;
  };

  tx->MultiSwitchCmd(exists_cid);
  tx->InitByArgs(cntx->ns, cntx->conn_state.db_index, CmdArgList{str_list});
  OpStatus status = tx->ScheduleSingleHop(std::move(cb));
  CHECK_EQ(OpStatus::OK, status);

  // Reset cid to EXEC as it was before
  tx->MultiSwitchCmd(exec_cid);

  // The comparison can still be true even if a key expired due to another one being created.
  // So we have to check the watched_dirty flag, which is set if a key expired.
  return watch_exist_count.load() == exec_info.watched_existed &&
         !exec_info.watched_dirty.load(memory_order_relaxed);
}

// Check if exec_info watches keys on dbs other than db_indx.
bool IsWatchingOtherDbs(DbIndex db_indx, const ConnectionState::ExecInfo& exec_info) {
  return std::any_of(exec_info.watched_keys.begin(), exec_info.watched_keys.end(),
                     [db_indx](const auto& pair) { return pair.first != db_indx; });
}

template <typename F> void IterateAllKeys(const ConnectionState::ExecInfo* exec_info, F&& f) {
  for (auto& [dbid, key] : exec_info->watched_keys)
    f(MutableSlice{key.data(), key.size()});

  CmdArgVec arg_vec{};

  for (const auto& scmd : exec_info->body) {
    if (!scmd.Cid()->IsTransactional())
      continue;

    auto args = scmd.Slice(&arg_vec);
    auto key_res = DetermineKeys(scmd.Cid(), args);
    if (!key_res.ok())
      continue;

    for (unsigned i : key_res->Range())
      f(arg_vec[i]);
  }
}

CmdArgVec CollectAllKeys(ConnectionState::ExecInfo* exec_info) {
  CmdArgVec out;
  out.reserve(exec_info->watched_keys.size() + exec_info->body.size());

  IterateAllKeys(exec_info, [&out](MutableSlice key) { out.push_back(key); });

  return out;
}

void Service::Exec(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto* cntx = cmd_cntx->server_conn_cntx();
  auto& exec_info = cntx->conn_state.exec_info;

  if (exec_info.state == ConnectionState::ExecInfo::EXEC_ERROR) {
    return rb->SendError("-EXECABORT Transaction discarded because of previous errors");
  }

  // Check basic invariants
  if (!exec_info.IsCollecting()) {
    return rb->SendError("EXEC without MULTI");
  }

  if (IsWatchingOtherDbs(cntx->db_index(), exec_info)) {
    return rb->SendError("Dragonfly does not allow WATCH and EXEC on different databases");
  }

  if (exec_info.watched_dirty.load(memory_order_relaxed)) {
    return rb->SendNull();
  }

  auto keys = CollectAllKeys(&exec_info);
  if (IsClusterEnabled()) {
    UniqueSlotChecker slot_checker;
    for (const auto& s : keys) {
      slot_checker.Add(s);
    }

    if (slot_checker.IsCrossSlot()) {
      return rb->SendError(kCrossSlotError);
    }
  }

  // The transaction can contain script load script execution, determine their presence ahead to
  // customize logic below.
  ExecScriptUse state = DetermineScriptPresense(exec_info.body);

  // We borrow a single interpreter for all the EVALs/Script load inside. Returned by MultiCleanup
  if (state != ExecScriptUse::NONE) {
    exec_info.preborrowed_interpreter =
        BorrowedInterpreter(cmd_cntx->tx(), &cntx->conn_state).Release();
  }

  // Determine according multi mode, not only only flag, but based on presence of global commands
  // and scripts
  Transaction::MultiMode multi_mode = DeduceExecMode(state, exec_info, *script_mgr());

  bool scheduled = false;
  if (multi_mode != Transaction::NOT_DETERMINED) {
    scheduled = StartMulti(cntx, multi_mode, keys);
  }

  // EXEC should not run if any of the watched keys expired.
  if (!exec_info.watched_keys.empty() &&
      !CheckWatchedKeyExpiry(cntx, registry_.Find("EXISTS"), exec_cid_)) {
    cmd_cntx->tx()->UnlockMulti();
    return rb->SendNull();
  }

  exec_info.state = ConnectionState::ExecInfo::EXEC_RUNNING;

  VLOG(2) << "StartExec " << exec_info.body.size();

  // Make sure we flush whatever responses we aggregated in the reply builder.
  SinkReplyBuilder::ReplyAggregator agg(rb);
  rb->StartArray(exec_info.body.size());

  if (!exec_info.body.empty()) {
    string descr = CreateExecDescriptor(exec_info.body, cmd_cntx->tx()->GetUniqueShardCnt());
    ServerState::tlocal()->exec_freq_count[descr]++;

    if (GetFlag(FLAGS_multi_exec_squash) && state != ExecScriptUse::SCRIPT_RUN &&
        !cntx->conn_state.tracking_info_.IsTrackingOn()) {
      MultiCommandSquasher::Opts opts;
      opts.max_squash_size = ServerState::tlocal()->max_squash_cmd_num;
      MultiCommandSquasher::Execute(absl::MakeSpan(exec_info.body), rb, cntx, this, opts);
    } else {
      CmdArgVec arg_vec;
      DCHECK_EQ(cmd_cntx->cid(), exec_cid_);

      for (const auto& scmd : exec_info.body) {
        CmdArgList args = scmd.Slice(&arg_vec);

        if (scmd.Cid()->IsTransactional()) {
          cmd_cntx->tx()->MultiSwitchCmd(scmd.Cid());
          OpStatus st = cmd_cntx->tx()->InitByArgs(cntx->ns, cntx->conn_state.db_index, args);
          if (st != OpStatus::OK) {
            cmd_cntx->SendError(st);
            break;
          }
        }

        // TODO: we will have to create a CommandContext per command if we want to support async
        // execution inside exec.
        cmd_cntx->UpdateCid(scmd.Cid());
        auto invoke_res = InvokeCmd(args, cmd_cntx);
        if ((invoke_res != DispatchResult::OK) ||
            rb->GetError())  // checks for i/o error, not logical error.
          break;
      }
      cmd_cntx->UpdateCid(exec_cid_);
    }
  }

  if (scheduled) {
    VLOG(2) << "Exec unlocking " << exec_info.body.size() << " commands";
    cmd_cntx->tx()->UnlockMulti();
  }

  // Dispatch at the end manually to have (MULTI, cmds..., EXEC) order
  if (!ServerState::tlocal()->Monitors().Empty()) {
    LOG_IF(DFATAL, exec_cid_->opt_mask() & CO::ADMIN) << "EXEC should be non admin command";
    DispatchMonitor(cntx, exec_cid_, args);
  }

  VLOG(2) << "Exec completed";
}

void Service::Publish(CmdArgList args, CommandContext* cmd_cntx) {
  bool sharded = cmd_cntx->cid()->PubSubKind() == CO::PubSubKind::SHARDED;
  if (!sharded && IsClusterEnabled())
    return cmd_cntx->SendError("PUBLISH is not supported in cluster mode yet");

  string_view channel = ArgS(args, 0);
  string_view messages[] = {ArgS(args, 1)};

  auto* cs = ServerState::tlocal()->channel_store();
  cmd_cntx->SendLong(cs->SendMessages(channel, messages, sharded));
}

void Service::Subscribe(CmdArgList args, CommandContext* cmd_cntx) {
  bool sharded = cmd_cntx->cid()->PubSubKind() == CO::PubSubKind::SHARDED;
  if (!sharded && IsClusterEnabled())
    return cmd_cntx->SendError("SUBSCRIBE is not supported in cluster mode yet");

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  conn_cntx->ChangeSubscription(true /*add*/, true /* reply*/, sharded, args, rb);
}

void Service::Unsubscribe(CmdArgList args, CommandContext* cmd_cntx) {
  bool sharded = cmd_cntx->cid()->PubSubKind() == CO::PubSubKind::SHARDED;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  if (!sharded && IsClusterEnabled())
    return rb->SendError("UNSUBSCRIBE is not supported in cluster mode yet");

  if (args.size() == 0) {
    conn_cntx->UnsubscribeAll(true, rb);
  } else {
    conn_cntx->ChangeSubscription(false, true, sharded, args, rb);
  }
}

void Service::PSubscribe(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (IsClusterEnabled()) {
    return rb->SendError("PSUBSCRIBE is not supported in cluster mode yet");
  }
  cmd_cntx->server_conn_cntx()->ChangePSubscription(true, true, args, rb);
}

void Service::PUnsubscribe(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (IsClusterEnabled()) {
    return rb->SendError("PUNSUBSCRIBE is not supported in cluster mode yet");
  }
  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  if (args.size() == 0) {
    conn_cntx->PUnsubscribeAll(true, rb);
  } else {
    conn_cntx->ChangePSubscription(false, true, args, rb);
  }
}

// Not a real implementation. Serves as a decorator to accept some function commands
// for testing.
void Service::Function(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (sub_cmd == "FLUSH") {
    return cmd_cntx->rb()->SendOk();
  }

  string err = UnknownSubCmd(sub_cmd, "FUNCTION");
  return cmd_cntx->SendError(err, kSyntaxErrType);
}

void Service::PubsubChannels(string_view pattern, SinkReplyBuilder* builder) {
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  rb->SendBulkStrArr(ServerState::tlocal()->channel_store()->ListChannels(pattern));
}

void Service::PubsubPatterns(SinkReplyBuilder* builder) {
  size_t pattern_count = ServerState::tlocal()->channel_store()->PatternCount();
  builder->SendLong(pattern_count);
}

void Service::PubsubNumSub(CmdArgList args, SinkReplyBuilder* builder) {
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  rb->StartArray(args.size() * 2);
  for (string_view channel : args) {
    rb->SendBulkString(channel);
    rb->SendLong(ServerState::tlocal()->channel_store()->FetchSubscribers(channel).size());
  }
}

void Service::Monitor(CmdArgList args, CommandContext* cmd_cntx) {
  VLOG(1) << "starting monitor on this connection: "
          << cmd_cntx->server_conn_cntx()->conn()->GetClientId();
  // we are registering the current connection for all threads so they will be aware of
  // this connection, to send to it any command
  cmd_cntx->rb()->SendOk();
  cmd_cntx->server_conn_cntx()->ChangeMonitor(true /* start */);
}

void Service::Pubsub(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (args.size() < 1) {
    rb->SendError(WrongNumArgsError(cmd_cntx->cid()->name()));
    return;
  }

  string subcmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (subcmd == "HELP") {
    string_view help_arr[] = {
        "PUBSUB <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        "CHANNELS [<pattern>]",
        "\tReturn the currently active channels matching a <pattern> (default: '*').",
        "NUMPAT",
        "\tReturn number of subscriptions to patterns.",
        "NUMSUB [<channel> <channel...>]",
        "\tReturns the number of subscribers for the specified channels, excluding",
        "\tpattern subscriptions.",
        "SHARDCHANNELS [pattern]",
        "\tReturns a list of active shard channels, optionally matching the specified pattern ",
        "(default: '*').",
        "SHARDNUMSUB [<channel> <channel...>]",
        "\tReturns the number of subscribers for the specified shard channels, excluding",
        "\tpattern subscriptions.",
        "HELP",
        "\tPrints this help."};

    rb->SendSimpleStrArr(help_arr);
    return;
  }

  // Don't allow SHARD subcommands in non cluster mode
  if (!IsClusterEnabledOrEmulated() && ((subcmd == "SHARDCHANNELS") || (subcmd == "SHARDNUMSUB"))) {
    auto err = absl::StrCat("PUBSUB ", subcmd, " is not supported in non cluster mode");
    return rb->SendError(err);
  }

  if (subcmd == "CHANNELS" || subcmd == "SHARDCHANNELS") {
    string_view pattern;
    if (args.size() > 1) {
      pattern = ArgS(args, 1);
    }
    PubsubChannels(pattern, rb);
  } else if (subcmd == "NUMPAT") {
    PubsubPatterns(rb);
  } else if (subcmd == "NUMSUB" || subcmd == "SHARDNUMSUB") {
    args.remove_prefix(1);
    PubsubNumSub(args, rb);
  } else {
    rb->SendError(UnknownSubCmd(subcmd, "PUBSUB"));
  }
}

void Service::Command(CmdArgList args, CommandContext* cmd_cntx) {
  unsigned cmd_cnt = 0;
  registry_.Traverse([&](string_view name, const CommandId& cd) {
    if ((cd.opt_mask() & CO::HIDDEN) == 0) {
      ++cmd_cnt;
    }
  });

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto serialize_command = [rb, this](string_view name, const CommandId& cid) {
    rb->StartArray(7);
    rb->SendSimpleString(cid.name());
    rb->SendLong(cid.arity());

    vector<string> opts;
    for (uint32_t i = 0; i < 32; i++) {
      unsigned obit = (1u << i);
      if (auto name = CommandOptName(CO::CommandOpt{obit}, cid.opt_mask() & obit); !name.empty())
        opts.emplace_back(name);
    }
    rb->SendSimpleStrArr(opts);

    rb->SendLong(cid.first_key_pos());
    rb->SendLong(cid.last_key_pos());
    rb->SendLong(cid.interleaved_step() ? cid.interleaved_step() : 1);

    {
      const auto& table = acl_family_.GetRevTable();
      vector<string> cats;
      for (uint32_t i = 0; i < 32; i++) {
        if (cid.acl_categories() & (1 << i)) {
          cats.emplace_back("@" + table[i]);
        }
      }
      rb->SendSimpleStrArr(cats);
    }
  };

  // If no arguments are specified, reply with all commands
  if (args.empty()) {
    rb->StartArray(cmd_cnt);
    registry_.Traverse([&](string_view name, const CommandId& cid) {
      if (cid.opt_mask() & CO::HIDDEN)
        return;
      serialize_command(name, cid);
    });
    return;
  }

  string subcmd = absl::AsciiStrToUpper(ArgS(args, 0));

  // COUNT
  if (subcmd == "COUNT") {
    return rb->SendLong(cmd_cnt);
  }

  bool sufficient_args = (args.size() == 2);

  // INFO [cmd]
  if (subcmd == "INFO" && sufficient_args) {
    string cmd = absl::AsciiStrToUpper(ArgS(args, 1));

    if (const auto* cid = registry_.Find(cmd); cid) {
      rb->StartArray(1);
      serialize_command(cmd, *cid);
    } else {
      rb->SendNull();
    }

    return;
  }

  sufficient_args = (args.size() == 1);
  if (subcmd == "DOCS" && sufficient_args) {
    // Returning an error here forces the interactive CLI client to fall back to static hints and
    // tab completion
    return rb->SendError("COMMAND DOCS Not Implemented");
  }

  if (subcmd == "HELP" && sufficient_args) {
    // Return help information for supported COMMAND subcommands
    constexpr string_view help[] = {
        "(no subcommand)",
        "    Return details about all commands.",
        "INFO command-name",
        "    Return details about specified command.",
        "COUNT",
        "    Return the total number of commands in this server.",
    };
    return rb->SendSimpleStrArr(help);
  }

  return rb->SendError(kSyntaxErr, kSyntaxErrType);
}

VarzValue::Map Service::GetVarzStats() {
  VarzValue::Map res;

  Metrics m = server_family_.GetMetrics(&namespaces->GetDefaultNamespace());
  DbStats db_stats;
  for (const auto& s : m.db_stats) {
    db_stats += s;
  }

  res.emplace_back("keys", VarzValue::FromInt(db_stats.key_count));
  res.emplace_back("obj_mem_usage", VarzValue::FromInt(db_stats.obj_memory_usage));
  double load = double(db_stats.key_count) / (1 + db_stats.prime_capacity);
  res.emplace_back("table_load_factor", VarzValue::FromDouble(load));

  return res;
}

GlobalState Service::SwitchState(GlobalState from, GlobalState to) {
  util::fb2::LockGuard lk(mu_);
  GlobalState prev = global_state_;
  if (global_state_ != from) {
    return prev;
  }

  VLOG(1) << "Switching state from " << from << " to " << to;
  global_state_ = to;

  pp_.Await([&](ProactorBase*) {
    ServerState::tlocal()->set_gstate(to);
    auto* es = EngineShard::tlocal();
    if (es && to == GlobalState::ACTIVE) {
      DbSlice& db = namespaces->GetDefaultNamespace().GetDbSlice(es->shard_id());
      DCHECK(db.IsLoadRefCountZero());
    }
  });
  return prev;
}

bool Service::RequestLoadingState() {
  GlobalState prev = SwitchState(GlobalState::ACTIVE, GlobalState::LOADING);
  if (prev == GlobalState::ACTIVE || prev == GlobalState::LOADING) {
    util::fb2::LockGuard lk(mu_);
    loading_state_counter_++;
    return true;
  }
  return false;
}

void Service::RemoveLoadingState() {
  bool switch_state = false;
  {
    util::fb2::LockGuard lk(mu_);
    CHECK_GT(loading_state_counter_, 0u);
    --loading_state_counter_;
    switch_state = loading_state_counter_ == 0;
  }
  if (switch_state) {
    SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
  }
}

bool Service::IsLoadingExclusively() {
  util::fb2::LockGuard lk(mu_);
  return global_state_ == GlobalState::LOADING && loading_state_counter_ == 0;
}

void Service::ConfigureHttpHandlers(util::HttpListenerBase* base, bool is_privileged) {
  // We skip authentication on privileged listener if the flag admin_nopass is set
  // We also skip authentication if requirepass is empty
  const bool should_skip_auth =
      (is_privileged && !RequirePrivilegedAuth()) || GetPassword().empty();
  if (!should_skip_auth) {
    base->SetAuthFunctor([pass = GetPassword()](std::string_view path, std::string_view username,
                                                std::string_view password) {
      if (path == "/metrics")
        return true;
      const bool pass_verified = pass.empty() ? true : password == pass;
      return username == "default" && pass_verified;
    });
  }
  server_family_.ConfigureMetrics(base);

  if (GetFlag(FLAGS_expose_http_api)) {
    base->RegisterCb("/api",
                     [this](const http::QueryArgs& args, HttpRequest&& req, HttpContext* send) {
                       HttpAPI(args, std::move(req), this, send);
                     });
  }
}

void Service::OnConnectionClose(facade::ConnectionContext* cntx) {
  ConnectionContext* server_cntx = static_cast<ConnectionContext*>(cntx);
  ConnectionState& conn_state = server_cntx->conn_state;
  VLOG_IF(1, conn_state.replication_info.repl_session_id)
      << "OnConnectionClose: " << server_cntx->conn()->GetName()
      << ", repl_session_id: " << conn_state.replication_info.repl_session_id;

  if (conn_state.subscribe_info) {  // Clean-ups related to PUBSUB
    if (!conn_state.subscribe_info->channels.empty()) {
      server_cntx->UnsubscribeAll(false, nullptr);
    }

    if (conn_state.subscribe_info) {
      DCHECK(!conn_state.subscribe_info->patterns.empty());
      server_cntx->PUnsubscribeAll(false, nullptr);
    }

    DCHECK(!conn_state.subscribe_info);
  }

  UnwatchAllKeys(server_cntx->ns, &conn_state.exec_info);

  DeactivateMonitoring(server_cntx);

  server_family_.OnClose(server_cntx);

  conn_state.tracking_info_.SetClientTracking(false);
}

void Service::RegisterTieringFlags() {
#ifdef WITH_TIERING
  // TODO(vlad): Introduce templatable flag cache
  auto update_tiered_storage = [](auto) {
    shard_set->pool()->AwaitBrief([](unsigned, auto*) {
      if (auto* es = EngineShard::tlocal(); es && es->tiered_storage()) {
        es->tiered_storage()->UpdateFromFlags();
      }
    });
  };
  config_registry.RegisterSetter<bool>("tiered_experimental_cooling", update_tiered_storage);
  config_registry.RegisterSetter<unsigned>("tiered_storage_write_depth", update_tiered_storage);
  config_registry.RegisterSetter<float>("tiered_offload_threshold", update_tiered_storage);
  config_registry.RegisterSetter<float>("tiered_upload_threshold", update_tiered_storage);
#endif
}

Service::ContextInfo Service::GetContextInfo(facade::ConnectionContext* cntx) const {
  ConnectionContext* server_cntx = static_cast<ConnectionContext*>(cntx);
  return {.db_index = server_cntx->db_index(),
          .async_dispatch = server_cntx->async_dispatch,
          .conn_closing = server_cntx->conn_closing,
          .subscribers = bool(server_cntx->conn_state.subscribe_info),
          .blocked = server_cntx->blocked};
}

#define HFUNC(x) SetHandler(&Service::x)
#define MFUNC(x) \
  SetHandler([this](CmdArgList sp, CommandContext* cntx) { this->x(std::move(sp), cntx); })

namespace acl {
constexpr uint32_t kQuit = FAST | CONNECTION;
constexpr uint32_t kMulti = FAST | TRANSACTION;
constexpr uint32_t kWatch = FAST | TRANSACTION;
constexpr uint32_t kUnwatch = FAST | TRANSACTION;
constexpr uint32_t kDiscard = FAST | TRANSACTION;
constexpr uint32_t kEval = SLOW | SCRIPTING;
constexpr uint32_t kEvalRo = SLOW | SCRIPTING;
constexpr uint32_t kEvalSha = SLOW | SCRIPTING;
constexpr uint32_t kEvalShaRo = SLOW | SCRIPTING;
constexpr uint32_t kExec = SLOW | TRANSACTION;
constexpr uint32_t kPublish = PUBSUB | FAST;
constexpr uint32_t kSubscribe = PUBSUB | SLOW;
constexpr uint32_t kUnsubscribe = PUBSUB | SLOW;
constexpr uint32_t kPSubscribe = PUBSUB | SLOW;
constexpr uint32_t kPUnsubsribe = PUBSUB | SLOW;
constexpr uint32_t kFunction = SLOW;
constexpr uint32_t kMonitor = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kPubSub = SLOW;
constexpr uint32_t kCommand = SLOW | CONNECTION;
}  // namespace acl

void Service::Register(CommandRegistry* registry) {
  using CI = CommandId;
  registry->StartFamily();
  *registry
      << CI{"QUIT", CO::FAST, 1, 0, 0, acl::kQuit}.HFUNC(Quit)
      << CI{"MULTI", CO::NOSCRIPT | CO::FAST | CO::LOADING, 1, 0, 0, acl::kMulti}.HFUNC(Multi)
      << CI{"WATCH", CO::LOADING, -2, 1, -1, acl::kWatch}.HFUNC(Watch)
      << CI{"UNWATCH", CO::LOADING, 1, 0, 0, acl::kUnwatch}.HFUNC(Unwatch)
      << CI{"DISCARD", CO::NOSCRIPT | CO::FAST | CO::LOADING, 1, 0, 0, acl::kDiscard}.MFUNC(Discard)
      << CI{"EVAL", CO::NOSCRIPT | CO::VARIADIC_KEYS, -3, 3, 3, acl::kEval}
             .MFUNC(Eval)
             .SetValidator(&EvalValidator)
      << CI{"EVAL_RO", CO::NOSCRIPT | CO::READONLY | CO::VARIADIC_KEYS, -3, 3, 3, acl::kEvalRo}
             .MFUNC(EvalRo)
             .SetValidator(&EvalValidator)
      << CI{"EVALSHA", CO::NOSCRIPT | CO::VARIADIC_KEYS, -3, 3, 3, acl::kEvalSha}
             .MFUNC(EvalSha)
             .SetValidator(&EvalValidator)
      << CI{"EVALSHA_RO",   CO::NOSCRIPT | CO::READONLY | CO::VARIADIC_KEYS, -3, 3, 3,
            acl::kEvalShaRo}
             .MFUNC(EvalShaRo)
             .SetValidator(&EvalValidator)
      << CI{"EXEC", CO::LOADING | CO::NOSCRIPT, 1, 0, 0, acl::kExec}.MFUNC(Exec)
      << CI{"PUBLISH", CO::LOADING | CO::FAST, 3, 0, 0, acl::kPublish}.MFUNC(Publish)
      << CI{"SPUBLISH", CO::LOADING | CO::FAST, 3, 0, 0, acl::kPublish}.MFUNC(Publish)
      << CI{"SUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -2, 0, 0, acl::kSubscribe}.MFUNC(Subscribe)
      << CI{"SSUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -2, 0, 0, acl::kSubscribe}.MFUNC(Subscribe)
      << CI{"UNSUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -1, 0, 0, acl::kUnsubscribe}.MFUNC(
             Unsubscribe)
      << CI{"SUNSUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -1, 0, 0, acl::kUnsubscribe}.MFUNC(
             Unsubscribe)
      << CI{"PSUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -2, 0, 0, acl::kPSubscribe}.MFUNC(PSubscribe)
      << CI{"PUNSUBSCRIBE", CO::NOSCRIPT | CO::LOADING, -1, 0, 0, acl::kPUnsubsribe}.MFUNC(
             PUnsubscribe)
      << CI{"FUNCTION", CO::NOSCRIPT, 2, 0, 0, acl::kFunction}.MFUNC(Function)
      << CI{"MONITOR", CO::ADMIN, 1, 0, 0, acl::kMonitor}.MFUNC(Monitor)
      << CI{"PUBSUB", CO::LOADING | CO::FAST, -1, 0, 0, acl::kPubSub}.MFUNC(Pubsub)
      << CI{"COMMAND", CO::LOADING | CO::NOSCRIPT, -1, 0, 0, acl::kCommand}.MFUNC(Command);
}

void Service::RegisterCommands() {
  Register(&registry_);
  server_family_.Register(&registry_);
  GenericFamily::Register(&registry_);
  RegisterListFamily(&registry_);
  RegisterStringFamily(&registry_);

#ifdef WITH_COLLECTION_CMDS
  SetFamily::Register(&registry_);
  HSetFamily::Register(&registry_);
  ZSetFamily::Register(&registry_);
  StreamFamily::Register(&registry_);
#endif

#ifdef WITH_EXTENSION_CMDS
  RegisterGeoFamily(&registry_);
  RegisterBitopsFamily(&registry_);
  RegisterHllFamily(&registry_);
  RegisterBloomFamily(&registry_);
  RegisterCmsFamily(&registry_);
  RegisterJsonFamily(&registry_);
#endif

#ifdef WITH_SEARCH
  SearchFamily::Register(&registry_);
#endif

  cluster_family_.Register(&registry_);

  // AclFamily should always be registered last
  // If we add a new familly, register that first above and *not* below
  acl_family_.Register(&registry_);

  // Only after all the commands are registered
  registry_.Init(pp_.size());

  using CI = CommandId;
  if (VLOG_IS_ON(2)) {
    LOG(INFO) << "Multi-key commands are: ";
    registry_.Traverse([](std::string_view key, const CI& cid) {
      if (cid.is_multi_key()) {
        string key_len;
        if (cid.last_key_pos() < 0)
          key_len = "unlimited";
        else
          key_len = StrCat(cid.last_key_pos() - cid.first_key_pos() + 1);
        LOG(INFO) << "    " << key << ": with " << key_len << " keys";
      }
    });

    LOG(INFO) << "Non-transactional commands are: ";
    registry_.Traverse([](std::string_view name, const CI& cid) {
      if (cid.IsTransactional()) {
        LOG(INFO) << "    " << name;
      }
    });
  }
}

const acl::AclFamily* Service::TestInit() {
  acl_family_.Init(nullptr, &user_registry_);
  return &acl_family_;
}

}  // namespace dfly


================================================
FILE: src/server/main_service.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "base/varz_value.h"
#include "core/interpreter.h"
#include "facade/service_interface.h"
#include "server/acl/acl_commands_def.h"
#include "server/acl/acl_family.h"
#include "server/acl/user_registry.h"
#include "server/cluster/cluster_family.h"
#include "server/command_registry.h"
#include "server/config_registry.h"
#include "server/engine_shard_set.h"
#include "server/server_family.h"

namespace util {
class AcceptServer;
}  // namespace util

namespace dfly {

using facade::MemcacheParser;

class Service : public facade::ServiceInterface {
 public:
  explicit Service(util::ProactorPool* pp);
  ~Service();

  void Init(util::AcceptServer* acceptor, std::vector<facade::Listener*> listeners);

  void Shutdown();

  // Prepare command execution, verify and execute, reply to context
  facade::DispatchResult DispatchCommand(facade::ParsedArgs args, facade::ParsedCommand* parsed_cmd,
                                         facade::AsyncPreference apref) final;

  // Execute multiple consecutive commands, possibly in parallel by squashing
  facade::DispatchManyResult DispatchManyCommands(std::function<facade::ParsedArgs()> arg_gen,
                                                  unsigned count, facade::SinkReplyBuilder* builder,
                                                  facade::ConnectionContext* cntx) final;

  // Check OOM and invoke command with args
  facade::DispatchResult InvokeCmd(CmdArgList tail_args, CommandContext* cmd_cntx);

  // Verify command prepares execution in correct state.
  // It's usually called before command execution. Only for multi/exec transactions it's checked
  // when the command is queued for execution, not before the execution itself.
  std::optional<facade::ErrorReply> VerifyCommandState(const CommandId& cid, ArgSlice tail_args,
                                                       const ConnectionContext& cntx);

  facade::DispatchResult DispatchMC(facade::ParsedCommand* parsed_cmd,
                                    facade::AsyncPreference apref) final;

  facade::ConnectionContext* CreateContext(facade::Connection* owner) final;
  facade::ParsedCommand* AllocateParsedCommand() final;

  const CommandId* FindCmd(std::string_view) const;

  CommandRegistry* mutable_registry() {
    return &registry_;
  }

  facade::ErrorReply ReportUnknownCmd(std::string_view cmd_name) ABSL_LOCKS_EXCLUDED(mu_);

  // Attempts to switch global state from 'from' to 'to'.
  // Returns the PREVIOUS global state (before the switch attempt).
  // If from equals the previous state then the switch is performed and 'from' is returned.
  // Otherwise, does not switch and returns the current (unchanged) state.
  // Upon switch, updates cached global state in threadlocal ServerState struct.
  GlobalState SwitchState(GlobalState from, GlobalState to) ABSL_LOCKS_EXCLUDED(mu_);

  bool RequestLoadingState() ABSL_LOCKS_EXCLUDED(mu_);
  void RemoveLoadingState() ABSL_LOCKS_EXCLUDED(mu_);

  // Return true if state is LOADING and loading_state_counter_ == 0, that is,
  // if no multiple operations require LOADING_STATE at the same time.
  bool IsLoadingExclusively() ABSL_LOCKS_EXCLUDED(mu_);

  void ConfigureHttpHandlers(util::HttpListenerBase* base, bool is_privileged) final;
  void OnConnectionClose(facade::ConnectionContext* cntx) final;

  Service::ContextInfo GetContextInfo(facade::ConnectionContext* cntx) const final;

  uint32_t shard_count() const {
    return shard_set->size();
  }

  // Used by tests.
  bool IsLocked(Namespace* ns, DbIndex db_index, std::string_view key) const;
  bool IsShardSetLocked() const;

  util::ProactorPool& proactor_pool() {
    return pp_;
  }

  absl::flat_hash_map<std::string, unsigned> UknownCmdMap() const;

  ScriptMgr* script_mgr() {
    return server_family_.script_mgr();
  }

  const ScriptMgr* script_mgr() const {
    return server_family_.script_mgr();
  }

  ServerFamily& server_family() {
    return server_family_;
  }

  cluster::ClusterFamily& cluster_family() {
    return cluster_family_;
  }

  // Utility function used in unit tests
  // Do not use in production, only meant to be used by unit tests
  const acl::AclFamily* TestInit();

 private:
  using SinkReplyBuilder = facade::SinkReplyBuilder;

  static void Quit(CmdArgList args, CommandContext* cmd_cntx);
  static void Multi(CmdArgList args, CommandContext* cmd_cntx);

  static void Watch(CmdArgList args, CommandContext* cmd_cntx);
  static void Unwatch(CmdArgList args, CommandContext* cmd_cntx);

  void Discard(CmdArgList args, CommandContext* cmd_cntx);
  void Eval(CmdArgList args, CommandContext* cmd_cntx, bool read_only = false);
  void EvalRo(CmdArgList args, CommandContext* cmd_cntx);
  void EvalSha(CmdArgList args, CommandContext* cmd_cntx, bool read_only = false);
  void EvalShaRo(CmdArgList args, CommandContext* cmd_cntx);
  void Exec(CmdArgList args, CommandContext* cmd_cntx);
  void Publish(CmdArgList args, CommandContext* cmd_cntx);
  void Subscribe(CmdArgList args, CommandContext* cmd_cntx);
  void Unsubscribe(CmdArgList args, CommandContext* cmd_cntx);
  void PSubscribe(CmdArgList args, CommandContext* cmd_cntx);
  void PUnsubscribe(CmdArgList args, CommandContext* cmd_cntx);
  void Function(CmdArgList args, CommandContext* cmd_cntx);
  void Monitor(CmdArgList args, CommandContext* cmd_cntx);
  void Pubsub(CmdArgList args, CommandContext* cmd_cntx);
  void Command(CmdArgList args, CommandContext* cmd_cntx);

  void PubsubChannels(std::string_view pattern, SinkReplyBuilder* builder);
  void PubsubPatterns(SinkReplyBuilder* builder);
  void PubsubNumSub(CmdArgList channels, SinkReplyBuilder* builder);

  struct EvalArgs {
    std::string_view sha;  // only one of them is defined.
    CmdArgList keys, args;
  };

  // Return error if not all keys are owned by the server when running in cluster mode
  std::optional<facade::ErrorReply> CheckKeysOwnership(const CommandId& cid, CmdArgList args,
                                                       const ConnectionContext& dfly_cntx);

  // Return moved error if we *own* the slot. This function is used from flows that assume our
  // state is TAKEN_OVER which happens after a replica takeover.
  std::optional<facade::ErrorReply> TakenOverSlotError(const CommandId& cid, CmdArgList args,
                                                       const ConnectionContext& dfly_cntx);

  void EvalInternal(CmdArgList args, const EvalArgs& eval_args, Interpreter* interpreter,
                    bool read_only, CommandContext* cmd_cntx);
  void CallSHA(CmdArgList args, std::string_view sha, Interpreter* interpreter, bool read_only,
               CommandContext* cmd_cntx);

  // Return optional payload - first received error that occured when executing commands.
  std::optional<facade::payload::Payload> FlushEvalAsyncCmds(ConnectionContext* cntx,
                                                             bool force = false);

  void CallFromScript(Interpreter::CallArgs& args, CommandContext* cmd_cntx);

  OpResult<KeyIndex> FindKeys(const CommandId* cid, CmdArgList args);

  void RegisterCommands();
  void Register(CommandRegistry* registry);
  // Helper for registering tiering flags
  void RegisterTieringFlags();

  base::VarzValue::Map GetVarzStats();

  util::ProactorPool& pp_;

  acl::UserRegistry user_registry_;
  acl::AclFamily acl_family_;
  ServerFamily server_family_;
  cluster::ClusterFamily cluster_family_;
  CommandRegistry registry_;
  absl::flat_hash_map<std::string, unsigned> unknown_cmds_;

  const CommandId* exec_cid_;  // command id of EXEC command for pipeline squashing

  mutable util::fb2::Mutex mu_;
  GlobalState global_state_ ABSL_GUARDED_BY(mu_) = GlobalState::ACTIVE;
  uint32_t loading_state_counter_ ABSL_GUARDED_BY(mu_) = 0;
};

}  // namespace dfly


================================================
FILE: src/server/memory_cmd.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/memory_cmd.h"

#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>

#ifdef __linux__
#include <malloc.h>
#endif

#include <mimalloc.h>

#include "base/flags.h"
#include "core/allocation_tracker.h"
#include "facade/cmd_arg_parser.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "io/io_buf.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"
#include "server/server_family.h"
#include "server/server_state.h"

using namespace std;
using namespace facade;

ABSL_DECLARE_FLAG(float, mem_defrag_page_utilization_threshold);

namespace dfly {

namespace {

void MiStatsCallback(const char* msg, void* arg) {
  string* str = (string*)arg;
  absl::StrAppend(str, msg);
}

// blocksize, reserved, committed, used.
using BlockKey = std::tuple<size_t, size_t, size_t, size_t>;
using BlockMap = absl::flat_hash_map<BlockKey, uint64_t>;

bool MiArenaVisit(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size,
                  void* arg) {
  BlockMap* bmap = (BlockMap*)arg;
  BlockKey bkey{block_size, area->reserved, area->committed, area->used * block_size};
  (*bmap)[bkey]++;

  return true;
}

struct BlockSummary {
  size_t reserved = 0;
  size_t committed = 0;
  size_t used = 0;
};

using BlockSummaryMap = absl::flat_hash_map<size_t, BlockSummary>;

bool MiArenaVisitSummary(const mi_heap_t*, const mi_heap_area_t* area, void*, size_t block_size,
                         void* arg) {
  BlockSummaryMap* bsm = static_cast<BlockSummaryMap*>(arg);
  BlockSummary& block_stats = (*bsm)[block_size];
  block_stats.committed += area->committed;
  block_stats.reserved += area->reserved;
  block_stats.used += area->used * block_size;
  return true;
}

BlockSummaryMap CollectSummary(bool backing) {
  BlockSummaryMap summary;
  const mi_heap_t* data_heap = backing ? mi_heap_get_backing() : ServerState::tlocal()->data_heap();
  mi_heap_visit_blocks(data_heap, false, MiArenaVisitSummary, &summary);
  return summary;
}

vector<BlockSummaryMap> CollectSummaries(bool backing) {
  std::vector<BlockSummaryMap> summaries(shard_set->size());
  shard_set->RunBriefInParallel([&summaries, backing](EngineShard* shard) {
    summaries[shard->shard_id()] = CollectSummary(backing);
  });
  return summaries;
}

void FormatSummary(std::string* str, const BlockSummaryMap& summary) {
  absl::StrAppend(str, absl::StrFormat("%10s %10s %10s %10s %10s %8s\n", "BlockSize", "Reserved",
                                       "Committed", "Used", "Wasted", "Waste%"));
  std::vector<std::pair<size_t, BlockSummary>> entries{summary.begin(), summary.end()};
  std::ranges::sort(entries, {}, [](const auto& entry) {
    const BlockSummary& stats = entry.second;
    return stats.committed > stats.used ? stats.committed - stats.used : 0;
  });

  size_t total_reserved = 0;
  size_t total_committed = 0;
  size_t total_used = 0;

  for (const auto& [size, block_summary] : entries) {
    const size_t wasted = block_summary.committed > block_summary.used
                              ? block_summary.committed - block_summary.used
                              : 0;
    const double waste_pct = 100.0 * wasted / std::max<size_t>(1UL, block_summary.committed);
    absl::StrAppend(str, absl::StrFormat("%10zu %10zu %10zu %10zu %10zu %8.2f%%\n", size,
                                         block_summary.reserved, block_summary.committed,
                                         block_summary.used, wasted, waste_pct));
    total_reserved += block_summary.reserved;
    total_committed += block_summary.committed;
    total_used += block_summary.used;
  }

  const size_t wasted = total_committed > total_used ? total_committed - total_used : 0;
  absl::StrAppend(str, absl::StrFormat("%10s %10zu %10zu %10zu %10zu %8.2f%%\n", "Total:",
                                       total_reserved, total_committed, total_used, wasted,
                                       100.0 * wasted / std::max<size_t>(1UL, total_committed)));
}

string FormatSummaries(const vector<BlockSummaryMap>& summaries) {
  string str;
  BlockSummaryMap machine_wide;
  for (size_t i = 0; i < summaries.size(); ++i) {
    absl::StrAppend(&str, "\nArena statistics for thread ", i, ":\n");
    FormatSummary(&str, summaries[i]);
    for (const auto& [size, block_summary] : summaries[i]) {
      BlockSummary& machine_block = machine_wide[size];
      machine_block.reserved += block_summary.reserved;
      machine_block.committed += block_summary.committed;
      machine_block.used += block_summary.used;
    }
  }

  absl::StrAppend(&str, "\nArena statistics for machine:\n");
  FormatSummary(&str, machine_wide);

  return str;
}

std::string MallocStatsCb(bool backing, unsigned tid) {
  string str;

  uint64_t start = absl::GetCurrentTimeNanos();

  absl::StrAppend(&str, "\nArena statistics from thread:", tid, "\n");

  mi_heap_t* data_heap = backing ? mi_heap_get_backing() : ServerState::tlocal()->data_heap();

  BlockMap block_map;

  mi_heap_visit_blocks(data_heap, false /* visit all blocks*/, MiArenaVisit, &block_map);
  uint64_t reserved = 0, committed = 0, used = 0;
  absl::StrAppend(&str, "Count BlockSize Reserved Committed Used\n");
  for (const auto& k_v : block_map) {
    uint64_t count = k_v.second;
    absl::StrAppend(&str, count, " ", get<0>(k_v.first), " ", get<1>(k_v.first), " ",
                    get<2>(k_v.first), " ", get<3>(k_v.first), "\n");
    reserved += count * get<1>(k_v.first);
    committed += count * get<2>(k_v.first);
    used += count * get<3>(k_v.first);
  }

  absl::StrAppend(
      &str, "total reserved: ", reserved, ", committed: ", committed, ", used: ", used,
      " fragmentation waste: ",
      100.0 * (committed > used ? committed - used : 0) / std::max<size_t>(1UL, committed), "%\n");
  const uint64_t delta = (absl::GetCurrentTimeNanos() - start) / 1000;
  absl::StrAppend(&str, "--- End mimalloc statistics, took ", delta, "us ---\n");

  return str;
}

size_t MemoryUsage(PrimeIterator it, bool account_key_memory_usage) {
  size_t key_size = account_key_memory_usage ? it->first.MallocUsed() : 0;
  return key_size + it->second.MallocUsed(true);
}

}  // namespace

MemoryCmd::MemoryCmd(ServerFamily* owner, CommandContext* cmd_cntx)
    : cmd_cntx_(cmd_cntx), owner_(owner) {
}

void MemoryCmd::Run(CmdArgList args) {
  CmdArgParser parser(args);

  if (parser.Check("HELP")) {
    string_view help_arr[] = {
        "MEMORY <subcommand> [<arg> ...]. Subcommands are:",
        "STATS",
        "    Shows breakdown of memory.",
        "MALLOC-STATS",
        "    Show global malloc stats as provided by allocator libraries",
        "ARENA [SUMMARY] [BACKING] [thread-id]",
        "    Show mimalloc arena stats for a heap residing in specified thread-id. 0 by default.",
        "    If SUMMARY is specified, show stats summarized by block size",
        "        per thread summary, followed by machine wide summary",
        "        thread-id is ignored for summary output.",
        "    If BACKING is specified, show stats for the backing heap.",
        "ARENA SHOW",
        "    Prints the arena summary report for the entire process.",
        "    Requires MIMALLOC_VERBOSE=1 environment to be set. The output goes to stdout",
        "USAGE <key> [WITHOUTKEY]",
        "    Show memory usage of a key.",
        "    If WITHOUTKEY is specified, the key itself is not accounted.",
        "DECOMMIT",
        "    Force decommit the memory freed by the server back to OS.",
        "TRACK",
        "    Allow tracking of memory allocation via `new` and `delete` based on input criteria.",
        "    USE WITH CAUTIOUS! This command is designed for Dragonfly developers.",
        "    ADD <lower-bound> <upper-bound> <sample-odds>",
        "        Sets up tracking memory allocations in the (inclusive) range [lower, upper]",
        "        sample-odds indicates how many of the allocations will be logged, there 0 means "
        "none, 1 means all, and everything in between is linear",
        "        There could be at most 4 tracking placed in parallel",
        "    REMOVE <lower-bound> <upper-bound>",
        "        Removes all memory tracking added which match bounds",
        "        Could remove 0, 1 or more",
        "    CLEAR",
        "        Removes all memory tracking",
        "    GET",
        "        Returns an array with all active tracking",
        "    ADDRESS <address>",
        "        Returns whether <address> is known to be allocated internally by any of the "
        "backing heaps",
        "DEFRAGMENT [threshold]",
        "    Tries to free memory by moving allocations around from sparsely used memory pages.",
        "    If a threshold is supplied, it is used to determine if data will be moved from the "
        "page.",
        "    Pages used less than the threshold percentage (default 0.8) are targeted for moving "
        "out data.",
    };
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
    return rb->SendSimpleStrArr(help_arr);
  };

  if (parser.Check("STATS")) {
    return Stats();
  }

  if (parser.Check("USAGE")) {
    if (!parser.HasNext()) {
      return cmd_cntx_->SendError(kSyntaxErr);
    }
    string_view key = parser.Next();
    bool account_key_memory_usage = !parser.Check("WITHOUTKEY");
    return Usage(key, account_key_memory_usage);
  }

  if (parser.Check("DECOMMIT")) {
    shard_set->pool()->AwaitBrief(
        [](unsigned, auto* pb) { ServerState::tlocal()->DecommitMemory(ServerState::kAllMemory); });
    return cmd_cntx_->rb()->SendSimpleString("OK");
  }

  if (parser.Check("MALLOC-STATS")) {
    return MallocStats();
  }

  if (parser.Check("ARENA")) {
    return ArenaStats(args);
  }

  if (parser.Check("TRACK")) {
    args.remove_prefix(1);
    return Track(args);
  }

  if (parser.Check("DEFRAGMENT")) {
    static const float default_threshold =
        absl::GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
    const float threshold = parser.NextOrDefault(default_threshold);

    std::vector<CollectedPageStats> results(shard_set->size());
    shard_set->pool()->AwaitFiberOnAll([threshold, &results](util::ProactorBase*) {
      if (auto* shard = EngineShard::tlocal(); shard) {
        PageUsage page_usage{CollectPageStats::YES, threshold,
                             CycleQuota{CycleQuota::kDefaultDefragQuota}};
        if (auto shard_res = shard->DoDefrag(&page_usage); shard_res.has_value()) {
          results[shard->shard_id()] = std::move(shard_res.value());
        }
      }
    });

    const CollectedPageStats merged = CollectedPageStats::Merge(std::move(results), threshold);
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
    return rb->SendVerbatimString(merged.ToString());
  }

  string err = UnknownSubCmd(parser.Next(), "MEMORY");
  return cmd_cntx_->SendError(err, kSyntaxErrType);
}

namespace {

struct ConnectionMemoryUsage {
  size_t connection_size = 0;
  size_t replication_connection_count = 0;
  size_t replication_connection_size = 0;
};

ConnectionMemoryUsage GetConnectionMemoryUsage(ServerFamily* server) {
  vector<ConnectionMemoryUsage> mems(shard_set->pool()->size());

  for (auto* listener : server->GetListeners()) {
    listener->TraverseConnections([&](unsigned thread_index, util::Connection* conn) {
      if (conn == nullptr) {
        return;
      }

      auto* dfly_conn = static_cast<facade::Connection*>(conn);
      auto* cntx = static_cast<ConnectionContext*>(dfly_conn->cntx());

      size_t usage = dfly_conn->GetMemoryUsage();
      if (cntx == nullptr || cntx->master_repl_flow == nullptr) {
        mems[thread_index].connection_size += usage;
      } else {
        mems[thread_index].replication_connection_count++;
        mems[thread_index].replication_connection_size += usage;
      }
    });
  }

  ConnectionMemoryUsage mem;
  for (const auto& m : mems) {
    mem.connection_size += m.connection_size;
    mem.replication_connection_count += m.replication_connection_count;
    mem.replication_connection_size += m.replication_connection_size;
  }
  return mem;
}

}  // namespace

void MemoryCmd::Stats() {
  vector<pair<string, size_t>> stats;
  stats.reserve(25);
  ConnectionMemoryUsage connection_memory = GetConnectionMemoryUsage(owner_);

  // Connection stats, excluding replication connections
  stats.push_back({"connections.direct_bytes", connection_memory.connection_size});

  // Replication connection stats
  stats.push_back(
      {"replication.connections_count", connection_memory.replication_connection_count});
  stats.push_back({"replication.direct_bytes", connection_memory.replication_connection_size});

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
  rb->StartCollection(stats.size(), CollectionType::MAP);
  for (const auto& [k, v] : stats) {
    rb->SendBulkString(k);
    rb->SendLong(v);
  }
}

void MemoryCmd::MallocStats() {
  string report;

#if __GLIBC__  // MUSL/alpine do not have mallinfo routines.
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 33)
  struct mallinfo2 malloc_info = mallinfo2();
#else
  struct mallinfo malloc_info = mallinfo();  // buggy because 32-bit stats may overflow.
#endif

  absl::StrAppend(&report, "___ Begin malloc stats ___\n");
  absl::StrAppend(&report, "arena: ", malloc_info.arena, ", ordblks: ", malloc_info.ordblks,
                  ", smblks: ", malloc_info.smblks, "\n");
  absl::StrAppend(&report, "hblks: ", malloc_info.hblks, ", hblkhd: ", malloc_info.hblkhd,
                  ", usmblks: ", malloc_info.usmblks, "\n");
  absl::StrAppend(&report, "fsmblks: ", malloc_info.fsmblks, ", uordblks: ", malloc_info.uordblks,
                  ", fordblks: ", malloc_info.fordblks, ", keepcost: ", malloc_info.keepcost, "\n");
  absl::StrAppend(&report, "___ End malloc stats ___\n\n");
#endif

  absl::StrAppend(&report, "___ Begin mimalloc stats ___\n");
  mi_stats_print_out(MiStatsCallback, &report);
  absl::StrAppend(&report, "___ End mimalloc stats ___\n\n");

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
  return rb->SendVerbatimString(report);
}

void MemoryCmd::ArenaStats(CmdArgList args) {
  uint32_t tid = 0;
  bool backing = false;
  bool show_arenas = false;
  bool summarize = false;

  if (args.size() >= 2) {
    string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 1));

    if (sub_cmd == "SHOW") {
      if (args.size() != 2)
        return cmd_cntx_->SendError(kSyntaxErr, kSyntaxErrType);
      show_arenas = true;
    } else {
      unsigned tid_indx = 1;

      if (sub_cmd == "SUMMARY") {
        ++tid_indx;
        summarize = true;

        if (args.size() > tid_indx) {
          sub_cmd = absl::AsciiStrToUpper(ArgS(args, tid_indx));
        }
      }

      if (sub_cmd == "BACKING") {
        ++tid_indx;
        backing = true;
      }

      if (summarize && args.size() > tid_indx) {
        return cmd_cntx_->SendError(kSyntaxErr, kSyntaxErrType);
      }

      if (args.size() > tid_indx && !absl::SimpleAtoi(ArgS(args, tid_indx), &tid)) {
        return cmd_cntx_->SendError(kInvalidIntErr);
      }
    }
  }

  if (show_arenas) {
    mi_debug_show_arenas();
    return cmd_cntx_->rb()->SendOk();
  }

  if (summarize) {
    const uint64_t start = absl::GetCurrentTimeNanos();
    const auto summaries = CollectSummaries(backing);
    string report = FormatSummaries(summaries);
    const uint64_t delta = (absl::GetCurrentTimeNanos() - start) / 1000;
    absl::StrAppend(&report, "\n--- End mimalloc statistics, took ", delta, "us ---\n");
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
    return rb->SendVerbatimString(report);
  }

  if (backing && tid >= shard_set->pool()->size()) {
    return cmd_cntx_->SendError(
        absl::StrCat("Thread id must be less than ", shard_set->pool()->size()));
  }

  if (!backing && tid >= shard_set->size()) {
    return cmd_cntx_->SendError(absl::StrCat("Thread id must be less than ", shard_set->size()));
  }

  const string mi_malloc_info =
      shard_set->pool()->at(tid)->AwaitBrief([=] { return MallocStatsCb(backing, tid); });

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
  return rb->SendVerbatimString(mi_malloc_info);
}

void MemoryCmd::Usage(std::string_view key, bool account_key_memory_usage) {
  ShardId sid = Shard(key, shard_set->size());
  ssize_t memory_usage = shard_set->pool()->at(sid)->AwaitBrief(
      [key, account_key_memory_usage, this, sid]() -> ssize_t {
        auto& db_slice = cmd_cntx_->server_conn_cntx()->ns->GetDbSlice(sid);
        auto [pt, exp_t] = db_slice.GetTables(cmd_cntx_->server_conn_cntx()->db_index());
        PrimeIterator it = pt->Find(key);
        if (IsValid(it)) {
          return MemoryUsage(it, account_key_memory_usage);
        } else {
          return -1;
        }
      });

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
  if (memory_usage < 0)
    return rb->SendNull();
  rb->SendLong(memory_usage);
}

void MemoryCmd::Track(CmdArgList args) {
#ifndef DFLY_ENABLE_MEMORY_TRACKING
  return cmd_cntx_->SendError("MEMORY TRACK must be enabled at build time.");
#endif

  CmdArgParser parser(args);

  if (parser.Check("ADD")) {
    AllocationTracker::TrackingInfo tracking_info;
    std::tie(tracking_info.lower_bound, tracking_info.upper_bound, tracking_info.sample_odds) =
        parser.Next<size_t, size_t, double>();
    if (parser.HasError()) {
      return cmd_cntx_->SendError(parser.TakeError().MakeReply());
    }

    atomic_bool error{false};
    shard_set->pool()->AwaitBrief([&](unsigned index, auto*) {
      if (!AllocationTracker::Get().Add(tracking_info)) {
        error.store(true);
      }
    });

    if (error.load()) {
      return cmd_cntx_->SendError("Unable to add tracker");
    } else {
      return cmd_cntx_->rb()->SendOk();
    }
  }

  if (parser.Check("REMOVE")) {
    auto [lower_bound, upper_bound] = parser.Next<size_t, size_t>();
    if (parser.HasError()) {
      return cmd_cntx_->SendError(parser.TakeError().MakeReply());
    }

    atomic_bool error{false};
    shard_set->pool()->AwaitBrief([&, lo = lower_bound, hi = upper_bound](unsigned index, auto*) {
      if (!AllocationTracker::Get().Remove(lo, hi)) {
        error.store(true);
      }
    });

    if (error.load()) {
      return cmd_cntx_->SendError("Unable to remove tracker");
    } else {
      return cmd_cntx_->rb()->SendOk();
    }
  }

  if (parser.Check("CLEAR")) {
    shard_set->pool()->AwaitBrief([&](unsigned index, auto*) { AllocationTracker::Get().Clear(); });
    return cmd_cntx_->rb()->SendOk();
  }

  if (parser.Check("GET")) {
    auto ranges = AllocationTracker::Get().GetRanges();
    auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx_->rb());
    rb->StartArray(ranges.size());
    for (const auto& range : ranges) {
      rb->SendSimpleString(
          absl::StrCat(range.lower_bound, ",", range.upper_bound, ",", range.sample_odds));
    }
    return;
  }

  if (parser.Check("ADDRESS")) {
    string_view ptr_str = parser.Next();
    if (parser.HasError()) {
      return cmd_cntx_->SendError(parser.TakeError().MakeReply());
    }

    size_t ptr = 0;
    if (!absl::SimpleHexAtoi(ptr_str, &ptr)) {
      return cmd_cntx_->SendError("Address must be hex number");
    }

    atomic_bool found{false};
    shard_set->pool()->AwaitBrief([&](unsigned index, auto*) {
      if (mi_heap_check_owned(mi_heap_get_backing(), (void*)ptr)) {
        found.store(true);
      }
    });

    return cmd_cntx_->rb()->SendSimpleString(found.load() ? "FOUND" : "NOT-FOUND");
  }

  return cmd_cntx_->SendError(kSyntaxErrType);
}

}  // namespace dfly


================================================
FILE: src/server/memory_cmd.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "server/conn_context.h"

namespace dfly {

class ServerFamily;

class MemoryCmd {
 public:
  MemoryCmd(ServerFamily* owner, CommandContext* cmd_cntx);

  void Run(CmdArgList args);

 private:
  void Stats();
  void MallocStats();
  void ArenaStats(CmdArgList args);
  void Usage(std::string_view key, bool account_key_memory_usage);
  void Track(CmdArgList args);

  CommandContext* cmd_cntx_;
  ServerFamily* owner_;
};

}  // namespace dfly


================================================
FILE: src/server/multi_command_squasher.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/multi_command_squasher.h"

#include <absl/container/inlined_vector.h>

#include "base/cycle_clock.h"
#include "base/flag_utils.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/overloaded.h"
#include "facade/dragonfly_connection.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/transaction.h"
#include "server/tx_base.h"

ABSL_FLAG(uint32_t, max_busy_squash_usec, 1000,
          "Maximum time in microseconds to execute squashed commands before yielding.");

ABSL_FLAG(uint32_t, log_squash_info_threshold_usec, 1 << 31,
          "Threshold in microseconds above which to log squashing timings.");

namespace dfly {

using namespace std;
using namespace facade;
using namespace util;
using base::CycleClock;

namespace {

thread_local uint64_t max_busy_squash_cycles_cached = 1ULL << 32;
thread_local uint32_t log_squash_threshold_cached = 1ULL << 31;

size_t Size(const CapturingReplyBuilder::Payload& payload) {
  size_t payload_size = sizeof(CapturingReplyBuilder::Payload);
  return payload_size +
         visit(Overloaded{[](const payload::SimpleString& data) { return data.size(); },
                          [](const payload::BulkString& data) { return data.size(); },
                          [](const payload::Error& data) {
                            return data->first.size() + data->second.size();
                          },
                          [](const unique_ptr<payload::CollectionPayload>& data) {
                            if (!data || (data->len == 0 && data->type == CollectionType::ARRAY)) {
                              return 0ul;
                            }
                            size_t res = 0;
                            for (const auto& pl : data->arr) {
                              res += Size(pl);
                            }
                            return res;
                          },
                          // Other payload types are small
                          [](const auto&) { return 0ul; }},
               payload);
}

}  // namespace

MultiCommandSquasher::Stats& MultiCommandSquasher::Stats::operator+=(const Stats& o) {
  squashed_commands += o.squashed_commands;
  hop_usec += o.hop_usec;
  reply_usec += o.reply_usec;
  hops += o.hops;
  yields += o.yields;

  return *this;
}

MultiCommandSquasher::MultiCommandSquasher(absl::Span<StoredCmd> cmds, ConnectionContext* cntx,
                                           Service* service, const Opts& opts)
    : cmds_{cmds}, cntx_{cntx}, service_{service}, base_cid_{nullptr}, opts_{opts} {
  auto mode = cntx->transaction->GetMultiMode();
  base_cid_ = cntx->transaction->GetCId();
  atomic_ = mode != Transaction::NON_ATOMIC;
}

MultiCommandSquasher::ShardExecInfo& MultiCommandSquasher::PrepareShardInfo(ShardId sid) {
  if (sharded_.empty()) {
    sharded_.resize(shard_set->size());
    for (size_t i = 0; i < sharded_.size(); i++) {
      sharded_[i].reply_size_total_ptr = &tl_facade_stats->reply_stats.squashing_current_reply_size;
    }
  }

  auto& sinfo = sharded_[sid];
  if (!sinfo.local_tx) {
    if (IsAtomic()) {
      sinfo.local_tx = new Transaction{cntx_->transaction, sid, nullopt};
    } else {
      // Non-atomic squashing does not use the transactional framework for fan out, so local
      // transactions have to be fully standalone, check locks and release them immediately.
      sinfo.local_tx = new Transaction{base_cid_};
      sinfo.local_tx->StartMultiNonAtomic();
    }
    num_shards_++;
  }

  return sinfo;
}

MultiCommandSquasher::SquashResult MultiCommandSquasher::TrySquash(const StoredCmd* cmd) {
  DCHECK(cmd->Cid());

  const CommandId& cid = *cmd->Cid();
  if (!cid.IsTransactional() || (cid.opt_mask() & CO::BLOCKING) ||
      (cid.opt_mask() & CO::GLOBAL_TRANS))
    return SquashResult::NOT_SQUASHED;

  if (cid.name() == "CLIENT" || cntx_->conn_state.tracking_info_.IsTrackingOn()) {
    return SquashResult::NOT_SQUASHED;
  }

  auto args = cmd->Slice(&tmp_keylist_);
  if (args.empty())
    return SquashResult::NOT_SQUASHED;

  // Instead of returning an error, we treat command as non-squashable, allowing the
  // standalone execution path to handle it.
  // Validate returns an optional ErrorReply
  if (cid.Validate(args).has_value())
    return SquashResult::NOT_SQUASHED;

  auto keys = DetermineKeys(&cid, args);
  if (!keys.ok() || keys->NumArgs() == 0)
    return SquashResult::NOT_SQUASHED;

  // Check if all command keys belong to one shard
  ShardId last_sid = kInvalidSid;

  for (string_view key : keys->Range(args)) {
    ShardId sid = Shard(key, shard_set->size());
    if (last_sid == kInvalidSid || last_sid == sid)
      last_sid = sid;
    else
      return SquashResult::NOT_SQUASHED;  // at least two shards
  }

  auto& sinfo = PrepareShardInfo(last_sid);

  sinfo.dispatched.push_back({.cmd = cmd, .reply = {}});
  order_.push_back(last_sid);

  bool need_flush = sinfo.dispatched.size() >= opts_.max_squash_size;
  return need_flush ? SquashResult::SQUASHED_FULL : SquashResult::SQUASHED;
}

bool MultiCommandSquasher::ExecuteStandalone(RedisReplyBuilder* rb, const StoredCmd* cmd) {
  DCHECK(order_.empty());  // check no squashed chain is interrupted

  auto args = cmd->Slice(&tmp_keylist_);

  if (opts_.verify_commands) {
    if (auto err = service_->VerifyCommandState(*cmd->Cid(), args, *cntx_); err) {
      rb->SendError(std::move(*err));
      return !opts_.error_abort;
    }
  }

  auto* tx = cntx_->transaction;
  if (cmd->Cid()->IsTransactional()) {
    tx->MultiSwitchCmd(cmd->Cid());
    auto status = tx->InitByArgs(cntx_->ns, cntx_->conn_state.db_index, args);
    if (status != OpStatus::OK) {
      rb->SendError(status);
      return !opts_.error_abort;
    }
  }
  CommandContext cmd_cntx{rb, cntx_};
  cmd_cntx.SetupTx(cmd->Cid(), tx);
  service_->InvokeCmd(args, &cmd_cntx);
  return true;
}

OpStatus MultiCommandSquasher::SquashedHopCb(EngineShard* es, RespVersion resp_v) {
  auto& sinfo = sharded_[es->shard_id()];
  DCHECK(!sinfo.dispatched.empty());

  auto* local_tx = sinfo.local_tx.get();
  CapturingReplyBuilder crb(ReplyMode::FULL, resp_v);
  CmdArgVec arg_vec;
  CommandContext cmd_cntx{&crb, cntx_};
  cmd_cntx.SetupTx(nullptr, local_tx);

  auto move_reply = [&sinfo](CapturingReplyBuilder::Payload&& src,
                             CapturingReplyBuilder::Payload* dst) {
    *dst = std::move(src);
    size_t sz = Size(*dst);
    sinfo.reply_size_delta += sz;
    sinfo.reply_size_total_ptr->fetch_add(sz, std::memory_order_relaxed);
  };

  for (auto& dispatched : sinfo.dispatched) {
    auto args = dispatched.cmd->Slice(&arg_vec);
    if (opts_.verify_commands) {
      // The shared context is used for state verification, the local one is only for replies
      if (auto err = service_->VerifyCommandState(*dispatched.cmd->Cid(), args, *cntx_); err) {
        crb.SendError(std::move(*err));
        move_reply(crb.Take(), &dispatched.reply);
        continue;
      }
    }

    crb.SetReplyMode(dispatched.cmd->ReplyMode());

    local_tx->MultiSwitchCmd(dispatched.cmd->Cid());
    auto status = local_tx->InitByArgs(cntx_->ns, cntx_->conn_state.db_index, args);
    if (status != OpStatus::OK) {
      crb.SendError(status);
    } else {
      cmd_cntx.UpdateCid(dispatched.cmd->Cid());
      service_->InvokeCmd(args, &cmd_cntx);
    }
    move_reply(crb.Take(), &dispatched.reply);
  }

  return OpStatus::OK;
}

bool MultiCommandSquasher::ExecuteSquashed(facade::RedisReplyBuilder* rb) {
  DCHECK(!cntx_->conn_state.exec_info.IsCollecting());

  if (order_.empty())
    return true;

  unsigned num_shards = 0;
  for (auto& sd : sharded_) {
    if (!sd.dispatched.empty())
      ++num_shards;
  }

  Transaction* tx = cntx_->transaction;
  ServerState::tlocal()->stats.squash_width_freq_arr[num_shards - 1]++;
  uint64_t start = CycleClock::Now();
  atomic_uint64_t max_sched_cycles{0}, max_exec_cycles{0};
  base::SpinLock lock;
  uint64_t fiber_running_cycles{0}, proactor_running_cycles{0};
  uint32_t max_sched_thread_id{0}, max_sched_seq_num{0};

  // Atomic transactions (that have all keys locked) perform hops and run squashed commands via
  // stubs, non-atomic ones just run the commands in parallel.
  if (IsAtomic()) {
    auto cb = [this](ShardId sid) { return !sharded_[sid].dispatched.empty(); };
    tx->PrepareSquashedMultiHop(base_cid_, cb);
    tx->ScheduleSingleHop(
        [this, rb](auto* tx, auto* es) { return SquashedHopCb(es, rb->GetRespVersion()); });
  } else {
    fb2::BlockingCounter bc(num_shards);
    DVLOG(1) << "Squashing " << num_shards << " " << tx->DebugId();

    // Saves work in case logging is disable (i.e. log_squash_threshold_cached is high).
    const uint64_t min_threshold_cycles = CycleClock::FromUsec(log_squash_threshold_cached / 5);
    auto cb = [&, bc, rb]() mutable {
      uint64_t sched_time = CycleClock::Now() - start;

      // Update max_sched_cycles in lock-free fashion, to avoid contention
      uint64_t current = max_sched_cycles.load(memory_order_relaxed);
      while (sched_time > min_threshold_cycles && sched_time > current) {
        if (max_sched_cycles.compare_exchange_weak(current, sched_time, memory_order_relaxed,
                                                   memory_order_relaxed)) {
          lock_guard<base::SpinLock> g(lock);

          // If it is still the longest scheduling time
          if (max_sched_cycles.load(memory_order_relaxed) == sched_time) {
            // Store the stats from the callback with longest scheduling time.
            fiber_running_cycles = ThisFiber::GetRunningTimeCycles();
            proactor_running_cycles = ProactorBase::me()->GetCurrentBusyCycles();
            max_sched_thread_id = ProactorBase::me()->GetPoolIndex();
            max_sched_seq_num = fb2::GetFiberRunSeq();
          }
          break;
        }
        // current is updated to the current value of max_sched_cycles, so the loop will retry
        // with the new value if sched_time is still greater than it.
      }

      if (ThisFiber::GetRunningTimeCycles() > max_busy_squash_cycles_cached) {
        ThisFiber::Yield();
        stats_.yields++;
      }
      this->SquashedHopCb(EngineShard::tlocal(), rb->GetRespVersion());
      uint64_t exec_time = CycleClock::Now() - start;
      current = max_exec_cycles.load(memory_order_relaxed);
      while (exec_time > current) {
        if (max_exec_cycles.compare_exchange_weak(current, exec_time, memory_order_relaxed,
                                                  memory_order_relaxed))
          break;
      }

      bc->Dec();  // Release barrier: Must be the last one in the callback.
    };
    for (unsigned i = 0; i < sharded_.size(); ++i) {
      if (!sharded_[i].dispatched.empty())
        shard_set->AddL2(i, cb);
    }
    bc->Wait();
  }

  uint64_t after_hop = CycleClock::Now();
  bool aborted = false;

  size_t total_reply_size = 0;
  for (auto& sinfo : sharded_) {
    total_reply_size += sinfo.reply_size_delta;
  }

  for (auto idx : order_) {
    auto& sinfo = sharded_[idx];
    DCHECK_LT(sinfo.reply_id, sinfo.dispatched.size());

    auto& reply = sinfo.dispatched[sinfo.reply_id++].reply;
    aborted |= opts_.error_abort && CapturingReplyBuilder::TryExtractError(reply);

    CapturingReplyBuilder::Apply(std::move(reply), rb);
    if (aborted)
      break;
  }

  uint64_t after_reply = CycleClock::Now();
  uint64_t total_usec = CycleClock::ToUsec(after_reply - start);
  stats_.hop_usec += total_usec;
  stats_.reply_usec += CycleClock::ToUsec(after_reply - after_hop);
  stats_.hops++;
  stats_.squashed_commands += order_.size();

  if (total_usec > log_squash_threshold_cached) {
    uint64_t max_sched_usec = CycleClock::ToUsec(max_sched_cycles.load());
    uint64_t fiber_running_usec = CycleClock::ToUsec(fiber_running_cycles);
    uint64_t proactor_running_usec = CycleClock::ToUsec(proactor_running_cycles);
    uint64_t max_exec_usec = CycleClock::ToUsec(max_exec_cycles.load());

    LOG_EVERY_T(INFO, 0.1)
        << "Squashed " << order_.size() << " commands. "
        << "Total/Fanout/MaxSchedTime/ThreadCbTime/ThreadId/FiberCbTime/FiberSeq/"
        << "MaxExecTime: " << total_usec << "/" << num_shards_ << "/" << max_sched_usec << "/"
        << proactor_running_usec << "/" << max_sched_thread_id << "/" << fiber_running_usec << "/"
        << "/" << max_sched_seq_num << "/" << max_exec_usec << "\ncoordinator thread running time: "
        << CycleClock::ToUsec(ProactorBase::me()->GetCurrentBusyCycles());
  }

  tl_facade_stats->reply_stats.squashing_current_reply_size.fetch_sub(total_reply_size,
                                                                      std::memory_order_release);
  for (auto& sinfo : sharded_) {
    sinfo.dispatched.clear();
    sinfo.reply_id = 0;
  }

  order_.clear();
  return !aborted;
}

void MultiCommandSquasher::Run(RedisReplyBuilder* rb) {
  DVLOG(1) << "Trying to squash " << cmds_.size() << " commands for transaction "
           << cntx_->transaction->DebugId();

  for (auto& cmd : cmds_) {
    auto res = TrySquash(&cmd);

    if (res == SquashResult::NOT_SQUASHED || res == SquashResult::SQUASHED_FULL) {
      if (!ExecuteSquashed(rb))
        break;

      // if the last command was not added - we squash it separately.
      if (res == SquashResult::NOT_SQUASHED) {
        if (!ExecuteStandalone(rb, &cmd))
          break;
      }
    }
  }

  ExecuteSquashed(rb);  // Flush leftover

  // Set last txid.
  cntx_->last_command_debug.clock = cntx_->transaction->txid();

  // UnlockMulti is a no-op for non-atomic multi transactions,
  // still called for correctness and future changes
  if (!IsAtomic()) {
    for (auto& sd : sharded_) {
      if (sd.local_tx)
        sd.local_tx->UnlockMulti();
    }
  }

  VLOG(1) << "Handled " << cmds_.size() << " commands, max fanout: " << num_shards_
          << ", atomic: " << atomic_;
}

bool MultiCommandSquasher::IsAtomic() const {
  return atomic_;
}

void MultiCommandSquasher::UpdateFromFlags() {
  max_busy_squash_cycles_cached = CycleClock::FromUsec(absl::GetFlag(FLAGS_max_busy_squash_usec));
  log_squash_threshold_cached = absl::GetFlag(FLAGS_log_squash_info_threshold_usec);
}

vector<string> MultiCommandSquasher::GetMutableFlagNames() {
  return base::GetFlagNames(FLAGS_max_busy_squash_usec, FLAGS_log_squash_info_threshold_usec);
}

}  // namespace dfly


================================================
FILE: src/server/multi_command_squasher.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "facade/reply_capture.h"
#include "server/conn_context.h"
#include "server/main_service.h"

namespace dfly {

// MultiCommandSquasher allows executing a series of commands under a multi transaction
// and squashing multiple consecutive single-shard commands into one hop whenever it's possible,
// thus parallelizing command execution and greatly decreasing the dispatch overhead for them.
//
// Single shard commands are executed in small batches over multiple shards.
// For atomic multi transactions (global & locking ahead), the batch is executed with a regular hop
// of the multi transaction. Each shard contains a "stub" transaction to mimic the regular
// transactional api for commands. Non atomic multi transactions use regular shard_set dispatches
// instead of hops for executing batches. This allows avoiding locking many keys at once. Each shard
// contains a non-atomic multi transaction to execute squashed commands.
class MultiCommandSquasher {
 public:
  struct Opts {
    bool verify_commands = false;   // Whether commands need to be verified before execution
    bool error_abort = false;       // Abort upon receiving error
    unsigned max_squash_size = 32;  // How many commands to squash at once
  };

  struct Stats {
    uint32_t squashed_commands = 0;  // Total number of squashed commands
    uint32_t hop_usec = 0;           // Total time spent in hops (microseconds)
    uint32_t reply_usec = 0;         // Total time spent in replies (microseconds)
    uint32_t hops = 0;               // Total number of hops executed
    uint32_t yields = 0;
    Stats& operator+=(const Stats& o);
  };

  // Returns number of processed commands.
  static Stats Execute(absl::Span<StoredCmd> cmds, facade::RedisReplyBuilder* rb,
                       ConnectionContext* cntx, Service* service, const Opts& opts) {
    MultiCommandSquasher sq{cmds, cntx, service, opts};
    sq.Run(rb);
    return sq.stats_;
  }

  static void UpdateFromFlags();
  static std::vector<std::string> GetMutableFlagNames();

 private:
  // Per-shard execution info.
  struct ShardExecInfo {
    ShardExecInfo() : local_tx{nullptr} {
    }

    struct Command {
      const StoredCmd* cmd;
      facade::CapturingReplyBuilder::Payload reply;
    };
    std::vector<Command> dispatched;  // Dispatched commands
    unsigned reply_id = 0;

    std::atomic<size_t>* reply_size_total_ptr;   // Total size of replies on the IO thread
    size_t reply_size_delta = 0;                 // Size of replies for this shard
    boost::intrusive_ptr<Transaction> local_tx;  // stub-mode tx for use inside shard
  };

  enum class SquashResult : uint8_t { SQUASHED, SQUASHED_FULL, NOT_SQUASHED };

  MultiCommandSquasher(absl::Span<StoredCmd> cmds, ConnectionContext* cntx, Service* Service,
                       const Opts& opts);

  // Lazy initialize shard info.
  ShardExecInfo& PrepareShardInfo(ShardId sid);

  // Retrun squash flags
  SquashResult TrySquash(const StoredCmd* cmd);

  // Execute separate non-squashed cmd. Return false if aborting on error.
  bool ExecuteStandalone(facade::RedisReplyBuilder* rb, const StoredCmd* cmd);

  // Callback that runs on shards during squashed hop.
  facade::OpStatus SquashedHopCb(EngineShard* es, facade::RespVersion resp_v);

  // Execute all currently squashed commands. Return false if aborting on error.
  bool ExecuteSquashed(facade::RedisReplyBuilder* rb);

  void Run(facade::RedisReplyBuilder* rb);

  bool IsAtomic() const;

  absl::Span<StoredCmd> cmds_;  // Input range of stored commands
  ConnectionContext* cntx_;     // Underlying context
  Service* service_;

  bool atomic_;                // Whether working in any of the atomic modes
  const CommandId* base_cid_;  // underlying cid (exec or eval) for executing batch hops

  Opts opts_;

  std::vector<ShardExecInfo> sharded_;
  std::vector<ShardId> order_;  // reply order for squashed cmds

  size_t num_shards_ = 0;

  std::vector<MutableSlice> tmp_keylist_;
  Stats stats_;
};

}  // namespace dfly


================================================
FILE: src/server/multi_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/flags/reflection.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_replace.h>
#include <gmock/gmock.h>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "core/interpreter.h"
#include "facade/facade_test.h"
#include "server/conn_context.h"
#include "server/main_service.h"
#include "server/test_utils.h"
#include "server/transaction.h"

ABSL_DECLARE_FLAG(uint32_t, num_shards);
ABSL_DECLARE_FLAG(bool, multi_exec_squash);
ABSL_DECLARE_FLAG(bool, lua_auto_async);
ABSL_DECLARE_FLAG(bool, lua_allow_undeclared_auto_correct);
ABSL_DECLARE_FLAG(std::string, default_lua_flags);
ABSL_DECLARE_FLAG(std::vector<std::string>, lua_force_atomicity_shas);

namespace dfly {

using namespace std;
using namespace util;
using absl::StrCat;
using ::io::Result;
using testing::_;
using testing::ElementsAre;
using testing::HasSubstr;

namespace {

constexpr unsigned kPoolThreadCount = 4;

const char kKey1[] = "x";
const char kKey2[] = "b";
const char kKey3[] = "c";
const char kKey4[] = "y";

const char kKeySid0[] = "x";
const char kKeySid1[] = "c";
const char kKeySid2[] = "b";

}  // namespace

// This test is responsible for server and main service
// (connection, transaction etc) families.
class MultiTest : public BaseFamilyTest {
 protected:
  MultiTest() : BaseFamilyTest() {
    num_threads_ = kPoolThreadCount;
  }
};

class SingleShardMultiTest : public BaseFamilyTest {
 protected:
  SingleShardMultiTest() : BaseFamilyTest() {
    num_threads_ = 5;
    absl::SetFlag(&FLAGS_num_shards, 1);
  }

  absl::FlagSaver saver_;
};

struct MultiTxTest : public MultiTest {};

// Check constants are valid.
TEST_F(MultiTest, VerifyConstants) {
  Run({"mget", kKeySid0, kKeySid1, kKeySid2});
  ASSERT_EQ(3, GetDebugInfo().shards_count);
}

TEST_F(MultiTest, MultiAndFlush) {
  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"get", kKey1});
  ASSERT_EQ(resp, "QUEUED");

  EXPECT_THAT(Run({"FLUSHALL"}), ErrArg("not allowed inside a transaction"));
}

TEST_F(MultiTest, MultiWithError) {
  EXPECT_THAT(Run({"exec"}), ErrArg("EXEC without MULTI"));
  EXPECT_THAT(Run({"multi"}), "OK");
  EXPECT_THAT(Run({"set", "x", "y"}), "QUEUED");
  EXPECT_THAT(Run({"set", "x"}), ErrArg("wrong number of arguments for 'set' command"));
  EXPECT_THAT(Run({"exec"}), ErrArg("EXECABORT Transaction discarded because of previous errors"));

  EXPECT_THAT(Run({"multi"}), "OK");
  EXPECT_THAT(Run({"set", "z", "y"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}), "OK");

  EXPECT_THAT(Run({"get", "x"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"get", "z"}), "y");
}

TEST_F(MultiTest, Multi) {
  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"get", kKey1});
  ASSERT_EQ(resp, "QUEUED");

  resp = Run({"get", kKey4});
  ASSERT_EQ(resp, "QUEUED");

  resp = Run({"exec"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));

  atomic_bool tx_empty = true;
  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    if (!shard->txq()->Empty())
      tx_empty.store(false);
  });
  EXPECT_TRUE(tx_empty);

  resp = Run({"get", kKey4});
  ASSERT_THAT(resp, ArgType(RespExpr::NIL));

  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey4));
  ASSERT_FALSE(service_->IsShardSetLocked());
}

TEST_F(MultiTxTest, MultiUnlock) {
  auto* exec_cid = service_->FindCmd("EXEC");
  boost::intrusive_ptr<Transaction> tx(new Transaction{exec_cid});

  auto* ns = &namespaces->GetDefaultNamespace();
  string_view keys[4] = {kKey1, kKey2, kKey3, kKey4};

  pp_->at(0)->Await([&] { tx->StartMultiLockedAhead(ns, 0, keys); });

  for (auto key : keys)
    EXPECT_TRUE(IsLocked(0, key));

  pp_->at(0)->Await([&] { tx->UnlockMulti(true); });

  for (auto key : keys)
    EXPECT_FALSE(IsLocked(0, key));
}

TEST_F(MultiTest, MultiGlobalCommands) {
  ASSERT_THAT(Run({"set", "key", "val"}), "OK");

  ASSERT_THAT(Run({"multi"}), "OK");
  ASSERT_THAT(Run({"move", "key", "2"}), "QUEUED");
  ASSERT_THAT(Run({"save"}), "QUEUED");

  RespExpr resp = Run({"exec"});
  ASSERT_THAT(resp, ArrLen(2));

  ASSERT_THAT(Run({"get", "key"}), ArgType(RespExpr::NIL));

  ASSERT_THAT(Run({"select", "2"}), "OK");
  ASSERT_THAT(Run({"get", "key"}), "val");

  ASSERT_FALSE(IsLocked(0, "key"));
  ASSERT_FALSE(IsLocked(2, "key"));
}

TEST_F(MultiTest, HitMissStats) {
  RespExpr resp = Run({"set", "Key1", "VAL"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"get", "Key1"});
  ASSERT_EQ(resp, "VAL");

  resp = Run({"get", "Key2"});
  ASSERT_THAT(resp, ArgType(RespExpr::NIL));

  auto metrics = GetMetrics();
  EXPECT_THAT(metrics.events.hits, 1);
  EXPECT_THAT(metrics.events.misses, 1);
}

TEST_F(MultiTest, PerDbHitMissStats) {
  Run({"SELECT", "0"});
  ASSERT_EQ(Run({"SET", "key1", "val1"}), "OK");
  ASSERT_EQ(Run({"GET", "key1"}), "val1");
  ASSERT_THAT(Run({"GET", "nonexistent1"}), ArgType(RespExpr::NIL));

  Run({"SELECT", "1"});
  ASSERT_EQ(Run({"SET", "key2", "val2"}), "OK");
  ASSERT_EQ(Run({"GET", "key2"}), "val2");
  ASSERT_THAT(Run({"GET", "nonexistent2"}), ArgType(RespExpr::NIL));

  auto metrics = GetMetrics();

  EXPECT_GE(metrics.db_stats.size(), 2u);
  EXPECT_EQ(metrics.db_stats[0].events.hits, 1u);
  EXPECT_EQ(metrics.db_stats[0].events.misses, 1u);
  EXPECT_EQ(metrics.db_stats[1].events.hits, 1u);
  EXPECT_EQ(metrics.db_stats[1].events.misses, 1u);

  EXPECT_EQ(metrics.events.hits, 2u);
  EXPECT_EQ(metrics.events.misses, 2u);
}

TEST_F(MultiTest, PerDbHitMissStatsReset) {
  Run({"SELECT", "0"});
  Run({"SET", "key1", "val1"});
  Run({"GET", "key1"});
  Run({"GET", "key2"});

  auto before = GetMetrics();
  ASSERT_GT(before.db_stats[0].events.hits, 0u);
  ASSERT_GT(before.db_stats[0].events.misses, 0u);

  EXPECT_EQ("OK", Run({"CONFIG", "RESETSTAT"}));

  auto after = GetMetrics();
  EXPECT_EQ(after.db_stats[0].events.hits, 0u);
  EXPECT_EQ(after.db_stats[0].events.misses, 0u);
}

TEST_F(MultiTest, PerDbHitMissInfoOutput) {
  Run({"SELECT", "0"});
  Run({"SET", "testkey", "testval"});
  Run({"GET", "testkey"});
  Run({"GET", "missing"});

  auto info_resp = Run({"INFO", "keyspace"});
  ASSERT_TRUE(info_resp.type == RespExpr::STRING);
  string info_str = info_resp.GetString();
  EXPECT_THAT(info_str, HasSubstr("hits=1"));
  EXPECT_THAT(info_str, HasSubstr("misses=1"));
  EXPECT_THAT(info_str, HasSubstr("hit_ratio=50.00"));
}

TEST_F(MultiTest, MultiEmpty) {
  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");
  resp = Run({"exec"});
  EXPECT_THAT(resp, ArrLen(0));
  EXPECT_FALSE(service_->IsShardSetLocked());

  Run({"multi"});
  ASSERT_EQ(Run({"ping", "foo"}), "QUEUED");
  resp = Run({"exec"});
  EXPECT_EQ(resp, "foo");

  Run({"multi"});
  Run({"set", "a", ""});
  resp = Run({"exec"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"get", "a"});
  EXPECT_EQ(resp, "");
}

TEST_F(MultiTest, MultiSeq) {
  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"set", kKey1, absl::StrCat(1)});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"get", kKey1});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"mget", kKey1, kKey4});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"exec"});

  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey4));
  ASSERT_FALSE(service_->IsShardSetLocked());

  ASSERT_THAT(resp, ArrLen(3));
  const auto& arr = resp.GetVec();
  EXPECT_THAT(arr, ElementsAre("OK", "1", ArrLen(2)));

  ASSERT_THAT(arr[2].GetVec(), ElementsAre("1", ArgType(RespExpr::NIL)));
}

TEST_F(MultiTest, MultiConsistent) {
  Run({"mset", kKey1, "base", kKey4, "base"});

  auto mset_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 1; i < 10; ++i) {
      string base = StrCat(i * 900);
      RespExpr resp = Run({"mset", kKey1, base, kKey4, base});
      ASSERT_EQ(resp, "OK");
    }
  });

  auto fb = pp_->at(1)->LaunchFiber([&] {
    RespExpr resp = Run({"multi"});
    ASSERT_EQ(resp, "OK");
    ThisFiber::SleepFor(1ms);

    resp = Run({"get", kKey1});
    ASSERT_EQ(resp, "QUEUED");

    resp = Run({"get", kKey4});
    ASSERT_EQ(resp, "QUEUED");

    resp = Run({"mget", kKey4, kKey1});
    ASSERT_EQ(resp, "QUEUED");

    resp = Run({"exec"});
    ASSERT_THAT(resp, ArrLen(3));
    const RespVec& resp_arr = resp.GetVec();
    ASSERT_THAT(resp_arr, ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::STRING),
                                      ArgType(RespExpr::ARRAY)));
    ASSERT_EQ(resp_arr[0].GetBuf(), resp_arr[1].GetBuf());
    const RespVec& sub_arr = resp_arr[2].GetVec();
    EXPECT_THAT(sub_arr, ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::STRING)));
    EXPECT_EQ(sub_arr[0].GetBuf(), sub_arr[1].GetBuf());
    EXPECT_EQ(sub_arr[0].GetBuf(), resp_arr[0].GetBuf());
  });

  mset_fb.Join();
  fb.Join();

  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey4));
  ASSERT_FALSE(service_->IsShardSetLocked());
}

TEST_F(MultiTest, MultiConsistent2) {
  const int kKeyCount = 50;
  const int kRuns = 50;
  const int kJobs = 20;

  vector<string> all_keys(kKeyCount);
  for (size_t i = 0; i < kKeyCount; i++)
    all_keys[i] = absl::StrCat("key", i);

  auto cb = [&](string id) {
    for (size_t r = 0; r < kRuns; r++) {
      size_t num_keys = (rand() % 5) + 1;
      set<string_view> keys;
      for (size_t i = 0; i < num_keys; i++)
        keys.insert(all_keys[rand() % kKeyCount]);

      Run(id, {"MULTI"});
      for (auto key : keys)
        Run(id, {"INCR", key});
      for (auto key : keys)
        Run(id, {"DECR", key});
      auto resp = Run(id, {"EXEC"});

      ASSERT_EQ(resp.GetVec().size(), keys.size() * 2);
      for (size_t i = 0; i < keys.size(); i++) {
        EXPECT_EQ(resp.GetVec()[i].GetInt(), optional<int64_t>(1));
        EXPECT_EQ(resp.GetVec()[i + keys.size()].GetInt(), optional<int64_t>(0));
      }
    }
  };

  vector<Fiber> fbs(kJobs);
  for (size_t i = 0; i < kJobs; i++) {
    fbs[i] = pp_->at(i % pp_->size())->LaunchFiber([i, cb]() { cb(absl::StrCat("worker", i)); });
  }

  for (auto& fb : fbs)
    fb.Join();
}

TEST_F(MultiTest, MultiConsistent3) {
  GTEST_SKIP() << "Known consistency bug";

  absl::SetFlag(&FLAGS_multi_exec_squash, false);
  vector<Fiber> fbs;

  auto run_multi = [this](string_view client) {
    Run(client, {"multi"});
    Run(client, {"incr", kKeySid0});
    Run(client, {"incr", kKeySid1});
    Run(client, {"incr", kKeySid2});
    Run(client, {"exec"});
  };

  auto run_mget = [this](string_view client) {
    auto resp = Run(client, {"mget", kKeySid0, kKeySid1, kKeySid2});
    const auto& elems = resp.GetVec();
    EXPECT_EQ(elems[0].GetString(), elems[1].GetString());
    EXPECT_EQ(elems[1].GetString(), elems[2].GetString());
  };

  for (size_t i = 0; i < 10; i++) {
    auto fb = pp_->at(i % pp_->size())->LaunchFiber([i, run_mget, run_multi] {
      auto client = absl::StrCat("c", i);
      for (size_t j = 0; j < 1000; j++) {
        if (j % 2)
          run_mget(client);
        else
          run_multi(client);
        size_t sleep = 30 + j / 10 + 5 * i;
        ThisFiber::SleepFor(chrono::microseconds(sleep));
      }
    });
    fbs.emplace_back(std::move(fb));
  }

  for (auto& fb : fbs)
    fb.JoinIfNeeded();

  auto metrics = GetMetrics();
  EXPECT_GT(metrics.shard_stats.tx_optimistic_total, 100);
}

TEST_F(MultiTest, MultiRename) {
  RespExpr resp = Run({"mget", kKey1, kKey4});
  ASSERT_EQ(1, GetDebugInfo().shards_count);

  resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");
  Run({"set", kKey1, "1"});

  resp = Run({"rename", kKey1, kKey4});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"exec"});

  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("OK", "OK"));

  // Now rename with keys spawning multiple shards.
  Run({"mget", kKey4, kKey2});
  ASSERT_EQ(2, GetDebugInfo().shards_count);

  Run({"multi"});
  resp = Run({"rename", kKey4, kKey2});
  ASSERT_EQ(resp, "QUEUED");
  resp = Run({"exec"});
  EXPECT_EQ(resp, "OK");

  EXPECT_FALSE(IsLocked(0, kKey1));
  EXPECT_FALSE(IsLocked(0, kKey2));
  EXPECT_FALSE(IsLocked(0, kKey4));
  EXPECT_FALSE(service_->IsShardSetLocked());
}

// Run multi without transactional commands
TEST_F(MultiTest, MultiWithoutTx) {
  Run({"multi"});
  Run({"ping"});
  auto resp = Run({"exec"});
  EXPECT_EQ(resp, "PONG");

  // EVAL without keys and default script flags should be non-transactional
  Run({"multi"});
  Run({"eval", "return 'OK1'", "0"});
  Run({"ping"});
  Run({"eval", "return 'OK2'", "0", "not-a-key"});
  Run({"ping"});
  Run({"eval", "return 'OK3'", "0", "not-a-key", "as-well"});
  Run({"ping"});
  resp = Run({"exec"});
  EXPECT_EQ(resp.GetVec()[2], "OK2");
  EXPECT_EQ(resp.GetVec()[4], "OK3");
}

TEST_F(MultiTest, MultiCommandsWithBonusKeys) {
  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_multi_exec_squash, true);

  EXPECT_EQ(Shard("za", shard_set->size()), Shard("zb", shard_set->size()));
  EXPECT_EQ(Shard("zb", shard_set->size()), Shard("ze", shard_set->size()));

  // Check bonus keys are correctly processed with squashing
  Run({"multi"});
  Run({"zadd", "za", "1", "a", "2", "b"});
  Run({"zadd", "zb", "2", "b", "3", "c"});
  Run({"zinterstore", "ze", "2", "za", "zb"});
  auto resp = Run({"exec"});
  EXPECT_THAT(resp.GetVec()[2], IntArg(1));
  EXPECT_THAT(Run({"zcard", "ze"}), IntArg(1));

  // Check squashing correctly pre-validates commands
  Run({"multi"});
  Run({"zinterstore", "ze", "2", "za", "zb", "z one extra"});
  resp = Run({"exec"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(MultiTest, MultiHop) {
  Run({"set", kKey1, "1"});

  auto p1_fb = pp_->at(1)->LaunchFiber([&] {
    for (int i = 0; i < 100; ++i) {
      auto resp = Run({"rename", kKey1, kKey2});
      ASSERT_EQ(resp, "OK");
      EXPECT_EQ(2, GetDebugInfo("IO1").shards_count);

      resp = Run({"rename", kKey2, kKey1});
      ASSERT_EQ(resp, "OK");
    }
  });

  // mset should be executed either as ooo or via tx-queue because previous transactions
  // have been unblocked and executed as well. In other words, this mset should never block
  // on serializability constraints.
  auto p2_fb = pp_->at(2)->LaunchFiber([&] {
    for (int i = 0; i < 100; ++i) {
      Run({"mset", kKey3, "1", kKey4, "2"});
    }
  });

  p1_fb.Join();
  p2_fb.Join();
}

TEST_F(MultiTest, FlushDb) {
  Run({"mset", kKey1, "1", kKey4, "2"});
  auto resp = Run({"flushdb"});
  ASSERT_EQ(resp, "OK");

  auto fb0 = pp_->at(0)->LaunchFiber([&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run({"flushdb"});
    }
  });

  pp_->at(1)->Await([&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run({"mset", kKey1, "1", kKey4, "2"});
      int64_t ival = CheckedInt({"exists", kKey1, kKey4});
      ASSERT_TRUE(ival == 0 || ival == 2) << i << " " << ival;
    }
  });

  fb0.Join();

  ASSERT_FALSE(IsLocked(0, kKey1));
  ASSERT_FALSE(IsLocked(0, kKey4));
  ASSERT_FALSE(service_->IsShardSetLocked());
}

// Triggers a false possitive and therefore we turn it off
// There seem not to be a good solution to handle these false positives
// since sanitizers work well with u_context which is *very* slow
TEST_F(MultiTest, Eval) {
  if (auto config = absl::GetFlag(FLAGS_default_lua_flags); config != "") {
    GTEST_SKIP() << "Skipped Eval test because default_lua_flags is set";
    return;
  }
  absl::FlagSaver saver;
  absl::SetFlag(&FLAGS_lua_allow_undeclared_auto_correct, true);

  RespExpr resp;

  resp = Run({"incrby", "foo", "42"});
  EXPECT_THAT(resp, IntArg(42));

  // first time running the script will return error and will change the script flag to allow
  // undeclared
  resp = Run({"eval", "return redis.call('get', 'foo')", "0"});
  EXPECT_THAT(resp, ErrArg("undeclared"));

  // running the same script the second time will succeed
  resp = Run({"eval", "return redis.call('get', 'foo')", "0"});
  EXPECT_THAT(resp, "42");

  Run({"script", "flush"});  // Reset global flag due to lua_allow_undeclared_auto_correct effect

  resp = Run({"eval", "return redis.call('get', 'foo')", "1", "bar"});
  EXPECT_THAT(resp, ErrArg("undeclared"));
  ASSERT_FALSE(IsLocked(0, "foo"));

  Run({"script", "flush"});  // Reset global flag from autocorrect

  resp = Run({"eval", "return redis.call('get', 'foo')", "1", "foo"});
  EXPECT_THAT(resp, "42");
  ASSERT_FALSE(IsLocked(0, "foo"));

  resp = Run({"eval", "return redis.call('get', KEYS[1])", "1", "foo"});
  EXPECT_THAT(resp, "42");
  ASSERT_FALSE(IsLocked(0, "foo"));
  ASSERT_FALSE(service_->IsShardSetLocked());

  resp = Run({"eval", "return 77", "2", "foo", "zoo"});
  EXPECT_THAT(resp, IntArg(77));

  // a,b important here to spawn multiple shards.
  resp = Run({"eval", "return redis.call('exists', KEYS[2])", "2", "a", "b"});
  // EXPECT_EQ(2, GetDebugInfo().shards_count);
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"eval", "return redis.call('hmset', KEYS[1], 'f1', '2222')", "1", "hmap"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"hvals", "hmap"});
  EXPECT_EQ(resp, "2222");

  Run({"sadd", "s1", "a", "b"});
  Run({"sadd", "s2", "a", "c"});
  resp = Run({"eval", "return redis.call('SUNION', KEYS[1], KEYS[2])", "2", "s1", "s2"});
  ASSERT_THAT(resp, ArrLen(3));
  const auto& arr = resp.GetVec();
  EXPECT_THAT(arr, ElementsAre("a", "b", "c"));

  Run({"zadd", "z1", "123", "a", "12345678912345", "b", "12.5", "c"});
  const char* kGetScore = "return redis.call('ZSCORE', KEYS[1], ARGV[1]) .. '-works'";

  resp = Run({"eval", kGetScore, "1", "z1", "a"});
  EXPECT_EQ(resp, "123-works");
  resp = Run({"eval", kGetScore, "1", "z1", "b"});
  EXPECT_EQ(resp, "12345678912345-works");
  resp = Run({"eval", kGetScore, "1", "z1", "c"});
  EXPECT_EQ(resp, "12.5-works");

  // Multiple calls in a Lua script
  EXPECT_EQ(Run({"eval",
                 R"(redis.call('set', 'foo', '42')
                    return redis.call('get', 'foo'))",
                 "1", "foo"}),
            "42");

  auto condition = [&]() { return IsLocked(0, "foo"); };
  auto fb = ExpectConditionWithSuspension(condition);
  EXPECT_EQ(Run({"eval",
                 R"(redis.call('set', 'foo', '42')
                    return redis.call('get', 'foo'))",
                 "1", "foo"}),
            "42");
  fb.Join();

  // Call multi-shard command scan from single shard mode
  resp = Run({"eval", "return redis.call('scan', '0'); ", "1", "key"});
  EXPECT_EQ(resp.GetVec()[0], "0");
  EXPECT_EQ(resp.GetVec()[1].type, RespExpr::Type::ARRAY);
}

TEST_F(MultiTest, Watch) {
  auto kExecFail = ArgType(RespExpr::NIL);
  auto kExecSuccess = ArgType(RespExpr::ARRAY);

  // Check watch doesn't run in multi.
  Run({"multi"});
  ASSERT_THAT(Run({"watch", "a"}), ErrArg("not allowed inside a transaction"));
  Run({"discard"});

  // Check watch on existing key.
  Run({"set", "a", "1"});
  EXPECT_EQ(Run({"watch", "a"}), "OK");
  Run({"set", "a", "2"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check watch with nonempty exec body
  EXPECT_EQ(Run({"watch", "a"}), "OK");
  Run({"multi"});
  Run({"get", "a"});
  Run({"get", "b"});
  Run({"get", "c"});
  ASSERT_THAT(Run({"exec"}), kExecSuccess);

  // Check watch data cleared after EXEC.
  Run({"set", "a", "1"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecSuccess);

  // Check watch on non-existent key.
  Run({"del", "b"});
  EXPECT_EQ(Run({"watch", "b"}), "OK");  // didn't exist yet
  Run({"set", "b", "1"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check EXEC doesn't miss watched key expiration.
  Run({"watch", "a"});
  Run({"expire", "a", "1"});
  AdvanceTime(1000);
  Run({"multi"});
  Run({"get", "a"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check unwatch.
  Run({"watch", "a"});
  Run({"unwatch"});
  Run({"set", "a", "3"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecSuccess);

  // Check double expire
  Run({"watch", "a", "b"});
  Run({"set", "a", "2"});
  Run({"set", "b", "2"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check EXPIRE + new key.
  Run({"set", "a", "1"});
  Run({"del", "c"});
  Run({"watch", "c"});  // didn't exist yet
  Run({"watch", "a"});
  Run({"set", "c", "1"});
  Run({"expire", "a", "1"});  // a existed

  AdvanceTime(1000);

  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check FLUSHDB touches watched keys
  Run({"select", "1"});
  Run({"set", "a", "1"});
  Run({"watch", "a"});
  Run({"flushdb"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecFail);

  // Check multi db watches are not supported.
  Run({"select", "1"});
  Run({"set", "a", "1"});
  Run({"watch", "a"});
  Run({"select", "0"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), ArgType(RespExpr::ERROR));

  // Check watch keys are isolated between databases.
  Run({"set", "a", "1"});
  Run({"watch", "a"});
  Run({"select", "1"});
  Run({"set", "a", "2"});  // changing a on db 1
  Run({"select", "0"});
  Run({"multi"});
  ASSERT_THAT(Run({"exec"}), kExecSuccess);
}

TEST_F(MultiTest, MultiOOO) {
  GTEST_SKIP() << "Command squashing breaks stats";

  auto fb0 = pp_->at(0)->LaunchFiber([&] {
    for (unsigned i = 0; i < 100; i++) {
      Run({"multi"});
      Run({"rpush", "a", "bar"});
      Run({"exec"});
    }
  });

  pp_->at(1)->Await([&] {
    for (unsigned i = 0; i < 100; ++i) {
      Run({"multi"});
      Run({"rpush", "b", "bar"});
      Run({"exec"});
    }
  });

  fb0.Join();
  auto metrics = GetMetrics();

  // OOO works in LOCK_AHEAD mode.
  EXPECT_EQ(200, metrics.shard_stats.tx_ooo_total);
}

// Lua scripts lock their keys ahead and thus can run out of order.
TEST_F(MultiTest, EvalOOO) {
  if (auto config = absl::GetFlag(FLAGS_default_lua_flags); config != "") {
    GTEST_SKIP() << "Skipped EvalOOO test because default_lua_flags is set";
    return;
  }

  // Assign to prevent asyc optimization.
  const char* kScript = "local r = redis.call('MGET', unpack(KEYS)); return 'OK'";

  // Check single call.
  {
    auto resp = Run({"eval", kScript, "3", kKey1, kKey2, kKey3});
    ASSERT_EQ(resp, "OK");
  }

  const int kTimes = 10;
  // Check scripts running on different shards don't block each other.
  {
    auto run = [this, kScript](auto key) {
      for (int i = 0; i < kTimes; i++)
        Run({"eval", kScript, "1", key});
    };

    auto f1 = pp_->at(0)->LaunchFiber([&]() { run(kKeySid0); });
    auto f2 = pp_->at(1)->LaunchFiber([&]() { run(kKeySid1); });

    f1.Join();
    f2.Join();
  }

  auto metrics = GetMetrics();
  auto sum = metrics.coordinator_stats.eval_io_coordination_cnt +
             metrics.coordinator_stats.eval_shardlocal_coordination_cnt;
  EXPECT_EQ(1 + 2 * kTimes, sum);
}

// Run MULTI/EXEC commands in parallel, where each command is:
//        MULTI - SET k1 v - SET k2 v - SET k3 v - EXEC
// but the order of the commands inside appears in any permutation.
TEST_F(MultiTest, MultiContendedPermutatedKeys) {
  constexpr int kRounds = 5;

  auto run = [this](vector<string> keys, bool reversed) {
    int i = 0;
    do {
      Run({"multi"});
      auto apply = [this](auto key) { Run({"set", key, "v"}); };

      if (reversed)
        for_each(keys.rbegin(), keys.rend(), apply);
      else
        for_each(keys.begin(), keys.end(), apply);

      Run({"exec"});
    } while (next_permutation(keys.begin(), keys.end()) || i++ < kRounds);
  };

  vector<string> keys = {kKeySid0, kKeySid1, kKey3};

  auto f1 = pp_->at(1)->LaunchFiber([run, keys]() { run(keys, false); });
  auto f2 = pp_->at(2)->LaunchFiber([run, keys]() { run(keys, true); });

  f1.Join();
  f2.Join();
}

TEST_F(MultiTest, MultiCauseUnblocking) {
  const int kRounds = 10;
  vector<string> keys = {kKeySid0, kKeySid1, kKeySid2};

  auto push = [this, keys]() mutable {
    int i = 0;
    do {
      Run({"multi"});
      for (auto k : keys)
        Run({"lpush", k, "v"});
      Run({"exec"});
    } while (next_permutation(keys.begin(), keys.end()) || i++ < kRounds);
  };

  auto pop = [this, keys]() mutable {
    int i = 0;
    do {
      for (int j = keys.size() - 1; j >= 0; j--)
        ASSERT_THAT(Run({"blpop", keys[j], "0"}), ArrLen(2));
    } while (next_permutation(keys.begin(), keys.end()) || i++ < kRounds);
  };

  auto f1 = pp_->at(1)->LaunchFiber([push]() mutable { push(); });
  auto f2 = pp_->at(2)->LaunchFiber([pop]() mutable { pop(); });

  f1.Join();
  f2.Join();
}

TEST_F(MultiTest, ExecGlobalFallback) {
  Run({"multi"});
  Run({"set", "a", "1"});  // won't run ooo, because it became part of global
  Run({"move", "a", "1"});
  Run({"exec"});
  EXPECT_EQ(1, GetMetrics().coordinator_stats.tx_global_cnt);
}

TEST_F(MultiTest, ScriptFlagsCommand) {
  if (auto flags = absl::GetFlag(FLAGS_default_lua_flags); flags != "") {
    GTEST_SKIP() << "Skipped ScriptFlagsCommand test because default_lua_flags is set";
    return;
  }

  const char* kUndeclared1 = "return redis.call('GET', 'random-key-1');";
  const char* kUndeclared2 = "return redis.call('GET', 'random-key-2');";

  Run({"set", "random-key-1", "works"});
  Run({"set", "random-key-2", "works"});

  // Check SCRIPT FLAGS is applied correctly to loaded scripts.
  {
    auto sha_resp = Run({"script", "load", kUndeclared1});
    auto sha = facade::ToSV(sha_resp.GetBuf());

    EXPECT_THAT(Run({"evalsha", sha, "0"}), ErrArg("undeclared"));

    EXPECT_EQ(Run({"script", "flags", sha, "allow-undeclared-keys"}), "OK");

    EXPECT_THAT(Run({"evalsha", sha, "0"}), "works");
  }

  // Check SCRIPT FLAGS can be applied by sha before loading.
  {
    char sha_buf[41];
    Interpreter::FuncSha1(kUndeclared2, sha_buf);
    string_view sha{sha_buf, 40};

    EXPECT_THAT(Run({"script", "flags", sha, "allow-undeclared-keys"}), "OK");

    EXPECT_THAT(Run({"eval", kUndeclared2, "0"}), "works");
  }
}

TEST_F(MultiTest, ScriptFlagsInvalidSha) {
  EXPECT_THAT(Run({"script", "flags", "short", "allow-undeclared-keys"}), ErrArg(""));
}

TEST_F(MultiTest, ScriptFlagsEmbedded) {
  const char* s1 = R"(
  --!df flags=allow-undeclared-keys
  return redis.call('GET', 'random-key');
)";

  // Check eval finds script flags.
  Run({"set", "random-key", "works"});
  EXPECT_EQ(Run({"eval", s1, "0"}), "works");

  const char* s2 = R"(
  --!df flags=this-is-an-error
  redis.call('SET', 'random-key', 'failed')
  )";

  EXPECT_THAT(Run({"eval", s2, "0"}), ErrArg("Invalid flag: this-is-an-error"));
}

TEST_F(MultiTest, UndeclaredKeyFlag) {
  absl::FlagSaver fs;  // lua_undeclared_keys_shas changed via CONFIG cmd below

  const char* script = "return redis.call('GET', 'random-key');";
  Run({"set", "random-key", "works"});

  // Get SHA for script in a persistent way
  string sha = Run({"script", "load", script}).GetString();

  // Make sure we can't run the script before setting the flag
  EXPECT_THAT(Run({"evalsha", sha, "0"}), ErrArg("undeclared"));
  EXPECT_THAT(Run({"eval", script, "0"}), ErrArg("undeclared"));

  // Clear all Lua scripts so we can configure the cache
  EXPECT_THAT(Run({"script", "flush"}), "OK");
  EXPECT_THAT(Run({"script", "exists", sha}), IntArg(0));

  EXPECT_THAT(
      Run({"config", "set", "lua_undeclared_keys_shas", absl::StrCat(sha, ",NON-EXISTING-HASH")}),
      "OK");

  // Check eval finds script flags.
  EXPECT_EQ(Run({"eval", script, "0"}), "works");
  EXPECT_EQ(Run({"evalsha", sha, "0"}), "works");
}

TEST_F(MultiTest, LegacyFloatFlag) {
  const char* script_with_flag = R"(
  --!df flags=legacy-float
  return 42.9
)";
  EXPECT_THAT(Run({"eval", script_with_flag, "0"}), IntArg(42));

  const char* script_negative = R"(
  --!df flags=legacy-float
  return -3.8
)";
  EXPECT_THAT(Run({"eval", script_negative, "0"}), IntArg(-3));

  EXPECT_THAT(Run({"eval", "return 42.9", "0"}), DoubleArg(42.9));

  const char* script = "return 42.9";
  char sha_buf[41];
  Interpreter::FuncSha1(script, sha_buf);
  string_view sha{sha_buf, 40};

  EXPECT_EQ(Run({"script", "flags", string(sha), "legacy-float"}), "OK");

  EXPECT_THAT(Run({"eval", script, "0"}), IntArg(42));
}

TEST_F(MultiTest, LegacyFloatShaFlag) {
  absl::FlagSaver fs;

  const char* script = "return 42.9";
  string sha = Run({"script", "load", script}).GetString();

  EXPECT_THAT(Run({"evalsha", sha, "0"}), DoubleArg(42.9));

  Run({"script", "flush"});
  EXPECT_THAT(Run({"config", "set", "lua_float_as_int_shas", sha}), "OK");

  EXPECT_THAT(Run({"eval", script, "0"}), IntArg(42));
}

TEST_F(MultiTest, CjsonDecodeIntegerBehavior) {
  // cjson.decode always returns integers for whole numbers (Redis/Lua 5.1 compatible)
  const char* script_cjson = R"(
    local obj = cjson.decode('{"value": 42}')
    return tostring(obj.value)
  )";
  EXPECT_EQ(Run({"eval", script_cjson, "0"}), "42");

  // Floats with fractional parts remain as floats
  const char* script_cjson_float = R"(
    local obj = cjson.decode('{"value": 42.5}')
    return tostring(obj.value)
  )";
  EXPECT_EQ(Run({"eval", script_cjson_float, "0"}), "42.5");
}

TEST_F(MultiTest, ScriptBadCommand) {
  const char* s1 = "redis.call('FLUSHALL')";
  const char* s2 = "redis.call('FLUSHALL'); redis.set(KEYS[1], ARGS[1]);";
  const char* s3 = "redis.acall('FLUSHALL'); redis.set(KEYS[1], ARGS[1]);";
  const char* s4 = R"(
    --!df flags=disable-atomicity
    redis.call('FLUSHALL');
    return "OK";
  )";

  auto resp = Run({"eval", s1, "0"});  // tx won't be scheduled at all
  EXPECT_THAT(resp, ErrArg("This Redis command is not allowed from script"));

  resp = Run({"eval", s2, "1", "works", "false"});  // will be scheduled as lock ahead
  EXPECT_THAT(resp, ErrArg("This Redis command is not allowed from script"));

  resp = Run({"eval", s3, "1", "works", "false"});  // also async call will happen
  EXPECT_THAT(resp, ErrArg("This Redis command is not allowed from script"));

  resp = Run({"eval", s4, "0"});
  EXPECT_EQ(resp, "OK");
}

TEST_F(MultiTest, MultiSquash) {
  string_view script = R"(
redis.call('APPEND', KEYS[1], ARGV[1]);
redis.call('GET', KEYS[1]);
redis.call('APPEND', KEYS[1], ARGV[2])
return 'OK';
)";

  auto resp = Run({"EVAL", script, "1", "A", "works", "reliably"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"EVAL", script, "1", "A", "once", "again"});
  EXPECT_EQ(resp, "OK");

  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.coordinator_stats.eval_shardlocal_coordination_cnt, 2u);
  // EXPECT_EQ(metrics.shard_stats.tx_ooo_total, 2u);

  auto a_expect = absl::StrCat("works", "reliably", "once", "again");
  EXPECT_EQ(Run({"GET", "A"}), a_expect);
}

// Check that single shard script running with allow-undeclared-keys (i.e. global)
// running on a single shard setup can be squashed with "shardlocal" execution
TEST_F(SingleShardMultiTest, MultiSquashGlobalSingleShard) {
  string_view script = R"(
--!df flags=allow-undeclared-keys
redis.call('SET', 'first', 'works');
redis.call('SET', 'second', 'too');
redis.call('SET', 'third', 'as well');
return 'OK';
)";

  auto resp = Run({"EVAL", script, "0"});
  EXPECT_EQ(resp, "OK");

  // Check call was shardlocal and out of order
  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.coordinator_stats.eval_shardlocal_coordination_cnt, 1u);

  EXPECT_EQ(Run({"GET", "first"}), "works");
  EXPECT_EQ(Run({"GET", "second"}), "too");
  EXPECT_EQ(Run({"GET", "third"}), "as well");
}

TEST_F(MultiTest, MultiEvalModeConflict) {
  const char* s1 = R"(
  --!df flags=allow-undeclared-keys
  return redis.call('GET', 'random-key');
)";

  EXPECT_EQ(Run({"multi"}), "OK");
  // Check eval finds script flags.
  EXPECT_EQ(Run({"set", "random-key", "works"}), "QUEUED");
  EXPECT_EQ(Run({"eval", s1, "0"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}),
              RespArray(ElementsAre(
                  "OK", ErrArg("Multi mode conflict when running eval in multi transaction"))));
}

// Run multi-exec transactions that move values from a source list
// to destination list through two contended channels.
TEST_F(MultiTest, ContendedList) {
  constexpr int listSize = 50;
  constexpr int stepSize = 5;

  auto run = [this](string_view src, string_view dest) {
    for (int i = 0; i < listSize / stepSize; i++) {
      Run({"multi"});
      Run({"sort", src});
      for (int j = 0; j < stepSize; j++)
        Run({"lmove", src, j % 2 ? "chan-1" : "chan-2", "RIGHT", "RIGHT"});
      for (int j = 0; j < stepSize; j++)
        Run({"lmove", j % 2 ? "chan-1" : "chan-2", dest, "LEFT", "RIGHT"});
      Run({"exec"});
    }
  };

  for (int i = 0; i < listSize; i++) {
    Run({"lpush", "l1", "a"});
    Run({"lpush", "l2", "b"});
  }

  auto f1 = pp_->at(1)->LaunchFiber([run]() mutable { run("l1", "l1-out"); });
  auto f2 = pp_->at(2)->LaunchFiber([run]() mutable { run("l2", "l2-out"); });

  f1.Join();
  f2.Join();

  for (int i = 0; i < listSize; i++) {
    EXPECT_EQ(Run({"lpop", "l1-out"}), "a");
    EXPECT_EQ(Run({"lpop", "l2-out"}), "b");
  }

  EXPECT_THAT(Run({"llen", "chan-1"}), IntArg(0));
  EXPECT_THAT(Run({"llen", "chan-2"}), IntArg(0));
}

// Test that squashing makes single-key ops atomic withing a non-atomic tx
// because it runs them within one hop.
TEST_F(MultiTest, TestSquashing) {
  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_multi_exec_squash, true);

  const char* keys[] = {kKeySid0, kKeySid1, kKeySid2};

  atomic_bool done{false};
  auto f1 = pp_->at(1)->LaunchFiber([this, keys, &done]() {
    while (!done.load()) {
      for (auto key : keys)
        ASSERT_THAT(Run({"llen", key}), IntArg(0));
    }
  });

  for (unsigned times = 0; times < 10; times++) {
    Run({"multi"});
    for (auto key : keys)
      Run({"lpush", key, "works"});
    for (auto key : keys)
      Run({"lpop", key});
    Run({"exec"});
  }

  done.store(true);
  f1.Join();

  // Test some more unusual commands
  Run({"multi"});
  Run({"mget", "x1", "x2", "x3"});
  Run({"mget", "x4"});
  Run({"mget", "x5", "x6", "x7", "x8"});
  Run({"ft.search", "i1", "*"});
  Run({"exec"});
}

TEST_F(MultiTest, MultiLeavesTxQueue) {
  // Tests the scenario, where the OOO multi-tx is scheduled into tx queue and there is another
  // tx (mget) after it that runs and tests for atomicity.
  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_multi_exec_squash, false);

  for (unsigned i = 0; i < 20; ++i) {
    string key = StrCat("x", i);
    LOG(INFO) << key << ": shard " << Shard(key, shard_set->size());
  }

  Run({"mget", "x5", "x8", "x9", "x13", "x16", "x17"});
  ASSERT_EQ(1, GetDebugInfo().shards_count);

  auto fb1 = pp_->at(1)->LaunchFiber(Launch::post, [&] {
    // Runs multi on shard0 1000 times.
    for (unsigned j = 0; j < 1000; ++j) {
      Run({"multi"});
      Run({"incrby", "x13", "1"});
      Run({"incrby", "x16", "1"});
      Run({"incrby", "x17", "1"});
      Run({"exec"});
    }
  });

  auto fb2 = pp_->at(2)->LaunchFiber(Launch::dispatch, [&] {
    // Runs multi on shard0 1000 times.
    for (unsigned j = 0; j < 1000; ++j) {
      Run({"multi"});
      Run({"incrby", "x5", "1"});
      Run({"incrby", "x8", "1"});
      Run({"incrby", "x9", "1"});
      Run({"exec"});
    }
  });

  auto check_triple = [](const RespExpr::Vec& arr, unsigned start) {
    if (arr[start].type != arr[start + 1].type || arr[start + 1].type != arr[start + 2].type) {
      return false;
    }

    if (arr[start].type == RespExpr::STRING) {
      string s0 = arr[start].GetString();
      string s1 = arr[start + 1].GetString();
      string s2 = arr[start + 2].GetString();
      if (s0 != s1 || s1 != s2) {
        return false;
      }
    }
    return true;
  };

  bool success = pp_->at(0)->Await([&]() -> bool {
    for (unsigned j = 0; j < 1000; ++j) {
      auto resp = Run({"mget", "x5", "x8", "x9", "x13", "x16", "x17"});
      const RespExpr::Vec& arr = resp.GetVec();
      CHECK_EQ(6u, arr.size());

      if (!check_triple(arr, 0)) {
        LOG(ERROR) << "inconsistent " << arr[0] << " " << arr[1] << " " << arr[2];
        return false;
      }
      if (!check_triple(arr, 3)) {
        LOG(ERROR) << "inconsistent " << arr[3] << " " << arr[4] << " " << arr[5];
        return false;
      }
    }
    return true;
  });

  fb1.Join();
  fb2.Join();
  ASSERT_TRUE(success);
}

TEST_F(MultiTest, TestLockedKeys) {
  auto condition = [&]() { return IsLocked(0, "key1") && IsLocked(0, "key2"); };
  auto fb = ExpectConditionWithSuspension(condition);

  EXPECT_EQ(Run({"multi"}), "OK");
  EXPECT_EQ(Run({"set", "key1", "val1"}), "QUEUED");
  EXPECT_EQ(Run({"set", "key2", "val2"}), "QUEUED");
  EXPECT_EQ(Run({"mset", "key1", "val3", "key1", "val4"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK", "OK")));
  fb.Join();
  EXPECT_FALSE(IsLocked(0, "key1"));
  EXPECT_FALSE(IsLocked(0, "key2"));
}

TEST_F(MultiTest, EvalExpiration) {
  // Make sure expiration is correctly set even from Lua scripts
  if (auto config = absl::GetFlag(FLAGS_default_lua_flags); config != "") {
    GTEST_SKIP() << "Skipped Eval test because default_lua_flags is set";
    return;
  }

  Run({"eval", "redis.call('set', 'x', 0, 'ex', 5, 'nx')", "1", "x"});
  EXPECT_LE(CheckedInt({"pttl", "x"}), 5000);
}

TEST_F(MultiTest, MemoryInScript) {
  EXPECT_EQ(Run({"set", "x", "y"}), "OK");

  auto resp = Run({"eval", "return redis.call('MEMORY', 'USAGE', KEYS[1])", "1", "x"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(MultiTest, NoKeyTransactional) {
  Run({"multi"});
  Run({"ft._list"});
  Run({"exec"});
}

TEST_F(MultiTest, NoKeyTransactionalMany) {
  vector<vector<string>> cmds;
  cmds.push_back({"rename", "x", "z"});
  cmds.push_back({"ft._list"});
  RunMany(cmds);
}

class MultiEvalTest : public BaseFamilyTest {
 protected:
  MultiEvalTest() : BaseFamilyTest() {
    num_threads_ = kPoolThreadCount;
    absl::SetFlag(&FLAGS_default_lua_flags, "allow-undeclared-keys");
  }

  absl::FlagSaver fs_;
};

TEST_F(MultiEvalTest, MultiAllEval) {
  RespExpr brpop_resp;

  // Run the fiber at creation.
  auto fb0 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    brpop_resp = Run({"brpop", "x", "1"});
  });
  Run({"multi"});
  Run({"eval", "return redis.call('lpush', 'x', 'y')", "0"});
  Run({"eval", "return redis.call('lpop', 'x')", "0"});
  RespExpr exec_resp = Run({"exec"});
  fb0.Join();

  EXPECT_THAT(exec_resp.GetVec(), ElementsAre(IntArg(1), "y"));

  EXPECT_THAT(brpop_resp, ArgType(RespExpr::NIL_ARRAY));
}

TEST_F(MultiEvalTest, MultiSomeEval) {
  RespExpr brpop_resp;

  // Run the fiber at creation.
  auto fb0 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    brpop_resp = Run({"brpop", "x", "1"});
  });
  Run({"multi"});
  Run({"eval", "return redis.call('lpush', 'x', 'y')", "0"});
  Run({"lpop", "x"});
  RespExpr exec_resp = Run({"exec"});
  fb0.Join();

  EXPECT_THAT(exec_resp.GetVec(), ElementsAre(IntArg(1), "y"));

  EXPECT_THAT(brpop_resp, ArgType(RespExpr::NIL_ARRAY));
}

TEST_F(MultiEvalTest, ScriptSquashingUknownCmd) {
  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_lua_auto_async, true);

  // The script below contains two commands for which execution can't even be prepared
  // (FIRST/SECOND WRONG). The first is issued with pcall, so its error should be completely
  // ignored, the second one should cause an abort and no further commands should be executed
  string_view s = R"(
    redis.pcall('INCR', 'A')
    redis.pcall('FIRST WRONG')
    redis.pcall('INCR', 'A')
    redis.call('SECOND WRONG')
    redis.pcall('INCR', 'A')
  )";

  EXPECT_THAT(Run({"EVAL", s, "1", "A"}), ErrArg("unknown command `SECOND WRONG`"));
  EXPECT_EQ(Run({"get", "A"}), "2");
}

TEST_F(MultiEvalTest, MultiAndEval) {
  // We had a bug in borrowing interpreters which caused a crash in this scenario
  Run({"multi"});
  Run({"eval", "return redis.call('set', 'x', 'y1')", "1", "x"});
  Run({"exec"});

  Run({"eval", "return redis.call('set', 'x', 'y1')", "1", "x"});

  Run({"multi"});
  Run({"eval", "return 'OK';", "0"});
  auto resp = Run({"exec"});
  EXPECT_EQ(resp, "OK");

  // We had a bug running script load inside multi
  Run({"multi"});
  Run({"script", "load", "return '5'"});
  Run({"exec"});

  Run({"multi"});
  Run({"script", "load", "return '5'"});
  Run({"get", "x"});
  Run({"exec"});

  Run({"multi"});
  Run({"script", "load", "return '5'"});
  Run({"mset", "x1", "y1", "x2", "y2"});
  Run({"exec"});

  Run({"multi"});
  Run({"script", "load", "return '5'"});
  Run({"eval", "return redis.call('set', 'x', 'y')", "1", "x"});
  Run({"get", "x"});
  Run({"exec"});

  Run({"get", "x"});
}

TEST_F(MultiTest, MultiTypes) {
  // we had a bug with namespaces for type command in multi/exec
  EXPECT_THAT(Run({"multi"}), "OK");
  EXPECT_THAT(Run({"type", "sdfx3"}), "QUEUED");
  EXPECT_THAT(Run({"type", "asdasd2"}), "QUEUED");
  EXPECT_THAT(Run({"type", "wer124"}), "QUEUED");
  EXPECT_THAT(Run({"type", "asafdasd"}), "QUEUED");
  EXPECT_THAT(Run({"type", "dsfgser"}), "QUEUED");
  EXPECT_THAT(Run({"type", "erg2"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}),
              RespArray(ElementsAre("none", "none", "none", "none", "none", "none")));
}

TEST_F(MultiTest, EvalRo) {
  RespExpr resp;

  resp = Run({"set", "foo", "bar"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"eval_ro", "return redis.call('get', KEYS[1])", "1", "foo"});
  EXPECT_THAT(resp, "bar");

  resp = Run({"eval_ro", "return redis.call('set', KEYS[1], 'car')", "1", "foo"});
  EXPECT_THAT(resp, ErrArg("Write commands are not allowed from read-only scripts"));
}

TEST_F(MultiTest, EvalShaRo) {
  RespExpr resp;

  const char* read_script = "return redis.call('get', KEYS[1]);";
  const char* write_script = "return redis.call('set', KEYS[1], 'car');";

  auto sha_resp = Run({"script", "load", read_script});
  auto read_sha = facade::ToSV(sha_resp.GetBuf());
  sha_resp = Run({"script", "load", write_script});
  auto write_sha = facade::ToSV(sha_resp.GetBuf());

  resp = Run({"set", "foo", "bar"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"evalsha_ro", read_sha, "1", "foo"});
  EXPECT_THAT(resp, "bar");

  resp = Run({"evalsha_ro", write_sha, "1", "foo"});
  EXPECT_THAT(resp, ErrArg("Write commands are not allowed from read-only scripts"));
}

TEST_F(MultiTest, EvalSelect) {
  string_view script = R"(--!df flags=X
redis.call('SET', 'A', ARGV[1])
redis.call('SELECT', '1')
redis.call('SET', 'A', ARGV[2])
return 'OK';
)";
  auto script_global = absl::StrReplaceAll(script, {{"X", "allow-undeclared-keys"}});
  auto resp = Run({"EVAL", script_global, "0", "G1", "G2"});
  EXPECT_EQ(resp, "OK");

  Run({"SELECT", "0"});
  EXPECT_EQ(Run({"GET", "A"}), "G1");
  Run({"SELECT", "1"});
  EXPECT_EQ(Run({"GET", "A"}), "G2");
  Run({"SELECT", "0"});

  auto script_nonatomic = absl::StrReplaceAll(script, {{"X", "disable-atomicity"}});
  resp = Run({"EVAL", script_nonatomic, "0", "G3", "G4"});
  EXPECT_EQ(resp, "OK");

  Run({"SELECT", "0"});
  EXPECT_EQ(Run({"GET", "A"}), "G3");
  Run({"SELECT", "1"});
  EXPECT_EQ(Run({"GET", "A"}), "G4");
  Run({"SELECT", "0"});

  // Don't allow in regular transactions
  string_view script_fail = R"(
redis.call('SET', KEYS[1], ARGV[1])
redis.call('SELECT', '1')
redis.call('SET', KEYS[1], ARGV[1])
)";
  resp = Run({"EVAL", script_fail, "1", "A", "wont-work"});
  EXPECT_THAT(resp, ErrArg("SELECT is not allowed in regular"));
}

TEST_F(MultiTest, StoredCmdBytesMetric) {
  ASSERT_EQ(GetMetrics().coordinator_stats.stored_cmd_bytes, 0);

  RespExpr resp = Run({"multi"});
  ASSERT_EQ(resp, "OK");

  for (auto i = 0; i < 100; ++i) {
    ASSERT_EQ(Run({"get", kKey1}), "QUEUED");
  }

  ASSERT_GT(GetMetrics().coordinator_stats.stored_cmd_bytes, 0);

  resp = Run({"exec"});
  ASSERT_THAT(resp, ArrLen(100));
  ASSERT_THAT(resp.GetVec(), Contains(ArgType(RespExpr::NIL)).Times(100));
  ASSERT_EQ(GetMetrics().coordinator_stats.stored_cmd_bytes, 0);
}

// Verify that lazy expiration works inside EVAL running in global mode.
// Previously, the shard_lock()->Check(EXCLUSIVE) guard in ExpireIfNeeded
// prevented lazy expiry while a global transaction held the shard lock,
// causing expired keys to be returned as if they were still alive.
TEST_F(MultiTest, EvalGlobalLazyExpire) {
  // Set key with TTL, advance time past expiry, then read via global EVAL.
  // The global shard lock blocks heartbeat during EVAL, so active expiry
  // cannot delete the key — only lazy expiry inside GET can.
  Run({"set", "key", "val", "px", "10"});
  AdvanceTime(100);

  constexpr char kScript[] = R"(
--!df flags=allow-undeclared-keys
return redis.call('GET', KEYS[1])
)";

  auto resp = Run({"eval", kScript, "1", "key"});
  ASSERT_THAT(resp, ArgType(RespExpr::NIL));
}

}  // namespace dfly


================================================
FILE: src/server/namespaces.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/namespaces.h"

#include "base/flags.h"
#include "base/logging.h"
#include "server/blocking_controller.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"

ABSL_DECLARE_FLAG(bool, cache_mode);

namespace dfly {

using namespace std;

Namespace::Namespace() {
  shard_db_slices_.resize(shard_set->size());
  shard_blocking_controller_.resize(shard_set->size());
  shard_set->RunBriefInParallel([&](EngineShard* es) {
    CHECK(es != nullptr);
    ShardId sid = es->shard_id();
    shard_db_slices_[sid] = make_unique<DbSlice>(sid, absl::GetFlag(FLAGS_cache_mode), es);
    shard_db_slices_[sid]->UpdateExpireBase(absl::GetCurrentTimeNanos() / 1000000, 0);
  });
}

DbSlice& Namespace::GetCurrentDbSlice() {
  EngineShard* es = EngineShard::tlocal();
  CHECK(es != nullptr);
  return GetDbSlice(es->shard_id());
}

DbSlice& Namespace::GetDbSlice(ShardId sid) {
  CHECK_LT(sid, shard_db_slices_.size());
  return *shard_db_slices_[sid];
}

BlockingController* Namespace::GetOrAddBlockingController(EngineShard* shard) {
  if (!shard_blocking_controller_[shard->shard_id()]) {
    shard_blocking_controller_[shard->shard_id()] = make_unique<BlockingController>(shard, this);
  }

  return shard_blocking_controller_[shard->shard_id()].get();
}

BlockingController* Namespace::GetBlockingController(ShardId sid) {
  return shard_blocking_controller_[sid].get();
}

Namespaces::Namespaces() {
  default_namespace_ = &GetOrInsert("");
}

Namespaces::~Namespaces() {
  Clear();
}

void Namespaces::Clear() {
  util::fb2::LockGuard guard(mu_);

  default_namespace_ = nullptr;

  if (namespaces_.empty()) {
    return;
  }

  shard_set->RunBriefInParallel([&](EngineShard* es) {
    CHECK(es != nullptr);
    for (auto& ns : ABSL_TS_UNCHECKED_READ(namespaces_)) {
      ns.second.shard_db_slices_[es->shard_id()].reset();
    }
  });

  namespaces_.clear();
}

Namespace& Namespaces::GetDefaultNamespace() const {
  CHECK(default_namespace_ != nullptr);
  return *default_namespace_;
}

Namespace& Namespaces::GetOrInsert(std::string_view ns) {
  {
    // Try to look up under a shared lock
    dfly::SharedLock guard(mu_);
    auto it = namespaces_.find(ns);
    if (it != namespaces_.end()) {
      return it->second;
    }
  }

  {
    // Key was not found, so we create create it under unique lock
    util::fb2::LockGuard guard(mu_);
    return namespaces_[ns];
  }
}

}  // namespace dfly


================================================
FILE: src/server/namespaces.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/node_hash_map.h>

#include <memory>
#include <string>
#include <vector>

#include "server/common_types.h"
#include "util/fibers/synchronization.h"

namespace dfly {

class BlockingController;
class DbSlice;
class EngineShard;

// A Namespace is a way to separate and isolate different databases in a single instance.
// It can be used to allow multiple tenants to use the same server without hacks of using a common
// prefix, or SELECT-ing a different database.
// Each Namespace contains per-shard DbSlice, as well as a BlockingController.
class Namespace {
 public:
  Namespace();

  DbSlice& GetCurrentDbSlice();

  DbSlice& GetDbSlice(ShardId sid);
  BlockingController* GetOrAddBlockingController(EngineShard* shard);
  BlockingController* GetBlockingController(ShardId sid);

 private:
  std::vector<std::unique_ptr<DbSlice>> shard_db_slices_;
  std::vector<std::unique_ptr<BlockingController>> shard_blocking_controller_;

  friend class Namespaces;
};

// Namespaces is a registry and container for Namespace instances.
// Each Namespace has a unique string name, which identifies it in the store.
// Any attempt to access a non-existing Namespace will first create it, add it to the internal map
// and will then return it.
// It is currently impossible to remove a Namespace after it has been created.
// The default Namespace can be accessed via either GetDefaultNamespace() (which guarantees not to
// yield), or via the GetOrInsert() with an empty string.
// The initialization order of this class with the engine shards is slightly subtle, as they have
// mutual dependencies.
class Namespaces {
 public:
  Namespaces();
  ~Namespaces();

  void Clear() ABSL_LOCKS_EXCLUDED(mu_);  // Thread unsafe, use in tear-down or tests

  Namespace& GetDefaultNamespace() const;  // No locks
  Namespace& GetOrInsert(std::string_view ns) ABSL_LOCKS_EXCLUDED(mu_);

 private:
  util::fb2::SharedMutex mu_{};
  absl::node_hash_map<std::string, Namespace> namespaces_ ABSL_GUARDED_BY(mu_);
  Namespace* default_namespace_ = nullptr;
};

}  // namespace dfly


================================================
FILE: src/server/protocol_client.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/protocol_client.h"

#include "facade/tls_helpers.h"

extern "C" {
#include "redis/rdb.h"
}

#include <absl/cleanup/cleanup.h>
#include <absl/flags/flag.h>
#include <absl/functional/bind_front.h>
#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/strip.h>

#include <boost/asio/ip/tcp.hpp>
#include <string>

#include "base/logging.h"
#include "facade/dragonfly_connection.h"
#include "facade/redis_parser.h"
#include "facade/reply_builder.h"
#include "facade/socket_utils.h"
#include "server/error.h"
#include "server/journal/executor.h"
#include "server/journal/serializer.h"
#include "server/main_service.h"
#include "server/rdb_load.h"
#include "strings/human_readable.h"
#include "util/fibers/dns_resolve.h"

#ifdef DFLY_USE_SSL
#include "util/tls/tls_socket.h"
#endif

ABSL_FLAG(std::string, masteruser, "", "username for authentication with master");
ABSL_FLAG(std::string, masterauth, "", "password for authentication with master");
ABSL_FLAG(bool, tls_replication, false, "Enable TLS on replication");

ABSL_DECLARE_FLAG(std::string, tls_cert_file);
ABSL_DECLARE_FLAG(std::string, tls_key_file);
ABSL_DECLARE_FLAG(std::string, tls_ca_cert_file);
ABSL_DECLARE_FLAG(std::string, tls_ca_cert_dir);

namespace dfly {

using namespace std;
using namespace util;
using namespace boost::asio;
using namespace facade;
using absl::GetFlag;
using absl::StrCat;

error_code ProtocolClient::Recv(FiberSocketBase* input, base::IoBuf* dest) {
  auto buf = dest->AppendBuffer();
  io::Result<size_t> exp_size = input->Recv(buf);
  if (!exp_size) {
    LOG(WARNING) << "Socket error " << exp_size.error();
    return exp_size.error();
  }

  if (*exp_size == 0) {
    VLOG(1) << "Connection closed by peer";
    return make_error_code(errc::connection_aborted);
  }

  TouchIoTime();

  dest->CommitWrite(*exp_size);
  return error_code{};
}

std::string ProtocolClient::ServerContext::Description() const {
  return absl::StrCat(host, ":", port);
}

void ValidateClientTlsFlags() {
  if (!GetFlag(FLAGS_tls_replication)) {
    return;
  }

  bool has_auth = false;

  if (!GetFlag(FLAGS_tls_key_file).empty()) {
    if (GetFlag(FLAGS_tls_cert_file).empty()) {
      LOG(ERROR) << "tls_cert_file flag should be set";
      exit(1);
    }
    has_auth = true;
  }

  if (!GetFlag(FLAGS_masterauth).empty())
    has_auth = true;

  if (!has_auth) {
    LOG(ERROR) << "No authentication method configured!";
    exit(1);
  }
}

#ifdef DFLY_USE_SSL
void ProtocolClient::MaybeInitSslCtx() {
  if (GetFlag(FLAGS_tls_replication)) {
    ssl_ctx_ = CreateSslCntx(facade::TlsContextRole::CLIENT);
  }
}
#endif

ProtocolClient::ProtocolClient(string host, uint16_t port) {
  server_context_.host = std::move(host);
  server_context_.port = port;
#ifdef DFLY_USE_SSL
  MaybeInitSslCtx();
#endif
}
ProtocolClient::ProtocolClient(ServerContext context) : server_context_(std::move(context)) {
#ifdef DFLY_USE_SSL
  MaybeInitSslCtx();
#endif
}

ProtocolClient::~ProtocolClient() {
  exec_st_.JoinErrorHandler();

#ifdef DFLY_USE_SSL
  if (ssl_ctx_) {
    SSL_CTX_free(ssl_ctx_);
  }
#endif
}

error_code ProtocolClient::ResolveHostDns() {
  char ip_addr[INET6_ADDRSTRLEN];

  // IPv6 address can be enclosed in square brackets.
  // https://www.rfc-editor.org/rfc/rfc2732#section-2
  // We need to remove the brackets before resolving the DNS.
  // Enclosed IPv6 addresses can't be resolved by the DNS resolver.
  std::string host = server_context_.host;
  if (!host.empty() && host.front() == '[' && host.back() == ']') {
    host = host.substr(1, host.size() - 2);
  }

  auto ec = util::fb2::DnsResolve(host, 0, ip_addr, ProactorBase::me());
  if (ec) {
    LOG(ERROR) << "Dns error " << ec << ", host: " << server_context_.host;
    return make_error_code(errc::host_unreachable);
  }

  LOG_IF(INFO, std::string(ip_addr) != server_context_.host)
      << "Resolved endpoint " << server_context_.Description() << " to " << ip_addr << ":"
      << server_context_.port;
  server_context_.endpoint = {ip::make_address(ip_addr), server_context_.port};

  return error_code{};
}

error_code ProtocolClient::ConnectAndAuth(std::chrono::milliseconds connect_timeout_ms,
                                          ExecutionState* cntx) {
  ProactorBase* mythread = ProactorBase::me();
  CHECK(mythread);
  {
    unique_lock lk(sock_mu_);
    // The context closes sock_. So if the context error handler has already
    // run we must not create a new socket. sock_mu_ syncs between the two
    // functions.
    if (cntx->IsRunning()) {
      if (sock_) {
        LOG_IF(WARNING, sock_->Close()) << "Error closing socket";
        sock_.reset(nullptr);
      }

      if (ssl_ctx_) {
#ifdef DFLY_USE_SSL
        auto tls_sock = std::make_unique<tls::TlsSocket>(mythread->CreateSocket());
        tls_sock->InitSSL(ssl_ctx_);
        sock_ = std::move(tls_sock);
#endif
      } else {
        sock_.reset(mythread->CreateSocket());
      }
    } else {
      return cntx->GetError();
    }
  }

  // We set this timeout because this call blocks other REPLICAOF commands. We don't need it for the
  // rest of the sync.
  {
    uint32_t timeout = sock_->timeout();
    sock_->set_timeout(connect_timeout_ms.count());
    RETURN_ON_ERR(sock_->Connect(server_context_.endpoint));
    sock_->set_timeout(timeout);
  }

  // For idle connections we enable TCP keepalive to prevent disconnects.
  int yes = 1;
  if (setsockopt(sock_->native_handle(), SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes)) == 0) {
    int intv = 300;
#ifdef __APPLE__
    setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPALIVE, &intv, sizeof(intv));
#else
    setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPIDLE, &intv, sizeof(intv));
#endif

    intv /= 3;
    setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPINTVL, &intv, sizeof(intv));

    intv = 3;
    setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPCNT, &intv, sizeof(intv));
  }

  // CHECK_EQ(0, setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes)));

  auto masterauth = GetFlag(FLAGS_masterauth);
  auto masteruser = GetFlag(FLAGS_masteruser);
  ResetParser(RedisParser::Mode::CLIENT);
  if (!masterauth.empty()) {
    auto cmd = masteruser.empty() ? StrCat("AUTH ", masterauth)
                                  : StrCat("AUTH ", masteruser, " ", masterauth);
    RETURN_ON_ERR(SendCommandAndReadResponse(cmd));
    last_cmd_ = "AUTH";  // Make sure the password is not printed to logs
    PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));
  }
  return error_code{};
}

void ProtocolClient::ShutdownSocketImpl(bool should_close) {
  unique_lock lk(sock_mu_);
  if (sock_) {
    sock_->proactor()->Await([this, should_close] {
      if (sock_->IsOpen()) {
        auto ec = sock_->Shutdown(SHUT_RDWR);
        LOG_IF(ERROR, ec) << "Could not shutdown socket " << ec;
      }
      if (should_close) {
        auto ec = sock_->Close();  // Quietly close.
        LOG_IF(WARNING, ec) << "Error closing socket " << ec << "/" << ec.message();
      }
    });
  }
}

void ProtocolClient::CloseSocket() {
  return ShutdownSocketImpl(true);
}

void ProtocolClient::ShutdownSocket() {
  return ShutdownSocketImpl(false);
}

void ProtocolClient::DefaultErrorHandler(const GenericError& err) {
  LOG(WARNING) << "Socket error: " << err.Format() << " in " << server_context_.Description()
               << ", socket info: " << GetSocketInfo(sock_ ? sock_->native_handle() : -1);
  ShutdownSocket();
}

io::Result<ProtocolClient::ReadRespRes> ProtocolClient::ReadRespReply(base::IoBuf* buffer,
                                                                      bool copy_msg) {
  DCHECK(parser_);

  error_code ec;
  if (!buffer) {
    buffer = &resp_buf_;
    buffer->Clear();
  }
  last_resp_ = "";

  uint32_t processed_bytes = 0;

  RedisParser::Result result = RedisParser::OK;
  while (!ec) {
    uint32_t consumed;
    if (buffer->InputLen() == 0 || result == RedisParser::INPUT_PENDING) {
      DCHECK_GT(buffer->AppendLen(), 0u);

      ec = Recv(sock_.get(), buffer);
      if (ec) {
        return nonstd::make_unexpected(ec);
      }
    }

    result = parser_->Parse(buffer->InputBuffer(), &consumed, &resp_args_);
    processed_bytes += consumed;
    if (copy_msg)
      last_resp_ +=
          std::string_view(reinterpret_cast<char*>(buffer->InputBuffer().data()), consumed);

    if (result == RedisParser::OK) {
      return ReadRespRes{processed_bytes, consumed};  // success path
    }

    buffer->ConsumeInput(consumed);

    if (result != RedisParser::INPUT_PENDING) {
      LOG(ERROR) << "Invalid parser status " << result << " for response " << last_resp_;
      return nonstd::make_unexpected(std::make_error_code(std::errc::bad_message));
    }

    // We need to read more data. Check that we have enough space.
    if (buffer->AppendLen() < 64u) {
      buffer->EnsureCapacity(buffer->Capacity() * 2);
    }
  }

  return nonstd::make_unexpected(ec);
}

io::Result<ProtocolClient::ReadRespRes> ProtocolClient::ReadRespReply(uint32_t timeout) {
  auto prev_timeout = sock_->timeout();
  sock_->set_timeout(timeout);
  auto res = ReadRespReply();
  sock_->set_timeout(prev_timeout);
  return res;
}

io::Result<dfly::RESPObj> ProtocolClient::TakeRespReply(uint32_t timeout, base::IoBuf* buffer,
                                                        bool copy_msg) {
  auto prev_timeout = sock_->timeout();
  sock_->set_timeout(timeout);
  absl::Cleanup on_exit([this, prev_timeout]() { sock_->set_timeout(prev_timeout); });

  error_code ec;
  if (!buffer) {
    buffer = &resp_buf_;
  }

  last_resp_ = "";

  uint32_t processed_bytes = 0;
  std::optional<dfly::RESPObj> resp;

  do {
    resp = resp_parser_.Feed(nullptr, 0);  // check if previous data produced a reply
    if (resp && !resp->Empty()) {
      VLOG(2) << "return reply from previous data read";
      return std::move(resp).value();  // success path
    }
    if (buffer->InputLen() == 0) {
      DCHECK_GT(buffer->AppendLen(), 0u);
      ec = Recv(sock_.get(), buffer);
      if (ec) {
        VLOG(2) << "error socket reading reply: " << ec;
        return nonstd::make_unexpected(ec);
      }
    }

    auto input_buf = buffer->InputBuffer();
    resp = resp_parser_.Feed(reinterpret_cast<char*>(input_buf.data()), input_buf.size());
    processed_bytes += input_buf.size();
    if (copy_msg)
      last_resp_ +=
          std::string_view(reinterpret_cast<char*>(buffer->InputBuffer().data()), input_buf.size());

    buffer->ConsumeInput(input_buf.size());
    if (resp && !resp->Empty()) {
      VLOG(2) << "successfully parsed readed reply";
      return std::move(resp).value();  // success path
    }

    // We need to read more data. Check that we have enough space.
    if (buffer->AppendLen() < 64u) {
      buffer->EnsureCapacity(buffer->Capacity() * 2);
    }
  } while (resp);

  VLOG(2) << "protocol issue";
  return nonstd::make_unexpected(std::make_error_code(std::errc::bad_message));
}

error_code ProtocolClient::ReadLine(base::IoBuf* io_buf, string_view* line) {
  size_t eol_pos;
  std::string_view input_str = ToSV(io_buf->InputBuffer());

  // consume whitespace.
  while (true) {
    auto it = find_if_not(input_str.begin(), input_str.end(), absl::ascii_isspace);
    size_t ws_len = it - input_str.begin();
    io_buf->ConsumeInput(ws_len);
    input_str = ToSV(io_buf->InputBuffer());
    if (!input_str.empty())
      break;

    RETURN_ON_ERR(Recv(sock_.get(), io_buf));
    input_str = ToSV(io_buf->InputBuffer());
  };

  // find eol.
  while (true) {
    eol_pos = input_str.find('\n');

    if (eol_pos != std::string_view::npos) {
      DCHECK_GT(eol_pos, 0u);  // can not be 0 because then would be consumed as a whitespace.
      if (input_str[eol_pos - 1] != '\r') {
        break;
      }
      *line = input_str.substr(0, eol_pos - 1);
      return error_code{};
    }

    RETURN_ON_ERR(Recv(sock_.get(), io_buf));
    input_str = ToSV(io_buf->InputBuffer());
  }

  LOG(ERROR) << "Bad replication header: " << input_str;
  return std::make_error_code(std::errc::illegal_byte_sequence);
}

bool ProtocolClient::CheckRespIsSimpleReply(string_view reply) const {
  return resp_args_.size() == 1 && resp_args_.front().type == RespExpr::STRING &&
         ToSV(resp_args_.front().GetBuf()) == reply;
}

bool ProtocolClient::CheckRespSimpleError(string_view error) const {
  return resp_args_.size() == 1 && resp_args_.front().type == RespExpr::ERROR &&
         ToSV(resp_args_.front().GetBuf()) == error;
}

bool ProtocolClient::CheckRespFirstTypes(initializer_list<RespExpr::Type> types) const {
  unsigned i = 0;
  for (RespExpr::Type type : types) {
    if (i >= resp_args_.size() || resp_args_[i].type != type)
      return false;
    ++i;
  }
  return true;
}

error_code ProtocolClient::SendCommand(string_view command) {
  string formatted_command = RedisReplyBuilderBase::SerializeCommand(command);
  DCHECK(sock_->proactor() == ProactorBase::me());
  auto ec = sock_->Write(io::Buffer(formatted_command));
  if (!ec)
    TouchIoTime();
  return ec;
}

error_code ProtocolClient::SendCommandAndReadResponse(string_view command) {
  last_cmd_ = command;
  if (auto ec = SendCommand(command); ec)
    return ec;
  auto response_res = ReadRespReply();
  return response_res.has_value() ? error_code{} : response_res.error();
}

void ProtocolClient::ResetParser(RedisParser::Mode mode) {
  // We accept any length for the parser because it has been approved by the master.
  parser_.reset(new RedisParser(mode));
}

uint64_t ProtocolClient::LastIoTime() const {
  return last_io_time_.load(std::memory_order_relaxed);
}

void ProtocolClient::TouchIoTime() {
  last_io_time_.store(Proactor()->GetMonotonicTimeNs(), std::memory_order_relaxed);
}

}  // namespace dfly


================================================
FILE: src/server/protocol_client.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/strings/escaping.h>

#include <queue>
#include <variant>

#include "facade/facade_types.h"
#include "facade/redis_parser.h"
#include "facade/resp_parser.h"
#include "io/io_buf.h"
#include "server/execution_state.h"
#include "server/version.h"
#include "util/fiber_socket_base.h"

#ifdef DFLY_USE_SSL
#include <openssl/ssl.h>
#endif

namespace dfly {

class Service;
class ConnectionContext;
class JournalExecutor;
struct JournalReader;

void ValidateClientTlsFlags();

// A helper class for implementing a Redis client that talks to a redis server.
// This class should be inherited from.
class ProtocolClient {
 public:
#ifdef DFLY_USE_SSL
  using SSL_CTX = struct ssl_ctx_st;
#endif

  ProtocolClient(std::string master_host, uint16_t port);
  virtual ~ProtocolClient();

  // First Shutdown() the socket and immediately Close() it.
  // Any attempt for IO in the socket after Close() will crash with CHECK fail.
  void CloseSocket();

  // Shutdown the underline socket but do not Close() it. By decoupling this, api
  // callers can shutdown the socket, wait for the relevant flows to gracefully exit
  // (by observing during an IO operation that the socket was shut down) and then finally
  // Close() the socket.
  void ShutdownSocket();

  uint64_t LastIoTime() const;
  void TouchIoTime();

  const std::string& GetHost() const {
    return server().host;
  };

  uint16_t GetPort() const {
    return server().port;
  };

 protected:
  struct ServerContext {
    std::string host;
    uint16_t port;
    boost::asio::ip::tcp::endpoint endpoint;

    std::string Description() const;
  };

  // Constructing using a fully initialized ServerContext allows to skip
  // the DNS resolution step.
  explicit ProtocolClient(ServerContext context);

  std::error_code ResolveHostDns();
  // Connect to master and authenticate if needed.
  std::error_code ConnectAndAuth(std::chrono::milliseconds connect_timeout_ms,
                                 ExecutionState* cntx);

  void DefaultErrorHandler(const GenericError& err);

  struct ReadRespRes {
    uint32_t total_read;
    uint32_t left_in_buffer;
  };

  // This function uses parser_ and cmd_args_ in order to consume a single response
  // from the sock_. The output will reside in resp_args_.
  // For error reporting purposes, the parsed command would be in last_resp_ if copy_msg is true.
  // If io_buf is not given, a internal temporary buffer will be used.
  // It is the responsibility of the caller to call buffer->ConsumeInput(rv.left_in_buffer) when it
  // is done with the result of the call; Calling ConsumeInput may invalidate the data in the result
  // if the buffer relocates.
  // TODO these functions contains bugs related to partial reads and parser state management.
  io::Result<ReadRespRes> ReadRespReply(base::IoBuf* buffer = nullptr, bool copy_msg = true);
  io::Result<ReadRespRes> ReadRespReply(uint32_t timeout);

  io::Result<facade::RESPObj> TakeRespReply(uint32_t timeout, base::IoBuf* buffer = nullptr,
                                            bool copy_msg = true);

  std::error_code ReadLine(base::IoBuf* io_buf, std::string_view* line);

  // Check if reps_args contains a simple reply.
  bool CheckRespIsSimpleReply(std::string_view reply) const;

  // Check if resp_args contains a simple error
  bool CheckRespSimpleError(std::string_view error) const;

  // Check resp_args contains the following types at front.
  bool CheckRespFirstTypes(std::initializer_list<facade::RespExpr::Type> types) const;

  // Send command, update last_io_time, return error.
  std::error_code SendCommand(std::string_view command);
  // Send command, read response into resp_args_.
  std::error_code SendCommandAndReadResponse(std::string_view command);

  const ServerContext& server() const {
    return server_context_;
  }

  void ResetParser(facade::RedisParser::Mode mode);

  // TODO can return invalid results if response answer was bigger than provided buffer into
  // ReadRespReply
  auto& LastResponseArgs() {
    return resp_args_;
  }

  auto* Proactor() const {
    return sock_->proactor();
  }

  util::FiberSocketBase* Sock() const {
    return sock_.get();
  }

 private:
  std::error_code Recv(util::FiberSocketBase* input, base::IoBuf* dest);

  void ShutdownSocketImpl(bool should_close);

  ServerContext server_context_;

  std::unique_ptr<facade::RedisParser> parser_;
  facade::RespVec resp_args_;
  base::IoBuf resp_buf_;

  facade::RESPParser resp_parser_;

  std::unique_ptr<util::FiberSocketBase> sock_;
  util::fb2::Mutex sock_mu_;

 protected:
  ExecutionState exec_st_;  // context for tasks in replica.

  std::string last_cmd_;
  std::string last_resp_;

  std::atomic<uint64_t> last_io_time_ = 0;  // in ns, monotonic clock.

#ifdef DFLY_USE_SSL

  void MaybeInitSslCtx();

  SSL_CTX* ssl_ctx_{nullptr};
#else
  void* ssl_ctx_{nullptr};
#endif
};

}  // namespace dfly

/**
 * A convenience macro to use with ProtocolClient instances for protocol input validation.
 */
#define PC_RETURN_ON_BAD_RESPONSE_T(T, x)                                                      \
  do {                                                                                         \
    if (!(x)) {                                                                                \
      LOG(ERROR) << "Bad response to \"" << last_cmd_ << "\": \"" << absl::CEscape(last_resp_) \
                 << "\"";                                                                      \
      return (T)(std::make_error_code(errc::bad_message));                                     \
    }                                                                                          \
  } while (false)

#define PC_RETURN_ON_BAD_RESPONSE(x) PC_RETURN_ON_BAD_RESPONSE_T(std::error_code, x)


================================================
FILE: src/server/rdb_extensions.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

extern "C" {
#include "redis/rdb.h"
}

//  Custom types: Range 30-35 is used by DF RDB types.
constexpr uint8_t RDB_TYPE_JSON = 30;
constexpr uint8_t RDB_TYPE_HASH_WITH_EXPIRY = 31;
constexpr uint8_t RDB_TYPE_SET_WITH_EXPIRY = 32;
constexpr uint8_t RDB_TYPE_SBF = 33;
constexpr uint8_t RDB_TYPE_SBF2 = 34;
constexpr uint8_t RDB_TYPE_CMS = 35;

constexpr bool rdbIsObjectTypeDF(uint8_t type) {
  return __rdbIsObjectType(type) || (type == RDB_TYPE_JSON) ||
         (type == RDB_TYPE_HASH_WITH_EXPIRY) || (type == RDB_TYPE_SET_WITH_EXPIRY) ||
         (type == RDB_TYPE_SBF) || (type == RDB_TYPE_SBF2) || (type == RDB_TYPE_CMS);
}

//  Opcodes: Range 200-240 is used by DF extensions.

// This opcode is sent by the master Dragonfly instance to a replica
// to notify that it finished streaming static data and is ready
// to switch to the stable state replication phase.
constexpr uint8_t RDB_OPCODE_FULLSYNC_END = 200;

constexpr uint8_t RDB_OPCODE_COMPRESSED_ZSTD_BLOB_START = 201;
constexpr uint8_t RDB_OPCODE_COMPRESSED_LZ4_BLOB_START = 202;
constexpr uint8_t RDB_OPCODE_COMPRESSED_BLOB_END = 203;

constexpr uint8_t RDB_OPCODE_JOURNAL_BLOB = 210;

// A full sync will continue to send information in journal blobs until the replica
// sends a `DFLY STARTSTABLE` to the master.
// We use this opcode to synchronize the journal offsets at the end of the full sync,
// so it is always sent at the end of the RDB stream.
constexpr uint8_t RDB_OPCODE_JOURNAL_OFFSET = 211;

constexpr uint8_t RDB_OPCODE_DF_MASK = 220; /* Mask for key properties */

// RDB_OPCODE_DF_MASK define 4byte field with next flags
constexpr uint32_t DF_MASK_FLAG_STICKY = (1 << 0);
constexpr uint32_t DF_MASK_FLAG_MC_FLAGS = (1 << 1);

// Opcode to store HNSW vector index node data for global indices
// Format: [index_name, elements_number, internal_id, global_id, level, zero_level_links_num,
// zero_level_links,
//          higher_level_links_num (only if level > 0), higher_level_links (only if level > 0)]
constexpr uint8_t RDB_OPCODE_VECTOR_INDEX = 222;

// Opcode to store ShardDocIndex key-to-DocId mapping for search indices
// Format: [shard_id, index_name, mapping_count, then for each mapping: key_string, doc_id]
constexpr uint8_t RDB_OPCODE_SHARD_DOC_INDEX = 223;


================================================
FILE: src/server/rdb_load.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/rdb_load.h"

#include "absl/strings/escaping.h"
#include "server/search/global_hnsw_index.h"
#include "server/tiered_storage.h"

extern "C" {
#include "redis/intset.h"
#include "redis/listpack.h"
#include "redis/lzfP.h" /* LZF compression library */
#include "redis/stream.h"
#include "redis/util.h"
#include "redis/ziplist.h"
#include "redis/zmalloc.h"
}
#include <absl/cleanup/cleanup.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>

#include <cstring>

#include "base/endian.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/bloom.h"
#include "core/cms.h"
#include "core/detail/listpack_wrap.h"
#include "core/json/json_object.h"
#include "core/qlist.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "server/cluster/cluster_config.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/hset_family.h"
#include "server/journal/executor.h"
#include "server/journal/serializer.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/rdb_extensions.h"
#include "server/script_mgr.h"
#include "server/search/doc_index.h"
#include "server/search/global_hnsw_index.h"
#include "server/serializer_commons.h"
#include "server/server_state.h"
#include "server/set_family.h"
#include "server/stream_family.h"
#include "server/transaction.h"
#include "server/zset_family.h"
#include "strings/human_readable.h"

ABSL_DECLARE_FLAG(int32_t, list_max_listpack_size);
ABSL_DECLARE_FLAG(int32_t, list_compress_depth);
ABSL_DECLARE_FLAG(uint32_t, dbnum);
ABSL_FLAG(bool, deserialize_hnsw_index, false, "Deserialize HNSW vector index graph structure");
ABSL_FLAG(bool, rdb_load_dry_run, false, "Dry run RDB load without applying changes");
ABSL_FLAG(bool, rdb_ignore_expiry, false, "Ignore Key Expiry when loding from RDB snapshot");

namespace dfly {

using namespace std;
using base::IoBuf;
using nonstd::make_unexpected;
using namespace util;
using absl::GetFlag;
using rdb::errc;
using namespace tiering::literals;

namespace {

int64_t LpGetIntegerIfValid(unsigned char* ele, int* valid) {
  int64_t v = 0;
  *valid = lpGetInteger(ele, &v);
  return v;
}

// Returns 1 if the stream listpack entries structure is valid, 0 otherwise.
int StreamValidateListpackIntegrity(unsigned char* lp, size_t size) {
  int valid_record;
  unsigned char *p, *next;

  if (!lpValidateIntegrity(lp, size, 0, NULL, NULL))
    return 0;

  next = p = lpValidateFirst(lp);
  if (!lpValidateNext(lp, &next, size))
    return 0;
  if (!p)
    return 0;

  LpGetIntegerIfValid(p, &valid_record);
  if (!valid_record)
    return 0;
  p = next;
  if (!lpValidateNext(lp, &next, size))
    return 0;

  LpGetIntegerIfValid(p, &valid_record);
  if (!valid_record)
    return 0;
  p = next;
  if (!lpValidateNext(lp, &next, size))
    return 0;

  LpGetIntegerIfValid(p, &valid_record);
  if (!valid_record)
    return 0;
  p = next;
  if (!lpValidateNext(lp, &next, size))
    return 0;
  return 1;
}

// Maximum length of each LoadTrace segment.
//
// Note kMaxBlobLen must be a multiple of 6 to avoid truncating elements
// containing 2 or 3 items.
constexpr size_t kMaxBlobLen = 4092;

inline auto Unexpected(errc ev) {
  return make_unexpected(RdbError(ev));
}

const error_code kOk;

/* callback for ziplistValidateIntegrity.
 * The ziplist element pointed by 'p' will be converted and stored into listpack. */
int ziplistEntryConvertAndValidate(unsigned char* p, unsigned int head_count, void* userdata) {
  unsigned char* str;
  unsigned int slen;
  long long vll;
  unsigned char** lp = (unsigned char**)userdata;

  if (!ziplistGet(p, &str, &slen, &vll))
    return 0;

  if (str)
    *lp = lpAppend(*lp, (unsigned char*)str, slen);
  else
    *lp = lpAppendInteger(*lp, vll);

  return 1;
}
string ModuleTypeName(uint64_t module_id) {
  static const char ModuleNameSet[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      "0123456789-_";

  char name[10];

  name[9] = '\0';
  char* p = name + 8;
  module_id >>= 10;
  for (int j = 0; j < 9; j++) {
    *p-- = ModuleNameSet[module_id & 63];
    module_id >>= 6;
  }

  return string{name};
}

bool RdbTypeAllowedEmpty(int type) {
  return type == RDB_TYPE_STRING || type == RDB_TYPE_JSON || type == RDB_TYPE_SBF ||
         type == RDB_TYPE_STREAM_LISTPACKS || type == RDB_TYPE_SET_WITH_EXPIRY ||
         type == RDB_TYPE_HASH_WITH_EXPIRY || type == RDB_TYPE_SBF2 || type == RDB_TYPE_CMS;
}

DbSlice& GetCurrentDbSlice() {
  return namespaces->GetDefaultNamespace().GetCurrentDbSlice();
}

}  // namespace

class RdbLoaderBase::OpaqueObjLoader {
 public:
  OpaqueObjLoader(int rdb_type, PrimeValue* pv, LoadConfig config)
      : rdb_type_(rdb_type), pv_(pv), config_(config) {
  }

  void operator()(long long val) {
    pv_->SetInt(val);
  }

  void operator()(const base::PODArray<char>& str);
  void operator()(const LzfString& lzfstr);
  void operator()(const unique_ptr<LoadTrace>& ptr);
  void operator()(const RdbSBF& src);
  void operator()(const RdbCMS& src);

  std::error_code ec() const {
    return ec_;
  }

 private:
  using ScratchBuf = base::PODArray<char>;

  void CreateSet(const LoadTrace* ltrace);
  void CreateHMap(const LoadTrace* ltrace);
  void CreateList(const LoadTrace* ltrace);
  void CreateZSet(const LoadTrace* ltrace);
  void CreateStream(const LoadTrace* ltrace);

  void HandleBlob(string_view blob);

  string_view ToSV(const RdbVariant& obj, ScratchBuf* buf);

  // Returns whether pv_ has the given object type and encoding. If not ec_
  // is set to the error.
  bool EnsureObjEncoding(CompactObjType type, unsigned encoding);

  template <typename F> static void Iterate(const LoadTrace& ltrace, F&& f) {
    for (const auto& blob : ltrace.arr) {
      if (!f(blob)) {
        return;
      }
    }
  }

  std::error_code ec_;
  int rdb_type_;
  ScratchBuf buf1_, buf2_, buf3_;
  PrimeValue* pv_;
  LoadConfig config_;
};

RdbLoaderBase::RdbLoaderBase() : origin_mem_buf_{16_KB} {
  mem_buf_ = &origin_mem_buf_;
}

RdbLoaderBase::~RdbLoaderBase() {
}

void RdbLoaderBase::OpaqueObjLoader::operator()(const base::PODArray<char>& str) {
  string_view sv(str.data(), str.size());
  HandleBlob(sv);
}

void RdbLoaderBase::OpaqueObjLoader::operator()(const LzfString& lzfstr) {
  string tmp(lzfstr.uncompressed_len, '\0');
  if (lzf_decompress(lzfstr.compressed_blob.data(), lzfstr.compressed_blob.size(), tmp.data(),
                     tmp.size()) == 0) {
    LOG(ERROR) << "Invalid LZF compressed string";
    ec_ = RdbError(errc::rdb_file_corrupted);
    return;
  }
  HandleBlob(tmp);
}

void RdbLoaderBase::OpaqueObjLoader::operator()(const unique_ptr<LoadTrace>& ptr) {
  switch (rdb_type_) {
    case RDB_TYPE_SET:
    case RDB_TYPE_SET_WITH_EXPIRY:
      CreateSet(ptr.get());
      break;
    case RDB_TYPE_HASH:
    case RDB_TYPE_HASH_WITH_EXPIRY:
      CreateHMap(ptr.get());
      break;
    case RDB_TYPE_LIST_QUICKLIST:
    case RDB_TYPE_LIST_QUICKLIST_2:
      CreateList(ptr.get());
      break;
    case RDB_TYPE_ZSET:
    case RDB_TYPE_ZSET_2:
      CreateZSet(ptr.get());
      break;
    case RDB_TYPE_STREAM_LISTPACKS:
    case RDB_TYPE_STREAM_LISTPACKS_2:
    case RDB_TYPE_STREAM_LISTPACKS_3:
      CreateStream(ptr.get());
      break;
    default:
      LOG(FATAL) << "Unsupported rdb type " << rdb_type_;
  }
}

void RdbLoaderBase::OpaqueObjLoader::operator()(const RdbSBF& src) {
  SBF* sbf =
      CompactObj::AllocateMR<SBF>(src.grow_factor, src.fp_prob, src.max_capacity, src.prev_size,
                                  src.current_size, CompactObj::memory_resource());
  for (unsigned i = 0; i < src.filters.size(); ++i) {
    sbf->AddFilter(src.filters[i].blob, src.filters[i].hash_cnt);
  }
  pv_->SetSBF(sbf);
}

void RdbLoaderBase::OpaqueObjLoader::operator()(const RdbCMS& src) {
  CMS* cms = CompactObj::AllocateMR<CMS>(src.width, src.depth, CompactObj::memory_resource());
  DCHECK_EQ(src.counters.size(), cms->NumCounters());
  cms->Load(src.total_incr_count, src.counters.data());
  pv_->SetCMS(cms);
}

void RdbLoaderBase::OpaqueObjLoader::CreateSet(const LoadTrace* ltrace) {
  size_t len = ltrace->arr.size();

  bool is_intset = true;
  if (!config_.chunked && rdb_type_ == RDB_TYPE_SET &&
      ltrace->arr.size() <= SetFamily::MaxIntsetEntries()) {
    Iterate(*ltrace, [&](const LoadBlob& blob) {
      if (!holds_alternative<long long>(blob.rdb_var)) {
        is_intset = false;
        return false;
      }
      return true;
    });
  } else {
    /* Use a regular set when there are too many entries, or when the
     * set is being chunked. */
    is_intset = false;
  }

  sds sdsele = nullptr;
  void* inner_obj = nullptr;

  auto cleanup = absl::MakeCleanup([&] {
    if (sdsele)
      sdsfree(sdsele);
    if (inner_obj) {
      if (is_intset) {
        zfree(inner_obj);
      } else {
        CompactObj::DeleteMR<StringSet>(inner_obj);
      }
    }
  });

  if (is_intset) {
    inner_obj = intsetNew();

    long long llval;
    Iterate(*ltrace, [&](const LoadBlob& blob) {
      llval = get<long long>(blob.rdb_var);
      uint8_t success;
      inner_obj = intsetAdd((intset*)inner_obj, llval, &success);
      if (!success) {
        LOG(ERROR) << "Duplicate set members detected";
        ec_ = RdbError(errc::duplicate_key);
        return false;
      }
      return true;
    });
  } else {
    StringSet* set;
    if (config_.append) {
      // Note we always use StringSet when the object is being chunked.
      if (!EnsureObjEncoding(OBJ_SET, kEncodingStrMap2)) {
        return;
      }
      set = static_cast<StringSet*>(pv_->RObjPtr());
    } else {
      set = CompactObj::AllocateMR<StringSet>();
      set->set_time(MemberTimeSeconds(GetCurrentTimeMs()));
      inner_obj = set;

      // Expand the set up front to avoid rehashing.
      set->Reserve((config_.reserve > len) ? config_.reserve : len);
    }

    size_t increment = 1;
    if (rdb_type_ == RDB_TYPE_SET_WITH_EXPIRY) {
      increment = 2;
    }

    bool values_expired = false;

    for (size_t i = 0; i < ltrace->arr.size(); i += increment) {
      string_view element = ToSV(ltrace->arr[i].rdb_var, &buf1_);

      uint32_t ttl_sec = UINT32_MAX;
      if (increment == 2) {
        int64_t ttl_time = -1;
        string_view ttl_str = ToSV(ltrace->arr[i + 1].rdb_var, &buf2_);
        if (!absl::SimpleAtoi(ttl_str, &ttl_time)) {
          LOG(ERROR) << "Can't parse set TTL " << ttl_str;
          ec_ = RdbError(errc::rdb_file_corrupted);
          return;
        }

        if (ttl_time != -1) {
          if (ttl_time <= set->time_now()) {
            values_expired = true;
            continue;
          }

          ttl_sec = ttl_time - set->time_now();
        }
      }
      if (!set->Add(element, ttl_sec)) {
        LOG(ERROR) << "Duplicate set members detected " << absl::CHexEscape(element) << " with TTL "
                   << ttl_sec << " " << rdb_type_ << " " << set->ExpirationUsed() << " "
                   << config_.append;
        ec_ = RdbError(errc::duplicate_key);
        return;
      }
    }
    if (set->Empty() && values_expired) {
      ec_ = RdbError(errc::value_expired);
    }
  }

  if (ec_)
    return;

  if (!config_.append) {
    pv_->InitRobj(OBJ_SET, is_intset ? kEncodingIntSet : kEncodingStrMap2, inner_obj);
  }
  std::move(cleanup).Cancel();
}

void RdbLoaderBase::OpaqueObjLoader::CreateHMap(const LoadTrace* ltrace) {
  size_t increment = 2;
  if (rdb_type_ == RDB_TYPE_HASH_WITH_EXPIRY)
    increment = 3;

  size_t len = ltrace->arr.size() / increment;

  /* Too many entries? Use a hash table right from the start. */
  bool keep_lp = !config_.chunked && (len <= 64) && (rdb_type_ != RDB_TYPE_HASH_WITH_EXPIRY);

  size_t lp_size = 0;
  if (keep_lp) {
    Iterate(*ltrace, [&](const LoadBlob& blob) {
      size_t str_len = StrLen(blob.rdb_var);
      lp_size += str_len;

      if (str_len > server.max_map_field_len) {
        keep_lp = false;
        return false;
      }
      return true;
    });
  }

  if (keep_lp) {
    uint8_t* lp = lpNew(lp_size);

    CHECK(ltrace->arr.size() % 2 == 0);
    for (size_t i = 0; i < ltrace->arr.size(); i += 2) {
      /* Add pair to listpack */
      string_view sv = ToSV(ltrace->arr[i].rdb_var, &buf1_);
      lp = lpAppend(lp, reinterpret_cast<const uint8_t*>(sv.data()), sv.size());

      sv = ToSV(ltrace->arr[i + 1].rdb_var, &buf1_);
      lp = lpAppend(lp, reinterpret_cast<const uint8_t*>(sv.data()), sv.size());
    }

    if (ec_) {
      lpFree(lp);
      return;
    }

    lp = lpShrinkToFit(lp);
    pv_->InitRobj(OBJ_HASH, kEncodingListPack, lp);
  } else {
    StringMap* string_map;
    if (config_.append) {
      // Note we always use StringMap when the object is being streamed.
      if (!EnsureObjEncoding(OBJ_HASH, kEncodingStrMap2)) {
        return;
      }

      string_map = static_cast<StringMap*>(pv_->RObjPtr());
    } else {
      string_map = CompactObj::AllocateMR<StringMap>();
      string_map->set_time(MemberTimeSeconds(GetCurrentTimeMs()));

      // Expand the map up front to avoid rehashing.
      string_map->Reserve((config_.reserve > len) ? config_.reserve : len);
    }

    auto cleanup = absl::MakeCleanup([&] {
      if (!config_.append) {
        CompactObj::DeleteMR<StringMap>(string_map);
      }
    });
    bool values_expired = false;
    for (size_t i = 0; i < ltrace->arr.size(); i += increment) {
      string_view key = ToSV(ltrace->arr[i].rdb_var, &buf1_);
      string_view val = ToSV(ltrace->arr[i + 1].rdb_var, &buf2_);

      if (ec_)
        return;

      uint32_t ttl_sec = UINT32_MAX;
      if (increment == 3) {
        int64_t ttl_time = -1;
        string_view ttl_str = ToSV(ltrace->arr[i + 2].rdb_var, &buf3_);
        if (!absl::SimpleAtoi(ttl_str, &ttl_time)) {
          LOG(ERROR) << "Can't parse hashmap TTL for " << key << ", ttl='" << ttl_str
                     << "', val=" << val;
          ec_ = RdbError(errc::rdb_file_corrupted);
          return;
        }

        if (ttl_time != -1) {
          if (ttl_time <= string_map->time_now()) {
            values_expired = true;
            continue;
          }

          ttl_sec = ttl_time - string_map->time_now();
        }
      }

      if (!string_map->AddOrSkip(key, val, ttl_sec)) {
        LOG(ERROR) << "Duplicate hash fields detected for field " << key;
        ec_ = RdbError(errc::rdb_file_corrupted);
        return;
      }
    }
    if (string_map->Empty() && values_expired) {
      ec_ = RdbError(errc::value_expired);
      return;
    }
    if (!config_.append) {
      pv_->InitRobj(OBJ_HASH, kEncodingStrMap2, string_map);
    }
    std::move(cleanup).Cancel();
  }
}

void RdbLoaderBase::OpaqueObjLoader::CreateList(const LoadTrace* ltrace) {
  QList* qlv2 = nullptr;
  if (config_.append) {
    if (pv_->ObjType() != OBJ_LIST) {
      ec_ = RdbError(errc::invalid_rdb_type);
      return;
    }
    DCHECK_EQ(pv_->Encoding(), kEncodingQL2);
    qlv2 = static_cast<QList*>(pv_->RObjPtr());
  } else {
    qlv2 = CompactObj::AllocateMR<QList>(GetFlag(FLAGS_list_max_listpack_size),
                                         GetFlag(FLAGS_list_compress_depth));
  }

  auto cleanup = absl::Cleanup([&] {
    if (!config_.append) {
      CompactObj::DeleteMR<QList>(qlv2);
    }
  });

  Iterate(*ltrace, [&](const LoadBlob& blob) {
    unsigned container = blob.encoding;
    string_view sv = ToSV(blob.rdb_var, &buf1_);

    if (ec_)
      return false;

    uint8_t* lp = nullptr;
    if (container == QUICKLIST_NODE_CONTAINER_PLAIN) {
      lp = (uint8_t*)zmalloc(sv.size());
      ::memcpy(lp, (uint8_t*)sv.data(), sv.size());
      qlv2->AppendPlain(lp, sv.size());

      return true;
    }

    if (rdb_type_ == RDB_TYPE_LIST_QUICKLIST_2) {
      uint8_t* src = (uint8_t*)sv.data();
      if (!lpValidateIntegrity(src, sv.size(), 0, nullptr, nullptr)) {
        LOG(ERROR) << "Listpack integrity check failed.";
        ec_ = RdbError(errc::rdb_file_corrupted);
        return false;
      }

      if (lpLength(src) == 0) {
        return true;
      }

      lp = (uint8_t*)zmalloc(sv.size());
      ::memcpy(lp, src, sv.size());
    } else {
      lp = lpNew(sv.size());
      if (!ziplistValidateIntegrity((uint8_t*)sv.data(), sv.size(), 1,
                                    ziplistEntryConvertAndValidate, &lp)) {
        LOG(ERROR) << "Ziplist integrity check failed: " << sv.size();
        zfree(lp);
        ec_ = RdbError(errc::rdb_file_corrupted);
        return false;
      }

      /* Silently skip empty ziplists, if we'll end up with empty quicklist we'll fail later. */
      if (lpLength(lp) == 0) {
        zfree(lp);
        return true;
      }

      lp = lpShrinkToFit(lp);
    }

    qlv2->AppendListpack(lp);
    return true;
  });

  if (ec_)
    return;
  if (qlv2 && qlv2->Size() == 0) {
    ec_ = RdbError(errc::empty_key);
    return;
  }

  std::move(cleanup).Cancel();

  if (!config_.append) {
    // Try to convert to listpack if it's a single-node quicklist
    if (uint8_t* lp = qlv2->TryExtractListpack()) {
      CompactObj::DeleteMR<QList>(qlv2);
      pv_->InitRobj(OBJ_LIST, kEncodingListPack, lp);
    } else {
      pv_->InitRobj(OBJ_LIST, kEncodingQL2, qlv2);
    }
  }
}

void RdbLoaderBase::OpaqueObjLoader::CreateZSet(const LoadTrace* ltrace) {
  size_t zsetlen = ltrace->arr.size();

  unsigned encoding = OBJ_ENCODING_SKIPLIST;
  detail::SortedMap* zs;
  if (config_.append) {
    // Note we always use SortedMap when the object is being chunked.
    if (!EnsureObjEncoding(OBJ_ZSET, OBJ_ENCODING_SKIPLIST)) {
      return;
    }

    zs = static_cast<detail::SortedMap*>(pv_->RObjPtr());
  } else {
    zs = CompactObj::AllocateMR<detail::SortedMap>();

    size_t reserve = (config_.reserve > zsetlen) ? config_.reserve : zsetlen;
    if (reserve > 2 && !zs->Reserve(reserve)) {
      LOG(ERROR) << "OOM in dictTryExpand " << zsetlen;
      ec_ = RdbError(errc::out_of_memory);
      return;
    }
  }

  auto cleanup = absl::MakeCleanup([&] {
    if (!config_.append) {
      CompactObj::DeleteMR<detail::SortedMap>(zs);
    }
  });

  size_t maxelelen = 0, totelelen = 0;

  Iterate(*ltrace, [&](const LoadBlob& blob) {
    string_view sv = ToSV(blob.rdb_var, &buf1_);

    double score = blob.score;

    /* Don't care about integer-encoded strings. */
    if (sv.size() > maxelelen)
      maxelelen = sv.size();
    totelelen += sv.size();

    if (!zs->InsertNew(score, sv)) {
      LOG(ERROR) << "Duplicate zset fields detected";
      ec_ = RdbError(errc::rdb_file_corrupted);
      return false;
    }

    return true;
  });

  if (ec_)
    return;

  void* inner = zs;
  if (!config_.chunked && zs->Size() <= ZSET_MAX_LISTPACK_ENTRIES &&
      maxelelen <= ZSET_MAX_LISTPACK_VALUE && lpSafeToAdd(NULL, totelelen)) {
    encoding = OBJ_ENCODING_LISTPACK;
    inner = zs->ToListPack();
    CompactObj::DeleteMR<detail::SortedMap>(zs);
  }

  std::move(cleanup).Cancel();

  if (!config_.append) {
    pv_->InitRobj(OBJ_ZSET, encoding, inner);
  }
}

void RdbLoaderBase::OpaqueObjLoader::CreateStream(const LoadTrace* ltrace) {
  stream* s;
  StreamMemTracker mem_tracker;
  if (config_.append) {
    if (!EnsureObjEncoding(OBJ_STREAM, OBJ_ENCODING_STREAM)) {
      return;
    }

    s = static_cast<stream*>(pv_->RObjPtr());
  } else {
    s = streamNew();
  }

  auto cleanup = absl::Cleanup([&] {
    if (!config_.append) {
      freeStream(s);
    }
  });

  for (size_t i = 0; i < ltrace->arr.size(); i += 2) {
    string_view nodekey = ToSV(ltrace->arr[i].rdb_var, &buf1_);
    string_view data = ToSV(ltrace->arr[i + 1].rdb_var, &buf2_);

    uint8_t* lp = (uint8_t*)data.data();

    if (!StreamValidateListpackIntegrity(lp, data.size())) {
      LOG(ERROR) << "Stream listpack integrity check failed.";
      ec_ = RdbError(errc::rdb_file_corrupted);
      return;
    }
    CHECK(lpFirst(lp) != NULL);
    uint8_t* copy_lp = (uint8_t*)zmalloc(data.size());
    ::memcpy(copy_lp, lp, data.size());
    /* Insert the key in the radix tree. */
    int retval =
        raxTryInsert(s->rax, (unsigned char*)nodekey.data(), nodekey.size(), copy_lp, NULL);
    if (!retval) {
      zfree(copy_lp);
      LOG(ERROR) << "Listpack re-added with existing key";
      ec_ = RdbError(errc::rdb_file_corrupted);
      return;
    }
  }

  // We only load the stream metadata and consumer groups (stream_trace) on
  // the final read (when reading the stream in increments). Therefore if
  // stream_trace is null add the partial stream, then stream_trace will be
  // loaded later.
  if (!ltrace->stream_trace) {
    if (!config_.append) {
      pv_->InitRobj(OBJ_STREAM, OBJ_ENCODING_STREAM, s);
    }
    std::move(cleanup).Cancel();
    return;
  }

  s->length = ltrace->stream_trace->stream_len;
  CopyStreamId(ltrace->stream_trace->last_id, &s->last_id);
  CopyStreamId(ltrace->stream_trace->first_id, &s->first_id);
  CopyStreamId(ltrace->stream_trace->max_deleted_entry_id, &s->max_deleted_entry_id);
  s->entries_added = ltrace->stream_trace->entries_added;

  if (rdb_type_ == RDB_TYPE_STREAM_LISTPACKS) {
    /* Since the rax is already loaded, we can find the first entry's
     * ID. */
    streamGetEdgeID(s, 1, 1, &s->first_id);
  }

  for (const auto& cg : ltrace->stream_trace->cgroup) {
    streamCG* cgroup = nullptr;
    {
      string_view cgname = ToSV(cg.name, &buf1_);
      streamID cg_id;
      cg_id.ms = cg.ms;
      cg_id.seq = cg.seq;

      uint64_t entries_read = cg.entries_read;
      if (rdb_type_ == RDB_TYPE_STREAM_LISTPACKS) {
        entries_read = streamEstimateDistanceFromFirstEverEntry(s, &cg_id);
      }

      cgroup = streamCreateCG(s, cgname.data(), cgname.size(), &cg_id, entries_read);
      if (cgroup == NULL) {
        LOG(ERROR) << "Duplicated consumer group name " << cgname;
        ec_ = RdbError(errc::duplicate_key);
        return;
      }
    }
    for (const auto& pel : cg.pel_arr) {
      streamNACK* nack = reinterpret_cast<streamNACK*>(zmalloc(sizeof(*nack)));
      nack->delivery_time = pel.delivery_time;
      nack->delivery_count = pel.delivery_count;
      nack->consumer = nullptr;

      if (!raxTryInsert(cgroup->pel, const_cast<uint8_t*>(pel.rawid.data()), pel.rawid.size(), nack,
                        NULL)) {
        LOG(ERROR) << "Duplicated global PEL entry loading stream consumer group";
        ec_ = RdbError(errc::duplicate_key);
        streamFreeNACK(nack);
        return;
      }
    }

    for (const auto& cons : cg.cons_arr) {
      streamConsumer* consumer = StreamCreateConsumer(
          cgroup, ToSV(cons.name, &buf1_), cons.seen_time, SCC_NO_NOTIFY | SCC_NO_DIRTIFY);
      if (!consumer) {
        LOG(ERROR) << "Duplicate stream consumer detected.";
        ec_ = RdbError(errc::duplicate_key);
        return;
      }

      consumer->active_time = cons.active_time;
      /* Create the PEL (pending entries list) about entries owned by this specific
       * consumer. */
      for (const auto& rawid : cons.nack_arr) {
        uint8_t* ptr = const_cast<uint8_t*>(rawid.data());
        streamNACK* nack = nullptr;
        int fres = raxFind(cgroup->pel, ptr, rawid.size(), (void**)&nack);
        if (fres == 0) {
          LOG(ERROR) << "Consumer entry not found in group global PEL";
          ec_ = RdbError(errc::rdb_file_corrupted);
          return;
        }

        /* Set the NACK consumer, that was left to NULL when
         * loading the global PEL. Then set the same shared
         * NACK structure also in the consumer-specific PEL. */
        nack->consumer = consumer;
        if (!raxTryInsert(consumer->pel, ptr, rawid.size(), nack, NULL)) {
          LOG(ERROR) << "Duplicated consumer PEL entry loading a stream consumer group";
          streamFreeNACK(nack);
          ec_ = RdbError(errc::duplicate_key);
          return;
        }
      }
    }
  }

  std::move(cleanup).Cancel();
  if (!config_.append) {
    pv_->InitRobj(OBJ_STREAM, OBJ_ENCODING_STREAM, s);
  }
  mem_tracker.UpdateStreamSize(*pv_);
}

void RdbLoaderBase::OpaqueObjLoader::HandleBlob(string_view blob) {
  auto handle_load_result = [&](LoadBlobResult load_result) {
    switch (load_result) {
      case LoadBlobResult::kCorrupted:
        LOG(ERROR) << "Corrupted blob detected with size " << blob.size() << " for rdb type "
                   << rdb_type_;
        ec_ = RdbError(errc::rdb_file_corrupted);
        break;
      case LoadBlobResult::kOutOfMemory:
        LOG(ERROR) << "OOM in LoadBlob " << blob.size();
        ec_ = RdbError(errc::out_of_memory);
        break;
      case LoadBlobResult::kEmpty:
        ec_ = RdbError(errc::empty_key);
        break;
      default:
        break;
    }
  };

  if (rdb_type_ == RDB_TYPE_STRING) {
    if (config_.append) {
      pv_->AppendString(blob);
    } else if (config_.reserve) {
      pv_->ReserveString(config_.reserve);
      pv_->AppendString(blob);
    } else {
      pv_->SetString(blob);
    }
    return;
  }

  if (rdb_type_ == RDB_TYPE_SET_INTSET || rdb_type_ == RDB_TYPE_SET_LISTPACK) {
    LoadBlobResult load_result = rdb_type_ == RDB_TYPE_SET_INTSET
                                     ? SetFamily::LoadIntSetBlob(blob, pv_)
                                     : SetFamily::LoadLPSetBlob(blob, pv_);
    handle_load_result(load_result);
    return;
  }

  if (rdb_type_ == RDB_TYPE_HASH_ZIPLIST || rdb_type_ == RDB_TYPE_HASH_LISTPACK) {
    LoadBlobResult load_result = rdb_type_ == RDB_TYPE_HASH_ZIPLIST
                                     ? HSetFamily::LoadZiplistBlob(blob, pv_)
                                     : HSetFamily::LoadListpackBlob(blob, pv_);
    handle_load_result(load_result);
    return;
  }

  if (rdb_type_ == RDB_TYPE_ZSET_ZIPLIST || rdb_type_ == RDB_TYPE_ZSET_LISTPACK) {
    LoadBlobResult load_result = rdb_type_ == RDB_TYPE_ZSET_ZIPLIST
                                     ? ZSetFamily::LoadZiplistBlob(blob, pv_)
                                     : ZSetFamily::LoadListpackBlob(blob, pv_);
    handle_load_result(load_result);
    return;
  } else if (rdb_type_ == RDB_TYPE_JSON) {
    size_t start_size = static_cast<MiMemoryResource*>(CompactObj::memory_resource())->used();
    {
      if (auto json = ParseJsonUsingShardHeap(blob)) {
        pv_->SetJson(std::move(*json));
      } else {
        LOG(INFO) << "Invalid JSON string during rdb load of JSON object: " << blob;
        ec_ = RdbError(errc::bad_json_string);
        return;
      }
    }
    size_t end_size = static_cast<MiMemoryResource*>(CompactObj::memory_resource())->used();
    DCHECK(end_size > start_size);
    pv_->SetJsonSize(end_size - start_size);
  } else {
    LOG(FATAL) << "Unsupported rdb type " << rdb_type_;
  }
}

string_view RdbLoaderBase::OpaqueObjLoader::ToSV(const RdbVariant& obj, ScratchBuf* buf) {
  if (holds_alternative<long long>(obj)) {
    buf->resize(absl::numbers_internal::kFastToBufferSize);
    auto val = get<long long>(obj);
    char* next = absl::numbers_internal::FastIntToBuffer(val, buf->data());
    return string_view{buf->data(), size_t(next - buf->data())};
  }

  const base::PODArray<char>* ch_arr = get_if<base::PODArray<char>>(&obj);
  if (ch_arr) {
    // pass non-null pointer to avoid UB with lp API.
    return ch_arr->empty() ? ""sv : string_view{ch_arr->data(), ch_arr->size()};
  }

  const LzfString* lzf = get_if<LzfString>(&obj);
  if (lzf) {
    buf->resize(lzf->uncompressed_len);
    if (lzf_decompress(lzf->compressed_blob.data(), lzf->compressed_blob.size(), buf->data(),
                       lzf->uncompressed_len) == 0) {
      LOG(ERROR) << "Invalid LZF compressed string";
      ec_ = RdbError(errc::rdb_file_corrupted);
      return {buf->data(), 0};  // important to return non-null pointer to avoid UB with lp API.
    }
    return {buf->data(), buf->size()};
  }

  LOG(FATAL) << "Unexpected variant";
  return {};
}

bool RdbLoaderBase::OpaqueObjLoader::EnsureObjEncoding(CompactObjType type, unsigned encoding) {
  if (pv_->ObjType() != type) {
    LOG(DFATAL) << "Invalid RDB type " << pv_->ObjType() << "; expected " << type;
    ec_ = RdbError(errc::invalid_rdb_type);
    return false;
  }
  if (pv_->Encoding() != encoding) {
    LOG(DFATAL) << "Invalid encoding " << pv_->Encoding() << "; expected " << encoding;
    ec_ = RdbError(errc::invalid_encoding);
    return false;
  }

  return true;
}

std::error_code RdbLoaderBase::FetchBuf(size_t size, void* dest) {
  if (size == 0)
    return kOk;

  uint8_t* next = (uint8_t*)dest;
  size_t bytes_read;

  size_t to_copy = std::min(mem_buf_->InputLen(), size);
  DVLOG(3) << "Copying " << to_copy << " bytes";

  ::memcpy(next, mem_buf_->InputBuffer().data(), to_copy);
  mem_buf_->ConsumeInput(to_copy);
  size -= to_copy;
  if (size == 0)
    return kOk;

  next += to_copy;

  if (size + bytes_read_ > source_limit_) {
    LOG(ERROR) << "Out of bound read " << size + bytes_read_ << " vs " << source_limit_;

    return RdbError(errc::rdb_file_corrupted);
  }

  if (size > 512) {  // Worth reading directly into next.
    io::MutableBytes mb{next, size};

    SET_OR_RETURN(src_->Read(mb), bytes_read);
    if (bytes_read < size)
      return RdbError(errc::rdb_file_corrupted);

    bytes_read_ += bytes_read;
    DCHECK_LE(bytes_read_, source_limit_);

    return kOk;
  }

  io::MutableBytes mb = mem_buf_->AppendBuffer();

  // Must be because mem_buf_ is be empty.
  DCHECK_GT(mb.size(), size);

  if (bytes_read_ + mb.size() > source_limit_) {
    mb = mb.subspan(0, source_limit_ - bytes_read_);
  }

  SET_OR_RETURN(src_->ReadAtLeast(mb, size), bytes_read);

  if (bytes_read < size)
    return RdbError(errc::rdb_file_corrupted);
  bytes_read_ += bytes_read;

  DCHECK_LE(bytes_read_, source_limit_);

  mem_buf_->CommitWrite(bytes_read);
  ::memcpy(next, mem_buf_->InputBuffer().data(), size);
  mem_buf_->ConsumeInput(size);

  return kOk;
}

size_t RdbLoaderBase::StrLen(const RdbVariant& tset) {
  const base::PODArray<char>* arr = get_if<base::PODArray<char>>(&tset);
  if (arr)
    return arr->size();

  if (holds_alternative<long long>(tset)) {
    auto val = get<long long>(tset);
    char buf[32];
    char* next = absl::numbers_internal::FastIntToBuffer(val, buf);
    return (next - buf);
  }

  const LzfString* lzf = get_if<LzfString>(&tset);
  if (lzf)
    return lzf->uncompressed_len;

  LOG(DFATAL) << "should not reach";
  return 0;
}

auto RdbLoaderBase::FetchGenericString() -> io::Result<string> {
  bool isencoded;
  size_t len;

  SET_OR_UNEXPECT(LoadLen(&isencoded), len);

  if (isencoded) {
    switch (len) {
      case RDB_ENC_INT8:
      case RDB_ENC_INT16:
      case RDB_ENC_INT32:
        return FetchIntegerObject(len);
      case RDB_ENC_LZF:
        return FetchLzfStringObject();
      default:
        LOG(ERROR) << "Unknown RDB string encoding len " << len;
        return Unexpected(errc::rdb_file_corrupted);
    }
  }

  string res;

  if (len > 0) {
    res.resize(len);
    error_code ec = FetchBuf(len, res.data());
    if (ec) {
      return make_unexpected(ec);
    }
  }

  return res;
}

auto RdbLoaderBase::FetchLzfStringObject() -> io::Result<string> {
  bool zerocopy_decompress = true;

  const uint8_t* cbuf = NULL;
  uint64_t clen, len;

  SET_OR_UNEXPECT(LoadLen(NULL), clen);
  SET_OR_UNEXPECT(LoadLen(NULL), len);

  // TODO serialization and deserialization for data > 512 MB should be done via chunks
  if (len <= clen || clen == 0) {
    LOG(ERROR) << "Bad compressed string";
    return Unexpected(rdb::rdb_file_corrupted);
  }

  if (mem_buf_->InputLen() >= clen) {
    cbuf = mem_buf_->InputBuffer().data();
  } else {
    compr_buf_.resize(clen);
    zerocopy_decompress = false;

    /* Load the compressed representation and uncompress it to target. */
    error_code ec = FetchBuf(clen, compr_buf_.data());
    if (ec) {
      return make_unexpected(ec);
    }
    cbuf = compr_buf_.data();
  }

  string res(len, 0);

  if (lzf_decompress(cbuf, clen, res.data(), len) == 0) {
    LOG(ERROR) << "Invalid LZF compressed string";
    return Unexpected(errc::rdb_file_corrupted);
  }

  // FetchBuf consumes the input but if we have not went through that path
  // we need to consume now.
  if (zerocopy_decompress)
    mem_buf_->ConsumeInput(clen);

  return res;
}

auto RdbLoaderBase::FetchIntegerObject(int enctype) -> io::Result<string> {
  io::Result<long long> val = ReadIntObj(enctype);

  if (!val.has_value()) {
    return val.get_unexpected();
  }

  char buf[32];
  absl::numbers_internal::FastIntToBuffer(*val, buf);

  return string(buf);
}

io::Result<double> RdbLoaderBase::FetchBinaryDouble() {
  union {
    uint64_t val;
    double d;
  } u;

  static_assert(sizeof(u) == sizeof(uint64_t));
  auto ec = EnsureRead(8);
  if (ec)
    return make_unexpected(ec);

  uint8_t buf[8];
  mem_buf_->ReadAndConsume(8, buf);
  u.val = base::LE::LoadT<uint64_t>(buf);
  return u.d;
}

io::Result<double> RdbLoaderBase::FetchDouble() {
  uint8_t len;

  SET_OR_UNEXPECT(FetchInt<uint8_t>(), len);
  constexpr double kInf = std::numeric_limits<double>::infinity();
  switch (len) {
    case 255:
      return -kInf;
    case 254:
      return kInf;
    case 253:
      return std::numeric_limits<double>::quiet_NaN();
    default:;
  }
  char buf[256];
  error_code ec = FetchBuf(len, buf);
  if (ec)
    return make_unexpected(ec);
  buf[len] = '\0';
  double val;
  if (sscanf(buf, "%lg", &val) != 1)
    return Unexpected(errc::rdb_file_corrupted);
  return val;
}

auto RdbLoaderBase::ReadKey() -> io::Result<string> {
  return FetchGenericString();
}

error_code RdbLoaderBase::ReadObj(int rdbtype, OpaqueObj* dest) {
  io::Result<OpaqueObj> iores;

  switch (rdbtype) {
    case RDB_TYPE_SET:
    case RDB_TYPE_SET_WITH_EXPIRY:
      iores = ReadSet(rdbtype);
      break;
    case RDB_TYPE_SET_INTSET:
      iores = ReadIntSet();
      break;
    case RDB_TYPE_HASH_ZIPLIST:
    case RDB_TYPE_HASH_LISTPACK:
    case RDB_TYPE_ZSET_LISTPACK:
    case RDB_TYPE_ZSET_ZIPLIST:
    case RDB_TYPE_STRING:
    case RDB_TYPE_JSON:
      iores = ReadGeneric(rdbtype);
      break;
    case RDB_TYPE_HASH:
    case RDB_TYPE_HASH_WITH_EXPIRY:
      iores = ReadHMap(rdbtype);
      break;
    case RDB_TYPE_ZSET:
    case RDB_TYPE_ZSET_2:
      iores = ReadZSet(rdbtype);
      break;
    case RDB_TYPE_LIST_QUICKLIST:
    case RDB_TYPE_LIST_QUICKLIST_2:
      iores = ReadListQuicklist(rdbtype);
      break;
    case RDB_TYPE_STREAM_LISTPACKS:
    case RDB_TYPE_STREAM_LISTPACKS_2:
    case RDB_TYPE_STREAM_LISTPACKS_3:
      iores = ReadStreams(rdbtype);
      break;
    case RDB_TYPE_SET_LISTPACK:
      // We need to deal with protocol versions 9 and older because in these
      // RDB_TYPE_JSON == 20. On newer versions > 9 we bumped up RDB_TYPE_JSON to 30
      // because it overlapped with the new type RDB_TYPE_SET_LISTPACK
      if (rdb_version_ < 10) {
        // consider it RDB_TYPE_JSON_OLD (20)
        iores = ReadGeneric(RDB_TYPE_JSON);
      } else {
        iores = ReadGeneric(rdbtype);
      }
      break;
    case RDB_TYPE_MODULE_2:
      iores = ReadRedisJson();
      break;
    case RDB_TYPE_SBF:
      iores = ReadSBF();
      break;
    case RDB_TYPE_SBF2:
      iores = ReadSBF2();
      break;
    case RDB_TYPE_CMS:
      iores = ReadCMS();
      break;
    default:
      LOG(ERROR) << "Unsupported rdb type " << rdbtype;

      return RdbError(errc::invalid_encoding);
  }

  if (!iores)
    return iores.error();
  *dest = std::move(*iores);
  return error_code{};
}

static const size_t kMaxStringSize = 200_KB;

error_code RdbLoaderBase::ReadStringObj(RdbVariant* dest, bool big_string_split) {
  bool isencoded = false;
  size_t len;
  SET_OR_RETURN(LoadLen(&isencoded), len);

  if (isencoded) {
    switch (len) {
      case RDB_ENC_INT8:
      case RDB_ENC_INT16:
      case RDB_ENC_INT32: {
        io::Result<long long> io_int = ReadIntObj(len);
        if (!io_int)
          return io_int.error();
        dest->emplace<long long>(*io_int);
        return error_code{};
      }
      case RDB_ENC_LZF: {
        io::Result<LzfString> lzf = ReadLzf();
        if (!lzf)
          return lzf.error();

        dest->emplace<LzfString>(std::move(lzf.value()));
        return error_code{};
      }
      default:
        LOG(ERROR) << "Unknown RDB string encoding " << len;
        return RdbError(errc::rdb_file_corrupted);
    }
  }

  if (big_string_split && len > kMaxStringSize) {
    pending_read_.remaining = len - kMaxStringSize;
    pending_read_.reserve = len;
    len = kMaxStringSize;
  }

  auto& blob = dest->emplace<base::PODArray<char>>();
  blob.resize(len);
  return FetchBuf(len, blob.data());
}

error_code RdbLoaderBase::ReadRemainingString(RdbVariant* dest) {
  size_t read_len = std::min(pending_read_.remaining, kMaxStringSize);
  pending_read_.remaining = pending_read_.remaining - read_len;

  auto& blob = dest->emplace<base::PODArray<char>>();
  blob.resize(read_len);
  return FetchBuf(read_len, blob.data());
}

io::Result<long long> RdbLoaderBase::ReadIntObj(int enctype) {
  long long val;

  if (enctype == RDB_ENC_INT8) {
    SET_OR_UNEXPECT(FetchInt<int8_t>(), val);
  } else if (enctype == RDB_ENC_INT16) {
    SET_OR_UNEXPECT(FetchInt<int16_t>(), val);
  } else if (enctype == RDB_ENC_INT32) {
    SET_OR_UNEXPECT(FetchInt<int32_t>(), val);
  } else {
    return Unexpected(errc::invalid_encoding);
  }
  return val;
}

auto RdbLoaderBase::ReadLzf() -> io::Result<LzfString> {
  uint64_t clen;
  LzfString res;

  SET_OR_UNEXPECT(LoadLen(NULL), clen);
  SET_OR_UNEXPECT(LoadLen(NULL), res.uncompressed_len);

  if (res.uncompressed_len > 1ULL << 29) {
    LOG(ERROR) << "Uncompressed length is too big " << res.uncompressed_len;
    return Unexpected(errc::rdb_file_corrupted);
  }

  res.compressed_blob.resize(clen);
  /* Load the compressed representation and uncompress it to target. */
  error_code ec = FetchBuf(clen, res.compressed_blob.data());
  if (ec) {
    return make_unexpected(ec);
  }

  return res;
}

auto RdbLoaderBase::ReadSet(int rdbtype) -> io::Result<OpaqueObj> {
  size_t len;
  if (pending_read_.remaining > 0) {
    len = pending_read_.remaining;
  } else {
    SET_OR_UNEXPECT(LoadLen(NULL), len);
    if (rdbtype == RDB_TYPE_SET_WITH_EXPIRY) {
      len *= 2;
    }
    pending_read_.reserve = len;
  }

  // Limit each read to kMaxBlobLen elements.
  unique_ptr<LoadTrace> load_trace(new LoadTrace);
  size_t n = std::min(len, kMaxBlobLen);
  load_trace->arr.resize(n);
  for (size_t i = 0; i < n; i++) {
    error_code ec = ReadStringObj(&load_trace->arr[i].rdb_var);
    if (ec) {
      return make_unexpected(ec);
    }
  }

  // If there are still unread elements, cache the number of remaining
  // elements, or clear if the full object has been read.
  if (len > n) {
    pending_read_.remaining = len - n;
  } else if (pending_read_.remaining > 0) {
    pending_read_.remaining = 0;
  }

  return OpaqueObj{std::move(load_trace), rdbtype};
}

auto RdbLoaderBase::ReadIntSet() -> io::Result<OpaqueObj> {
  RdbVariant obj;
  error_code ec = ReadStringObj(&obj);
  if (ec) {
    return make_unexpected(ec);
  }

  const LzfString* lzf = get_if<LzfString>(&obj);
  const base::PODArray<char>* arr = get_if<base::PODArray<char>>(&obj);

  if (lzf) {
    if (lzf->uncompressed_len == 0 || lzf->compressed_blob.empty())
      return Unexpected(errc::rdb_file_corrupted);
  } else if (arr) {
    if (arr->empty())
      return Unexpected(errc::rdb_file_corrupted);
  } else {
    return Unexpected(errc::rdb_file_corrupted);
  }

  return OpaqueObj{std::move(obj), RDB_TYPE_SET_INTSET};
}

auto RdbLoaderBase::ReadGeneric(int rdbtype) -> io::Result<OpaqueObj> {
  bool is_string_type = RDB_TYPE_STRING == rdbtype;
  RdbVariant str_obj;
  error_code ec;
  if (pending_read_.remaining) {
    ec = ReadRemainingString(&str_obj);
  } else {
    ec = ReadStringObj(&str_obj, is_string_type);
  }
  if (ec)
    return make_unexpected(ec);

  if (!is_string_type && StrLen(str_obj) == 0) {
    return Unexpected(errc::rdb_file_corrupted);
  }

  return OpaqueObj{std::move(str_obj), rdbtype};
}

auto RdbLoaderBase::ReadHMap(int rdbtype) -> io::Result<OpaqueObj> {
  size_t len;
  if (pending_read_.remaining > 0) {
    len = pending_read_.remaining;
  } else {
    SET_OR_UNEXPECT(LoadLen(NULL), len);

    if (rdbtype == RDB_TYPE_HASH) {
      len *= 2;
    } else {
      DCHECK_EQ(rdbtype, RDB_TYPE_HASH_WITH_EXPIRY);
      len *= 3;
    }

    pending_read_.reserve = len;
  }

  // Limit each read to kMaxBlobLen elements.
  unique_ptr<LoadTrace> load_trace(new LoadTrace);
  size_t n = std::min<size_t>(len, kMaxBlobLen);
  load_trace->arr.resize(n);
  for (size_t i = 0; i < n; ++i) {
    error_code ec = ReadStringObj(&load_trace->arr[i].rdb_var);
    if (ec)
      return make_unexpected(ec);
  }

  // If there are still unread elements, cache the number of remaining
  // elements, or clear if the full object has been read.
  if (len > n) {
    pending_read_.remaining = len - n;
  } else if (pending_read_.remaining > 0) {
    pending_read_.remaining = 0;
  }

  return OpaqueObj{std::move(load_trace), rdbtype};
}

auto RdbLoaderBase::ReadZSet(int rdbtype) -> io::Result<OpaqueObj> {
  uint64_t zsetlen;
  if (pending_read_.remaining > 0) {
    zsetlen = pending_read_.remaining;
  } else {
    SET_OR_UNEXPECT(LoadLen(nullptr), zsetlen);
    pending_read_.reserve = zsetlen;
  }

  if (zsetlen == 0)
    return Unexpected(errc::empty_key);

  double score;

  // Limit each read to kMaxBlobLen elements.
  unique_ptr<LoadTrace> load_trace(new LoadTrace);
  size_t n = std::min<size_t>(zsetlen, kMaxBlobLen);
  load_trace->arr.resize(n);
  for (size_t i = 0; i < n; ++i) {
    error_code ec = ReadStringObj(&load_trace->arr[i].rdb_var);
    if (ec)
      return make_unexpected(ec);
    if (rdbtype == RDB_TYPE_ZSET_2) {
      SET_OR_UNEXPECT(FetchBinaryDouble(), score);
    } else {
      SET_OR_UNEXPECT(FetchDouble(), score);
    }
    if (isnan(score)) {
      LOG(ERROR) << "Zset with NAN score detected";
      return Unexpected(errc::rdb_file_corrupted);
    }
    load_trace->arr[i].score = score;
  }

  // If there are still unread elements, cache the number of remaining
  // elements, or clear if the full object has been read.
  if (zsetlen > n) {
    pending_read_.remaining = zsetlen - n;
  } else if (pending_read_.remaining > 0) {
    pending_read_.remaining = 0;
  }

  return OpaqueObj{std::move(load_trace), rdbtype};
}

auto RdbLoaderBase::ReadListQuicklist(int rdbtype) -> io::Result<OpaqueObj> {
  size_t len;
  if (pending_read_.remaining > 0) {
    len = pending_read_.remaining;
  } else {
    SET_OR_UNEXPECT(LoadLen(NULL), len);
    pending_read_.reserve = len;
  }

  if (len == 0)
    return Unexpected(errc::empty_key);

  unique_ptr<LoadTrace> load_trace(new LoadTrace);
  // Lists pack multiple entries into each list node (8Kb by default),
  // therefore using a smaller segment length than kMaxBlobLen.
  size_t n = std::min<size_t>(len, 512);
  load_trace->arr.resize(n);
  for (size_t i = 0; i < n; ++i) {
    uint64_t container = QUICKLIST_NODE_CONTAINER_PACKED;
    if (rdbtype == RDB_TYPE_LIST_QUICKLIST_2) {
      SET_OR_UNEXPECT(LoadLen(nullptr), container);

      if (container != QUICKLIST_NODE_CONTAINER_PACKED &&
          container != QUICKLIST_NODE_CONTAINER_PLAIN) {
        LOG(ERROR) << "Quicklist integrity check failed.";
        return Unexpected(errc::rdb_file_corrupted);
      }
    }

    RdbVariant var;
    error_code ec = ReadStringObj(&var);
    if (ec)
      return make_unexpected(ec);

    if (StrLen(var) == 0) {
      return Unexpected(errc::rdb_file_corrupted);
    }
    load_trace->arr[i].rdb_var = std::move(var);
    load_trace->arr[i].encoding = container;
  }

  // If there are still unread elements, cache the number of remaining
  // elements, or clear if the full object has been read.
  if (len > n) {
    pending_read_.remaining = len - n;
  } else if (pending_read_.remaining > 0) {
    pending_read_.remaining = 0;
  }

  return OpaqueObj{std::move(load_trace), rdbtype};
}

auto RdbLoaderBase::ReadStreams(int rdbtype) -> io::Result<OpaqueObj> {
  size_t listpacks;
  if (pending_read_.remaining > 0) {
    listpacks = pending_read_.remaining;
  } else {
    SET_OR_UNEXPECT(LoadLen(NULL), listpacks);
  }

  unique_ptr<LoadTrace> load_trace(new LoadTrace);
  // Streams pack multiple entries into each stream node (4Kb or 100
  // entries), therefore using a smaller segment length than kMaxBlobLen.
  size_t n = std::min<size_t>(listpacks, 512);
  load_trace->arr.resize(n * 2);

  error_code ec;
  for (size_t i = 0; i < n; ++i) {
    /* Get the master ID, the one we'll use as key of the radix tree
     * node: the entries inside the listpack itself are delta-encoded
     * relatively to this ID. */
    RdbVariant stream_id, blob;
    ec = ReadStringObj(&stream_id);
    if (ec)
      return make_unexpected(ec);
    if (StrLen(stream_id) != sizeof(streamID)) {
      LOG(ERROR) << "Stream node key entry is not the size of a stream ID";

      return Unexpected(errc::rdb_file_corrupted);
    }

    ec = ReadStringObj(&blob);
    if (ec)
      return make_unexpected(ec);
    if (StrLen(blob) == 0) {
      LOG(ERROR) << "Stream listpacks loading failed";
      return Unexpected(errc::rdb_file_corrupted);
    }

    load_trace->arr[2 * i].rdb_var = std::move(stream_id);
    load_trace->arr[2 * i + 1].rdb_var = std::move(blob);
  }

  // If there are still unread elements, cache the number of remaining
  // elements, or clear if the full object has been read.
  //
  // We only load the stream metadata and consumer groups in the final read,
  // so if there are still unread elements return the partial stream.
  if (listpacks > n) {
    pending_read_.remaining = listpacks - n;
    return OpaqueObj{std::move(load_trace), rdbtype};
  }

  pending_read_.remaining = 0;

  // Load stream metadata.
  load_trace->stream_trace.reset(new StreamTrace);

  /* Load total number of items inside the stream. */
  SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->stream_len);

  /* Load the last entry ID. */
  SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->last_id.ms);
  SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->last_id.seq);

  if (rdbtype >= RDB_TYPE_STREAM_LISTPACKS_2) {
    /* Load the first entry ID. */
    SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->first_id.ms);
    SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->first_id.seq);

    /* Load the maximal deleted entry ID. */
    SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->max_deleted_entry_id.ms);
    SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->max_deleted_entry_id.seq);

    /* Load the offset. */
    SET_OR_UNEXPECT(LoadLen(nullptr), load_trace->stream_trace->entries_added);
  } else {
    /* During migration the offset can be initialized to the stream's
     * length. At this point, we also don't care about tombstones
     * because CG offsets will be later initialized as well. */
    load_trace->stream_trace->entries_added = load_trace->stream_trace->stream_len;
  }

  /* Consumer groups loading */
  uint64_t cgroups_count;
  SET_OR_UNEXPECT(LoadLen(nullptr), cgroups_count);
  load_trace->stream_trace->cgroup.resize(cgroups_count);

  for (size_t i = 0; i < cgroups_count; ++i) {
    auto& cgroup = load_trace->stream_trace->cgroup[i];
    /* Get the consumer group name and ID. We can then create the
     * consumer group ASAP and populate its structure as
     * we read more data. */

    // sds cgname;
    RdbVariant cgname;
    ec = ReadStringObj(&cgname);
    if (ec)
      return make_unexpected(ec);
    cgroup.name = std::move(cgname);

    SET_OR_UNEXPECT(LoadLen(nullptr), cgroup.ms);
    SET_OR_UNEXPECT(LoadLen(nullptr), cgroup.seq);

    cgroup.entries_read = 0;
    if (rdbtype >= RDB_TYPE_STREAM_LISTPACKS_2) {
      SET_OR_UNEXPECT(LoadLen(nullptr), cgroup.entries_read);
    }

    /* Load the global PEL for this consumer group, however we'll
     * not yet populate the NACK structures with the message
     * owner, since consumers for this group and their messages will
     * be read as a next step. So for now leave them not resolved
     * and later populate it. */
    uint64_t pel_size;
    SET_OR_UNEXPECT(LoadLen(nullptr), pel_size);

    cgroup.pel_arr.resize(pel_size);

    for (size_t j = 0; j < pel_size; ++j) {
      auto& pel = cgroup.pel_arr[j];
      error_code ec = FetchBuf(pel.rawid.size(), pel.rawid.data());
      if (ec) {
        LOG(ERROR) << "Stream PEL ID loading failed.";
        return make_unexpected(ec);
      }

      SET_OR_UNEXPECT(FetchInt<int64_t>(), pel.delivery_time);
      SET_OR_UNEXPECT(LoadLen(nullptr), pel.delivery_count);
    }

    /* Now that we loaded our global PEL, we need to load the
     * consumers and their local PELs. */
    uint64_t consumers_num;
    SET_OR_UNEXPECT(LoadLen(nullptr), consumers_num);
    cgroup.cons_arr.resize(consumers_num);

    for (size_t j = 0; j < consumers_num; ++j) {
      auto& consumer = cgroup.cons_arr[j];
      ec = ReadStringObj(&consumer.name);
      if (ec)
        return make_unexpected(ec);

      SET_OR_UNEXPECT(FetchInt<int64_t>(), consumer.seen_time);

      if (rdbtype >= RDB_TYPE_STREAM_LISTPACKS_3) {
        SET_OR_UNEXPECT(FetchInt<int64_t>(), consumer.active_time);
      } else {
        /* That's the best estimate we got */
        consumer.active_time = consumer.seen_time;
      }

      /* Load the PEL about entries owned by this specific
       * consumer. */
      SET_OR_UNEXPECT(LoadLen(nullptr), pel_size);
      consumer.nack_arr.resize(pel_size);
      for (size_t k = 0; k < pel_size; ++k) {
        auto& nack = consumer.nack_arr[k];
        // unsigned char rawid[sizeof(streamID)];
        error_code ec = FetchBuf(nack.size(), nack.data());
        if (ec) {
          LOG(ERROR) << "Stream PEL ID loading failed.";
          return make_unexpected(ec);
        }
        /*streamNACK* nack = (streamNACK*)raxFind(cgroup->pel, rawid, sizeof(rawid));
        if (nack == raxNotFound) {
          LOG(ERROR) << "Consumer entry not found in group global PEL";
          return Unexpected(errc::rdb_file_corrupted);
        }*/

        /* Set the NACK consumer, that was left to NULL when
         * loading the global PEL. Then set the same shared
         * NACK structure also in the consumer-specific PEL. */
        /*
        nack->consumer = consumer;
        if (!raxTryInsert(consumer->pel, rawid, sizeof(rawid), nack, NULL)) {
          LOG(ERROR) << "Duplicated consumer PEL entry loading a stream consumer group";
          streamFreeNACK(nack);
          return Unexpected(errc::duplicate_key);
        }*/
      }
    }  // while (consumers_num)
  }    // while (cgroup_num)

  return OpaqueObj{std::move(load_trace), RDB_TYPE_STREAM_LISTPACKS};
}

auto RdbLoaderBase::ReadRedisJson() -> io::Result<OpaqueObj> {
  auto json_magic_number = LoadLen(nullptr);
  if (!json_magic_number) {
    return Unexpected(errc::rdb_file_corrupted);
  }

  constexpr string_view kJsonModule = "ReJSON-RL"sv;
  string module_name = ModuleTypeName(*json_magic_number);
  if (module_name != kJsonModule) {
    LOG(ERROR) << "Unsupported module: " << module_name;
    return Unexpected(errc::unsupported_operation);
  }

  int encver = *json_magic_number & 1023;
  if (encver != 3) {
    LOG(ERROR) << "Unsupported ReJSON version: " << encver;
    return Unexpected(errc::unsupported_operation);
  }

  auto opcode = FetchInt<uint8_t>();
  if (!opcode || *opcode != RDB_MODULE_OPCODE_STRING) {
    return Unexpected(errc::rdb_file_corrupted);
  }

  RdbVariant dest;
  error_code ec = ReadStringObj(&dest);
  if (ec) {
    return make_unexpected(ec);
  }

  opcode = FetchInt<uint8_t>();
  if (!opcode || *opcode != RDB_MODULE_OPCODE_EOF) {
    return Unexpected(errc::rdb_file_corrupted);
  }

  return OpaqueObj{std::move(dest), RDB_TYPE_JSON};
}

auto RdbLoaderBase::ReadSBFImpl(bool chunking) -> io::Result<OpaqueObj> {
  RdbSBF res;
  uint64_t options;
  SET_OR_UNEXPECT(LoadLen(nullptr), options);
  if (options != 0)
    return Unexpected(errc::rdb_file_corrupted);
  SET_OR_UNEXPECT(FetchBinaryDouble(), res.grow_factor);
  SET_OR_UNEXPECT(FetchBinaryDouble(), res.fp_prob);
  if (res.fp_prob <= 0 || res.fp_prob > 0.5) {
    return Unexpected(errc::rdb_file_corrupted);
  }
  SET_OR_UNEXPECT(LoadLen(nullptr), res.prev_size);
  SET_OR_UNEXPECT(LoadLen(nullptr), res.current_size);
  SET_OR_UNEXPECT(LoadLen(nullptr), res.max_capacity);

  unsigned num_filters = 0;
  SET_OR_UNEXPECT(LoadLen(nullptr), num_filters);
  auto is_power2 = [](size_t n) { return (n & (n - 1)) == 0; };

  for (unsigned i = 0; i < num_filters; ++i) {
    unsigned hash_cnt;
    string filter_data;
    SET_OR_UNEXPECT(LoadLen(nullptr), hash_cnt);

    if (chunking) {
      size_t total_size = 0;
      SET_OR_UNEXPECT(LoadLen(nullptr), total_size);
      if (total_size == 0) {
        return Unexpected(errc::rdb_file_corrupted);
      }

      filter_data.resize(total_size);
      size_t offset = 0;
      while (offset < total_size) {
        size_t chunk_size = 0;
        SET_OR_UNEXPECT(LoadLen(nullptr), chunk_size);
        if (chunk_size == 0 || chunk_size > total_size - offset) {
          return Unexpected(errc::rdb_file_corrupted);
        }
        error_code ec = FetchBuf(chunk_size, filter_data.data() + offset);
        if (ec) {
          return make_unexpected(ec);
        }

        offset += chunk_size;
      }
    } else {
      SET_OR_UNEXPECT(FetchGenericString(), filter_data);
    }

    size_t bit_len = filter_data.size() * 8;
    if (!is_power2(bit_len)) {  // must be power of two
      return Unexpected(errc::rdb_file_corrupted);
    }
    res.filters.emplace_back(hash_cnt, std::move(filter_data));
  }
  return OpaqueObj{std::move(res), RDB_TYPE_SBF};
}

auto RdbLoaderBase::ReadSBF() -> io::Result<OpaqueObj> {
  return ReadSBFImpl(false);
}

auto RdbLoaderBase::ReadSBF2() -> io::Result<OpaqueObj> {
  return ReadSBFImpl(true);
}

io::Result<RdbLoaderBase::OpaqueObj> RdbLoaderBase::ReadCMS() {
  RdbCMS res;

  SET_OR_UNEXPECT(LoadLen(nullptr), res.width);
  SET_OR_UNEXPECT(LoadLen(nullptr), res.depth);
  SET_OR_UNEXPECT(LoadLen(nullptr), res.total_incr_count);

  const size_t num_counters = res.width * res.depth;
  res.counters.resize(num_counters);
  for (size_t i = 0; i < num_counters; ++i) {
    uint64_t raw;
    auto ec = FetchBuf(sizeof(raw), &raw);
    if (ec)
      return make_unexpected(ec);
    res.counters[i] = static_cast<int64_t>(base::LE::LoadT<uint64_t>(&raw));
  }

  return OpaqueObj{std::move(res), RDB_TYPE_CMS};
}

template <typename T> io::Result<T> RdbLoaderBase::FetchInt() {
  auto ec = EnsureRead(sizeof(T));
  if (ec)
    return make_unexpected(ec);

  char buf[16];
  mem_buf_->ReadAndConsume(sizeof(T), buf);

  return base::LE::LoadT<std::make_unsigned_t<T>>(buf);
}

io::Result<uint8_t> RdbLoaderBase::FetchType() {
  return FetchInt<uint8_t>();
}

// -------------- RdbLoader   ----------------------------

struct RdbLoader::ObjSettings {
  long long now;           // current epoch time in ms.
  int64_t expiretime = 0;  // expire epoch time in ms
  uint32_t mc_flags = 0;

  bool has_expired = false;

  bool is_sticky = false;
  bool has_mc_flags = false;

  void Reset() {
    mc_flags = expiretime = 0;
    has_expired = false;
    is_sticky = false;
    has_mc_flags = false;
  }

  void SetExpire(int64_t val) {
    expiretime = val;
    has_expired = (val <= now);
  }

  void SetMCFlags(uint32_t flags) {
    has_mc_flags = true;
    mc_flags = flags;
  }

  ObjSettings() = default;
};

RdbLoader::RdbLoader(Service* service, RdbLoadContext* load_context, std::string snapshot_id)
    : service_{service},
      load_context_(load_context),
      snapshot_id_(std::move(snapshot_id)),
      rdb_ignore_expiry_{GetFlag(FLAGS_rdb_ignore_expiry)},
      deserialize_hnsw_index_{GetFlag(FLAGS_deserialize_hnsw_index)},
      script_mgr_{service == nullptr ? nullptr : service->script_mgr()},
      shard_buf_{shard_set->size()} {
}

RdbLoader::~RdbLoader() {
  while (true) {
    Item* item = item_queue_.Pop();
    if (item == nullptr)
      break;
    delete item;
  }

  // Decommit local memory.
  // We create an RdbLoader for each thread, so each one will Decommit for itself after
  // full sync ends (since we explicitly reset the RdbLoader).
  auto* tlocal = ServerState::tlocal();
  tlocal->DecommitMemory(ServerState::kAllMemory);
}

error_code RdbLoader::Load(io::Source* src) {
  CHECK(!src_ && src);

  is_tiered_enabled_ =
      shard_set->Await(0, [] { return EngineShard::tlocal()->tiered_storage() != nullptr; });

  absl::Time start = absl::Now();
  src_ = src;

  IoBuf::Bytes bytes = mem_buf_->AppendBuffer();
  io::Result<size_t> read_sz = src_->ReadAtLeast(bytes, 9);
  if (!read_sz)
    return read_sz.error();

  bytes_read_ = *read_sz;
  if (bytes_read_ < 9) {
    return RdbError(errc::wrong_signature);
  }

  mem_buf_->CommitWrite(bytes_read_);

  {
    auto cb = mem_buf_->InputBuffer();

    if (memcmp(cb.data(), "REDIS", 5) != 0) {
      VLOG(1) << "Bad header: " << absl::CHexEscape(facade::ToSV(cb));
      return RdbError(errc::wrong_signature);
    }

    char buf[64] = {0};
    ::memcpy(buf, cb.data() + 5, 4);

    rdb_version_ = atoi(buf);
    if (rdb_version_ < 5 || rdb_version_ > RDB_VERSION) {  // We accept starting from 5.
      LOG(ERROR) << "RDB Version " << rdb_version_ << " is not supported";
      return RdbError(errc::bad_version);
    }

    mem_buf_->ConsumeInput(9);
  }

  int type;

  /* Key-specific attributes, set by opcodes before the key type. */
  ObjSettings settings;
  settings.now = GetCurrentTimeMs();
  size_t keys_loaded = 0;

  auto cleanup = absl::Cleanup([&] { FinishLoad(start, &keys_loaded); });

  // Increment local one if it exists
  if (EngineShard* es = EngineShard::tlocal(); es) {
    GetCurrentDbSlice().IncrLoadInProgress();
  }

  while (!stop_early_.load(memory_order_relaxed)) {
    if (pause_) {
      ThisFiber::SleepFor(100ms);
      continue;
    }

    /* Read type. */
    SET_OR_RETURN(FetchType(), type);

    DVLOG(3) << "Opcode type: " << type;

    /* Handle special types. */
    if (type == RDB_OPCODE_EXPIRETIME) {
      LOG(ERROR) << "opcode RDB_OPCODE_EXPIRETIME not supported";

      return RdbError(errc::invalid_encoding);
    }

    if (type == RDB_OPCODE_EXPIRETIME_MS) {
      int64_t val;
      /* EXPIRETIME_MS: milliseconds precision expire times introduced
       * with RDB v3. Like EXPIRETIME but no with more precision. */
      SET_OR_RETURN(FetchInt<int64_t>(), val);
      if (!rdb_ignore_expiry_) {
        settings.SetExpire(val);
      }
      continue; /* Read next opcode. */
    }

    if (type == RDB_OPCODE_DF_MASK) {
      uint32_t mask;
      SET_OR_RETURN(FetchInt<uint32_t>(), mask);
      settings.is_sticky = mask & DF_MASK_FLAG_STICKY;
      settings.has_mc_flags = mask & DF_MASK_FLAG_MC_FLAGS;
      if (settings.has_mc_flags) {
        SET_OR_RETURN(FetchInt<uint32_t>(), settings.mc_flags);
      }
      continue; /* Read next opcode. */
    }

    if (type == RDB_OPCODE_FREQ) {
      /* FREQ: LFU frequency. */
      FetchInt<uint8_t>();  // IGNORE
      continue;             /* Read next opcode. */
    }

    if (type == RDB_OPCODE_IDLE) {
      /* IDLE: LRU idle time. */
      uint64_t idle;
      SET_OR_RETURN(LoadLen(nullptr), idle);  // ignore
      (void)idle;
      continue; /* Read next opcode. */
    }

    if (type == RDB_OPCODE_EOF) {
      /* EOF: End of file, exit the main loop. */
      break;
    }

    if (type == RDB_OPCODE_FULLSYNC_END) {
      VLOG(1) << "Read RDB_OPCODE_FULLSYNC_END";
      RETURN_ON_ERR(EnsureRead(8));
      mem_buf_->ConsumeInput(8);  // ignore 8 bytes

      if (full_sync_cut_cb) {
        FlushAllShards();  // Flush as the handler awakes post load handlers
        full_sync_cut_cb();
      }
      continue;
    }

    if (type == RDB_OPCODE_JOURNAL_OFFSET) {
      VLOG(1) << "Read RDB_OPCODE_JOURNAL_OFFSET";
      uint64_t journal_offset;
      SET_OR_RETURN(FetchInt<uint64_t>(), journal_offset);
      VLOG(1) << "Got offset " << journal_offset;
      journal_offset_ = journal_offset;
      continue;
    }

    if (type == RDB_OPCODE_SELECTDB) {
      unsigned dbid = 0;

      /* SELECTDB: Select the specified database. */
      SET_OR_RETURN(LoadLen(nullptr), dbid);

      if (dbid > GetFlag(FLAGS_dbnum)) {
        LOG(WARNING) << "database id " << dbid << " exceeds dbnum limit. Try increasing the flag.";

        return RdbError(errc::bad_db_index);
      }

      DVLOG(2) << "Select DB: " << dbid;
      for (unsigned i = 0; i < shard_set->size(); ++i) {
        // we should flush pending items before switching dbid.
        FlushShardAsync(i);

        // Active database if not existed before.
        shard_set->Add(i, [dbid] { GetCurrentDbSlice().ActivateDb(dbid); });
      }

      cur_db_index_ = dbid;
      if (EngineShard::tlocal()) {  // because we sometimes create entries inline.
        GetCurrentDbSlice().ActivateDb(dbid);
      }
      continue; /* Read next opcode. */
    }

    if (type == RDB_OPCODE_RESIZEDB) {
      /* RESIZEDB: Hint about the size of the keys in the currently
       * selected data base, in order to avoid useless rehashing. */
      uint64_t db_size, expires_size;
      SET_OR_RETURN(LoadLen(nullptr), db_size);
      SET_OR_RETURN(LoadLen(nullptr), expires_size);

      VLOG(1) << "RESIZEDB: db_size=" << db_size << ", expires_size=" << expires_size;

      // We do not use this information because it is not possible to easily preallocate
      // dash tables based on this information. Moreover, number of shards can change
      // between the original shard set and the loading server.
      continue; /* Read next opcode. */
    }

    if (type == RDB_OPCODE_AUX) {
      RETURN_ON_ERR(HandleAux());
      continue; /* Read type again. */
    }

    if (type == RDB_OPCODE_MODULE_AUX) {
      uint64_t module_id;
      SET_OR_RETURN(LoadLen(nullptr), module_id);
      string module_name = ModuleTypeName(module_id);

      LOG(WARNING) << "WARNING: Skipping data for module " << module_name;
      RETURN_ON_ERR(SkipModuleData());
      continue;
    }

    if (type == RDB_OPCODE_COMPRESSED_ZSTD_BLOB_START ||
        type == RDB_OPCODE_COMPRESSED_LZ4_BLOB_START) {
      RETURN_ON_ERR(HandleCompressedBlob(type));
      continue;
    }

    if (type == RDB_OPCODE_COMPRESSED_BLOB_END) {
      RETURN_ON_ERR(HandleCompressedBlobFinish());
      continue;
    }

    if (type == RDB_OPCODE_JOURNAL_BLOB) {
      FlushAllShards();  // Always flush before applying incremental on top
      RETURN_ON_ERR(HandleJournalBlob(service_));
      continue;
    }

    if (type == RDB_OPCODE_SLOT_INFO) {
      [[maybe_unused]] uint64_t slot_id;
      SET_OR_RETURN(LoadLen(nullptr), slot_id);
      [[maybe_unused]] uint64_t slot_size;
      SET_OR_RETURN(LoadLen(nullptr), slot_size);
      [[maybe_unused]] uint64_t expires_slot_size;
      SET_OR_RETURN(LoadLen(nullptr), expires_slot_size);
      continue;
    }

    if (type == RDB_OPCODE_VECTOR_INDEX) {
      // HNSW vector index graph data.
      // Binary format: [index_key, elements_number,
      //   then for each node (little-endian):
      //     internal_id (4 bytes), global_id (8 bytes), level (4 bytes),
      //     for each level (0 to level): links_num (4 bytes) + links (4 bytes each)]
      string index_key;
      SET_OR_RETURN(FetchGenericString(), index_key);

      uint64_t elements_number;
      SET_OR_RETURN(LoadLen(nullptr), elements_number);

      if (!deserialize_hnsw_index_) {
        RETURN_ON_ERR(SkipVectorIndex(index_key, elements_number));
      } else {
        DCHECK_GT(shard_count_, 0u);
        // Parse "index_name:field_name" from the composite key.
        size_t colon_pos = index_key.rfind(':');
        string_view index_name{index_key.data(),
                               colon_pos != string::npos ? colon_pos : index_key.size()};
        string_view field_name = colon_pos != string::npos
                                     ? string_view{index_key.data() + colon_pos + 1}
                                     : string_view{};

        if (shard_count_ == shard_set->size()) {
          // Same shard count: restore directly.
          RETURN_ON_ERR(RestoreVectorIndex(index_key, index_name, field_name, elements_number));
        } else {
          // Different shard count: load nodes and defer restoration.
          // Global_ids will be remapped in PerformPostLoad after all key mappings are collected.
          PendingHnswNodes pending{std::string(index_name), std::string(field_name), {}};
          RETURN_ON_ERR(LoadVectorIndexNodes(elements_number, &pending.nodes));
          LOG(INFO) << "Deferred HNSW index restore for " << index_key << " with "
                    << pending.nodes.size() << " nodes (shard count mismatch: " << shard_count_
                    << " vs " << shard_set->size() << ")";
          load_context_->AddPendingHnswNodes(std::move(pending));
        }
      }
      continue;
    }

    if (type == RDB_OPCODE_SHARD_DOC_INDEX) {
      // Load ShardDocIndex key-to-DocId mapping
      // Format: [shard_id, index_name, mapping_count, then for each mapping: key_string, doc_id]
      PendingIndexMapping pim;
      uint32_t shard_id;
      SET_OR_RETURN(LoadLen(nullptr), shard_id);

      SET_OR_RETURN(FetchGenericString(), pim.index_name);

      uint64_t mapping_count;
      SET_OR_RETURN(LoadLen(nullptr), mapping_count);
      pim.mappings.reserve(mapping_count);

      for (uint64_t i = 0; i < mapping_count; ++i) {
        string key;
        SET_OR_RETURN(FetchGenericString(), key);
        uint64_t doc_id;
        SET_OR_RETURN(LoadLen(nullptr), doc_id);
        pim.mappings.emplace_back(std::move(key), static_cast<search::DocId>(doc_id));
      }

      if (!deserialize_hnsw_index_) {
        continue;
      }
      DCHECK_GT(shard_count_, 0u);

      VLOG(2) << "Loaded index mapping for shard " << shard_id << " with " << mapping_count
              << " entries";

      // Always store mappings. When shard counts differ, PerformPostLoad will redistribute
      // keys to replica shards and remap global_ids accordingly.
      load_context_->AddPendingIndexMapping(shard_id, std::move(pim));
      continue;
    }

    if (!rdbIsObjectTypeDF(type)) {
      LOG(ERROR) << "Unrecognized rdb object type: " << type;
      LOG(ERROR) << "Last iteration: ";
      LOG(ERROR) << "key loaded: " << absl::CHexEscape(last_key_loaded_);
      LOG(ERROR) << "pending_read_.remaining: " << pending_read_.remaining
                 << "\npending_read_.reserve: " << pending_read_.reserve;
      // In case we encounter an error, it might worth peeking the InputBuffer()
      return RdbError(errc::invalid_rdb_type);
    }

    ++keys_loaded;
    RETURN_ON_ERR(LoadKeyValPair(type, &settings));
    settings.Reset();
  }  // main load loop

  DVLOG(1) << "RdbLoad loop finished";

  if (stop_early_) {
    return *ec_;
  }

  /* Verify the checksum if RDB version is >= 5 */
  RETURN_ON_ERR(VerifyChecksum());

  return kOk;
}

void RdbLoader::FinishLoad(absl::Time start_time, size_t* keys_loaded) {
  BlockingCounter bc(shard_set->size());
  for (unsigned i = 0; i < shard_set->size(); ++i) {
    // Flush the remaining items.
    FlushShardAsync(i);

    // Send sentinel callbacks to ensure that all previous messages have been processed.
    shard_set->Add(i, [bc]() mutable { bc->Dec(); });
  }
  bc->Wait();  // wait for sentinels to report.
  // Decrement local one if it exists
  if (EngineShard* es = EngineShard::tlocal(); es) {
    GetCurrentDbSlice().DecrLoadInProgress();
  }

  now_chunked_.clear();

  absl::Duration dur = absl::Now() - start_time;
  load_time_ = double(absl::ToInt64Milliseconds(dur)) / 1000;
  keys_loaded_ = *keys_loaded;
}

std::error_code RdbLoaderBase::EnsureRead(size_t min_sz) {
  // In the flow of reading compressed data, we store the uncompressed data to in uncompressed
  // buffer. When parsing entries we call ensure read with 9 bytes to read the length of
  // key/value. If the key/value is very small (less than 9 bytes) the remainded data in
  // uncompressed buffer might contain less than 9 bytes. We need to make sure that we dont read
  // from sink to the uncompressed buffer and therefor in this flow we return here.
  if (mem_buf_ != &origin_mem_buf_)
    return std::error_code{};
  if (mem_buf_->InputLen() >= min_sz)
    return std::error_code{};
  return EnsureReadInternal(min_sz);
}

error_code RdbLoaderBase::EnsureReadInternal(size_t min_to_read) {
  // We need to include what we already read inside Input buffer. Otherwise we might expect to read
  // more than the minimum
  const size_t min_sz = min_to_read - mem_buf_->InputLen();

  auto out_buf = mem_buf_->AppendBuffer();
  CHECK_GT(out_buf.size(), min_sz);

  // If limit was applied we do not want to read more than needed
  // important when reading from sockets.
  if (bytes_read_ + out_buf.size() > source_limit_) {
    out_buf = out_buf.subspan(0, source_limit_ - bytes_read_);
  }

  io::Result<size_t> res = src_->ReadAtLeast(out_buf, min_sz);
  if (!res) {
    VLOG(1) << "Error reading from source: " << res.error() << " " << min_sz << " bytes";
    return res.error();
  }
  if (*res < min_sz)
    return RdbError(errc::rdb_file_corrupted);
  DVLOG(2) << "EnsureRead " << *res << " bytes";
  bytes_read_ += *res;

  DCHECK_LE(bytes_read_, source_limit_);
  mem_buf_->CommitWrite(*res);

  return kOk;
}

io::Result<uint64_t> RdbLoaderBase::LoadLen(bool* is_encoded) {
  if (is_encoded)
    *is_encoded = false;

  // Every RDB file with rdbver >= 5 has 8-bytes checksum at the end,
  // so we can ensure we have 9 bytes to read up until that point.
  error_code ec = EnsureRead(9);
  if (ec)
    return make_unexpected(ec);

  // Read integer meta info.
  auto bytes = mem_buf_->InputBuffer();
  PackedUIntMeta meta{bytes[0]};
  bytes.remove_prefix(1);

  // Read integer.
  uint64_t res;
  SET_OR_UNEXPECT(ReadPackedUInt(meta, bytes), res);

  if (meta.Type() == RDB_ENCVAL && is_encoded)
    *is_encoded = true;

  mem_buf_->ConsumeInput(1 + meta.ByteSize());

  return res;
}

error_code RdbLoaderBase::AllocateDecompressOnce(int op_type) {
  if (decompress_impl_) {
    return {};
  }

  if (op_type == RDB_OPCODE_COMPRESSED_ZSTD_BLOB_START) {
    decompress_impl_ = detail::DecompressImpl::CreateZstd();
  } else if (op_type == RDB_OPCODE_COMPRESSED_LZ4_BLOB_START) {
    decompress_impl_ = detail::DecompressImpl::CreateLZ4();
  } else {
    return RdbError(errc::unsupported_operation);
  }
  return {};
}

error_code RdbLoaderBase::SkipModuleData() {
  uint64_t opcode;
  SET_OR_RETURN(LoadLen(nullptr), opcode);  // ignore field 'when_opcode'
  if (opcode != RDB_MODULE_OPCODE_UINT)
    return RdbError(errc::rdb_file_corrupted);
  SET_OR_RETURN(LoadLen(nullptr), opcode);  // ignore field 'when'

  while (true) {
    SET_OR_RETURN(LoadLen(nullptr), opcode);

    switch (opcode) {
      case RDB_MODULE_OPCODE_EOF:
        return kOk;  // Module data end

      case RDB_MODULE_OPCODE_SINT:
      case RDB_MODULE_OPCODE_UINT: {
        [[maybe_unused]] uint64_t _;
        SET_OR_RETURN(LoadLen(nullptr), _);
        break;
      }

      case RDB_MODULE_OPCODE_STRING: {
        RdbVariant dest;
        error_code ec = ReadStringObj(&dest);
        if (ec) {
          return ec;
        }
        break;
      }

      case RDB_MODULE_OPCODE_DOUBLE: {
        [[maybe_unused]] double _;
        SET_OR_RETURN(FetchBinaryDouble(), _);
        break;
      }

      default:
        // TODO: handle RDB_MODULE_OPCODE_FLOAT
        LOG(ERROR) << "Unsupported module section: " << opcode;
        return RdbError(errc::rdb_file_corrupted);
    }
  }
}

error_code RdbLoaderBase::HandleCompressedBlob(int op_type) {
  DVLOG(2) << "HandleCompressedBlob: " << op_type;
  RETURN_ON_ERR(AllocateDecompressOnce(op_type));

  // Fetch uncompress blob
  string res;
  SET_OR_RETURN(FetchGenericString(), res);

  // Decompress blob and switch membuf pointer
  // Last type in the compressed blob is RDB_OPCODE_COMPRESSED_BLOB_END
  // in which we will switch back to the origin membuf (HandleCompressedBlobFinish)
  SET_OR_RETURN(decompress_impl_->Decompress(res), mem_buf_);

  return kOk;
}

error_code RdbLoaderBase::HandleCompressedBlobFinish() {
  DVLOG(2) << "HandleCompressedBlobFinish";

  CHECK_NE(&origin_mem_buf_, mem_buf_);
  CHECK_EQ(mem_buf_->InputLen(), size_t(0));
  mem_buf_ = &origin_mem_buf_;
  return kOk;
}

error_code RdbLoaderBase::HandleJournalBlob(Service* service) {
  // Read the number of entries in the journal blob.
  size_t num_entries;
  bool _encoded;
  SET_OR_RETURN(LoadLen(&_encoded), num_entries);

  // Read the journal blob.
  string journal_blob;
  SET_OR_RETURN(FetchGenericString(), journal_blob);

  io::BytesSource bs{io::Buffer(journal_blob)};
  journal_reader_.SetSource(&bs);

  // Parse and exectue in loop.
  size_t done = 0;
  JournalExecutor ex{service};
  while (done < num_entries) {
    journal::ParsedEntry entry;
    auto ec = journal_reader_.ReadEntry(&entry);
    if (ec)
      return ec;

    done++;

    if (entry.cmd.empty()) {
      if (entry.opcode == journal::Op::PING) {
        continue;
      }
      return RdbError(errc::rdb_file_corrupted);
    }

    if (absl::EqualsIgnoreCase(entry.cmd[0], "FLUSHALL") ||
        absl::EqualsIgnoreCase(entry.cmd[0], "FLUSHDB")) {
      // Applying a flush* operation in the middle of a load can cause out-of-sync deletions of
      // data that should not be deleted, see https://github.com/dragonflydb/dragonfly/issues/1231
      // By returning an error we are effectively restarting the replication.
      return RdbError(errc::unsupported_operation);
    }

    DVLOG(2) << "Executing item: " << entry.ToString();
    ex.Execute(entry.dbid, entry.cmd);
  }

  return std::error_code{};
}

error_code RdbLoader::HandleAux() {
  /* AUX: generic string-string fields. Use to add state to RDB
   * which is backward compatible. Implementations of RDB loading
   * are required to skip AUX fields they don't understand.
   *
   * An AUX field is composed of two strings: key and value. */
  string auxkey, auxval;

  SET_OR_RETURN(FetchGenericString(), auxkey);
  SET_OR_RETURN(FetchGenericString(), auxval);

  if (!auxkey.empty() && auxkey[0] == '%') {
    /* All the fields with a name staring with '%' are considered
     * information fields and are logged at startup with a log
     * level of NOTICE. */
    LOG(INFO) << "RDB '" << auxkey << "': " << auxval;
  } else if (auxkey == "snapshot-id") {
    if (snapshot_id_.empty()) {
      snapshot_id_ = auxval;
    } else if (snapshot_id_ != auxval) {
      return RdbError(errc::incorrect_snapshot_id);
    }
  } else if (auxkey == "repl-stream-db") {
    // TODO
  } else if (auxkey == "repl-id") {
    // TODO
  } else if (auxkey == "repl-offset") {
    // TODO
  } else if (auxkey == "lua") {
    LoadScriptFromAux(std::move(auxval));
  } else if (auxkey == "redis-ver") {
    VLOG(1) << "Loading RDB produced by Redis version " << auxval;
  } else if (auxkey == "df-ver") {
    VLOG(1) << "Loading RDB produced by Dragonfly version " << auxval;
  } else if (auxkey == "ctime") {
    int64_t ctime;
    if (absl::SimpleAtoi(auxval, &ctime)) {
      time_t age = time(NULL) - ctime;
      if (age < 0)
        age = 0;
      VLOG(1) << "RDB age " << strings::HumanReadableElapsedTime(age);
    }
  } else if (auxkey == "used-mem") {
    int64_t usedmem;
    if (absl::SimpleAtoi(auxval, &usedmem)) {
      VLOG(1) << "RDB memory usage when created " << strings::HumanReadableNumBytes(usedmem);
      // We allow 5% tolerance for snapshot used memory
      if (usedmem > (max_memory_limit * 1.05)) {
        if (IsClusterEnabled()) {
          LOG(INFO) << "Allowing to load a snapshot of size " << usedmem
                    << ", despite memory limit of " << max_memory_limit << " due to cluster mode";
        } else if (is_tiered_enabled_) {
          LOG(INFO) << "Allowing to load a snapshot of size " << usedmem
                    << ", despite memory limit of " << max_memory_limit << " due to tiered storage";
        } else {
          LOG(WARNING) << "Could not load snapshot - its used memory is " << usedmem
                       << " but the limit is " << max_memory_limit;
          return RdbError(errc::out_of_memory);
        }
      }
    }
  } else if (auxkey == "aof-preamble") {
    long long haspreamble;
    if (absl::SimpleAtoi(auxval, &haspreamble) && haspreamble) {
      VLOG(1) << "RDB has an AOF tail";
    }
  } else if (auxkey == "redis-bits") {
    /* Just ignored. */
  } else if (auxkey == "search-index") {
    LoadSearchIndexDefFromAux(std::move(auxval));
  } else if (auxkey == "hnsw-index-metadata") {
    LoadHnswIndexMetadataFromAux(std::move(auxval));
  } else if (auxkey == "search-synonyms") {
    LoadSearchSynonymsFromAux(std::move(auxval));
  } else if (auxkey == "shard-count") {
    uint32_t shard_count;
    if (absl::SimpleAtoi(auxval, &shard_count)) {
      shard_count_ = shard_count;
      load_context_->SetMasterShardCount(shard_count);
    }
  } else if (auxkey == "shard-id") {
    uint32_t shard_id;
    if (absl::SimpleAtoi(auxval, &shard_id)) {
      shard_id_ = shard_id;
    }
  } else if (auxkey == "table-mem") {
    size_t mem;
    if (absl::SimpleAtoi(auxval, &mem)) {
      table_used_memory_ = mem;
    }
  } else {
    /* We ignore fields we don't understand, as by AUX field
     * contract. */
    LOG(WARNING) << "Unrecognized RDB AUX field: '" << auxkey << "'";
  }

  return kOk;
}

error_code RdbLoader::VerifyChecksum() {
  uint64_t expected;

  SET_OR_RETURN(FetchInt<uint64_t>(), expected);

  io::Bytes cur_buf = mem_buf_->InputBuffer();

  VLOG(1) << "VerifyChecksum: input buffer len " << cur_buf.size() << ", expected " << expected;

  return kOk;
}

void RdbLoader::FlushShardAsync(ShardId sid) {
  auto& out_buf = shard_buf_[sid];
  if (out_buf.empty())
    return;

  auto cb = [indx = this->cur_db_index_, this, ib = std::move(out_buf)] {
    auto& db_slice = GetCurrentDbSlice();

    // Before we start loading, increment LoadInProgress.
    // This is required because FlushShardAsync dispatches to multiple shards, and those shards
    // might have not yet have their state (load in progress) incremented.
    db_slice.IncrLoadInProgress();
    this->LoadItemsBuffer(indx, ib);
    db_slice.DecrLoadInProgress();
  };

  bool preempted = shard_set->Add(sid, std::move(cb));
  VLOG_IF(2, preempted) << "FlushShardAsync was throttled";
}

void RdbLoader::FlushAllShards() {
  for (ShardId i = 0; i < shard_set->size(); i++)
    FlushShardAsync(i);
}

std::error_code RdbLoaderBase::FromOpaque(const OpaqueObj& opaque, LoadConfig config,
                                          PrimeValue* pv) {
  OpaqueObjLoader visitor(opaque.rdb_type, pv, config);
  std::visit(visitor, opaque.obj);

  return visitor.ec();
}

void RdbLoaderBase::CopyStreamId(const StreamID& src, struct streamID* dest) {
  dest->ms = src.ms;
  dest->seq = src.seq;
}

void RdbLoader::CreateObjectOnShard(const DbContext& db_cntx, const Item* item, DbSlice* db_slice) {
  PrimeValue pv;
  PrimeValue* pv_ptr = &pv;
  DbIndex db_ind = db_cntx.db_index;

  auto error_msg = [](const auto* item, auto db_ind) {
    return absl::StrCat("Found empty key: ", item->key, " in DB ", db_ind, " rdb_type ",
                        item->val.rdb_type);
  };

  LoadConfig config_copy = item->load_config;
  if (item->load_config.chunked && item->load_config.append) {
    std::unique_lock lk{now_chunked_mu_};
    if (auto it = now_chunked_.find(item->key); it != now_chunked_.end()) {
      pv_ptr = it->second.get();
    } else {
      // Sets and hashes are deleted when all their entries are expired.
      // If it's the case, set reset append flag and start from scratch.
      bool key_is_not_expired = item->expire_ms == 0 || db_cntx.time_now_ms < item->expire_ms;
      bool is_set_expiry_type = item->val.rdb_type == RDB_TYPE_HASH_WITH_EXPIRY ||
                                item->val.rdb_type == RDB_TYPE_SET_WITH_EXPIRY;
      if (!is_set_expiry_type && key_is_not_expired) {
        LOG(ERROR) << "Count not to find append key '" << item->key << "' in DB " << db_ind;
        return;
      }
      config_copy.append = false;
    }
  }

  if (auto ec = FromOpaque(item->val, config_copy, pv_ptr); ec) {
    if (ec.value() == errc::value_expired) {
      // hmap and sset values can expire and we ok with it,
      // so we don't set ec_ in this case
      return;
    }
    ec_ = ec;
    if (ec.value() == errc::empty_key) {
      auto error = error_msg(item, db_ind);
      if (RdbTypeAllowedEmpty(item->val.rdb_type)) {
        LOG(WARNING) << error;
      } else {
        LOG(ERROR) << error;
      }
      return;
    }
    LOG(ERROR) << "Could not load value for key '" << absl::CHexEscape(item->key) << "' in DB "
               << db_ind << " " << item->load_config.chunked << " " << item->load_config.append
               << " " << item->val.rdb_type;
    stop_early_ = true;
    return;
  }

  if (item->load_config.chunked) {
    std::unique_lock lk{now_chunked_mu_};
    if (!now_chunked_.contains(item->key))
      now_chunked_.emplace(item->key, make_unique<PrimeValue>(std::move(pv)));

    if (!item->load_config.finalize)
      return;

    pv = std::move(*now_chunked_.extract(item->key).mapped());
  }

  // We need this extra check because we don't return empty_key
  if (!pv.TagAllowsEmptyValue() && pv.Size() == 0) {
    LOG(WARNING) << error_msg(item, db_ind);
    return;
  }

  if (item->expire_ms > 0 && db_cntx.time_now_ms >= item->expire_ms) {
    VLOG(2) << "Expire key on load: " << item->key;
    return;
  }

  auto op_res = db_slice->AddOrUpdate(db_cntx, item->key, std::move(pv), item->expire_ms);
  if (!op_res) {
    LOG(ERROR) << "OOM failed to add key '" << item->key << "' in DB " << db_ind;
    ec_ = RdbError(errc::out_of_memory);
    stop_early_ = true;
    return;
  }

  DbSlice::ItAndUpdater& updater = *op_res;
  updater.it->first.SetSticky(item->is_sticky);
  if (item->has_mc_flags) {
    updater.it->second.SetFlag(true);
    db_slice->SetMCFlag(db_cntx.db_index, updater.it->first, item->mc_flags);
  }

  if (!override_existing_keys_ && !updater.is_new) {
    LOG(WARNING) << "RDB has duplicated key '" << item->key << "' in DB " << db_ind << " of type "
                 << updater.it->second.ObjType();
  }

  if (auto* ts = db_slice->shard_owner()->tiered_storage(); ts) {
    // Finalize the AutoUpdater before stashing. The stash callback may complete
    // (e.g. during the SleepFor yield below) and transform the PrimeValue to external,
    // changing MallocUsed(). If the AutoUpdater ran after that, it would compute a
    // bogus negative memory delta and crash in AccountObjectMemory.
    auto it = updater.it;
    updater.post_updater.Run();
    StashPrimeValue(db_cntx.db_index, item->key, &it->second, ts, nullptr);

    // Block, if tiered storage is active, but can't keep up
    while (db_slice->shard_owner()->ShouldThrottleForTiering())
      ThisFiber::SleepFor(100us);
  }
}

void RdbLoader::LoadItemsBuffer(DbIndex db_ind, const ItemsBuf& ib) {
  EngineShard* es = EngineShard::tlocal();
  DbContext db_cntx{&namespaces->GetDefaultNamespace(), db_ind, GetCurrentTimeMs()};
  DbSlice& db_slice = db_cntx.GetDbSlice(es->shard_id());

  DCHECK(!db_slice.IsCacheMode());

  for (const auto* item : ib) {
    CreateObjectOnShard(db_cntx, item, &db_slice);
    if (stop_early_) {
      return;
    }
  }

  for (auto* item : ib) {
    item_queue_.Push(item);
  }
}

// Loads the next key/val pair.
//
// Huge objects may be loaded in parts, where only a subset of elements are
// loaded at a time. This reduces the memory required to load huge objects and
// prevents LoadItemsBuffer blocking.
error_code RdbLoader::LoadKeyValPair(int type, ObjSettings* settings) {
  std::string key;
  int64_t start = absl::GetCurrentTimeNanos();

  SET_OR_RETURN(ReadKey(), key);
  last_key_loaded_ = key;

  bool dry_run = absl::GetFlag(FLAGS_rdb_load_dry_run);
  bool streamed = false;
  do {
    // If there is a cached Item in the free pool, take it, otherwise allocate
    // a new Item (LoadItemsBuffer returns free items).
    Item* item = item_queue_.Pop();
    if (item == nullptr) {
      item = new Item;
    }
    // Delete the item if we fail to load the key/val pair.
    auto cleanup = absl::Cleanup([item] { delete item; });

    item->load_config.append = pending_read_.remaining > 0;

    error_code ec = ReadObj(type, &item->val);
    if (ec) {
      VLOG(2) << "ReadObj error " << ec << " for key " << key;
      return ec;
    }

    // If the key can be discarded, we must still continue to read the
    // object from the RDB so we can read the next key.
    if (ShouldDiscardKey(key, *settings)) {
      pending_read_.reserve = 0;
      continue;
    }

    if (dry_run)
      continue;

    item->load_config.finalize = pending_read_.remaining == 0;
    if (!item->load_config.finalize) {
      item->key = key;
      streamed = true;
    } else {
      // Avoid copying the key if this is the last read of the object.
      item->key = std::move(key);
    }

    item->load_config.chunked = streamed;
    item->load_config.reserve = pending_read_.reserve;
    // Clear 'reserve' as we must only set when the object is first
    // initialized.
    pending_read_.reserve = 0;

    item->is_sticky = settings->is_sticky;
    item->has_mc_flags = settings->has_mc_flags;
    item->mc_flags = settings->mc_flags;
    item->expire_ms = settings->expiretime;

    std::move(cleanup).Cancel();
    ShardId sid = Shard(item->key, shard_set->size());
    EngineShard* es = EngineShard::tlocal();

    if (es && es->shard_id() == sid) {
      DbContext db_cntx{&namespaces->GetDefaultNamespace(), cur_db_index_, GetCurrentTimeMs()};
      CreateObjectOnShard(db_cntx, item, &db_cntx.GetDbSlice(sid));
      item_queue_.Push(item);
    } else {
      auto& out_buf = shard_buf_[sid];

      out_buf.emplace_back(item);

      constexpr size_t kBufSize = 64;
      if (out_buf.size() >= kBufSize) {
        // Despite being async, this function can block if the shard queue is full.
        FlushShardAsync(sid);
      }
    }
  } while (pending_read_.remaining > 0 && !stop_early_.load(memory_order_relaxed));

  int delta_ms = (absl::GetCurrentTimeNanos() - start) / 1000'000;
  LOG_IF(INFO, delta_ms > 1000) << "Took " << delta_ms << " ms to load rdb_type " << type;

  return kOk;
}

bool RdbLoader::ShouldDiscardKey(std::string_view key, const ObjSettings& settings) const {
  if (!load_unowned_slots_ && IsClusterEnabled()) {
    const auto cluster_config = cluster::ClusterConfig::Current();
    if (cluster_config && !cluster_config->IsMySlot(key)) {
      return true;
    }
  }

  /* Check if the key already expired. This function is used when loading
   * an RDB file from disk, either at startup, or when an RDB was
   * received from the master. In the latter case, the master is
   * responsible for key expiry. If we would expire keys here, the
   * snapshot taken by the master may not be reflected on the slave.
   * Similarly if the RDB is the preamble of an AOF file, we want to
   * load all the keys as they are, since the log of operations later
   * assume to work in an exact keyspace state. */
  if (ServerState::tlocal()->is_master && (settings.has_expired)) {
    VLOG(3) << "Expire key on read: " << key;
    return true;
  }

  return false;
}

void RdbLoader::LoadScriptFromAux(string&& body) {
  ServerState* ss = ServerState::tlocal();
  auto interpreter = ss->BorrowInterpreter();
  absl::Cleanup clean = [ss, interpreter] { ss->ReturnInterpreter(interpreter); };

  if (script_mgr_) {
    auto res = script_mgr_->Insert(body, interpreter);
    if (!res)
      LOG(ERROR) << "Error compiling script";
  }
}

void RdbLoader::LoadSearchIndexDefFromAux(string&& def) {
  LoadSearchCommandFromAux(service_, std::move(def), "FT.CREATE", "index definition", true);
}

void RdbLoader::LoadHnswIndexMetadataFromAux(string&& def) {
  try {
    auto json_opt = JsonFromString(def);
    if (!json_opt) {
      LOG(ERROR) << "Invalid HNSW index metadata JSON: " << def;
      return;
    }
    const auto& json = *json_opt;

    PendingHnswMetadata phm;
    phm.index_name = json["index_name"].as<string>();
    phm.field_name = json["field_name"].as<string>();
    phm.metadata.max_elements = json["max_elements"].as<size_t>();
    phm.metadata.cur_element_count = json["cur_element_count"].as<size_t>();
    phm.metadata.maxlevel = json["maxlevel"].as<int>();
    phm.metadata.enterpoint_node = json["enterpoint_node"].as<size_t>();

    LOG(INFO) << "Loaded HNSW metadata for index=" << phm.index_name << " field=" << phm.field_name
              << " elements=" << phm.metadata.cur_element_count;

    load_context_->AddPendingHnswMetadata(std::move(phm));
  } catch (const std::exception& e) {
    LOG(ERROR) << "Failed to parse HNSW index metadata JSON: " << e.what() << " def: " << def;
  }
}

error_code RdbLoader::LoadVectorIndexNodes(uint64_t elements_number,
                                           std::vector<search::HnswNodeData>* nodes) {
  nodes->reserve(elements_number);
  for (uint64_t elem = 0; elem < elements_number; ++elem) {
    search::HnswNodeData node;
    SET_OR_RETURN(FetchInt<uint32_t>(), node.internal_id);
    SET_OR_RETURN(FetchInt<uint64_t>(), node.global_id);
    uint32_t raw_level;
    SET_OR_RETURN(FetchInt<uint32_t>(), raw_level);
    node.level = static_cast<int>(raw_level);

    node.levels_links.resize(node.level + 1);
    for (int lvl = 0; lvl <= node.level; ++lvl) {
      uint32_t links_num;
      SET_OR_RETURN(FetchInt<uint32_t>(), links_num);
      node.levels_links[lvl].resize(links_num);
      for (uint32_t i = 0; i < links_num; ++i) {
        SET_OR_RETURN(FetchInt<uint32_t>(), node.levels_links[lvl][i]);
      }
    }
    nodes->push_back(std::move(node));
  }
  return {};
}

error_code RdbLoader::RestoreVectorIndex(string_view index_key, string_view index_name,
                                         string_view field_name, uint64_t elements_number) {
#ifdef WITH_SEARCH
  // Look up the HNSW index in the global registry. It should exist from FT.CREATE in aux.
  auto hnsw_index = GlobalHnswIndexRegistry::Instance().Get(index_name, field_name);
  if (!hnsw_index) {
    LOG(ERROR) << "HNSW index not found for restoration: " << index_key;
    return SkipVectorIndex(index_key, elements_number);
  }

  std::vector<search::HnswNodeData> nodes;
  RETURN_ON_ERR(LoadVectorIndexNodes(elements_number, &nodes));

  if (!nodes.empty()) {
    auto metadata = load_context_->FindHnswMetadata(index_name, field_name);
    DCHECK(metadata) << "HNSW metadata missing for " << index_key;

    hnsw_index->RestoreFromNodes(nodes, *metadata);
    LOG(INFO) << "Restored HNSW index " << index_key << " with " << nodes.size() << " nodes";
  }
  return {};
#else
  return SkipVectorIndex(index_key, elements_number);
#endif
}

error_code RdbLoader::SkipVectorIndex(string_view index_key, uint64_t elements_number) {
  for (uint64_t elem = 0; elem < elements_number; ++elem) {
    SET_OR_RETURN(FetchInt<uint32_t>(), std::ignore);  // internal_id
    SET_OR_RETURN(FetchInt<uint64_t>(), std::ignore);  // global_id
    uint32_t raw_level;
    SET_OR_RETURN(FetchInt<uint32_t>(), raw_level);
    int level = static_cast<int>(raw_level);

    for (int lvl = 0; lvl <= level; ++lvl) {
      uint32_t links_num;
      SET_OR_RETURN(FetchInt<uint32_t>(), links_num);
      for (uint32_t i = 0; i < links_num; ++i) {
        SET_OR_RETURN(FetchInt<uint32_t>(), std::ignore);
      }
    }
  }

  if (elements_number > 0) {
    LOG(INFO) << "Skipping HNSW vector index restore: " << index_key
              << " elements_number=" << elements_number << " shard_count_=" << shard_count_
              << " current_shards=" << shard_set->size() << ". Index will be rebuilt from data.";
  }
  return {};
}

void RdbLoader::LoadSearchSynonymsFromAux(string&& def) {
  load_context_->AddPendingSynonymCommand(std::move(def));
}

}  // namespace dfly


================================================
FILE: src/server/rdb_load.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <system_error>

extern "C" {
#include "redis/rdb.h"
}

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include "base/mpsc_intrusive_queue.h"
#include "base/pod_array.h"
#include "core/search/base.h"
#include "core/search/hnsw_index.h"
#include "io/io.h"
#include "io/io_buf.h"
#include "server/detail/decompress.h"
#include "server/execution_state.h"
#include "server/journal/serializer.h"
#include "server/rdb_load_context.h"

struct streamID;

namespace dfly {

class EngineShardSet;
class ScriptMgr;
class CompactObj;
class Service;

using RdbVersion = std::uint16_t;

class RdbLoaderBase {
 protected:
  RdbLoaderBase();
  ~RdbLoaderBase();

  struct LoadTrace;
  using MutableBytes = ::io::MutableBytes;

  struct LzfString {
    base::PODArray<uint8_t> compressed_blob;
    uint64_t uncompressed_len;
  };

  struct RdbSBF {
    double grow_factor, fp_prob;
    size_t prev_size, current_size;
    size_t max_capacity;

    struct Filter {
      unsigned hash_cnt;
      std::string blob;
      Filter(unsigned h, std::string b) : hash_cnt(h), blob(std::move(b)) {
      }
    };
    std::vector<Filter> filters;
  };

  struct RdbCMS {
    uint32_t width, depth;
    int64_t total_incr_count;
    std::vector<int64_t> counters;
  };

  using RdbVariant = std::variant<long long, base::PODArray<char>, LzfString,
                                  std::unique_ptr<LoadTrace>, RdbSBF, RdbCMS>;

  struct OpaqueObj {
    RdbVariant obj;
    int rdb_type{0};
  };

  struct LoadBlob {
    RdbVariant rdb_var;
    union {
      unsigned encoding;
      double score;
    };
  };

  struct StreamPelTrace {
    std::array<uint8_t, 16> rawid;
    int64_t delivery_time;
    uint64_t delivery_count;
  };

  struct StreamConsumerTrace {
    RdbVariant name;
    int64_t seen_time;
    int64_t active_time;
    std::vector<std::array<uint8_t, 16>> nack_arr;
  };

  struct StreamID {
    uint64_t ms = 0;
    uint64_t seq = 0;
  };

  struct StreamCGTrace {
    RdbVariant name;
    uint64_t ms;
    uint64_t seq;
    uint64_t entries_read;
    std::vector<StreamPelTrace> pel_arr;
    std::vector<StreamConsumerTrace> cons_arr;
  };

  struct StreamTrace {
    size_t lp_len;
    size_t stream_len;
    StreamID last_id;
    StreamID first_id;             /* The first non-tombstone entry, zero if empty. */
    StreamID max_deleted_entry_id; /* The maximal ID that was deleted. */
    uint64_t entries_added = 0;    /* All time count of elements added. */
    std::vector<StreamCGTrace> cgroup;
  };

  struct LoadTrace {
    std::vector<LoadBlob> arr;
    std::unique_ptr<StreamTrace> stream_trace;
  };

  // Contains the state of a pending partial read.
  //
  // This us used to load huge objects in parts (only loading a subset of
  // elements at a time) (see LoadKeyValPair).
  struct PendingRead {
    // Number of elements in the object to reserve.
    //
    // Used to reserve the elements in a huge object up front, then append
    // in next loads.
    size_t reserve = 0;

    // Number of elements remaining in the object.
    size_t remaining = 0;
  };

  struct LoadConfig {
    bool chunked = false;   // Big value streamed incrementally
    size_t reserve = 0;     // Number of elements to reserve to optimize big value load
    bool append = false;    // Append chunk to existing object
    bool finalize = false;  // Last portion of chunked stream, finalize object
  };

  class OpaqueObjLoader;

  io::Result<uint8_t> FetchType();

  template <typename T> io::Result<T> FetchInt();

  static std::error_code FromOpaque(const OpaqueObj& opaque, LoadConfig config, PrimeValue* pv);

  io::Result<uint64_t> LoadLen(bool* is_encoded);
  std::error_code FetchBuf(size_t size, void* dest);

  io::Result<std::string> FetchGenericString();
  io::Result<std::string> FetchLzfStringObject();
  io::Result<std::string> FetchIntegerObject(int enctype);

  io::Result<double> FetchBinaryDouble();
  io::Result<double> FetchDouble();

  ::io::Result<std::string> ReadKey();

  std::error_code ReadObj(int rdbtype, OpaqueObj* dest);
  std::error_code ReadStringObj(RdbVariant* rdb_variant, bool big_string_split = false);
  std::error_code ReadRemainingString(RdbVariant* dest);
  ::io::Result<long long> ReadIntObj(int encoding);
  ::io::Result<LzfString> ReadLzf();

  ::io::Result<OpaqueObj> ReadSet(int rdbtype);
  ::io::Result<OpaqueObj> ReadIntSet();
  ::io::Result<OpaqueObj> ReadGeneric(int rdbtype);
  ::io::Result<OpaqueObj> ReadHMap(int rdbtype);
  ::io::Result<OpaqueObj> ReadZSet(int rdbtype);
  ::io::Result<OpaqueObj> ReadListQuicklist(int rdbtype);
  ::io::Result<OpaqueObj> ReadStreams(int rdbtype);
  ::io::Result<OpaqueObj> ReadRedisJson();
  ::io::Result<OpaqueObj> ReadSBFImpl(bool chunking);
  ::io::Result<OpaqueObj> ReadSBF();
  ::io::Result<OpaqueObj> ReadSBF2();
  ::io::Result<OpaqueObj> ReadCMS();

  std::error_code SkipModuleData();
  std::error_code HandleCompressedBlob(int op_type);
  std::error_code HandleCompressedBlobFinish();
  std::error_code AllocateDecompressOnce(int op_type);

  std::error_code HandleJournalBlob(Service* service);

  static size_t StrLen(const RdbVariant& tset);

  std::error_code EnsureRead(size_t min_sz);

  std::error_code EnsureReadInternal(size_t min_to_read);

  static void CopyStreamId(const StreamID& src, struct streamID* dest);

  base::IoBuf* mem_buf_ = nullptr;
  base::IoBuf origin_mem_buf_;
  ::io::Source* src_ = nullptr;

  size_t bytes_read_ = 0;
  size_t source_limit_ = SIZE_MAX;
  base::PODArray<uint8_t> compr_buf_;
  std::unique_ptr<detail::DecompressImpl> decompress_impl_;
  JournalReader journal_reader_{nullptr, 0};
  std::optional<uint64_t> journal_offset_ = std::nullopt;
  RdbVersion rdb_version_ = RDB_VERSION;
  PendingRead pending_read_;
};

class RdbLoader : protected RdbLoaderBase {
 public:
  // load_context is shared across all RdbLoader instances in a load session.
  explicit RdbLoader(Service* service, RdbLoadContext* load_context, std::string snapshot_id = {});

  ~RdbLoader();

  void SetOverrideExistingKeys(bool override) {
    override_existing_keys_ = override;
  }

  void SetLoadUnownedSlots(bool load_unowned) {
    load_unowned_slots_ = load_unowned;
  }

  // Sets shard count of the snapshot being loaded.
  // Does not necessarily match the shard count of the current instance.
  void SetShardCount(uint32_t shard_cnt) {
    shard_count_ = shard_cnt;
  }

  std::error_code Load(::io::Source* src);

  void set_source_limit(size_t n) {
    source_limit_ = n;
  }

  ::io::Bytes Leftover() const {
    return mem_buf_->InputBuffer();
  }

  size_t bytes_read() const {
    return bytes_read_;
  }

  size_t keys_loaded() const {
    return keys_loaded_;
  }

  // returns time in seconds.
  double load_time() const {
    return load_time_;
  }

  void stop() {
    stop_early_.store(true);
  }

  void Pause(bool pause) {
    pause_ = pause;
  }

  const std::string& GetSnapshotId() const {
    return snapshot_id_;
  }

  // Return the offset that was received with a RDB_OPCODE_JOURNAL_OFFSET command,
  // or 0 if no offset was received.
  std::optional<uint64_t> journal_offset() const {
    return journal_offset_;
  }

  // Set callback for receiving RDB_OPCODE_FULLSYNC_END.
  // This opcode is used by a master instance to notify it finished streaming static data
  // and is ready to switch to stable state sync.
  void SetFullSyncCutCb(std::function<void()> cb) {
    full_sync_cut_cb = std::move(cb);
  }

  uint32_t shard_id() const {
    return shard_id_;
  }

  uint32_t shard_count() const {
    return shard_count_;
  }

 private:
  struct Item {
    std::string key;
    OpaqueObj val;
    uint64_t expire_ms;
    std::atomic<Item*> next;
    bool is_sticky = false;
    bool has_mc_flags = false;
    uint32_t mc_flags = 0;

    LoadConfig load_config;

    friend void MPSC_intrusive_store_next(Item* dest, Item* nxt) {
      dest->next.store(nxt, std::memory_order_release);
    }

    friend Item* MPSC_intrusive_load_next(const Item& src) {
      return src.next.load(std::memory_order_acquire);
    }
  };

  using ItemsBuf = std::vector<Item*>;

  struct ObjSettings;

  std::error_code LoadKeyValPair(int type, ObjSettings* settings);
  // Returns whether to discard the read key pair.
  bool ShouldDiscardKey(std::string_view key, const ObjSettings& settings) const;

  std::error_code HandleAux();

  std::error_code VerifyChecksum();

  void FinishLoad(absl::Time start_time, size_t* keys_loaded);

  void FlushShardAsync(ShardId sid);
  void FlushAllShards();

  void LoadItemsBuffer(DbIndex db_ind, const ItemsBuf& ib);

  void CreateObjectOnShard(const DbContext& db_cntx, const Item* item, DbSlice* db_slice);

  void LoadScriptFromAux(std::string&& value);

  // Load index definition from RESP string describing it in FT.CREATE format,
  // issues an FT.CREATE call, but does not start indexing
  void LoadSearchIndexDefFromAux(std::string&& value);

  // Load HNSW index metadata from JSON, sets metadata on the GlobalHnswIndexRegistry
  void LoadHnswIndexMetadataFromAux(std::string&& value);

  // Load synonyms from RESP string and issue FT.SYNUPDATE call
  void LoadSearchSynonymsFromAux(std::string&& value);

  // Restore HNSW vector index graph from serialized node data.
  std::error_code RestoreVectorIndex(std::string_view index_key, std::string_view index_name,
                                     std::string_view field_name, uint64_t elements_number);

  // Load HNSW vector index nodes into a vector for deferred restoration.
  std::error_code LoadVectorIndexNodes(uint64_t elements_number,
                                       std::vector<search::HnswNodeData>* nodes);

  // Skip over serialized HNSW vector index node data without restoring.
  std::error_code SkipVectorIndex(std::string_view index_key, uint64_t elements_number);

  Service* service_;
  RdbLoadContext* load_context_;

  std::string snapshot_id_;
  bool override_existing_keys_ = false;
  bool load_unowned_slots_ = false;
  bool rdb_ignore_expiry_;
  const bool deserialize_hnsw_index_;
  uint32_t shard_id_ = UINT32_MAX;
  uint32_t shard_count_ = 0;
  size_t table_used_memory_ = 0;
  ScriptMgr* script_mgr_;
  std::vector<ItemsBuf> shard_buf_;

  size_t keys_loaded_ = 0;
  double load_time_ = 0;

  DbIndex cur_db_index_ = 0;
  bool pause_ = false;
  bool is_tiered_enabled_ = false;
  AggregateError ec_;

  // We use atomics here because shard threads can notify RdbLoader fiber from another thread
  // that it should stop early.
  std::atomic_bool stop_early_{false};

  // Callback when receiving RDB_OPCODE_FULLSYNC_END
  std::function<void()> full_sync_cut_cb;

  // A free pool of allocated unused items.
  base::MPSCIntrusiveQueue<Item> item_queue_;

  // Map of currently chunked big values
  std::unordered_map<std::string, std::unique_ptr<PrimeValue>> now_chunked_;
  base::SpinLock now_chunked_mu_;  // guards now_chunked_

  std::string last_key_loaded_;
};

}  // namespace dfly


================================================
FILE: src/server/rdb_load_context.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/rdb_load_context.h"

#include <absl/container/flat_hash_set.h>
#include <absl/strings/match.h>

#include <algorithm>
#include <limits>

#include "base/logging.h"
#include "facade/redis_parser.h"
#include "facade/reply_capture.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/search/doc_index.h"
#include "server/search/global_hnsw_index.h"
#include "server/sharding.h"

namespace dfly {

namespace {

constexpr search::GlobalDocId kInvalidRemapGid = std::numeric_limits<search::GlobalDocId>::max();

// index_name -> master_shard_id -> new_global_ids indexed by old doc_id
using HnswRemapTable =
    absl::flat_hash_map<std::string,
                        absl::flat_hash_map<uint32_t, std::vector<search::GlobalDocId>>>;

// vector indexed by shard_id; per-shard map from index_name to keys in doc_id order
using PerShardMappings = std::vector<absl::flat_hash_map<std::string, std::vector<std::string>>>;

// Assigns new global_ids to each (key, old_doc_id) pair, distributing keys to their target
// shards. Returns a table mapping old (index, master_shard, old_doc_id) -> new_global_id.
HnswRemapTable BuildRemapTable(
    const absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>>& index_mappings,
    ShardId new_shard_count) {
  HnswRemapTable remap_table;
#ifdef WITH_SEARCH
  absl::flat_hash_map<std::string, absl::flat_hash_map<uint32_t, search::DocId>> doc_id_counters;

  for (const auto& [master_shard_id, pim_vec] : index_mappings) {
    for (const auto& pim : pim_vec) {
      auto& vec = remap_table[pim.index_name][master_shard_id];
      auto& counters = doc_id_counters[pim.index_name];

      // Pre-allocate to max old_doc_id in one shot, avoiding O(N²) repeated resizes when
      // doc_ids arrive in increasing order.
      search::DocId max_id = 0;
      for (const auto& [key, old_doc_id] : pim.mappings) {
        max_id = std::max(max_id, old_doc_id);
      }
      vec.assign(max_id + 1, kInvalidRemapGid);

      for (const auto& [key, old_doc_id] : pim.mappings) {
        ShardId new_shard_id = Shard(key, new_shard_count);
        // Counter starts at 0 for each (index, shard) — equivalent to DocKeyIndex::Add() on a
        // fresh index (free_ids_ empty → id = last_id_++). DocKeyIndex::Restore() is later called
        // with these exact keys in doc_id order, so the key_index stays consistent with the
        // global_ids stored in the remapped HNSW graph.
        search::DocId new_doc_id = counters[new_shard_id]++;
        vec[old_doc_id] = search::CreateGlobalDocId(new_shard_id, new_doc_id);
      }
    }
  }
#endif
  return remap_table;
}

// Remaps global_ids in deferred HNSW nodes and restores the graphs.
// Returns the set of index names that failed restoration (to be excluded from key mappings).
absl::flat_hash_set<std::string> RemapAndRestoreHnswGraphs(
    std::vector<PendingHnswNodes>& pending_nodes,
    const std::vector<PendingHnswMetadata>& hnsw_metadata, const HnswRemapTable& remap_table) {
  absl::flat_hash_set<std::string> failed_indices;
#ifdef WITH_SEARCH
  for (auto& pn : pending_nodes) {
    auto remap_it = remap_table.find(pn.index_name);

    auto hnsw_index = GlobalHnswIndexRegistry::Instance().Get(pn.index_name, pn.field_name);
    if (!hnsw_index) {
      LOG(ERROR) << "HNSW index not found for deferred restoration: " << pn.index_name << ":"
                 << pn.field_name << ". Will rebuild from scratch.";
      failed_indices.insert(pn.index_name);
      continue;
    }

    if (remap_it == remap_table.end()) {
      LOG(WARNING) << "No remap table for index " << pn.index_name << ":" << pn.field_name
                   << " (no key mappings). Will rebuild from scratch.";
      failed_indices.insert(pn.index_name);
      continue;
    }

    size_t remapped = 0;
    for (auto& node : pn.nodes) {
      auto [shard_id, doc_id] = search::DecomposeGlobalDocId(node.global_id);
      auto shard_it = remap_it->second.find(shard_id);
      if (shard_it != remap_it->second.end() && doc_id < shard_it->second.size()) {
        search::GlobalDocId new_gid = shard_it->second[doc_id];
        if (new_gid != kInvalidRemapGid) {
          node.global_id = new_gid;
          ++remapped;
        }
      }
    }

    if (remapped != pn.nodes.size()) {
      LOG(WARNING) << "Incomplete remap for HNSW index " << pn.index_name << ":" << pn.field_name
                   << " (" << remapped << "/" << pn.nodes.size()
                   << " nodes). Will rebuild from scratch.";
      failed_indices.insert(pn.index_name);
      continue;
    }

    const PendingHnswMetadata* phm_ptr = nullptr;
    for (const auto& phm : hnsw_metadata) {
      if (phm.index_name == pn.index_name && phm.field_name == pn.field_name) {
        phm_ptr = &phm;
        break;
      }
    }
    DCHECK(phm_ptr) << "HNSW metadata missing for " << pn.index_name << ":" << pn.field_name;

    hnsw_index->RestoreFromNodes(pn.nodes, phm_ptr->metadata);
    LOG(INFO) << "Restored HNSW index " << pn.index_name << ":" << pn.field_name << " with "
              << pn.nodes.size() << " nodes (" << remapped << " global_ids remapped)";
  }
#endif
  return failed_indices;
}

// Uses the remap table to distribute keys to their target shards.
// Each shard's entry maps index_name -> keys in new doc_id order (vector index = doc_id),
// matching the order assigned by BuildRemapTable (same iteration over index_mappings).
PerShardMappings PreDistributeKeyMappings(
    const absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>>& index_mappings,
    const HnswRemapTable& remap_table, ShardId new_shard_count) {
  PerShardMappings per_shard(new_shard_count);

  for (const auto& [master_shard_id, pim_vec] : index_mappings) {
    for (const auto& pim : pim_vec) {
      auto idx_it = remap_table.find(pim.index_name);
      if (idx_it == remap_table.end())
        continue;
      auto shard_it = idx_it->second.find(master_shard_id);
      if (shard_it == idx_it->second.end())
        continue;
      const auto& remap_vec = shard_it->second;

      for (const auto& [key, old_doc_id] : pim.mappings) {
        if (old_doc_id >= remap_vec.size())
          continue;
        search::GlobalDocId new_gid = remap_vec[old_doc_id];
        if (new_gid == kInvalidRemapGid)
          continue;
        ShardId new_shard_id = search::DecomposeGlobalDocId(new_gid).first;
        per_shard[new_shard_id][pim.index_name].push_back(key);
      }
    }
  }

  return per_shard;
}

}  // namespace

void LoadSearchCommandFromAux(Service* service, std::string&& def, std::string_view command_name,
                              std::string_view error_context, bool add_NX) {
  facade::CapturingReplyBuilder crb;

  ConnectionContext cntx{nullptr, acl::UserCredentials{}};
  cntx.is_replicating = true;
  cntx.journal_emulated = true;
  cntx.skip_acl_validation = true;
  cntx.ns = &namespaces->GetDefaultNamespace();

  uint32_t consumed = 0;
  facade::RespVec resp_vec;
  facade::RedisParser parser;

  // Prepend a whitespace so names starting with ':' are treated as names, not RESP tokens.
  def.insert(def.begin(), ' ');

  // Add resp terminator
  constexpr std::string_view kRespTerminator = "\r\n";
  def += kRespTerminator;

  std::string_view printable_def{def.data(), def.size() - kRespTerminator.size()};

  io::MutableBytes buffer{reinterpret_cast<uint8_t*>(def.data()), def.size()};
  auto res = parser.Parse(buffer, &consumed, &resp_vec);

  if (res != facade::RedisParser::Result::OK) {
    LOG(ERROR) << "Bad " << error_context << ": " << printable_def;
    return;
  }

  // Temporary migration fix for backwards compatibility with old snapshots where TAG fields were
  // serialized as "TAG SORTABLE SEPARATOR x" but parser expects "TAG SEPARATOR x SORTABLE".
  // Reorder arguments if needed.
  // TODO: Remove this workaround after Apr 2026.
  for (size_t i = 0; i + 2 < resp_vec.size(); ++i) {
    std::string_view cur = resp_vec[i].GetView();
    std::string_view next = resp_vec[i + 1].GetView();
    if (absl::EqualsIgnoreCase(cur, "SORTABLE") && absl::EqualsIgnoreCase(next, "SEPARATOR")) {
      // SORTABLE SEPARATOR x -> SEPARATOR x SORTABLE
      std::swap(resp_vec[i], resp_vec[i + 1]);      // SEPARATOR SORTABLE x
      std::swap(resp_vec[i + 1], resp_vec[i + 2]);  // SEPARATOR x SORTABLE
    }
  }

  // Prepend command name (FT.CREATE or FT.SYNUPDATE)
  CommandContext cntx_cmd;
  cntx_cmd.Init(&crb, &cntx);

  cntx_cmd.PushArg(command_name);
  cntx_cmd.PushArg(resp_vec[0].GetView());  // index name
  if (add_NX) {
    cntx_cmd.PushArg("NX");
  }
  for (unsigned i = 1; i < resp_vec.size(); i++) {
    cntx_cmd.PushArg(resp_vec[i].GetView());
  }
  service->DispatchCommand(facade::ParsedArgs{cntx_cmd}, &cntx_cmd,
                           facade::AsyncPreference::ONLY_SYNC);

  auto response = crb.Take();
  if (auto err = facade::CapturingReplyBuilder::TryExtractError(response); err) {
    LOG(ERROR) << "Bad " << error_context << ": " << def << " " << err->first;
  }
}

void RdbLoadContext::AddPendingSynonymCommand(std::string cmd) {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  pending_synonym_cmds_.push_back(std::move(cmd));
}

void RdbLoadContext::AddPendingIndexMapping(uint32_t shard_id, PendingIndexMapping mapping) {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  pending_index_mappings_[shard_id].emplace_back(std::move(mapping));
}

void RdbLoadContext::AddPendingHnswMetadata(PendingHnswMetadata metadata) {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  pending_hnsw_metadata_.emplace_back(std::move(metadata));
}

void RdbLoadContext::AddPendingHnswNodes(PendingHnswNodes nodes) {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  pending_hnsw_nodes_.emplace_back(std::move(nodes));
}

void RdbLoadContext::SetMasterShardCount(uint32_t count) {
  master_shard_count_ = count;
}

std::optional<search::HnswIndexMetadata> RdbLoadContext::FindHnswMetadata(
    std::string_view index_name, std::string_view field_name) const {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  for (const auto& phm : pending_hnsw_metadata_) {
    if (phm.index_name == index_name && phm.field_name == field_name) {
      return phm.metadata;
    }
  }
  return std::nullopt;
}

std::vector<std::string> RdbLoadContext::TakePendingSynonymCommands() {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  std::vector<std::string> result;
  result.swap(pending_synonym_cmds_);
  return result;
}

absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>>
RdbLoadContext::TakePendingIndexMappings() {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  decltype(pending_index_mappings_) result;
  std::swap(result, pending_index_mappings_);
  return result;
}

std::vector<PendingHnswNodes> RdbLoadContext::TakePendingHnswNodes() {
  util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
  return std::move(pending_hnsw_nodes_);
}

RdbLoadContext::PerShardMappings RdbLoadContext::RemapHnswForDifferentShardCount(
    const absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>>& index_mappings,
    std::vector<PendingHnswNodes>& pending_nodes,
    const std::vector<PendingHnswMetadata>& hnsw_metadata) {
  const ShardId new_shard_count = shard_set->size();

  // Build remap table: index_name -> master_shard_id -> new_global_ids indexed by old doc_id.
  // Freed when this function returns.
  HnswRemapTable remap_table = BuildRemapTable(index_mappings, new_shard_count);

  // Remap global_ids, restore HNSW graphs; failed indices are excluded from key mappings.
  auto failed = RemapAndRestoreHnswGraphs(pending_nodes, hnsw_metadata, remap_table);
  for (const auto& name : failed) {
    remap_table.erase(name);
  }

  // Pre-distribute key mappings per target shard; keys in doc_id order (index = doc_id).
  return PreDistributeKeyMappings(index_mappings, remap_table, new_shard_count);
}

void RdbLoadContext::PerformPostLoad(Service* service, bool is_error) {
#ifdef WITH_SEARCH
  const CommandId* cmd = service->FindCmd("FT.CREATE");
  if (cmd == nullptr)  // In case search module is disabled
    return;

  std::vector<std::string> synonym_cmds = TakePendingSynonymCommands();
  auto index_mappings = TakePendingIndexMappings();
  auto pending_nodes = TakePendingHnswNodes();

  // Extract remaining shared state under lock. After this, no member access is needed.
  std::vector<PendingHnswMetadata> hnsw_metadata;
  {
    util::fb2::LockGuard<util::fb2::Mutex> lk(mu_);
    hnsw_metadata.swap(pending_hnsw_metadata_);
  }
  uint32_t master_shards = master_shard_count_;

  bool has_hnsw_restore = !hnsw_metadata.empty();

  if (is_error)
    return;

  // When shard counts differ, remap HNSW global_ids and redistribute key mappings on-the-fly.
  bool shard_count_differs = master_shards != 0 && master_shards != shard_set->size();

  if (shard_count_differs && !index_mappings.empty()) {
    // Remaps HNSW global_ids, restores HNSW graphs, and pre-distributes key mappings by target
    // shard. The internal remap table is local to the function and freed when it returns.
    auto per_shard_mappings =
        RemapHnswForDifferentShardCount(index_mappings, pending_nodes, hnsw_metadata);

    // Each shard reads only its own pre-built slice — no per-shard filtering of all N keys.
    shard_set->AwaitRunningOnShardQueue([&per_shard_mappings](EngineShard* es) {
      for (const auto& [name, keys] : per_shard_mappings[es->shard_id()]) {
        if (auto* index = es->search_indices()->GetIndex(name); index) {
          index->RestoreKeyIndex(keys);
          VLOG(1) << "Restored " << keys.size() << " key mappings for index " << name
                  << " on shard " << es->shard_id();
        }
      }
    });
  } else {
    if (shard_count_differs && !pending_nodes.empty()) {
      LOG(WARNING) << "Have " << pending_nodes.size()
                   << " deferred HNSW node sets but no key mappings for remapping. "
                      "Affected indices will be rebuilt from scratch.";
    }

    if (!index_mappings.empty()) {
      shard_set->AwaitRunningOnShardQueue([&index_mappings](EngineShard* es) {
        auto it = index_mappings.find(es->shard_id());
        if (it == index_mappings.end())
          return;
        for (const auto& pim : it->second) {
          if (auto* index = es->search_indices()->GetIndex(pim.index_name); index) {
            index->RestoreKeyIndex(pim.mappings);
            VLOG(1) << "Restored " << pim.mappings.size() << " key mappings for index "
                    << pim.index_name << " on shard " << es->shard_id();
          }
        }
      });
    }
  }
  // RestoreKeyIndex (above) and RebuildAllIndices (below) run in separate sequential
  // AwaitRunningOnShardQueue calls, so there is no parallel index build that could interfere
  // with the doc_ids assigned during key mapping restoration.
  shard_set->AwaitRunningOnShardQueue([has_hnsw_restore](EngineShard* es) {
    OpArgs op_args{es, nullptr,
                   DbContext{&namespaces->GetDefaultNamespace(), 0, GetCurrentTimeMs()}};
    es->search_indices()->RebuildAllIndices(op_args, has_hnsw_restore);
  });

  // Now execute all pending synonym commands after indices are rebuilt
  for (auto& syn_cmd : synonym_cmds) {
    LoadSearchCommandFromAux(service, std::move(syn_cmd), "FT.SYNUPDATE", "synonym definition");
  }

  // Wait until index building ends
  shard_set->RunBlockingInParallel(
      [](EngineShard* es) { es->search_indices()->BlockUntilConstructionEnd(); });
#endif
}

}  // namespace dfly


================================================
FILE: src/server/rdb_load_context.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>

#include <optional>
#include <string>
#include <vector>

#include "core/search/base.h"
#include "core/search/hnsw_index.h"
#include "util/fibers/synchronization.h"

namespace dfly {

class Service;

// Dispatches a search command (FT.CREATE / FT.SYNUPDATE) from a serialized AUX string.
void LoadSearchCommandFromAux(Service* service, std::string&& def, std::string_view command_name,
                              std::string_view error_context, bool add_NX = false);

// Pending index key-to-DocId mappings to apply after indices are created.
struct PendingIndexMapping {
  std::string index_name;
  std::vector<std::pair<std::string, search::DocId>> mappings;
};

// HNSW metadata loaded from "hnsw-index-metadata" AUX fields.
struct PendingHnswMetadata {
  std::string index_name;
  std::string field_name;
  search::HnswIndexMetadata metadata;
};

// Deferred HNSW graph nodes for restoration when shard counts differ.
struct PendingHnswNodes {
  std::string index_name;
  std::string field_name;
  std::vector<search::HnswNodeData> nodes;
};

// Shared context for collecting search-related state across multiple RdbLoader instances
// during a single load session. Consumed by PerformPostLoad after all loaders finish.
//
// Thread-safe: all mutating methods lock internally.
class RdbLoadContext {
 public:
  RdbLoadContext() = default;

  RdbLoadContext(const RdbLoadContext&) = delete;
  RdbLoadContext& operator=(const RdbLoadContext&) = delete;

  void AddPendingSynonymCommand(std::string cmd);
  void AddPendingIndexMapping(uint32_t shard_id, PendingIndexMapping mapping);
  void AddPendingHnswMetadata(PendingHnswMetadata metadata);
  void AddPendingHnswNodes(PendingHnswNodes nodes);
  void SetMasterShardCount(uint32_t count);

  std::optional<search::HnswIndexMetadata> FindHnswMetadata(std::string_view index_name,
                                                            std::string_view field_name) const;

  // Performs post load procedures while still remaining in global LOADING state.
  // Called once immediately after loading the snapshot / full sync succeeded from the coordinator.
  void PerformPostLoad(Service* service, bool is_error = false);

 private:
  std::vector<std::string> TakePendingSynonymCommands();
  absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>> TakePendingIndexMappings();
  std::vector<PendingHnswNodes> TakePendingHnswNodes();

  // Pre-distributed key mappings indexed by target shard_id.
  // Per-shard: index_name -> keys in doc_id order (vector index = doc_id).
  using PerShardMappings = std::vector<absl::flat_hash_map<std::string, std::vector<std::string>>>;

  // Remaps HNSW node global_ids, restores HNSW graphs, and pre-distributes key mappings by
  // target shard. The internal remap table is local and freed when this function returns.
  // Failed indices are excluded from the returned mappings so they fall back to a full rebuild.
  PerShardMappings RemapHnswForDifferentShardCount(
      const absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>>& index_mappings,
      std::vector<PendingHnswNodes>& pending_nodes,
      const std::vector<PendingHnswMetadata>& hnsw_metadata);

  mutable util::fb2::Mutex mu_;
  std::vector<std::string> pending_synonym_cmds_ ABSL_GUARDED_BY(mu_);
  absl::flat_hash_map<uint32_t, std::vector<PendingIndexMapping>> pending_index_mappings_
      ABSL_GUARDED_BY(mu_);
  std::vector<PendingHnswMetadata> pending_hnsw_metadata_ ABSL_GUARDED_BY(mu_);
  std::vector<PendingHnswNodes> pending_hnsw_nodes_ ABSL_GUARDED_BY(mu_);
  uint32_t master_shard_count_ = 0;  // Set identically by all loaders from AUX field.
};

}  // namespace dfly


================================================
FILE: src/server/rdb_save.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/rdb_save.h"

#include <absl/cleanup/cleanup.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_format.h>

#include <queue>

extern "C" {
#include "redis/crc64.h"
#include "redis/intset.h"
#include "redis/listpack.h"
#include "redis/rdb.h"
#include "redis/stream.h"
#include "redis/util.h"
#include "redis/zmalloc.h"
}

#include "base/flags.h"
#include "base/logging.h"
#include "core/bloom.h"
#include "core/cms.h"
#include "core/json/json_object.h"
#include "core/qlist.h"
#include "core/search/hnsw_index.h"
#include "core/size_tracking_channel.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
#include "core/string_set.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/rdb_extensions.h"
#include "server/search/doc_index.h"
#include "server/search/global_hnsw_index.h"
#include "server/serializer_commons.h"
#include "server/snapshot.h"
#include "server/tiered_storage.h"
#include "server/tiering/common.h"
#include "util/fibers/simple_channel.h"

ABSL_FLAG(dfly::CompressionMode, compression_mode, dfly::CompressionMode::MULTI_ENTRY_LZ4,
          "set 0 for no compression,"
          "set 1 for single entry lzf compression,"
          "set 2 for multi entry zstd compression on df snapshot and single entry on rdb snapshot,"
          "set 3 for multi entry lz4 compression on df snapshot and single entry on rdb snapshot");

// Flip this value to 'true' in March 2026.
ABSL_FLAG(bool, rdb_sbf_chunked, false, "Enable new save format for saving SBFs in chunks.");

namespace dfly {

using namespace std;
using base::IoBuf;
using io::Bytes;

using namespace tiering::literals;

namespace {

/* Encodes the "value" argument as integer when it fits in the supported ranges
 * for encoded types. If the function successfully encodes the integer, the
 * representation is stored in the buffer pointer to by "enc" and the string
 * length is returned. Otherwise 0 is returned. */
unsigned EncodeInteger(long long value, uint8_t* enc) {
  if (value >= -(1 << 7) && value <= (1 << 7) - 1) {
    enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT8;
    enc[1] = value & 0xFF;
    return 2;
  }

  if (value >= -(1 << 15) && value <= (1 << 15) - 1) {
    enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT16;
    enc[1] = value & 0xFF;
    enc[2] = (value >> 8) & 0xFF;
    return 3;
  }

  constexpr long long k31 = (1LL << 31);
  if (value >= -k31 && value <= k31 - 1) {
    enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT32;
    enc[1] = value & 0xFF;
    enc[2] = (value >> 8) & 0xFF;
    enc[3] = (value >> 16) & 0xFF;
    enc[4] = (value >> 24) & 0xFF;
    return 5;
  }

  return 0;
}

/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space */
unsigned TryIntegerEncoding(string_view input, uint8_t* dest) {
  long long value;

  /* Check if it's possible to encode this value as a number */
  if (!absl::SimpleAtoi(input, &value))
    return 0;
  absl::AlphaNum alpha(value);

  /* If the number converted back into a string is not identical
   * then it's not possible to encode the string as integer */
  if (alpha.size() != input.size() || alpha.Piece() != input)
    return 0;

  return EncodeInteger(value, dest);
}

constexpr size_t kBufLen = 64_KB;
constexpr size_t kAmask = 4_KB - 1;
constexpr uint32_t kChannelLen = 2;

}  // namespace

bool AbslParseFlag(std::string_view in, dfly::CompressionMode* flag, std::string* err) {
  if (in == "0" || in == "NONE") {
    *flag = dfly::CompressionMode::NONE;
    return true;
  }
  if (in == "1" || in == "SINGLE_ENTRY") {
    *flag = dfly::CompressionMode::SINGLE_ENTRY;
    return true;
  }
  if (in == "2" || in == "MULTI_ENTRY_ZSTD") {
    *flag = dfly::CompressionMode::MULTI_ENTRY_ZSTD;
    return true;
  }
  if (in == "3" || in == "MULTI_ENTRY_LZ4") {
    *flag = dfly::CompressionMode::MULTI_ENTRY_LZ4;
    return true;
  }

  *err = absl::StrCat("Unknown value ", in, " for compression_mode flag");
  return false;
}

std::string AbslUnparseFlag(dfly::CompressionMode flag) {
  switch (flag) {
    case dfly::CompressionMode::NONE:
      return "NONE";
    case dfly::CompressionMode::SINGLE_ENTRY:
      return "SINGLE_ENTRY";
    case dfly::CompressionMode::MULTI_ENTRY_ZSTD:
      return "MULTI_ENTRY_ZSTD";
    case dfly::CompressionMode::MULTI_ENTRY_LZ4:
      return "MULTI_ENTRY_LZ4";
  }
  DCHECK(false) << "Unknown compression_mode flag value " << int(flag);
  return "NONE";
}

dfly::CompressionMode GetDefaultCompressionMode() {
  return absl::GetFlag(FLAGS_compression_mode);
}

uint8_t RdbObjectType(const CompactObj& pv) {
  unsigned type = pv.ObjType();
  unsigned compact_enc = pv.Encoding();
  switch (type) {
    case OBJ_STRING:
      return RDB_TYPE_STRING;
    case OBJ_LIST:
      return RDB_TYPE_LIST_QUICKLIST_2;
      break;
    case OBJ_SET:
      if (compact_enc == kEncodingIntSet)
        return RDB_TYPE_SET_INTSET;
      else if (compact_enc == kEncodingStrMap2) {
        if (((StringSet*)pv.RObjPtr())->ExpirationUsed())
          return RDB_TYPE_SET_WITH_EXPIRY;
        else
          return RDB_TYPE_SET;
      }
      break;
    case OBJ_ZSET:
      if (compact_enc == OBJ_ENCODING_LISTPACK)
        return RDB_TYPE_ZSET_LISTPACK;
      else if (compact_enc == OBJ_ENCODING_SKIPLIST)
        return RDB_TYPE_ZSET_2;
      break;
    case OBJ_HASH:
      if (compact_enc == kEncodingListPack)
        return RDB_TYPE_HASH_LISTPACK;
      else if (compact_enc == kEncodingStrMap2) {
        if (((StringMap*)pv.RObjPtr())->ExpirationUsed())
          return RDB_TYPE_HASH_WITH_EXPIRY;  // Incompatible with Redis
        else
          return RDB_TYPE_HASH;
      }
      break;
    case OBJ_STREAM:
      return RDB_TYPE_STREAM_LISTPACKS_3;
    case OBJ_MODULE:
      return RDB_TYPE_MODULE_2;
    case OBJ_JSON:
      return RDB_TYPE_JSON;
    case OBJ_SBF:
      return absl::GetFlag(FLAGS_rdb_sbf_chunked) ? RDB_TYPE_SBF2 : RDB_TYPE_SBF;
    case OBJ_CMS:
      return RDB_TYPE_CMS;
  }
  LOG(FATAL) << "Unknown encoding " << compact_enc << " for type " << type;
  return 0; /* avoid warning */
}

RdbSerializerBase::RdbSerializerBase(CompressionMode compression_mode)
    : compression_mode_(compression_mode), mem_buf_{4_KB}, tmp_buf_(nullptr) {
}

RdbSerializer::RdbSerializer(CompressionMode compression_mode, ConsumeFun consume_fun,
                             size_t flush_threshold)
    : RdbSerializerBase(compression_mode),
      consume_fun_(std::move(consume_fun)),
      flush_threshold_(flush_threshold) {
}

RdbSerializer::~RdbSerializer() {
  VLOG(2) << "compression mode: " << uint32_t(compression_mode_);
  if (compression_stats_) {
    VLOG(2) << "compression not effective: " << compression_stats_->compression_no_effective;
    VLOG(2) << "string compression skipped: " << compression_stats_->size_skip_count;
    VLOG(2) << "compression failed: " << compression_stats_->compression_failed;
    VLOG(2) << "compressed blobs:" << compression_stats_->compressed_blobs;
  }
}

std::error_code RdbSerializer::SaveValue(const PrimeValue& pv) {
  std::error_code ec;
  if (pv.ObjType() == OBJ_STRING) {
    auto opt_int = pv.TryGetInt();
    if (opt_int) {
      ec = SaveLongLongAsString(*opt_int);
    } else {
      if (pv.IsExternal()) {
        if (pv.IsCool()) {
          return SaveValue(pv.GetCool().record->value);
        }
        LOG(FATAL) << "External string not supported yet";
      } else {
        ec = SaveString(pv.GetSlice(&tmp_str_));
      }
    }
  } else {
    ec = SaveObject(pv);
  }
  return ec;
}

error_code RdbSerializer::SelectDb(uint32_t dbid) {
  if (dbid == last_entry_db_index_) {
    return error_code{};
  }
  last_entry_db_index_ = dbid;
  uint8_t buf[16];
  buf[0] = RDB_OPCODE_SELECTDB;
  unsigned enclen = WritePackedUInt(dbid, io::MutableBytes{buf}.subspan(1));
  return WriteRaw(Bytes{buf, enclen + 1});
}

// Called by snapshot
io::Result<uint8_t> RdbSerializer::SaveEntry(const PrimeKey& pk, const PrimeValue& pv,
                                             uint64_t expire_ms, uint32_t mc_flags, DbIndex dbid) {
  if (!pv.TagAllowsEmptyValue() && pv.Size() == 0) {
    string_view key = pk.GetSlice(&tmp_str_);
    LOG(DFATAL) << "SaveEntry skipped empty PrimeValue with key: " << key << " with tag "
                << static_cast<int>(pv.Tag());
    return 0;
  }

  DVLOG(3) << "Selecting " << dbid << " previous: " << last_entry_db_index_;
  auto ec = SelectDb(dbid);
  if (ec) {
    return make_unexpected(ec);
  }

  /* Save the expire time */
  if (expire_ms > 0) {
    uint8_t buf[16] = {RDB_OPCODE_EXPIRETIME_MS};
    absl::little_endian::Store64(buf + 1, expire_ms);
    if (auto ec = WriteRaw(Bytes{buf, 9}); ec)
      return make_unexpected(ec);
  }

  /* Save the key poperties */
  uint32_t df_mask_flags = pk.IsSticky() ? DF_MASK_FLAG_STICKY : 0;
  df_mask_flags |= pv.HasFlag() ? DF_MASK_FLAG_MC_FLAGS : 0;
  if (df_mask_flags != 0) {
    uint8_t buf[9] = {RDB_OPCODE_DF_MASK};
    absl::little_endian::Store32(buf + 1, df_mask_flags);
    size_t buf_size = 5;
    if (df_mask_flags & DF_MASK_FLAG_MC_FLAGS) {
      absl::little_endian::Store32(buf + buf_size, mc_flags);
      buf_size += 4;
    }
    if (auto ec = WriteRaw(Bytes{buf, buf_size}); ec)
      return make_unexpected(ec);
  }

  uint8_t rdb_type = RdbObjectType(pv);

  string_view key = pk.GetSlice(&tmp_str_);
  DVLOG(3) << ((void*)this) << ": Saving key/val start " << key << " in dbid=" << dbid;

  if (auto ec = WriteOpcode(rdb_type); ec)
    return make_unexpected(ec);

  if (auto ec = SaveString(key); ec)
    return make_unexpected(ec);

  if (auto ec = SaveValue(pv); ec) {
    LOG(ERROR) << "Problems saving value for key " << key << " in dbid=" << dbid;
    return make_unexpected(ec);
  }

  // We flush here because if the next element in the bucket we are serializing is a container,
  // it will first serialize the first entry and then flush the internal buffer, even if
  // crossed the limit.
  PushToConsumerIfNeeded(FlushState::kFlushEndEntry);
  return rdb_type;
}

error_code RdbSerializer::SaveObject(const PrimeValue& pv) {
  unsigned obj_type = pv.ObjType();
  CHECK_NE(obj_type, OBJ_STRING);

  if (obj_type == OBJ_LIST) {
    return SaveListObject(pv);
  }

  if (obj_type == OBJ_SET) {
    return SaveSetObject(pv);
  }

  if (obj_type == OBJ_HASH) {
    return SaveHSetObject(pv);
  }

  if (obj_type == OBJ_ZSET) {
    return SaveZSetObject(pv);
  }

  if (obj_type == OBJ_STREAM) {
    return SaveStreamObject(pv);
  }

  if (obj_type == OBJ_JSON) {
    return SaveJsonObject(pv);
  }

  if (obj_type == OBJ_SBF) {
    return SaveSBFObject(pv);
  }

  if (obj_type == OBJ_CMS) {
    return SaveCMSObject(pv);
  }

  LOG(ERROR) << "Not implemented " << obj_type;
  return make_error_code(errc::function_not_supported);
}

error_code RdbSerializer::SaveListObject(const PrimeValue& pv) {
  /* Save a list value */
  if (pv.Encoding() == kEncodingListPack) {
    uint8_t* lp = (uint8_t*)pv.RObjPtr();
    size_t len = 1;  // 1 node
    RETURN_ON_ERR(SaveLen(len));

    // Node 1
    RETURN_ON_ERR(SaveLen(QUICKLIST_NODE_CONTAINER_PACKED));
    size_t lp_bytes = lpBytes(lp);
    RETURN_ON_ERR(SaveString(lp, lp_bytes));

    PushToConsumerIfNeeded(FlushState::kFlushEndEntry);
    return error_code{};
  }

  DCHECK_EQ(pv.Encoding(), kEncodingQL2);
  QList* ql = reinterpret_cast<QList*>(pv.RObjPtr());
  const QList::Node* node = ql->Head();
  size_t len = ql->node_count();

  RETURN_ON_ERR(SaveLen(len));

  while (node) {
    DVLOG(3) << "QL node (encoding/container/sz): " << node->encoding << "/" << node->container
             << "/" << node->sz;

    // Use listpack encoding
    RETURN_ON_ERR(SaveLen(node->container));
    if (node->IsCompressed()) {
      void* data;
      size_t compress_len = node->GetLZF(&data);
      // TODO: LZ4 compression mode is not enabled for list objects yet.
      // If it will be enabled in the future, we need to adjust here accordingly.
      RETURN_ON_ERR(SaveLzfBlob(Bytes{reinterpret_cast<uint8_t*>(data), compress_len}, node->sz));
    } else {
      RETURN_ON_ERR(SaveString(node->entry, node->sz));
      FlushState flush_state = FlushState::kFlushMidEntry;
      if (node->next == nullptr)
        flush_state = FlushState::kFlushEndEntry;
      PushToConsumerIfNeeded(flush_state);
    }
    node = node->next;
  }
  return error_code{};
}

error_code RdbSerializer::SaveSetObject(const PrimeValue& obj) {
  if (obj.Encoding() == kEncodingStrMap2) {
    StringSet* set = (StringSet*)obj.RObjPtr();

    // We don't expire any data during serialization
    set->set_time(0);

    // due to we avoid expiring we can use UpperBoundSize() instead of SlowSize()
    RETURN_ON_ERR(SaveLen(set->UpperBoundSize()));
    for (auto it = set->begin(); it != set->end();) {
      RETURN_ON_ERR(SaveString(string_view{*it, sdslen(*it)}));
      if (set->ExpirationUsed()) {
        int64_t expiry = -1;
        if (it.HasExpiry())
          expiry = it.ExpiryTime();
        RETURN_ON_ERR(SaveLongLongAsString(expiry));
      }
      ++it;
      FlushState flush_state = FlushState::kFlushMidEntry;
      if (it == set->end())
        flush_state = FlushState::kFlushEndEntry;
      PushToConsumerIfNeeded(flush_state);
    }
    set->set_time(MemberTimeSeconds(GetCurrentTimeMs()));
  } else {
    CHECK_EQ(obj.Encoding(), kEncodingIntSet);
    intset* is = (intset*)obj.RObjPtr();
    size_t len = intsetBlobLen(is);

    RETURN_ON_ERR(SaveString(string_view{(char*)is, len}));
  }

  return error_code{};
}

error_code RdbSerializer::SaveHSetObject(const PrimeValue& pv) {
  DCHECK_EQ(OBJ_HASH, pv.ObjType());

  if (pv.Encoding() == kEncodingStrMap2) {
    StringMap* string_map = (StringMap*)pv.RObjPtr();

    // We don't expire any data during serialization
    string_map->set_time(0);

    // due to we avoid expiring we can use UpperBoundSize() instead of SlowSize()
    RETURN_ON_ERR(SaveLen(string_map->UpperBoundSize()));

    for (auto it = string_map->begin(); it != string_map->end();) {
      const auto& [k, v] = *it;
      RETURN_ON_ERR(SaveString(string_view{k, sdslen(k)}));
      RETURN_ON_ERR(SaveString(string_view{v, sdslen(v)}));
      if (string_map->ExpirationUsed()) {
        int64_t expiry = -1;
        if (it.HasExpiry())
          expiry = it.ExpiryTime();
        RETURN_ON_ERR(SaveLongLongAsString(expiry));
      }
      ++it;
      FlushState flush_state = FlushState::kFlushMidEntry;
      if (it == string_map->end())
        flush_state = FlushState::kFlushEndEntry;
      PushToConsumerIfNeeded(flush_state);
    }

    string_map->set_time(MemberTimeSeconds(GetCurrentTimeMs()));
  } else {
    CHECK_EQ(kEncodingListPack, pv.Encoding());

    uint8_t* lp = (uint8_t*)pv.RObjPtr();
    size_t lp_bytes = lpBytes(lp);
    RETURN_ON_ERR(SaveString((uint8_t*)lp, lp_bytes));
  }

  return error_code{};
}

error_code RdbSerializer::SaveZSetObject(const PrimeValue& pv) {
  DCHECK_EQ(OBJ_ZSET, pv.ObjType());
  if (pv.Encoding() == OBJ_ENCODING_SKIPLIST) {
    auto* zs = static_cast<detail::SortedMap*>(pv.RObjPtr());

    RETURN_ON_ERR(SaveLen(zs->Size()));
    std::error_code ec;

    const size_t total = zs->Size();
    size_t count = 0;

    // Iterate over the sorted map and save the key and score.
    // The order is important (from smallest to biggest) - so that the loader
    // will load the entries faster.
    zs->Iterate(0, total, false, [&](sds ele, double score) mutable {
      ec = SaveString(string_view{ele, sdslen(ele)});
      if (ec)
        return false;
      ec = SaveBinaryDouble(score);
      if (ec)
        return false;
      ++count;
      FlushState flush_state = FlushState::kFlushMidEntry;
      if (count == total)
        flush_state = FlushState::kFlushEndEntry;

      PushToConsumerIfNeeded(flush_state);
      return true;
    });
  } else {
    CHECK_EQ(pv.Encoding(), unsigned(OBJ_ENCODING_LISTPACK));
    uint8_t* lp = (uint8_t*)pv.RObjPtr();
    size_t lp_bytes = lpBytes(lp);

    RETURN_ON_ERR(SaveString((uint8_t*)lp, lp_bytes));
  }

  return error_code{};
}

error_code RdbSerializer::SaveStreamObject(const PrimeValue& pv) {
  /* Store how many listpacks we have inside the radix tree. */
  stream* s = (stream*)pv.RObjPtr();
  const size_t rax_size = raxSize(s->rax);

  RETURN_ON_ERR(SaveLen(rax_size));

  /* Serialize all the listpacks inside the radix tree as they are,
   * when loading back, we'll use the first entry of each listpack
   * to insert it back into the radix tree. */
  raxIterator ri;
  raxStart(&ri, s->rax);
  raxSeek(&ri, "^", NULL, 0);

  auto stop_listpacks_rax = absl::MakeCleanup([&] { raxStop(&ri); });

  for (size_t i = 0; raxNext(&ri); i++) {
    uint8_t* lp = (uint8_t*)ri.data;
    size_t lp_bytes = lpBytes(lp);

    RETURN_ON_ERR(SaveString((uint8_t*)ri.key, ri.key_len));
    RETURN_ON_ERR(SaveString(lp, lp_bytes));

    PushToConsumerIfNeeded(FlushState::kFlushMidEntry);
  }

  std::move(stop_listpacks_rax).Invoke();

  /* Save the number of elements inside the stream. We cannot obtain
   * this easily later, since our macro nodes should be checked for
   * number of items: not a great CPU / space tradeoff. */

  RETURN_ON_ERR(SaveLen(s->length));

  /* Save the last entry ID. */
  RETURN_ON_ERR(SaveLen(s->last_id.ms));
  RETURN_ON_ERR(SaveLen(s->last_id.seq));

  uint8_t rdb_type = RdbObjectType(pv);

  // 'first_id', 'max_deleted_entry_id' and 'entries_added' are added
  // in RDB_TYPE_STREAM_LISTPACKS_2
  if (rdb_type >= RDB_TYPE_STREAM_LISTPACKS_2) {
    /* Save the first entry ID. */
    RETURN_ON_ERR(SaveLen(s->first_id.ms));
    RETURN_ON_ERR(SaveLen(s->first_id.seq));

    /* Save the maximal tombstone ID. */
    RETURN_ON_ERR(SaveLen(s->max_deleted_entry_id.ms));
    RETURN_ON_ERR(SaveLen(s->max_deleted_entry_id.seq));

    /* Save the offset. */
    RETURN_ON_ERR(SaveLen(s->entries_added));
  }
  /* The consumer groups and their clients are part of the stream
   * type, so serialize every consumer group. */

  /* Save the number of groups. */
  size_t num_cgroups = s->cgroups ? raxSize(s->cgroups) : 0;
  RETURN_ON_ERR(SaveLen(num_cgroups));

  if (num_cgroups) {
    /* Serialize each consumer group. */
    raxStart(&ri, s->cgroups);
    raxSeek(&ri, "^", NULL, 0);

    auto stop_cgroups_rax = absl::MakeCleanup([&] { raxStop(&ri); });

    while (raxNext(&ri)) {
      streamCG* cg = (streamCG*)ri.data;

      /* Save the group name. */
      RETURN_ON_ERR(SaveString((uint8_t*)ri.key, ri.key_len));

      /* Last ID. */
      RETURN_ON_ERR(SaveLen(cg->last_id.ms));

      RETURN_ON_ERR(SaveLen(cg->last_id.seq));

      if (rdb_type >= RDB_TYPE_STREAM_LISTPACKS_2) {
        /* Save the group's logical reads counter. */
        RETURN_ON_ERR(SaveLen(cg->entries_read));
      }

      /* Save the global PEL. */
      RETURN_ON_ERR(SaveStreamPEL(cg->pel, true));

      /* Save the consumers of this group. */
      RETURN_ON_ERR(SaveStreamConsumers(rdb_type >= RDB_TYPE_STREAM_LISTPACKS_3, cg));
    }
  }

  PushToConsumerIfNeeded(FlushState::kFlushEndEntry);

  return error_code{};
}

error_code RdbSerializer::SaveJsonObject(const PrimeValue& pv) {
  auto json_string = pv.GetJson()->to_string();
  return SaveString(json_string);
}

std::error_code RdbSerializer::SaveSBFObject(const PrimeValue& pv) {
  SBF* sbf = pv.GetSBF();

  // options to allow format mutations in the future.
  RETURN_ON_ERR(SaveLen(0));  // options - reserved
  RETURN_ON_ERR(SaveBinaryDouble(sbf->grow_factor()));
  RETURN_ON_ERR(SaveBinaryDouble(sbf->fp_probability()));
  RETURN_ON_ERR(SaveLen(sbf->prev_size()));
  RETURN_ON_ERR(SaveLen(sbf->current_size()));
  RETURN_ON_ERR(SaveLen(sbf->max_capacity()));
  RETURN_ON_ERR(SaveLen(sbf->num_filters()));

  for (unsigned i = 0; i < sbf->num_filters(); ++i) {
    RETURN_ON_ERR(SaveLen(sbf->hashfunc_cnt(i)));

    string_view blob = sbf->data(i);
    if (absl::GetFlag(FLAGS_rdb_sbf_chunked)) {
      RETURN_ON_ERR(SaveLen(blob.size()));

      for (size_t offset = 0; offset < blob.size(); offset += kFilterChunkSize) {
        size_t chunk_len = std::min(kFilterChunkSize, blob.size() - offset);
        RETURN_ON_ERR(SaveString(blob.substr(offset, chunk_len)));
      }
    } else {
      RETURN_ON_ERR(SaveString(blob));
    }

    FlushState flush_state = FlushState::kFlushMidEntry;
    if ((i + 1) == sbf->num_filters())
      flush_state = FlushState::kFlushEndEntry;
    PushToConsumerIfNeeded(flush_state);
  }

  return {};
}

std::error_code RdbSerializer::SaveCMSObject(const PrimeValue& pv) {
  CMS* cms = pv.GetCMS();

  RETURN_ON_ERR(SaveLen(cms->width()));
  RETURN_ON_ERR(SaveLen(cms->depth()));
  RETURN_ON_ERR(SaveLen(cms->total_count()));

  size_t num_counters = cms->NumCounters();
  const int64_t* data = cms->Data();

  // Serialize counters as little-endian 64-bit values
  std::vector<uint64_t> buf(num_counters);
  for (size_t i = 0; i < num_counters; ++i) {
    absl::little_endian::Store64(&buf[i], static_cast<uint64_t>(data[i]));
  }
  RETURN_ON_ERR(
      WriteRaw(Bytes{reinterpret_cast<const uint8_t*>(buf.data()), buf.size() * sizeof(uint64_t)}));

  return {};
}

/* Save a long long value as either an encoded string or a string. */
error_code RdbSerializer::SaveLongLongAsString(int64_t value) {
  uint8_t buf[32];
  unsigned enclen = EncodeInteger(value, buf);
  if (enclen > 0) {
    return WriteRaw(Bytes{buf, enclen});
  }

  /* Encode as string */
  enclen = ll2string((char*)buf, 32, value);
  DCHECK_LT(enclen, 32u);

  RETURN_ON_ERR(SaveLen(enclen));
  return WriteRaw(Bytes{buf, enclen});
}

/* Saves a double for RDB 8 or greater, where IE754 binary64 format is assumed.
 * We just make sure the integer is always stored in little endian, otherwise
 * the value is copied verbatim from memory to disk.
 *
 * Return -1 on error, the size of the serialized value on success. */
error_code RdbSerializer::SaveBinaryDouble(double val) {
  static_assert(sizeof(val) == 8);
  const uint64_t* src = reinterpret_cast<const uint64_t*>(&val);
  uint8_t buf[8];
  absl::little_endian::Store64(buf, *src);

  return WriteRaw(Bytes{buf, sizeof(buf)});
}

error_code RdbSerializer::SaveStreamPEL(rax* pel, bool nacks) {
  /* Number of entries in the PEL. */

  RETURN_ON_ERR(SaveLen(raxSize(pel)));

  /* Save each entry. */
  raxIterator ri;
  raxStart(&ri, pel);
  raxSeek(&ri, "^", NULL, 0);
  auto cleanup = absl::MakeCleanup([&] { raxStop(&ri); });

  while (raxNext(&ri)) {
    /* We store IDs in raw form as 128 big big endian numbers, like
     * they are inside the radix tree key. */
    RETURN_ON_ERR(WriteRaw(Bytes{ri.key, sizeof(streamID)}));

    if (nacks) {
      streamNACK* nack = (streamNACK*)ri.data;
      uint8_t buf[8];
      absl::little_endian::Store64(buf, nack->delivery_time);
      RETURN_ON_ERR(WriteRaw(buf));
      RETURN_ON_ERR(SaveLen(nack->delivery_count));

      /* We don't save the consumer name: we'll save the pending IDs
       * for each consumer in the consumer PEL, and resolve the consumer
       * at loading time. */
    }
  }

  return error_code{};
}

error_code RdbSerializer::SaveStreamConsumers(bool save_active, streamCG* cg) {
  /* Number of consumers in this consumer group. */

  RETURN_ON_ERR(SaveLen(raxSize(cg->consumers)));

  /* Save each consumer. */
  raxIterator ri;
  raxStart(&ri, cg->consumers);
  raxSeek(&ri, "^", NULL, 0);
  auto cleanup = absl::MakeCleanup([&] { raxStop(&ri); });
  uint8_t buf[8];

  while (raxNext(&ri)) {
    streamConsumer* consumer = (streamConsumer*)ri.data;

    /* Consumer name. */
    RETURN_ON_ERR(SaveString(ri.key, ri.key_len));

    /* seen time. */
    absl::little_endian::Store64(buf, consumer->seen_time);
    RETURN_ON_ERR(WriteRaw(buf));

    if (save_active) {
      /* Active time. */
      absl::little_endian::Store64(buf, consumer->active_time);
      RETURN_ON_ERR(WriteRaw(buf));
    }
    /* Consumer PEL, without the ACKs (see last parameter of the function
     * passed with value of 0), at loading time we'll lookup the ID
     * in the consumer group global PEL and will put a reference in the
     * consumer local PEL. */

    RETURN_ON_ERR(SaveStreamPEL(consumer->pel, false));
  }

  return error_code{};
}

error_code RdbSerializer::SendEofAndChecksum() {
  VLOG(2) << "SendEof";
  /* EOF opcode */
  RETURN_ON_ERR(WriteOpcode(RDB_OPCODE_EOF));

  /* CRC64 checksum. It will be zero if checksum computation is disabled, the
   * loading code skips the check in this case. */
  uint8_t buf[8];
  uint64_t chksum = 0;

  absl::little_endian::Store64(buf, chksum);
  return WriteRaw(buf);
}

error_code RdbSerializer::SendJournalOffset(uint64_t journal_offset) {
  VLOG(2) << "SendJournalOffset";
  RETURN_ON_ERR(WriteOpcode(RDB_OPCODE_JOURNAL_OFFSET));
  uint8_t buf[sizeof(uint64_t)];
  absl::little_endian::Store64(buf, journal_offset);
  return WriteRaw(buf);
}

error_code RdbSerializer::SaveHNSWEntry(const search::HnswNodeData& node,
                                        absl::Span<uint8_t> tmp_buf) {
  // Binary format using little-endian encoding for efficiency:
  // - internal_id: 4 bytes (uint32_t)
  // - global_id: 8 bytes (uint64_t)
  // - level: 4 bytes (int)
  // - for each level (0 to level): links_num (4 bytes) + links (4 bytes each)

  size_t total_size = node.TotalSize();
  DCHECK_LE(total_size, tmp_buf.size());
  uint8_t* ptr = tmp_buf.data();

  absl::little_endian::Store32(ptr, static_cast<uint32_t>(node.internal_id));
  ptr += 4;
  absl::little_endian::Store64(ptr, node.global_id);
  ptr += 8;
  absl::little_endian::Store32(ptr, static_cast<uint32_t>(node.level));
  ptr += 4;

  for (const auto& level_links : node.levels_links) {
    absl::little_endian::Store32(ptr, static_cast<uint32_t>(level_links.size()));
    ptr += 4;
    for (uint32_t link : level_links) {
      absl::little_endian::Store32(ptr, link);
      ptr += 4;
    }
  }

  return WriteRaw(Bytes{tmp_buf.data(), total_size});
}

error_code RdbSerializerBase::SendFullSyncCut() {
  VLOG(1) << "SendFullSyncCut";
  RETURN_ON_ERR(WriteOpcode(RDB_OPCODE_FULLSYNC_END));

  // RDB_OPCODE_FULLSYNC_END followed by 8 bytes of 0.
  // The reason for this is that some opcodes require to have at least 8 bytes of data
  // in the read buffer when consuming the rdb data, and since RDB_OPCODE_FULLSYNC_END is one of
  // the last opcodes sent to replica, we respect this requirement by sending a blob of 8 bytes.
  uint8_t buf[8] = {0};
  return WriteRaw(buf);
}

std::error_code RdbSerializerBase::WriteOpcode(uint8_t opcode) {
  return WriteRaw(::io::Bytes{&opcode, 1});
}

size_t RdbSerializerBase::GetBufferCapacity() const {
  return mem_buf_.Capacity();
}

size_t RdbSerializerBase::GetTempBufferSize() const {
  return tmp_buf_.size();
}

error_code RdbSerializerBase::WriteRaw(const io::Bytes& buf) {
  mem_buf_.Reserve(mem_buf_.InputLen() + buf.size());
  IoBuf::Bytes dest = mem_buf_.AppendBuffer();
  memcpy(dest.data(), buf.data(), buf.size());
  mem_buf_.CommitWrite(buf.size());
  return error_code{};
}

string RdbSerializerBase::Flush(RdbSerializerBase::FlushState flush_state) {
  auto bytes = PrepareFlush(flush_state);
  if (bytes.empty())
    return {};

  if (bytes.size() > serialization_peak_bytes_) {
    serialization_peak_bytes_ = bytes.size();
  }

  DVLOG(2) << "FlushToSink " << bytes.size() << " bytes";

  string result(io::View(bytes));

  mem_buf_.ConsumeInput(bytes.size());

  return result;
}

string RdbSerializer::Flush(FlushState flush_state) {
  string res = RdbSerializerBase::Flush(flush_state);

  // After every flush we should write the DB index again because the blobs in the channel are
  // interleaved and multiple savers can correspond to a single writer (in case of single file rdb
  // snapshot)
  last_entry_db_index_ = kInvalidDbId;

  return res;
}

namespace {
using VersionBuffer = std::array<char, sizeof(uint16_t)>;
using CrcBuffer = std::array<char, sizeof(uint64_t)>;

VersionBuffer MakeRdbVersion() {
  VersionBuffer buf;
  buf[0] = RDB_SER_VERSION & 0xff;
  buf[1] = (RDB_SER_VERSION >> 8) & 0xff;
  return buf;
}

CrcBuffer MakeCheckSum(std::string_view dump_res, bool ignore_crc) {
  uint64_t chksum =
      ignore_crc ? 0 : crc64(0, reinterpret_cast<const uint8_t*>(dump_res.data()), dump_res.size());
  CrcBuffer buf;
  absl::little_endian::Store64(buf.data(), chksum);
  return buf;
}

void AppendFooter(bool ignore_crc, string* dest) {
  auto to_bytes = [dest](const auto& buf) { dest->append(buf.data(), buf.size()); };

  /* Write the footer, this is how it looks like:
   * ----------------+---------------------+---------------+
   * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 |
   * ----------------+---------------------+---------------+
   * RDB version and CRC are both in little endian.
   */
  const auto ver = MakeRdbVersion();
  to_bytes(ver);
  const auto crc = MakeCheckSum(*dest, ignore_crc);
  to_bytes(crc);
}
}  // namespace

string RdbSerializerBase::DumpValue(RdbSerializer* serializer, const PrimeValue& obj,
                                    bool ignore_crc) {
  CompressionMode serializer_used_compression_mode = serializer->compression_mode_;
  if (serializer_used_compression_mode != CompressionMode::NONE) {
    serializer->SetCompressionMode(CompressionMode::SINGLE_ENTRY);
  }

  // According to Redis code we need to
  // 1. Save the value itself - without the key
  // 2. Save footer: this include the RDB version and the CRC value for the message
  auto type = RdbObjectType(obj);
  DVLOG(2) << "We are going to dump object type: " << int(type);

  std::error_code ec = serializer->WriteOpcode(type);
  CHECK(!ec);
  ec = serializer->SaveValue(obj);
  CHECK(!ec);  // make sure that fully was successful
  string res = serializer->Flush(RdbSerializerBase::FlushState::kFlushMidEntry);
  CHECK(!res.empty());             // make sure that fully was successful
  AppendFooter(ignore_crc, &res);  // version and crc
  CHECK_GT(res.size(), 10u);

  serializer->SetCompressionMode(serializer_used_compression_mode);
  return res;
}

string RdbSerializerBase::DumpValue(const PrimeValue& obj, bool ignore_crc) {
  RdbSerializer serializer(GetDefaultCompressionMode());
  return DumpValue(&serializer, obj, ignore_crc);
}

size_t RdbSerializerBase::SerializedLen() const {
  return mem_buf_.InputLen();
}

io::Bytes RdbSerializerBase::PrepareFlush(RdbSerializerBase::FlushState flush_state) {
  size_t sz = mem_buf_.InputLen();
  if (sz == 0)
    return {};

  bool is_last_chunk = flush_state == FlushState::kFlushEndEntry;
  VLOG(2) << "PrepareFlush:" << is_last_chunk << " " << number_of_chunks_;
  if (is_last_chunk && number_of_chunks_ == 0) {
    if (compression_mode_ == CompressionMode::MULTI_ENTRY_ZSTD ||
        compression_mode_ == CompressionMode::MULTI_ENTRY_LZ4) {
      CompressBlob();
    }
  }

  number_of_chunks_ = is_last_chunk ? 0 : (number_of_chunks_ + 1);

  return mem_buf_.InputBuffer();
}

error_code RdbSerializerBase::WriteJournalEntry(std::string_view serialized_entry) {
  VLOG(2) << "WriteJournalEntry";
  RETURN_ON_ERR(WriteOpcode(RDB_OPCODE_JOURNAL_BLOB));
  RETURN_ON_ERR(SaveLen(1));
  RETURN_ON_ERR(SaveString(serialized_entry));
  return error_code{};
}

error_code RdbSerializerBase::SaveString(string_view val) {
  /* Try integer encoding */
  if (val.size() <= 11) {
    uint8_t buf[16];

    unsigned enclen = TryIntegerEncoding(val, buf);
    if (enclen > 0) {
      return WriteRaw(Bytes{buf, unsigned(enclen)});
    }
  }

  /* Try LZF compression - under 20 bytes it's unable to compress even
   * aaaaaaaaaaaaaaaaaa so skip it */
  size_t len = val.size();
  if ((compression_mode_ == CompressionMode::SINGLE_ENTRY) && (len > 20)) {
    size_t comprlen, outlen = len;
    tmp_buf_.resize(outlen + 1);

    // Due to stack constraints im fibers we can not allow large arrays on stack.
    // Therefore I am lazily allocating it on heap. It's not fixed in quicklist.
    if (!lzf_) {
      lzf_.reset(new LZF_HSLOT[1 << HLOG]);
    }

    /* We require at least 8 bytes compression for this to be worth it */
    comprlen = lzf_compress(val.data(), len, tmp_buf_.data(), outlen, lzf_.get());
    if (comprlen > 0 && comprlen < len - 8 && comprlen < size_t(len * 0.85)) {
      return SaveLzfBlob(Bytes{tmp_buf_.data(), comprlen}, len);
    }
  }

  /* Store verbatim */
  RETURN_ON_ERR(SaveLen(len));
  if (len > 0) {
    Bytes b{reinterpret_cast<const uint8_t*>(val.data()), val.size()};
    RETURN_ON_ERR(WriteRaw(b));
  }
  return error_code{};
}

error_code RdbSerializerBase::SaveLen(size_t len) {
  uint8_t buf[16];
  unsigned enclen = WritePackedUInt(len, buf);
  return WriteRaw(Bytes{buf, enclen});
}

error_code RdbSerializerBase::SaveLzfBlob(const io::Bytes& src, size_t uncompressed_len) {
  /* Data compressed! Let's save it on disk */
  uint8_t opcode = (RDB_ENCVAL << 6) | RDB_ENC_LZF;
  RETURN_ON_ERR(WriteOpcode(opcode));
  RETURN_ON_ERR(SaveLen(src.size()));
  RETURN_ON_ERR(SaveLen(uncompressed_len));
  RETURN_ON_ERR(WriteRaw(src));

  return error_code{};
}

AlignedBuffer::AlignedBuffer(size_t cap, ::io::Sink* upstream)
    : capacity_(cap), upstream_(upstream) {
  aligned_buf_ = (char*)mi_malloc_aligned(kBufLen, 4_KB);
}

AlignedBuffer::~AlignedBuffer() {
  mi_free(aligned_buf_);
}

io::Result<size_t> AlignedBuffer::WriteSome(const iovec* v, uint32_t len) {
  size_t total_len = 0;
  uint32_t vindx = 0;

  for (; vindx < len; ++vindx) {
    auto item = v[vindx];
    total_len += item.iov_len;

    while (buf_offs_ + item.iov_len > capacity_) {
      size_t to_write = capacity_ - buf_offs_;
      memcpy(aligned_buf_ + buf_offs_, item.iov_base, to_write);
      iovec ivec{.iov_base = aligned_buf_, .iov_len = capacity_};
      error_code ec = upstream_->Write(&ivec, 1);
      if (ec)
        return nonstd::make_unexpected(ec);

      item.iov_len -= to_write;
      item.iov_base = reinterpret_cast<char*>(item.iov_base) + to_write;
      buf_offs_ = 0;
    }

    DCHECK_GT(item.iov_len, 0u);
    memcpy(aligned_buf_ + buf_offs_, item.iov_base, item.iov_len);
    buf_offs_ += item.iov_len;
  }

  return total_len;
}

// Note that it may write more than AlignedBuffer has at this point since it rounds up the length
// to the nearest page boundary.
error_code AlignedBuffer::Flush() {
  size_t len = (buf_offs_ + kAmask) & (~kAmask);
  if (len == 0)
    return error_code{};

  iovec ivec{.iov_base = aligned_buf_, .iov_len = len};
  buf_offs_ = 0;

  return upstream_->Write(&ivec, 1);
}

// Ensures SliceSnapshot is destroyed on its owning shard thread.
struct OwnerThreadDeleter {
  ShardId owner_sid;

  OwnerThreadDeleter() : owner_sid(0) {
  }

  explicit OwnerThreadDeleter(ShardId sid) : owner_sid(sid) {
  }

  static OwnerThreadDeleter FromShard(EngineShard* shard) {
    return OwnerThreadDeleter(shard->shard_id());
  }

  void operator()(SliceSnapshot* ptr) const {
    if (!ptr)
      return;

    if (EngineShard::tlocal() && EngineShard::tlocal()->shard_id() == owner_sid) {
      delete ptr;
      return;
    }

    shard_set->Await(owner_sid, [ptr] { delete ptr; });
  }
};

using SnapshotPtr = std::unique_ptr<SliceSnapshot, OwnerThreadDeleter>;

class RdbSaver::Impl final : public SliceSnapshot::SnapshotDataConsumerInterface {
 private:
  void CleanShardSnapshots();
  SnapshotPtr CreateSliceSnapshot(EngineShard* shard, DbSlice* db_slice, ExecutionState* cntx);

 public:
  // We pass K=sz to say how many producers are pushing data in order to maintain
  // correct closing semantics - channel is closing when K producers marked it as closed.
  Impl(bool align_writes, unsigned producers_len, CompressionMode compression_mode,
       SaveMode save_mode, io::Sink* sink, DflyVersion replica_dfly_version);

  ~Impl();

  void StartSnapshotting(bool stream_journal, ExecutionState* cntx, EngineShard* shard);

  void StopSnapshotting(EngineShard* shard);
  void WaitForSnapshottingFinish(EngineShard* shard);

  // Pushes snapshot data. Called from SliceSnapshot
  void ConsumeData(std::string data, ExecutionState* cntx) override;
  // Finalizes the snapshot writing. Called from SliceSnapshot
  void Finalize() override;

  // used only for legacy rdb save flows.
  error_code ConsumeChannel(const ExecutionState* cll);

  void FillFreqMap(RdbTypeFreqMap* dest) const;

  error_code SaveAuxFieldStrStr(string_view key, string_view val);

  void CancelInShard(EngineShard* shard);

  size_t GetTotalBuffersSize() const;

  RdbSaver::SnapshotStats GetCurrentSnapshotProgress() const;

  error_code FlushSerializer();

  error_code FlushSink() {
    return aligned_buf_ ? aligned_buf_->Flush() : error_code{};
  }

  size_t Size() const {
    return shard_snapshots_.size();
  }

  RdbSerializer* serializer() {
    return &meta_serializer_;
  }

  int64_t last_write_ts() const {
    return last_write_time_ns_;
  }

 private:
  error_code WriteRecord(io::Bytes src);

  SnapshotPtr& GetSnapshot(EngineShard* shard);

  io::Sink* sink_;
  int64_t last_write_time_ns_ = -1;  // last write call.
  vector<SnapshotPtr> shard_snapshots_;

  // used for serializing non-body components in the calling fiber.
  RdbSerializer meta_serializer_;
  using RecordChannel = SizeTrackingChannel<string, base::mpmc_bounded_queue<string>>;
  std::optional<RecordChannel> channel_;
  std::optional<AlignedBuffer> aligned_buf_;

  // Single entry compression is compatible with redis rdb snapshot
  // Multi entry compression is available only on df snapshot, this will
  // make snapshot size smaller and opreation faster.
  CompressionMode compression_mode_;
  SaveMode save_mode_;
  DflyVersion replica_dfly_version_ = DflyVersion::CURRENT_VER;
};

// We pass K=sz to say how many producers are pushing data in order to maintain
// correct closing semantics - channel is closing when K producers marked it as closed.
RdbSaver::Impl::Impl(bool align_writes, unsigned producers_len, CompressionMode compression_mode,
                     SaveMode sm, io::Sink* sink, DflyVersion replica_dfly_version)
    : sink_(sink),
      shard_snapshots_(producers_len),
      meta_serializer_(CompressionMode::NONE),  // Note: I think there is not need for compression
                                                // at all in meta serializer
      compression_mode_(compression_mode) {
  if (align_writes) {
    aligned_buf_.emplace(kBufLen, sink);
    sink_ = &aligned_buf_.value();
  }
  if (sm == SaveMode::RDB) {
    channel_.emplace(kChannelLen, producers_len);
  }
  save_mode_ = sm;
  replica_dfly_version_ = replica_dfly_version;
}

void RdbSaver::Impl::CleanShardSnapshots() {
  // Deleter dispatches destruction to the owning shard thread when needed
  shard_snapshots_.clear();
}

RdbSaver::Impl::~Impl() {
  CleanShardSnapshots();
}

error_code RdbSaver::Impl::SaveAuxFieldStrStr(string_view key, string_view val) {
  auto& ser = meta_serializer_;
  RETURN_ON_ERR(ser.WriteOpcode(RDB_OPCODE_AUX));
  RETURN_ON_ERR(ser.SaveString(key));
  RETURN_ON_ERR(ser.SaveString(val));

  return error_code{};
}

error_code RdbSaver::Impl::ConsumeChannel(const ExecutionState* es) {
  error_code io_error;
  string record;

  auto& stats = ServerState::tlocal()->stats;
  DCHECK(channel_.has_value());
  // we can not exit on io-error since we spawn fibers that push data.
  // TODO: we may signal them to stop processing and exit asap in case of the error.
  while (channel_->Pop(record)) {
    if (io_error || (!es->IsRunning()))
      continue;

    do {
      if (!es->IsRunning())
        continue;

      auto start = absl::GetCurrentTimeNanos();
      io_error = WriteRecord(io::Buffer(record));
      if (io_error) {
        break;  // from the inner TryPop loop.
      }

      auto delta_usec = (absl::GetCurrentTimeNanos() - start) / 1'000;
      stats.rdb_save_usec += delta_usec;
      stats.rdb_save_count++;
    } while ((channel_->TryPop(record)));
  }  // while (channel_.Pop())

  for (auto& ptr : shard_snapshots_) {
    ptr->WaitSnapshotting();
  }
  VLOG(1) << "ConsumeChannel finished " << io_error;

  DCHECK(!channel_->TryPop(record));

  return io_error;
}

error_code RdbSaver::Impl::WriteRecord(io::Bytes src) {
  // For huge values, we break them up into chunks of upto several MBs to send in a single call,
  // so we could be more responsive.
  error_code ec;
  size_t start_size = src.size();
  last_write_time_ns_ = absl::GetCurrentTimeNanos();
  do {
    io::Bytes part = src.subspan(0, 8_MB);
    src.remove_prefix(part.size());

    ec = sink_->Write(part);

    int64_t now = absl::GetCurrentTimeNanos();
    unsigned delta_ms = (now - last_write_time_ns_) / 1000'000;
    last_write_time_ns_ = now;

    // Log extreme timings into the log for visibility.
    LOG_IF(INFO, delta_ms > 1000) << "Channel write took " << delta_ms << " ms while writing "
                                  << part.size() << "/" << start_size;
    if (ec) {
      LOG(INFO) << "Error writing to rdb sink " << ec.message();
      break;
    }
  } while (!src.empty());
  last_write_time_ns_ = -1;
  return ec;
}

void RdbSaver::Impl::StartSnapshotting(bool stream_journal, ExecutionState* cntx,
                                       EngineShard* shard) {
  auto& s = GetSnapshot(shard);
  auto& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id());

  s = CreateSliceSnapshot(shard, &db_slice, cntx);

  const auto allow_flush = (save_mode_ != SaveMode::RDB) ? SliceSnapshot::SnapshotFlush::kAllow
                                                         : SliceSnapshot::SnapshotFlush::kDisallow;

  s->Start(stream_journal, allow_flush);
}

SnapshotPtr RdbSaver::Impl::CreateSliceSnapshot(EngineShard* shard, DbSlice* db_slice,
                                                ExecutionState* cntx) {
  return SnapshotPtr(
      new SliceSnapshot(compression_mode_, db_slice, this, cntx, replica_dfly_version_),
      OwnerThreadDeleter::FromShard(shard));
}

// called on save flow
void RdbSaver::Impl::WaitForSnapshottingFinish(EngineShard* shard) {
  auto& snapshot = GetSnapshot(shard);
  CHECK(snapshot);
  snapshot->WaitSnapshotting();
}

void RdbSaver::Impl::ConsumeData(std::string data, ExecutionState* cntx) {
  if (!cntx->IsRunning()) {
    return;
  }
  if (channel_) {  // Rdb write to channel
    channel_->Push(std::move(data));
  } else {  // Write directly to socket
    auto ec = WriteRecord(io::Buffer(data));
    if (ec) {
      cntx->ReportError(ec);
    }
  }
}

void RdbSaver::Impl::Finalize() {
  if (channel_) {
    channel_->StartClosing();
  }
}

// called from replication flow
void RdbSaver::Impl::StopSnapshotting(EngineShard* shard) {
  auto& snapshot = GetSnapshot(shard);
  CHECK(snapshot);
  snapshot->FinalizeJournalStream(false);
}

void RdbSaver::Impl::CancelInShard(EngineShard* shard) {
  auto& snapshot = GetSnapshot(shard);
  if (snapshot) {  // Cancel can be called before snapshotting started.
    snapshot->FinalizeJournalStream(true);
  }
}

// This function is called from connection thread when info command is invoked.
// All accessed variableds must be thread safe, as they are fetched not from the rdb saver thread.
size_t RdbSaver::Impl::GetTotalBuffersSize() const {
  std::atomic<size_t> channel_bytes{0};
  std::atomic<size_t> serializer_bytes{0};

  auto cb = [this, &channel_bytes, &serializer_bytes](ShardId sid) {
    auto& snapshot = shard_snapshots_[sid];
    // before create a snapshot we save header so shard_snapshots_ are vector of nullptr until we
    // start snapshots saving
    if (!snapshot)
      return;
    if (channel_.has_value())
      channel_bytes.fetch_add(channel_->GetSize(), memory_order_relaxed);
    serializer_bytes.store(snapshot->GetBufferCapacity() + snapshot->GetTempBuffersSize(),
                           memory_order_relaxed);
  };

  if (shard_snapshots_.size() == 1) {
    cb(0);
  } else {
    shard_set->RunBriefInParallel([&](EngineShard* es) { cb(es->shard_id()); });
  }

  VLOG(2) << "channel_bytes:" << channel_bytes.load(memory_order_relaxed)
          << " serializer_bytes: " << serializer_bytes.load(memory_order_relaxed);
  return channel_bytes.load(memory_order_relaxed) + serializer_bytes.load(memory_order_relaxed);
}

RdbSaver::SnapshotStats RdbSaver::Impl::GetCurrentSnapshotProgress() const {
  std::vector<RdbSaver::SnapshotStats> results(shard_snapshots_.size());

  auto cb = [this, &results](ShardId sid) {
    auto& snapshot = shard_snapshots_[sid];
    // before create a snapshot we save header so shard_snapshots_ are vector of nullptr until we
    // start snapshots saving
    if (!snapshot)
      return;
    results[sid] = snapshot->GetCurrentSnapshotProgress();
  };

  if (shard_snapshots_.size() == 1) {
    cb(0);
    return results[0];
  }

  shard_set->RunBriefInParallel([&](EngineShard* es) { cb(es->shard_id()); });
  RdbSaver::SnapshotStats init{0, 0};
  return std::accumulate(
      results.begin(), results.end(), init, [](auto init, auto pr) -> RdbSaver::SnapshotStats {
        return {init.current_keys + pr.current_keys, init.total_keys + pr.total_keys};
      });
}

error_code RdbSaver::Impl::FlushSerializer() {
  last_write_time_ns_ = absl::GetCurrentTimeNanos();
  string blob = serializer()->Flush(RdbSerializerBase::FlushState::kFlushMidEntry);
  error_code ec;
  if (!blob.empty()) {
    ec = sink_->Write(io::Buffer(blob));
  }
  last_write_time_ns_ = -1;
  return ec;
}

namespace {

// Collect search index definitions and optionally HNSW metadata.
// search_indices always gets simple "index_name cmd" restore commands.
// For summary shards, hnsw_index_metadata gets JSON with HNSW graph metadata,
// and search_synonyms gets synonym group restore commands.
void CollectSearchIndices([[maybe_unused]] const EngineShard& shard,
                          [[maybe_unused]] StringVec* search_indices,
                          [[maybe_unused]] StringVec* search_synonyms,
                          [[maybe_unused]] StringVec* hnsw_index_metadata,
                          [[maybe_unused]] bool is_summary) {
#ifdef WITH_SEARCH
  auto* indices = shard.search_indices();
  for (const auto& index_name : indices->GetIndexNames()) {
    auto* index = indices->GetIndex(index_name);
    auto index_info = index->GetInfo();

    // Always store the simple restore command format
    std::string restore_cmd = absl::StrCat(index_name, " ", index_info.BuildRestoreCommand());
    search_indices->emplace_back(std::move(restore_cmd));

    if (!is_summary)
      continue;

    // Collect HNSW metadata for vector field (first one found), for now we don't support multiple
    // vector fields per index serialization
    for (const auto& [fident, finfo] : index_info.base_index.schema.fields) {
      if (finfo.type == search::SchemaField::VECTOR &&
          !(finfo.flags & search::SchemaField::NOINDEX)) {
        if (auto hnsw_index = GlobalHnswIndexRegistry::Instance().Get(index_name, finfo.short_name);
            hnsw_index) {
          auto meta = hnsw_index->GetMetadata();
          TmpJson meta_json;
          meta_json["index_name"] = index_name;
          meta_json["field_name"] = finfo.short_name;
          meta_json["max_elements"] = meta.max_elements;
          meta_json["cur_element_count"] = meta.cur_element_count;
          meta_json["maxlevel"] = meta.maxlevel;
          meta_json["enterpoint_node"] = meta.enterpoint_node;
          hnsw_index_metadata->emplace_back(meta_json.to_string());
          break;
        }
      }
    }

    // Save synonym groups
    const auto& synonym_groups = index->GetSynonyms().GetGroups();
    for (const auto& [group_id, terms] : synonym_groups) {
      if (!terms.empty()) {
        std::string syn_cmd =
            absl::StrCat(index_name, " ", group_id, " ", absl::StrJoin(terms, " "));
        search_synonyms->emplace_back(std::move(syn_cmd));
      }
    }
  }
#endif
}

}  // namespace

RdbSaver::GlobalData RdbSaver::GetGlobalData(const Service* service, bool is_summary) {
  StringVec script_bodies, search_indices, search_synonyms, hnsw_index_metadata;
  size_t table_mem_result = 0;

  if (!is_summary) {
    shard_set->RunBriefInParallel([&](EngineShard* shard) {
      if (shard->shard_id() == 0)
        CollectSearchIndices(*shard, &search_indices, &search_synonyms, &hnsw_index_metadata,
                             is_summary);
    });
    return RdbSaver::GlobalData{std::move(script_bodies), std::move(search_indices),
                                std::move(search_synonyms), std::move(hnsw_index_metadata),
                                table_mem_result};
  }
  {
    // For summary file: collect all global data
    auto scripts = service->script_mgr()->GetAll();
    script_bodies.reserve(scripts.size());
    for (auto& [sha, data] : scripts)
      script_bodies.push_back(std::move(data.body));
  }

  atomic<size_t> table_mem{0};
  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    if (shard->shard_id() == 0)
      CollectSearchIndices(*shard, &search_indices, &search_synonyms, &hnsw_index_metadata,
                           is_summary);

    auto& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id());
    size_t shard_table_mem = 0;
    for (size_t db_id = 0; db_id < db_slice.db_array_size(); ++db_id) {
      auto* db_table = db_slice.GetDBTable(db_id);
      if (db_table) {
        shard_table_mem += db_table->table_memory();
      }
    }
    table_mem.fetch_add(shard_table_mem, memory_order_relaxed);
  });

  return RdbSaver::GlobalData{std::move(script_bodies), std::move(search_indices),
                              std::move(search_synonyms), std::move(hnsw_index_metadata),
                              table_mem.load(memory_order_relaxed)};
}

void RdbSaver::Impl::FillFreqMap(RdbTypeFreqMap* dest) const {
  for (auto& ptr : shard_snapshots_) {
    const RdbTypeFreqMap& src_map = ptr->freq_map();
    for (const auto& k_v : src_map)
      (*dest)[k_v.first] += k_v.second;
  }
}

SnapshotPtr& RdbSaver::Impl::GetSnapshot(EngineShard* shard) {
  // For single shard configuration, we maintain only one snapshot,
  // so we do not have to map it via shard_id.
  unsigned sid = shard_snapshots_.size() == 1 ? 0 : shard->shard_id();
  CHECK(sid < shard_snapshots_.size());
  return shard_snapshots_[sid];
}

RdbSaver::RdbSaver(::io::Sink* sink, SaveMode save_mode, bool align_writes, std::string snapshot_id,
                   DflyVersion replica_dfly_version)
    : replica_dfly_version_(replica_dfly_version), snapshot_id_(std::move(snapshot_id)) {
  CHECK_NOTNULL(sink);
  CompressionMode compression_mode = GetDefaultCompressionMode();
  int producer_count = 0;
  switch (save_mode) {
    case SaveMode::SUMMARY:
      producer_count = 0;
      if (compression_mode >= CompressionMode::SINGLE_ENTRY) {
        compression_mode_ = CompressionMode::SINGLE_ENTRY;
      } else {
        compression_mode_ = CompressionMode::NONE;
      }
      break;
    case SaveMode::SINGLE_SHARD:
    case SaveMode::SINGLE_SHARD_WITH_SUMMARY:
      producer_count = 1;
      compression_mode_ = compression_mode;
      break;
    case SaveMode::RDB:
      producer_count = shard_set->size();
      if (compression_mode >= CompressionMode::SINGLE_ENTRY) {
        compression_mode_ = CompressionMode::SINGLE_ENTRY;
      } else {
        compression_mode_ = CompressionMode::NONE;
      }
      break;
  }
  VLOG(1) << "Rdb save using compression mode:" << uint32_t(compression_mode_);
  impl_.reset(new Impl(align_writes, producer_count, compression_mode_, save_mode, sink,
                       replica_dfly_version_));
  save_mode_ = save_mode;
}

RdbSaver::~RdbSaver() {
  // Decommit local memory.
  // We create an RdbSaver for each thread, so each one will Decommit for itself.
  auto* tlocal = ServerState::tlocal();
  tlocal->DecommitMemory(ServerState::kAllMemory);
}

void RdbSaver::StartSnapshotInShard(bool stream_journal, ExecutionState* cntx, EngineShard* shard) {
  impl_->StartSnapshotting(stream_journal, cntx, shard);
}

error_code RdbSaver::WaitSnapshotInShard(EngineShard* shard) {
  impl_->WaitForSnapshottingFinish(shard);
  return SaveEpilog();
}

error_code RdbSaver::StopFullSyncInShard(EngineShard* shard) {
  impl_->StopSnapshotting(shard);
  return SaveEpilog();
}

error_code RdbSaver::SaveHeader(const GlobalData& glob_state) {
  char magic[16];
  // We should use RDB_VERSION here from rdb.h when we ditch redis 6 support
  // For now we serialize to an older version.
  size_t sz = absl::SNPrintF(magic, sizeof(magic), "REDIS%04d", RDB_SER_VERSION);
  CHECK_EQ(9u, sz);

  RETURN_ON_ERR(impl_->serializer()->WriteRaw(Bytes{reinterpret_cast<uint8_t*>(magic), sz}));
  RETURN_ON_ERR(SaveAux(glob_state));  // Should be first after magic
  RETURN_ON_ERR(impl_->FlushSerializer());
  return error_code{};
}

error_code RdbSaver::SaveBody(const ExecutionState& cntx) {
  RETURN_ON_ERR(impl_->FlushSerializer());

  if (save_mode_ == SaveMode::RDB) {
    VLOG(1) << "SaveBody , snapshots count: " << impl_->Size();
    error_code io_error = impl_->ConsumeChannel(&cntx);
    if (io_error) {
      return io_error;
    }
    if (cntx.GetError()) {
      return cntx.GetError();
    }
  } else {
    DCHECK(save_mode_ == SaveMode::SUMMARY);
  }

  return SaveEpilog();
}

void RdbSaver::FillFreqMap(RdbTypeFreqMap* freq_map) {
  freq_map->clear();
  impl_->FillFreqMap(freq_map);
}

error_code RdbSaver::SaveAux(const GlobalData& glob_state) {
  // Should be first
  if (!snapshot_id_.empty()) {
    RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("snapshot-id", snapshot_id_));
  }

  /* Add a few fields about the state when the RDB was created. */
  RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("redis-ver", REDIS_VERSION));
  RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("df-ver", GetVersion()));
  RETURN_ON_ERR(SaveAuxFieldStrInt("redis-bits", 64));

  RETURN_ON_ERR(SaveAuxFieldStrInt("ctime", time(NULL)));
  auto used_mem = used_mem_current.load(memory_order_relaxed);
  VLOG(1) << "Used memory during save: " << used_mem;
  RETURN_ON_ERR(SaveAuxFieldStrInt("used-mem", used_mem));
  RETURN_ON_ERR(SaveAuxFieldStrInt("aof-preamble", 0));

  // Save lua scripts only in rdb or summary file
  DCHECK(save_mode_ != SaveMode::SINGLE_SHARD || glob_state.lua_scripts.empty());
  for (const string& s : glob_state.lua_scripts)
    RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("lua", s));

  if (save_mode_ == SaveMode::RDB) {
    if (!glob_state.search_indices.empty())
      LOG(WARNING) << "Dragonfly search index data is incompatible with the RDB format";
  } else {
    // Search index definitions - for non-summary shards only sent to replicas >= VER6,
    // since older replicas only expect search-index from the summary shard.
    bool send_search_index =
        (save_mode_ != SaveMode::SINGLE_SHARD) || (replica_dfly_version_ >= DflyVersion::VER6);
    if (send_search_index) {
      for (const string& s : glob_state.search_indices)
        RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("search-index", s));
    }

    // HNSW index metadata (JSON, summary only) - only for replicas >= VER6
    if (replica_dfly_version_ >= DflyVersion::VER6) {
      for (const string& s : glob_state.hnsw_index_metadata)
        RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("hnsw-index-metadata", s));
    }

    // Save synonyms only in summary file
    DCHECK(save_mode_ != SaveMode::SINGLE_SHARD || glob_state.search_synonyms.empty());
    for (const string& s : glob_state.search_synonyms)
      RETURN_ON_ERR(impl_->SaveAuxFieldStrStr("search-synonyms", s));

    if (save_mode_ == SaveMode::SINGLE_SHARD_WITH_SUMMARY || save_mode_ == SaveMode::SUMMARY) {
      // We save the shard id in the summary file, so that we can restore it later.
      RETURN_ON_ERR(SaveAuxFieldStrInt("shard-count", shard_set->size()));
      RETURN_ON_ERR(SaveAuxFieldStrInt("table-mem", glob_state.table_used_memory));
    }
    if (EngineShard* shard = EngineShard::tlocal(); shard) {
      RETURN_ON_ERR(SaveAuxFieldStrInt("shard-id", shard->shard_id()));
    }
  }

  // TODO: "repl-stream-db", "repl-id", "repl-offset"
  return error_code{};
}

error_code RdbSaver::SaveEpilog() {
  RETURN_ON_ERR(impl_->serializer()->SendEofAndChecksum());

  RETURN_ON_ERR(impl_->FlushSerializer());

  return impl_->FlushSink();
}

error_code RdbSaver::SaveAuxFieldStrInt(string_view key, int64_t val) {
  char buf[LONG_STR_SIZE];
  int vlen = ll2string(buf, sizeof(buf), val);
  return impl_->SaveAuxFieldStrStr(key, string_view(buf, vlen));
}

void RdbSaver::CancelInShard(EngineShard* shard) {
  impl_->CancelInShard(shard);
}

size_t RdbSaver::GetTotalBuffersSize() const {
  return impl_->GetTotalBuffersSize();
}

RdbSaver::SnapshotStats RdbSaver::GetCurrentSnapshotProgress() const {
  return impl_->GetCurrentSnapshotProgress();
}

int64_t RdbSaver::GetLastWriteTime() const {
  return impl_->last_write_ts();
}

void RdbSerializerBase::AllocateCompressorOnce() {
  if (compressor_impl_) {
    return;
  }
  if (compression_mode_ == CompressionMode::MULTI_ENTRY_ZSTD) {
    compressor_impl_ = detail::CompressorImpl::CreateZstd();
  } else if (compression_mode_ == CompressionMode::MULTI_ENTRY_LZ4) {
    compressor_impl_ = detail::CompressorImpl::CreateLZ4();
  } else {
    LOG(FATAL) << "Invalid compression mode " << unsigned(compression_mode_);
  }
}

void RdbSerializerBase::CompressBlob() {
  if (!compression_stats_) {
    compression_stats_.emplace(CompressionStats{});
  }
  Bytes blob_to_compress = mem_buf_.InputBuffer();
  VLOG(2) << "CompressBlob size " << blob_to_compress.size();
  size_t blob_size = blob_to_compress.size();

  if (blob_size < kMinStrSizeToCompress || blob_size > kMaxStrSizeToCompress) {
    ++compression_stats_->size_skip_count;
    return;
  }

  AllocateCompressorOnce();

  // Compress the data. We copy compressed data once into the internal buffer of compressor_impl_
  // and then we copy it again into the mem_buf_.
  //
  // TODO: it is possible to avoid double copying here by changing the compressor interface,
  // so that the compressor will accept the output buffer and return the final size. This requires
  // exposing the additional compress bound interface as well.
  io::Result<io::Bytes> res = compressor_impl_->Compress(blob_to_compress);
  if (!res) {
    ++compression_stats_->compression_failed;
    return;
  }

  Bytes compressed_blob = *res;
  if (compressed_blob.length() > blob_size * kMinCompressionReductionPrecentage) {
    ++compression_stats_->compression_no_effective;
    return;
  }

  // Clear membuf and write the compressed blob to it
  mem_buf_.ConsumeInput(blob_size);
  mem_buf_.Reserve(compressed_blob.length() + 1 + 9);  // reserve space for blob + opcode + len

  // First write opcode for compressed string
  auto dest = mem_buf_.AppendBuffer();
  uint8_t opcode = compression_mode_ == CompressionMode::MULTI_ENTRY_ZSTD
                       ? RDB_OPCODE_COMPRESSED_ZSTD_BLOB_START
                       : RDB_OPCODE_COMPRESSED_LZ4_BLOB_START;
  dest[0] = opcode;
  mem_buf_.CommitWrite(1);

  // Write encoded compressed blob len
  dest = mem_buf_.AppendBuffer();
  unsigned enclen = WritePackedUInt(compressed_blob.length(), dest);
  mem_buf_.CommitWrite(enclen);

  // Write compressed blob
  dest = mem_buf_.AppendBuffer();
  memcpy(dest.data(), compressed_blob.data(), compressed_blob.length());
  mem_buf_.CommitWrite(compressed_blob.length());
  ++compression_stats_->compressed_blobs;
  auto& stats = ServerState::tlocal()->stats;
  ++stats.compressed_blobs;
}

size_t RdbSerializer::GetTempBufferSize() const {
  return RdbSerializerBase::GetTempBufferSize() + tmp_str_.size();
}

void RdbSerializer::PushToConsumerIfNeeded(RdbSerializerBase::FlushState flush_state) {
  if (consume_fun_ && SerializedLen() > flush_threshold_) {
    string blob = Flush(flush_state);
    DCHECK(!blob.empty());  // SerializedLen() > 0.
    consume_fun_(std::move(blob));
  }
}

}  // namespace dfly


================================================
FILE: src/server/rdb_save.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/types/span.h>

extern "C" {
#include "redis/lzfP.h"
}

#include <optional>

#include "base/pod_array.h"
#include "io/io.h"
#include "io/io_buf.h"
#include "server/detail/compressor.h"
#include "server/execution_state.h"
#include "server/journal/serializer.h"
#include "server/journal/types.h"
#include "server/table.h"
#include "server/version.h"

typedef struct rax rax;
typedef struct streamCG streamCG;

namespace dfly::search {
struct HnswNodeData;
}  // namespace dfly::search

namespace dfly {

// keys are RDB_TYPE_xxx constants.
using RdbTypeFreqMap = absl::flat_hash_map<unsigned, size_t>;

uint8_t RdbObjectType(const CompactObj& pv);

class EngineShard;
class Service;

class AlignedBuffer : public ::io::Sink {
 public:
  using io::Sink::Write;

  AlignedBuffer(size_t cap, ::io::Sink* upstream);
  ~AlignedBuffer();

  std::error_code Write(std::string_view buf) {
    return Write(io::Buffer(buf));
  }

  io::Result<size_t> WriteSome(const iovec* v, uint32_t len) final;

  std::error_code Flush();

  ::io::Sink* upstream() {
    return upstream_;
  }

 private:
  size_t capacity_;
  ::io::Sink* upstream_;
  char* aligned_buf_ = nullptr;

  off_t buf_offs_ = 0;
};

// SaveMode for snapshot. Used by RdbSaver to adjust internals.
enum class SaveMode {
  SUMMARY,                    // Save only header values (summary.dfs). Expected to read no shards.
  SINGLE_SHARD,               // Save single shard values (XXXX.dfs). Expected to read one shard.
  SINGLE_SHARD_WITH_SUMMARY,  // Save single shard value with the global summary. Used in the
                              // replication's fully sync stage.
  RDB,                        // Save .rdb file. Expected to read all shards.
};

enum class CompressionMode : uint8_t { NONE, SINGLE_ENTRY, MULTI_ENTRY_ZSTD, MULTI_ENTRY_LZ4 };

CompressionMode GetDefaultCompressionMode();

using StringVec = std::vector<std::string>;

class RdbSaver {
 public:
  // Global data which doesn't belong to shards and is serialized in header
  struct GlobalData {
    const StringVec lua_scripts;          // bodies of lua scripts
    const StringVec search_indices;       // ft.create commands to re-create search indices
    const StringVec search_synonyms;      // ft.synupdate commands to restore synonyms
    const StringVec hnsw_index_metadata;  // HNSW metadata JSON (summary only)
    size_t table_used_memory = 0;         // total memory used by all tables in all shards
  };

  // single_shard - true means that we run RdbSaver on a single shard and we do not use
  // to snapshot all the datastore shards.
  // single_shard - false, means we capture all the data using a single RdbSaver instance
  // (corresponds to legacy, redis compatible mode)
  // if align_writes is true - writes data in aligned chunks of 4KB to fit direct I/O requirements.
  // snapshot_id - allows to identify that group of files belongs to the same snapshot
  // replica_dfly_version - upper bound for conditional serialization of new features.
  explicit RdbSaver(::io::Sink* sink, SaveMode save_mode, bool align_writes,
                    std::string snapshot_id, DflyVersion replica_dfly_version);

  ~RdbSaver();

  // Initiates the serialization in the shard's thread.
  // cll allows breaking in the middle.
  void StartSnapshotInShard(bool stream_journal, ExecutionState* cntx, EngineShard* shard);

  // Stops full-sync serialization for replication in the shard's thread.
  std::error_code StopFullSyncInShard(EngineShard* shard);

  // Wait for snapshotting finish in shard thread. Called from save flows in shard thread.
  std::error_code WaitSnapshotInShard(EngineShard* shard);

  // Stores auxiliary (meta) values and header_info
  std::error_code SaveHeader(const GlobalData& header_info);

  // Writes the RDB file into sink. Waits for the serialization to finish.
  // Called only for save rdb flow and save df on summary file.
  std::error_code SaveBody(const ExecutionState& cntx);

  // Fills freq_map with the histogram of rdb types.
  void FillFreqMap(RdbTypeFreqMap* freq_map);

  void CancelInShard(EngineShard* shard);

  SaveMode Mode() const {
    return save_mode_;
  }

  // Get total size of all rdb serializer buffers and items currently placed in channel
  size_t GetTotalBuffersSize() const;

  struct SnapshotStats {
    size_t current_keys = 0;
    size_t total_keys = 0;
    size_t big_value_preemptions = 0;
  };

  SnapshotStats GetCurrentSnapshotProgress() const;

  // Fetch global data to be serialized in snapshot.
  // is_summary: true for summary file (full data with JSON search indices),
  //             false for per-shard files (only simple search index restore commands)
  static GlobalData GetGlobalData(const Service* service, bool is_summary);

  // Returns time in nanos of start of the last pending write interaction.
  // Returns -1 if no write operations are currently pending.
  int64_t GetLastWriteTime() const;

 private:
  class Impl;

  std::error_code SaveEpilog();

  std::error_code SaveAux(const GlobalData&);
  std::error_code SaveAuxFieldStrInt(std::string_view key, int64_t val);

  std::unique_ptr<Impl> impl_;
  SaveMode save_mode_;
  CompressionMode compression_mode_;
  DflyVersion replica_dfly_version_ = DflyVersion::CURRENT_VER;
  std::string snapshot_id_;
};

class RdbSerializer;
class RdbSerializerBase {
 public:
  enum class FlushState : uint8_t { kFlushMidEntry, kFlushEndEntry };

  explicit RdbSerializerBase(CompressionMode compression_mode);
  virtual ~RdbSerializerBase() = default;

  // Dumps `obj` in DUMP command format into `out`. Uses default compression mode.
  static std::string DumpValue(const PrimeValue& obj, bool ignore_crc = false);
  static std::string DumpValue(RdbSerializer* serializer, const PrimeValue& obj,
                               bool ignore_crc = false);

  // Internal buffer size. Might shrink after flush due to compression.
  size_t SerializedLen() const;

  // Flush internal buffer and return serialized blob.
  virtual std::string Flush(FlushState flush_state);

  size_t GetBufferCapacity() const;
  virtual size_t GetTempBufferSize() const;

  std::error_code WriteRaw(const ::io::Bytes& buf);

  // Write journal entry as an embedded journal blob.
  std::error_code WriteJournalEntry(std::string_view entry);

  // Send FULL_SYNC_CUT opcode to notify that all static data was sent.
  std::error_code SendFullSyncCut();

  std::error_code WriteOpcode(uint8_t opcode);

  std::error_code SaveLen(size_t len);
  std::error_code SaveString(std::string_view val);
  std::error_code SaveString(const uint8_t* buf, size_t len) {
    return SaveString(io::View(io::Bytes{buf, len}));
  }

  uint64_t GetSerializationPeakBytes() const {
    return serialization_peak_bytes_;
  }

  void SetCompressionMode(CompressionMode mode) {
    compression_mode_ = mode;
  }

 protected:
  // Prepare internal buffer for flush. Compress it.
  io::Bytes PrepareFlush(FlushState flush_state);

  // If membuf data is compressable use compression impl to compress the data and write it to membuf
  void CompressBlob();
  void AllocateCompressorOnce();

  std::error_code SaveLzfBlob(const ::io::Bytes& src, size_t uncompressed_len);

  CompressionMode compression_mode_;
  io::IoBuf mem_buf_;
  std::unique_ptr<detail::CompressorImpl> compressor_impl_;

  static constexpr size_t kFilterChunkSize = 1ULL << 26;
  static constexpr size_t kMinStrSizeToCompress = 256;
  static constexpr size_t kMaxStrSizeToCompress = 1 * 1024 * 1024;
  static constexpr double kMinCompressionReductionPrecentage = 0.95;
  struct CompressionStats {
    uint32_t compression_no_effective = 0;
    uint32_t size_skip_count = 0;
    uint32_t compression_failed = 0;
    uint32_t compressed_blobs = 0;
  };
  std::optional<CompressionStats> compression_stats_;
  base::PODArray<uint8_t> tmp_buf_;
  std::unique_ptr<LZF_HSLOT[]> lzf_;
  size_t number_of_chunks_ = 0;

  uint64_t serialization_peak_bytes_ = 0;
};

class RdbSerializer : public RdbSerializerBase {
 public:
  // ConsumeFun is called when internal buffer exceeds flush_threshold.
  // The callback receives the extracted data.
  using ConsumeFun = std::function<void(std::string)>;

  explicit RdbSerializer(CompressionMode compression_mode, ConsumeFun consume_fun = {},
                         size_t flush_threshold = 0);

  ~RdbSerializer();

  std::string Flush(FlushState flush_state) override;
  std::error_code SelectDb(uint32_t dbid);

  // Must be called in the thread to which `it` belongs.
  // Returns the serialized rdb_type or the error.
  // expire_ms = 0 means no expiry.
  // This function might preempt if flush_fun_ is used.
  io::Result<uint8_t> SaveEntry(const PrimeKey& pk, const PrimeValue& pv, uint64_t expire_ms,
                                uint32_t mc_flags, DbIndex dbid);

  // This would work for either string or an object.
  // The arg pv is taken from it->second if accessing
  // this by finding the key. This function is used
  // for the dump command - thus it is public function.
  // This function might preempt if flush_fun_ is used.
  std::error_code SaveValue(const PrimeValue& pv);

  std::error_code SendJournalOffset(uint64_t journal_offset);

  // Save HNSW index entry using provided tmp_buf for serialization to avoid repeated allocations.
  std::error_code SaveHNSWEntry(const search::HnswNodeData& node, absl::Span<uint8_t> tmp_buf);

  size_t GetTempBufferSize() const override;
  std::error_code SendEofAndChecksum();

 private:
  // Might preempt if flush_fun_ is used
  std::error_code SaveObject(const PrimeValue& pv);
  std::error_code SaveListObject(const PrimeValue& pv);
  std::error_code SaveSetObject(const PrimeValue& pv);
  std::error_code SaveHSetObject(const PrimeValue& pv);
  std::error_code SaveZSetObject(const PrimeValue& pv);
  std::error_code SaveStreamObject(const PrimeValue& obj);
  std::error_code SaveJsonObject(const PrimeValue& pv);
  std::error_code SaveSBFObject(const PrimeValue& pv);
  std::error_code SaveCMSObject(const PrimeValue& pv);

  std::error_code SaveLongLongAsString(int64_t value);
  std::error_code SaveBinaryDouble(double val);
  std::error_code SaveStreamPEL(rax* pel, bool nacks);
  std::error_code SaveStreamConsumers(bool save_active, streamCG* cg);

  // Might preempt
  void PushToConsumerIfNeeded(FlushState flush_state);

  std::string tmp_str_;
  DbIndex last_entry_db_index_ = kInvalidDbId;
  ConsumeFun consume_fun_;
  size_t flush_threshold_ = 0;
};

}  // namespace dfly


================================================
FILE: src/server/rdb_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include <gmock/gmock.h>

extern "C" {
#include "redis/crc64.h"
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/zmalloc.h"
}

#include <absl/flags/reflection.h>
#include <mimalloc.h>

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"  // needed to find operator== for RespExpr.
#include "io/file.h"
#include "server/engine_shard_set.h"
#include "server/rdb_load.h"
#include "server/rdb_save.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;
using namespace facade;
using absl::SetFlag;
using absl::StrCat;

ABSL_DECLARE_FLAG(int32, list_compress_depth);
ABSL_DECLARE_FLAG(int32, list_max_listpack_size);
ABSL_DECLARE_FLAG(dfly::CompressionMode, compression_mode);
ABSL_DECLARE_FLAG(bool, rdb_ignore_expiry);
ABSL_DECLARE_FLAG(uint32_t, num_shards);
ABSL_DECLARE_FLAG(bool, rdb_sbf_chunked);
ABSL_DECLARE_FLAG(bool, serialize_hnsw_index);
ABSL_DECLARE_FLAG(bool, deserialize_hnsw_index);

namespace dfly {

static const auto kMatchNil = ArgType(RespExpr::NIL);

class RdbTest : public BaseFamilyTest {
 protected:
  void SetUp();

  io::FileSource GetSource(string name);

  std::error_code LoadRdb(const string& filename) {
    return pp_->at(0)->Await([&] {
      io::FileSource fs = GetSource(filename);

      RdbLoadContext load_context;
      RdbLoader loader(service_.get(), &load_context);
      return loader.Load(&fs);
    });
  }
};

void RdbTest::SetUp() {
  // Setting max_memory_limit must be before calling  InitWithDbFilename
  max_memory_limit = 40000000;
  absl::SetFlag(&FLAGS_serialize_hnsw_index, true);
  absl::SetFlag(&FLAGS_deserialize_hnsw_index, true);
  InitWithDbFilename();
  CHECK_EQ(zmalloc_used_memory_tl, 0);
}

inline const uint8_t* to_byte(const void* s) {
  return reinterpret_cast<const uint8_t*>(s);
}

io::FileSource RdbTest::GetSource(string name) {
  string rdb_file = base::ProgramRunfile("testdata/" + name);
  auto open_res = io::OpenRead(rdb_file, io::ReadonlyFile::Options{});
  CHECK(open_res) << rdb_file;

  return io::FileSource(*open_res);
}

static string FloatToBytes(float f) {
  return string(reinterpret_cast<const char*>(&f), sizeof(float));
}

TEST_F(RdbTest, SnapshotIdTest) {
  absl::SetFlag(&FLAGS_num_shards, num_threads_);
  ResetService();

  EXPECT_EQ(Run({"mset", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}), "OK");

  Run({"save", "df", "test_dump"});

  absl::SetFlag(&FLAGS_num_shards, num_threads_ - 1);
  ResetService();

  EXPECT_EQ(Run({"mset", "test1", "val1", "test2", "val2"}), "OK");

  Run({"save", "df", "test_dump"});

  ResetService();

  EXPECT_EQ(Run({"dfly", "load", "test_dump-summary.dfs"}), "OK");

  auto resp = Run({"keys", "*"});
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("test1", "test2"));
}

TEST_F(RdbTest, Crc) {
  std::string_view s{"TEST"};

  uint64_t c = crc64(0, to_byte(s.data()), s.size());
  ASSERT_NE(c, 0);

  uint64_t c2 = crc64(c, to_byte(s.data()), s.size());
  EXPECT_NE(c, c2);

  uint64_t c3 = crc64(c, to_byte(&c), sizeof(c));
  EXPECT_EQ(c3, 0);

  s = "COOLTEST";
  c = crc64(0, to_byte(s.data()), 8);
  c2 = crc64(0, to_byte(s.data()), 4);
  c3 = crc64(c2, to_byte(s.data() + 4), 4);
  EXPECT_EQ(c, c3);

  c2 = crc64(0, to_byte(s.data() + 4), 4);
  c3 = crc64(c2, to_byte(s.data()), 4);
  EXPECT_NE(c, c3);
}

TEST_F(RdbTest, LoadEmpty) {
  auto ec = LoadRdb("empty.rdb");
  ASSERT_FALSE(ec) << ec;
}

TEST_F(RdbTest, LoadSmall6) {
  // The rdb file contians keys that already expired, we want to continue loading them in this test.
  absl::FlagSaver fs;
  SetTestFlag("rdb_ignore_expiry", "true");

  auto ec = LoadRdb("redis6_small.rdb");

  ASSERT_FALSE(ec) << ec.message();

  auto resp = Run({"scan", "0"});

  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(StrArray(resp.GetVec()[1]),
              UnorderedElementsAre("list1", "hset_zl", "list2", "zset_sl", "intset", "set1",
                                   "zset_zl", "hset_ht", "intkey", "strkey"));
  EXPECT_THAT(Run({"get", "intkey"}), "1234567");
  EXPECT_THAT(Run({"get", "strkey"}), "abcdefghjjjjjjjjjj");

  resp = Run({"smembers", "intset"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(),
              UnorderedElementsAre("111", "222", "1234", "3333", "4444", "67899", "76554"));

  // TODO: when we implement PEXPIRETIME we will be able to do it directly.
  int ttl = CheckedInt({"ttl", "set1"});    // should expire at 1747008000.
  EXPECT_GT(ttl + time(NULL), 1747007000);  // left 1000 seconds margin in case the clock is off.

  Run({"select", "1"});
  ASSERT_EQ(10, CheckedInt({"dbsize"}));
  ASSERT_EQ(128, CheckedInt({"strlen", "longggggggggggggggkeyyyyyyyyyyyyy:9"}));
  resp = Run({"script", "exists", "4ca238f611c9d0ae4e9a75a5dbac22aedc379801",
              "282297a0228f48cd3fc6a55de6316f31422f5d17"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre(IntArg(1), IntArg(1)));
}

TEST_F(RdbTest, Stream) {
  auto ec = LoadRdb("redis6_stream.rdb");

  ASSERT_FALSE(ec) << ec.message();

  auto resp = Run({"type", "key:10"});
  EXPECT_EQ(resp, "stream");

  resp = Run({"xinfo", "groups", "key:0"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0],
              RespElementsAre("name", "g1", "consumers", 0, "pending", 0, "last-delivered-id",
                              "1655444851524-3", "entries-read", 128, "lag", 0));
  EXPECT_THAT(resp.GetVec()[1],
              RespElementsAre("name", "g2", "consumers", 1, "pending", 0, "last-delivered-id",
                              "1655444851523-1", "entries-read", kMatchNil, "lag", kMatchNil));

  resp = Run({"xinfo", "groups", "key:1"});  // test dereferences array of size 1
  EXPECT_THAT(resp, RespElementsAre("name", "g2", "consumers", IntArg(0), "pending", IntArg(0),
                                    "last-delivered-id", "1655444851523-1", "entries-read",
                                    kMatchNil, "lag", kMatchNil));

  resp = Run({"xinfo", "groups", "key:2"});
  EXPECT_THAT(resp, ArrLen(0));

  Run({"save"});
}

TEST_F(RdbTest, ComressionModeSaveDragonflyAndReload) {
  Run({"debug", "populate", "50000"});
  ASSERT_EQ(50000, CheckedInt({"dbsize"}));
  // Check keys inserted are lower than 50,000.
  auto resp = Run({"keys", "key:[5-9][0-9][0-9][0-9][0-9]*"});
  EXPECT_EQ(resp.GetVec().size(), 0);

  for (auto mode : {CompressionMode::NONE, CompressionMode::SINGLE_ENTRY,
                    CompressionMode::MULTI_ENTRY_ZSTD, CompressionMode::MULTI_ENTRY_LZ4}) {
    SetFlag(&FLAGS_compression_mode, mode);
    RespExpr resp = Run({"save", "df"});
    ASSERT_EQ(resp, "OK");

    if (mode == CompressionMode::MULTI_ENTRY_ZSTD || mode == CompressionMode::MULTI_ENTRY_LZ4) {
      EXPECT_GE(GetMetrics().coordinator_stats.compressed_blobs, 1);
    }

    auto save_info = service_->server_family().GetLastSaveInfo();
    resp = Run({"dfly", "load", save_info.file_name});
    ASSERT_EQ(resp, "OK");
    ASSERT_EQ(50000, CheckedInt({"dbsize"}));
  }
}

TEST_F(RdbTest, RdbLoaderOnReadCompressedDataShouldNotEnterEnsureReadFlow) {
  SetFlag(&FLAGS_compression_mode, CompressionMode::MULTI_ENTRY_ZSTD);
  for (int i = 0; i < 1000; ++i) {
    Run({"set", StrCat(i), "1"});
  }
  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");
}

TEST_F(RdbTest, SaveLoadSticky) {
  Run({"set", "a", "1"});
  Run({"set", "b", "2"});
  Run({"set", "c", "3"});
  Run({"stick", "a", "b"});
  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"debug", "reload"});
  ASSERT_EQ(resp, "OK");
  EXPECT_THAT(Run({"get", "a"}), "1");
  EXPECT_THAT(Run({"get", "b"}), "2");
  EXPECT_THAT(Run({"get", "c"}), "3");
  EXPECT_THAT(Run({"stick", "a", "b"}), IntArg(0));
  EXPECT_THAT(Run({"stick", "c"}), IntArg(1));
}

TEST_F(RdbTest, ReloadSetSmallStringBug) {
  auto str = absl::StrCat(std::string(32, 'X'));
  Run({"set", "small_key", str});
  auto resp = Run({"debug", "reload"});
  ASSERT_EQ(resp, "OK");
}

TEST_F(RdbTest, Reload) {
  absl::FlagSaver fs;

  SetFlag(&FLAGS_list_compress_depth, 1);
  SetFlag(&FLAGS_list_max_listpack_size, 1);  // limit listpack to a single element.

  Run({"set", "string_key", "val"});
  Run({"set", "large_key", string(511, 'L')});
  Run({"set", "huge_key", string((1 << 17) - 10, 'H')});

  Run({"sadd", "set_key1", "val1", "val2"});
  Run({"sadd", "intset_key", "1", "2", "3"});
  Run({"hset", "small_hset", "field1", "val1", "field2", "val2"});
  Run({"hset", "large_hset", "field1", string(510, 'V'), string(120, 'F'), "val2"});

  Run({"rpush", "list_key1", "val", "val2"});
  Run({"rpush", "list_key2", "head", string(511, 'a'), string(500, 'b'), "tail"});

  Run({"zadd", "zs1", "1.1", "a", "-1.1", "b"});
  Run({"zadd", "zs2", "1.1", string(510, 'a'), "-1.1", string(502, 'b')});

  Run({"hset", "large_keyname", string(240, 'X'), "-5"});
  Run({"hset", "large_keyname", string(240, 'Y'), "-500"});
  Run({"hset", "large_keyname", string(240, 'Z'), "-50000"});

  auto resp = Run({"debug", "reload"});
  ASSERT_EQ(resp, "OK");

  EXPECT_EQ(2, CheckedInt({"scard", "set_key1"}));
  EXPECT_EQ(3, CheckedInt({"scard", "intset_key"}));
  EXPECT_EQ(2, CheckedInt({"hlen", "small_hset"}));
  EXPECT_EQ(2, CheckedInt({"hlen", "large_hset"}));
  EXPECT_EQ(4, CheckedInt({"LLEN", "list_key2"}));
  EXPECT_EQ(2, CheckedInt({"ZCARD", "zs1"}));
  EXPECT_EQ(2, CheckedInt({"ZCARD", "zs2"}));

  EXPECT_EQ(-5, CheckedInt({"hget", "large_keyname", string(240, 'X')}));
  EXPECT_EQ(-500, CheckedInt({"hget", "large_keyname", string(240, 'Y')}));
  EXPECT_EQ(-50000, CheckedInt({"hget", "large_keyname", string(240, 'Z')}));
}

TEST_F(RdbTest, ReloadTtl) {
  Run({"set", "key", "val"});
  Run({"expire", "key", "1000"});
  Run({"debug", "reload"});
  EXPECT_LT(990, CheckedInt({"ttl", "key"}));
}

TEST_F(RdbTest, ReloadExpired) {
  Run({"set", "key", "val"});
  Run({"expire", "key", "2"});
  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");
  auto save_info = service_->server_family().GetLastSaveInfo();
  AdvanceTime(2000);
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");
  resp = Run({"get", "key"});
  ASSERT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(RdbTest, HashmapExpiry) {
  // Add non-expiring elements
  Run({"hset", "key", "key1", "val1", "key2", "val2"});
  Run({"debug", "reload"});
  EXPECT_THAT(Run({"hgetall", "key"}),
              RespArray(UnorderedElementsAre("key1", "val1", "key2", "val2")));

  // Add expiring elements
  Run({"hsetex", "key", "5", "key3", "val3", "key4", "val4"});
  Run({"debug", "reload"});  // Reload before expiration
  EXPECT_THAT(Run({"hgetall", "key"}),
              RespArray(UnorderedElementsAre("key1", "val1", "key2", "val2", "key3", "val3", "key4",
                                             "val4")));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"hgetall", "key"}),
              RespArray(UnorderedElementsAre("key1", "val1", "key2", "val2")));

  Run({"hsetex", "key", "5", "key5", "val5", "key6", "val6"});
  EXPECT_THAT(Run({"hgetall", "key"}),
              RespArray(UnorderedElementsAre("key1", "val1", "key2", "val2", "key5", "val5", "key6",
                                             "val6")));
  AdvanceTime(10'000);
  Run({"debug", "reload"});  // Reload after expiration
  EXPECT_THAT(Run({"hgetall", "key"}),
              RespArray(UnorderedElementsAre("key1", "val1", "key2", "val2")));
}

TEST_F(RdbTest, SaveLoadExpiredValuesHmap) {
  // Add expiring elements
  Run({"hsetex", "hkey", "1", "key3", "val3", "key4", "val4"});

  RespExpr resp = Run({"TYPE", "hkey"});
  ASSERT_EQ(resp, "hash");

  AdvanceTime(10'000);
  resp = Run({"save", "RDB"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"TYPE", "hkey"});
  ASSERT_EQ(resp, "hash");

  Run({"debug", "reload"});

  resp = Run({"TYPE", "hkey"});
  ASSERT_EQ(resp, "none");
}

TEST_F(RdbTest, SaveLoadExpiredValuesHugeHmap) {
  constexpr auto keys_num = 10000;
  for (int i = 0; i < keys_num; ++i) {
    Run({"hsetex", "hkey", "1", absl::StrCat("key", i), "val"});
  }

  ASSERT_EQ(keys_num, CheckedInt({"hlen", "hkey"}));

  AdvanceTime(10'000);

  Run({"debug", "reload"});

  ASSERT_EQ(Run({"TYPE", "hkey"}), "none");

  // with one value that isn't expired
  for (int i = 0; i < keys_num; ++i) {
    Run({"hsetex", "hkey", "1", absl::StrCat("key", i), "val"});
  }

  Run({"hset", "hkey", base::RandStr(20), "val"});

  ASSERT_EQ(keys_num + 1, CheckedInt({"hlen", "hkey"}));

  AdvanceTime(10'000);

  Run({"debug", "reload"});

  ASSERT_EQ(1, CheckedInt({"hlen", "hkey"}));
}

TEST_F(RdbTest, SaveLoadExpiredValuesSSet) {
  // Add expiring elements
  Run({"saddex", "skey", "1", "key3", "key4"});

  RespExpr resp = Run({"TYPE", "skey"});
  ASSERT_EQ(resp, "set");

  AdvanceTime(10'000);
  resp = Run({"save", "RDB"});
  ASSERT_EQ(resp, "OK");

  resp = Run({"TYPE", "skey"});
  ASSERT_EQ(resp, "set");

  Run({"debug", "reload"});

  resp = Run({"TYPE", "skey"});
  ASSERT_EQ(resp, "none");
}

TEST_F(RdbTest, SaveLoadExpiredValuesHugeSet) {
  constexpr auto keys_num = 10000;
  for (int i = 0; i < keys_num; ++i) {
    Run({"saddex", "skey", "1", absl::StrCat("key", i)});
  }

  ASSERT_EQ(keys_num, CheckedInt({"scard", "skey"}));

  AdvanceTime(10'000);

  Run({"debug", "reload"});

  ASSERT_EQ(Run({"TYPE", "skey"}), "none");

  // with one value that isn't expired
  for (int i = 0; i < keys_num; ++i) {
    Run({"saddex", "skey", "1", absl::StrCat("key", i)});
  }
  Run({"sadd", "skey", base::RandStr(20)});

  ASSERT_EQ(keys_num + 1, CheckedInt({"scard", "skey"}));

  AdvanceTime(10'000);

  Run({"debug", "reload"});

  ASSERT_EQ(1, CheckedInt({"scard", "skey"}));
}

TEST_F(RdbTest, SetExpiry) {
  // Add non-expiring elements
  Run({"sadd", "key", "key1", "key2"});
  Run({"debug", "reload"});
  EXPECT_THAT(Run({"smembers", "key"}), RespArray(UnorderedElementsAre("key1", "key2")));

  // Add expiring elements
  Run({"saddex", "key", "5", "key3", "key4"});
  Run({"debug", "reload"});  // Reload before expiration
  EXPECT_THAT(Run({"smembers", "key"}),
              RespArray(UnorderedElementsAre("key1", "key2", "key3", "key4")));
  AdvanceTime(10'000);
  EXPECT_THAT(Run({"smembers", "key"}), RespArray(UnorderedElementsAre("key1", "key2")));

  Run({"saddex", "key", "5", "key5", "key6"});
  EXPECT_THAT(Run({"smembers", "key"}),
              RespArray(UnorderedElementsAre("key1", "key2", "key5", "key6")));
  AdvanceTime(10'000);
  Run({"debug", "reload"});  // Reload after expiration
  EXPECT_THAT(Run({"smembers", "key"}), RespArray(UnorderedElementsAre("key1", "key2")));
}

// Tests that integer elements in sets with expiry are not corrupted during RDB load.
// This test covers the bug where ToSV() internal buffer was being reused,
// causing string corruption when loading integer elements.
TEST_F(RdbTest, SetExpiryInteger) {
  // Add integer elements with expiry - integers trigger ToSV() buffer reuse
  Run({"saddex", "s1", "10", "1", "2", "3", "12345", "67890"});

  // Verify elements are added correctly
  EXPECT_EQ(5, CheckedInt({"scard", "s1"}));
  EXPECT_THAT(Run({"smembers", "s1"}),
              RespArray(UnorderedElementsAre("1", "2", "3", "12345", "67890")));

  // Reload from RDB - this would trigger the corruption bug
  Run({"debug", "reload"});

  // Verify integers were loaded correctly without corruption
  EXPECT_EQ(5, CheckedInt({"scard", "s1"}));
  EXPECT_THAT(Run({"smembers", "s1"}),
              RespArray(UnorderedElementsAre("1", "2", "3", "12345", "67890")));

  // Verify all elements are actually in the set (no duplicates from corruption)
  EXPECT_THAT(Run({"sismember", "s1", "1"}), IntArg(1));
  EXPECT_THAT(Run({"sismember", "s1", "2"}), IntArg(1));
  EXPECT_THAT(Run({"sismember", "s1", "3"}), IntArg(1));
  EXPECT_THAT(Run({"sismember", "s1", "12345"}), IntArg(1));
  EXPECT_THAT(Run({"sismember", "s1", "67890"}), IntArg(1));
}

TEST_F(RdbTest, SaveFlush) {
  Run({"debug", "populate", "500000"});

  auto save_fb = pp_->at(1)->LaunchFiber([&] {
    RespExpr resp = Run({"save"});
    ASSERT_EQ(resp, "OK");
  });

  do {
    usleep(10);
  } while (!service_->server_family().TEST_IsSaving());

  Run({"flushdb"});
  save_fb.Join();
  auto save_info = service_->server_family().GetLastSaveInfo();
  ASSERT_EQ(1, save_info.freq_map.size());
  auto& k_v = save_info.freq_map.front();
  EXPECT_EQ("string", k_v.first);
  EXPECT_EQ(500000, k_v.second);
}

TEST_F(RdbTest, SaveManyDbs) {
  Run({"debug", "populate", "50000"});
  pp_->at(1)->Await([&] {
    Run({"select", "1"});
    Run({"debug", "populate", "10000"});
  });

  auto metrics = GetMetrics();
  ASSERT_EQ(2, metrics.db_stats.size());
  EXPECT_EQ(50000, metrics.db_stats[0].key_count);
  EXPECT_EQ(10000, metrics.db_stats[1].key_count);

  auto save_fb = pp_->at(0)->LaunchFiber([&] {
    RespExpr resp = Run({"save"});
    ASSERT_EQ(resp, "OK");
  });

  do {
    usleep(10);
  } while (!service_->server_family().TEST_IsSaving());

  pp_->at(1)->Await([&] {
    Run({"select", "1"});
    for (unsigned i = 0; i < 1000; ++i) {
      Run({"set", StrCat("abc", i), "bar"});
    }
  });

  save_fb.Join();

  auto save_info = service_->server_family().GetLastSaveInfo();
  ASSERT_EQ(1, save_info.freq_map.size());
  auto& k_v = save_info.freq_map.front();

  EXPECT_EQ("string", k_v.first);
  EXPECT_EQ(60000, k_v.second);
  auto resp = Run({"debug", "reload", "NOSAVE"});
  EXPECT_EQ(resp, "OK");

  metrics = GetMetrics();
  ASSERT_EQ(2, metrics.db_stats.size());
  EXPECT_EQ(50000, metrics.db_stats[0].key_count);
  EXPECT_EQ(10000, metrics.db_stats[1].key_count);
  if (metrics.db_stats[1].key_count != 10000) {
    Run({"select", "1"});
    resp = Run({"scan", "0", "match", "ab*"});
    StringVec vec = StrArray(resp.GetVec()[1]);
    for (const auto& s : vec) {
      LOG(ERROR) << "Bad key: " << s;
    }
  }
}

TEST_F(RdbTest, HMapBugs) {
  // Force kEncodingStrMap2 encoding.
  server.max_map_field_len = 0;
  Run({"hset", "hmap1", "key1", "val", "key2", "val2"});
  Run({"hset", "hmap2", "key1", string(690557, 'a')});

  server.max_map_field_len = 32;
  Run({"debug", "reload"});
  EXPECT_EQ(2, CheckedInt({"hlen", "hmap1"}));
}

TEST_F(RdbTest, Issue1305) {
  /***************
   * The code below crashes because of the weird listpack API that assumes that lpInsert
   * pointers are null then it should do deletion :(. See lpInsert comments for more info.

     uint8_t* lp = lpNew(128);
     lpAppend(lp, NULL, 0);
     lpFree(lp);

  */

  // Force kEncodingStrMap2 encoding.
  server.max_map_field_len = 0;
  Run({"hset", "hmap", "key1", "val", "key2", ""});

  server.max_map_field_len = 32;
  Run({"debug", "reload"});
  EXPECT_EQ(2, CheckedInt({"hlen", "hmap"}));
}

TEST_F(RdbTest, JsonTest) {
  string_view data[] = {
      R"({"a":1})"sv,                          //
      R"([1,2,3,4,5,6])"sv,                    //
      R"({"a":1.0,"b":[1,2],"c":"value"})"sv,  //
      R"({"a":{"a":{"a":{"a":1}}}})"sv         //
  };

  for (auto test : data) {
    Run({"json.set", "doc", "$", test});
    auto dump = Run({"dump", "doc"});
    Run({"del", "doc"});
    Run({"restore", "doc", "0", facade::ToSV(dump.GetBuf())});
    auto res = Run({"json.get", "doc"});
    ASSERT_EQ(res, test);
  }
}

// hll.rdb has 2 keys: "key-dense" and "key-sparse", both are HLL with a single added value "1".
class HllRdbTest : public RdbTest, public testing::WithParamInterface<string> {};

TEST_P(HllRdbTest, Hll) {
  LOG(INFO) << " max memory: " << max_memory_limit
            << " used_mem_current: " << used_mem_current.load();
  auto ec = LoadRdb("hll.rdb");

  ASSERT_FALSE(ec) << ec.message();

  EXPECT_EQ(CheckedInt({"pfcount", GetParam()}), 1);

  EXPECT_EQ(CheckedInt({"pfcount", GetParam(), "non-existing"}), 1);

  EXPECT_EQ(CheckedInt({"pfadd", "key2", "2"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", GetParam(), "key2"}), 2);

  EXPECT_EQ(CheckedInt({"pfadd", GetParam(), "2"}), 1);
  EXPECT_EQ(CheckedInt({"pfcount", GetParam()}), 2);

  EXPECT_EQ(Run({"pfmerge", "key3", GetParam(), "key2"}), "OK");
  EXPECT_EQ(CheckedInt({"pfcount", "key3"}), 2);
}

INSTANTIATE_TEST_SUITE_P(HllRdbTest, HllRdbTest, Values("key-sparse", "key-dense"));

TEST_F(RdbTest, LoadSmall7) {
  // Contains 3 keys
  // 1. A list called my-list encoded as RDB_TYPE_LIST_QUICKLIST_2
  // 2. A hashtable called my-hset encoded as RDB_TYPE_HASH_LISTPACK
  // 3. A set called my-set encoded as RDB_TYPE_SET_LISTPACK
  // 4. A zset called my-zset encoded as RDB_TYPE_ZSET_LISTPACK
  auto ec = LoadRdb("redis7_small.rdb");

  ASSERT_FALSE(ec) << ec.message();

  auto resp = Run({"scan", "0"});

  ASSERT_THAT(resp, ArrLen(2));

  EXPECT_THAT(StrArray(resp.GetVec()[1]),
              UnorderedElementsAre("my-set", "my-hset", "my-list", "zset"));

  resp = Run({"smembers", "my-set"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("redis", "acme"));

  resp = Run({"hgetall", "my-hset"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("acme", "44", "field", "22"));

  resp = Run({"lrange", "my-list", "0", "-1"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("list1", "list2"));

  resp = Run({"zrange", "zset", "0", "-1"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("einstein", "schrodinger"));
}

TEST_F(RdbTest, RedisJson) {
  // RDB file generated via:
  // ./redis-server --save "" --appendonly no --loadmodule ../lib/rejson.so
  // and then:
  // JSON.SET json-str $ '"hello"'
  // JSON.SET json-arr $ "[1, true, \"hello\", 3.14]"
  // JSON.SET json-obj $
  // '{"company":"DragonflyDB","product":"Dragonfly","website":"https://dragondlydb.io","years-active":[2021,2022,2023,2024,"and
  // more!"]}'
  auto ec = LoadRdb("redis_json.rdb");

  ASSERT_FALSE(ec) << ec.message();

  EXPECT_EQ(Run({"JSON.GET", "json-str"}), "\"hello\"");
  EXPECT_EQ(Run({"JSON.GET", "json-arr"}), "[1,true,\"hello\",3.14]");
  EXPECT_EQ(Run({"JSON.GET", "json-obj"}),
            "{\"company\":\"DragonflyDB\",\"product\":\"Dragonfly\",\"website\":\"https://"
            "dragondlydb.io\",\"years-active\":[2021,2022,2023,2024,\"and more!\"]}");
}

TEST_F(RdbTest, SBF) {
  EXPECT_THAT(Run({"BF.ADD", "k", "1"}), IntArg(1));
  Run({"debug", "reload"});
  EXPECT_EQ(Run({"type", "k"}), "MBbloom--");
  EXPECT_THAT(Run({"BF.EXISTS", "k", "1"}), IntArg(1));
}

TEST_F(RdbTest, SBFLargeFilterChunking) {
  absl::SetFlag(&FLAGS_rdb_sbf_chunked, true);
  max_memory_limit = 200000000;

  // Using this set of parameters for the BF.RESERVE command resulted in a
  // filter size large enough to require chunking (> 64 MB).
  const double error_rate = 0.001;
  const size_t capacity = 50'000'000;
  const size_t num_items = 100;

  size_t collisions = 0;

  Run({"BF.RESERVE", "large_key", std::to_string(error_rate), std::to_string(capacity)});
  for (size_t i = 0; i < num_items; i++) {
    auto res = Run({"BF.ADD", "large_key", absl::StrCat("item", i)});
    if (*res.GetInt() == 0)
      collisions++;
  }
  EXPECT_LT(static_cast<double>(collisions) / num_items, error_rate);

  Run({"debug", "reload"});
  EXPECT_EQ(Run({"type", "large_key"}), "MBbloom--");

  for (size_t i = 0; i < num_items; i++) {
    EXPECT_THAT(Run({"BF.EXISTS", "large_key", absl::StrCat("item", i)}), IntArg(1));
  }
}

TEST_F(RdbTest, RestoreSearchIndexNameStartingWithColon) {
  // Create an index with a name that starts with ':' and add a sample document
  EXPECT_EQ(Run({"FT.CREATE", ":Order:index", "ON", "HASH", "PREFIX", "1", ":Order:", "SCHEMA",
                 "customer_name", "AS", "customer_name", "TEXT", "status", "AS", "status", "TAG"}),
            "OK");

  EXPECT_THAT(Run({"HSET", ":Order:1", "customer_name", "John", "status", "new"}), IntArg(2));

  // Save and reload to ensure the index definition is persisted and restored
  EXPECT_EQ(Run({"save", "df"}), "OK");
  EXPECT_EQ(Run({"debug", "reload"}), "OK");

  // Verify a basic search works on the restored index
  auto search = Run({"FT.SEARCH", ":Order:index", "John"});
  ASSERT_THAT(search, ArgType(RespExpr::ARRAY));
  const auto& v = search.GetVec();
  ASSERT_FALSE(v.empty());
  EXPECT_THAT(v.front(), IntArg(1));
}

// Parametrized test for RestoreVectorSearchIndexHnsw with varying document counts
class HnswRestoreTest : public RdbTest, public testing::WithParamInterface<int> {};

TEST_P(HnswRestoreTest, RestoreVectorSearchIndexHnsw) {
  int num_docs = GetParam();

  EXPECT_EQ(
      Run({"FT.CREATE", "only_vec_idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "embedding",
           "VECTOR", "HNSW", "6", "TYPE", "FLOAT32", "DIM", "2", "DISTANCE_METRIC", "L2"}),
      "OK");

  EXPECT_EQ(Run({"FT.CREATE", "vec_idx", "ON",   "HASH",      "PREFIX",          "1",    "doc:",
                 "SCHEMA",    "name",    "TEXT", "embedding", "VECTOR",          "HNSW", "6",
                 "TYPE",      "FLOAT32", "DIM",  "2",         "DISTANCE_METRIC", "L2"}),
            "OK");

  // Insert documents with incrementing vectors
  for (int i = 1; i <= num_docs; ++i) {
    float x = static_cast<float>(i * 2 - 1);
    float y = static_cast<float>(i * 2);
    Run({"HSET", StrCat("doc:", i), "name", StrCat("doc", i), "embedding",
         StrCat(FloatToBytes(x), FloatToBytes(y))});
  }

  LOG(INFO) << "Created " << num_docs << " documents with vector embeddings";

  EXPECT_EQ(Run({"save", "df"}), "OK");
  auto save_info = service_->server_family().GetLastSaveInfo();

  // Reload from the saved file - this should restore the HNSW index, not rebuild it
  // Look for "Restored HNSW index" in logs to verify restoration vs rebuild
  LOG(INFO) << "Reloading from " << save_info.file_name << " - expecting HNSW index restoration";
  EXPECT_EQ(Run({"dfly", "load", save_info.file_name}), "OK");

  // Wait for async index building to complete on both indices
  auto is_indexing_done = [this](string_view idx_name) {
    auto resp = Run({"FT.INFO", idx_name});
    auto arr = resp.GetVec();
    auto it = std::find_if(arr.begin(), arr.end(), [](const auto& e) { return e == "indexing"; });
    return it != arr.end() && (++it)->GetInt() == 0;
  };

  ASSERT_TRUE(WaitUntilCondition([&] { return is_indexing_done("vec_idx"); },
                                 std::chrono::milliseconds(10000)));
  ASSERT_TRUE(WaitUntilCondition([&] { return is_indexing_done("only_vec_idx"); },
                                 std::chrono::milliseconds(10000)));

  // Verify text search still works on the restored index
  auto search = Run({"FT.SEARCH", "vec_idx", "doc1"});
  ASSERT_THAT(search, ArgType(RespExpr::ARRAY));
  const auto& v = search.GetVec();
  ASSERT_FALSE(v.empty());
  EXPECT_THAT(v.front(), IntArg(1));

  // Verify KNN vector search works on the restored index
  // Query vector close to (1.0, 2.0) should find doc:1 as nearest
  string query_vec = StrCat(FloatToBytes(1.1f), FloatToBytes(2.1f));
  auto knn_search = Run({"FT.SEARCH", "vec_idx", "*=>[KNN 2 @embedding $vec]", "PARAMS", "2", "vec",
                         query_vec, "RETURN", "1", "name"});
  ASSERT_THAT(knn_search, ArgType(RespExpr::ARRAY));
  EXPECT_GE(knn_search.GetVec().front().GetInt(), 1);

  // The same check for another index with only vector field
  knn_search = Run({"FT.SEARCH", "only_vec_idx", "*=>[KNN 2 @embedding $vec]", "PARAMS", "2", "vec",
                    query_vec, "RETURN", "1", "name"});
  ASSERT_THAT(knn_search, ArgType(RespExpr::ARRAY));
  EXPECT_GE(knn_search.GetVec().front().GetInt(), 1);

  // Verify total document count matches
  EXPECT_EQ(CheckedInt({"dbsize"}), num_docs);

  LOG(INFO) << "Successfully verified HNSW index restoration with " << num_docs << " documents";
}

INSTANTIATE_TEST_SUITE_P(HnswRestoreTest, HnswRestoreTest, Values(5, 50, 500, 1000),
                         [](const testing::TestParamInfo<int>& info) {
                           return StrCat("Docs", info.param);
                         });

TEST_F(RdbTest, DflyLoadAppend) {
  // Create an RDB with (k1,1) value in it saved as `filename`
  EXPECT_EQ(Run({"set", "k1", "1"}), "OK");
  EXPECT_EQ(Run({"save", "df"}), "OK");
  string filename = service_->server_family().GetLastSaveInfo().file_name;

  // Without APPEND option - db should be flushed
  EXPECT_EQ(Run({"set", "k1", "TO-BE-FLUSHED"}), "OK");
  EXPECT_EQ(Run({"set", "k2", "TO-BE-FLUSHED"}), "OK");
  EXPECT_EQ(Run({"dfly", "load", filename}), "OK");
  EXPECT_THAT(Run({"dbsize"}), IntArg(1));
  EXPECT_EQ(Run({"get", "k1"}), "1");

  // With APPEND option - db shouldn't be flushed, but k1 should be overridden
  EXPECT_EQ(Run({"set", "k1", "TO-BE-OVERRIDDEN"}), "OK");
  EXPECT_EQ(Run({"set", "k2", "2"}), "OK");
  EXPECT_EQ(Run({"dfly", "load", filename, "append"}), "OK");
  EXPECT_THAT(Run({"dbsize"}), IntArg(2));
  EXPECT_EQ(Run({"get", "k1"}), "1");
  EXPECT_EQ(Run({"get", "k2"}), "2");
}

// Tests loading a huge set, where the set is loaded in multiple partial reads.
TEST_F(RdbTest, LoadHugeSet) {
  // Add 2 sets with 100k elements each (note must have more than kMaxBlobLen
  // elements to test partial reads).
  Run({"debug", "populate", "2", "test", "100", "rand", "type", "set", "elements", "100000"});
  ASSERT_EQ(100000, CheckedInt({"scard", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"scard", "test:1"}));

  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");

  ASSERT_EQ(100000, CheckedInt({"scard", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"scard", "test:1"}));
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.db_stats[0].obj_memory_usage, 24'000'000u);
}

// Tests loading a huge hmap, where the map is loaded in multiple partial
// reads.
TEST_F(RdbTest, LoadHugeHMap) {
  // Add 2 sets with 100k elements each (note must have more than kMaxBlobLen
  // elements to test partial reads).
  Run({"debug", "populate", "2", "test", "100", "rand", "type", "hash", "elements", "100000"});
  ASSERT_EQ(100000, CheckedInt({"hlen", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"hlen", "test:1"}));

  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");

  ASSERT_EQ(100000, CheckedInt({"hlen", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"hlen", "test:1"}));
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.db_stats[0].obj_memory_usage, 29'000'000u);
}

// Tests loading a huge zset, where the zset is loaded in multiple partial
// reads.
TEST_F(RdbTest, LoadHugeZSet) {
  // Add 2 sets with 100k elements each (note must have more than kMaxBlobLen
  // elements to test partial reads).
  Run({"debug", "populate", "2", "test", "100", "rand", "type", "zset", "elements", "100000"});
  ASSERT_EQ(100000, CheckedInt({"zcard", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"zcard", "test:1"}));

  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");

  ASSERT_EQ(100000, CheckedInt({"zcard", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"zcard", "test:1"}));
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.db_stats[0].obj_memory_usage, 26'000'000u);
}

// Tests loading a huge list, where the list is loaded in multiple partial
// reads.
TEST_F(RdbTest, LoadHugeList) {
  // Add 2 lists with 100k elements each (note must have more than 512*8Kb
  // elements to test partial reads).
  Run({"debug", "populate", "2", "test", "100", "rand", "type", "list", "elements", "100000"});
  ASSERT_EQ(100000, CheckedInt({"llen", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"llen", "test:1"}));

  RespExpr resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");

  ASSERT_EQ(100000, CheckedInt({"llen", "test:0"}));
  ASSERT_EQ(100000, CheckedInt({"llen", "test:1"}));
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.db_stats[0].obj_memory_usage, 20'000'000u);
}

// Tests loading a huge stream, where the stream is loaded in multiple partial
// reads.
TEST_F(RdbTest, LoadHugeStream) {
  TEST_current_time_ms = 1000;

  // Add a huge stream (test:0) with 2000 entries, and 4 1k elements per entry
  // (note must be more than 512*4kb elements to test partial reads).
  // We add 2000 entries to the stream to ensure that the stream, because populate stream
  // adds only a single entry at a time, with multiple elements in it.

  Run({"debug", "populate", "1", "test", "2000", "rand", "type", "stream", "elements", "8000"});

  ASSERT_EQ(2000, CheckedInt({"xlen", "test:0"}));
  Run({"XGROUP", "CREATE", "test:0", "grp1", "0"});
  Run({"XGROUP", "CREATE", "test:0", "grp2", "0"});
  Run({"XREADGROUP", "GROUP", "grp1", "Alice", "COUNT", "1", "STREAMS", "test:0", ">"});
  Run({"XREADGROUP", "GROUP", "grp2", "Alice", "COUNT", "1", "STREAMS", "test:0", ">"});

  auto resp = Run({"xinfo", "stream", "test:0"});

  EXPECT_THAT(
      resp, RespElementsAre("length", 2000, "radix-tree-keys", 2000, "radix-tree-nodes", 2010,
                            "last-generated-id", "1000-1999", "max-deleted-entry-id", "0-0",
                            "entries-added", 2000, "recorded-first-entry-id", "1000-0", "groups", 2,
                            "first-entry", ArrLen(2), "last-entry", ArrLen(2)));

  resp = Run({"save", "df"});
  ASSERT_EQ(resp, "OK");

  auto save_info = service_->server_family().GetLastSaveInfo();
  resp = Run({"dfly", "load", save_info.file_name});
  ASSERT_EQ(resp, "OK");

  ASSERT_EQ(2000, CheckedInt({"xlen", "test:0"}));
  resp = Run({"xinfo", "stream", "test:0"});
  EXPECT_THAT(
      resp, RespElementsAre("length", 2000, "radix-tree-keys", 2000, "radix-tree-nodes", 2010,
                            "last-generated-id", "1000-1999", "max-deleted-entry-id", "0-0",
                            "entries-added", 2000, "recorded-first-entry-id", "1000-0", "groups", 2,
                            "first-entry", ArrLen(2), "last-entry", ArrLen(2)));
  resp = Run({"xinfo", "groups", "test:0"});
  EXPECT_THAT(resp, RespElementsAre(RespElementsAre("name", "grp1", "consumers", 1, "pending", 1,
                                                    "last-delivered-id", "1000-0", "entries-read",
                                                    1, "lag", 1999),
                                    _));
}

TEST_F(RdbTest, LoadStream2) {
  auto ec = LoadRdb("RDB_TYPE_STREAM_LISTPACKS_2.rdb");
  ASSERT_FALSE(ec) << ec.message();
  auto res = Run({"XINFO", "STREAM", "mystream"});
  ASSERT_THAT(res.GetVec(),
              ElementsAre("length", 2, "radix-tree-keys", 1, "radix-tree-nodes", 2,
                          "last-generated-id", "1732613360686-0", "max-deleted-entry-id", "0-0",
                          "entries-added", 2, "recorded-first-entry-id", "1732613352350-0",
                          "groups", 1, "first-entry", RespElementsAre("1732613352350-0", _),
                          "last-entry", RespElementsAre("1732613360686-0", _)));
}

TEST_F(RdbTest, LoadStream3) {
  auto ec = LoadRdb("RDB_TYPE_STREAM_LISTPACKS_3.rdb");
  ASSERT_FALSE(ec) << ec.message();
  auto res = Run({"XINFO", "STREAM", "mystream"});
  ASSERT_THAT(
      res.GetVec(),
      ElementsAre("length", 2, "radix-tree-keys", 1, "radix-tree-nodes", 2, "last-generated-id",
                  "1732614679549-0", "max-deleted-entry-id", "0-0", "entries-added", 2,
                  "recorded-first-entry-id", "1732614676541-0", "groups", 1, "first-entry",
                  ArgType(RespExpr::ARRAY), "last-entry", ArgType(RespExpr::ARRAY)));
}

TEST_F(RdbTest, SnapshotTooBig) {
  // Run({"debug", "populate", "10000", "foo", "1000"});
  //  usleep(5000);  // let the stats to sync
  max_memory_limit = 100000;
  used_mem_current = 1000000;
  auto resp = Run({"debug", "reload"});
  ASSERT_THAT(resp, ErrArg("Out of memory"));
}

TEST_F(RdbTest, HugeKeyIssue4497) {
  SetTestFlag("cache_mode", "true");
  ResetService();

  EXPECT_EQ(Run({"flushall"}), "OK");
  EXPECT_EQ(Run({"debug", "populate", "1", "k", "1000", "rand", "type", "set", "elements", "5000"}),
            "OK");
  EXPECT_EQ(Run({"save", "rdb", "hugekey.rdb"}), "OK");
  EXPECT_EQ(Run({"dfly", "load", "hugekey.rdb"}), "OK");
  EXPECT_EQ(Run({"flushall"}), "OK");
}

TEST_F(RdbTest, HugeKeyIssue4554) {
  SetTestFlag("cache_mode", "true");
  // We need to stress one flow/shard such that the others finish early. Lock on hashtags allows
  // that.
  SetTestFlag("lock_on_hashtags", "true");
  ResetService();

  EXPECT_EQ(
      Run({"debug", "populate", "20", "{tmp}", "20", "rand", "type", "set", "elements", "10000"}),
      "OK");
  EXPECT_EQ(Run({"save", "df", "hugekey"}), "OK");
  EXPECT_EQ(Run({"dfly", "load", "hugekey-summary.dfs"}), "OK");
  EXPECT_EQ(Run({"flushall"}), "OK");
}

// ignore_expiry.rdb contains 2 keys which are expired keys
// this test case verifies wheather rdb_ignore_expiry flag is working as expected.
TEST_F(RdbTest, RDBIgnoreExpiryFlag) {
  absl::FlagSaver fs;

  SetTestFlag("rdb_ignore_expiry", "true");
  auto ec = LoadRdb("ignore_expiry.rdb");

  ASSERT_FALSE(ec) << ec.message();

  auto resp = Run({"scan", "0"});

  ASSERT_THAT(resp, ArrLen(2));

  EXPECT_THAT(StrArray(resp.GetVec()[1]), UnorderedElementsAre("test", "test2"));

  EXPECT_THAT(Run({"get", "test"}), "expkey");
  EXPECT_THAT(Run({"get", "test2"}), "expkey");

  int ttl = CheckedInt({"ttl", "test"});  // should ignore expiry for key
  EXPECT_EQ(ttl, -1);

  int ttl2 = CheckedInt({"ttl", "test2"});  // should ignore expiry for key
  EXPECT_EQ(ttl2, -1);
}

TEST_F(RdbTest, CmsSerialization) {
  Run("cms.initbydim cms 1000 5");
  Run("cms.incrby cms foo 5 bar 3 baz 9");

  auto resp = Run("cms.query cms foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3), IntArg(9))));

  Run("save df cms");
  Run("flushall");
  EXPECT_EQ(Run("dfly load cms-summary.dfs"), "OK");

  resp = Run("cms.query cms foo bar baz");
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(5), IntArg(3), IntArg(9))));
}

}  // namespace dfly


================================================
FILE: src/server/replica.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/replica.h"

#include <chrono>

#include "absl/strings/match.h"
#include "facade/service_interface.h"
#include "server/engine_shard.h"

extern "C" {
#include "redis/rdb.h"
}

#include <absl/cleanup/cleanup.h>
#include <absl/flags/flag.h>
#include <absl/functional/bind_front.h>
#include <absl/strings/escaping.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/strip.h>

#include <boost/asio/ip/tcp.hpp>
#include <memory>
#include <utility>

#include "base/logging.h"
#include "facade/redis_parser.h"
#include "facade/reply_capture.h"
#include "facade/socket_utils.h"
#include "server/error.h"
#include "server/journal/executor.h"
#include "server/journal/journal.h"
#include "server/journal/serializer.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/rdb_load.h"
#include "strings/human_readable.h"

#define LOG_REPL_ERROR(msg)                                         \
  do {                                                              \
    if (state_mask_ & R_ENABLED) {                                  \
      if ((state_mask_ & R_SYNCING) || (state_mask_ & R_SYNC_OK)) { \
        LOG(WARNING) << msg;                                        \
      } else {                                                      \
        LOG(ERROR) << msg;                                          \
      }                                                             \
    } else {                                                        \
      VLOG(1) << msg;                                               \
    }                                                               \
  } while (0)

ABSL_FLAG(int, replication_acks_interval, 1000, "Interval between acks in milliseconds.");
ABSL_FLAG(int, master_connect_timeout_ms, 20000,
          "Timeout for establishing connection to a replication master");
ABSL_FLAG(int, master_reconnect_timeout_ms, 1000,
          "Timeout for re-establishing connection to a replication master");
ABSL_FLAG(bool, replica_partial_sync, true,
          "Use partial sync to reconnect when a replica connection is interrupted.");
ABSL_FLAG(bool, break_replication_on_master_restart, false,
          "When in replica mode, and master restarts, break replication from master to avoid "
          "flushing the replica's data.");
ABSL_FLAG(std::string, replica_announce_ip, "",
          "IP address that Dragonfly announces to replication master");
ABSL_DECLARE_FLAG(int32_t, port);
ABSL_DECLARE_FLAG(uint16_t, announce_port);
ABSL_FLAG(
    int, replica_priority, 100,
    "Published by info command for sentinel to pick replica based on score during a failover");
ABSL_FLAG(bool, experimental_replicaof_v2, true,
          "Use ReplicaOfV2 algorithm for initiating replication");

namespace dfly {

using namespace std;
using namespace util;
using namespace facade;
using absl::StrCat;

namespace {

constexpr unsigned kRdbEofMarkSize = 40;

// Distribute flow indices over all available threads (shard_set pool size).
vector<vector<unsigned>> Partition(unsigned num_flows) {
  vector<vector<unsigned>> partition(shard_set->pool()->size());
  for (unsigned i = 0; i < num_flows; ++i) {
    partition[i % partition.size()].push_back(i);
  }
  return partition;
}

}  // namespace

Replica::Replica(string host, uint16_t port, Service* se, std::string_view id,
                 std::optional<cluster::SlotRange> slot_range)
    : ProtocolClient(std::move(host), port), service_(*se), id_{id}, slot_range_(slot_range) {
  proactor_ = ProactorBase::me();
}

Replica::~Replica() {
  sync_fb_.JoinIfNeeded();
  acks_fb_.JoinIfNeeded();
}

static const char kConnErr[] = "could not connect to master: ";

GenericError Replica::Start() {
  VLOG(1) << "Starting replication " << this;
  ProactorBase* mythread = ProactorBase::me();
  CHECK(mythread);
  DCHECK(proactor_ == mythread);

  auto check_connection_error = [this](error_code ec, const char* msg) -> GenericError {
    if (!exec_st_.IsRunning()) {
      CloseSocket();
      return {"replication cancelled"};
    }
    if (ec) {
      CloseSocket();
      exec_st_.ReportCancelError();
      return {absl::StrCat(msg, ec.message())};
    }
    return ec;
  };

  // 0. Set basic error handler that is reponsible for cleaning up on errors.
  // Can return an error only if replication was cancelled immediately.
  auto err = exec_st_.SwitchErrorHandler([this](const auto& ge) { this->DefaultErrorHandler(ge); });
  RETURN_ON_GENERIC_ERR(check_connection_error(err, "replication cancelled"));

  // 1. Resolve dns.
  VLOG(1) << "Resolving master DNS";
  error_code ec = ResolveHostDns();
  RETURN_ON_GENERIC_ERR(check_connection_error(ec, "could not resolve master dns"));

  // 2. Connect socket.
  VLOG(1) << "Connecting to master";
  ec = ConnectAndAuth(absl::GetFlag(FLAGS_master_connect_timeout_ms) * 1ms, &exec_st_);
  RETURN_ON_GENERIC_ERR(check_connection_error(ec, kConnErr));

  // 3. Greet.
  VLOG(1) << "Greeting";
  state_mask_ = R_ENABLED | R_TCP_CONNECTED;
  ec = Greet();
  RETURN_ON_ERR(check_connection_error(ec, "could not greet master "));

  return {};
}

void Replica::StartMainReplicationFiber(std::optional<LastMasterSyncData> last_master_sync_data) {
  sync_fb_ = fb2::Fiber("main_replication", &Replica::MainReplicationFb, this,
                        std::move(last_master_sync_data));
}

void Replica::EnableReplication() {
  VLOG(1) << "Enabling replication";

  state_mask_ = R_ENABLED;                                           // set replica state to enabled
  sync_fb_ = MakeFiber(&Replica::MainReplicationFb, this, nullopt);  // call replication fiber
}

std::optional<Replica::LastMasterSyncData> Replica::Stop() {
  VLOG(1) << "Stopping replication " << this;
  // Stops the loop in MainReplicationFb.

  proactor_->Await([this] {
    state_mask_ = 0;               // Specifically ~R_ENABLED.
    exec_st_.ReportCancelError();  // Context is fully resposible for cleanup.
  });

  // Make sure the replica fully stopped and did all cleanup,
  // so we can freely release resources (connections).
  sync_fb_.JoinIfNeeded();
  CloseSocket();
  DVLOG(1) << "MainReplicationFb stopped " << this;
  acks_fb_.JoinIfNeeded();
  for (auto& flow : shard_flows_) {
    flow.reset();
  }

  if (last_journal_LSNs_.has_value()) {
    return LastMasterSyncData{master_context_.master_repl_id, last_journal_LSNs_.value()};
  }
  return nullopt;
}

void Replica::Pause(bool pause) {
  VLOG(1) << "Pausing replication";
  Proactor()->Await([&] {
    is_paused_ = pause;
    if (shard_flows_.empty())
      return;

    auto cb = [&](unsigned index, auto*) {
      for (auto id : thread_flow_map_[index]) {
        shard_flows_[id]->Pause(pause);
      }
    };
    shard_set->pool()->AwaitBrief(cb);
  });
}

std::error_code Replica::TakeOver(unsigned timeout_sec, bool save_flag) {
  VLOG(1) << "Taking over " << timeout_sec << " seconds, save_flag=" << save_flag;

  std::error_code ec;
  auto takeOverCmd = absl::StrCat("TAKEOVER ", timeout_sec, (save_flag ? " SAVE" : ""));
  Proactor()->Await([this, &ec, cmd = std::move(takeOverCmd), timeout_sec] {
    // Set socket timeout to prevent hanging on unresponsive master
    // Add buffer time for master processing (timeout + 10 seconds)
    auto prev_timeout = Sock()->timeout();
    Sock()->set_timeout((timeout_sec + 10) * 1000);  // milliseconds

    ec = SendNextPhaseRequest(cmd);

    Sock()->set_timeout(prev_timeout);
  });

  // If we successfully taken over, return and let server_family stop the replication.
  return ec;
}

void Replica::MainReplicationFb(std::optional<LastMasterSyncData> last_master_sync_data) {
  VLOG(1) << "Main replication fiber started " << this;
  // Switch shard states to replication.
  SetShardStates(true);

  error_code ec;
  while (state_mask_ & R_ENABLED) {
    // Discard all previous errors and set default error handler.
    exec_st_.Reset([this](const GenericError& ge) { this->DefaultErrorHandler(ge); });
    // 1. Connect socket.
    if ((state_mask_ & R_TCP_CONNECTED) == 0) {
      ThisFiber::SleepFor(500ms);
      if (is_paused_)
        continue;

      ec = ResolveHostDns();
      if (ec) {
        LOG(ERROR) << "Error resolving dns to " << server().host << " (phase: " << GetCurrentPhase()
                   << "): " << ec;
        continue;
      }

      // Give a lower timeout for connect, because we're
      reconnect_count_++;
      ec = ConnectAndAuth(absl::GetFlag(FLAGS_master_reconnect_timeout_ms) * 1ms, &exec_st_);
      if (ec) {
        LOG(WARNING) << "Error connecting to " << server().Description()
                     << " (phase: " << GetCurrentPhase() << "): " << ec
                     << ", reason: " << ec.message();
        continue;
      }
      VLOG(1) << "Replica socket connected";
      state_mask_ |= R_TCP_CONNECTED;
      continue;
    }

    DCHECK(Proactor() == proactor_);

    // 2. Greet.
    if ((state_mask_ & R_GREETED) == 0) {
      ec = Greet();
      if (ec) {
        LOG(WARNING) << "Error greeting " << server().Description()
                     << " (phase: " << GetCurrentPhase() << "): " << ec << " " << ec.message()
                     << ", socket state: " + GetSocketInfo(Sock()->native_handle());
        state_mask_ &= R_ENABLED;
        continue;
      }
      state_mask_ |= R_GREETED;
      continue;
    }

    // 3. Initiate full sync
    if ((state_mask_ & R_SYNC_OK) == 0) {
      if (HasDflyMaster()) {
        ec = InitiateDflySync(std::exchange(last_master_sync_data, nullopt));
      } else
        ec = InitiatePSync();

      if (ec) {
        LOG(WARNING) << "Error syncing with " << server().Description()
                     << " (phase: " << GetCurrentPhase() << "): " << ec << " " << ec.message()
                     << ", socket state: " + GetSocketInfo(Sock()->native_handle());
        state_mask_ &= R_ENABLED;  // reset all flags besides R_ENABLED
        continue;
      }
      state_mask_ |= R_SYNC_OK;
      continue;
    }

    // 4. Start stable state sync.
    DCHECK(state_mask_ & R_SYNC_OK);

    if (HasDflyMaster())
      ec = ConsumeDflyStream();
    else
      ec = ConsumeRedisStream();

    state_mask_ &= R_ENABLED;
    if (state_mask_ & R_ENABLED) {  // replication was not stopped.
      LOG(WARNING) << "Error stable sync with " << server().Description()
                   << " (phase: " << GetCurrentPhase() << "): " << ec << " " << ec.message()
                   << ", socket state: " + GetSocketInfo(Sock()->native_handle());
    }
  }

  // Wait for unblocking cleanup to finish.
  exec_st_.JoinErrorHandler();

  // Revert shard states to normal state.
  SetShardStates(false);

  VLOG(1) << "Main replication fiber finished";
}

error_code Replica::Greet() {
  ResetParser(RedisParser::Mode::CLIENT);
  VLOG(1) << "greeting message handling";
  // Corresponds to server.repl_state == REPL_STATE_CONNECTING state in redis
  RETURN_ON_ERR(SendCommandAndReadResponse("PING"));  // optional.
  PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("PONG"));

  // Corresponds to server.repl_state == REPL_STATE_SEND_HANDSHAKE condition in replication.c
  uint16_t port = absl::GetFlag(FLAGS_announce_port);
  if (port == 0) {
    port = static_cast<uint16_t>(absl::GetFlag(FLAGS_port));
  }
  RETURN_ON_ERR(SendCommandAndReadResponse(StrCat("REPLCONF listening-port ", port)));
  PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));

  auto announce_ip = absl::GetFlag(FLAGS_replica_announce_ip);
  if (!announce_ip.empty()) {
    RETURN_ON_ERR(SendCommandAndReadResponse(StrCat("REPLCONF ip-address ", announce_ip)));
    LOG_IF(WARNING, !CheckRespIsSimpleReply("OK"))
        << "Master did not OK announced IP address, perhaps it is using an old version";
  }

  // Corresponds to server.repl_state == REPL_STATE_SEND_CAPA
  RETURN_ON_ERR(SendCommandAndReadResponse("REPLCONF capa eof capa psync2"));
  PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));

  // Announce that we are the dragonfly client.
  // Note that we currently do not support dragonfly->redis replication.
  RETURN_ON_ERR(SendCommandAndReadResponse("REPLCONF capa dragonfly"));
  PC_RETURN_ON_BAD_RESPONSE(CheckRespFirstTypes({RespExpr::STRING}));

  if (LastResponseArgs().size() == 1) {  // Redis
    PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));
  } else if (LastResponseArgs().size() >= 3) {  // it's dragonfly master.
    PC_RETURN_ON_BAD_RESPONSE(!HandleCapaDflyResp());
    if (auto ec = ConfigureDflyMaster(); ec)
      return ec;
  } else {
    PC_RETURN_ON_BAD_RESPONSE(false);
  }

  state_mask_ |= R_GREETED;
  return error_code{};
}

std::error_code Replica::HandleCapaDflyResp() {
  // Response is: <master_repl_id, syncid, num_shards [, version]>
  if (!CheckRespFirstTypes({RespExpr::STRING, RespExpr::STRING, RespExpr::INT64}) ||
      LastResponseArgs()[0].GetBuf().size() != CONFIG_RUN_ID_SIZE)
    return make_error_code(errc::bad_message);

  int64 param_num_flows = get<int64_t>(LastResponseArgs()[2].u);
  if (param_num_flows <= 0 || param_num_flows > 1024) {
    // sanity check, we support upto 1024 shards.
    // It's not that we can not support more but it's probably highly unlikely that someone
    // will run dragonfly with more than 1024 cores.
    LOG(ERROR) << "Invalid flow count " << param_num_flows;
    return make_error_code(errc::bad_message);
  }

  DCHECK(proactor_ == Proactor());

  // If we're syncing a different replication ID, drop the saved LSNs.
  string_view master_repl_id = ToSV(LastResponseArgs()[0].GetBuf());

  // If we tried to replicate from ourself return an error
  if (master_repl_id == id_) {
    LOG(WARNING) << "Can't connect to myself";
    return make_error_code(errc::connection_aborted);
  }

  if (master_context_.master_repl_id != master_repl_id) {
    if (absl::GetFlag(FLAGS_break_replication_on_master_restart) &&
        !master_context_.master_repl_id.empty()) {
      LOG(ERROR) << "Encountered different master repl id (" << master_repl_id << " vs "
                 << master_context_.master_repl_id << ")";
      state_mask_ = 0;
      return make_error_code(errc::connection_aborted);
    }
    last_journal_LSNs_.reset();
  }
  master_context_.master_repl_id = master_repl_id;
  master_context_.dfly_session_id = ToSV(LastResponseArgs()[1].GetBuf());
  master_context_.num_flows = param_num_flows;

  if (LastResponseArgs().size() >= 4) {
    PC_RETURN_ON_BAD_RESPONSE(LastResponseArgs()[3].type == RespExpr::INT64);
    master_context_.version = DflyVersion(get<int64_t>(LastResponseArgs()[3].u));
  }
  VLOG(1) << "Master id: " << master_context_.master_repl_id
          << ", sync id: " << master_context_.dfly_session_id
          << ", num journals: " << param_num_flows
          << ", version: " << unsigned(master_context_.version);

  return error_code{};
}

std::error_code Replica::ConfigureDflyMaster() {
  // We need to send this because we may require to use this for cluster commands.
  // this reason to send this here is that in other context we can get an error reply
  // since we are budy with the replication
  RETURN_ON_ERR(SendCommandAndReadResponse(StrCat("REPLCONF CLIENT-ID ", id_)));
  if (!CheckRespIsSimpleReply("OK")) {
    LOG(WARNING) << "Bad REPLCONF CLIENT-ID response";
  }

  RETURN_ON_ERR(
      SendCommandAndReadResponse(StrCat("REPLCONF CLIENT-VERSION ", DflyVersion::CURRENT_VER)));
  PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));

  return error_code{};
}

error_code Replica::InitiatePSync() {
  base::IoBuf io_buf{128};

  // Corresponds to server.repl_state == REPL_STATE_SEND_PSYNC
  string id("?");  // corresponds to null master id and null offset
  int64_t offs = -1;
  if (!master_context_.master_repl_id.empty()) {  // in case we synced before
    id = master_context_.master_repl_id;          // provide the replication offset and master id
    // TBD: for incremental sync send repl_offs_, not supported yet.
    // offs = repl_offs_;
  }

  RETURN_ON_ERR(SendCommand(StrCat("PSYNC ", id, " ", offs)));

  // Master may delay sync response with "repl_diskless_sync_delay"
  PSyncResponse repl_header;

  RETURN_ON_ERR(ParseReplicationHeader(&io_buf, &repl_header));

  string* token = absl::get_if<string>(&repl_header.fullsync);
  size_t snapshot_size = SIZE_MAX;
  if (!token) {
    snapshot_size = absl::get<size_t>(repl_header.fullsync);
  }
  TouchIoTime();

  // we get token for diskless redis replication. For disk based replication
  // we get the snapshot size.
  if (snapshot_size || token != nullptr) {
    LOG(INFO) << "Starting full sync with Redis master";

    state_mask_ |= R_SYNCING;

    io::PrefixSource ps{io_buf.InputBuffer(), Sock()};

    // Set LOADING state.
    if (!service_.RequestLoadingState()) {
      return exec_st_.ReportError(std::make_error_code(errc::state_not_recoverable),
                                  "Failed to enter LOADING state");
    }

    absl::Cleanup cleanup = [this]() { service_.RemoveLoadingState(); };

    if (slot_range_.has_value()) {
      JournalExecutor{&service_}.FlushSlots(slot_range_.value());
    } else {
      JournalExecutor{&service_}.FlushAll();
    }

    RdbLoadContext load_context;
    RdbLoader loader(NULL, &load_context);
    loader.SetLoadUnownedSlots(true);
    loader.set_source_limit(snapshot_size);
    // TODO: to allow registering callbacks within loader to send '\n' pings back to master.
    // Also to allow updating last_io_time_.
    error_code ec = loader.Load(&ps);
    RETURN_ON_ERR(ec);
    VLOG(1) << "full sync completed";

    if (token) {
      uint8_t buf[kRdbEofMarkSize];
      io::PrefixSource chained(loader.Leftover(), &ps);
      VLOG(1) << "Before reading from chained stream";
      io::Result<size_t> eof_res = chained.Read(io::MutableBytes{buf});
      CHECK(eof_res && *eof_res == kRdbEofMarkSize);

      VLOG(1) << "Comparing token " << ToSV(buf);

      // TODO: handle gracefully...
      CHECK_EQ(0, memcmp(token->data(), buf, kRdbEofMarkSize));
      CHECK(chained.UnusedPrefix().empty());
    } else {
      CHECK_EQ(0u, loader.Leftover().size());
      CHECK_EQ(snapshot_size, loader.bytes_read());
    }

    CHECK(ps.UnusedPrefix().empty());
    io_buf.ConsumeInput(io_buf.InputLen());
    TouchIoTime();
  } else {
    LOG(INFO) << "Re-established sync with Redis master with ID=" << id;
  }

  state_mask_ &= ~R_SYNCING;
  state_mask_ |= R_SYNC_OK;

  // There is a data race condition in Redis-master code, where "ACK 0" handler may be
  // triggered before Redis is ready to transition to the streaming state and it silenty ignores
  // "ACK 0". We reduce the chance it happens with this delay.
  ThisFiber::SleepFor(50ms);

  return error_code{};
}

// Initialize and start sub-replica for each flow.
error_code Replica::InitiateDflySync(std::optional<LastMasterSyncData> last_master_sync_data) {
  auto start_time = absl::Now();

  // Initialize MultiShardExecution.
  multi_shard_exe_.reset(new MultiShardExecution());

  auto load_context = std::make_shared<RdbLoadContext>();

  // Initialize shard flows.
  shard_flows_.resize(master_context_.num_flows);
  DCHECK(!shard_flows_.empty());
  for (unsigned i = 0; i < shard_flows_.size(); ++i) {
    // Transfer LSN state for partial sync
    uint64_t partial_sync_lsn = 0;
    if (shard_flows_[i]) {
      partial_sync_lsn = shard_flows_[i]->JournalExecutedCount();
    }
    shard_flows_[i].reset(new DflyShardReplica(server(), master_context_, i, &service_,
                                               multi_shard_exe_, load_context.get()));
    if (partial_sync_lsn > 0) {
      shard_flows_[i]->SetRecordsExecuted(partial_sync_lsn);
    }
  }
  thread_flow_map_ = Partition(shard_flows_.size());

  // Blocked on until all flows got full sync cut.
  BlockingCounter sync_block{unsigned(shard_flows_.size())};

  // Switch to new error handler that closes flow sockets.
  auto err_handler = [this, sync_block](const auto& ge) mutable {
    // Unblock this function.
    sync_block->Cancel();

    // Make sure the flows are not in a state transition
    lock_guard lk{flows_op_mu_};

    // Unblock all sockets.
    DefaultErrorHandler(ge);
    for (auto& flow : shard_flows_)
      flow->Cancel();
  };

  RETURN_ON_ERR(exec_st_.SwitchErrorHandler(std::move(err_handler)));

  // Start full sync flows.
  state_mask_ |= R_SYNCING;

  std::string_view sync_type;
  absl::Cleanup cleanup = [this, &sync_type]() {
    // We do the following operations regardless of outcome.
    JoinDflyFlows();
    if (sync_type == "full") {
      service_.RemoveLoadingState();
    }
    state_mask_ &= ~R_SYNCING;
    last_journal_LSNs_.reset();
  };

  {
    unsigned num_df_flows = shard_flows_.size();
    if (last_master_sync_data && num_df_flows != last_master_sync_data->last_journal_LSNs.size()) {
      LOG(WARNING) << "last master has different flow size: "
                   << last_master_sync_data->last_journal_LSNs.size()
                   << " than current: " << num_df_flows;
      last_master_sync_data = std::nullopt;
    }

    // Going out of the way to avoid using std::vector<bool>...
    auto is_full_sync = std::make_unique<bool[]>(num_df_flows);
    // The elements of this bool array are not always initialized but we call std::accumulate below
    // unconditionally. For some cases this will accumulate whatever junk that uninitialized memory
    // cell contain. Do not remove the memset below.
    std::memset(is_full_sync.get(), 0, num_df_flows);
    DCHECK(!last_journal_LSNs_ || last_journal_LSNs_->size() == num_df_flows);
    auto shard_cb = [&](unsigned index, auto*) {
      for (auto id : thread_flow_map_[index]) {
        auto ec = shard_flows_[id]->StartSyncFlow(sync_block, &exec_st_,
                                                  last_journal_LSNs_.has_value()
                                                      ? std::optional((*last_journal_LSNs_)[id])
                                                      : std::nullopt,
                                                  last_master_sync_data);
        if (ec.has_value())
          is_full_sync[id] = ec.value();
        else
          exec_st_.ReportError(ec.error());
      }
    };

    if (last_journal_LSNs_) {
      ++psync_attempts_;
    }

    // Lock to prevent the error handler from running instantly
    // while the flows are in a mixed state.
    lock_guard lk{flows_op_mu_};

    shard_set->pool()->AwaitFiberOnAll(std::move(shard_cb));
    if (last_journal_LSNs_) {
      ++psync_attempts_;
    }

    last_journal_LSNs_.reset();
    size_t num_full_flows =
        std::accumulate(is_full_sync.get(), is_full_sync.get() + num_df_flows, 0);

    if (num_full_flows == num_df_flows) {
      // Make sure we're in LOADING state.
      if (!service_.RequestLoadingState()) {
        return exec_st_.ReportError(std::make_error_code(errc::state_not_recoverable),
                                    "Failed to enter LOADING state");
      }
      sync_type = "full";

      DVLOG(1) << "Calling Flush on all slots " << this;

      passed_full_sync_ = false;
      if (slot_range_.has_value()) {
        JournalExecutor{&service_}.FlushSlots(slot_range_.value());
      } else {
        JournalExecutor{&service_}.FlushAll();
      }
      DVLOG(1) << "Flush on all slots ended " << this;
    } else if (num_full_flows == 0) {
      sync_type = "partial";
    } else {
      exec_st_.ReportError(std::make_error_code(errc::state_not_recoverable),
                           "Won't do a partial sync: some flows must fully resync");
    }
  }

  RETURN_ON_ERR(exec_st_.GetError());

  LOG(INFO) << "Started " << sync_type << " sync with " << server().Description();

  // We skip full sync if we can do partial
  if (sync_type != "partial") {
    // Send DFLY SYNC.
    if (auto ec = SendNextPhaseRequest("SYNC"); ec) {
      return exec_st_.ReportError(ec);
    }

    // Wait for all flows to receive full sync cut.
    // In case of an error, this is unblocked by the error handler.
    VLOG(1) << "Waiting for all full sync cut confirmations";
    sync_block->Wait();

    // Check if we woke up due to cancellation.
    if (!exec_st_.IsRunning()) {
      load_context->PerformPostLoad(&service_, true);
      return exec_st_.GetError();
    }

    load_context->PerformPostLoad(&service_);
  }

  passed_full_sync_ = true;

  // Send DFLY STARTSTABLE.
  if (auto ec = SendNextPhaseRequest("STARTSTABLE"); ec) {
    return exec_st_.ReportError(ec);
  }

  if (sync_type == "partial") {
    ++psync_successes_;
  }

  // Joining flows and resetting state is done by cleanup.
  double seconds = double(absl::ToInt64Milliseconds(absl::Now() - start_time)) / 1000;
  LOG(INFO) << sync_type << " sync finished in " << strings::HumanReadableElapsedTime(seconds);

  return exec_st_.GetError();
}

error_code Replica::ConsumeRedisStream() {
  base::IoBuf io_buf(16_KB);
  ConnectionContext conn_context{nullptr, {}};
  conn_context.is_replicating = true;
  conn_context.journal_emulated = true;
  conn_context.skip_acl_validation = true;
  conn_context.ns = &namespaces->GetDefaultNamespace();

  // we never reply back on the commands.
  facade::CapturingReplyBuilder null_builder{facade::ReplyMode::NONE};
  ResetParser(RedisParser::Mode::SERVER);

  // Master waits for this command in order to start sending replication stream.
  RETURN_ON_ERR(SendCommand("REPLCONF ACK 0"));

  VLOG(1) << "Before reading repl-log";

  // Redis sends either pings every "repl_ping_slave_period" time inside replicationCron().
  // or, alternatively, write commands stream coming from propagate() function.
  // Replica connection must send "REPLCONF ACK xxx" in order to make sure that master replication
  // buffer gets disposed of already processed commands, this is done in a separate fiber.
  error_code ec;
  LOG(INFO) << "Transitioned into stable sync";

  // Set new error handler.
  auto err_handler = [this](const auto& ge) {
    // Trigger ack-fiber
    replica_waker_.notifyAll();
    DefaultErrorHandler(ge);
  };
  RETURN_ON_ERR(exec_st_.SwitchErrorHandler(std::move(err_handler)));

  acks_fb_ = fb2::Fiber("redis_acks", &Replica::RedisStreamAcksFb, this);

  CommandContext cmnd_ctx;
  cmnd_ctx.Init(&null_builder, &conn_context);
  while (true) {
    // Yield if the fiber has been running for long.
    if (base::CycleClock::ToUsec(ThisFiber::GetRunningTimeCycles()) > 1000) {  // 1ms
      ThisFiber::Yield();
    }

    // If the acks-fb or something else triggered a shutdown, then do not attempt to read from the
    // stream.
    if (!exec_st_.IsRunning()) {
      DCHECK(exec_st_.IsError());
      LOG_REPL_ERROR("Stopping stream consumer in phase "
                     << GetCurrentPhase()
                     << " because of external error: " << exec_st_.GetError().Format());
      acks_fb_.JoinIfNeeded();
      return exec_st_.GetError();
    }

    auto response = ReadRespReply(&io_buf, /*copy_msg=*/false);
    if (!response.has_value()) {
      LOG_REPL_ERROR("Error in Redis Stream at phase "
                     << GetCurrentPhase() << " with " << server().Description()
                     << ", error: " << response.error()
                     << ", socket state: " + GetSocketInfo(Sock()->native_handle()));
      exec_st_.ReportError(response.error());
      acks_fb_.JoinIfNeeded();
      return response.error();
    }

    const auto& last_args = LastResponseArgs();
    if (!last_args.empty()) {
      string cmd = absl::CHexEscape(last_args[0].GetView());

      // Valkey and Redis may send MULTI and EXEC as part of their replication commands.
      // Dragonfly disallows some commands, such as SELECT, inside of MULTI/EXEC, so here we simply
      // ignore MULTI/EXEC and execute their inner commands individually.
      if (!absl::EqualsIgnoreCase(cmd, "MULTI") && !absl::EqualsIgnoreCase(cmd, "EXEC")) {
        VLOG(2) << "Got command " << cmd << "\n consumed: " << response->total_read;

        if (LastResponseArgs()[0].GetBuf()[0] == '\r') {
          for (const auto& arg : LastResponseArgs()) {
            LOG(INFO) << absl::CHexEscape(ToSV(arg.GetBuf()));
          }
        }

        FillBackedArgs(last_args, &cmnd_ctx);
        service_.DispatchCommand(facade::ParsedArgs{cmnd_ctx}, &cmnd_ctx,
                                 facade::AsyncPreference::ONLY_SYNC);
      }
    }

    io_buf.ConsumeInput(response->left_in_buffer);
    repl_offs_ += response->total_read;
    replica_waker_.notify();  // Notify to trigger ACKs.
  }
}

error_code Replica::ConsumeDflyStream() {
  // Set new error handler that closes flow sockets.
  auto err_handler = [this](const auto& ge) {
    // Make sure the flows are not in a state transition
    lock_guard lk{flows_op_mu_};

    LOG_REPL_ERROR("Replication error in phase "
                   << GetCurrentPhase() << " with " << server().Description() << ", error: "
                   << ge.Format() << ", socket state: " + GetSocketInfo(Sock()->native_handle()));

    DefaultErrorHandler(ge);
    for (auto& flow : shard_flows_) {
      flow->Cancel();
    }
    multi_shard_exe_->CancelAllBlockingEntities();
  };
  RETURN_ON_ERR(exec_st_.SwitchErrorHandler(std::move(err_handler)));

  LOG(INFO) << "Transitioned into stable sync";
  // Transition flows into stable sync.
  {
    auto shard_cb = [&](unsigned index, auto*) {
      const auto& local_ids = thread_flow_map_[index];

      for (unsigned id : local_ids) {
        auto ec = shard_flows_[id]->StartStableSyncFlow(&exec_st_);
        if (ec)
          exec_st_.ReportError(ec);
      }
    };

    // Lock to prevent error handler from running on mixed state.
    lock_guard lk{flows_op_mu_};
    shard_set->pool()->AwaitFiberOnAll(std::move(shard_cb));
  }

  JoinDflyFlows();

  last_journal_LSNs_.emplace();
  for (auto& flow : shard_flows_) {
    last_journal_LSNs_->push_back(flow->JournalExecutedCount());
  }

  LOG(INFO) << "Exit stable sync";
  // The only option to unblock is to cancel the context.
  CHECK(exec_st_.GetError());

  return exec_st_.GetError();
}

void Replica::JoinDflyFlows() {
  for (auto& flow : shard_flows_) {
    flow->JoinFlow();
  }
}

void Replica::SetShardStates(bool replica) {
  shard_set->RunBriefInParallel([replica](EngineShard* shard) { shard->SetReplica(replica); });
}

error_code Replica::SendNextPhaseRequest(string_view kind) {
  // Ask master to start sending replication stream
  string request = StrCat("DFLY ", kind, " ", master_context_.dfly_session_id);

  VLOG(1) << "Sending: " << request;
  RETURN_ON_ERR(SendCommandAndReadResponse(request));

  PC_RETURN_ON_BAD_RESPONSE(CheckRespIsSimpleReply("OK"));

  return std::error_code{};
}

io::Result<bool> DflyShardReplica::StartSyncFlow(
    BlockingCounter sb, ExecutionState* cntx, std::optional<LSN> lsn,
    std::optional<Replica::LastMasterSyncData> last_master_data) {
  using nonstd::make_unexpected;
  DCHECK(!master_context_.master_repl_id.empty() && !master_context_.dfly_session_id.empty());
  proactor_index_ = ProactorBase::me()->GetPoolIndex();

  RETURN_ON_ERR_T(make_unexpected,
                  ConnectAndAuth(absl::GetFlag(FLAGS_master_connect_timeout_ms) * 1ms, &exec_st_));

  VLOG(1) << "Sending on flow " << master_context_.master_repl_id << " "
          << master_context_.dfly_session_id << " " << flow_id_ << " lsn: " << lsn.value_or(-1);

  // DFLY FLOW <master_id> <session_id> <flow_id> [lsn] [last_master_id lsn-vec]
  std::string cmd = StrCat("DFLY FLOW ", master_context_.master_repl_id, " ",
                           master_context_.dfly_session_id, " ", flow_id_);
  // Try to negotiate a partial sync if possible.
  if (lsn.has_value() && master_context_.version > DflyVersion::VER1 &&
      absl::GetFlag(FLAGS_replica_partial_sync)) {
    absl::StrAppend(&cmd, " ", *lsn);
  }
  if (last_master_data && master_context_.version >= DflyVersion::VER5 &&
      absl::GetFlag(FLAGS_replica_partial_sync)) {
    string lsn_str = absl::StrJoin(last_master_data.value().last_journal_LSNs, "-");
    absl::StrAppend(&cmd, " ", last_master_data.value().id, " ", lsn_str);
    VLOG(1) << "Sending last master sync flow " << last_master_data.value().id << " " << lsn_str;
  }

  ResetParser(RedisParser::Mode::CLIENT);
  leftover_buf_.emplace(128);
  RETURN_ON_ERR_T(make_unexpected, SendCommand(cmd));
  auto read_resp = ReadRespReply(&*leftover_buf_);
  if (!read_resp.has_value()) {
    return make_unexpected(read_resp.error());
  }

  PC_RETURN_ON_BAD_RESPONSE_T(make_unexpected,
                              CheckRespFirstTypes({RespExpr::STRING, RespExpr::STRING}));

  string_view flow_directive = ToSV(LastResponseArgs()[0].GetBuf());

  string eof_token;
  PC_RETURN_ON_BAD_RESPONSE_T(make_unexpected,
                              flow_directive == "FULL" || flow_directive == "PARTIAL");
  bool is_full_sync = flow_directive == "FULL";

  eof_token = ToSV(LastResponseArgs()[1].GetBuf());

  leftover_buf_->ConsumeInput(read_resp->left_in_buffer);

  // Skip full sync if we are doing partial. Clean up will take care mixed state, e.g,
  // some flows receive partial while others receive full.
  if (is_full_sync) {
    // We can not discard io_buf because it may contain data
    // besides the response we parsed. Therefore we pass it further to ReplicateDFFb.
    sync_fb_ = fb2::Fiber("shard_full_sync", &DflyShardReplica::FullSyncDflyFb, this,
                          std::move(eof_token), sb, cntx);
  } else if (last_master_data) {
    // Only needed when we are rotating masters.
    SetRecordsExecuted(last_master_data->last_journal_LSNs[flow_id_]);
  }

  return is_full_sync;
}

error_code DflyShardReplica::StartStableSyncFlow(ExecutionState* cntx) {
  DCHECK(!master_context_.master_repl_id.empty() && !master_context_.dfly_session_id.empty());
  ProactorBase* mythread = ProactorBase::me();
  CHECK(mythread);

  if (!Sock()->IsOpen()) {
    return std::make_error_code(errc::io_error);
  }
  rdb_loader_.reset();  // we do not need it anymore.
  sync_fb_ =
      fb2::Fiber("shard_stable_sync_read", &DflyShardReplica::StableSyncDflyReadFb, this, cntx);

  return std::error_code{};
}

void DflyShardReplica::FullSyncDflyFb(std::string eof_token, BlockingCounter bc,
                                      ExecutionState* cntx) {
  DCHECK(leftover_buf_);
  io::PrefixSource ps{leftover_buf_->InputBuffer(), Sock()};

  rdb_loader_->SetFullSyncCutCb([bc, ran = false]() mutable {
    if (!ran) {
      bc->Dec();
      ran = true;
    }
  });

  // In the no point-in-time replication flow, it's possible to serialize a journal change
  // before serializing the bucket that the key was updated in on the master side. As a result,
  // when loading the serialized bucket data on the replica, it may overwrite the earlier entry
  // added by the journal change. This is an expected and valid scenario, so to avoid unnecessary
  // warnings, we enable SetOverrideExistingKeys(true).
  rdb_loader_->SetOverrideExistingKeys(true);

  // Load incoming rdb stream.
  if (std::error_code ec = rdb_loader_->Load(&ps); ec) {
    cntx->ReportError(ec, "Error loading rdb format");
    return;
  }

  // Try finding eof token.
  io::PrefixSource chained_tail{rdb_loader_->Leftover(), &ps};
  if (!eof_token.empty()) {
    unique_ptr<uint8_t[]> buf{new uint8_t[eof_token.size()]};

    io::Result<size_t> res =
        chained_tail.ReadAtLeast(io::MutableBytes{buf.get(), eof_token.size()}, eof_token.size());

    if (!res || *res != eof_token.size()) {
      cntx->ReportError(std::make_error_code(errc::protocol_error),
                        "Error finding eof token in stream");
      return;
    }
  }

  // Keep loader leftover.
  io::Bytes unused = chained_tail.UnusedPrefix();
  if (!unused.empty()) {
    leftover_buf_.emplace(unused.size());
    leftover_buf_->WriteAndCommit(unused.data(), unused.size());
  } else {
    leftover_buf_.reset();
  }

  if (auto jo = rdb_loader_->journal_offset(); jo.has_value()) {
    this->journal_rec_executed_.store(*jo);
  } else {
    cntx->ReportError(std::make_error_code(errc::protocol_error),
                      "Error finding journal offset in stream");
  }
  VLOG(1) << "FullSyncDflyFb finished after reading " << rdb_loader_->bytes_read() << " bytes";
}

void DflyShardReplica::StableSyncDflyReadFb(ExecutionState* cntx) {
  DCHECK_EQ(proactor_index_, ProactorBase::me()->GetPoolIndex());

  // Check leftover from full sync.
  io::Bytes prefix{};
  if (leftover_buf_ && leftover_buf_->InputLen() > 0) {
    prefix = leftover_buf_->InputBuffer();
  }

  io::PrefixSource ps{prefix, Sock()};

  JournalReader reader{&ps, 0};
  DCHECK_GE(journal_rec_executed_, 1u);
  TransactionReader tx_reader{journal_rec_executed_.load(std::memory_order_relaxed) - 1};

  acks_fb_ = fb2::Fiber("shard_acks", &DflyShardReplica::StableSyncDflyAcksFb, this, cntx);
  TransactionData tx_data;
  while (tx_reader.NextTxData(&reader, cntx, &tx_data)) {
    DVLOG(3) << "Lsn: " << tx_data.lsn;

    last_io_time_ = Proactor()->GetMonotonicTimeNs();
    if (tx_data.opcode == journal::Op::LSN) {
      //  Do nothing
    } else if (tx_data.opcode == journal::Op::PING) {
      force_ping_ = true;
      journal_rec_executed_.fetch_add(1, std::memory_order_relaxed);
      if (EngineShard::tlocal() && EngineShard::tlocal()->journal()) {
        // We must register this entry to the journal to allow partial sync
        // if journal is active.
        journal::RecordEntry(0, journal::Op::PING, 0, nullopt, {});
      }
    } else {
      const bool is_successful = ExecuteTx(std::move(tx_data), cntx);
      if (is_successful) {
        // We only increment upon successful execution of the transaction.
        // The reason for this is that during partial sync we sent this
        // number as the lsn number to resume from. However, if for example
        // we increment this when a command fails (because the context
        // got cancelled, e.g, replication connection broke), we will get
        // inconsistent data because the replica will resume from the next
        // lsn of the master and this lsn entry will be lost.
        journal_rec_executed_.fetch_add(1, std::memory_order_relaxed);
      } else {
        // We only report DFATAL:
        // 1. Context is running
        // 2. We are ACTIVE global state
        if (cntx->IsRunning() && ((*ServerState::tlocal()).gstate() == GlobalState::ACTIVE)) {
          LOG(DFATAL) << "ExecuteTx() on replica should be successful.";
        }
      }
    }

    shard_replica_waker_.notifyAll();
  }
}

void Replica::RedisStreamAcksFb() {
  constexpr size_t kAckRecordMaxInterval = 1024;
  std::chrono::duration ack_time_max_interval =
      1ms * absl::GetFlag(FLAGS_replication_acks_interval);
  std::string ack_cmd;
  auto next_ack_tp = std::chrono::steady_clock::now();

  while (exec_st_.IsRunning()) {
    VLOG(2) << "Sending an ACK with offset=" << repl_offs_;
    ack_cmd = absl::StrCat("REPLCONF ACK ", repl_offs_);
    next_ack_tp = std::chrono::steady_clock::now() + ack_time_max_interval;
    if (auto ec = SendCommand(ack_cmd); ec) {
      exec_st_.ReportError(ec);
      break;
    }
    ack_offs_ = repl_offs_;

    replica_waker_.await_until(
        [&]() { return repl_offs_ > ack_offs_ + kAckRecordMaxInterval || (!exec_st_.IsRunning()); },
        next_ack_tp);
  }
}

void DflyShardReplica::StableSyncDflyAcksFb(ExecutionState* cntx) {
  DCHECK_EQ(proactor_index_, ProactorBase::me()->GetPoolIndex());

  constexpr size_t kAckRecordMaxInterval = 1024;
  std::chrono::duration ack_time_max_interval =
      1ms * absl::GetFlag(FLAGS_replication_acks_interval);
  std::string ack_cmd;
  auto next_ack_tp = std::chrono::steady_clock::now();

  uint64_t current_offset;
  while (cntx->IsRunning()) {
    // Handle ACKs with the master. PING opcodes from the master mean we should immediately
    // answer.
    current_offset = journal_rec_executed_.load(std::memory_order_relaxed);
    VLOG(1) << "Sending an ACK with offset=" << current_offset << " forced=" << force_ping_;
    ack_cmd = absl::StrCat("REPLCONF ACK ", current_offset);
    force_ping_ = false;
    next_ack_tp = std::chrono::steady_clock::now() + ack_time_max_interval;
    if (auto ec = SendCommand(ack_cmd); ec) {
      cntx->ReportError(ec);
      break;
    }
    ack_offs_ = current_offset;

    shard_replica_waker_.await_until(
        [&]() {
          return journal_rec_executed_.load(std::memory_order_relaxed) >
                     ack_offs_ + kAckRecordMaxInterval ||
                 force_ping_ || (!cntx->IsRunning());
        },
        next_ack_tp);
  }
}

DflyShardReplica::DflyShardReplica(ServerContext server_context, MasterContext master_context,
                                   uint32_t flow_id, Service* service,
                                   std::shared_ptr<MultiShardExecution> multi_shard_exe,
                                   RdbLoadContext* load_context)
    : ProtocolClient(server_context),
      service_(*service),
      master_context_(master_context),
      multi_shard_exe_(multi_shard_exe),
      flow_id_(flow_id) {
  executor_ = std::make_unique<JournalExecutor>(service);
  rdb_loader_ = std::make_unique<RdbLoader>(&service_, load_context);
  rdb_loader_->SetLoadUnownedSlots(true);
  rdb_loader_->SetShardCount(master_context.num_flows);
}

DflyShardReplica::~DflyShardReplica() {
  CloseSocket();
  JoinFlow();
}

bool DflyShardReplica::ExecuteTx(TransactionData&& tx_data, ExecutionState* cntx) {
  if (!cntx->IsRunning()) {
    return false;
  }

  if (!tx_data.IsGlobalCmd()) {
    VLOG(3) << "Execute cmd without sync between shards. txid: " << tx_data.txid;
    return executor_->Execute(tx_data.dbid, tx_data.command) == facade::DispatchResult::OK;
  }

  bool inserted_by_me =
      multi_shard_exe_->InsertTxToSharedMap(tx_data.txid, master_context_.num_flows);

  auto& multi_shard_data = multi_shard_exe_->Find(tx_data.txid);

  VLOG(2) << "Execute txid: " << tx_data.txid << " waiting for data in all shards";
  // Wait until shards flows got transaction data and inserted to map.
  // This step enforces that replica will execute multi shard commands that finished on master
  // and replica recieved all the commands from all shards.
  multi_shard_data.block->Wait();
  // Check if we woke up due to cancellation.
  if (!exec_st_.IsRunning())
    return false;
  VLOG(2) << "Execute txid: " << tx_data.txid << " block wait finished";

  VLOG(2) << "Execute txid: " << tx_data.txid << " global command execution";
  // Wait until all shards flows get to execution step of this transaction.
  multi_shard_data.barrier.Wait();
  // Check if we woke up due to cancellation.
  if (!exec_st_.IsRunning())
    return false;
  // Global command will be executed only from one flow fiber. This ensure corectness of data in
  // replica.
  bool execution_res = true;
  if (inserted_by_me) {
    execution_res = executor_->Execute(tx_data.dbid, tx_data.command) == facade::DispatchResult::OK;
  }
  // Wait until exection is done, to make sure we done execute next commands while the global is
  // executed.
  multi_shard_data.barrier.Wait();
  // Check if we woke up due to cancellation.
  if (!exec_st_.IsRunning())
    return false;

  // Erase from map can be done only after all flow fibers executed the transaction commands.
  // The last fiber which will decrease the counter to 0 will be the one to erase the data from
  // map
  auto val = multi_shard_data.counter.fetch_sub(1, std::memory_order_relaxed);
  VLOG(2) << "txid: " << tx_data.txid << " counter: " << val;
  if (val == 1) {
    multi_shard_exe_->Erase(tx_data.txid);
  }
  return execution_res;
}

error_code Replica::ParseReplicationHeader(base::IoBuf* io_buf, PSyncResponse* dest) {
  std::string_view str;

  RETURN_ON_ERR(ReadLine(io_buf, &str));

  DCHECK(!str.empty());

  std::string_view header;
  bool valid = false;

  auto bad_header = [str]() {
    LOG(ERROR) << "Bad replication header: " << str;
    return std::make_error_code(std::errc::illegal_byte_sequence);
  };

  // non-empty lines
  if (str[0] != '+') {
    return bad_header();
  }

  header = str.substr(1);
  VLOG(1) << "header: " << header;
  if (absl::ConsumePrefix(&header, "FULLRESYNC ")) {
    // +FULLRESYNC db7bd45bf68ae9b1acac33acb 123\r\n
    //             master_id  repl_offset
    size_t pos = header.find(' ');
    if (pos != std::string_view::npos) {
      if (absl::SimpleAtoi(header.substr(pos + 1), &repl_offs_)) {
        master_context_.master_repl_id = string(header.substr(0, pos));
        valid = true;
        VLOG(1) << "master repl_id " << master_context_.master_repl_id << " / " << repl_offs_;
      }
    }

    if (!valid)
      return bad_header();

    io_buf->ConsumeInput(str.size() + 2);
    RETURN_ON_ERR(ReadLine(io_buf, &str));  // Read the next line parsed below.

    // Readline checks for non ws character first before searching for eol
    // so str must be non empty.
    DCHECK(!str.empty());

    if (str[0] != '$') {
      return bad_header();
    }

    std::string_view token = str.substr(1);
    VLOG(1) << "token: " << token;
    if (absl::ConsumePrefix(&token, "EOF:")) {
      CHECK_EQ(kRdbEofMarkSize, token.size()) << token;
      dest->fullsync.emplace<string>(token);
      VLOG(1) << "Token: " << token;
    } else {
      size_t rdb_size = 0;
      if (!absl::SimpleAtoi(token, &rdb_size))
        return std::make_error_code(std::errc::illegal_byte_sequence);

      VLOG(1) << "rdb size " << rdb_size;
      dest->fullsync.emplace<size_t>(rdb_size);
    }
    io_buf->ConsumeInput(str.size() + 2);
  } else if (absl::ConsumePrefix(&header, "CONTINUE")) {
    // we send psync2 so we should get master replid.
    // That could change due to redis failovers.
    // TODO: part sync
    dest->fullsync.emplace<size_t>(0);
    LOG(ERROR) << "Partial replication not supported yet";
    return std::make_error_code(std::errc::not_supported);
  } else {
    LOG(ERROR) << "Unknown replication header";
    return bad_header();
  }

  return error_code{};
}

auto Replica::GetSummary() const -> Summary {
  auto f = [this]() {
    auto last_io_time = LastIoTime();

    for (const auto& flow : shard_flows_) {
      last_io_time = std::max(last_io_time, flow->LastIoTime());
    }

    Summary res;
    res.host = server().host;
    res.port = server().port;
    res.master_link_established = (state_mask_ & R_TCP_CONNECTED);
    res.full_sync_in_progress = (state_mask_ & R_SYNCING);
    res.full_sync_done = (state_mask_ & R_SYNC_OK);

    uint64_t current_time = ProactorBase::GetMonotonicTimeNs();
    // last_io_time is derived above by reading last_io_time_ from all the flows,
    // by accessing them from a foreign thread, see the loop above. As a result some
    // threads may have last_io_time_ bigger than our current time, so we fix it here.
    if (last_io_time > current_time) {
      res.master_last_io_sec = 0;
    } else {
      res.master_last_io_sec = (current_time - last_io_time) / 1000000000UL;
    }

    res.master_id = master_context_.master_repl_id;
    res.reconnect_count = reconnect_count_;
    res.repl_offset_sum = 0;
    for (uint64_t offs : GetReplicaOffset()) {
      res.repl_offset_sum += offs;
    }
    res.psync_successes = psync_successes_;
    res.psync_attempts = psync_attempts_;
    res.passed_full_sync = passed_full_sync_;
    return res;
  };

  return proactor_->AwaitBrief(f);
}

std::vector<uint64_t> Replica::GetReplicaOffset() const {
  std::vector<uint64_t> flow_rec_count;
  flow_rec_count.resize(shard_flows_.size());
  for (const auto& flow : shard_flows_) {
    uint32_t flow_id = flow->FlowId();
    uint64_t rec_count = flow->JournalExecutedCount();
    DCHECK_LT(flow_id, shard_flows_.size());
    flow_rec_count[flow_id] = rec_count;
  }
  return flow_rec_count;
}

std::string Replica::GetSyncId() const {
  return master_context_.dfly_session_id;
}

std::string Replica::GetCurrentPhase() const {
  if (!(state_mask_ & R_ENABLED))
    return "DISABLED";
  if (!(state_mask_ & R_TCP_CONNECTED))
    return "TCP_CONNECTING";
  if (!(state_mask_ & R_GREETED))
    return "GREETING";
  if (!(state_mask_ & R_SYNC_OK))
    return "INITIAL_SYNC";
  if (state_mask_ & R_SYNCING)
    return "FULL_SYNC_IN_PROGRESS";

  return "STABLE_SYNC";
}

std::vector<unsigned> Replica::GetFlowMapAtIndex(size_t index) const {
  // Not all proactors have flows
  if (index >= thread_flow_map_.size()) {
    return {};
  }
  return thread_flow_map_[index];
}

size_t Replica::GetRecCountExecutedPerShard(const std::vector<unsigned>& indexes) const {
  size_t total_shard_lsn = 0;
  for (auto index : indexes) {
    total_shard_lsn += shard_flows_[index]->JournalExecutedCount();
  }
  // Journal always starts at pos 1
  return std::max<size_t>(1UL, total_shard_lsn);
}

uint32_t DflyShardReplica::FlowId() const {
  return flow_id_;
}

void DflyShardReplica::Pause(bool pause) {
  if (rdb_loader_) {
    rdb_loader_->Pause(pause);
  }
}

void DflyShardReplica::JoinFlow() {
  sync_fb_.JoinIfNeeded();
  acks_fb_.JoinIfNeeded();
}

void DflyShardReplica::Cancel() {
  if (rdb_loader_)
    rdb_loader_->stop();
  ShutdownSocket();
  shard_replica_waker_.notifyAll();
}

}  // namespace dfly


================================================
FILE: src/server/replica.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/inlined_vector.h>

#include <atomic>
#include <boost/fiber/barrier.hpp>
#include <queue>
#include <variant>

#include "facade/facade_types.h"
#include "facade/redis_parser.h"
#include "io/io_buf.h"
#include "server/cluster/cluster_defs.h"
#include "server/execution_state.h"
#include "server/journal/tx_executor.h"
#include "server/journal/types.h"
#include "server/protocol_client.h"
#include "server/replica_types.h"
#include "server/version.h"
#include "util/fiber_socket_base.h"

namespace dfly {

class Service;
class ConnectionContext;
class JournalExecutor;
struct JournalReader;
class DflyShardReplica;

// The attributes of the master we are connecting to.
struct MasterContext {
  std::string master_repl_id;
  std::string dfly_session_id;  // Sync session id for dfly sync.
  unsigned num_flows = 0;
  DflyVersion version = DflyVersion::VER1;
};

// This class manages replication from both Dragonfly and Redis masters.
class Replica : ProtocolClient {
 private:
  // The flow is : R_ENABLED -> R_TCP_CONNECTED -> (R_SYNCING) -> R_SYNC_OK.
  // SYNCING means that the initial ack succeeded. It may be optional if we can still load from
  // the journal offset.
  enum State : unsigned {
    R_ENABLED = 1,  // Replication mode is enabled. Serves for signaling shutdown.
    R_TCP_CONNECTED = 2,
    R_GREETED = 4,     // Initial handshake with the master is done.
    R_SYNCING = 8,     // In process of full sync with the master.
    R_SYNC_OK = 0x10,  // Signals successful ending of full-sync state, exclusive with R_SYNCING.
  };

 public:
  Replica(std::string master_host, uint16_t port, Service* se, std::string_view id,
          std::optional<cluster::SlotRange> slot_range);
  ~Replica();

  // Spawns a fiber that runs until link with master is broken or the replication is stopped.
  // Returns true if initial link with master has been established or
  // false if it has failed.
  GenericError Start();
  using LastMasterSyncData = dfly::LastMasterSyncData;
  void StartMainReplicationFiber(std::optional<LastMasterSyncData> data);

  // Sets the server state to have replication enabled.
  // It is like Start(), but does not attempt to establish
  // a connection right-away, but instead lets MainReplicationFb do the work.
  void EnableReplication();

  std::optional<LastMasterSyncData> Stop();  // thread-safe

  void Pause(bool pause);

  std::error_code TakeOver(unsigned timeout, bool save_flag);

  bool IsContextCancelled() const {
    return !exec_st_.IsRunning();
  }

 private: /* Main standalone mode functions */
  // Coordinate state transitions. Spawned by start.
  void MainReplicationFb(std::optional<LastMasterSyncData> data);

  std::error_code Greet();  // Send PING and REPLCONF.

  std::error_code HandleCapaDflyResp();
  std::error_code ConfigureDflyMaster();

  std::error_code InitiatePSync();                                           // Redis full sync.
  std::error_code InitiateDflySync(std::optional<LastMasterSyncData> data);  // Dragonfly full sync.

  std::error_code ConsumeRedisStream();  // Redis stable state.
  std::error_code ConsumeDflyStream();   // Dragonfly stable state.

  void RedisStreamAcksFb();

  // Joins all the flows when doing sharded replication. This is called in two
  // places: Once at the end of full sync to join the full sync fibers, and twice
  // if a stable sync is interrupted to join the cancelled stable sync fibers.
  void JoinDflyFlows();
  void SetShardStates(bool replica);  // Call SetReplica(replica) on all shards.

  // Send DFLY ${kind} to the master instance.
  std::error_code SendNextPhaseRequest(std::string_view kind);

 private: /* Utility */
  struct PSyncResponse {
    // string - end of sync token (diskless)
    // size_t - size of the full sync blob (disk-based).
    // if fullsync is 0, it means that master can continue with partial replication.
    std::variant<std::string, size_t> fullsync;
  };

  std::error_code ParseReplicationHeader(base::IoBuf* io_buf, PSyncResponse* dest);

 public: /* Utility */
  using Summary = ReplicaSummary;

  Summary GetSummary() const;  // thread-safe, blocks fiber, makes a hop.

  bool HasDflyMaster() const {
    return !master_context_.dfly_session_id.empty();
  }

  std::vector<uint64_t> GetReplicaOffset() const;
  std::string GetSyncId() const;

  // Get the current replication phase based on state_mask_
  std::string GetCurrentPhase() const;

  // Used *only* in TakeOver flow and replicaof no one. There is small data race if
  // thread_flow_map_ gets written by the MainReplicationFiber thread but
  // the chances for that are extremely rare.
  std::vector<unsigned> GetFlowMapAtIndex(size_t index) const;

  size_t GetRecCountExecutedPerShard(const std::vector<unsigned>& indexes) const;

 private:
  util::fb2::ProactorBase* proactor_ = nullptr;
  Service& service_;
  MasterContext master_context_;

  // In redis replication mode.
  util::fb2::Fiber sync_fb_;
  util::fb2::Fiber acks_fb_;
  util::fb2::EventCount replica_waker_;

  std::vector<std::unique_ptr<DflyShardReplica>> shard_flows_;
  std::vector<std::vector<unsigned>> thread_flow_map_;  // a map from proactor id to flow list.

  // A vector of the last executer LSNs when a replication is interrupted.
  // Allows partial sync on reconnects.
  std::optional<std::vector<LSN>> last_journal_LSNs_;
  std::shared_ptr<MultiShardExecution> multi_shard_exe_;

  // Guard operations where flows might be in a mixed state (transition/setup)
  util::fb2::Mutex flows_op_mu_;

  // repl_offs - till what offset we've already read from the master.
  // ack_offs_ last acknowledged offset.
  size_t repl_offs_ = 0, ack_offs_ = 0;
  unsigned state_mask_ = 0;  // see State enum above.

  // When replica starts full sync it is set to false and true when it completes the full sync.
  // Disconnects do not reset this, so this variable is still true if the master
  // is not connected and the state_mask_ is cleared.
  // Furthermore, on reconnects that enter full sync
  // again this variable is set to false until full sync completes.
  // Therefore, we have a consistent view of the replica:
  // 1. True. Replica passed full sync even if master disconnects. In fact, once a
  // node reached stable, the deltas from journal are the only missing items.
  // 2. False. Replica has not passed full sync or a disconnect started full sync again.
  bool passed_full_sync_ = false;

  bool is_paused_ = false;
  std::string id_;

  std::optional<cluster::SlotRange> slot_range_;

  uint32_t reconnect_count_ = 0;
  size_t psync_attempts_ = 0;
  size_t psync_successes_ = 0;
};

class RdbLoader;
// This class implements a single shard replication flow from a Dragonfly master instance.
// Multiple DflyShardReplica objects are managed by a Replica object.
class DflyShardReplica : public ProtocolClient {
 public:
  DflyShardReplica(ServerContext server_context, MasterContext master_context, uint32_t flow_id,
                   Service* service, std::shared_ptr<MultiShardExecution> multi_shard_exe,
                   class RdbLoadContext* load_context);
  ~DflyShardReplica();

  void Cancel();
  void JoinFlow();

  // Start replica initialized as dfly flow.
  // Sets is_full_sync when successful.
  io::Result<bool> StartSyncFlow(util::fb2::BlockingCounter block, ExecutionState* cntx,
                                 std::optional<LSN>,
                                 std::optional<Replica::LastMasterSyncData> data);

  // Transition into stable state mode as dfly flow.
  std::error_code StartStableSyncFlow(ExecutionState* cntx);

  // Single flow full sync fiber spawned by StartFullSyncFlow.
  void FullSyncDflyFb(std::string eof_token, util::fb2::BlockingCounter block,
                      ExecutionState* cntx);

  // Single flow stable state sync fiber spawned by StartStableSyncFlow.
  void StableSyncDflyReadFb(ExecutionState* cntx);

  void StableSyncDflyAcksFb(ExecutionState* cntx);

  // Return true if the transaction executed successfully. On error,
  // or on context cancellation return false.
  bool ExecuteTx(TransactionData&& tx_data, ExecutionState* cntx);

  uint32_t FlowId() const;

  uint64_t JournalExecutedCount() const {
    return journal_rec_executed_.load(std::memory_order_relaxed);
  }

  uint64_t SetRecordsExecuted(uint64_t value) {
    return journal_rec_executed_ = value;
  }

  // Can be called from any thread.
  void Pause(bool pause);

 private:
  Service& service_;
  MasterContext master_context_;

  std::optional<base::IoBuf> leftover_buf_;

  util::fb2::EventCount shard_replica_waker_;  // waker for trans_data_queue_

  std::unique_ptr<JournalExecutor> executor_;
  std::unique_ptr<RdbLoader> rdb_loader_;

  // The master instance has a LSN for each journal record. This counts
  // the number of journal records executed in this flow plus the initial
  // journal offset that we received in the transition from full sync
  // to stable sync.
  // Note: This is not 1-to-1 the LSN in the master, because this counts
  // **executed** records, which might be received interleaved when commands
  // run out-of-order on the master instance.
  // Atomic, because JournalExecutedCount() can be called from any thread.
  std::atomic_uint64_t journal_rec_executed_ = 1;

  util::fb2::Fiber sync_fb_, acks_fb_;
  size_t ack_offs_ = 0;
  int proactor_index_ = -1;
  bool force_ping_ = false;

  std::shared_ptr<MultiShardExecution> multi_shard_exe_;
  uint32_t flow_id_ = UINT32_MAX;  // Flow id if replica acts as a dfly flow.
};

}  // namespace dfly


================================================
FILE: src/server/replica_types.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string>
#include <vector>

#include "server/common_types.h"

namespace dfly {

struct ReplicaSummary {
  std::string host;
  uint16_t port;
  bool master_link_established;
  bool full_sync_in_progress;
  bool full_sync_done;
  time_t master_last_io_sec;  // monotonic clock.
  std::string master_id;
  uint32_t reconnect_count;

  // sum of the offsets on all the flows.
  uint64_t repl_offset_sum;
  size_t psync_attempts;
  size_t psync_successes;
  // We can't rely on full_sync_done or full_sync_in_progress because
  // on disconnects the replica state mask is cleared. We use this variable
  // to track if the replica reached full sync. When master disconnects,
  // we use this variable to print the journal offsets in info command even
  // when the link is down. It's reset whenever a full sync is initiated again.
  bool passed_full_sync;
};

struct LastMasterSyncData {
  std::string id;
  std::vector<LSN> last_journal_LSNs;  // lsn for each master shard.
};

}  // namespace dfly


================================================
FILE: src/server/script_mgr.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/script_mgr.h"

#include <absl/cleanup/cleanup.h>
#include <absl/strings/ascii.h>
#include <absl/strings/match.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>

#include <regex>
#include <string>

#include "base/flags.h"
#include "base/logging.h"
#include "core/interpreter.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/server_state.h"
#include "server/transaction.h"

ABSL_FLAG(std::string, default_lua_flags, "",
          "Configure default flags for running Lua scripts: \n - Use 'allow-undeclared-keys' to "
          "allow accessing undeclared keys, \n - Use 'disable-atomicity' to allow "
          "running scripts non-atomically, \n - Use 'legacy-float' to return floats as integers.\n"
          "Specify multiple values separated by space, for example 'allow-undeclared-keys "
          "disable-atomicity' runs scripts non-atomically and allows accessing undeclared keys");

ABSL_FLAG(
    bool, lua_auto_async, false,
    "If enabled, call/pcall with discarded values are automatically replaced with acall/apcall.");

ABSL_FLAG(bool, lua_allow_undeclared_auto_correct, false,
          "If enabled, when a script that is not allowed to run with undeclared keys is trying to "
          "access undeclared keys, automaticaly set the script flag to be able to run with "
          "undeclared key.");

ABSL_FLAG(
    std::vector<std::string>, lua_undeclared_keys_shas, {},
    "Comma-separated list of Lua script SHAs which are allowed to access undeclared keys. SHAs are "
    "only looked at when loading the script, and new values do not affect already-loaded script.");

ABSL_FLAG(std::vector<std::string>, lua_float_as_int_shas, {},
          "Comma-separated list of Lua script SHAs which should return floats as integers. "
          "SHAs are only looked at when loading the script.");

namespace dfly {
using namespace std;
using namespace facade;
using namespace util;

ScriptMgr::ScriptMgr() {
  // Build default script flags
  string flags = absl::GetFlag(FLAGS_default_lua_flags);

  static_assert(ScriptParams{}.atomic && !ScriptParams{}.undeclared_keys &&
                !ScriptParams{}.float_as_int);

  auto err = ScriptParams::ApplyFlags(flags, &default_params_);
  CHECK(!err) << err.Format();
}

ScriptMgr::ScriptKey::ScriptKey(string_view sha) : array{} {
  DCHECK_EQ(sha.size(), size());
  memcpy(data(), sha.data(), size());
}

void ScriptMgr::Run(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder,
                    ConnectionContext* cntx) {
  string subcmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (subcmd == "HELP") {
    string_view kHelp[] = {
        "SCRIPT <subcommand> [<arg> [value] [opt] ...]",
        "Subcommands are:",
        "EXISTS <sha1> [<sha1> ...]",
        "   Return information about the existence of the scripts in the script cache.",
        "FLUSH",
        "   Flush the Lua scripts cache. Very dangerous on replicas.",
        "LOAD <script>",
        "   Load a script into the scripts cache without executing it.",
        "FLAGS <sha> [flags ...]",
        "   Set specific flags for script. Can be called before the sript is loaded.",
        "   The following flags are possible: ",
        "      - Use 'allow-undeclared-keys' to allow accessing undeclared keys",
        "      - Use 'disable-atomicity' to allow running scripts non-atomically",
        "      - Use 'legacy-float' to return floats as integers",
        "LIST",
        "   Lists loaded scripts.",
        "LATENCY",
        "   Prints latency histograms in usec for every called function.",
        "GC",
        "   Invokes garbage collection on all unused interpreter instances.",
        "HELP",
        "   Prints this help."};
    auto rb = static_cast<RedisReplyBuilder*>(builder);
    return rb->SendSimpleStrArr(kHelp);
  }

  if (subcmd == "EXISTS" && args.size() > 1)
    return ExistsCmd(args, tx, builder);

  if (subcmd == "FLUSH")
    return FlushCmd(args, tx, builder);

  if (subcmd == "LIST")
    return ListCmd(tx, builder);

  if (subcmd == "LATENCY")
    return LatencyCmd(tx, builder);

  if (subcmd == "LOAD" && args.size() == 2)
    return LoadCmd(args, tx, builder, cntx);

  if (subcmd == "FLAGS" && args.size() > 2)
    return ConfigCmd(args, tx, builder);

  if (subcmd == "GC")
    return GCCmd(tx, builder);

  string err = absl::StrCat("Unknown subcommand or wrong number of arguments for '", subcmd,
                            "'. Try SCRIPT HELP.");
  builder->SendError(err, kSyntaxErrType);
}

void ScriptMgr::ExistsCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) const {
  vector<uint8_t> res(args.size() - 1, 0);
  for (size_t i = 1; i < args.size(); ++i) {
    if (string_view sha = ArgS(args, i); Find(sha)) {
      res[i - 1] = 1;
    }
  }

  auto rb = static_cast<RedisReplyBuilder*>(builder);
  rb->StartArray(res.size());
  for (uint8_t v : res) {
    rb->SendLong(v);
  }
}

void ScriptMgr::FlushCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) {
  FlushAllScript();

  return builder->SendOk();
}

void ScriptMgr::LoadCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder,
                        ConnectionContext* cntx) {
  string_view body = ArgS(args, 1);
  auto rb = static_cast<RedisReplyBuilder*>(builder);
  if (body.empty()) {
    char sha[41];
    Interpreter::FuncSha1(body, sha);
    return rb->SendBulkString(sha);
  }

  BorrowedInterpreter interpreter{tx, &cntx->conn_state};

  auto res = Insert(body, interpreter);
  if (!res)
    return builder->SendError(res.error().Format());

  // Schedule empty callback inorder to journal command via transaction framework.
  tx->ScheduleSingleHop([](auto* t, auto* shard) { return OpStatus::OK; });

  return rb->SendBulkString(res.value());
}

void ScriptMgr::ConfigCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) {
  string_view sha = ArgS(args, 1);
  if (sha.size() != ScriptKey{}.size()) {
    return builder->SendError(kSyntaxErr);
  }

  lock_guard lk{mu_};
  ScriptKey key{sha};
  auto& data = db_[key];

  for (auto flag : args.subspan(2)) {
    if (auto err = ScriptParams::ApplyFlags(facade::ToSV(flag), &data); err)
      return builder->SendError("Invalid config format: " + err.Format());
  }

  UpdateScriptCaches(key, data);

  // Schedule empty callback inorder to journal command via transaction framework.
  tx->ScheduleSingleHop([](auto* t, auto* shard) { return OpStatus::OK; });

  return builder->SendOk();
}

void ScriptMgr::ListCmd(Transaction* tx, SinkReplyBuilder* builder) const {
  vector<pair<string, ScriptData>> scripts = GetAll();
  auto rb = static_cast<RedisReplyBuilder*>(builder);
  rb->StartArray(scripts.size());
  for (const auto& [sha, data] : scripts) {
    rb->StartArray(2);
    rb->SendBulkString(sha);
    rb->SendBulkString(data.body);
  }
}

void ScriptMgr::LatencyCmd(Transaction* tx, SinkReplyBuilder* builder) const {
  absl::flat_hash_map<std::string, base::Histogram> result;
  fb2::Mutex mu;

  shard_set->pool()->AwaitFiberOnAll([&](auto* pb) {
    auto* ss = ServerState::tlocal();
    mu.lock();
    for (const auto& k_v : ss->call_latency_histos()) {
      result[k_v.first].Merge(k_v.second);
    }
    mu.unlock();
  });

  auto rb = static_cast<RedisReplyBuilder*>(builder);
  rb->StartArray(result.size());
  for (const auto& k_v : result) {
    rb->StartArray(2);
    rb->SendBulkString(k_v.first);
    rb->SendVerbatimString(k_v.second.ToString());
  }
}

void ScriptMgr::GCCmd(Transaction* tx, SinkReplyBuilder* builder) const {
  auto cb = [](Interpreter* ir) {
    ir->RunGC();
    ThisFiber::Yield();
  };
  shard_set->pool()->AwaitFiberOnAll(
      [cb](auto* pb) { ServerState::tlocal()->AlterInterpreters(cb); });
  return builder->SendOk();
}

// Check if script starts with lua flags instructions (--df flags=...).
io::Result<optional<ScriptMgr::ScriptParams>, GenericError> DeduceParams(string_view body) {
  static const regex kRegex{R"(^\s*?--!df flags=([^\s\n\r]*)[\s\n\r])"};
  cmatch matches;

  if (!regex_search(body.data(), matches, kRegex))
    return nullopt;

  ScriptMgr::ScriptParams params;
  if (auto err = ScriptMgr::ScriptParams::ApplyFlags(matches.str(1), &params); err)
    return nonstd::make_unexpected(err);

  return params;
}

unique_ptr<char[]> CharBufFromSV(string_view sv) {
  auto ptr = make_unique<char[]>(sv.size() + 1);
  memcpy(ptr.get(), sv.data(), sv.size());
  ptr[sv.size()] = '\0';
  return ptr;
}

nonstd::expected<string, GenericError> ScriptMgr::Insert(string_view body,
                                                         Interpreter* interpreter) {
  char sha_buf[64];
  Interpreter::FuncSha1(body, sha_buf);
  string_view sha{sha_buf, std::strlen(sha_buf)};

  if (interpreter->Exists(sha)) {
    return string{sha};
  }

  auto params_opt = DeduceParams(body);
  if (!params_opt)
    return params_opt.get_unexpected();
  auto params = params_opt->value_or(default_params_);

  if (!params.atomic) {
    // override atomicity for a specific buggy script.
    constexpr string_view sha_4522 =
        "f8133be7f04abd9dfefa83c3b29a9d837cfbda86"sv;  // Sidekiq, see #4522
    if (sha == sha_4522) {
      params.atomic = true;
    }
  }

  const char* kUndeclaredShas[] = {
      "351130589c64523cb98978dc32c64173a31244f3",  // Sidekiq, see #2442
      "6ae15ef4678593dc61f991c9953722d67d822776",  // Sidekiq, see #2442
      "34b1048274c8e50a0cc587a3ed9c383a82bb78c5",  // Sidekiq
      "b725ca33e5b36f318ab1150b8ac955a3d997c872",  // Sentry, see #5495
      "8c4dafdf9b6b7bcf511a0d1ec0518bed9260e16d",  // django-cacheops see #6119
      "3fc258d735c924d5652fceb90b41bea1f1f29e4b",  // django-cacheops see #6119
      "43d401bd2bd0ad864c3ca221512cda1b6215ec23",  // django-cacheops see #272
      // Cm_Cache_Backend_Redis (Magento) - until
      // https://github.com/colinmollenhour/Cm_Cache_Backend_Redis/pull/186 is merged
      "1617c9fb2bda7d790bb1aaa320c1099d81825e64",  // Cm_Cache_Backend_Redis LUA_SAVE
      "39383dcf36d2e71364a666b2a806bc8219cd332d",  // Cm_Cache_Backend_Redis LUA_CLEAN
      "6990147f5d1999b936dac3b6f7e5d2071908bcf3",  // Cm_Cache_Backend_Redis LUA_GC
  };

  if (find(begin(kUndeclaredShas), end(kUndeclaredShas), sha) != end(kUndeclaredShas)) {
    params.undeclared_keys = true;
  } else {
    auto undeclared_shas = absl::GetFlag(FLAGS_lua_undeclared_keys_shas);
    if (find(undeclared_shas.begin(), undeclared_shas.end(), sha) != undeclared_shas.end()) {
      params.undeclared_keys = true;
    }
  }

  auto float_as_int_shas = absl::GetFlag(FLAGS_lua_float_as_int_shas);
  if (find(float_as_int_shas.begin(), float_as_int_shas.end(), sha) != float_as_int_shas.end()) {
    params.float_as_int = true;
  }

  // If the script is atomic, check for possible squashing optimizations.
  // For non atomic modes, squashing increases the time locks are held, which
  // can decrease throughput with frequently accessed keys.
  optional<string> async_body;
  if (params.atomic && absl::GetFlag(FLAGS_lua_auto_async)) {
    if (async_body = Interpreter::DetectPossibleAsyncCalls(body); async_body)
      body = *async_body;
  }

  string result;
  Interpreter::AddResult add_result = interpreter->AddFunction(sha, body, &result);
  if (add_result == Interpreter::COMPILE_ERR)
    return nonstd::make_unexpected(GenericError{std::move(result)});

  lock_guard lk{mu_};
  auto [it, _] = db_.emplace(sha, InternalScriptData{params, nullptr});

  if (!it->second.body) {
    it->second.body = CharBufFromSV(body);
  }

  UpdateScriptCaches(sha, it->second);

  return string{sha};
}

optional<ScriptMgr::ScriptData> ScriptMgr::Find(std::string_view sha) const {
  if (sha.size() != ScriptKey{}.size())
    return std::nullopt;

  lock_guard lk{mu_};
  if (auto it = db_.find(sha); it != db_.end() && it->second.body)
    return ScriptData{it->second, it->second.body.get()};

  return std::nullopt;
}

void ScriptMgr::OnScriptError(std::string_view sha, std::string_view error) {
  ++tl_facade_stats->reply_stats.script_error_count;

  // Log script errors at most 5 times a second.
  LOG_EVERY_T(WARNING, 0.2) << "Error running script (call to " << sha << "): " << error;

  // If script has undeclared_keys and was not flaged to run in this mode we will change the
  // script flag - this will make script next run to not fail but run as global.
  if (absl::GetFlag(FLAGS_lua_allow_undeclared_auto_correct)) {
    size_t pos = error.rfind(kUndeclaredKeyErr);
    lock_guard lk{mu_};
    auto it = db_.find(sha);
    if (it == db_.end()) {
      return;
    }

    if (pos != string::npos) {
      it->second.undeclared_keys = true;
      LOG(WARNING) << "Setting undeclared_keys flag for script with sha : (" << sha << ")";
      UpdateScriptCaches(sha, it->second);
    }
  }
}

void ScriptMgr::FlushAllScript() {
  lock_guard lk{mu_};
  db_.clear();

  shard_set->pool()->AwaitFiberOnAll([](auto* pb) {
    ServerState* ss = ServerState::tlocal();
    ss->FlushScriptCache();
  });
}

vector<pair<string, ScriptMgr::ScriptData>> ScriptMgr::GetAll() const {
  vector<pair<string, ScriptData>> res;

  lock_guard lk{mu_};
  res.reserve(db_.size());
  for (const auto& [sha, data] : db_) {
    string body = data.body ? string{data.body.get()} : string{};
    res.emplace_back(string{sha.data(), sha.size()}, ScriptData{data, std::move(body)});
  }

  return res;
}

void ScriptMgr::UpdateScriptCaches(ScriptKey sha, ScriptParams params) const {
  shard_set->pool()->AwaitBrief([&sha, &params](auto index, auto* pb) {
    ServerState::tlocal()->SetScriptParams(sha, params);
  });
}

bool ScriptMgr::AreGlobalByDefault() const {
  return default_params_.undeclared_keys && default_params_.atomic;
}

GenericError ScriptMgr::ScriptParams::ApplyFlags(string_view config, ScriptParams* params) {
  auto parts = absl::StrSplit(config, absl::ByAnyChar(",; "), absl::SkipEmpty());
  for (auto flag : parts) {
    if (flag == "disable-atomicity") {
      params->atomic = false;
      continue;
    }

    if (flag == "allow-undeclared-keys") {
      params->undeclared_keys = true;
      continue;
    }

    if (flag == "legacy-float") {
      params->float_as_int = true;
      continue;
    }

    if (flag == "no-writes") {  // Used by Redis.
      // TODO: lock read-only.
      continue;
    }

    return GenericError{"Invalid flag: "s + string{flag}};
  }

  return {};
}

}  // namespace dfly


================================================
FILE: src/server/script_mgr.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <array>
#include <nonstd/expected.hpp>
#include <optional>

#include "server/common_types.h"
#include "server/execution_state.h"

namespace facade {
class SinkReplyBuilder;
}  // namespace facade

namespace dfly {

using facade::CmdArgList;

class EngineShardSet;
class Interpreter;

// This class has a state through the lifetime of a server because it manipulates scripts
class ScriptMgr {
 public:
  struct ScriptParams {
    bool atomic = true;            // Whether script must run atomically.
    bool undeclared_keys = false;  // Whether script accesses undeclared keys.
    bool float_as_int = false;     // Whether to return floats as integers.

    // Return GenericError if some flag was invalid.
    // Valid flags are:
    // - allow-undeclared-keys -> undeclared_keys=true
    // - disable-atomicity     -> atomic=false
    // - legacy-float          -> float_as_int=true
    static GenericError ApplyFlags(std::string_view flags, ScriptParams* params);
  };

  struct ScriptData : public ScriptParams {
    std::string body;  // script source code present in lua interpreter
  };

  struct ScriptKey : public std::array<char, 40> {
    ScriptKey() = default;
    ScriptKey(std::string_view sha);
  };

 public:
  using SinkReplyBuilder = facade::SinkReplyBuilder;

  ScriptMgr();

  void Run(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder, ConnectionContext* cntx);

  // Insert script and return sha. Get possible error from compilation or parsing script flags.
  nonstd::expected<std::string, GenericError> Insert(std::string_view body,
                                                     Interpreter* interpreter);

  // Get script body by sha, returns nullptr if not found.
  std::optional<ScriptData> Find(std::string_view sha) const;

  // Returns a list of all scripts in the database with their sha and body.
  std::vector<std::pair<std::string, ScriptData>> GetAll() const;

  void FlushAllScript();

  // Returns if scripts run as global transactions by default
  bool AreGlobalByDefault() const;

  void OnScriptError(std::string_view sha, std::string_view error);

 private:
  void ExistsCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) const;
  void FlushCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder);
  void LoadCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder,
               ConnectionContext* cntx);
  void ConfigCmd(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder);
  void ListCmd(Transaction* tx, SinkReplyBuilder* builder) const;
  void LatencyCmd(Transaction* tx, SinkReplyBuilder* builder) const;
  void GCCmd(Transaction* tx, SinkReplyBuilder* builder) const;

  void UpdateScriptCaches(ScriptKey sha, ScriptParams params) const;

 private:
  struct InternalScriptData : public ScriptParams {
    std::unique_ptr<char[]> body{};
    std::unique_ptr<char[]> orig_body{};
  };

  ScriptParams default_params_;

  absl::flat_hash_map<ScriptKey, InternalScriptData> db_;
  mutable util::fb2::Mutex mu_;
};

}  // namespace dfly


================================================
FILE: src/server/search/CMakeLists.txt
================================================
if (NOT WITH_SEARCH)
  SET(DF_SEARCH_SRCS search/doc_index_fallback.cc PARENT_SCOPE)
else()
  SET(DF_SEARCH_SRCS
    search/aggregator.cc
    search/doc_accessors.cc
    search/doc_index.cc
    search/search_family.cc
    search/index_join.cc
    search/global_hnsw_index.cc
    search/index_builder.cc
    PARENT_SCOPE)
endif()


================================================
FILE: src/server/search/aggregator.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/aggregator.h"

#include "base/logging.h"
#include "server/search/doc_index.h"

namespace dfly::aggregate {

namespace {

using ValuesList = absl::FixedArray<Value>;

ValuesList ExtractFieldsValues(const DocValues& dv, absl::Span<const std::string> fields) {
  ValuesList out(fields.size());
  for (size_t i = 0; i < fields.size(); i++) {
    auto it = dv.find(fields[i]);
    out[i] = (it != dv.end()) ? it->second : Value{};
  }
  return out;
}

DocValues PackFields(ValuesList values, absl::Span<const std::string> fields) {
  DCHECK_EQ(values.size(), fields.size());
  DocValues out;
  for (size_t i = 0; i < fields.size(); i++)
    out[fields[i]] = std::move(values[i]);
  return out;
}

const Value kEmptyValue = Value{};

}  // namespace

void Aggregator::DoGroup(absl::Span<const std::string> fields, absl::Span<const Reducer> reducers) {
  // Separate items into groups
  absl::flat_hash_map<ValuesList, std::vector<DocValues>> groups;
  for (auto& value : result.values) {
    groups[ExtractFieldsValues(value, fields)].push_back(std::move(value));
  }

  // Restore DocValues and apply reducers
  auto& values = result.values;
  values.clear();
  values.reserve(groups.size());
  while (!groups.empty()) {
    auto node = groups.extract(groups.begin());
    DocValues doc = PackFields(std::move(node.key()), fields);
    for (auto& reducer : reducers) {
      doc[reducer.result_field] = reducer.func({reducer.source_field, node.mapped()});
    }
    values.push_back(std::move(doc));
  }

  auto& fields_to_print = result.fields_to_print;
  fields_to_print.clear();
  fields_to_print.reserve(fields.size() + reducers.size());

  for (auto& field : fields) {
    fields_to_print.insert(field);
  }
  for (auto& reducer : reducers) {
    fields_to_print.insert(reducer.result_field);
  }
}

void Aggregator::DoSort(const SortParams& sort_params) {
  /*
    Comparator for sorting DocValues by fields.
    If some of the fields is not present in the DocValues, comparator returns:
    1. l_it == l.end() && r_it != r.end()
      asc -> false
      desc -> false
    2. l_it != l.end() && r_it == r.end()
      asc -> true
      desc -> true
    3. l_it == l.end() && r_it == r.end()
      asc -> false
      desc -> false
  */
  auto comparator = [&](const DocValues& l, const DocValues& r) {
    for (const auto& [field, order] : sort_params.fields) {
      auto l_it = l.find(field);
      auto r_it = r.find(field);

      // If some of the values is not present
      if (l_it == l.end() || r_it == r.end()) {
        if (l_it == l.end() && r_it == r.end()) {
          continue;
        }
        return l_it != l.end();
      }

      const auto& lv = l_it->second;
      const auto& rv = r_it->second;
      if (lv == rv) {
        continue;
      }
      return order == SortOrder::ASC ? lv < rv : lv > rv;
    }
    return false;
  };

  auto& values = result.values;
  if (sort_params.SortAll()) {
    std::sort(values.begin(), values.end(), comparator);
  } else {
    DCHECK_GE(sort_params.max, 0);
    const size_t limit = std::min(values.size(), size_t(sort_params.max));
    std::partial_sort(values.begin(), values.begin() + limit, values.end(), comparator);
    values.resize(limit);
  }

  for (auto& field : sort_params.fields) {
    result.fields_to_print.insert(field.first);
  }
}

void Aggregator::DoLimit(size_t offset, size_t num) {
  auto& values = result.values;
  values.erase(values.begin(), values.begin() + std::min(offset, values.size()));
  values.resize(std::min(num, values.size()));
}

const Value& ValueIterator::operator*() const {
  auto it = values_.front().find(field_);
  return it == values_.front().end() ? kEmptyValue : it->second;
}

ValueIterator& ValueIterator::operator++() {
  values_.remove_prefix(1);
  return *this;
}

Reducer::Func FindReducerFunc(ReducerFunc name) {
  const static auto kCountReducer = [](ValueIterator it) -> double {
    return std::distance(it, it.end());
  };

  const static auto kSumReducer = [](ValueIterator it) -> double {
    double sum = 0;
    for (; it != it.end(); ++it)
      sum += std::holds_alternative<double>(*it) ? std::get<double>(*it) : 0.0;
    return sum;
  };

  switch (name) {
    case ReducerFunc::COUNT:
      return [](ValueIterator it) -> Value { return kCountReducer(it); };
    case ReducerFunc::COUNT_DISTINCT:
      return [](ValueIterator it) -> Value {
        return double(std::unordered_set<Value>(it, it.end()).size());
      };
    case ReducerFunc::SUM:
      return [](ValueIterator it) -> Value { return kSumReducer(it); };
    case ReducerFunc::AVG:
      return [](ValueIterator it) -> Value { return kSumReducer(it) / kCountReducer(it); };
    case ReducerFunc::MAX:
      return [](ValueIterator it) -> Value { return *std::max_element(it, it.end()); };
    case ReducerFunc::MIN:
      return [](ValueIterator it) -> Value { return *std::min_element(it, it.end()); };
  }

  return nullptr;
}

AggregationStep MakeGroupStep(std::vector<std::string> fields, std::vector<Reducer> reducers) {
  return [fields = std::move(fields), reducers = std::move(reducers)](Aggregator* aggregator) {
    aggregator->DoGroup(fields, reducers);
  };
}

AggregationStep MakeSortStep(SortParams sort_params) {
  return [params = std::move(sort_params)](Aggregator* aggregator) { aggregator->DoSort(params); };
}

AggregationStep MakeLimitStep(size_t offset, size_t num) {
  return [=](Aggregator* aggregator) { aggregator->DoLimit(offset, num); };
}

AggregationResult Process(std::vector<DocValues> values,
                          absl::Span<const std::string_view> fields_to_print,
                          absl::Span<const AggregationStep> steps) {
  Aggregator aggregator{std::move(values), {fields_to_print.begin(), fields_to_print.end()}};
  for (auto& step : steps) {
    step(&aggregator);
  }
  return aggregator.result;
}

}  // namespace dfly::aggregate


================================================
FILE: src/server/search/aggregator.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/types/span.h>

#include <string>
#include <variant>

#include "core/search/base.h"
#include "facade/reply_builder.h"
#include "io/io.h"

namespace dfly {
enum class SortOrder;
}

namespace dfly::aggregate {

struct Reducer;

using Value = ::dfly::search::SortableValue;

// DocValues sent through the pipeline
// TODO: Replace DocValues with compact linear search map instead of hash map
using DocValues = absl::flat_hash_map<std::string, Value>;

struct AggregationResult {
  // Values to be passed to the next step
  std::vector<DocValues> values;

  // Fields from values to be printed
  absl::flat_hash_set<std::string_view> fields_to_print;
};

struct SortParams {
  constexpr static int64_t kSortAll = -1;

  bool SortAll() const {
    return max == kSortAll;
  }

  /* Fields to sort by. If multiple fields are provided, sorting works hierarchically:
     - First, the i-th field is compared.
     - If the i-th field values are equal, the (i + 1)-th field is compared, and so on. */
  absl::InlinedVector<std::pair<std::string, SortOrder>, 2> fields;
  /* Max number of elements to include in the sorted result.
     If set, only the first [max] elements are fully sorted using partial_sort. */
  int64_t max = kSortAll;
};

struct Aggregator {
  void DoGroup(absl::Span<const std::string> fields, absl::Span<const Reducer> reducers);
  void DoSort(const SortParams& sort_params);
  void DoLimit(size_t offset, size_t num);

  AggregationResult result;
};

using AggregationStep = std::function<void(Aggregator*)>;  // Group, Sort, etc.

// Iterator over Span<DocValues> that yields doc[field] or monostate if not present.
// Extra clumsy for STL compatibility!
struct ValueIterator {
  using iterator_category = std::forward_iterator_tag;
  using difference_type = std::ptrdiff_t;
  using value_type = const Value;
  using pointer = const Value*;
  using reference = const Value&;

  ValueIterator(std::string_view field, absl::Span<const DocValues> values)
      : field_{field}, values_{values} {
  }

  const Value& operator*() const;

  ValueIterator& operator++();

  bool operator==(const ValueIterator& other) const {
    return values_.size() == other.values_.size();
  }

  bool operator!=(const ValueIterator& other) const {
    return !operator==(other);
  }

  static ValueIterator end() {
    return ValueIterator{};
  }

 private:
  ValueIterator() = default;

  std::string_view field_;
  absl::Span<const DocValues> values_;
};

struct Reducer {
  using Func = Value (*)(ValueIterator);
  std::string source_field, result_field;
  Func func;
};

enum class ReducerFunc { COUNT, COUNT_DISTINCT, SUM, AVG, MAX, MIN };

// Find reducer function by uppercase name (COUNT, MAX, etc...), empty functor if not found
Reducer::Func FindReducerFunc(ReducerFunc name);

// Make `GROUPBY [fields...]`  with REDUCE step
AggregationStep MakeGroupStep(std::vector<std::string> fields, std::vector<Reducer> reducers);

// Make `SORTBY field [DESC]` step
AggregationStep MakeSortStep(SortParams sort_params);

// Make `LIMIT offset num` step
AggregationStep MakeLimitStep(size_t offset, size_t num);

// Process values with given steps
AggregationResult Process(std::vector<DocValues> values,
                          absl::Span<const std::string_view> fields_to_print,
                          absl::Span<const AggregationStep> steps);

}  // namespace dfly::aggregate


================================================
FILE: src/server/search/aggregator_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/aggregator.h"

#include "base/gtest.h"
#include "server/search/doc_index.h"

namespace dfly::aggregate {

using namespace std::string_literals;

using StepsList = std::vector<AggregationStep>;

TEST(AggregatorTest, Sort) {
  std::vector<DocValues> values = {
      DocValues{{"a", 1.0}},
      DocValues{{"a", 0.5}},
      DocValues{{"a", 1.5}},
  };

  SortParams params;
  params.fields.emplace_back("a", SortOrder::ASC);
  StepsList steps = {MakeSortStep(std::move(params))};

  auto result = Process(values, {"a"}, steps);

  EXPECT_EQ(result.values[0]["a"], Value(0.5));
  EXPECT_EQ(result.values[1]["a"], Value(1.0));
  EXPECT_EQ(result.values[2]["a"], Value(1.5));
}

TEST(AggregatorTest, Limit) {
  std::vector<DocValues> values = {
      DocValues{{"i", 1.0}},
      DocValues{{"i", 2.0}},
      DocValues{{"i", 3.0}},
      DocValues{{"i", 4.0}},
  };

  StepsList steps = {MakeLimitStep(1, 2)};

  auto result = Process(values, {"i"}, steps);

  EXPECT_EQ(result.values.size(), 2);
  EXPECT_EQ(result.values[0]["i"], Value(2.0));
  EXPECT_EQ(result.values[1]["i"], Value(3.0));
}

TEST(AggregatorTest, SimpleGroup) {
  std::vector<DocValues> values = {
      DocValues{{"i", 1.0}, {"tag", "odd"}},
      DocValues{{"i", 2.0}, {"tag", "even"}},
      DocValues{{"i", 3.0}, {"tag", "odd"}},
      DocValues{{"i", 4.0}, {"tag", "even"}},
  };

  std::vector<std::string> fields = {"tag"};
  StepsList steps = {MakeGroupStep(std::move(fields), {})};

  auto result = Process(values, {"i", "tag"}, steps);
  EXPECT_EQ(result.values.size(), 2);

  EXPECT_EQ(result.values[0].size(), 1);
  std::set<Value> groups{result.values[0]["tag"], result.values[1]["tag"]};
  std::set<Value> expected{"even", "odd"};
  EXPECT_EQ(groups, expected);
}

TEST(AggregatorTest, GroupWithReduce) {
  std::vector<DocValues> values;
  // range from 0 to 9 inclusive
  for (size_t i = 0; i < 10; i++) {
    values.push_back(DocValues{
        {"i", double(i)},
        {"half-i", double(i / 4)},
        {"tag", i % 2 == 0 ? "even" : "odd"},
    });
  }

  std::vector<std::string> fields = {"tag"};
  std::vector<Reducer> reducers = {
      Reducer{"", "count", FindReducerFunc(ReducerFunc::COUNT)},
      Reducer{"i", "sum-i", FindReducerFunc(ReducerFunc::SUM)},
      Reducer{"half-i", "distinct-hi", FindReducerFunc(ReducerFunc::COUNT_DISTINCT)},
      Reducer{"null-field", "distinct-null", FindReducerFunc(ReducerFunc::COUNT_DISTINCT)}};

  StepsList steps = {MakeGroupStep(std::move(fields), std::move(reducers))};

  auto result = Process(values, {"i", "half-i", "tag"}, steps);
  EXPECT_EQ(result.values.size(), 2);

  // Reorder even first
  if (result.values[0].at("tag") == Value("odd"))
    std::swap(result.values[0], result.values[1]);

  // Even
  EXPECT_EQ(result.values[0].at("count"), Value{(double)5});
  EXPECT_EQ(result.values[0].at("sum-i"), Value{(double)2 + 4 + 6 + 8});
  EXPECT_EQ(result.values[0].at("distinct-hi"), Value{(double)3});
  EXPECT_EQ(result.values[0].at("distinct-null"), Value{(double)1});

  // Odd
  EXPECT_EQ(result.values[1].at("count"), Value{(double)5});
  EXPECT_EQ(result.values[1].at("sum-i"), Value{(double)1 + 3 + 5 + 7 + 9});
  EXPECT_EQ(result.values[1].at("distinct-hi"), Value{(double)3});
  EXPECT_EQ(result.values[1].at("distinct-null"), Value{(double)1});
}

}  // namespace dfly::aggregate


================================================
FILE: src/server/search/doc_accessors.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

// GCC yields a spurious warning about uninitialized data in DocumentAccessor::StringList.

#ifndef __clang__
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

#include "server/search/doc_accessors.h"

#include <absl/functional/any_invocable.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>

#include "base/flags.h"
#include "core/detail/listpack_wrap.h"
#include "core/json/path.h"
#include "core/overloaded.h"
#include "core/search/search.h"
#include "core/search/vector_utils.h"
#include "core/string_map.h"
#include "server/container_utils.h"

extern "C" {
#include "redis/listpack.h"
};

ABSL_DECLARE_FLAG(bool, jsonpathv2);

namespace dfly {

using namespace std;

namespace {

string_view SdsToSafeSv(sds str) {
  return str != nullptr ? string_view{str, sdslen(str)} : ""sv;
}

using FieldValue = std::optional<search::SortableValue>;

FieldValue ToSortableValue(search::SchemaField::FieldType type, string_view value) {
  if (value.empty()) {
    return std::nullopt;
  }

  if (type == search::SchemaField::NUMERIC) {
    auto value_as_double = search::ParseNumericField(value);
    if (!value_as_double) {  // temporary convert to double
      return std::nullopt;
    }
    return value_as_double.value();
  }
  if (type == search::SchemaField::VECTOR) {
    auto opt_vector = search::BytesToFtVectorSafe(value);
    if (!opt_vector) {
      return std::nullopt;
    }
    auto& [ptr, size] = opt_vector.value();
    return absl::StrCat("[", absl::StrJoin(absl::Span<const float>{ptr.get(), size}, ","), "]");
  }
  return string{value};
}

FieldValue ExtractSortableValue(const search::Schema& schema, string_view key, string_view value) {
  auto it = schema.fields.find(key);
  if (it == schema.fields.end())
    return ToSortableValue(search::SchemaField::TEXT, value);
  return ToSortableValue(it->second.type, value);
}

FieldValue ExtractSortableValueFromJson(const search::Schema& schema, string_view key,
                                        const JsonType& json) {
  if (json.is_null()) {
    return std::monostate{};
  }
  auto json_as_string = json.as_string();
  return ExtractSortableValue(schema, key, json_as_string);
}

/* Returns true if json elements were successfully processed. */
bool ProcessJsonElements(const std::vector<JsonType>& json_elements,
                         absl::FunctionRef<bool(const JsonType&)> cb) {
  auto process = [&cb](const auto& json_range) -> bool {
    for (const auto& json : json_range) {
      if (!json.is_null() && !cb(json)) {
        return false;
      }
    }
    return true;
  };

  if (!json_elements[0].is_array()) {
    return process(json_elements);
  }
  return json_elements.size() == 1 && process(json_elements[0].array_range());
}

}  // namespace

SearchDocData BaseAccessor::Serialize(const search::Schema& schema,
                                      absl::Span<const FieldReference> fields) const {
  SearchDocData out{};
  for (const auto& field : fields) {
    string_view fident = field.Identifier(schema, false);
    auto field_value =
        ExtractSortableValue(schema, fident, absl::StrJoin(GetStrings(fident).value(), ","));
    if (field_value) {
      out[field.OutputName()] = std::move(field_value).value();
    }
  }
  return out;
}

std::optional<BaseAccessor::VectorInfo> BaseAccessor::GetVector(std::string_view active_field,
                                                                size_t dim) const {
  auto strings_list = GetStrings(active_field);
  if (strings_list) {
    if (!strings_list->empty()) {
      auto value = strings_list->front();
      if ((value.size() % sizeof(float)) || (value.size() / sizeof(float) != dim)) {
        return std::nullopt;
      }
      return value.data();
    } else {
      return nullptr;
    }
  }
  return std::nullopt;
}

std::optional<BaseAccessor::NumsList> BaseAccessor::GetNumbers(
    std::string_view active_field) const {
  auto strings_list = GetStrings(active_field);
  if (!strings_list) {
    return std::nullopt;
  }

  NumsList nums_list;
  nums_list.reserve(strings_list->size());
  for (auto str : strings_list.value()) {
    auto num = search::ParseNumericField(str);
    if (!num) {
      return std::nullopt;
    }
    nums_list.push_back(num.value());
  }
  return nums_list;
}

std::optional<BaseAccessor::StringList> BaseAccessor::GetTags(std::string_view active_field) const {
  return GetStrings(active_field);
}

std::optional<BaseAccessor::StringList> ListPackAccessor::GetStrings(
    string_view active_field) const {
  auto it = lw_.Find(active_field);
  return it != lw_.end() ? StringList{(*it).second} : StringList{};
}

SearchDocData ListPackAccessor::Serialize(const search::Schema& schema) const {
  SearchDocData out{};
  for (const auto [key, value] : lw_) {
    if (auto field_value = ExtractSortableValue(schema, key, value); field_value) {
      out[key] = std::move(field_value).value();
    }
  }
  return out;
}

std::optional<BaseAccessor::StringList> StringMapAccessor::GetStrings(
    string_view active_field) const {
  auto it = hset_->Find(active_field);
  return it != hset_->end() ? StringList{SdsToSafeSv(it->second)} : StringList{};
}

SearchDocData StringMapAccessor::Serialize(const search::Schema& schema) const {
  SearchDocData out{};
  for (const auto& [kptr, vptr] : *hset_) {
    auto field_value = ExtractSortableValue(schema, SdsToSafeSv(kptr), SdsToSafeSv(vptr));
    if (field_value) {
      out[SdsToSafeSv(kptr)] = std::move(field_value).value();
    }
  }
  return out;
}

struct JsonAccessor::JsonPathContainer {
  vector<JsonType> Evaluate(const JsonType& json) const {
    vector<JsonType> res;

    visit(Overloaded{[&](const json::Path& path) {
                       json::EvaluatePath(path, json,
                                          [&](auto, const JsonType& v) { res.push_back(v); });
                     },
                     [&](const jsoncons::jsonpath::jsonpath_expression<JsonType>& path) {
                       auto json_arr = path.evaluate(json);
                       for (const auto& v : json_arr.array_range())
                         res.push_back(v);
                     }},
          val);

    return res;
  }

  variant<json::Path, jsoncons::jsonpath::jsonpath_expression<JsonType>> val;
};

std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(std::string_view field) const {
  return GetStrings(field, false);
}

std::optional<BaseAccessor::StringList> JsonAccessor::GetTags(std::string_view active_field) const {
  return GetStrings(active_field, true);
}

std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(std::string_view field,
                                                                 bool accept_boolean_values) const {
  auto* path = GetPath(field);
  if (!path)
    return search::EmptyAccessResult<StringList>();

  auto path_res = path->Evaluate(json_);
  if (path_res.empty())
    return search::EmptyAccessResult<StringList>();

  auto is_convertible_to_string = [](bool accept_boolean_values) -> bool (*)(const JsonType& json) {
    if (accept_boolean_values) {
      return [](const JsonType& json) -> bool { return json.is_string() || json.is_bool(); };
    } else {
      return [](const JsonType& json) -> bool { return json.is_string(); };
    }
  }(accept_boolean_values);

  if (path_res.size() == 1 && !path_res[0].is_array()) {
    if (path_res[0].is_null())
      return StringList{};
    if (!is_convertible_to_string(path_res[0]))
      return std::nullopt;

    buf_ = path_res[0].as_string();
    return StringList{buf_};
  }

  buf_.clear();

  // First, grow buffer and compute string sizes
  vector<size_t> sizes;
  sizes.reserve(path_res.size());

  // Returns true if json element is convertiable to string
  auto add_json_element_to_buf = [&](const JsonType& json) -> bool {
    if (!is_convertible_to_string(json))
      return false;

    size_t start = buf_.size();
    buf_ += json.as_string();
    sizes.push_back(buf_.size() - start);
    return true;
  };

  if (!ProcessJsonElements(path_res, std::move(add_json_element_to_buf))) {
    return std::nullopt;
  }

  // Reposition start pointers to the most recent allocation of buf
  StringList out(sizes.size());

  size_t start = 0;
  for (size_t i = 0; i < out.size(); i++) {
    out[i] = string_view{buf_}.substr(start, sizes[i]);
    start += sizes[i];
  }

  return out;
}

std::optional<BaseAccessor::VectorInfo> JsonAccessor::GetVector(string_view active_field,
                                                                size_t dim) const {
  auto* path = GetPath(active_field);
  if (!path)
    return VectorInfo{};

  auto res = path->Evaluate(json_);
  if (res.empty() || res[0].is_null())
    return VectorInfo{};

  if (!res[0].is_array())
    return std::nullopt;

  size_t size = res[0].size();

  if (size != dim)
    return std::nullopt;

  auto ptr = make_unique<float[]>(size);

  size_t i = 0;
  for (const auto& v : res[0].array_range()) {
    if (!v.is_number()) {
      return std::nullopt;
    }
    ptr[i++] = v.as<float>();
  }

  return search::OwnedFtVector{std::move(ptr), size};
}

std::optional<BaseAccessor::NumsList> JsonAccessor::GetNumbers(string_view active_field) const {
  auto* path = GetPath(active_field);
  if (!path)
    return search::EmptyAccessResult<NumsList>();

  auto path_res = path->Evaluate(json_);
  if (path_res.empty())
    return search::EmptyAccessResult<NumsList>();

  NumsList nums_list;
  nums_list.reserve(path_res.size());

  // Returns true if json element is convertiable to number
  auto add_json_element = [&](const JsonType& json) -> bool {
    if (!json.is_number())
      return false;
    nums_list.push_back(json.as<double>());
    return true;
  };

  if (!ProcessJsonElements(path_res, std::move(add_json_element))) {
    return std::nullopt;
  }
  return nums_list;
}

JsonAccessor::JsonPathContainer* JsonAccessor::GetPath(std::string_view field) const {
  if (auto it = path_cache_.find(field); it != path_cache_.end()) {
    return it->second.get();
  }

  string ec_msg;
  unique_ptr<JsonPathContainer> ptr;
  if (absl::GetFlag(FLAGS_jsonpathv2)) {
    auto path_expr = json::ParsePath(field);
    if (path_expr) {
      ptr.reset(new JsonPathContainer{std::move(path_expr.value())});
    } else {
      ec_msg = path_expr.error();
    }
  } else {
    error_code ec;
    auto path_expr = MakeJsonPathExpr(field, ec);
    if (ec) {
      ec_msg = ec.message();
    } else {
      ptr.reset(new JsonPathContainer{std::move(path_expr)});
    }
  }

  if (!ptr) {
    // This can occur for fields that are not actual JSON paths but are computed aliases
    // (e.g., 'vector_distance' from a KNN search clause in FT.SEARCH RETURN).
    // Such fields are valid for return but won't be found as paths in the document.
    VLOG(1) << "Invalid Json path: " << field << ' ' << ec_msg;
    return nullptr;
  }

  JsonPathContainer* path = ptr.get();
  path_cache_[field] = std::move(ptr);
  return path;
}

SearchDocData JsonAccessor::Serialize(const search::Schema& schema,
                                      absl::Span<const FieldReference> fields) const {
  SearchDocData out{};
  for (const auto& field : fields) {
    string_view ident = field.Identifier(schema, true);
    if (auto* path = GetPath(ident); path) {
      if (auto res = path->Evaluate(json_); !res.empty()) {
        auto field_value = ExtractSortableValueFromJson(schema, ident, res[0]);
        if (field_value) {
          out[field.OutputName()] = std::move(field_value).value();
        }
      }
    }
  }
  return out;
}

SearchDocData JsonAccessor::Serialize(const search::Schema& schema) const {
  return {{"$", json_.to_string()}};
}

void JsonAccessor::RemoveFieldFromCache(string_view field) {
  path_cache_.erase(field);
}

thread_local absl::flat_hash_map<std::string, std::unique_ptr<JsonAccessor::JsonPathContainer>>
    JsonAccessor::path_cache_;

unique_ptr<BaseAccessor> GetAccessor(const DbContext& db_cntx, const PrimeValue& pv) {
  DCHECK(pv.ObjType() == OBJ_HASH || pv.ObjType() == OBJ_JSON);

  if (pv.ObjType() == OBJ_JSON) {
    DCHECK(pv.GetJson());
    return make_unique<JsonAccessor>(pv.GetJson());
  }

  if (pv.Encoding() == kEncodingListPack) {
    auto ptr = reinterpret_cast<uint8_t*>(pv.RObjPtr());
    return make_unique<ListPackAccessor>(ptr);
  } else {
    auto* sm = container_utils::GetStringMap(pv, db_cntx);
    return make_unique<StringMapAccessor>(sm);
  }
}

}  // namespace dfly


================================================
FILE: src/server/search/doc_accessors.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/types/span.h>

#include <string>

#include "core/detail/listpack_wrap.h"
#include "core/json/json_object.h"
#include "core/search/search.h"
#include "core/search/vector_utils.h"
#include "server/search/doc_index.h"
#include "server/table.h"

namespace dfly {

class StringMap;

// Document accessors allow different types (json/hset) to be hidden
// behind a document interface for quering fields and serializing.
// Field string_view's are only valid until the next is requested.
struct BaseAccessor : public search::DocumentAccessor {
  // Serialize all fields
  virtual SearchDocData Serialize(const search::Schema& schema) const = 0;

  // Serialize selected fields
  virtual SearchDocData Serialize(const search::Schema& schema,
                                  absl::Span<const FieldReference> fields) const;

  // Default implementation uses GetStrings
  virtual std::optional<VectorInfo> GetVector(std::string_view active_field,
                                              size_t dim) const override;
  virtual std::optional<NumsList> GetNumbers(std::string_view active_field) const override;
  virtual std::optional<StringList> GetTags(std::string_view active_field) const override;
};

// Accessor for hashes stored with listpack
struct ListPackAccessor : public BaseAccessor {
  explicit ListPackAccessor(uint8_t* ptr /* listpack ptr */) : lw_{ptr} {
  }

  std::optional<StringList> GetStrings(std::string_view field) const override;
  SearchDocData Serialize(const search::Schema& schema) const override;

 private:
  detail::ListpackWrap lw_;
};

// Accessor for hashes stored with StringMap
struct StringMapAccessor : public BaseAccessor {
  explicit StringMapAccessor(StringMap* hset) : hset_{hset} {
  }

  std::optional<StringList> GetStrings(std::string_view field) const override;
  SearchDocData Serialize(const search::Schema& schema) const override;

 private:
  StringMap* hset_;
};

// Accessor for json values
struct JsonAccessor : public BaseAccessor {
  struct JsonPathContainer;  // contains jsoncons::jsonpath::jsonpath_expression

  explicit JsonAccessor(const JsonType* json) : json_{*json} {
  }

  std::optional<StringList> GetStrings(std::string_view field) const override;
  std::optional<VectorInfo> GetVector(std::string_view field, size_t dim) const override;
  std::optional<NumsList> GetNumbers(std::string_view active_field) const override;
  std::optional<StringList> GetTags(std::string_view active_field) const override;

  // The JsonAccessor works with structured types and not plain strings, so an overload is needed
  SearchDocData Serialize(const search::Schema& schema,
                          absl::Span<const FieldReference> fields) const override;
  SearchDocData Serialize(const search::Schema& schema) const override;

  static void RemoveFieldFromCache(std::string_view field);

 private:
  /* If accept_boolean_values is true, then json boolean values are converted to strings */
  std::optional<StringList> GetStrings(std::string_view field, bool accept_boolean_values) const;

  /// Parses `field` into a JSON path. Caches the results internally.
  JsonPathContainer* GetPath(std::string_view field) const;

  const JsonType& json_;
  mutable std::string buf_;

  // Contains built json paths to avoid parsing them repeatedly
  static thread_local absl::flat_hash_map<std::string, std::unique_ptr<JsonPathContainer>>
      path_cache_;
};

// Get accessor for value
std::unique_ptr<BaseAccessor> GetAccessor(const DbContext& db_cntx, const PrimeValue& pv);

}  // namespace dfly


================================================
FILE: src/server/search/doc_index.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/doc_index.h"

#include <absl/strings/str_join.h>

#include <memory>
#include <queue>
#include <ranges>

#include "absl/strings/str_cat.h"
#include "base/logging.h"
#include "core/overloaded.h"
#include "core/search/indices.h"
#include "core/search/stateless_allocator.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/family_utils.h"
#include "server/search/doc_accessors.h"
#include "server/search/global_hnsw_index.h"
#include "server/search/index_builder.h"
#include "server/server_state.h"
#include "util/fibers/fibers.h"

namespace dfly {

using namespace std;
using facade::ErrorReply;
using nonstd::make_unexpected;

namespace {

template <typename F>
void TraverseAllMatching(const DocIndex& index, const OpArgs& op_args, F&& f) {
  auto& db_slice = op_args.GetDbSlice();
  DCHECK(db_slice.IsDbValid(op_args.db_cntx.db_index));
  auto [prime_table, _] = db_slice.GetTables(op_args.db_cntx.db_index);

  string scratch;
  auto cb = [&](PrimeTable::iterator it) {
    PrimeValue& pv = it->second;
    string_view key = it->first.GetSlice(&scratch);

    if (!index.Matches(key, pv.ObjType()))
      return;

    f(key, op_args.db_cntx, pv);
  };

  PrimeTable::Cursor cursor;
  do {
    cursor = prime_table->Traverse(cursor, cb);
    // Yield if the fiber has been running for long.
    if (base::CycleClock::ToUsec(util::ThisFiber::GetRunningTimeCycles()) > 500) {  // 500us
      util::ThisFiber::Yield();
    }
  } while (cursor);
}

bool IsSortableField(std::string_view field_identifier, const search::Schema& schema) {
  auto it = schema.fields.find(field_identifier);
  return it != schema.fields.end() && (it->second.flags & search::SchemaField::SORTABLE);
}

using SortIndiciesFieldsList =
    std::vector<std::pair<string_view /*identifier*/, string_view /*alias*/>>;

std::pair<std::vector<FieldReference>, SortIndiciesFieldsList> PreprocessAggregateFields(
    const search::Schema& schema, const AggregateParams& params,
    const std::optional<std::vector<FieldReference>>& load_fields) {
  absl::flat_hash_map<std::string_view, FieldReference> fields_by_identifier;
  absl::flat_hash_map<std::string_view, std::string_view> sort_indicies_aliases;
  fields_by_identifier.reserve(schema.field_names.size());
  sort_indicies_aliases.reserve(schema.field_names.size());

  for (const auto& [fname, fident] : schema.field_names) {
    if (!IsSortableField(fident, schema)) {
      fields_by_identifier.emplace(fident, FieldReference{fident, fname});
    } else {
      sort_indicies_aliases[fident] = fname;
    }
  }

  for (const auto& field : load_fields.value_or(vector<FieldReference>{})) {
    string_view fident = field.Identifier(schema, false);
    if (!IsSortableField(fident, schema)) {
      fields_by_identifier.insert_or_assign(fident, field);
    } else {
      sort_indicies_aliases[fident] = field.OutputName();
    }
  }

  vector<FieldReference> fields;
  fields.reserve(fields_by_identifier.size());
  for (auto& [_, field] : fields_by_identifier) {
    fields.emplace_back(field);
  }

  return {std::move(fields), {sort_indicies_aliases.begin(), sort_indicies_aliases.end()}};
}

/* Separate fields into basic and sortable. The second vector contains flags indicating
   whether the field at the same index in the first vector is sortable or not. */
std::pair<std::vector<FieldReference>, std::vector<bool>> GetBasicFields(
    absl::Span<const std::string_view> fields, const search::Schema& schema) {
  const size_t fields_count = fields.size();
  std::vector<bool> is_sortable_field(fields_count);
  std::vector<FieldReference> basic_fields;
  basic_fields.reserve(fields_count);
  for (size_t i = 0; i < fields_count; ++i) {
    bool is_sortable = IsSortableField(fields[i], schema);
    is_sortable_field[i] = is_sortable;
    if (!is_sortable) {
      basic_fields.emplace_back(fields[i]);
    }
  }
  return {std::move(basic_fields), std::move(is_sortable_field)};
}

auto GetIndexedHnswFields(const search::Schema& schema) {
  return schema.fields |
         std::views::filter([](const auto& item) { return item.second.IsIndexableHnswField(); });
}
}  // namespace

bool FieldReference::IsJsonPath(std::string_view name) {
  if (name.size() < 2) {
    return false;
  }
  return name.front() == '$' && (name[1] == '.' || name[1] == '[');
}

bool SearchParams::ShouldReturnField(std::string_view alias) const {
  auto cb = [alias](const auto& entry) { return entry.OutputName() == alias; };
  return !return_fields || any_of(return_fields->begin(), return_fields->end(), cb);
}

string_view SearchFieldTypeToString(search::SchemaField::FieldType type) {
  switch (type) {
    case search::SchemaField::TAG:
      return "TAG";
    case search::SchemaField::TEXT:
      return "TEXT";
    case search::SchemaField::NUMERIC:
      return "NUMERIC";
    case search::SchemaField::VECTOR:
      return "VECTOR";
    case search::SchemaField::GEO:
      return "GEO";
  }
  ABSL_UNREACHABLE();
  return "";
}

string DocIndexInfo::BuildRestoreCommand() const {
  std::string out;

  // ON HASH/JSON
  absl::StrAppend(&out, "ON", " ", base_index.type == DocIndex::HASH ? "HASH" : "JSON");

  // optional PREFIX count *prefix1* *prefix2* ...
  if (!base_index.prefixes.empty()) {
    absl::StrAppend(&out, " PREFIX", " ", base_index.prefixes.size());
    for (const auto& prefix : base_index.prefixes) {
      absl::StrAppend(&out, " ", prefix);
    }
  }

  // STOPWORDS
  absl::StrAppend(&out, " STOPWORDS ", base_index.options.stopwords.size());
  for (const auto& sw : base_index.options.stopwords)
    absl::StrAppend(&out, " ", sw);

  absl::StrAppend(&out, " SCHEMA");
  for (const auto& [fident, finfo] : base_index.schema.fields) {
    // Store field name, alias and type
    absl::StrAppend(&out, " ", fident, " AS ", finfo.short_name, " ",
                    SearchFieldTypeToString(finfo.type));

    // Store specific params
    Overloaded info{
        [](monostate) {},
        [out = &out](const search::SchemaField::VectorParams& params) {
          auto sim = params.sim == search::VectorSimilarity::L2   ? "L2"
                     : params.sim == search::VectorSimilarity::IP ? "IP"
                                                                  : "COSINE";
          absl::StrAppend(out, " ", params.use_hnsw ? "HNSW" : "FLAT", " 6 ", "DIM ", params.dim,
                          " DISTANCE_METRIC ", sim, " INITIAL_CAP ", params.capacity);
        },
        [out = &out](const search::SchemaField::TagParams& params) {
          absl::StrAppend(out, " ", "SEPARATOR", " ", string{params.separator});
          if (params.case_sensitive)
            absl::StrAppend(out, " ", "CASESENSITIVE");
        },
        [out = &out](const search::SchemaField::TextParams& params) {
          if (params.with_suffixtrie)
            absl::StrAppend(out, " ", "WITH_SUFFIXTRIE");
        },
        [out = &out](const search::SchemaField::NumericParams& params) {
          absl::StrAppend(out, " ", "BLOCKSIZE", " ", std::to_string(params.block_size));
        }};
    visit(info, finfo.special_params);

    // Store shared field flags
    if (finfo.flags & search::SchemaField::SORTABLE)
      absl::StrAppend(&out, " SORTABLE");

    if (finfo.flags & search::SchemaField::NOINDEX)
      absl::StrAppend(&out, " NOINDEX");
  }

  return out;
}

ShardDocIndex::DocId ShardDocIndex::DocKeyIndex::Add(string_view key) {
  DCHECK_EQ(ids_.count(key), 0u);

  DocId id;
  if (!free_ids_.empty()) {
    id = free_ids_.back();
    free_ids_.pop_back();
    keys_[id] = key;
  } else {
    id = last_id_++;
    DCHECK_EQ(keys_.size(), id);
    keys_.emplace_back(key);
  }

  ids_[key] = id;
  return id;
}

ShardDocIndex::DocId ShardDocIndex::DocKeyIndex::AddNew(string_view key) {
  DCHECK_EQ(ids_.count(key), 0u);

  DocId id = last_id_++;
  if (id < keys_.size()) {
    keys_[id] = key;
  } else {
    DCHECK_EQ(keys_.size(), id);
    keys_.emplace_back(key);
  }

  ids_[key] = id;
  return id;
}
std::optional<ShardDocIndex::DocId> ShardDocIndex::DocKeyIndex::Find(string_view key) const {
  auto it = ids_.find(key);
  return it != ids_.end() ? std::make_optional(it->second) : std::nullopt;
}

void ShardDocIndex::DocKeyIndex::Remove(DocId id) {
  ids_.extract(keys_[id]);
  keys_[id] = "";
  free_ids_.push_back(id);
}

string_view ShardDocIndex::DocKeyIndex::Get(DocId id) const {
  DCHECK_LT(id, keys_.size());
  // Check that this id was not removed
  DCHECK(id < last_id_ && std::find(free_ids_.begin(), free_ids_.end(), id) == free_ids_.end());

  return keys_[id];
}

bool ShardDocIndex::DocKeyIndex::IsValid(DocId id) const {
  if (id >= last_id_ || id >= keys_.size())
    return false;
  // Check if the key at this slot is still tracked in the reverse map with the same id.
  // This correctly handles empty keys: freed slots have their key extracted from ids_,
  // while valid empty-key docs still have ids_[""] == id.
  auto it = ids_.find(keys_[id]);
  return it != ids_.end() && it->second == id;
}

size_t ShardDocIndex::DocKeyIndex::Size() const {
  return ids_.size();
}

std::vector<std::pair<std::string, search::DocId>> ShardDocIndex::DocKeyIndex::Serialize() const {
  std::vector<std::pair<std::string, search::DocId>> result;
  result.reserve(ids_.size());
  for (search::DocId id = 0; id < keys_.size(); ++id) {
    if (!keys_[id].empty()) {
      result.emplace_back(keys_[id], id);
    }
  }
  return result;
}

void ShardDocIndex::DocKeyIndex::Restore(
    const std::vector<std::pair<std::string, search::DocId>>& mappings) {
  DCHECK(ids_.empty()) << "Restore should only be called on an empty DocKeyIndex";
  // Find max doc_id to size the keys_ vector appropriately
  DocId max_id = 0;
  for (const auto& [key, doc_id] : mappings) {
    max_id = std::max(max_id, doc_id);
  }

  // Resize keys_ to accommodate all doc_ids
  keys_.resize(max_id + 1);
  last_id_ = max_id + 1;

  // Restore the mappings
  for (const auto& [key, doc_id] : mappings) {
    keys_[doc_id] = key;
    ids_[key] = doc_id;
  }

  // Build free_ids_ list for any gaps in the id sequence
  for (DocId id = 0; id <= max_id; ++id) {
    if (keys_[id].empty()) {
      free_ids_.push_back(id);
    }
  }
}

void ShardDocIndex::DocKeyIndex::Restore(const std::vector<std::string>& keys) {
  DCHECK(ids_.empty()) << "Restore should only be called on an empty DocKeyIndex";
  keys_.resize(keys.size());
  for (DocId id = 0; id < static_cast<DocId>(keys.size()); ++id) {
    keys_[id] = keys[id];
    ids_[keys[id]] = id;
  }
  last_id_ = static_cast<DocId>(keys.size());
}

uint8_t DocIndex::GetObjCode() const {
  return type == JSON ? OBJ_JSON : OBJ_HASH;
}

bool DocIndex::Matches(string_view key, unsigned obj_code) const {
  if (obj_code != GetObjCode())
    return false;

  // Empty prefixes means match all keys
  if (prefixes.empty())
    return true;

  for (const auto& prefix : prefixes) {
    if (key.rfind(prefix, 0) == 0)
      return true;
  }
  return false;
}

ShardDocIndex::ShardDocIndex(shared_ptr<const DocIndex> index)
    : base_{std::move(index)}, key_index_{} {
}

ShardDocIndex::~ShardDocIndex() {
  CancelBuilder();
}

void ShardDocIndex::Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr, bool is_restored) {
  CancelBuilder();

  // When restoring, preserve key_index_ populated by RestoreKeyIndex() so that DocIds
  // match the GlobalDocIds stored in the serialized HNSW graph. CursorLoop will use
  // the existing DocIds to add documents to the regular indices.
  if (!is_restored) {
    key_index_ = DocKeyIndex{};
    // Full rebuild handles all documents — discard any buffered state from LOADING.
    is_restoring_vectors_ = false;
    pending_vector_updates_.clear();
  } else {
    // Restored path: VectorLoop will call RestoreGlobalVectorIndices which drains
    // the buffers. Until then, buffer any journal-driven mutations.
    is_restoring_vectors_ = true;
  }

  indices_.emplace(base_->schema, base_->options, mr, &synonyms_);

  // Create builder and start indexing
  builder_ = std::make_unique<search::IndexBuilder>(this);
  builder_->Start(op_args, is_restored, [this] {
    VLOG(1) << "Indexed " << key_index_.Size()
            << " docs on prefixes: " << absl::StrJoin(base_->prefixes, ", ");
    builder_.reset();
  });
}

void ShardDocIndex::CancelBuilder() {
  if (builder_) {
    builder_->Cancel();
    builder_.reset();
  }
}

void ShardDocIndex::RebuildForGroup(const OpArgs& op_args, const std::string_view& group_id,
                                    const std::vector<std::string_view>& terms) {
  if (!indices_)
    return;

  absl::flat_hash_set<DocId> docs_to_rebuild;
  std::vector<search::TextIndex*> text_indices = indices_->GetAllTextIndices();

  // Find all documents containing any term from the synonyms group
  for (auto* text_index : text_indices) {
    for (const auto& term : terms) {
      if (const auto* container = text_index->Matching(term)) {
        for (DocId doc_id : *container) {
          docs_to_rebuild.insert(doc_id);
        }
      }
    }
  }

  auto& db_slice = op_args.GetDbSlice();
  DCHECK(db_slice.IsDbValid(op_args.db_cntx.db_index));

  auto update_indices = [&](bool remove) {
    for (DocId doc_id : docs_to_rebuild) {
      std::string_view key = key_index_.Get(doc_id);
      auto it = db_slice.FindReadOnly(op_args.db_cntx, key, base_->GetObjCode());

      if (!it || !IsValid(*it)) {
        continue;
      }

      auto accessor = GetAccessor(op_args.db_cntx, (*it)->second);
      if (remove) {
        indices_->Remove(doc_id, *accessor);
      } else {
        // Add in this case always succeeds, because we are adding the same document again
        [[maybe_unused]] bool res = indices_->Add(doc_id, *accessor);
        DCHECK(res);
      }
    }
  };

  update_indices(true);
  synonyms_.UpdateGroup(group_id, terms);
  update_indices(false);
}

std::optional<ShardDocIndex::DocId> ShardDocIndex::GetDocId(std::string_view key,
                                                            const DbContext& db_cntx) {
  if (!indices_)
    return std::nullopt;

  // Only handle documents from database 0
  if (db_cntx.db_index != 0)
    return std::nullopt;

  return key_index_.Find(key);
}

std::optional<ShardDocIndex::DocId> ShardDocIndex::AddDoc(string_view key, const DbContext& db_cntx,
                                                          const PrimeValue& pv) {
  if (!indices_)
    return std::nullopt;

  // Only index documents from database 0
  if (db_cntx.db_index != 0)
    return std::nullopt;

  // Don't add document again if it exists. TODO: Try add?
  if (key_index_.Find(key))
    return std::nullopt;

  auto accessor = GetAccessor(db_cntx, pv);
  DocId id = key_index_.Add(key);
  if (!indices_->Add(id, *accessor)) {
    key_index_.Remove(id);
    return std::nullopt;
  }

  return id;
}

void ShardDocIndex::RemoveDoc(DocId id, const DbContext& db_cntx, const PrimeValue& pv) {
  auto accessor = GetAccessor(db_cntx, pv);
  key_index_.Remove(id);
  indices_->Remove(id, *accessor);
}

void ShardDocIndex::AddDocToGlobalVectorIndex(ShardDocIndex::DocId doc_id, const DbContext& db_cntx,
                                              PrimeValue* pv) {
  if (is_restoring_vectors_) {
    // Buffer the key — will be re-applied after RestoreGlobalVectorIndices completes.
    std::string_view key = key_index_.Get(doc_id);
    pending_vector_updates_.emplace(key);
    return;
  }

  auto accessor = GetAccessor(db_cntx, *pv);
  GlobalDocId global_id = search::CreateGlobalDocId(EngineShard::tlocal()->shard_id(), doc_id);

  for (const auto& [field_ident, field_info] : GetIndexedHnswFields(base_->schema)) {
    if (auto index = GlobalHnswIndexRegistry::Instance().Get(base_->name, field_info.short_name);
        index) {
      bool added = index->Add(global_id, *accessor, field_ident);
      if (added && !index->IsVectorCopied()) {
        pv->SetOmitDefrag(true);
      }
    }
  }
}

void ShardDocIndex::RemoveDocFromGlobalVectorIndex(ShardDocIndex::DocId doc_id,
                                                   const DbContext& db_cntx, const PrimeValue& pv) {
  if (is_restoring_vectors_) {
    // Buffer the key — will be re-applied after RestoreGlobalVectorIndices completes.
    std::string_view key = key_index_.Get(doc_id);
    pending_vector_updates_.emplace(key);
    return;
  }

  auto accessor = GetAccessor(db_cntx, pv);
  GlobalDocId global_id = search::CreateGlobalDocId(EngineShard::tlocal()->shard_id(), doc_id);

  for (const auto& [field_ident, field_info] : GetIndexedHnswFields(base_->schema)) {
    if (auto index = GlobalHnswIndexRegistry::Instance().Get(base_->name, field_info.short_name);
        index) {
      index->Remove(global_id, *accessor, field_ident);
    }
  }
}

void ShardDocIndex::RemoveFromAllHnswIndices(search::DocId doc_id) {
  GlobalDocId global_id = search::CreateGlobalDocId(EngineShard::tlocal()->shard_id(), doc_id);
  for (const auto& [field_ident, field_info] : GetIndexedHnswFields(base_->schema)) {
    if (auto index = GlobalHnswIndexRegistry::Instance().Get(base_->name, field_info.short_name);
        index) {
      index->Remove(global_id);
    }
  }
}

void ShardDocIndex::RestoreGlobalVectorIndices(std::string_view index_name, const OpArgs& op_args) {
  // Don't run loop if no vector fields are present
  if (std::ranges::empty(GetIndexedHnswFields(base_->schema)))
    return;

  LOG(INFO) << "Restoring vector index '" << index_name << "' from serialized graph on shard "
            << EngineShard::tlocal()->shard_id();

  auto& db_slice = op_args.GetDbSlice();
  DCHECK(db_slice.IsDbValid(op_args.db_cntx.db_index));

  size_t processed = 0;
  size_t successful_updates = 0;
  size_t failed_updates = 0;
  size_t missing_documents = 0;

  // Collect missing document IDs to remove after the loop (can't modify key_index_ during
  // iteration over the snapshot). Store the key too so we can re-validate: concurrent fibers
  // may free and reuse the DocId during Yield(), making the original local_id stale.
  struct MissingDoc {
    std::string key;
    DocId local_id;
    GlobalDocId global_id;
  };
  std::vector<MissingDoc> missing_doc_ids;

  // Snapshot the map: Yield() inside the loop lets other fibers run (e.g. FullSyncDflyFb
  // finishing its RDB load), which may mutate key_index_ via doc_del_cb_ and invalidate
  // flat_hash_map iterators.
  auto doc_keys_snapshot = key_index_.GetDocKeysMap();

  for (const auto& [key, local_id] : doc_keys_snapshot) {
    auto it = db_slice.FindMutable(op_args.db_cntx, key, base_->GetObjCode());
    if (!it || !IsValid(it->it)) {
      ++missing_documents;
      GlobalDocId global_id =
          search::CreateGlobalDocId(EngineShard::tlocal()->shard_id(), local_id);
      missing_doc_ids.push_back({std::string(key), local_id, global_id});
      continue;
    }

    PrimeValue& pv = it->it->second;
    auto doc = GetAccessor(op_args.db_cntx, pv);
    GlobalDocId global_id = search::CreateGlobalDocId(EngineShard::tlocal()->shard_id(), local_id);

    for (const auto& [field_ident, field_info] : GetIndexedHnswFields(base_->schema)) {
      if (auto index = GlobalHnswIndexRegistry::Instance().Get(index_name, field_info.short_name);
          index) {
        bool success = index->UpdateVectorData(global_id, *doc, field_ident);
        if (success) {
          ++successful_updates;
          if (!index->IsVectorCopied()) {
            pv.SetOmitDefrag(true);
          }
        } else {
          // Node not in restored HNSW graph (new doc added during full sync via journal
          // events before index was created). Fall back to Add.
          bool added = index->Add(global_id, *doc, field_ident);
          if (added) {
            ++successful_updates;
            if (!index->IsVectorCopied()) {
              pv.SetOmitDefrag(true);
            }
          } else {
            ++failed_updates;
          }
        }
      }
    }

    // Yield periodically to avoid blocking the fiber
    if (++processed % 1000 == 0) {
      util::ThisFiber::Yield();
    }
  }

  // Remove HNSW nodes for documents that no longer exist in DB (deleted before or during
  // restoration). Without this, stale nodes remain in the graph with no vector data, causing
  // inconsistent KNN search results compared to the master.
  // Re-validate each entry: concurrent fibers may have freed and reused the DocId.
  for (const auto& [key, local_id, global_id] : missing_doc_ids) {
    for (const auto& [field_ident, field_info] : GetIndexedHnswFields(base_->schema)) {
      if (auto index = GlobalHnswIndexRegistry::Instance().Get(index_name, field_info.short_name);
          index) {
        index->Remove(global_id);
      }
    }
    // Only remove from key_index_ if the mapping still matches the snapshot.
    if (key_index_.Find(key) == local_id) {
      key_index_.Remove(local_id);
    }
  }

  // Log summary of vector restoration
  size_t total_docs = doc_keys_snapshot.size();
  if (failed_updates > 0 || missing_documents > 0) {
    LOG(WARNING) << "Restored vectors for index " << index_name << ": " << successful_updates
                 << " successful, " << failed_updates << " failed (missing vector field), "
                 << missing_documents << " missing documents out of " << total_docs << " total";
  } else {
    VLOG(1) << "Restored vectors for index " << index_name << ": " << successful_updates << "/"
            << total_docs << " documents";
  }

  // Drain pending vector updates that arrived via journal during the LOADING window.
  // Clear the flag BEFORE draining so that AddDoc/AddDocToGlobalVectorIndex work normally.
  is_restoring_vectors_ = false;

  if (!pending_vector_updates_.empty()) {
    LOG(INFO) << "Draining " << pending_vector_updates_.size()
              << " pending vector updates for index '" << index_name << "' on shard "
              << EngineShard::tlocal()->shard_id();

    for (const auto& key : pending_vector_updates_) {
      auto local_id = key_index_.Find(key);
      auto it = db_slice.FindMutable(op_args.db_cntx, key, base_->GetObjCode());

      if (it && IsValid(it->it)) {
        // Key exists in DB — ensure it's properly indexed with current data.
        PrimeValue& pv = it->it->second;

        if (local_id) {
          // Already in key_index_ (from snapshot). Remove old HNSW node and re-add
          // with current vector data to match master state.
          RemoveFromAllHnswIndices(*local_id);
          AddDocToGlobalVectorIndex(*local_id, op_args.db_cntx, &pv);
        } else {
          // New document not in key_index_ (added during full sync).
          auto doc_id = AddDoc(key, op_args.db_cntx, pv);
          if (doc_id) {
            AddDocToGlobalVectorIndex(*doc_id, op_args.db_cntx, &pv);
          }
        }
      } else if (local_id) {
        // Key absent from DB — remove stale HNSW node and key_index_ entry.
        RemoveFromAllHnswIndices(*local_id);
        key_index_.Remove(*local_id);
      }
    }
    pending_vector_updates_.clear();
  }
}

ShardDocIndex::SerializedEntryWithKey ShardDocIndex::SerializeDocWithKey(
    search::DocId id, const OpArgs& op_args, const search::Schema& schema,
    const std::optional<std::vector<FieldReference>>& return_fields) {
  auto entry = LoadEntry(id, op_args);
  if (entry) {
    if (return_fields) {
      return std::optional<std::pair<std::string_view, SearchDocData>>{
          std::make_pair(entry->first, entry->second->Serialize(schema, *return_fields))};
    } else {
      return std::optional<std::pair<std::string_view, SearchDocData>>{
          std::make_pair(entry->first, entry->second->Serialize(schema))};
    }
  }
  return std::nullopt;
}

bool ShardDocIndex::Matches(string_view key, unsigned obj_code) const {
  return base_->Matches(key, obj_code);
}

optional<ShardDocIndex::LoadedEntry> ShardDocIndex::LoadEntry(DocId id,
                                                              const OpArgs& op_args) const {
  auto& db_slice = op_args.GetDbSlice();
  string_view key = key_index_.Get(id);
  auto it = db_slice.FindReadOnly(op_args.db_cntx, key, base_->GetObjCode());
  if (!it || !IsValid(*it))
    return std::nullopt;

  return {{key, GetAccessor(op_args.db_cntx, (*it)->second)}};
}

vector<search::SortableValue> ShardDocIndex::KeepTopKSorted(vector<DocId>* ids, size_t limit,
                                                            const SearchParams::SortOption& sort,
                                                            const OpArgs& op_args) const {
  DCHECK_GT(limit, 0u) << "Limit=0 still has O(ids->size()) complexity";

  auto comp = [order = sort.order](const auto& lhs, const auto& rhs) {
    return order == SortOrder::ASC ? lhs < rhs : lhs > rhs;
  };
  // Priority queue keeps top-k values in reverse order (to compare against top - worst value)
  using QPair = std::pair<search::SortableValue, DocId>;
  std::priority_queue<QPair, std::vector<QPair>, decltype(comp)> q(comp);

  // Iterate over all documents, extract sortable field and update the queue
  for (DocId id : *ids) {
    auto entry = LoadEntry(id, op_args);
    if (!entry)
      continue;

    auto result = entry->second->Serialize(base_->schema, {sort.field});
    if (result.empty())
      continue;

    // Check if the extracted value is better than the worst (q.top())
    if (q.size() < limit || comp(result.begin()->second, q.top().first)) {
      if (q.size() >= limit)
        q.pop();
      q.emplace(std::move(result.begin()->second), id);
    }
  }

  // Reorder ids and collect scores
  vector<search::SortableValue> out(q.size());
  for (int i = 0; !q.empty(); i++) {
    auto [v, id] = q.top();
    (*ids)[i] = id;
    out[i] = std::move(v);
    q.pop();
  }
  return out;
}

SearchResult ShardDocIndex::Search(const OpArgs& op_args, const SearchParams& params,
                                   search::SearchAlgorithm* search_algo,
                                   bool is_knn_prefilter) const {
  size_t limit = params.limit_offset + params.limit_total;

  // If we don't sort the documents, we don't need to copy more ids than are requested
  // Also for HNSW KNN search we don't cut results at the search stage.
  bool can_cut = !params.sort_option && !search_algo->GetKnnScoreSortOption() && !is_knn_prefilter;
  size_t id_cutoff_limit = can_cut ? limit : numeric_limits<size_t>::max();

  auto result = search_algo->Search(&*indices_, id_cutoff_limit);
  if (!result.error.empty())
    return {facade::ErrorReply(std::move(result.error))};

  if (limit == 0)
    return {result.total, {}, std::move(result.profile)};

  // Tune sort for KNN: Skip if it's on the knn field, otherwise extend the limit if needed
  bool skip_sort = false;
  if (auto ko = search_algo->GetKnnScoreSortOption(); ko) {
    skip_sort = !params.sort_option || params.sort_option->IsSame(*ko);
    if (!skip_sort)
      limit = max(limit, ko->limit);
  }

  // We don't apply limit if this is prefilter HNSW KNN search
  if (is_knn_prefilter) {
    limit = std::numeric_limits<size_t>::max();
  }

  auto return_fields = params.return_fields.value_or(vector<FieldReference>{});

  // Apply SORTBY
  // TODO(vlad): Write profiling up to here
  vector<search::SortableValue> sort_scores;
  if (params.sort_option && !skip_sort) {
    const auto& so = *params.sort_option;
    auto fident = so.field.Identifier(base_->schema, false);
    if (IsSortableField(fident, base_->schema)) {
      auto* idx = indices_->GetSortIndex(fident);
      sort_scores = idx->Sort(&result.ids, limit, so.order == SortOrder::DESC);
    } else {
      sort_scores = KeepTopKSorted(&result.ids, limit, so, op_args);
      // KeepTopKSorted only fills the first sort_scores.size() entries of result.ids;
      // trim the rest to avoid out-of-bounds access on sort_scores in the loop below.
      if (!sort_scores.empty())
        result.ids.resize(sort_scores.size());
      if (params.ShouldReturnAllFields())
        return_fields.push_back(so.field);
    }

    // If we sorted with knn_scores present, rearrange them
    if (!sort_scores.empty() && !result.knn_scores.empty()) {
      unordered_map<DocId, size_t> score_lookup(result.knn_scores.begin(), result.knn_scores.end());
      for (size_t i = 0; i < min(limit, result.ids.size()); i++)
        result.knn_scores[i] = {result.ids[i], score_lookup[result.ids[i]]};
    }
  }

  // Cut off unnecessary items
  result.ids.resize(min(result.ids.size(), limit));

  // Serialize documents
  vector<SerializedSearchDoc> out;
  out.reserve(min(limit, result.ids.size()));

  size_t expired_count = 0;
  for (size_t i = 0; i < result.ids.size(); i++) {
    float knn_score = result.knn_scores.empty() ? 0 : result.knn_scores[i].second;
    auto sort_score = sort_scores.empty() ? std::monostate{} : std::move(sort_scores[i]);

    // Don't load entry if we need only its key. Ignore expiration.
    if (params.IdsOnly()) {
      string_view key = key_index_.Get(result.ids[i]);
      out.push_back({result.ids[i], string{key}, {}, knn_score, sort_score});
      continue;
    }

    auto entry = LoadEntry(result.ids[i], op_args);
    if (!entry) {
      expired_count++;
      continue;
    }

    auto& [key, accessor] = *entry;

    // Load all specified fields from document
    SearchDocData fields{};
    if (params.ShouldReturnAllFields())
      fields = accessor->Serialize(base_->schema);

    auto more_fields = accessor->Serialize(base_->schema, return_fields);
    fields.insert(make_move_iterator(more_fields.begin()), make_move_iterator(more_fields.end()));
    out.push_back({result.ids[i], string{key}, std::move(fields), knn_score, sort_score});
  }

  return {result.total - expired_count, std::move(out), std::move(result.profile)};
}

vector<SearchDocData> ShardDocIndex::SearchForAggregator(
    const OpArgs& op_args, const AggregateParams& params,
    search::SearchAlgorithm* search_algo) const {
  auto search_results = search_algo->Search(&*indices_);

  if (!search_results.error.empty())
    return {};

  auto [fields_to_load, sort_indicies] =
      PreprocessAggregateFields(base_->schema, params, params.load_fields);

  vector<absl::flat_hash_map<string, search::SortableValue>> out;
  for (DocId doc : search_results.ids) {
    auto entry = LoadEntry(doc, op_args);
    if (!entry)
      continue;
    auto& [_, accessor] = *entry;

    SearchDocData extracted_sort_indicies;
    extracted_sort_indicies.reserve(sort_indicies.size());
    for (const auto& [fident, fname] : sort_indicies) {
      extracted_sort_indicies[fname] = indices_->GetSortIndexValue(doc, fident);
    }

    SearchDocData loaded = accessor->Serialize(base_->schema, fields_to_load);

    out.emplace_back(make_move_iterator(extracted_sort_indicies.begin()),
                     make_move_iterator(extracted_sort_indicies.end()));
    out.back().insert(make_move_iterator(loaded.begin()), make_move_iterator(loaded.end()));
  }

  return out;
}

join::Vector<join::OwnedEntry> ShardDocIndex::PreagregateDataForJoin(
    const OpArgs& op_args, absl::Span<const std::string_view> join_fields,
    search::SearchAlgorithm* search_algo) const {
  auto search_results = search_algo->Search(&*indices_);

  const size_t fields_count = join_fields.size();
  const auto [basic_fields, is_sortable_field] = GetBasicFields(join_fields, base_->schema);

  join::Vector<join::OwnedEntry> result;
  result.reserve(search_results.ids.size());

  const ShardId shard_id = op_args.shard->shard_id();
  for (DocId doc : search_results.ids) {
    auto entry = LoadEntry(doc, op_args);
    if (!entry)
      continue;

    auto& [key, accessor] = *entry;

    SearchDocData loaded_basic_fields = accessor->Serialize(base_->schema, basic_fields);

    bool insert_key = true;
    join::Vector<join::OwnedJoinableValue> join_fields_values(fields_count);
    for (size_t i = 0; i < fields_count; ++i) {
      search::SortableValue value;
      if (is_sortable_field[i]) {
        value = indices_->GetSortIndexValue(doc, join_fields[i]);
      } else {
        value = loaded_basic_fields[join_fields[i]];
      }

      auto copy = [&](auto&& v) {
        using T = std::decay_t<decltype(v)>;
        if constexpr (!std::is_same_v<T, std::monostate>) {
          join_fields_values[i] = v;
        } else {
          // If the value is nil, we skip this key
          insert_key = false;
        }
      };

      std::visit(std::move(copy), value);
    }

    if (insert_key) {
      result.emplace_back(std::piecewise_construct, std::forward_as_tuple(shard_id, doc),
                          std::forward_as_tuple(std::make_move_iterator(join_fields_values.begin()),
                                                std::make_move_iterator(join_fields_values.end())));
    }
  }

  return result;
}

ShardDocIndex::FieldsValuesPerDocId ShardDocIndex::LoadKeysData(
    const OpArgs& op_args, const absl::flat_hash_set<search::DocId>& doc_ids,
    absl::Span<const std::string_view> fields_to_load) const {
  const size_t fields_count = fields_to_load.size();
  const auto [basic_fields, is_sortable_field] = GetBasicFields(fields_to_load, base_->schema);

  FieldsValuesPerDocId result;
  result.reserve(doc_ids.size());

  for (DocId doc : doc_ids) {
    auto entry = LoadEntry(doc, op_args);
    if (!entry)
      continue;

    auto& [key, accessor] = *entry;

    SearchDocData loaded_basic_fields = accessor->Serialize(base_->schema, basic_fields);

    FieldsValues fields_values(fields_count);
    for (size_t i = 0; i < fields_count; ++i) {
      if (is_sortable_field[i]) {
        fields_values[i] = indices_->GetSortIndexValue(doc, fields_to_load[i]);
      } else {
        fields_values[i] = loaded_basic_fields[fields_to_load[i]];
      }
    }

    result.emplace(std::piecewise_construct, std::forward_as_tuple(doc),
                   std::forward_as_tuple(std::make_move_iterator(fields_values.begin()),
                                         std::make_move_iterator(fields_values.end())));
  }

  return result;
}

DocIndexInfo ShardDocIndex::GetInfo() const {
  return {.base_index = *base_,
          .num_docs = key_index_.Size(),
          .indexing = bool(builder_),
          .percent_indexed = bool(builder_) ? 0.5f : 1.0f,  // no estimation for now
          .hnsw_metadata = nullopt};
}

io::Result<StringVec, ErrorReply> ShardDocIndex::GetTagVals(string_view field) const {
  search::BaseIndex* base_index = indices_->GetIndex(field);
  if (base_index == nullptr) {
    return make_unexpected(ErrorReply{"-No such field"});
  }

  search::TagIndex* tag_index = dynamic_cast<search::TagIndex*>(base_index);
  if (tag_index == nullptr) {
    return make_unexpected(ErrorReply{"-Not a tag field"});
  }

  return tag_index->GetTerms();
}

ShardDocIndices::ShardDocIndices() : local_mr_{ServerState::tlocal()->data_heap()} {
  InitTLSearchMR(&local_mr_);
}

ShardDocIndex* ShardDocIndices::GetIndex(string_view name) {
  auto it = indices_.find(name);
  return it != indices_.end() ? it->second.get() : nullptr;
}

void ShardDocIndices::InitIndex(const OpArgs& op_args, std::string_view name,
                                shared_ptr<const DocIndex> index_ptr) {
  auto shard_index = make_unique<ShardDocIndex>(std::move(index_ptr));
  auto [it, _] = indices_.emplace(name, std::move(shard_index));

  // Don't build while loading, shutting down, etc.
  // After loading, indices are rebuilt separately
  if (ServerState::tlocal()->gstate() == GlobalState::ACTIVE)
    it->second->Rebuild(op_args, &local_mr_);

  op_args.GetDbSlice().SetDocDeletionCallback(
      [this](string_view key, const DbContext& cntx, const PrimeValue& pv) {
        RemoveDoc(key, cntx, pv);
      });
}

unique_ptr<ShardDocIndex> ShardDocIndices::DropIndex(string_view name) {
  auto it = indices_.find(name);
  if (it == indices_.end())
    return nullptr;

  DropIndexCache(*it->second);
  auto index = std::move(it->second);
  indices_.erase(it);

  return index;
}

void ShardDocIndices::DropAllIndices() {
  for (const auto& [_, idx] : indices_)
    DropIndexCache(*idx);
  indices_.clear();
  GlobalHnswIndexRegistry::Instance().Reset();
}

void ShardDocIndices::DropIndexCache(const dfly::ShardDocIndex& shard_doc_index) {
  auto info = shard_doc_index.GetInfo();
  for (const auto& [fident, field] : info.base_index.schema.fields)
    JsonAccessor::RemoveFieldFromCache(fident);
}

void ShardDocIndices::RebuildAllIndices(const OpArgs& op_args, bool is_restored) {
  for (auto& [index_name, ptr] : indices_) {
    // Only use the restore path for indices that have populated key mappings.
    // When shard counts differ, PerformPostLoad remaps the mappings; if remapping fails,
    // the mappings are removed so the index falls back to full rebuild here.
    bool index_restored = is_restored && ptr->key_index_.Size() > 0;
    ptr->Rebuild(op_args, &local_mr_, index_restored);
  }
}

void ShardDocIndices::BlockUntilConstructionEnd() {
  bool indexing = false;
  do {
    indexing = false;
    for (const auto& [_, ptr] : indices_)
      indexing |= ptr->GetInfo().indexing;

    if (indexing)
      util::ThisFiber::SleepFor(5ms);
  } while (indexing);
}

vector<string> ShardDocIndices::GetIndexNames() const {
  vector<string> names{};
  names.reserve(indices_.size());
  for (const auto& [name, ptr] : indices_)
    names.push_back(name);
  return names;
}

void ShardDocIndices::AddDoc(string_view key, const DbContext& db_cntx, PrimeValue* pv) {
  DCHECK(IsIndexedKeyType(*pv));
  for (auto& [index_name, index] : indices_) {
    if (index->Matches(key, pv->ObjType())) {
      std::optional<search::DocId> doc_id = index->AddDoc(key, db_cntx, *pv);
      if (doc_id) {
        index->AddDocToGlobalVectorIndex(*doc_id, db_cntx, pv);
      }
    }
  }
}

void ShardDocIndices::RemoveDoc(string_view key, const DbContext& db_cntx, const PrimeValue& pv) {
  DCHECK(IsIndexedKeyType(pv));
  for (auto& [index_name, index] : indices_) {
    if (index->Matches(key, pv.ObjType())) {
      std::optional<search::DocId> doc_id = index->GetDocId(key, db_cntx);
      if (doc_id) {
        index->RemoveDocFromGlobalVectorIndex(*doc_id, db_cntx, pv);
        index->RemoveDoc(*doc_id, db_cntx, pv);
      }
    }
  }
}

size_t ShardDocIndices::GetUsedMemory() const {
  return local_mr_.used();
}

SearchStats ShardDocIndices::GetStats() const {
  size_t total_entries = 0;
  for (const auto& [_, index] : indices_)
    total_entries += index->GetInfo().num_docs;

  return {GetUsedMemory(), indices_.size(), total_entries};
}

search::DefragmentResult ShardDocIndices::Defragment(PageUsage* page_usage) {
  // In case of resumed defragmentation, iteration order may change in case there were insertions
  // after the last defragment operation completed, so there is no guarantee that an entry will only
  // be defragmented once per cycle. This will only happen in case of a new index being added
  // though, so it is an acceptable anomaly.
  search::DefragmentMap dm{indices_, &next_defrag_index_};
  return dm.Defragment(page_usage);
}

}  // namespace dfly


================================================
FILE: src/server/search/doc_index.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "base/pmr/memory_resource.h"
#include "core/mi_memory_resource.h"
#include "core/search/base.h"
#include "core/search/hnsw_index.h"
#include "core/search/search.h"
#include "core/search/synonyms.h"
#include "server/search/aggregator.h"
#include "server/search/index_join.h"
#include "server/stats.h"
#include "server/table.h"

namespace dfly {

using StringVec = std::vector<std::string>;

namespace search {
struct IndexBuilder;
}  // namespace search

struct BaseAccessor;

using SearchDocData = absl::flat_hash_map<std::string /*field*/, search::SortableValue /*value*/>;
using Synonyms = search::Synonyms;

std::string_view SearchFieldTypeToString(search::SchemaField::FieldType);

struct SerializedSearchDoc {
  search::DocId id;
  std::string key;
  SearchDocData values;
  float knn_score;
  search::SortableValue sort_score;
};

struct SearchResult {
  SearchResult() = default;

  SearchResult(size_t total_hits, std::vector<SerializedSearchDoc> docs,
               std::optional<search::AlgorithmProfile> profile)
      : total_hits{total_hits}, docs{std::move(docs)}, profile{std::move(profile)} {
  }

  SearchResult(facade::ErrorReply error) : error{std::move(error)} {
  }

  size_t total_hits;
  std::vector<SerializedSearchDoc> docs;
  std::optional<search::AlgorithmProfile> profile;

  std::optional<facade::ErrorReply> error;
};

// Field reference with optional alias as parsed from RETURN [field AS alias], LOAD, etc...
struct FieldReference {
  explicit FieldReference(std::string_view name, std::string_view alias = "")
      : name_{name}, alias_{alias} {
  }

  std::string_view Identifier(const search::Schema& schema, bool is_json) const {
    return (is_json && IsJsonPath(name_)) ? name_ : schema.LookupAlias(name_);
  }

  std::string_view Name() const {
    return name_;
  }

  std::string_view OutputName() const {
    return alias_.empty() ? name_ : alias_;
  }

 private:
  static bool IsJsonPath(std::string_view name);

  std::string_view name_, alias_;
};

enum class SortOrder { ASC, DESC };

struct SearchParams {
  struct SortOption {
    FieldReference field;
    SortOrder order = SortOrder::ASC;

    bool IsSame(const search::KnnScoreSortOption& knn_sort) const {
      return knn_sort.score_field_alias == field.OutputName();
    }
  };

  // Parameters for "LIMIT offset total": select total amount documents with a specific offset from
  // the whole result set
  size_t limit_offset = 0;
  size_t limit_total = 10;

  bool with_sortkeys = false;

  /*
  1. If not set -> return all fields
  2. If set but empty -> no fields should be returned
  3. If set and not empty -> return only these fields
  */
  std::optional<std::vector<FieldReference>> return_fields;

  /*
    Fields that should be also loaded from the document.

    Only one of load_fields and return_fields should be set.
  */
  std::optional<std::vector<FieldReference>> load_fields;

  std::optional<SortOption> sort_option;

  search::OptionalFilters optional_filters;

  search::QueryParams query_params;

  bool ShouldReturnAllFields() const {
    return !return_fields.has_value();
  }

  bool IdsOnly() const {
    return return_fields && return_fields->empty();
  }

  bool ShouldReturnField(std::string_view alias) const;
};

struct AggregateParams {
  struct JoinParams {
    // Fist field is the index name, second is the field name.
    using Field = std::pair<std::string, std::string>;

    struct Condition {
      Condition(std::string_view field_, std::string_view foreign_index_,
                std::string_view foreign_field_)
          : field{field_}, foreign_field{Field{foreign_index_, foreign_field_}} {
      }

      std::string field;
      Field foreign_field;
    };

    std::string index;
    std::string index_alias;
    std::vector<Condition> conditions;
    std::string query = "*";
  };

  /* Can have 2 scenarios:
      1. No joins - then this is ignored
      2. Has joins and SORTBY ... LIMIT option - then this is used to sort/limit right after join
      3. Has joins and LIMIT option - then this is used to limit right after join.
     Next aggregation steps after first LIMIT or first SORTBY will be applied on the final result,
     after loading the data for all joined documents. */
  struct JoinAggregateParams {
    static constexpr size_t kDefaultLimit = std::numeric_limits<size_t>::max();

    bool HasLimit() const {
      return limit_total != kDefaultLimit;
    }

    bool HasValue() const {
      return HasLimit() || sort.has_value();
    }

    size_t limit_offset = 0;
    size_t limit_total = kDefaultLimit;
    std::optional<aggregate::SortParams> sort;
  };

  std::string_view index, query;
  search::QueryParams params;

  std::vector<JoinParams> joins;
  JoinAggregateParams join_agg_params;

  std::optional<std::vector<FieldReference>> load_fields;
  std::vector<aggregate::AggregationStep> steps;
};

// Stores basic info about a document index.
struct DocIndex {
  enum DataType : uint8_t { HASH, JSON };

  // Get numeric OBJ_ code
  uint8_t GetObjCode() const;

  // Return true if the following document (key, obj_code) is tracked by this index.
  bool Matches(std::string_view key, unsigned obj_code) const;

  std::string name;
  search::Schema schema;
  search::IndicesOptions options;
  std::vector<std::string> prefixes;
  DataType type{HASH};
};

struct DocIndexInfo {
  DocIndex base_index;
  size_t num_docs = 0;

  bool indexing = false;
  float percent_indexed = 1;

  // HNSW metadata for vector index (if present)
  // TODO: move to schema
  std::optional<search::HnswIndexMetadata> hnsw_metadata = std::nullopt;

  // Build original ft.create command that can be used to re-create this index
  std::string BuildRestoreCommand() const;
};

class ShardDocIndices;

// Stores internal search indices for documents of a document index on a specific shard.
class ShardDocIndex {
  friend class ShardDocIndices;
  friend struct search::IndexBuilder;

  using DocId = search::DocId;
  using GlobalDocId = search::GlobalDocId;

  // Used in FieldsValuesPerDocId to store values for each field per document
  using FieldsValues = absl::InlinedVector<search::SortableValue, 4>;

  // DocKeyIndex manages mapping document keys to ids and vice versa through a simple interface.
  struct DocKeyIndex {
    DocId Add(std::string_view key);

    // Like Add but always allocates a fresh DocId, never reusing free_ids_.
    // Used during restored CursorLoop to avoid colliding with HNSW node ids.
    DocId AddNew(std::string_view key);

    void Remove(DocId id);

    std::string_view Get(DocId id) const;
    bool IsValid(DocId id) const;
    std::optional<DocId> Find(std::string_view key) const;
    size_t Size() const;

    // Get const reference to the internal ids map
    const absl::flat_hash_map<std::string, DocId>& GetDocKeysMap() const {
      return ids_;
    }

    // Serialization: returns pairs of (key, doc_id) for all active mappings
    std::vector<std::pair<std::string, DocId>> Serialize() const;

    // Restore key-to-docId mappings from serialized data (RDB load)
    void Restore(const std::vector<std::pair<std::string, search::DocId>>& mappings);

    // Restore from remapped keys in doc_id order (vector index = doc_id).
    void Restore(const std::vector<std::string>& keys);

   private:
    absl::flat_hash_map<std::string, DocId> ids_;
    std::vector<std::string> keys_;
    std::vector<DocId> free_ids_;
    DocId last_id_ = 0;
  };

 public:
  // Index must be rebuilt at least once after intialization
  explicit ShardDocIndex(std::shared_ptr<const DocIndex> index);

  // Possibly blocking to stop indexing job
  ~ShardDocIndex();

  // Perform search on all indexed documents and return results.
  SearchResult Search(const OpArgs& op_args, const SearchParams& params,
                      search::SearchAlgorithm* search_algo, bool is_knn_prefilter) const;

  // Perform search and load requested values - note params might be interpreted differently.
  std::vector<SearchDocData> SearchForAggregator(const OpArgs& op_args,
                                                 const AggregateParams& params,
                                                 search::SearchAlgorithm* search_algo) const;

  // Methods needed for join operation
  join::Vector<join::OwnedEntry> PreagregateDataForJoin(
      const OpArgs& op_args, absl::Span<const std::string_view> join_fields,
      search::SearchAlgorithm* search_algo) const;

  using FieldsValuesPerDocId = absl::flat_hash_map<DocId, FieldsValues>;
  FieldsValuesPerDocId LoadKeysData(const OpArgs& op_args,
                                    const absl::flat_hash_set<search::DocId>& doc_ids,
                                    absl::Span<const std::string_view> fields_to_load) const;

  // Return whether base index matches
  bool Matches(std::string_view key, unsigned obj_code) const;

  std::optional<ShardDocIndex::DocId> GetDocId(std::string_view key, const DbContext& db_cntx);

  std::optional<ShardDocIndex::DocId> AddDoc(std::string_view key, const DbContext& db_cntx,
                                             const PrimeValue& pv);

  void RemoveDoc(DocId id, const DbContext& db_cntx, const PrimeValue& pv);

  DocIndexInfo GetInfo() const;

  io::Result<StringVec, facade::ErrorReply> GetTagVals(std::string_view field) const;

  // Get synonym manager for this shard
  const Synonyms& GetSynonyms() const {
    return synonyms_;
  }

  Synonyms& GetSynonyms() {
    return synonyms_;
  }

  // Rebuild indices only for documents containing terms from the updated synonym group
  void RebuildForGroup(const OpArgs& op_args, const std::string_view& group_id,
                       const std::vector<std::string_view>& terms);

  // Public access to key index for direct operations (e.g., when dropping index with DD)
  // TODO: replace with keys() view
  const DocKeyIndex& key_index() const {
    return key_index_;
  }

  void AddDocToGlobalVectorIndex(ShardDocIndex::DocId doc_id, const DbContext& db_cntx,
                                 PrimeValue* pv);
  void RemoveDocFromGlobalVectorIndex(ShardDocIndex::DocId doc_id, const DbContext& db_cntx,
                                      const PrimeValue& pv);

  // Rebuild global vector indices from restored key index, updating vector data
  // for nodes whose graph structure was already restored from RDB.
  void RestoreGlobalVectorIndices(std::string_view index_name, const OpArgs& op_args);

  // Serialize doc and return with key name
  using SerializedEntryWithKey = std::optional<std::pair<std::string_view, SearchDocData>>;
  SerializedEntryWithKey SerializeDocWithKey(
      search::DocId id, const OpArgs& op_args, const search::Schema& schema,
      const std::optional<std::vector<FieldReference>>& return_fields);

  search::DefragmentResult Defragment(PageUsage* page_usage) {
    if (indices_) {
      return indices_->Defragment(page_usage);
    }
    return search::DefragmentResult{false, 0};
  }

  std::vector<std::pair<std::string, DocId>> SerializeKeyIndex() const {
    return key_index_.Serialize();
  }

  // Restore key-to-docId mappings from serialized data (RDB load)
  void RestoreKeyIndex(const std::vector<std::pair<std::string, search::DocId>>& mappings) {
    key_index_.Restore(mappings);
  }

  // Restore from remapped keys in doc_id order (vector index = doc_id).
  void RestoreKeyIndex(const std::vector<std::string>& keys) {
    key_index_.Restore(keys);
  }

 private:
  // Clears internal data. Traverses all matching documents and assigns ids.
  void Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr, bool is_restored = false);

  // Cancel builder if in progress
  void CancelBuilder();

  using LoadedEntry = std::pair<std::string_view, std::unique_ptr<BaseAccessor>>;
  std::optional<LoadedEntry> LoadEntry(search::DocId id, const OpArgs& op_args) const;

  // Behaviour identical to SortIndex::Sort for non-sortable fields that need to be fetched first
  std::vector<search::SortableValue> KeepTopKSorted(std::vector<DocId>* ids, size_t limit,
                                                    const SearchParams::SortOption& sort,
                                                    const OpArgs& op_args) const;

  // Remove a DocId from all HNSW indices for this index.
  void RemoveFromAllHnswIndices(search::DocId doc_id);

 private:
  std::shared_ptr<const DocIndex> base_;
  std::optional<search::FieldIndices> indices_;
  DocKeyIndex key_index_;
  Synonyms synonyms_;

  std::unique_ptr<search::IndexBuilder> builder_;

  // Buffered state for journal events arriving while HNSW vector indices
  // are being restored from serialized graph data (is_restoring_vectors_ == true).
  // Drained by RestoreGlobalVectorIndices after the graph is fully restored.
  absl::flat_hash_set<std::string> pending_vector_updates_;
  bool is_restoring_vectors_ = false;
};

// Stores shard doc indices by name on a specific shard.
class ShardDocIndices {
 public:
  ShardDocIndices();

  // Get sharded document index by its name or nullptr if not found
  ShardDocIndex* GetIndex(std::string_view name);

  // Init index: create shard local state for given index with given name.
  // Build if instance is in active state.
  void InitIndex(const OpArgs& op_args, std::string_view name,
                 std::shared_ptr<const DocIndex> index);

  // Drop index, return the dropped index if it existed or nullptr otherwise
  std::unique_ptr<ShardDocIndex> DropIndex(std::string_view name);

  // Drop all indices
  void DropAllIndices();

  // Rebuild all indices
  void RebuildAllIndices(const OpArgs& op_args, bool is_restored);

  // Block until construction of all indices finishes
  void BlockUntilConstructionEnd();

  std::vector<std::string> GetIndexNames() const;

  /* Use AddDoc and RemoveDoc only if pv object type is json or hset */
  void AddDoc(std::string_view key, const DbContext& db_cnt, PrimeValue* pv);
  void RemoveDoc(std::string_view key, const DbContext& db_cnt, const PrimeValue& pv);

  size_t GetUsedMemory() const;
  SearchStats GetStats() const;  // combines stats for all indices

  search::DefragmentResult Defragment(PageUsage* page_usage);

 private:
  // Clean caches that might have data from this index
  void DropIndexCache(const dfly::ShardDocIndex& shard_doc_index);

 private:
  MiMemoryResource local_mr_;
  absl::flat_hash_map<std::string, std::unique_ptr<ShardDocIndex>> indices_;

  std::string next_defrag_index_;
};

}  // namespace dfly


================================================
FILE: src/server/search/doc_index_fallback.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#ifndef WITH_SEARCH
#include "core/page_usage/page_usage_stats.h"
#include "core/search/base.h"
#include "server/search/doc_index.h"
#include "server/search/index_builder.h"

namespace dfly {

using namespace std;

ShardDocIndices::ShardDocIndices() : local_mr_(nullptr) {
}

void ShardDocIndices::AddDoc(std::string_view key, const DbContext& db_cnt, PrimeValue* pv) {
}
void ShardDocIndices::RemoveDoc(std::string_view key, const DbContext& db_cnt,
                                const PrimeValue& pv) {
}

void ShardDocIndices::DropAllIndices() {
}
void ShardDocIndices::RebuildAllIndices(const OpArgs& op_args, bool is_restored) {
}
void ShardDocIndices::BlockUntilConstructionEnd() {
}

size_t ShardDocIndices::GetUsedMemory() const {
  return 0;
}
SearchStats ShardDocIndices::GetStats() const {
  return {};
}

search::DefragmentResult ShardDocIndices::Defragment(PageUsage*) {
  return search::DefragmentResult{};
}

ShardDocIndex::~ShardDocIndex() {
}

}  // namespace dfly
#endif


================================================
FILE: src/server/search/global_hnsw_index.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/global_hnsw_index.h"

#include <absl/strings/str_cat.h>

#include "base/logging.h"
#include "core/search/ast_expr.h"
#include "core/search/base.h"
#include "core/search/index_result.h"
#include "core/search/indices.h"
#include "core/search/vector_utils.h"
#include "server/engine_shard.h"
#include "server/engine_shard_set.h"
#include "server/search/doc_accessors.h"
#include "server/search/doc_index.h"
#include "server/transaction.h"
#include "server/tx_base.h"

namespace dfly {

// Global index registry implementation

GlobalHnswIndexRegistry& GlobalHnswIndexRegistry::Instance() {
  static GlobalHnswIndexRegistry instance;
  return instance;
}

bool GlobalHnswIndexRegistry::Create(std::string_view index_name, std::string_view field_name,
                                     const search::SchemaField::VectorParams& params,
                                     DocIndex::DataType data_type) {
  std::string key = MakeKey(index_name, field_name);

  std::unique_lock<std::shared_mutex> lock(registry_mutex_);

  auto it = indices_.find(key);

  if (it != indices_.end())
    return false;

  // We make a copy of vector data when:
  // 1. Data type is JSON. This is because JSON object is not represented as contiguous memory.
  // 2. Data type is HASH and vector data memory size is smaller than threshold for listpack
  // encoding.
  //    We use pesimistic approach for decision and expect that ONLY VECTOR data field is used.
  //    When HSET object is created function `IsGoodForListpack` decides if object should be encoded
  //    as listpack or StringMap. Problem with listpack encoding is that vector memory, if
  //    referenced, can have wrong alignment for vector distance operations.
  const bool copy_vector =
      (data_type == DocIndex::JSON) || (params.dim * 4 < server.max_listpack_map_bytes);

  indices_[key] = std::make_shared<search::HnswVectorIndex>(params, copy_vector);

  return true;
}

bool GlobalHnswIndexRegistry::Remove(std::string_view index_name, std::string_view field_name) {
  std::string key = MakeKey(index_name, field_name);
  std::unique_lock<std::shared_mutex> lock(registry_mutex_);
  return bool(indices_.erase(key));
}

std::shared_ptr<search::HnswVectorIndex> GlobalHnswIndexRegistry::Get(
    std::string_view index_name, std::string_view field_name) const {
  std::string key = MakeKey(index_name, field_name);
  std::shared_lock<std::shared_mutex> lock(registry_mutex_);
  auto it = indices_.find(key);
  return it != indices_.end() ? it->second : nullptr;
}

bool GlobalHnswIndexRegistry::Exist(std::string_view index_name,
                                    std::string_view field_name) const {
  std::string key = MakeKey(index_name, field_name);
  std::shared_lock<std::shared_mutex> lock(registry_mutex_);
  return indices_.find(key) != indices_.end();
}

void GlobalHnswIndexRegistry::Reset() {
  std::unique_lock<std::shared_mutex> lock(registry_mutex_);
  indices_.clear();
}

absl::flat_hash_set<std::string> GlobalHnswIndexRegistry::GetIndexNames() const {
  std::shared_lock<std::shared_mutex> lock(registry_mutex_);
  absl::flat_hash_set<std::string> index_names;
  for (const auto& [key, _] : indices_) {
    // Keys are in format "index_name:field_name", extract index_name.
    // Use rfind because index names may legally contain ':' (e.g. ":Order:index").
    size_t pos = key.rfind(':');
    if (pos != std::string::npos) {
      index_names.insert(key.substr(0, pos));
    }
  }
  return index_names;
}

std::string GlobalHnswIndexRegistry::MakeKey(std::string_view index_name,
                                             std::string_view field_name) const {
  return absl::StrCat(index_name, ":", field_name);
}

}  // namespace dfly


================================================
FILE: src/server/search/global_hnsw_index.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include <memory>
#include <shared_mutex>
#include <string>
#include <string_view>
#include <vector>

#include "core/search/base.h"
#include "core/search/hnsw_index.h"
#include "core/search/search.h"
#include "server/search/doc_index.h"

namespace dfly {
class GlobalHnswIndexRegistry {
 public:
  static GlobalHnswIndexRegistry& Instance();

  bool Create(std::string_view index_name, std::string_view field_name,
              const search::SchemaField::VectorParams& params, DocIndex::DataType data_type);

  bool Remove(std::string_view index_name, std::string_view field_name);

  std::shared_ptr<search::HnswVectorIndex> Get(std::string_view index_name,
                                               std::string_view field_name) const;

  bool Exist(std::string_view index_name, std::string_view field_name) const;

  absl::flat_hash_map<std::string, std::shared_ptr<search::HnswVectorIndex>> GetAll() const {
    std::shared_lock<std::shared_mutex> lock(registry_mutex_);
    return indices_;
  }

  // Returns unique index names from all registered HNSW indices
  absl::flat_hash_set<std::string> GetIndexNames() const;

  void Reset();

 private:
  GlobalHnswIndexRegistry() = default;
  std::string MakeKey(std::string_view index_name, std::string_view field_name) const;

  mutable std::shared_mutex registry_mutex_;
  absl::flat_hash_map<std::string, std::shared_ptr<search::HnswVectorIndex>> indices_;
};

}  // namespace dfly


================================================
FILE: src/server/search/index_builder.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/index_builder.h"

#include <ranges>

#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/search/doc_accessors.h"
#include "server/search/global_hnsw_index.h"

namespace dfly::search {

void IndexBuilder::Start(const OpArgs& op_args, bool is_restored,
                         std::function<void()> on_complete) {
  using namespace util::fb2;
  auto table = op_args.GetDbSlice().CopyDBTablePtr(op_args.db_cntx.db_index);
  DCHECK(table.get());

  is_restored_ = is_restored;

  auto cb = [this, table, db_cntx = op_args.db_cntx, on_complete = std::move(on_complete)] {
    CursorLoop(table.get(), db_cntx);
    VectorLoop(table.get(), db_cntx);

    // TODO: make it step by step + wire cancellation inside
    if (state_.IsRunning())
      index_->indices_->FinalizeInitialization();

    // Finish by clearing the fiber reference and calling on_complete as its last action
    {
      util::FiberAtomicGuard guard{};  // preserve cancellation
      fiber_.Detach();                 // builder is now safely deleteable
      if (!state_.IsCancelled())
        on_complete();
    }
  };

  fiber_ = Fiber{std::move(cb)};
}

void IndexBuilder::Cancel() {
  state_.Cancel();
  util::fb2::Fiber{std::move(fiber_)}.JoinIfNeeded();  // steal and wait for finish
}

util::fb2::Fiber IndexBuilder::Worker() {
  return std::move(fiber_);
}

void IndexBuilder::CursorLoop(dfly::DbTable* table, DbContext db_cntx) {
  auto cb = [this, db_cntx, scratch = std::string{}](PrimeTable::iterator it) mutable {
    PrimeValue& pv = it->second;
    std::string_view key = it->first.GetSlice(&scratch);

    if (!index_->Matches(key, pv.ObjType()))
      return;

    // TODO: make it a parameter of SharDocIndex::AddDoc()
    if (is_restored_) {
      // Use existing DocIds from the restored key_index_ to keep them aligned with
      // GlobalDocIds stored in the serialized HNSW graph. Only add to regular indices
      // (text/tag/numeric); vector indices are handled separately by VectorLoop.
      if (auto doc_id = index_->key_index().Find(key); doc_id) {
        auto accessor = GetAccessor(db_cntx, pv);
        if (!index_->indices_->Add(*doc_id, *accessor)) {
          LOG(WARNING) << "Failed to restore index entry for key: " << key
                       << ", removing from key index";
          index_->key_index_.Remove(*doc_id);
        }
      } else {
        // New document not in the restored key_index_ (added by journal events during
        // full sync before the index was created). Use AddNew to allocate a fresh DocId
        // that won't collide with serialized HNSW node ids from freed slots.
        auto accessor = GetAccessor(db_cntx, pv);
        DocId id = index_->key_index_.AddNew(key);
        if (!index_->indices_->Add(id, *accessor)) {
          index_->key_index_.Remove(id);
        }
      }
    } else {
      index_->AddDoc(key, db_cntx, pv);
    }
  };

  PrimeTable::Cursor cursor;
  do {
    cursor = table->prime.Traverse(cursor, cb);
    if (base::CycleClock::ToUsec(util::ThisFiber::GetRunningTimeCycles()) > 500)
      util::ThisFiber::Yield();
  } while (cursor && state_.IsRunning());
}

void IndexBuilder::VectorLoop(dfly::DbTable* table, DbContext db_cntx) {
  bool any_vector = std::ranges::any_of(index_->base_->schema.fields, [](const auto& item) {
    return item.second.IsIndexableHnswField();
  });
  if (!any_vector || !state_.IsRunning())
    return;

  // If any HNSW index was restored from RDB, use UpdateVectorData instead of Add.
  if (is_restored_) {
    // TODO: Add support for concurrent modifications
    OpArgs op_args{EngineShard::tlocal(), nullptr, db_cntx};
    index_->RestoreGlobalVectorIndices(index_->base_->name, op_args);
    return;
  }

  // Non-restored path: rebuilding HNSW from scratch. Clear the restoring flag and discard
  // any pending updates — the full table traversal below will pick up all current documents.
  index_->is_restoring_vectors_ = false;
  index_->pending_vector_updates_.clear();

  auto cb = [this, db_cntx, scratch = std::string{}](PrimeTable::iterator it) mutable {
    PrimeValue& pv = it->second;
    std::string_view key = it->first.GetSlice(&scratch);

    if (auto local_id = index_->key_index().Find(key); local_id)
      index_->AddDocToGlobalVectorIndex(*local_id, db_cntx, &pv);
  };

  // Because order of acquiring mutexes for global vector indices is not determined, we must run
  // all accesses on a single thread through the shard queue to have a single linear order
  // TODO: this prevents asynchronous indexing for vector fields
  auto shard_cb = [&] {
    PrimeTable::Cursor cursor;
    do {
      cursor = table->prime.Traverse(cursor, cb);
      if (base::CycleClock::ToUsec(util::ThisFiber::GetRunningTimeCycles()) > 500)
        util::ThisFiber::Yield();
    } while (cursor && state_.IsRunning());
  };
  shard_set->Await(EngineShard::tlocal()->shard_id(), std::move(shard_cb));
}

}  // namespace dfly::search


================================================
FILE: src/server/search/index_builder.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <functional>

#include "server/execution_state.h"
#include "server/tx_base.h"

namespace dfly {
struct DbTable;
class ShardDocIndex;
}  // namespace dfly

namespace dfly::search {

// Asynchronous index builder
struct IndexBuilder {
  explicit IndexBuilder(ShardDocIndex* index) : index_{index} {
  }

  // Start building and call `on_complete` on finish from worker fiber.
  // If `is_restored` is true, VectorLoop will use UpdateVectorData instead of Add
  // for HNSW indices (restored from RDB). This flag is passed from PerformPostLoad.
  void Start(const OpArgs& op_args, bool is_restored, std::function<void()> on_complete);

  // Cancel building and wait for worker to finish. Safe to delete after
  // TODO: Maybe implement nonblocking version?
  void Cancel();

  // Get fiber reference. Temporary to polyfill sync construction places
  util::fb2::Fiber Worker();

 private:
  // Loop with cursor over table and add entries to regular index
  void CursorLoop(DbTable* table, DbContext db_cntx);

  // Loop with cursor over table and add entries to global HNSW vector indices
  void VectorLoop(DbTable* table, DbContext db_cntx);

  dfly::ExecutionState state_;
  ShardDocIndex* index_;
  bool is_restored_ = false;
  util::fb2::Fiber fiber_;
};

}  // namespace dfly::search


================================================
FILE: src/server/search/index_join.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/index_join.h"

namespace dfly::join {

namespace {
// Joins joined_entries with new index entries using join_expressions.
// It uses hash joining algorithm to find matching entries.
std::vector<KeyIndexes> JoinWithNewIndex(
    EntriesPerIndex indexes_entries, absl::Span<const KeyIndexes> joined_entries,
    size_t new_index,  // represented as index in indexes_entries
    absl::Span<const JoinExpression> join_expressions) {
  /* We fill join_map with values sets from joined entries.
     In join_map we store {set of field values} to indexes in joined_entries that match this set of
     field values. So, then we can go over new_index entries and match their values with
     joined_entries using this.
     TODO: use hash map for the smallest set (new_index or joined_entries) */
  using ValuesSet = Vector<JoinableValue>;
  using JoinEntriesIndexes = absl::InlinedVector<size_t, 1>;
  absl::flat_hash_map<ValuesSet, JoinEntriesIndexes> join_map;
  join_map.reserve(joined_entries.size());

  // Now we need to initialize join_map with values of joined entries.
  for (size_t i = 0; i < joined_entries.size(); ++i) {
    const auto& joined_entry_keys = joined_entries[i];

    ValuesSet values_set;
    values_set.reserve(join_expressions.size());

    // Go over all join expressions and get field values using foreign index and field.
    for (const auto& join_expression : join_expressions) {
      size_t index = join_expression.foreign_index;
      size_t field_index = join_expression.foreign_field;

      // Now we need to get value of this field from joined key in this index
      DCHECK_LT(index, joined_entry_keys.size()) << "Join order broken, index out of range";
      KeyIndex key_index = joined_entry_keys[index];
      const JoinableValue& field_value = indexes_entries[index][key_index].second[field_index];

      // Add value to the set
      values_set.push_back(field_value);
    }

    // That means that this set of values corresponds to joined entry i
    join_map[values_set].push_back(i);
  }

  std::vector<KeyIndexes> result;
  result.reserve(join_map.size());

  // Now we store all possible sets of values in joined_entries that match this set.
  // We can iterate over new index and find entries with the same set of values.
  const auto& new_index_entries = indexes_entries[new_index];
  for (size_t i = 0; i < new_index_entries.size(); ++i) {
    const auto& index_entries = new_index_entries[i].second;

    ValuesSet values_set;
    values_set.reserve(join_expressions.size());
    // Go over all join expressions and get field values for this entry
    for (const auto& join_expression : join_expressions) {
      const JoinableValue& field_value = index_entries[join_expression.field];
      values_set.push_back(field_value);
    }

    // Now we need to find this set in the join_map
    auto it = join_map.find(values_set);
    if (it == join_map.end()) {
      continue;
    }

    // This entry in new index matches some joined entries,
    // we need to go over all entries with the same set of values
    // and add them to the result
    for (size_t joined_entry_index : it->second) {
      result.push_back(joined_entries[joined_entry_index]);
      // Add new index entry to the joined entry
      result.back().push_back(i);
    }
  }

  return result;
}

}  // anonymous namespace

Vector<Vector<Key>> JoinAllIndexes(
    EntriesPerIndex indexes_entries, IndexesJoinExpressions joins,
    absl::FunctionRef<void(std::vector<KeyIndexes>*)> aggregate_after_join) {
  if (indexes_entries.empty()) {
    return {};
  }

  // Will used to initialize joined entries
  const auto& first_index_entries = indexes_entries[0];

  /* Store current result of joins
     Each entry is vector of indexes, that referce to one key in the index
     For example, {1, 0, 4} means that key with index 1 in the first index,
     key with index 0 in the second index and key with index 4 in the third index were joined to
     single entry. */
  std::vector<KeyIndexes> joined_entries(first_index_entries.size(), KeyIndexes(1));

  // At the first step all keys from the first index are joined
  for (size_t i = 0; i < first_index_entries.size(); ++i) {
    joined_entries[i][0] = i;
  }

  DCHECK(joins[0].empty()) << "Base index must be first and have no joins";

  /* Now we need to iterate over all indexes and the joins
     Using joins for the new index, we will find matching entries in the current result
     (joined_entries) with the entries in the new index. */
  for (size_t i = 1; i < indexes_entries.size(); ++i) {
    joined_entries = JoinWithNewIndex(indexes_entries, joined_entries, i, joins[i]);
  }

  // Apply aggregation after join if needed
  // It can change size of joined_entries
  aggregate_after_join(&joined_entries);

  const size_t result_size = joined_entries.size();
  const size_t indexes_count = indexes_entries.size();
  // Now we have joined entries, we need to build JoinResult
  Vector<Vector<Key>> result(result_size, Vector<Key>(indexes_count));

  for (size_t i = 0; i < result_size; ++i) {
    auto& result_entry = result[i];

    for (size_t index = 0; index < indexes_count; ++index) {
      // Index of joined key in the current index
      KeyIndex key_index = joined_entries[i][index];
      // Find key by the key_index
      const auto& key = indexes_entries[index][key_index].first;

      // Add key to the result
      // That means that this key from this index was joined
      result_entry[index] = key;
    }
  }

  return result;
}

Vector<Vector<Key>> JoinAllIndexes(EntriesPerIndex indexes_entries, IndexesJoinExpressions joins) {
  return JoinAllIndexes(indexes_entries, joins, [](std::vector<KeyIndexes>*) {});
}

}  // namespace dfly::join


================================================
FILE: src/server/search/index_join.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <vector>

#include "base/logging.h"
#include "core/linear_search_map.h"
#include "core/search/base.h"
#include "server/common_types.h"

namespace dfly::join {

template <typename T> using Vector = absl::InlinedVector<T, 4>;

/* Represents field value.
   Same as search::SortableValue, but do not have monostate and stores string_view instead of
   std::string. */
using JoinableValue = std::variant<double, std::string_view>;

/* Each index has its own set of fields used for joins.
   Additionally, each index contains multiple keys/documents it has indexed, and each document
   includes several fields.

   For example:
    JOIN index2 ON index2.field1 = other_index.field2 AND index2.field3 = other_index.field4

    So, index2 uses field1 and field3 for joins. It also indexed docs key1, key2, key3:
    EntriesPerIndex will store something like:
                            [{"key1", {"field1" : value, "field3" : value}},
                             {"key2", {"field1" : value, "field3" : value}},
                             {"key3", {"field1" : value, "field3" : value}}].
    But to make join algorithm more efficient, we store it as raw vectors,
    instead of field_name as string, we use indexes;
    instead of key names we use shard id and doc id.
*/
using Key = std::pair<ShardId, search::DocId>;
using Entry = std::pair<Key, Vector<JoinableValue> /*fields values of this key*/>;
using EntriesPerIndex = absl::Span<const Vector<Entry> /*one index can store several keys*/>;

// TODO: comments
using OwnedJoinableValue = std::variant<double, std::string>;
using OwnedEntry = std::pair<Key, Vector<OwnedJoinableValue>>;

// Stores data for single join expression,
// e.g. index1.field1 = index2.field2:
// field - "field1", foreign_index - "index2", foreign_field - "field2"
struct JoinExpression {
  size_t field;          // field is represented as index in the Entry.second array
  size_t foreign_index;  // foreign_index is represented as index in the EntriesPerIndex array
  size_t foreign_field;  // foreign_field is too represented as index in the Entry.second array
};

using JoinExpressionsVec = Vector<JoinExpression>;

/* Each index can have several join expressions, e.g.:
   JOIN index1 ON index1.field1 = other_index.field2 AND index1.field3 = other_index.field4
   will result in:
   {"index1", {{"field1", "other_index", "field2"}, {"field3", "other_index", "field4"}}} */
using IndexesJoinExpressions = absl::Span<const JoinExpressionsVec>;

using KeyIndex = size_t;
using KeyIndexes = Vector<KeyIndex>;

/* Joins all indexes in indexes_map using join_expressions.
   Join algorithm is used is hash join. */
Vector<Vector<Key>> JoinAllIndexes(
    EntriesPerIndex indexes_entries, IndexesJoinExpressions joins,
    absl::FunctionRef<void(std::vector<KeyIndexes>*)> aggregate_after_join);

Vector<Vector<Key>> JoinAllIndexes(EntriesPerIndex indexes_entries, IndexesJoinExpressions joins);

}  // namespace dfly::join


================================================
FILE: src/server/search/index_join_test.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/index_join.h"

#include <absl/container/flat_hash_set.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <utility>

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly {

using namespace join;

class IndexJoinTest : public testing::Test {
 protected:
};

struct TestIndexData {
  struct FieldData {
    std::string_view name;
    JoinableValue value;
  };

  struct KeyData {
    std::string_view key;
    std::vector<FieldData> fields;
  };

  std::string_view index_name;
  std::vector<KeyData> entries;
};

struct TestJoinExpression {
  std::string_view field;
  std::string_view foreign_index;
  std::string_view foreign_field;
};

struct PreprocessedIndexData {
  Vector<Vector<Entry>> entries;
  std::unordered_map<std::string_view, size_t> index_name_to_index;
  std::unordered_map<std::string_view, Key> key_name_to_key;
  std::vector<std::unordered_map<std::string_view, size_t>> field_names_to_index;
};

MATCHER_P(IsJoinResultMatcher, expected, "") {
  std::vector<testing::Matcher<std::vector<join::Key>>> matchers;
  for (const auto& entry : expected) {
    std::vector<join::Key> keys;
    for (auto field : entry) {
      keys.push_back(field);
    }
    matchers.push_back(testing::ElementsAreArray(keys));
  }

  std::vector<std::vector<join::Key>> result;
  for (size_t index = 0; index < arg.size(); ++index) {
    std::vector<join::Key> entry;
    for (const auto& key : arg[index]) {
      entry.push_back(key);
    }
    result.push_back(std::move(entry));
  }
  return testing::ExplainMatchResult(testing::UnorderedElementsAreArray(matchers), result,
                                     result_listener);
}

template <typename... Args>
auto IsJoinResult(const PreprocessedIndexData& data,
                  std::vector<std::vector<std::string_view>> joined_data) {
  std::vector<std::vector<join::Key>> joined_keys(joined_data.size());
  for (size_t i = 0; i < joined_data.size(); ++i) {
    for (const auto& entry : joined_data[i]) {
      auto it = data.key_name_to_key.find(entry);
      DCHECK(it != data.key_name_to_key.end()) << "Key not found in index data: " << entry;
      joined_keys[i].push_back(it->second);
    }
  }

  return IsJoinResultMatcher(std::move(joined_keys));
}

PreprocessedIndexData PreprocessIndexesData(std::vector<TestIndexData> indexes_data) {
  PreprocessedIndexData data;

  auto contains = [](const auto& set, const auto& key) { return set.find(key) != set.end(); };

  search::DocId doc_id = 0;
  for (size_t index = 0; index < indexes_data.size(); index++) {
    const auto& [index_name, index_data] = indexes_data[index];
    DCHECK(!contains(data.index_name_to_index, index_name))
        << "Duplicate index name: " << index_name;
    data.index_name_to_index[index_name] = index;

    data.field_names_to_index.emplace_back();
    auto& field_names_map = data.field_names_to_index.back();

    if (!index_data.empty()) {
      for (size_t i = 0; i < index_data[0].fields.size(); ++i) {
        const auto& field = index_data[0].fields[i];
        DCHECK(!contains(field_names_map, field.name))
            << "Duplicate field name in index: " << field.name;
        field_names_map[field.name] = i;
      }
    }

    Vector<Entry> index_entries;
    index_entries.reserve(index_data.size());

    for (size_t i = 0; i < index_data.size(); ++i) {
      const auto& [key, fields] = index_data[i];
      DCHECK(!contains(data.key_name_to_key, key)) << "Duplicate key name in index: " << key;

      Key key_for_join = {0 /*in tests we are using 0 for ShardId*/, doc_id++};
      data.key_name_to_key[key] = key_for_join;

      Entry entry = {key_for_join, Vector<JoinableValue>(field_names_map.size())};
      std::set<std::string_view> fields_set;
      for (const auto& [field_name, field_value] : fields) {
        DCHECK(contains(field_names_map, field_name));
        DCHECK(!contains(fields_set, field_name)) << "Duplicate field name in key: " << field_name;

        entry.second[field_names_map[field_name]] = field_value;
        fields_set.insert(field_name);
      }

      DCHECK_EQ(fields_set.size(), field_names_map.size())
          << "Not all fields are set for key: " << key;

      index_entries.emplace_back(std::move(entry));
    }

    data.entries.emplace_back(std::move(index_entries));
  }

  return data;
}

join::Vector<JoinExpressionsVec> BuildJoinExpressions(
    const PreprocessedIndexData& index_data,
    std::initializer_list<std::pair<std::string_view, std::initializer_list<TestJoinExpression>>>
        data) {
  join::Vector<JoinExpressionsVec> join_expressions(1);

  auto contains = [](const auto& set, const auto& key) { return set.find(key) != set.end(); };

  std::set<std::string_view> index_names_set;
  for (const auto& [index_name, expressions] : data) {
    DCHECK(contains(index_data.index_name_to_index, index_name))
        << "Index not found in join expressions: " << index_name;
    DCHECK(!contains(index_names_set, index_name))
        << "Duplicate index name in join expressions: " << index_name;

    index_names_set.insert(index_name);
    size_t current_index = index_data.index_name_to_index.at(index_name);

    JoinExpressionsVec exprs;
    for (const auto& expr : expressions) {
      DCHECK(contains(index_data.field_names_to_index[current_index], expr.field))
          << "Field not found in index: " << expr.field;
      size_t field_index = index_data.field_names_to_index[current_index].at(expr.field);

      DCHECK(contains(index_data.index_name_to_index, expr.foreign_index))
          << "Foreign index not found in join expressions: " << expr.foreign_index;
      size_t foreign_index = index_data.index_name_to_index.at(expr.foreign_index);

      DCHECK(contains(index_data.field_names_to_index[foreign_index], expr.foreign_field))
          << "Foreign field not found in foreign index: " << expr.foreign_field;
      size_t foreign_field_index =
          index_data.field_names_to_index[foreign_index].at(expr.foreign_field);

      exprs.emplace_back(JoinExpression{field_index, foreign_index, foreign_field_index});
    }

    join_expressions.emplace_back(std::move(exprs));
  }

  return join_expressions;
}

TEST_F(IndexJoinTest, SimpleJoin) {
  auto data = PreprocessIndexesData({{"index1",
                                      {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key2", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                     {"index2",
                                      {{"key3", {{"field3", 1.0}, {"field4", "value3"}}},
                                       {"key4", {{"field3", 2.0}, {"field4", "value4"}}}}}});

  auto joins = BuildJoinExpressions(data, {{"index2", {{"field3", "index1", "field1"}}}});

  auto result = JoinAllIndexes(data.entries, joins);
  EXPECT_THAT(result, IsJoinResult(data, {{"key1", "key3"}, {"key2", "key4"}}));
}

TEST_F(IndexJoinTest, MultipleJoins) {
  auto data = PreprocessIndexesData({{"index1",
                                      {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key2", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                     {"index2",
                                      {{"key3", {{"field3", 1.0}, {"field4", "value3"}}},
                                       {"key4", {{"field3", 2.0}, {"field4", "value4"}}}}},
                                     {"index3",
                                      {{"key5", {{"field5", 1.0}, {"field6", "value5"}}},
                                       {"key6", {{"field5", 2.0}, {"field6", "value6"}}}}}});

  auto joins = BuildJoinExpressions(data, {{"index2", {{"field3", "index1", "field1"}}},
                                           {"index3", {{"field5", "index2", "field3"}}}});

  auto result = JoinAllIndexes(data.entries, joins);
  EXPECT_THAT(result, IsJoinResult(data, {{"key1", "key3", "key5"}, {"key2", "key4", "key6"}}));
}

TEST_F(IndexJoinTest, NoMatches) {
  // Different values
  auto data = PreprocessIndexesData({{"index1",
                                      {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key2", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                     {"index2",
                                      {{"key3", {{"field3", 3.0}, {"field4", "value3"}}},
                                       {"key4", {{"field3", 4.0}, {"field4", "value4"}}}}}});

  auto joins = BuildJoinExpressions(data, {{"index2", {{"field3", "index1", "field1"}}}});

  auto result = JoinAllIndexes(data.entries, joins);
  EXPECT_TRUE(result.empty());

  // Different types
  auto data2 = PreprocessIndexesData({{"index1",
                                       {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                        {"key2", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                      {"index2",
                                       {{"key3", {{"field3", "value3"}, {"field4", "value4"}}},
                                        {"key4", {{"field3", "value5"}, {"field4", "value6"}}}}}});

  auto joins2 = BuildJoinExpressions(data2, {{"index2", {{"field3", "index1", "field1"}}}});

  result = JoinAllIndexes(data2.entries, joins2);
  EXPECT_TRUE(result.empty());
}

TEST_F(IndexJoinTest, JoinWithMultipleFields) {
  auto data = PreprocessIndexesData({{"index1",
                                      {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key2", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                     {"index2",
                                      {{"key3", {{"field3", 1.0}, {"field4", "value1"}}},
                                       {"key4", {{"field3", 2.0}, {"field4", "value2"}}}}},
                                     {"index3",
                                      {{"key5", {{"field5", 1.0}, {"field6", "value1"}}},
                                       {"key6", {{"field5", 2.0}, {"field6", "value2"}}}}}});

  auto joins = BuildJoinExpressions(
      data, {{"index2", {{"field3", "index1", "field1"}, {"field4", "index1", "field2"}}},
             {"index3", {{"field5", "index2", "field3"}, {"field6", "index2", "field4"}}}});

  auto result = JoinAllIndexes(data.entries, joins);
  EXPECT_THAT(result, IsJoinResult(data, {{"key1", "key3", "key5"}, {"key2", "key4", "key6"}}));
}

TEST_F(IndexJoinTest, JoinWithSeveralCopiesOfSameKey) {
  auto data = PreprocessIndexesData({{"index1",
                                      {{"key1", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key2", {{"field1", 2.0}, {"field2", "value2"}}},
                                       {"key3", {{"field1", 1.0}, {"field2", "value1"}}},
                                       {"key4", {{"field1", 2.0}, {"field2", "value2"}}}}},
                                     {"index2",
                                      {{"key5", {{"field3", 1.0}, {"field4", "value1"}}},
                                       {"key6", {{"field3", 2.0}, {"field4", "value2"}}}}},
                                     {"index3",
                                      {{"key7", {{"field5", 1.0}, {"field6", "value1"}}},
                                       {"key8", {{"field5", 2.0}, {"field6", "value2"}}},
                                       {"key9", {{"field5", 1.0}, {"field6", "value1"}}},
                                       {"key10", {{"field5", 2.0}, {"field6", "value2"}}},
                                       {"key11", {{"field5", 11.0}, {"field6", "value2"}}}}}});

  auto joins = BuildJoinExpressions(
      data, {{"index2", {{"field3", "index1", "field1"}, {"field4", "index1", "field2"}}},
             {"index3", {{"field5", "index2", "field3"}, {"field6", "index2", "field4"}}}});

  auto result = JoinAllIndexes(data.entries, joins);
  EXPECT_THAT(result, IsJoinResult(data, {{"key1", "key5", "key7"},
                                          {"key2", "key6", "key8"},
                                          {"key3", "key5", "key7"},
                                          {"key4", "key6", "key8"},
                                          {"key1", "key5", "key9"},
                                          {"key2", "key6", "key10"},
                                          {"key3", "key5", "key9"},
                                          {"key4", "key6", "key10"}}));
}

}  // namespace dfly


================================================
FILE: src/server/search/search_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/search_family.h"

#include <absl/container/flat_hash_map.h>
#include <absl/flags/flag.h>
#include <absl/strings/match.h>
#include <absl/strings/str_format.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include <atomic>
#include <variant>
#include <vector>

#include "base/logging.h"
#include "core/search/indices.h"
#include "core/search/query_driver.h"
#include "core/search/search.h"
#include "core/search/vector_utils.h"
#include "facade/cmd_arg_parser.h"
#include "facade/error.h"
#include "facade/reply_builder.h"
#include "server/acl/acl_commands_def.h"
#include "server/cluster/cluster_config.h"
#include "server/cluster/coordinator.h"
#include "server/command_registry.h"
#include "server/config_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/namespaces.h"
#include "server/search/aggregator.h"
#include "server/search/doc_index.h"
#include "server/search/global_hnsw_index.h"
#include "server/transaction.h"
#include "src/core/overloaded.h"

ABSL_FLAG(bool, search_reject_legacy_field, true, "FT.AGGREGATE: Reject legacy field names.");
ABSL_FLAG(bool, cluster_search, false,
          "Enable search commands for cross-shard search. turned off by default for safety.");

ABSL_FLAG(size_t, MAXSEARCHRESULTS, 1000000, "Maximum number of results from ft.search command");

ABSL_FLAG(size_t, search_query_string_bytes, 10240,
          "Maximum number of bytes in search query string");

ABSL_FLAG(size_t, subset_knn_search_threshold, 8192,
          "If prefilter results are below this threshold, we will do exact subset search "
          "instead of HNSW graph search");

namespace dfly {

using namespace std;
using namespace facade;

namespace {
// we use it to find which flags are belong to search
const std::string kCurrentFile = std::filesystem::path(__FILE__).filename().string();

using nonstd::make_unexpected;

template <typename T> using ParseResult = io::Result<T, ErrorReply>;

nonstd::unexpected_type<ErrorReply> CreateSyntaxError(std::string message) {
  return make_unexpected(ErrorReply{std::move(message), kSyntaxErrType});
}

nonstd::unexpected_type<ErrorReply> CreateSyntaxError(std::string_view message) {
  return make_unexpected(ErrorReply{message, kSyntaxErrType});
}

string IndexNotFoundMsg(string_view index_name) {
  return absl::StrCat("Index with name '", index_name, "' not found");
}

// Send error from parser or result
// Returns false if no errors occured
template <typename T>
bool SendErrorIfOccurred(const ParseResult<T>& result, CmdArgParser* parser,
                         CommandContext* cmd_cntx) {
  if (auto err = parser->TakeError(); err || !result) {
    cmd_cntx->SendError(!result ? result.error() : err.MakeReply());
    return true;
  }

  return false;
}

bool IsValidJsonPath(string_view path) {
  error_code ec;
  MakeJsonPathExpr<TmpJson>(path, ec);
  return !ec;
}

search::SchemaField::VectorParams ParseVectorParams(CmdArgParser* parser) {
  search::SchemaField::VectorParams params{};

  params.use_hnsw = parser->MapNext("HNSW", true, "FLAT", false);
  const size_t num_args = parser->Next<size_t>();

  for (size_t i = 0; i * 2 < num_args; i++) {
    if (parser->Check("DIM", &params.dim)) {
    } else if (parser->Check("DISTANCE_METRIC")) {
      params.sim =
          parser->MapNext("L2", search::VectorSimilarity::L2, "IP", search::VectorSimilarity::IP,
                          "COSINE", search::VectorSimilarity::COSINE);
    } else if (parser->Check("INITIAL_CAP", &params.capacity)) {
    } else if (parser->Check("M", &params.hnsw_m)) {
    } else if (parser->Check("EF_CONSTRUCTION", &params.hnsw_ef_construction)) {
    } else if (parser->Check("EF_RUNTIME")) {
      parser->Next<size_t>();
      LOG(WARNING) << "EF_RUNTIME not supported";
    } else if (parser->Check("EPSILON")) {
      parser->Next<double>();
      LOG(WARNING) << "EPSILON not supported";
    } else {
      parser->Skip(2);
    }
  }

  return params;
}

ParseResult<search::SchemaField::TagParams> ParseTagParams(CmdArgParser* parser) {
  search::SchemaField::TagParams params{};
  while (parser->HasNext()) {
    if (parser->Check("SEPARATOR")) {
      std::string_view separator = parser->NextOrDefault();

      if (separator.size() != 1) {
        return CreateSyntaxError(
            absl::StrCat("Tag separator must be a single character. Got `"sv, separator, "`"sv));
      }

      params.separator = separator.front();
      continue;
    }

    if (parser->Check("CASESENSITIVE")) {
      params.case_sensitive = true;
      continue;
    }

    if (parser->Check("WITHSUFFIXTRIE")) {
      params.with_suffixtrie = true;
      continue;
    }

    break;
  }
  return params;
}

ParseResult<search::SchemaField::TextParams> ParseTextParams(CmdArgParser* parser) {
  search::SchemaField::TextParams params{};
  params.with_suffixtrie = parser->Check("WITHSUFFIXTRIE");
  return params;
}

search::SchemaField::NumericParams ParseNumericParams(CmdArgParser* parser) {
  search::SchemaField::NumericParams params{};
  if (parser->Check("BLOCKSIZE")) {
    params.block_size = parser->Next<size_t>();
  }
  return params;
}

// breaks on ParamsVariant initialization
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

using ParsedSchemaField =
    ParseResult<std::pair<search::SchemaField::FieldType, search::SchemaField::ParamsVariant>>;

// Tag fields include: [separator char] [casesensitive]
ParsedSchemaField ParseTag(CmdArgParser* parser) {
  auto tag_params = ParseTagParams(parser);
  if (!tag_params) {
    return make_unexpected(tag_params.error());
  }
  return std::make_pair(search::SchemaField::TAG, std::move(tag_params).value());
}

ParsedSchemaField ParseText(CmdArgParser* parser) {
  auto text_params = ParseTextParams(parser);
  if (!text_params)
    return make_unexpected(text_params.error());
  return std::make_pair(search::SchemaField::TEXT, std::move(text_params).value());
}

ParsedSchemaField ParseNumeric(CmdArgParser* parser) {
  return std::make_pair(search::SchemaField::NUMERIC, ParseNumericParams(parser));
}

// Vector fields include: {algorithm} num_args args...
ParsedSchemaField ParseVector(CmdArgParser* parser) {
  auto vector_params = ParseVectorParams(parser);

  if (parser->HasError()) {
    auto err = parser->TakeError();
    VLOG(1) << "Could not parse vector param " << err.index;
    return CreateSyntaxError("Parse error of vector parameters"sv);
  }

  if (vector_params.dim == 0) {
    return CreateSyntaxError("Knn vector dimension cannot be zero"sv);
  }
  return std::make_pair(search::SchemaField::VECTOR, vector_params);
}

ParsedSchemaField ParseGeo(CmdArgParser* parser) {
  return std::make_pair(search::SchemaField::GEO, std::monostate{});
}

// ON HASH | JSON
ParseResult<bool> ParseOnOption(CmdArgParser* parser, DocIndex* index) {
  index->type = parser->MapNext("HASH"sv, DocIndex::HASH, "JSON"sv, DocIndex::JSON);
  return true;
}

// PREFIX count prefix [prefix ...]
ParseResult<bool> ParsePrefix(CmdArgParser* parser, DocIndex* index) {
  size_t count = parser->Next<size_t>();
  index->prefixes.reserve(count);
  for (size_t i = 0; i < count; i++) {
    index->prefixes.push_back(parser->Next<std::string>());
  }
  return true;
}

// STOPWORDS count [words...]
ParseResult<bool> ParseStopwords(CmdArgParser* parser, DocIndex* index) {
  index->options.stopwords.clear();
  for (size_t num = parser->Next<size_t>(); num > 0; num--) {
    index->options.stopwords.emplace(parser->Next());
  }
  return true;
}

constexpr std::array<const std::string_view, 4> kIgnoredOptions = {
    "UNF"sv, "NOSTEM"sv, "INDEXMISSING"sv, "INDEXEMPTY"sv};
constexpr std::array<const std::string_view, 3> kIgnoredOptionsWithArg = {"WEIGHT"sv, "PHONETIC"sv};

// SCHEMA field [AS alias] type [flags...]
ParseResult<bool> ParseSchema(CmdArgParser* parser, DocIndex* index) {
  auto& schema = index->schema;

  if (!parser->HasNext()) {
    return CreateSyntaxError("Fields arguments are missing"sv);
  }

  while (parser->HasNext()) {
    string_view field = parser->Next();
    string_view field_alias = field;

    // Verify json path is correct
    if (index->type == DocIndex::JSON && !IsValidJsonPath(field)) {
      return CreateSyntaxError(absl::StrCat("Bad json path: "sv, field));
    }

    // AS [alias]
    parser->Check("AS", &field_alias);

    if (schema.field_names.contains(field_alias)) {
      return CreateSyntaxError(absl::StrCat("Duplicate field in schema - "sv, field_alias));
    }

    // Determine type
    using search::SchemaField;
    auto params_parser =
        parser->TryMapNext("TAG"sv, &ParseTag, "TEXT"sv, &ParseText, "NUMERIC"sv, &ParseNumeric,
                           "VECTOR"sv, &ParseVector, "GEO", &ParseGeo);
    if (!params_parser) {
      return CreateSyntaxError(
          absl::StrCat("Field type "sv, parser->Next(), " is not supported"sv));
    }

    auto parsed_params = params_parser.value()(parser);
    if (!parsed_params) {
      return make_unexpected(parsed_params.error());
    }

    auto [field_type, params] = std::move(parsed_params).value();

    // Flags: check for SORTABLE and NOINDEX
    uint8_t flags = 0;
    while (parser->HasNext()) {
      auto flag = parser->TryMapNext("NOINDEX", search::SchemaField::NOINDEX, "SORTABLE",
                                     search::SchemaField::SORTABLE);
      if (!flag) {
        std::string_view option = parser->Peek();
        if (std::find(kIgnoredOptions.begin(), kIgnoredOptions.end(), option) !=
            kIgnoredOptions.end()) {
          LOG_IF(WARNING, option != "INDEXMISSING"sv && option != "INDEXEMPTY"sv)
              << "Ignoring unsupported field option in FT.CREATE: " << option;
          // Ignore these options
          parser->Skip(1);
          continue;
        }
        if (std::find(kIgnoredOptionsWithArg.begin(), kIgnoredOptionsWithArg.end(), option) !=
            kIgnoredOptionsWithArg.end()) {
          LOG(WARNING) << "Ignoring unsupported field option in FT.CREATE: " << option;
          // Ignore these options with argument
          parser->Skip(2);
          continue;
        }
        break;
      }

      flags |= *flag;
    }

    schema.fields[field] = {field_type, flags, string{field_alias}, params};
    schema.field_names[field_alias] = field;
  }

  return false;
}

#ifndef __clang__
#pragma GCC diagnostic pop
#endif

ParseResult<DocIndex> CreateDocIndex(std::string_view name, CmdArgParser* parser) {
  DocIndex index{};
  index.name = name;

  while (parser->HasNext()) {
    auto option_parser =
        parser->TryMapNext("ON"sv, &ParseOnOption, "PREFIX"sv, &ParsePrefix, "STOPWORDS"sv,
                           &ParseStopwords, "SCHEMA"sv, &ParseSchema);

    if (!option_parser) {
      // Unsupported parameters are ignored for now
      parser->Skip(1);
      continue;
    }

    auto parse_result = option_parser.value()(parser, &index);
    if (!parse_result) {
      return make_unexpected(parse_result.error());
    }
    if (!parse_result.value()) {
      break;
    }
  }

  return index;
}

std::string_view ParseField(CmdArgParser* parser) {
  std::string_view field = parser->Next();
  if (absl::StartsWith(field, "@"sv)) {
    field.remove_prefix(1);  // remove leading @ if exists
  }
  return field;
}

std::optional<std::string_view> ParseFieldWithAtSign(CmdArgParser* parser) {
  std::string_view field = parser->Next();
  if (absl::StartsWith(field, "@"sv)) {
    field.remove_prefix(1);  // remove leading @
  } else {
    if (absl::GetFlag(FLAGS_search_reject_legacy_field)) {
      return std::nullopt;
    }
  }
  return field;
}

void ParseNumericFilter(CmdArgParser* parser, SearchParams* params) {
  auto field = ParseField(parser);
  size_t lo = parser->Next<size_t>();
  size_t hi = parser->Next<size_t>();
  if (auto it = params->optional_filters.find(field); it != params->optional_filters.end()) {
    search::OptionalNumericFilter* numeric_filter =
        dynamic_cast<search::OptionalNumericFilter*>(it->second.get());
    numeric_filter->AddRange(lo, hi);
  } else {
    params->optional_filters.emplace(field,
                                     std::make_unique<search::OptionalNumericFilter>(lo, hi));
  }
}

std::vector<FieldReference> ParseLoadOrReturnFields(CmdArgParser* parser, bool is_load) {
  // TODO: Change to num_strings. In Redis strings number is expected. For example: LOAD 3 $.a AS a
  std::vector<FieldReference> fields;
  size_t num_fields = parser->Next<size_t>();

  while (parser->HasNext() && num_fields--) {
    string_view field = is_load ? ParseField(parser) : parser->Next();
    string_view alias;
    parser->Check("AS", &alias);
    fields.emplace_back(field, alias);
  }
  return fields;
}

search::QueryParams ParseQueryParams(CmdArgParser* parser) {
  search::QueryParams params;
  size_t num_args = parser->Next<size_t>();
  while (parser->HasNext() && params.Size() * 2 < num_args) {
    auto [k, v] = parser->Next<string_view, string_view>();
    params[k] = v;
  }
  return params;
}

ParseResult<SearchParams> ParseSearchParams(CmdArgParser* parser) {
  SearchParams params;

  const size_t max_results = absl::GetFlag(FLAGS_MAXSEARCHRESULTS);

  while (parser->HasNext()) {
    // [LIMIT offset total]
    if (parser->Check("LIMIT")) {
      params.limit_offset = parser->Next<size_t>();
      params.limit_total = parser->Next<size_t>();
      if (params.limit_total > max_results) {
        return CreateSyntaxError(absl::StrFormat("LIMIT exceeds maximum of %d", max_results));
      }
    } else if (parser->Check("LOAD")) {
      if (params.return_fields) {
        return CreateSyntaxError("LOAD cannot be applied after RETURN"sv);
      }

      params.load_fields = ParseLoadOrReturnFields(parser, true);
    } else if (parser->Check("RETURN")) {
      if (params.load_fields) {
        return CreateSyntaxError("RETURN cannot be applied after LOAD"sv);
      }
      if (!params.return_fields)  // after NOCONTENT it's silently ignored
        params.return_fields = ParseLoadOrReturnFields(parser, false);
    } else if (parser->Check("NOCONTENT")) {  // NOCONTENT
      params.return_fields.emplace();
    } else if (parser->Check("PARAMS")) {  // [PARAMS num(ignored) name(ignored) knn_vector]
      params.query_params = ParseQueryParams(parser);
    } else if (parser->Check("SORTBY")) {
      FieldReference field{ParseField(parser)};
      params.sort_option =
          SearchParams::SortOption{field, parser->Check("DESC") ? SortOrder::DESC : SortOrder::ASC};
    } else if (parser->Check("FILTER")) {
      ParseNumericFilter(parser, &params);
    } else if (parser->Check("WITHSORTKEYS")) {
      params.with_sortkeys = true;
    } else {
      // Unsupported parameters are ignored for now
      parser->Skip(1);
    }
  }

  params.limit_total = std::min(params.limit_total, max_results);

  return params;
}

ParseResult<aggregate::SortParams> ParseAggregatorSortParams(CmdArgParser* parser) {
  size_t strings_num = parser->Next<size_t>();

  aggregate::SortParams sort_params;
  sort_params.fields.reserve(strings_num / 2);

  while (parser->HasNext() && strings_num > 0) {
    std::string_view potential_field =
        parser->Peek();  // Peek to get the field name for potential error message
    std::optional<std::string_view> parsed_field = ParseFieldWithAtSign(parser);
    if (!parsed_field) {
      return CreateSyntaxError(
          absl::StrCat("SORTBY field name '", potential_field, "' must start with '@'"));
    }
    strings_num--;

    SortOrder sord_order = SortOrder::ASC;
    if (strings_num > 0) {
      auto order = parser->TryMapNext("ASC", SortOrder::ASC, "DESC", SortOrder::DESC);
      if (order) {
        sord_order = order.value();
        strings_num--;
      }
    }

    sort_params.fields.emplace_back(*parsed_field, sord_order);
  }

  if (strings_num) {
    return CreateSyntaxError("bad arguments for SORTBY: specified invalid number of strings"sv);
  }

  if (parser->Check("MAX")) {
    sort_params.max = parser->Next<size_t>();
  }

  return sort_params;
}

std::pair<std::string_view, std::string_view> Split(std::string_view s, char delim) {
  return absl::StrSplit(s, absl::MaxSplits(absl::ByChar(delim), 1));
}

// Example: LOAD_FROM index AS alias num_conditions condition [condition ...] [QUERY query]
// condition is in the form index.field=foreign_index.field or foreign_index.field=index.field
ParseResult<AggregateParams::JoinParams> ParseAggregatorJoinParams(
    CmdArgParser* parser, absl::flat_hash_set<std::string>* known_indexes) {
  AggregateParams::JoinParams join_params;
  join_params.index = parser->Next<std::string>();
  if (parser->Check("AS")) {
    join_params.index_alias = parser->Next<std::string>();
  } else {
    join_params.index_alias = join_params.index;
  }

  if (known_indexes->contains(join_params.index_alias)) {
    return CreateSyntaxError(
        absl::StrCat("Duplicate index alias in LOAD_FROM: '", join_params.index_alias, "'"));
  }

  // Validate index name
  known_indexes->insert(join_params.index_alias);

  size_t num_fields = parser->Next<size_t>();
  join_params.conditions.reserve(num_fields);
  // Conditions are in the form index.field=foreign_index.field or foreign_index.field=index.field
  while (parser->HasNext() && num_fields > 0) {
    auto [left, right] = Split(parser->Next(), '=');
    auto [l_index, l_field] = Split(left, '.');
    auto [r_index, r_field] = Split(right, '.');

    if (right.empty() || l_field.empty() || r_field.empty()) {
      return CreateSyntaxError(
          "bad arguments for LOAD_FROM: expected 'index.field=foreign_index.field'"sv);
    }

    if (!known_indexes->contains(l_index) || !known_indexes->contains(r_index)) {
      return CreateSyntaxError(absl::StrCat("bad arguments for LOAD_FROM: unknown index '",
                                            known_indexes->contains(l_index) ? r_index : l_index,
                                            "'"));
    }

    if (l_index == join_params.index_alias) {
      join_params.conditions.emplace_back(l_field, r_index, r_field);
    } else if (r_index == join_params.index_alias) {
      join_params.conditions.emplace_back(r_field, l_index, l_field);
    } else {
      return CreateSyntaxError(absl::StrCat(
          "bad arguments for LOAD_FROM: one of the field must be from the current index '",
          join_params.index_alias, "'. Got '", left, "' and '", right, "'"));
    }

    num_fields--;
  }

  parser->Check("QUERY", &join_params.query);

  return join_params;
}

ParseResult<AggregateParams> ParseAggregatorParams(CmdArgParser* parser) {
  AggregateParams params;
  tie(params.index, params.query) = parser->Next<string_view, string_view>();

  // Parse LOAD count field [field ...]
  // LOAD options are at the beginning of the query, so we need to parse them first
  while (parser->HasNext() && parser->Check("LOAD")) {
    auto fields = ParseLoadOrReturnFields(parser, true);
    if (!params.load_fields.has_value())
      params.load_fields = std::move(fields);
    else
      params.load_fields->insert(params.load_fields->end(), make_move_iterator(fields.begin()),
                                 make_move_iterator(fields.end()));
  }

  // Used for join params
  absl::flat_hash_set<std::string> current_known_indexes;
  current_known_indexes.insert(std::string{params.index});
  while (parser->HasNext() && parser->Check("LOAD_FROM")) {
    auto join_params = ParseAggregatorJoinParams(parser, &current_known_indexes);
    if (!join_params) {
      return make_unexpected(join_params.error());
    }
    params.joins.emplace_back(std::move(join_params).value());
  }
  const bool joining_enabled = !params.joins.empty();

  while (parser->HasNext()) {
    // GROUPBY nargs property [property ...]
    if (parser->Check("GROUPBY")) {
      size_t num_fields = parser->Next<size_t>();

      std::vector<std::string> fields;
      fields.reserve(num_fields);
      while (parser->HasNext() && num_fields > 0) {
        auto parsed_field = ParseFieldWithAtSign(parser);
        if (!parsed_field) {
          return CreateSyntaxError("bad arguments: Field name should start with '@'"sv);
        }

        fields.emplace_back(*parsed_field);
        num_fields--;
      }

      vector<aggregate::Reducer> reducers;
      while (parser->Check("REDUCE")) {
        using RF = aggregate::ReducerFunc;
        auto func_name =
            parser->TryMapNext("COUNT", RF::COUNT, "COUNT_DISTINCT", RF::COUNT_DISTINCT, "SUM",
                               RF::SUM, "AVG", RF::AVG, "MAX", RF::MAX, "MIN", RF::MIN);

        if (!func_name) {
          return CreateSyntaxError(absl::StrCat("reducer function ", parser->Next(), " not found"));
        }

        auto func = aggregate::FindReducerFunc(*func_name);
        auto nargs = parser->Next<size_t>();

        string source_field;
        if (nargs > 0) {
          source_field = ParseField(parser);
        }

        parser->ExpectTag("AS");
        string result_field = parser->Next<string>();

        reducers.push_back(
            aggregate::Reducer{std::move(source_field), std::move(result_field), func});
      }

      params.steps.push_back(aggregate::MakeGroupStep(std::move(fields), std::move(reducers)));
      continue;
    }

    // SORTBY nargs
    if (parser->Check("SORTBY")) {
      auto sort_params = ParseAggregatorSortParams(parser);
      if (!sort_params) {
        return make_unexpected(sort_params.error());  // Propagate the specific error
      }

      if (!joining_enabled || params.join_agg_params.HasValue()) {
        params.steps.push_back(aggregate::MakeSortStep(std::move(sort_params).value()));
      } else {
        params.join_agg_params.sort = std::move(sort_params).value();
      }
      continue;
    }

    // LIMIT
    if (parser->Check("LIMIT")) {
      auto [offset, num] = parser->Next<size_t, size_t>();
      if (!joining_enabled || params.join_agg_params.HasLimit()) {
        params.steps.push_back(aggregate::MakeLimitStep(offset, num));
      } else {
        params.join_agg_params.limit_offset = offset;
        params.join_agg_params.limit_total = num;
      }
      continue;
    }

    // PARAMS
    if (parser->Check("PARAMS")) {
      params.params = ParseQueryParams(parser);
      continue;
    }

    if (parser->Check("LOAD")) {
      return CreateSyntaxError("LOAD cannot be applied after projectors or reducers"sv);
    }

    if (parser->Check("LOAD_FROM")) {
      return CreateSyntaxError("LOAD_FROM cannot be applied after projectors or reducers"sv);
    }

    return CreateSyntaxError(absl::StrCat("Unknown clause: ", parser->Peek()));
  }

  return params;
}

// Data that we need at the first step of join
struct PreprocessedJoinData {
  struct SortParam {
    size_t index;
    size_t field_index;
    SortOrder order;
  };

  explicit PreprocessedJoinData(size_t n)
      : indexes(n), needed_fields(n), joins_per_index(n), fields_to_load_per_index(n) {
  }

  // Index names
  join::Vector<std::string_view> indexes;
  // Maps index alias to its index in the indexes vector
  absl::flat_hash_map<std::string_view, size_t> alias_to_index;

  // For each index we store the fields that are needed for the join
  join::Vector<join::Vector<std::string_view>> needed_fields;
  // For each index we store the join expressions that are used to join this index
  join::Vector<join::JoinExpressionsVec> joins_per_index;
  // For each index we store the fields that should be loaded from the document after the join
  join::Vector<join::Vector<std::string_view>> fields_to_load_per_index;
  // Maps field names to the shard_id and their index in the needed_fields vector
  join::Vector<SortParam> sort_params;
};

io::Result<PreprocessedJoinData, ErrorReply> PreprocessDataForJoin(std::string_view index,
                                                                   const AggregateParams& params) {
  DCHECK(!params.joins.empty());

  const size_t n = params.joins.size();
  PreprocessedJoinData result(n + 1);

  // Collect aliases and initialize result.indexes
  result.alias_to_index.reserve(n);
  result.alias_to_index[index] = 0;
  result.indexes[0] = index;
  for (size_t i = 0; i < n; ++i) {
    result.alias_to_index[params.joins[i].index_alias] = i + 1;
    result.indexes[i + 1] = params.joins[i].index;
  }

  // Collect needed fields for joins for each index
  // needed_fields[i] contains fields needed for index i
  // for each field name we store its index
  // Also collect joins for each index
  std::vector<absl::flat_hash_map<std::string_view, size_t>> needed_fields(n + 1);

  auto insert = [&](std::string_view field, auto* map) -> size_t {
    auto it = map->find(field);
    if (it == map->end()) {
      const size_t field_index = map->size();
      map->emplace(field, field_index);
      return field_index;
    }
    return it->second;
  };

  for (size_t i = 0; i < n; ++i) {
    const auto& join = params.joins[i];
    for (const auto& condition : join.conditions) {
      size_t field_index = insert(condition.field, &needed_fields[i + 1]);

      DCHECK(result.alias_to_index.contains(condition.foreign_field.first))
          << "Unknown foreign index alias: " << condition.foreign_field.first;
      size_t foreign_index = result.alias_to_index[condition.foreign_field.first];
      DCHECK_LE(foreign_index, i) << "Foreign index alias out of range: "
                                  << condition.foreign_field.first;

      size_t foreign_field_index =
          insert(condition.foreign_field.second, &needed_fields[foreign_index]);

      // Update joins for this index
      result.joins_per_index[i + 1].emplace_back(
          join::JoinExpression{field_index, foreign_index, foreign_field_index});
    }
  }

  // Collect fields needed for sorting
  // Max option will be temprorary ignored
  if (params.join_agg_params.sort) {
    for (const auto& sort_field : params.join_agg_params.sort.value().fields) {
      auto [index_alias, field_name] = Split(sort_field.first, '.');

      auto it = result.alias_to_index.find(index_alias);
      if (it == result.alias_to_index.end()) {
        return CreateSyntaxError(absl::StrCat("Unknown index alias '", index_alias,
                                              "' in the SORTBY option. Field: '", field_name, "'"));
      }

      size_t index = it->second;
      size_t field_index = insert(field_name, &needed_fields[index]);
      result.sort_params.push_back(
          PreprocessedJoinData::SortParam{index, field_index, sort_field.second});
    }
  }

  // Map them to the result.needed_fields
  for (size_t i = 0; i <= n; ++i) {
    auto& from = needed_fields[i];
    auto& to = result.needed_fields[i];

    to.resize(from.size());
    for (const auto& [field_name, field_index] : from) {
      to[field_index] = field_name;
    }
  }

  // Initialize fields_to_load_per_index
  for (const auto& field : params.load_fields.value_or(std::vector<FieldReference>{})) {
    auto [index_alias, field_name] = Split(field.Name(), '.');

    auto it = result.alias_to_index.find(index_alias);
    if (it == result.alias_to_index.end()) {
      return CreateSyntaxError(absl::StrCat("Unknown index alias '", index_alias,
                                            "' in the LOAD option. Field: '", field_name, "'"));
    }

    result.fields_to_load_per_index[it->second].emplace_back(field_name);
  }

  return result;
}

// Merge preaggregated results from all shards for each index
join::Vector<join::Vector<join::Entry>> MergePreaggregatedShardJoinData(
    absl::Span<const std::vector<join::Vector<join::OwnedEntry>>> preaggregated_shard_data) {
  if (preaggregated_shard_data.empty()) {
    return {};
  }

  // indexes_entries[i] contains the preaggregated data for index i
  const size_t indexes_count = preaggregated_shard_data[0].size();
  join::Vector<join::Vector<join::Entry>> indexes_entries(indexes_count);
  for (size_t i = 0; i < indexes_count; ++i) {
    auto& entries = indexes_entries[i];

    size_t num_docs = 0;
    for (size_t j = 0; j < shard_set->size(); ++j) {
      num_docs += preaggregated_shard_data[j][i].size();
    }

    entries.reserve(num_docs);
    for (size_t j = 0; j < shard_set->size(); ++j) {
      for (const auto& entry : preaggregated_shard_data[j][i]) {
        join::Vector<join::JoinableValue> field_values;
        field_values.reserve(entry.second.size());

        auto insert_copy = [&field_values](const auto& field_value) {
          field_values.emplace_back(field_value);
        };

        for (const auto& field_value : entry.second) {
          std::visit(insert_copy, field_value);
        }

        entries.emplace_back(entry.first, std::move(field_values));
      }
    }
  }

  return indexes_entries;
}

join::Vector<join::Vector<join::Key>> DoJoin(
    absl::Span<const std::vector<join::Vector<join::OwnedEntry>>> preaggregated_shard_data,
    const AggregateParams& params, const PreprocessedJoinData& join_data) {
  using join::KeyIndexes;

  auto indexes_entries = MergePreaggregatedShardJoinData(preaggregated_shard_data);

  auto sort_and_limit = [&](std::vector<KeyIndexes>* joined_entries) {
    const size_t offset = params.join_agg_params.limit_offset;
    const size_t total = params.join_agg_params.limit_total;
    if (offset >= joined_entries->size()) {
      joined_entries->clear();
      return;
    }

    const auto& sort_params = join_data.sort_params;
    auto comparator = [&](const KeyIndexes& l, const KeyIndexes& r) {
      for (const auto& sort_param : sort_params) {
        size_t index = sort_param.index;
        const join::JoinableValue& l_value =
            indexes_entries[index][l[index]].second[sort_param.field_index];
        const join::JoinableValue& r_value =
            indexes_entries[index][r[index]].second[sort_param.field_index];

        if (l_value == r_value) {
          continue;
        }
        return sort_param.order == SortOrder::ASC ? l_value < r_value : l_value > r_value;
      }
      return false;
    };

    size_t limit = offset + total;
    if (!sort_params.empty()) {
      if (limit >= joined_entries->size()) {
        std::sort(joined_entries->begin(), joined_entries->end(), std::move(comparator));
      } else {
        std::partial_sort(joined_entries->begin(), joined_entries->begin() + limit,
                          joined_entries->end(), std::move(comparator));
        joined_entries->resize(limit);
      }
    }

    size_t new_limit = std::min(limit, joined_entries->size());
    if (offset) {
      for (size_t i = offset; i < new_limit; ++i) {
        auto& dest = (*joined_entries)[i - offset];
        auto& src = (*joined_entries)[i];
        DCHECK(dest.size() == src.size());
        dest = std::move(src);
      }
    }

    size_t new_size = std::min(total, joined_entries->size() - offset);
    joined_entries->resize(new_size);
  };

  return join::JoinAllIndexes(indexes_entries, join_data.joins_per_index, sort_and_limit);
}

std::vector<aggregate::DocValues> MergeJoinedKeysWithData(
    const AggregateParams& agg_params, const PreprocessedJoinData& join_data,
    absl::Span<const join::Vector<join::Key>> joined_entries,
    absl::Span<const std::vector<ShardDocIndex::FieldsValuesPerDocId>> shard_keys_data) {
  std::vector<aggregate::DocValues> merged_data;
  merged_data.reserve(joined_entries.size());

  const size_t indexes_count = join_data.indexes.size();
  const auto& fields_per_index = join_data.fields_to_load_per_index;

  for (const auto& entry : joined_entries) {
    aggregate::DocValues doc_values;

    // First reserve space for the total number of fields
    size_t docs_count = 0;
    for (size_t i = 0; i < indexes_count; ++i) {
      docs_count += fields_per_index[i].size();
    }
    doc_values.reserve(docs_count);

    for (size_t i = 0; i < indexes_count; ++i) {
      std::string_view index_alias =
          (i == 0) ? agg_params.index : agg_params.joins[i - 1].index_alias;

      const auto [shard_id, doc_id] = entry[i];
      const auto& field_values_per_doc_id = shard_keys_data[shard_id][i];

      auto it = field_values_per_doc_id.find(doc_id);
      if (it == field_values_per_doc_id.end()) {
        /* This doc id was joined but not found on the second step. This can happen due to
         * expiration for example. For now, just skip it */
        continue;
      }

      const auto& field_values = it->second;

      for (size_t j = 0; j < fields_per_index[i].size(); ++j) {
        std::string_view field_alias = fields_per_index[i][j];  // tmp alias is identifier
        doc_values.emplace(absl::StrCat(index_alias, "."sv, field_alias), field_values[j]);
      }
    }

    merged_data.push_back(std::move(doc_values));
  }
  return merged_data;
}

auto SortableValueSender(RedisReplyBuilder* rb) {
  return Overloaded{
      [rb](monostate) { rb->SendNull(); },
      [rb](double d) { rb->SendDouble(d); },
      [rb](const string& s) { rb->SendBulkString(s); },
  };
}

void SendSerializedDoc(const SerializedSearchDoc& doc, SinkReplyBuilder* builder) {
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  auto sortable_value_sender = SortableValueSender(rb);

  rb->StartCollection(doc.values.size(), CollectionType::MAP);
  for (const auto& [k, v] : doc.values) {
    rb->SendBulkString(k);
    visit(sortable_value_sender, v);
  }
}

template <typename T>
void PartialSort(absl::Span<SerializedSearchDoc*> docs, size_t limit, SortOrder order,
                 T SerializedSearchDoc::*field) {
  auto cb = [order, field](SerializedSearchDoc* l, SerializedSearchDoc* r) {
    return order == SortOrder::ASC ? l->*field < r->*field : r->*field < l->*field;
  };
  partial_sort(docs.begin(), docs.begin() + min(limit, docs.size()), docs.end(), cb);
}

void SearchReply(const SearchParams& params,
                 std::optional<search::KnnScoreSortOption> knn_sort_option,
                 absl::Span<SearchResult> results, SinkReplyBuilder* builder, bool is_css) {
  size_t total_hits = 0;
  absl::InlinedVector<SerializedSearchDoc*, 5> docs;
  docs.reserve(results.size());
  for (auto& shard_results : results) {
    total_hits += shard_results.total_hits;
    for (auto& doc : shard_results.docs) {
      docs.push_back(&doc);
    }
  }

  // Reorder and cut KNN results before applying SORT and LIMIT
  optional<string> knn_score_ret_field;
  bool ignore_sort = false;
  if (knn_sort_option) {
    total_hits = min(total_hits, knn_sort_option->limit);
    PartialSort(absl::MakeSpan(docs), total_hits, SortOrder::ASC, &SerializedSearchDoc::knn_score);
    docs.resize(min(docs.size(), knn_sort_option->limit));

    ignore_sort = !params.sort_option || params.sort_option->IsSame(*knn_sort_option);
    if (params.ShouldReturnField(knn_sort_option->score_field_alias))
      knn_score_ret_field = knn_sort_option->score_field_alias;
  }

  // Apply LIMIT
  size_t offset = 0;
  size_t limit = 0;
  if (is_css) {
    limit = std::min(docs.size(), params.limit_total + params.limit_offset);
  } else {
    offset = std::min(params.limit_offset, docs.size());
    limit = std::min(docs.size() - offset, params.limit_total);
  }
  const size_t end = limit + offset;

  // Apply SORTBY if its different from the KNN sort
  if (params.sort_option && !ignore_sort)
    PartialSort(absl::MakeSpan(docs), end, params.sort_option->order,
                &SerializedSearchDoc::sort_score);

  const bool reply_with_ids_only = params.IdsOnly();
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  const size_t items_per_field = (reply_with_ids_only ? 1 : 2) + params.with_sortkeys;
  RedisReplyBuilder::ArrayScope scope{rb, limit * items_per_field + 1};

  Overloaded sortable_value_sender{
      [rb](monostate) { rb->SendNull(); },
      [rb](double d) { rb->SendBulkString(absl::StrCat("#", d)); },
      [rb](const string& s) { rb->SendBulkString("$" + s); },
  };

  rb->SendLong(total_hits);
  for (size_t i = offset; i < end; i++) {
    rb->SendBulkString(docs[i]->key);
    if (params.with_sortkeys) {
      visit(sortable_value_sender, docs[i]->sort_score);
    }

    if (!reply_with_ids_only) {
      if (knn_score_ret_field)
        docs[i]->values[*knn_score_ret_field] = docs[i]->knn_score;

      SendSerializedDoc(*docs[i], builder);
    }
  }
}

// Warms up the query parser to avoid first-call slowness
void WarmupQueryParser() {
  static std::once_flag warmed_up;
  std::call_once(warmed_up, []() {
    search::QueryParams params;
    search::QueryDriver driver{};
    driver.SetParams(&params);
    driver.SetInput(std::string{""});
    (void)search::Parser (&driver)();
  });
}

vector<SearchResult> SearchGlobalHnswIndex(
    const search::AstKnnNode* knn, const shared_ptr<search::HnswVectorIndex>& index,
    const std::string_view index_name,
    const std::optional<search::KnnScoreSortOption>& knn_score_option,
    const std::vector<SearchResult>& sharded_prefilter_docs, const SearchParams& params,
    const CommandContext& cmd_cntx) {
  std::vector<SearchResult> results(1);

  std::optional<std::vector<search::GlobalDocId>> prefilter_global_docs_ids = std::nullopt;

  // Quick lookup to match global id to serialized doc
  std::map<search::GlobalDocId, const SerializedSearchDoc*> prefilter_docs_lookup;

  const bool has_prefilter_docs = knn->HasPreFilter();
  const ShardId shard_size = sharded_prefilter_docs.size();

  // We have pre filter docs so all documents should already be fetched
  if (has_prefilter_docs) {
    std::vector<search::GlobalDocId> global_doc_ids;
    for (size_t shard_id = 0; shard_id < shard_size; shard_id++) {
      for (auto& doc : sharded_prefilter_docs[shard_id].docs) {
        auto global_doc_id = search::CreateGlobalDocId(shard_id, doc.id);
        global_doc_ids.emplace_back(global_doc_id);
        prefilter_docs_lookup[global_doc_id] = &doc;
      }
    }
    prefilter_global_docs_ids = std::move(global_doc_ids);
  }

  // Search HNSW index
  std::vector<std::pair<float, search::GlobalDocId>> knn_results;

  if (prefilter_global_docs_ids) {
    VLOG(1) << "Searching HNSW index with prefilter size: " << prefilter_global_docs_ids->size();
    if (prefilter_global_docs_ids->size() < absl::GetFlag(FLAGS_subset_knn_search_threshold)) {
      knn_results = index->SubsetKnn(knn->vec.first.get(), knn->limit, *prefilter_global_docs_ids);
    } else {
      knn_results =
          index->Knn(knn->vec.first.get(), knn->limit, knn->ef_runtime, *prefilter_global_docs_ids);
    }
  } else {
    knn_results = index->Knn(knn->vec.first.get(), knn->limit, knn->ef_runtime);
  }

  std::vector<SerializedSearchDoc> knn_search_serialized_docs;
  knn_search_serialized_docs.reserve(knn_results.size());

  // Serialized docs for each shard
  std::vector<std::vector<SerializedSearchDoc>> shard_docs(shard_size);

  for (const auto& [score, global_doc_id] : knn_results) {
    if (has_prefilter_docs) {
      knn_search_serialized_docs.emplace_back(*prefilter_docs_lookup[global_doc_id]);
      knn_search_serialized_docs.back().knn_score = score;
    } else {
      // Create SerializedSearchDoc and fill only knn information
      auto [shard_id, local_doc_id] = search::DecomposeGlobalDocId(global_doc_id);
      SerializedSearchDoc doc;
      doc.id = local_doc_id;
      doc.knn_score = score;
      shard_docs[shard_id].emplace_back(doc);
    }
  }

  // If we have prefilter docs we don't need to fetch docs so can return early
  if (has_prefilter_docs) {
    results[0].total_hits = knn_search_serialized_docs.size();
    results[0].docs = std::move(knn_search_serialized_docs);
    return results;
  }

  // Do we need to set sort score
  bool set_sort_score = params.sort_option && !params.sort_option->IsSame(*knn_score_option);

  // Do we need to remove sort field from response
  bool remove_sort_field = false;

  std::optional<std::vector<FieldReference>> return_fields = params.return_fields;

  // If we don't return all fields
  if (return_fields) {
    // We have sort_option and it's different than knn score
    if (set_sort_score) {
      bool found_sort_return_field = false;
      for (const auto& return_field : *return_fields) {
        if (params.sort_option->field.Name() == return_field.Name()) {
          found_sort_return_field = true;
          break;
        }
      }
      // Sort return field is not found so we need to add it for request and
      // remove this field in response
      if (!found_sort_return_field) {
        (*return_fields).push_back(params.sort_option->field);
        remove_sort_field = true;
      }
    }
  }

  // Indicator if we serialized document on shard
  std::vector<std::vector<bool>> shard_docs_serialized_indicator(shard_size);

  // Fetch all docs from shards
  cmd_cntx.tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    auto* index = es->search_indices()->GetIndex(index_name);

    // No index found or no docs on this shard
    if (!index || shard_docs[es->shard_id()].empty()) {
      return OpStatus::OK;
    }

    const auto& schema = index->GetInfo().base_index.schema;

    // Resize shard with default `true` value
    shard_docs_serialized_indicator[es->shard_id()].resize(shard_docs[es->shard_id()].size(), true);

    for (size_t i = 0; i < shard_docs[es->shard_id()].size(); i++) {
      auto& shard_doc = shard_docs[es->shard_id()][i];
      if (auto doc =
              index->SerializeDocWithKey(shard_doc.id, t->GetOpArgs(es), schema, return_fields);
          doc) {
        auto& [key, fields] = *doc;

        // Handle sort_score and remove field if we don't need it
        search::SortableValue sort_score = std::monostate{};
        if (set_sort_score) {
          sort_score = fields[params.sort_option->field.Name()];
          if (remove_sort_field) {
            fields.erase(params.sort_option->field.Name());
          }
        }
        shard_doc.key = std::string{key};
        shard_doc.values = std::move(fields);
        shard_doc.sort_score = sort_score;
      } else {
        // If we couldn't serialize requested doc
        shard_docs_serialized_indicator[es->shard_id()][i] = false;
      }
    }
    return OpStatus::OK;
  });

  // Transform shard results back to
  size_t shard_id = 0;
  std::for_each(shard_docs.begin(), shard_docs.end(),
                [&](const std::vector<SerializedSearchDoc>& shard) {
                  for (size_t doc_index = 0; doc_index < shard.size(); ++doc_index) {
                    // Check if we serialized doc
                    if (shard_docs_serialized_indicator[shard_id][doc_index]) {
                      knn_search_serialized_docs.push_back(shard[doc_index]);
                    }
                  }
                  shard_id++;
                });

  results[0].total_hits = knn_search_serialized_docs.size();
  results[0].docs = std::move(knn_search_serialized_docs);

  return results;
}

// Search HNSW index for all documents within the given radius.
// Similar to SearchGlobalHnswIndex but uses RangeQuery instead of Knn.
vector<SearchResult> SearchGlobalHnswIndexRange(
    const search::AstVectorRangeNode* range, const shared_ptr<search::HnswVectorIndex>& index,
    string_view index_name, const std::optional<search::KnnScoreSortOption>& knn_score_option,
    const SearchParams& params, const CommandContext& cmd_cntx) {
  std::vector<SearchResult> results(1);
  const ShardId shard_size = shard_set->size();

  auto range_results = index->RangeQuery(range->vec.first.get(), static_cast<float>(range->radius));

  std::vector<std::vector<SerializedSearchDoc>> shard_docs(shard_size);
  for (const auto& [score, global_doc_id] : range_results) {
    auto [shard_id, local_doc_id] = search::DecomposeGlobalDocId(global_doc_id);
    SerializedSearchDoc doc;
    doc.id = local_doc_id;
    doc.knn_score = score;
    shard_docs[shard_id].emplace_back(doc);
  }

  bool set_sort_score =
      params.sort_option && (!knn_score_option || !params.sort_option->IsSame(*knn_score_option));
  bool remove_sort_field = false;
  std::optional<std::vector<FieldReference>> return_fields = params.return_fields;

  if (set_sort_score && return_fields) {
    bool found_sort_field = false;
    for (const auto& rf : *return_fields) {
      if (rf.Name() == params.sort_option->field.Name()) {
        found_sort_field = true;
        break;
      }
    }
    if (!found_sort_field) {
      return_fields->push_back(params.sort_option->field);
      remove_sort_field = true;
    }
  }

  cmd_cntx.tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    auto* idx = es->search_indices()->GetIndex(index_name);
    if (!idx || shard_docs[es->shard_id()].empty())
      return OpStatus::OK;
    const auto& schema = idx->GetInfo().base_index.schema;
    for (auto& shard_doc : shard_docs[es->shard_id()]) {
      if (auto doc =
              idx->SerializeDocWithKey(shard_doc.id, t->GetOpArgs(es), schema, return_fields);
          doc) {
        auto& [key, fields] = *doc;
        search::SortableValue sort_score = std::monostate{};
        if (set_sort_score) {
          sort_score = fields[params.sort_option->field.Name()];
          if (remove_sort_field)
            fields.erase(params.sort_option->field.Name());
        }
        shard_doc.key = std::string{key};
        shard_doc.values = std::move(fields);
        shard_doc.sort_score = sort_score;
      }
    }
    return OpStatus::OK;
  });

  std::vector<SerializedSearchDoc> serialized_docs;
  serialized_docs.reserve(range_results.size());
  for (const auto& shard : shard_docs) {
    for (const auto& doc : shard) {
      if (!doc.key.empty())
        serialized_docs.push_back(doc);
    }
  }

  results[0].total_hits = serialized_docs.size();
  results[0].docs = std::move(serialized_docs);
  return results;
}

// Try creating global hnsw indices for given fields and return true on success
bool CreateHnswIndices(std::string_view idx_name, const DocIndex& index) {
  std::vector<std::string> created_vector_indices;
  for (const auto& [field_ident, field_info] : index.schema.fields) {
    if (!field_info.IsIndexableHnswField())
      continue;

    const auto& vparams = std::get<search::SchemaField::VectorParams>(field_info.special_params);

    bool success = GlobalHnswIndexRegistry::Instance().Create(idx_name, field_info.short_name,
                                                              vparams, index.type);
    if (!success) {
      // Clean created indices
      for (const auto& cfname : created_vector_indices)
        GlobalHnswIndexRegistry::Instance().Remove(idx_name, cfname);
      return false;
    }

    created_vector_indices.emplace_back(field_info.short_name);
  }
  return true;
}

}  // namespace

void CmdFtCreate(CmdArgList args, CommandContext* cmd_cntx) {
  WarmupQueryParser();

  auto* builder = cmd_cntx->rb();
  if (cmd_cntx->server_conn_cntx()->conn_state.db_index != 0) {
    return builder->SendError("Cannot create index on db != 0"sv);
  }

  CmdArgParser parser{args};
  string_view idx_name = parser.Next();

  // Parse optional NX (Only create if not exists) parameter for internal usage
  bool is_NX = parser.Check("NX");

  bool is_cross_shard = parser.Check("CSS");

  auto parsed_index = CreateDocIndex(idx_name, &parser);
  if (SendErrorIfOccurred(parsed_index, &parser, cmd_cntx)) {
    return;
  }

  // Check if index already exists
  atomic_uint exists_cnt = 0;
  cmd_cntx->tx()->Execute(
      [idx_name, &exists_cnt](auto* tx, auto* es) {
        if (es->search_indices()->GetIndex(idx_name) != nullptr)
          exists_cnt.fetch_add(1, std::memory_order_relaxed);
        return OpStatus::OK;
      },
      false);

  DCHECK(exists_cnt == 0u || exists_cnt == shard_set->size());

  if (exists_cnt.load(memory_order_relaxed) > 0) {
    cmd_cntx->tx()->Conclude();
    return is_NX ? builder->SendOk() : builder->SendError("Index already exists");
  }

  if (absl::GetFlag(FLAGS_cluster_search) && !is_cross_shard && IsClusterEnabled()) {
    std::string args_str = absl::StrJoin(args.subspan(1), " ");
    std::string cmd = absl::StrCat("FT.CREATE ", idx_name, " CSS ", args_str);

    // TODO add processing of the reply to make sure index was created successfully on all shards,
    // and prevent simultaneous creation of the same index.
    auto req_future = cluster::Coordinator::Current().DispatchAll(cmd, [](const RESPObj&) {});
    // TODO add error handling
    CHECK(!req_future.Get());
  }

  if (!CreateHnswIndices(idx_name, *parsed_index)) {
    cmd_cntx->tx()->Conclude();
    return builder->SendError("Index already exists");
  }

  auto idx_ptr = make_shared<DocIndex>(std::move(parsed_index).value());
  cmd_cntx->tx()->Execute(
      [idx_name, idx_ptr](auto* tx, auto* es) {
        es->search_indices()->InitIndex(tx->GetOpArgs(es), idx_name, idx_ptr);
        return OpStatus::OK;
      },
      true);

  builder->SendOk();
}

void CmdFtAlter(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view idx_name = parser.Next();
  parser.ExpectTag("SCHEMA");
  parser.ExpectTag("ADD");
  auto* builder = cmd_cntx->rb();
  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  // First, extract existing index info
  shared_ptr<DocIndex> index_info;
  auto idx_cb = [idx_name, &index_info](auto* tx, EngineShard* es) {
    if (es->shard_id() > 0)  // all shards have the same data, fetch from first
      return OpStatus::OK;

    if (auto* idx = es->search_indices()->GetIndex(idx_name); idx != nullptr)
      index_info = make_shared<DocIndex>(idx->GetInfo().base_index);
    return OpStatus::OK;
  };
  cmd_cntx->tx()->Execute(idx_cb, false);

  if (!index_info) {
    cmd_cntx->tx()->Conclude();
    return cmd_cntx->SendError("Index not found");
  }

  // Parse additional schema
  DocIndex new_index{};
  new_index.type = index_info->type;
  auto parse_result = ParseSchema(&parser, &new_index);
  if (SendErrorIfOccurred(parse_result, &parser, cmd_cntx)) {
    cmd_cntx->tx()->Conclude();
    return;
  }

  auto& new_fields = new_index.schema;

  // For logging we copy the whole schema
  // TODO: Use a more efficient way for logging
  LOG(INFO) << "Adding "
            << DocIndexInfo{.base_index = new_index, .hnsw_metadata = {}}.BuildRestoreCommand();

  // Merge schemas
  search::Schema& schema = index_info->schema;
  schema.fields.insert(new_fields.fields.begin(), new_fields.fields.end());
  schema.field_names.insert(new_fields.field_names.begin(), new_fields.field_names.end());

  // Rebuild index
  // TODO: Introduce partial rebuild
  auto upd_cb = [idx_name, index_info](Transaction* tx, EngineShard* es) {
    (void)es->search_indices()->DropIndex(idx_name);
    es->search_indices()->InitIndex(tx->GetOpArgs(es), idx_name, index_info);
    return OpStatus::OK;
  };
  cmd_cntx->tx()->Execute(upd_cb, true);

  builder->SendOk();
}

void CmdFtDropIndex(CmdArgList args, CommandContext* cmd_cntx) {
  string_view idx_name = ArgS(args, 0);

  // Parse optional DD (Delete Documents) parameter
  bool delete_docs = args.size() > 1 && absl::EqualsIgnoreCase(args[1], "DD");

  shared_ptr<DocIndex> index_info;
  atomic_uint num_deleted{0};

  auto cb = [&](Transaction* t, EngineShard* es) {
    // Get index info from first shard for global cleanup
    if (es->shard_id() == 0) {
      if (auto* idx = es->search_indices()->GetIndex(idx_name); idx != nullptr) {
        index_info = make_shared<DocIndex>(idx->GetInfo().base_index);
      }
    }
    // Drop the index and get its pointer
    auto index = es->search_indices()->DropIndex(idx_name);
    if (!index)
      return OpStatus::OK;

    num_deleted.fetch_add(1);

    // If DD is set, delete all documents that were in the index
    if (delete_docs) {
      // Get const reference to document keys map (index will be destroyed after this scope)
      const auto& doc_keys = index->key_index().GetDocKeysMap();

      auto op_args = t->GetOpArgs(es);
      auto& db_slice = op_args.GetDbSlice();

      for (const auto& [key, doc_id] : doc_keys) {
        auto it = db_slice.FindMutable(op_args.db_cntx, key).it;
        if (IsValid(it)) {
          db_slice.Del(op_args.db_cntx, it);
        }
      }
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(cb, true);

  if (index_info) {
    for (const auto& [field_ident, field_info] : index_info->schema.fields) {
      if (field_info.type == search::SchemaField::VECTOR &&
          !(field_info.flags & search::SchemaField::NOINDEX)) {
        if (GlobalHnswIndexRegistry::Instance().Remove(idx_name, field_info.short_name)) {
          num_deleted.fetch_add(1);
        }
      }
    }
  }

  if (num_deleted == 0u)
    return cmd_cntx->SendError(IndexNotFoundMsg(idx_name));
  return cmd_cntx->rb()->SendOk();
}

void CmdFtInfo(CmdArgList args, CommandContext* cmd_cntx) {
  string_view idx_name = ArgS(args, 0);

  vector<DocIndexInfo> infos(shard_set->size());

  cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    auto* index = es->search_indices()->GetIndex(idx_name);
    if (index != nullptr)
      infos[es->shard_id()] = index->GetInfo();
    return OpStatus::OK;
  });

  // Count how many shards didn't find the index by checking empty entries.
  size_t num_notfound = std::count_if(infos.begin(), infos.end(), [](const DocIndexInfo& info) {
    return info.base_index.schema.fields.empty();
  });

  DCHECK(num_notfound == 0u || num_notfound == shard_set->size());
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (num_notfound > 0u)
    return rb->SendError(IndexNotFoundMsg(idx_name));

  DCHECK(infos.front().base_index.schema.fields.size() ==
         infos.back().base_index.schema.fields.size());

  bool indexing = false;
  float percent_indexed = 1.0;
  size_t total_num_docs = 0;
  for (const auto& info : infos) {
    total_num_docs += info.num_docs;
    indexing |= info.indexing;
    percent_indexed = std::min(percent_indexed, info.percent_indexed);
  }

  const auto& info = infos.front();
  const auto& schema = info.base_index.schema;

  rb->StartCollection(7, CollectionType::MAP);

  rb->SendSimpleString("index_name");
  rb->SendSimpleString(idx_name);

  rb->SendSimpleString("index_definition");
  {
    rb->StartCollection(3, CollectionType::MAP);
    rb->SendSimpleString("key_type");
    rb->SendSimpleString(info.base_index.type == DocIndex::JSON ? "JSON" : "HASH");
    rb->SendSimpleString("prefixes");
    rb->StartArray(info.base_index.prefixes.size());
    for (const auto& prefix : info.base_index.prefixes) {
      rb->SendBulkString(prefix);
    }
    rb->SendSimpleString("default_score");
    rb->SendLong(1);
  }

  rb->SendSimpleString("index_options");
  rb->SendEmptyArray();

  rb->SendSimpleString("attributes");
  rb->StartArray(schema.fields.size());
  for (const auto& [field_ident, field_info] : schema.fields) {
    vector<string> info;

    string_view base[] = {"identifier"sv, string_view{field_ident},
                          "attribute"sv,  field_info.short_name,
                          "type"sv,       SearchFieldTypeToString(field_info.type)};
    info.insert(info.end(), base, base + ABSL_ARRAYSIZE(base));

    if (field_info.flags & search::SchemaField::NOINDEX)
      info.emplace_back("NOINDEX"sv);

    if (field_info.flags & search::SchemaField::SORTABLE)
      info.emplace_back("SORTABLE"sv);

    if (field_info.type == search::SchemaField::NUMERIC) {
      auto& numeric_params =
          std::get<search::SchemaField::NumericParams>(field_info.special_params);
      info.emplace_back("blocksize"sv);
      info.emplace_back(std::to_string(numeric_params.block_size));
    }

    rb->SendSimpleStrArr(info);
  }

  rb->SendSimpleString("num_docs");
  rb->SendLong(total_num_docs);

  rb->SendSimpleString("indexing");
  rb->SendLong(indexing ? 1 : 0);

  rb->SendSimpleString("percent_indexed");
  rb->SendDouble(percent_indexed);
}

void CmdFtList(CmdArgList args, CommandContext* cmd_cntx) {
  atomic_int first{0};
  vector<string> names;

  cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    // Using `first` to assign `names` only once without a race
    if (first.fetch_add(1) == 0)
      names = es->search_indices()->GetIndexNames();
    return OpStatus::OK;
  });
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendBulkStrArr(names);
}

static vector<SearchResult> FtSearchCSS(std::string_view idx, std::string_view query,
                                        std::string_view args_str, const SearchParams& params) {
  vector<SearchResult> results;
  const bool sorted = params.sort_option.has_value();
  const std::string_view with_sortkeys = sorted && !params.with_sortkeys ? " WITHSORTKEYS"sv : ""sv;
  std::string cmd = absl::StrCat("FT.SEARCH ", idx, " ", query, " CSS ", args_str, with_sortkeys);

  util::fb2::Mutex mu_;
  auto req_future = cluster::Coordinator::Current().DispatchAll(cmd, [&](const RESPObj& resp_obj) {
    RESPIterator it{resp_obj};
    const auto size = it.Next<uint64_t>();

    std::lock_guard lock{mu_};
    auto& res = results.emplace_back();
    results.back().total_hits = size;

    while (it.HasNext()) {
      auto& search_doc = res.docs.emplace_back();
      search_doc.key = it.Next<std::string>();
      if (sorted) {
        auto sort_score = it.Next<std::string_view>();
        if (sort_score.empty() || (sort_score[0] != '#' && sort_score[0] != '$')) {
          it.SetError();
          break;
        }
        if (sort_score[0] == '#') {  // It's a double
          double sort_res = 0;
          if (ParseDouble(sort_score.substr(1), &sort_res)) {
            search_doc.sort_score = sort_res;
          } else {
            it.SetError();
            break;
          }
        } else {  // It's a string
          search_doc.sort_score = std::string(sort_score.substr(1));
        }
      }

      for (auto arr_fields = it.Next<RESPIterator>(); arr_fields.HasNext();) {
        auto [key, value] = arr_fields.Next<std::string, std::string>();
        search_doc.values.emplace(std::move(key), std::move(value));
      }
    }
    if (it.HasError()) {
      LOG(ERROR) << "FT.SEARCH CSS reply parsing error: " << resp_obj;
    }
  });
  // TODO add error handling
  CHECK(!req_future.Get());
  return results;
}

void CmdFtSearch(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view index_name = parser.Next();
  string_view query_str = parser.Next();

  bool is_cross_shard = parser.Check("CSS");

  auto* builder = cmd_cntx->rb();
  auto params = ParseSearchParams(&parser);
  if (SendErrorIfOccurred(params, &parser, cmd_cntx))
    return;

  // Check query string length limit
  size_t max_query_bytes = absl::GetFlag(FLAGS_search_query_string_bytes);
  if (query_str.size() > max_query_bytes) {
    return builder->SendError(
        absl::StrCat("Query string is too long, max length is ", max_query_bytes, " bytes"));
  }

  vector<SearchResult> css_docs;
  if (absl::GetFlag(FLAGS_cluster_search) && !is_cross_shard && IsClusterEnabled()) {
    std::string args_str = absl::StrJoin(args.subspan(2), " ");

    css_docs = FtSearchCSS(index_name, query_str, args_str, *params);
  }

  search::SearchAlgorithm search_algo;
  if (!search_algo.Init(query_str, &params->query_params, &params->optional_filters))
    return builder->SendError("Query syntax error");

  std::unique_ptr<search::AstNode> knn_node;
  search::AstKnnNode* knn = nullptr;

  if (search_algo.IsKnnQuery()) {
    // Check if it is HNSW node
    if (GlobalHnswIndexRegistry::Instance().Exist(index_name, search_algo.GetKnnNode()->field)) {
      knn_node = search_algo.PopKnnNode();
      knn = std::get_if<search::AstKnnNode>(knn_node.get());
    }
  }

  // Check for HNSW vector range query (mutually exclusive with KNN)
  const search::AstVectorRangeNode* hnsw_range = nullptr;
  if (!knn) {
    if (auto* vr = search_algo.GetVectorRangeNode(); vr != nullptr) {
      if (GlobalHnswIndexRegistry::Instance().Exist(index_name, vr->field))
        hnsw_range = vr;
    }
  }

  // Because our coordinator thread may not have a shard, we can't check ahead if the index exists.
  atomic<bool> index_not_found{false};
  vector<SearchResult> docs(shard_set->size());

  const bool knn_has_prefilter = knn && knn->HasPreFilter();
  bool empty_prefilter_result = true;

  // If the query does not contain knn component, or it is a hybrid query.
  // HNSW vector range has no prefilter, so skip per-shard search entirely.
  if ((!knn || knn_has_prefilter) && !hnsw_range) {
    cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
      if (auto* index = es->search_indices()->GetIndex(index_name); index)
        docs[es->shard_id()] =
            index->Search(t->GetOpArgs(es), *params, &search_algo, knn_has_prefilter);
      else
        index_not_found.store(true, memory_order_relaxed);
      return OpStatus::OK;
    });

    if (index_not_found.load(memory_order_relaxed))
      return cmd_cntx->SendError(string{index_name} + ": no such index");

    for (const auto& res : docs) {
      empty_prefilter_result &= res.docs.empty();
      if (res.error)
        return cmd_cntx->SendError(*res.error);
    }
  }

  if (knn_node && (!knn_has_prefilter || !empty_prefilter_result)) {
    auto hnsw_index = GlobalHnswIndexRegistry::Instance().Get(index_name, knn->field);
    if (!hnsw_index) {
      return builder->SendError(string{index_name} + ": no such global hnsw index");
    }
    docs = SearchGlobalHnswIndex(knn, hnsw_index, index_name, search_algo.GetKnnScoreSortOption(),
                                 docs, *params, *cmd_cntx);
  }

  auto knn_sort_option = search_algo.GetKnnScoreSortOption();

  if (hnsw_range) {
    auto hnsw_index = GlobalHnswIndexRegistry::Instance().Get(index_name, hnsw_range->field);
    if (!hnsw_index) {
      return builder->SendError(string{index_name} + ": no such global hnsw index");
    }
    if (hnsw_range->vec.second == 0) {
      return builder->SendError("Parse error of vector parameters");
    }
    if (hnsw_range->radius < 0 || std::isnan(hnsw_range->radius)) {
      return builder->SendError(
          absl::StrCat("VECTOR_RANGE radius must be non-negative, got: ", hnsw_range->radius));
    }
    if (hnsw_index->GetDim() != hnsw_range->vec.second) {
      return builder->SendError(
          absl::StrCat("Wrong vector index dimensions, got: ", hnsw_range->vec.second,
                       ", expected: ", hnsw_index->GetDim()));
    }
    if (!hnsw_range->score_alias.empty())
      knn_sort_option =
          search::KnnScoreSortOption{hnsw_range->score_alias, std::numeric_limits<size_t>::max()};
    docs = SearchGlobalHnswIndexRange(hnsw_range, hnsw_index, index_name, knn_sort_option, *params,
                                      *cmd_cntx);
  }

  // TODO add merging of CSS results with local results (SORT, LIMIT, etc)
  docs.insert(docs.end(), std::make_move_iterator(css_docs.begin()),
              std::make_move_iterator(css_docs.end()));

  SearchReply(*params, knn_sort_option, absl::MakeSpan(docs), builder, is_cross_shard);
}

void CmdFtProfile(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};

  string_view index_name = parser.Next();
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (!parser.Check("SEARCH") && !parser.Check("AGGREGATE")) {
    return rb->SendError("no `SEARCH` or `AGGREGATE` provided");
  }

  parser.Check("LIMITED");  // TODO: Implement limited profiling
  parser.ExpectTag("QUERY");

  string_view query_str = parser.Next();

  auto params = ParseSearchParams(&parser);
  if (SendErrorIfOccurred(params, &parser, cmd_cntx))
    return;

  search::SearchAlgorithm search_algo;
  if (!search_algo.Init(query_str, &params->query_params))
    return cmd_cntx->SendError("query syntax error");

  search_algo.EnableProfiling();

  absl::Time start = absl::Now();
  const size_t shards_count = shard_set->size();

  // Because our coordinator thread may not have a shard, we can't check ahead if the index exists.
  std::atomic<bool> index_not_found{false};
  std::vector<SearchResult> search_results(shards_count);
  std::vector<absl::Duration> profile_results(shards_count);

  cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    auto* index = es->search_indices()->GetIndex(index_name);
    if (!index) {
      index_not_found.store(true, memory_order_relaxed);
      return OpStatus::OK;
    }

    const ShardId shard_id = es->shard_id();

    auto shard_start = absl::Now();
    search_results[shard_id] = index->Search(t->GetOpArgs(es), *params, &search_algo, false);
    profile_results[shard_id] = {absl::Now() - shard_start};

    return OpStatus::OK;
  });

  if (index_not_found.load())
    return rb->SendError(std::string{index_name} + ": no such index");

  auto took = absl::Now() - start;

  bool result_is_empty = false;
  size_t total_docs = 0;
  size_t total_serialized = 0;
  for (const auto& result : search_results) {
    if (!result.error) {
      total_docs += result.total_hits;
      total_serialized += result.docs.size();
    } else {
      result_is_empty = true;
    }
  }

  // First element -> Result of the search command
  // Second element -> Profile information
  rb->StartArray(2);

  // Result of the search command
  if (!result_is_empty) {
    SearchReply(*params, search_algo.GetKnnScoreSortOption(), absl::MakeSpan(search_results), rb,
                false);
  } else {
    rb->StartArray(1);
    rb->SendLong(0);
  }

  // Profile information
  rb->StartArray(shards_count + 1);

  // General stats
  rb->StartCollection(3, CollectionType::MAP);
  rb->SendBulkString("took");
  rb->SendLong(absl::ToInt64Microseconds(took));
  rb->SendBulkString("hits");
  rb->SendLong(static_cast<long>(total_docs));
  rb->SendBulkString("serialized");
  rb->SendLong(static_cast<long>(total_serialized));

  // Per-shard stats
  for (size_t shard_id = 0; shard_id < shards_count; shard_id++) {
    rb->StartCollection(2, CollectionType::MAP);
    rb->SendBulkString("took");
    rb->SendLong(absl::ToInt64Microseconds(profile_results[shard_id]));
    rb->SendBulkString("tree");

    const auto& search_result = search_results[shard_id];
    if (search_result.error || !search_result.profile || search_result.profile->events.empty()) {
      rb->SendEmptyArray();
      continue;
    }

    const auto& events = search_result.profile->events;
    for (size_t i = 0; i < events.size(); i++) {
      const auto& event = events[i];

      size_t children = 0;
      size_t children_micros = 0;
      for (size_t j = i + 1; j < events.size(); j++) {
        if (events[j].depth == event.depth)
          break;
        if (events[j].depth == event.depth + 1) {
          children++;
          children_micros += events[j].micros;
        }
      }

      rb->StartCollection(4 + (children > 0), CollectionType::MAP);
      rb->SendSimpleString("total_time");
      rb->SendLong(event.micros);
      rb->SendSimpleString("operation");
      rb->SendSimpleString(event.descr);
      rb->SendSimpleString("self_time");
      rb->SendLong(event.micros - children_micros);
      rb->SendSimpleString("procecssed");
      rb->SendLong(event.num_processed);

      if (children > 0) {
        rb->SendSimpleString("children");
        rb->StartArray(children);
      }
    }
  }
}

void CmdFtTagVals(CmdArgList args, CommandContext* cmd_cntx) {
  string_view index_name = ArgS(args, 0);
  string_view field_name = ArgS(args, 1);
  VLOG(1) << "FtTagVals: " << index_name << " " << field_name;

  vector<io::Result<StringVec, ErrorReply>> shard_results(shard_set->size(), StringVec{});

  cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
    if (auto* index = es->search_indices()->GetIndex(index_name); index)
      shard_results[es->shard_id()] = index->GetTagVals(field_name);
    else
      shard_results[es->shard_id()] =
          nonstd::make_unexpected(ErrorReply(IndexNotFoundMsg(index_name)));

    return OpStatus::OK;
  });

  absl::flat_hash_set<string> result_set;
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  // Check first if either shard had errors. Also merge the results into a single set.
  for (auto& res : shard_results) {
    if (res) {
      result_set.insert(make_move_iterator(res->begin()), make_move_iterator(res->end()));
    } else {
      res.error().kind = facade::kSearchErrType;
      return cmd_cntx->SendError(res.error());
    }
  }

  shard_results.clear();
  vector<string> vec(result_set.begin(), result_set.end());

  rb->SendBulkStrArr(vec, CollectionType::SET);
}

void CmdFtAggregate(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto* builder = cmd_cntx->rb();

  const auto params = ParseAggregatorParams(&parser);
  if (SendErrorIfOccurred(params, &parser, cmd_cntx))
    return;

  // Check query string length limit
  size_t max_query_bytes = absl::GetFlag(FLAGS_search_query_string_bytes);
  if (params->query.size() > max_query_bytes) {
    return builder->SendError(
        absl::StrCat("Query string is too long, max length is ", max_query_bytes, " bytes"));
  }

  std::vector<aggregate::DocValues> values;

  if (params->joins.empty()) {
    search::SearchAlgorithm search_algo;
    if (!search_algo.Init(params->query, &params->params))
      return builder->SendError("Query syntax error");

    using ResultContainer = decltype(declval<ShardDocIndex>().SearchForAggregator(
        declval<OpArgs>(), params.value(), &search_algo));

    vector<ResultContainer> query_results(shard_set->size());

    cmd_cntx->tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* es) {
      if (auto* index = es->search_indices()->GetIndex(params->index); index) {
        query_results[es->shard_id()] =
            index->SearchForAggregator(t->GetOpArgs(es), params.value(), &search_algo);
      }
      return OpStatus::OK;
    });

    // ResultContainer is absl::flat_hash_map<std::string, search::SortableValue>
    // DocValues is absl::flat_hash_map<std::string_view, SortableValue>
    // Keys of values should point to the keys of the query_results
    size_t total_values = 0;
    for (const auto& sub_results : query_results) {
      total_values += sub_results.size();
    }

    values.reserve(total_values);
    for (auto& sub_results : query_results) {
      for (auto& docs : sub_results) {
        aggregate::DocValues doc_value;
        for (auto& doc : docs) {
          doc_value[doc.first] = std::move(doc.second);
        }
        values.emplace_back(std::move(doc_value));
      }
    }
  } else {
    const size_t indexes_count = params->joins.size() + 1;

    std::vector<search::SearchAlgorithm> search_algos(indexes_count);
    if (!search_algos[0].Init(params->query, &params->params)) {
      return builder->SendError("Query syntax error");
    }

    for (size_t i = 0; i < params->joins.size(); ++i) {
      // Check join query string length limit
      if (params->joins[i].query.size() > max_query_bytes) {
        return cmd_cntx->SendError(absl::StrCat("Join query string is too long, max length is ",
                                                max_query_bytes, " bytes"));
      }

      search::QueryParams empty_params;
      if (!search_algos[i + 1].Init(params->joins[i].query, &empty_params)) {
        return cmd_cntx->SendError("Query syntax error in JOIN");
      }
    }

    auto data_for_join = PreprocessDataForJoin(params->index, *params);
    if (!data_for_join) {
      return cmd_cntx->SendError(data_for_join.error());
    }

    // preaggregated_shard_data is preaggregation results per index per shard
    // preaggregated_shard_data[shard_id][i] is the results of index i on shard shard_id
    using JoinDataVector = join::Vector<join::OwnedEntry>;
    std::vector<std::vector<JoinDataVector>> preaggregated_shard_data(
        shard_set->size(), std::vector<JoinDataVector>(indexes_count));
    cmd_cntx->tx()->Execute(
        [&](Transaction* t, EngineShard* es) {
          auto& shard_data = preaggregated_shard_data[es->shard_id()];
          for (size_t i = 0; i < indexes_count; ++i) {
            if (auto* index = es->search_indices()->GetIndex(data_for_join->indexes[i]); index) {
              shard_data[i] = index->PreagregateDataForJoin(
                  t->GetOpArgs(es), data_for_join->needed_fields[i], &search_algos[i]);
            }
          }
          return OpStatus::OK;
        },
        false);

    // Do join
    auto joined_entries = DoJoin(preaggregated_shard_data, *params, *data_for_join);

    // Collect doc_ids per index that were joined
    // Each shard stores set of doc_ids per each index that was joined
    using DocIdsSet = absl::flat_hash_set<search::DocId>;
    std::vector<std::vector<DocIdsSet>> doc_ids_per_shard(shard_set->size(),
                                                          std::vector<DocIdsSet>(indexes_count));
    for (const auto& entry : joined_entries) {
      for (size_t index = 0; index < indexes_count; index++) {
        const auto [shard_id, doc_id] = entry[index];
        doc_ids_per_shard[shard_id][index].insert(doc_id);
      }
    }

    // Load fields for keys that were joined
    std::vector<std::vector<ShardDocIndex::FieldsValuesPerDocId>> shard_keys_data_per_index(
        shard_set->size(), std::vector<ShardDocIndex::FieldsValuesPerDocId>(indexes_count));
    cmd_cntx->tx()->Execute(
        [&](Transaction* t, EngineShard* es) {
          const ShardId shard_id = es->shard_id();
          auto& shard_keys_data = shard_keys_data_per_index[shard_id];
          const auto& doc_ids_per_index = doc_ids_per_shard[shard_id];

          for (size_t i = 0; i < indexes_count; ++i) {
            if (auto* index = es->search_indices()->GetIndex(data_for_join->indexes[i]); index) {
              shard_keys_data[i] = index->LoadKeysData(t->GetOpArgs(es), doc_ids_per_index[i],
                                                       data_for_join->fields_to_load_per_index[i]);
            }
          }
          return OpStatus::OK;
        },
        true);

    // Now we have sets of keys that were joined and keys data.
    // We need to build DocValues for each joined set.
    values =
        MergeJoinedKeysWithData(*params, *data_for_join, joined_entries, shard_keys_data_per_index);
  }

  std::vector<std::string_view> load_fields;
  if (params->load_fields) {
    load_fields.reserve(params->load_fields->size());
    for (const auto& field : params->load_fields.value()) {
      load_fields.push_back(field.OutputName());
    }
  }

  auto agg_results = aggregate::Process(std::move(values), load_fields, params->steps);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto sortable_value_sender = SortableValueSender(rb);

  const size_t result_size = agg_results.values.size();
  RedisReplyBuilder::ArrayScope scope{rb, result_size + 1};
  rb->SendLong(result_size);

  for (const auto& value : agg_results.values) {
    size_t fields_count = 0;
    for (const auto& field : agg_results.fields_to_print) {
      if (value.find(field) != value.end()) {
        fields_count++;
      }
    }

    rb->StartArray(fields_count * 2);
    for (const auto& field : agg_results.fields_to_print) {
      auto it = value.find(field);
      if (it != value.end()) {
        rb->SendBulkString(field);
        std::visit(sortable_value_sender, it->second);
      }
    }
  }
}

void CmdFtSynDump(CmdArgList args, CommandContext* cmd_cntx) {
  string_view index_name = ArgS(args, 0);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  atomic_bool index_not_found{true};
  // Store per-shard synonym data
  vector<absl::flat_hash_map<std::string, absl::flat_hash_set<std::string>>> shard_term_groups(
      shard_set->size());

  // Collect synonym data from all shards
  cmd_cntx->tx()->Execute(
      [&](Transaction* t, EngineShard* es) {
        auto* index = es->search_indices()->GetIndex(index_name);
        if (!index)
          return OpStatus::OK;

        index_not_found.store(false, std::memory_order_relaxed);

        // Get synonym data from current shard
        const auto& groups = index->GetSynonyms().GetGroups();

        // Build term -> group_ids mapping for this shard
        auto& term_groups = shard_term_groups[es->shard_id()];
        for (const auto& [group_id, group] : groups) {
          for (const auto& term : group) {
            term_groups[term].insert(group_id);
          }
        }

        return OpStatus::OK;
      },
      true);

  if (index_not_found.load(std::memory_order_relaxed))
    return rb->SendError("Unknown index name");

  // Merge data from all shards into a single map
  absl::flat_hash_map<std::string, absl::flat_hash_set<std::string>> merged_term_groups;
  for (auto& shard_groups : shard_term_groups) {
    for (auto& [term, group_ids] : shard_groups) {
      auto& merged_ids = merged_term_groups[term];
      merged_ids.merge(group_ids);
    }
  }

  // Format response according to Redis protocol:
  // Array of term + array of group ids pairs
  rb->StartArray(merged_term_groups.size() * 2);
  for (const auto& [term, group_ids] : merged_term_groups) {
    rb->SendBulkString(term);
    rb->StartArray(group_ids.size());

    // Sort group_ids before sending
    std::vector<std::string> sorted_ids(group_ids.begin(), group_ids.end());
    std::sort(sorted_ids.begin(), sorted_ids.end());

    for (const auto& id : sorted_ids) {
      rb->SendBulkString(id);
    }
  }
}

void FtConfigHelp(CmdArgParser* parser, CommandContext* cmd_cntx) {
  string_view param = parser->Next();

  vector<string> names = config_registry.List(param);
  vector<absl::CommandLineFlag*> res;

  for (const auto& name : names) {
    auto* flag = config_registry.GetFlag(name);
    DCHECK(flag);
    if (flag && flag->Filename().find(kCurrentFile) != std::string::npos) {
      res.push_back(flag);
    }
  }

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->StartArray(res.size());
  for (const auto& flag : res) {
    rb->StartArray(5);
    rb->SendBulkString(flag->Name());
    rb->SendBulkString("Description"sv);
    rb->SendBulkString(flag->Help());
    rb->SendBulkString("Value"sv);
    rb->SendBulkString(flag->CurrentValue());
  }
}

void FtConfigGet(CmdArgParser* parser, CommandContext* cmd_cntx) {
  string_view param = parser->Next();
  vector<string> names = config_registry.List(param);

  vector<string> res;

  for (const auto& name : names) {
    auto* flag = config_registry.GetFlag(name);
    DCHECK(flag);
    if (flag && flag->Filename().find(kCurrentFile) != std::string::npos) {
      // Convert internal name (search_query_string_bytes) back to user-facing format
      // (search.query-string-bytes)
      string display_name = DenormalizeConfigName(name);
      res.push_back(display_name);
      res.push_back(flag->CurrentValue());
    }
  }
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  return rb->SendBulkStrArr(res, CollectionType::MAP);
}

void FtConfigSet(CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto [param, value] = parser->Next<string_view, string_view>();

  if (!parser->Finalize()) {
    cmd_cntx->SendError(parser->TakeError().MakeReply());
    return;
  }

  vector<string> names = config_registry.List(param);
  if (names.size() != 1 ||
      config_registry.GetFlag(names[0])->Filename().find(kCurrentFile) == std::string::npos) {
    return cmd_cntx->SendError("Invalid option name");
  }

  ConfigRegistry::SetResult result = config_registry.Set(param, value);

  const char kErrPrefix[] = "FT.CONFIG SET failed (possibly related to argument '";
  switch (result) {
    case ConfigRegistry::SetResult::OK:
      return cmd_cntx->SendOk();
    case ConfigRegistry::SetResult::UNKNOWN:
      return cmd_cntx->SendError(
          absl::StrCat("Unknown option or number of arguments for CONFIG SET - '", param, "'"),
          kConfigErrType);

    case ConfigRegistry::SetResult::READONLY:
      return cmd_cntx->SendError(absl::StrCat(kErrPrefix, param, "') - can't set immutable config"),
                                 kConfigErrType);

    case ConfigRegistry::SetResult::INVALID:
      return cmd_cntx->SendError(absl::StrCat(kErrPrefix, param, "') - argument can not be set"),
                                 kConfigErrType);
  }
  ABSL_UNREACHABLE();
}

void CmdFtConfig(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto func = parser.MapNext("GET", &FtConfigGet, "SET", &FtConfigSet, "HELP", &FtConfigHelp);

  if (auto err = parser.TakeError(); err) {
    cmd_cntx->SendError("Unknown subcommand");
    return;
  }
  func(&parser, cmd_cntx);
}

void CmdFtSynUpdate(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};
  auto [index_name, group_id] = parser.Next<string_view, string>();

  // Redis ignores this parameter. Checked on redis_version:6.2.13
  [[maybe_unused]] bool skip_initial_scan = parser.Check("SKIPINITIALSCAN");

  // Collect terms
  std::vector<std::string_view> terms;
  while (parser.HasNext()) {
    terms.emplace_back(parser.Next());
  }

  if (terms.empty()) {
    return cmd_cntx->SendError("No terms specified");
  }

  if (!parser.Finalize()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }

  std::atomic_bool index_not_found{true};

  // Update synonym groups in all shards
  cmd_cntx->tx()->Execute(
      [&](Transaction* t, EngineShard* es) {
        auto* index = es->search_indices()->GetIndex(index_name);
        if (!index)
          return OpStatus::OK;

        index_not_found.store(false, std::memory_order_relaxed);

        // Rebuild indices only for documents containing terms from the updated group
        index->RebuildForGroup(
            OpArgs{es, nullptr,
                   DbContext{&namespaces->GetDefaultNamespace(), 0, GetCurrentTimeMs()}},
            group_id, terms);

        return OpStatus::OK;
      },
      true);

  if (index_not_found.load(std::memory_order_relaxed))
    return cmd_cntx->SendError(string{index_name} + ": no such index");

  cmd_cntx->rb()->SendOk();
}

void CmdFtDebug(CmdArgList args, CommandContext* cmd_cntx) {
  // FT._DEBUG command stub for test compatibility
  // This command is used by integration tests to control internal behavior
  CmdArgParser parser{args};
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (args.empty() || parser.Check("HELP")) {
    rb->SendSimpleString("FT._DEBUG - Debug command stub (not fully implemented)");
    return;
  }

  // Handle CONTROLLED_VARIABLE subcommand used by tests
  if (parser.Check("CONTROLLED_VARIABLE")) {
    if (parser.Check("SET")) {
      // Consume variable name and value - these are required by the command
      parser.Next();  // variable name
      parser.Next();  // variable value

      RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

      // Just acknowledge the command
      rb->SendOk();
      return;
    }
  }

  // For any other subcommand, just return OK
  rb->SendOk();
}

#define HFUNC(x) SetHandler(&Cmd##x)

// Redis search is a module. Therefore we introduce dragonfly extension search
// to set as the default for the search family of commands. More sensible defaults,
// should also be considered in the future

void SearchFamily::Register(CommandRegistry* registry) {
  using CI = CommandId;

  // Disable journaling, because no-key-transactional enables it by default
  const uint32_t kReadOnlyMask =
      CO::NO_KEY_TRANSACTIONAL | CO::NO_KEY_TX_SPAN_ALL | CO::NO_AUTOJOURNAL | CO::IDEMPOTENT;

  registry->StartFamily();
  *registry
      << CI{"FT.CREATE", CO::JOURNALED | CO::GLOBAL_TRANS, -2, 0, 0, acl::FT_SEARCH}.HFUNC(FtCreate)
      << CI{"FT.ALTER", CO::JOURNALED | CO::GLOBAL_TRANS, -3, 0, 0, acl::FT_SEARCH}.HFUNC(FtAlter)
      << CI{"FT.DROPINDEX", CO::JOURNALED | CO::GLOBAL_TRANS, -2, 0, 0, acl::FT_SEARCH}.HFUNC(
             FtDropIndex)
      << CI{"FT.INFO", CO::NO_KEY_TRANSACTIONAL | CO::NO_KEY_TX_SPAN_ALL | CO::NO_AUTOJOURNAL,
            -2,        0,
            0,         acl::FT_SEARCH}
             .HFUNC(FtInfo)
      << CI{"FT.CONFIG", CO::ADMIN | CO::LOADING | CO::DANGEROUS, -3, 0, 0, acl::FT_SEARCH}.HFUNC(
             FtConfig)
      // Underscore same as in RediSearch because it's "temporary" (long time already)
      << CI{"FT._LIST", kReadOnlyMask, 1, 0, 0, acl::FT_SEARCH}.HFUNC(FtList)
      << CI{"FT.SEARCH", kReadOnlyMask, -3, 0, 0, acl::FT_SEARCH}.HFUNC(FtSearch)
      << CI{"FT.AGGREGATE", kReadOnlyMask, -3, 0, 0, acl::FT_SEARCH}.HFUNC(FtAggregate)
      << CI{"FT.PROFILE", kReadOnlyMask, -4, 0, 0, acl::FT_SEARCH}.HFUNC(FtProfile)
      << CI{"FT.TAGVALS", kReadOnlyMask, 3, 0, 0, acl::FT_SEARCH}.HFUNC(FtTagVals)
      << CI{"FT.SYNDUMP", kReadOnlyMask, 2, 0, 0, acl::FT_SEARCH}.HFUNC(FtSynDump)
      << CI{"FT.SYNUPDATE", CO::JOURNALED | CO::GLOBAL_TRANS, -4, 0, 0, acl::FT_SEARCH}.HFUNC(
             FtSynUpdate)
      << CI{"FT._DEBUG", kReadOnlyMask, -1, 0, 0, acl::FT_SEARCH}.HFUNC(FtDebug);
}

void SearchFamily::Shutdown() {
  shard_set->RunBlockingInParallel([](EngineShard* es) { es->search_indices()->DropAllIndices(); });
}

}  // namespace dfly


================================================
FILE: src/server/search/search_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

namespace dfly {
class CommandRegistry;

class SearchFamily {
 public:
  static void Register(CommandRegistry* registry);
  static void Shutdown();
};

}  // namespace dfly


================================================
FILE: src/server/search/search_family_test.cc
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/search/search_family.h"

#include <absl/flags/flag.h>
#include <absl/strings/str_format.h>

#include <algorithm>
#include <string_view>

#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/gen_utils.h"
#include "facade/error.h"
#include "facade/facade_test.h"
#include "facade/resp_parser.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;
using namespace facade;

ABSL_DECLARE_FLAG(bool, search_reject_legacy_field);
ABSL_DECLARE_FLAG(size_t, search_query_string_bytes);

namespace {

// Verify and extract score field from vector search result
auto vector_score = [](std::string_view score_name, const RespExpr::Vec& score_field) -> float {
  EXPECT_THAT(score_field.size(), 2);
  EXPECT_THAT(score_field[0].GetString(), score_name);
  float score;
  bool success = absl::SimpleAtof(score_field[1].GetView(), &score);
  EXPECT_TRUE(success);
  return score;
};

// Helper to convert float array to binary format
auto Vec3ToBytes = [](float x, float y, float z) -> string {
  string result;
  result.append(reinterpret_cast<const char*>(&x), sizeof(float));
  result.append(reinterpret_cast<const char*>(&y), sizeof(float));
  result.append(reinterpret_cast<const char*>(&z), sizeof(float));
  return result;
};

}  // namespace

namespace dfly {

class SearchFamilyTest : public BaseFamilyTest {
 protected:
};

const auto kNoResults = IntArg(0);  // tests auto destruct single element arrays

/* Asserts that response is array of two arrays. Used to test FT.PROFILE response */
::testing::AssertionResult AssertArrayOfTwoArrays(const RespExpr& resp) {
  if (resp.GetVec().size() != 2) {
    return ::testing::AssertionFailure()
           << "Expected response array length to be 2, but was " << resp.GetVec().size();
  }

  const auto& vec = resp.GetVec();
  if (vec[0].type != RespExpr::ARRAY) {
    return ::testing::AssertionFailure()
           << "Expected resp[0] to be an array, but was " << vec[0].type;
  }
  if (vec[1].type != RespExpr::ARRAY) {
    return ::testing::AssertionFailure()
           << "Expected resp[1] to be an array, but was " << vec[1].type;
  }
  return ::testing::AssertionSuccess();
}

#define ASSERT_ARRAY_OF_TWO_ARRAYS(resp) ASSERT_PRED1(AssertArrayOfTwoArrays, resp)

MATCHER_P2(DocIds, total, arg_ids, "") {
  if (arg_ids.empty()) {
    if (auto res = arg.GetInt(); !res || *res != 0) {
      *result_listener << "Expected single zero";
      return false;
    }
    return true;
  }

  if (arg.type != RespExpr::ARRAY) {
    *result_listener << "Wrong response type: " << int(arg.type);
    return false;
  }

  auto results = arg.GetVec();
  if (results.size() != arg_ids.size() * 2 + 1) {
    *result_listener << "Wrong resp vec size: " << results.size();
    return false;
  }

  if (auto num_results = results[0].GetInt(); !num_results || size_t(*num_results) != total) {
    *result_listener << "Bad total count in reply: " << num_results.value_or(-1);
    return false;
  }

  vector<string> received_ids;
  for (size_t i = 1; i < results.size(); i += 2)
    received_ids.push_back(results[i].GetString());

  vector<string> expected_ids = arg_ids;
  sort(received_ids.begin(), received_ids.end());
  sort(expected_ids.begin(), expected_ids.end());

  return expected_ids == received_ids;
}

template <typename... Args> auto AreDocIds(Args... args) {
  return DocIds(sizeof...(args), vector<string>{args...});
}

template <typename... Args> auto IsArray(Args... args) {
  return RespArray(ElementsAre(std::forward<Args>(args)...));
}

template <typename... Args> auto IsUnordArray(Args... args) {
  return RespArray(UnorderedElementsAre(std::forward<Args>(args)...));
}
template <typename Expected, size_t... Is>
void BuildKvMatchers(std::vector<Matcher<std::pair<std::string, RespExpr>>>& kv_matchers,
                     const Expected& expected, std::index_sequence<Is...>) {
  (kv_matchers.emplace_back(Pair(std::get<Is * 2>(expected), std::get<Is * 2 + 1>(expected))), ...);
}

MATCHER_P(IsMapMatcher, expected, "") {
  if (arg.type != RespExpr::ARRAY) {
    *result_listener << "Wrong response type: " << arg.type;
    return false;
  }

  constexpr size_t expected_size = std::tuple_size<decltype(expected)>::value;
  constexpr size_t exprected_pairs_number = expected_size / 2;

  auto result = arg.GetVec();
  if (result.size() != expected_size) {
    *result_listener << "Wrong resp array size: " << result.size();
    return false;
  }

  std::vector<std::pair<std::string, RespExpr>> received_pairs;
  for (size_t i = 0; i < result.size(); i += 2) {
    received_pairs.emplace_back(result[i].GetString(), result[i + 1]);
  }

  std::vector<Matcher<std::pair<std::string, RespExpr>>> kv_matchers;
  BuildKvMatchers(kv_matchers, expected, std::make_index_sequence<exprected_pairs_number>{});

  return ExplainMatchResult(UnorderedElementsAreArray(kv_matchers), received_pairs,
                            result_listener);
}

template <typename... Args> auto IsMap(Args... args) {
  return IsMapMatcher(std::make_tuple(args...));
}

MATCHER_P(IsMapWithSizeMatcher, expected, "") {
  if (arg.type != RespExpr::ARRAY) {
    *result_listener << "Wrong response type: " << arg.type;
    return false;
  }
  constexpr size_t expected_size = std::tuple_size<decltype(expected)>::value;
  constexpr size_t exprected_pairs_number = expected_size / 2;

  auto result = arg.GetVec();
  if (result.size() != expected_size + 1 || result.size() % 2 != 1) {
    *result_listener << "Wrong resp array size: " << result.size();
    return false;
  }

  if (result[0].GetInt() != exprected_pairs_number) {
    *result_listener << "Wrong pairs count: " << result[0].GetInt().value_or(-1);
    return false;
  }

  std::vector<std::pair<std::string, RespExpr>> received_pairs;
  for (size_t i = 1; i < result.size(); i += 2) {
    received_pairs.emplace_back(result[i].GetString(), result[i + 1]);
  }

  std::vector<Matcher<std::pair<std::string, RespExpr>>> kv_matchers;
  BuildKvMatchers(kv_matchers, expected, std::make_index_sequence<exprected_pairs_number>{});

  return ExplainMatchResult(UnorderedElementsAreArray(kv_matchers), received_pairs,
                            result_listener);
}

template <typename... Args> auto IsMapWithSize(Args... args) {
  return IsMapWithSizeMatcher(std::make_tuple(args...));
}

MATCHER_P(IsUnordArrayWithSizeMatcher, expected, "") {
  if (arg.type != RespExpr::ARRAY) {
    *result_listener << "Wrong response type: " << arg.type;
    return false;
  }

  auto result = arg.GetVec();
  size_t expected_size = std::tuple_size<decltype(expected)>::value;
  if (result.size() != expected_size + 1) {
    *result_listener << "Wrong resp array size: " << result.size();
    return false;
  }

  if (result[0].GetInt() != expected_size) {
    *result_listener << "Wrong elements count: " << result[0].GetInt().value_or(-1);
    return false;
  }

  std::vector<RespExpr> received_elements(result.begin() + 1, result.end());

  // Create a vector of matchers from the tuple
  std::vector<Matcher<RespExpr>> matchers;
  std::apply([&matchers](auto&&... args) { ((matchers.push_back(args)), ...); }, expected);

  return ExplainMatchResult(UnorderedElementsAreArray(matchers), received_elements,
                            result_listener);
}

template <typename... Matchers> auto IsUnordArrayWithSize(Matchers... matchers) {
  return IsUnordArrayWithSizeMatcher(std::make_tuple(matchers...));
}

TEST_F(SearchFamilyTest, CreateDropListIndex) {
  EXPECT_EQ(Run({"ft.create", "idx-1", "ON", "HASH", "PREFIX", "1", "prefix-1"}), "OK");
  EXPECT_EQ(Run({"ft.create", "idx-2", "ON", "JSON", "PREFIX", "1", "prefix-2"}), "OK");
  EXPECT_EQ(Run({"ft.create", "idx-3", "ON", "JSON", "PREFIX", "1", "prefix-3"}), "OK");

  EXPECT_THAT(Run({"ft._list"}).GetVec(), testing::UnorderedElementsAre("idx-1", "idx-2", "idx-3"));

  EXPECT_EQ(Run({"ft.dropindex", "idx-2"}), "OK");
  EXPECT_THAT(Run({"ft._list"}).GetVec(), testing::UnorderedElementsAre("idx-1", "idx-3"));

  EXPECT_THAT(Run({"ft.create", "idx-1"}), ErrArg("Index already exists"));

  EXPECT_THAT(Run({"ft.dropindex", "idx-100"}), ErrArg("Index with name 'idx-100' not found"));

  EXPECT_EQ(Run({"ft.dropindex", "idx-1"}), "OK");
  EXPECT_EQ(Run({"ft._list"}), "idx-3");
}

TEST_F(SearchFamilyTest, CreateDropDifferentDatabases) {
  // Create index on db 0
  auto resp =
      Run({"ft.create", "idx-1", "ON", "HASH", "PREFIX", "1", "doc-", "SCHEMA", "name", "TEXT"});
  EXPECT_EQ(resp, "OK");

  // Add some data on database 0 (only db 0 is indexed)
  Run({"hset", "doc-0", "name", "Name of 0"});

  // Verify search works on db 0
  resp = Run({"ft.search", "idx-1", "*"});
  EXPECT_THAT(resp, IsMapWithSize("doc-0", IsMap("name", "Name of 0")));

  EXPECT_EQ(Run({"select", "1"}), "OK");  // change database

  // Creating an index on non zero database must fail
  resp = Run({"ft.create", "idx-2", "ON", "JSON", "PREFIX", "1", "prefix-2"});
  EXPECT_THAT(resp, ErrArg("ERR Cannot create index on db != 0"));

  // Search from db 1 should return 0 results (only db 0 is indexed)
  resp = Run({"ft.search", "idx-1", "*"});
  EXPECT_THAT(resp, IntArg(0));

  // ft.dropindex must work from another database
  EXPECT_EQ(Run({"ft.dropindex", "idx-1"}), "OK");
  EXPECT_THAT(Run({"ft.info", "idx-1"}), ErrArg("Index with name 'idx-1' not found"));
}

TEST_F(SearchFamilyTest, AlterIndex) {
  Run({"hset", "d:1", "color", "blue", "cost", "150"});
  Run({"hset", "d:2", "color", "green", "cost", "200"});

  Run({"ft.create", "idx-1", "ON", "HASH"});

  EXPECT_EQ(Run({"ft.alter", "idx-1", "schema", "add", "color", "tag"}), "OK");
  EXPECT_THAT(Run({"ft.search", "idx-1", "@color:{blue}"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"ft.search", "idx-1", "@color:{green}"}), AreDocIds("d:2"));

  EXPECT_EQ(Run({"ft.alter", "idx-1", "schema", "add", "cost", "numeric"}), "OK");
  EXPECT_THAT(Run({"ft.search", "idx-1", "@cost:[0 100]"}), kNoResults);
  EXPECT_THAT(Run({"ft.search", "idx-1", "@cost:[100 300]"}), AreDocIds("d:1", "d:2"));

  EXPECT_THAT(Run({"ft.alter", "idx-2", "schema", "add", "price", "numeric"}),
              ErrArg("Index not found"));
}

TEST_F(SearchFamilyTest, SuffixPrefixSearch) {
  Run({"ft.create", "idx", "SCHEMA", "name", "TEXT"});
  Run({"hset", "d:1", "name", "apple"});
  Run({"hset", "d:2", "name", "carrot"});

  EXPECT_THAT(Run({"FT.SEARCH", "idx", "app*"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "@name:app*"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "*le"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "@name:*le"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "*pl*"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "@name:*pl*"}), AreDocIds("d:1"));
}

TEST_F(SearchFamilyTest, InfoIndex) {
  EXPECT_EQ(
      Run({"ft.create", "idx-1", "ON", "HASH", "PREFIX", "1", "doc-", "SCHEMA", "name", "TEXT"}),
      "OK");

  for (size_t i = 0; i < 15; i++) {
    Run({"hset", absl::StrCat("doc-", i), "name", absl::StrCat("Name of", i)});
  }

  auto info = Run({"ft.info", "idx-1"});

  auto descriptor_matcher =
      IsArray("key_type", "HASH", "prefixes", IsArray("doc-"), "default_score", 1);
  auto schema_matcher = IsArray(IsArray("identifier", "name", "attribute", "name", "type", "TEXT"));

  EXPECT_THAT(info, IsArray(_, _, _, descriptor_matcher, "index_options", RespArray(IsEmpty()),
                            "attributes", schema_matcher, "num_docs", IntArg(15), "indexing",
                            IntArg(0), "percent_indexed", "1"));
}

TEST_F(SearchFamilyTest, Stats) {
  EXPECT_EQ(
      Run({"ft.create", "idx-1", "ON", "HASH", "PREFIX", "1", "doc1-", "SCHEMA", "name", "TEXT"}),
      "OK");

  EXPECT_EQ(
      Run({"ft.create", "idx-2", "ON", "HASH", "PREFIX", "1", "doc2-", "SCHEMA", "name", "TEXT"}),
      "OK");

  for (size_t i = 0; i < 50; i++) {
    Run({"hset", absl::StrCat("doc1-", i), "name", absl::StrCat("Name of", i)});
    Run({"hset", absl::StrCat("doc2-", i), "name", absl::StrCat("Name of", i)});
  }

  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.search_stats.num_indices, 2);
  EXPECT_EQ(metrics.search_stats.num_entries, 50 * 2);

  size_t expected_usage = 2 * (50 + 3 /* number of distinct words*/) * (24 + 48 /* kv size */) +
                          50 * 2 * 1 /* posting list entries */;
  EXPECT_GE(metrics.search_stats.used_memory, expected_usage);
  EXPECT_LE(metrics.search_stats.used_memory, 3 * expected_usage);
}

// Test how asynchronous indexing indexes documents and reports its progress
TEST_F(SearchFamilyTest, Indexing) {
  // Create documents
#ifdef NDEBUG
  constexpr size_t kNumDocs = 10'000;
#else
  constexpr size_t kNumDocs = 1'000;
#endif

  for (size_t i = 0; i < kNumDocs; i++) {
    Run({"hset", absl::StrCat("doc-", i), "t", absl::StrCat("some long text at ", i), "v1",
         absl::StrCat(i / 10), "v2", absl::StrCat(i / 1000)});
  }

  string_view create_cmd[] = {"ft.create", "i1", "schema", "v1", "numeric", "t", "text"};

  // Drop immediately to check cancel
  {
    Run(create_cmd);
    for (size_t i = 0; i < 3; i++)
      ThisFiber::Yield();
    Run({"ft.dropindex", "i1"});
  }

  // Update with ft.alter to check restart
  {
    Run(create_cmd);
    for (size_t i = 0; i < 5; i++)
      ThisFiber::Yield();
    Run({"ft.alter", "i1", "schema", "add", "v2", "numeric"});
  }

  // loop and wait for index construction
  absl::Time deadline = absl::Now() + absl::Seconds(10);
  size_t iterations = 0;
  bool seen_full = false;
  while (true) {
    auto resp = Run({"ft.info", "i1"});
    auto arr = resp.GetVec();

    auto find_field = [&arr](string_view field) {
      return ++std::find_if(arr.begin(), arr.end(), [field](const auto& i) { return i == field; });
    };

    auto num_docs = find_field("num_docs");
    auto indexing = find_field("indexing");
    auto percent_indexed = find_field("percent_indexed");

    if (indexing->GetInt() == 0) {
      EXPECT_THAT(*num_docs, IntArg(kNumDocs));
      EXPECT_EQ(*percent_indexed, "1");
      break;
    }

    // Check basic invariants
    EXPECT_FALSE(seen_full);
    seen_full |= num_docs->GetInt() == kNumDocs;
    EXPECT_THAT(*indexing, IntArg(1));
    EXPECT_NE(*percent_indexed, "1");  // change once we have estimations

    // Check search doesn't return any errors
    resp = Run({"ft.search", "i1", "@v1:[10 20]"});
    EXPECT_THAT(resp, Not(ErrArg("")));

    iterations++;
    ASSERT_LE(absl::Now(), deadline);
  }

  EXPECT_GT(iterations, 0u);  // ensure we observed indexing-in-progress state at least once

  auto resp = Run({"ft.search", "i1", "@v1:[10 20]", "LIMIT", "0", "0"});
  EXPECT_THAT(resp, IntArg(110));

  // check added with alter field v2 is fully indexed
  resp = Run({"ft.search", "i1", "@v2:[0 10000]", "LIMIT", "0", "0"});
  EXPECT_THAT(resp, IntArg(kNumDocs));
}

TEST_F(SearchFamilyTest, Simple) {
  Run({"hset", "d:1", "foo", "baz", "k", "v"});
  Run({"hset", "d:2", "foo", "bar", "k", "v"});
  Run({"hset", "d:3", "foo", "bad", "k", "v"});

  EXPECT_EQ(Run({"ft.create", "i1", "PREFIX", "1", "d:", "SCHEMA", "foo", "TEXT", "k", "TEXT"}),
            "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "@foo:bar"}), AreDocIds("d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@foo:bar | @foo:baz"}), AreDocIds("d:1", "d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@foo:(bar|baz|bad)"}), AreDocIds("d:1", "d:2", "d:3"));

  EXPECT_THAT(Run({"ft.search", "i1", "@foo:none"}), kNoResults);

  EXPECT_THAT(Run({"ft.search", "iNone", "@foo:bar"}), ErrArg("iNone: no such index"));
  EXPECT_THAT(Run({"ft.search", "i1", "@@NOTAQUERY@@"}), ErrArg("Query syntax error"));

  // w: prefix is not part of index
  Run({"hset", "w:2", "foo", "this", "k", "v"});
  EXPECT_THAT(Run({"ft.search", "i1", "@foo:this"}), kNoResults);
}

TEST_F(SearchFamilyTest, Errors) {
  Run({"ft.create", "i1", "PREFIX", "1", "d:", "SCHEMA", "foo", "TAG", "bar", "TEXT"});

  // Wrong field
  EXPECT_THAT(Run({"ft.search", "i1", "@whoami:lol"}), ErrArg("Invalid field: whoami"));

  // Wrong field type
  EXPECT_THAT(Run({"ft.search", "i1", "@foo:lol"}), ErrArg("Wrong access type for field: foo"));

  // ft.create index on json schema $.sometag AS sometag TAG SEPARATOR
  EXPECT_THAT(Run({"ft.create", "i2", "ON", "JSON", "SCHEMA", "$.sometag", "AS", "sometag", "TAG",
                   "SEPARATOR"}),
              ErrArg("Tag separator must be a single character. Got ``"));
}

TEST_F(SearchFamilyTest, NoPrefix) {
  Run({"hset", "d:1", "a", "one", "k", "v"});
  Run({"hset", "d:2", "a", "two", "k", "v"});
  Run({"hset", "d:3", "a", "three", "k", "v"});

  EXPECT_EQ(Run({"ft.create", "i1", "schema", "a", "text", "k", "text"}), "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "one | three"}), AreDocIds("d:1", "d:3"));
}

TEST_F(SearchFamilyTest, Json) {
  Run({"json.set", "k1", ".", R"({"a": "small test", "b": "some details"})"});
  Run({"json.set", "k2", ".", R"({"a": "another test", "b": "more details"})"});
  Run({"json.set", "k3", ".", R"({"a": "last test", "b": "secret details"})"});

  EXPECT_EQ(Run({"ft.create", "i1", "on", "json", "schema", "$.a", "as", "a", "text", "$.b", "as",
                 "b", "text"}),
            "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "some|more"}), AreDocIds("k1", "k2"));
  EXPECT_THAT(Run({"ft.search", "i1", "some|more|secret"}), AreDocIds("k1", "k2", "k3"));

  EXPECT_THAT(Run({"ft.search", "i1", "@a:last @b:details"}), AreDocIds("k3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@a:(another|small)"}), AreDocIds("k1", "k2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@a:(another|small|secret)"}), AreDocIds("k1", "k2"));

  EXPECT_THAT(Run({"ft.search", "i1", "none"}), kNoResults);
  EXPECT_THAT(Run({"ft.search", "i1", "@a:small @b:secret"}), kNoResults);
}

TEST_F(SearchFamilyTest, JsonAttributesPaths) {
  Run({"json.set", "k1", ".", R"(   {"nested": {"value": "no"}} )"});
  Run({"json.set", "k2", ".", R"(   {"nested": {"value": "yes"}} )"});
  Run({"json.set", "k3", ".", R"(   {"nested": {"value": "maybe"}} )"});

  EXPECT_EQ(
      Run({"ft.create", "i1", "on", "json", "schema", "$.nested.value", "as", "value", "text"}),
      "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "yes"}), AreDocIds("k2"));
}

TEST_F(SearchFamilyTest, JsonIdentifierWithBrackets) {
  Run({"json.set", "k1", ".", R"({"name":"London","population":8.8,"continent":"Europe"})"});
  Run({"json.set", "k2", ".", R"({"name":"Athens","population":3.1,"continent":"Europe"})"});
  Run({"json.set", "k3", ".", R"({"name":"Tel-Aviv","population":1.3,"continent":"Asia"})"});
  Run({"json.set", "k4", ".", R"({"name":"Hyderabad","population":9.8,"continent":"Asia"})"});

  EXPECT_EQ(Run({"ft.create", "i1", "on", "json", "schema", "$[\"name\"]", "as", "name", "tag",
                 "$[\"population\"]", "as", "population", "numeric", "sortable", "$[\"continent\"]",
                 "as", "continent", "tag"}),
            "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "(@continent:{Europe})"}), AreDocIds("k1", "k2"));
}

TEST_F(SearchFamilyTest, JsonArrayValues) {
  string_view D1 = R"(
{
  "name": "Alex",
  "plays" : [
    {"game": "Pacman", "score": 10},
    {"game": "Tetris", "score": 15}
  ],
  "areas": ["EU-west", "EU-central"]
}
)";
  string_view D2 = R"(
{
  "name": "Bob",
  "plays" : [
    {"game": "Pacman", "score": 15},
    {"game": "Mario", "score": 7}
  ],
  "areas": ["US-central"]
}
)";
  string_view D3 = R"(
{
  "name": "Caren",
  "plays" : [
    {"game": "Mario", "score": 9},
    {"game": "Doom", "score": 20}
  ],
  "areas": ["EU-central", "EU-east"]
}
)";

  Run({"json.set", "k1", ".", D1});
  Run({"json.set", "k2", ".", D2});
  Run({"json.set", "k3", ".", D3});

  Run({"ft.create", "i1",
       "on",        "json",
       "schema",    "$.name",
       "as",        "name",
       "text",      "$.plays[*].game",
       "as",        "games",
       "tag",       "$.plays[*].score",
       "as",        "scores",
       "numeric",   "$.areas[*]",
       "as",        "areas",
       "tag"});

  EXPECT_THAT(Run({"ft.search", "i1", "*"}), AreDocIds("k1", "k2", "k3"));

  // Find players by games
  EXPECT_THAT(Run({"ft.search", "i1", "@games:{Tetris | Mario | Doom}"}),
              AreDocIds("k1", "k2", "k3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@games:{Pacman}"}), AreDocIds("k1", "k2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@games:{Mario}"}), AreDocIds("k2", "k3"));

  // Find players by scores
  EXPECT_THAT(Run({"ft.search", "i1", "@scores:[15 15]"}), AreDocIds("k1", "k2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@scores:[0 (10]"}), AreDocIds("k2", "k3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@scores:[(15 20]"}), AreDocIds("k3"));

  // Find platers by areas
  EXPECT_THAT(Run({"ft.search", "i1", "@areas:{'EU-central'}"}), AreDocIds("k1", "k3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@areas:{'US-central'}"}), AreDocIds("k2"));

  // Test complicated RETURN expression
  auto res = Run(
      {"ft.search", "i1", "@name:bob", "return", "1", "max($.plays[*].score)", "as", "max-score"});
  EXPECT_THAT(res, IsMapWithSize("k2", IsMap("max-score", "15")));

  // Test invalid json path expression omits that field
  res = Run({"ft.search", "i1", "@name:alex", "return", "1", "::??INVALID??::", "as", "retval"});
  EXPECT_THAT(res, IsMapWithSize("k1", IsMap()));
}

TEST_F(SearchFamilyTest, Tags) {
  Run({"hset", "d:1", "color", "red, green"});
  Run({"hset", "d:2", "color", "green, blue"});
  Run({"hset", "d:3", "color", "blue, red"});
  Run({"hset", "d:4", "color", "red"});
  Run({"hset", "d:5", "color", "green"});
  Run({"hset", "d:6", "color", "blue"});

  EXPECT_EQ(Run({"ft.create", "i1", "on", "hash", "schema", "color", "tag", "dummy", "numeric"}),
            "OK");
  EXPECT_THAT(Run({"ft.tagvals", "i2", "color"}), ErrArg("Index with name 'i2' not found"));
  EXPECT_THAT(Run({"ft.tagvals", "i1", "foo"}), ErrArg("No such field"));
  EXPECT_THAT(Run({"ft.tagvals", "i1", "dummy"}), ErrArg("Not a tag field"));
  auto resp = Run({"ft.tagvals", "i1", "color"});
  ASSERT_THAT(resp, IsUnordArray("red", "blue", "green"));

  // Tags don't participate in full text search
  EXPECT_THAT(Run({"ft.search", "i1", "red"}), kNoResults);

  EXPECT_THAT(Run({"ft.search", "i1", "@color:{ red }"}), AreDocIds("d:1", "d:3", "d:4"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{green}"}), AreDocIds("d:1", "d:2", "d:5"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), AreDocIds("d:2", "d:3", "d:6"));

  EXPECT_THAT(Run({"ft.search", "i1", "@color:{red | green}"}),
              AreDocIds("d:1", "d:2", "d:3", "d:4", "d:5"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue | green}"}),
              AreDocIds("d:1", "d:2", "d:3", "d:5", "d:6"));

  EXPECT_EQ(Run({"ft.create", "i2", "on", "hash", "schema", "c1", "as", "c2", "tag"}), "OK");

  // TODO: there is a discrepancy here between redis stack and Dragonfly,
  // we accept the original field when it has alias, while redis stack does not.
  //
  // EXPECT_THAT(Run({"ft.tagvals", "i2", "c1"}), ErrArg("No such field"));
  EXPECT_THAT(Run({"ft.tagvals", "i2", "c2"}), ArrLen(0));
}

TEST_F(SearchFamilyTest, TagOptions) {
  Run({"hset", "d:1", "color", "    red/   green // bLUe   "});
  Run({"hset", "d:2", "color", "blue   /// GReeN   "});
  Run({"hset", "d:3", "color", "grEEn // yellow   //"});
  Run({"hset", "d:4", "color", "  /blue/green/  "});

  EXPECT_EQ(Run({"ft.create", "i1", "on", "hash", "schema", "color", "tag", "casesensitive",
                 "separator", "/"}),
            "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "@color:{green}"}), AreDocIds("d:1", "d:4"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{GReeN}"}), AreDocIds("d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), AreDocIds("d:2", "d:4"));
}

TEST_F(SearchFamilyTest, SymbolsInTag) {
  Run({"FT.CREATE", "demo_idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "tags", "TAG"});
  Run({"HSET", "doc:1", "name", "First Item", "tags", "@first"});
  Run({"HSET", "doc:2", "name", "Second Item", "tags", "?second"});
  Run({"HSET", "doc:3", "name", "Third Item", "tags", ":third"});
  Run({"HSET", "doc:4", "name", "Fourth Item", "tags", "\"fourth"});
  EXPECT_THAT(Run({"FT.SEARCH", "demo_idx", R"(@tags:{\?second})"}), AreDocIds("doc:2"));
  EXPECT_THAT(Run({"FT.SEARCH", "demo_idx", R"(@tags:{\@first})"}), AreDocIds("doc:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "demo_idx", R"(@tags:{\:third})"}), AreDocIds("doc:3"));
  EXPECT_THAT(Run({"FT.SEARCH", "demo_idx", R"(@tags:{\"fourth})"}), AreDocIds("doc:4"));
}

TEST_F(SearchFamilyTest, TagNumbers) {
  Run({"hset", "d:1", "number", "1"});
  Run({"hset", "d:2", "number", "2"});
  Run({"hset", "d:3", "number", "3"});

  EXPECT_EQ(Run({"ft.create", "i1", "on", "hash", "schema", "number", "tag"}), "OK");

  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1}"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1|2}"}), AreDocIds("d:1", "d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1|2|3}"}), AreDocIds("d:1", "d:2", "d:3"));

  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1.0|2|3.0}"}), AreDocIds("d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1|2|3.0}"}), AreDocIds("d:1", "d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "@number:{1|hello|2}"}), AreDocIds("d:1", "d:2"));
}

TEST_F(SearchFamilyTest, TagEscapeCharacters) {
  EXPECT_EQ(Run({"ft.create", "item_idx", "ON", "JSON", "PREFIX", "1", "p", "SCHEMA", "$.name",
                 "AS", "name", "TAG"}),
            "OK");
  EXPECT_EQ(Run({"json.set", "p:1", "$", "{\"name\":\"escape-error\"}"}), "OK");

  auto resp = Run({"ft.search", "item_idx", "@name:{escape\\-err*}"});
  EXPECT_THAT(resp, AreDocIds("p:1"));
}

TEST_F(SearchFamilyTest, Numbers) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "i", "numeric", "j", "numeric"}), "OK");

  for (unsigned i = 0; i <= 10; i++) {
    for (unsigned j = 0; j <= 10; j++) {
      auto key = absl::StrCat("i", i, "j", j);
      Run({"hset", key, "i", absl::StrCat(i), "j", absl::StrCat(j)});
    }
  }

  // Test simple ranges:
  EXPECT_THAT(Run({"ft.search", "i1", "@i:[5 5] @j:[5 5]"}), AreDocIds("i5j5"));

  EXPECT_THAT(Run({"ft.search", "i1", "@i:[0 1] @j:[9 10]"}),
              AreDocIds("i0j9", "i0j10", "i1j9", "i1j10"));

  EXPECT_THAT(Run({"ft.search", "i1", "@i:[7 8] @j:[2 3]"}),
              AreDocIds("i7j2", "i7j3", "i8j2", "i8j3"));

  // Test union of ranges:
  EXPECT_THAT(Run({"ft.search", "i1", "(@i:[1 2] | @i:[6 6]) @j:[7 7]"}),
              AreDocIds("i1j7", "i2j7", "i6j7"));

  EXPECT_THAT(Run({"ft.search", "i1", "(@i:[1 5] | @i:[1 3] | @i:[3 5]) @j:[7 7]"}),
              AreDocIds("i1j7", "i2j7", "i3j7", "i4j7", "i5j7"));

  // Test intersection of ranges:
  EXPECT_THAT(Run({"ft.search", "i1", "(@i:[9 9]) (@j:[5 7] @j:[6 8])"}),
              AreDocIds("i9j6", "i9j7"));

  EXPECT_THAT(Run({"ft.search", "i1", "@i:[9 9] (@j:[4 6] @j:[1 5] @j:[5 10])"}),
              AreDocIds("i9j5"));

  EXPECT_THAT(Run({"ft.search", "i1", "@i:[9 9] (@j:[4 6] @j:[1 5] @j:[5 10])"}),
              AreDocIds("i9j5"));

  // Test negation of ranges:
  EXPECT_THAT(Run({"ft.search", "i1", "@i:[9 9] -@j:[1 10]"}), AreDocIds("i9j0"));
  EXPECT_THAT(Run({"ft.search", "i1", "-@i:[0 9] -@j:[1 10]"}), AreDocIds("i10j0"));

  // Test empty range
  EXPECT_THAT(Run({"ft.search", "i1", "@i:[9 1]"}), AreDocIds());
  EXPECT_THAT(Run({"ft.search", "i1", "@j:[5 0]"}), AreDocIds());
  EXPECT_THAT(Run({"ft.search", "i1", "@i:[7 1] @j:[6 2]"}), AreDocIds());
}

TEST_F(SearchFamilyTest, TestLimit) {
  Run({"ft.create", "i1", "SCHEMA", "match", "text"});

  for (unsigned i = 0; i < 20; i++)
    Run({"hset", to_string(i), "match", "all"});

  // Default limit is 10
  auto resp = Run({"ft.search", "i1", "all"});
  EXPECT_THAT(resp, ArrLen(10 * 2 + 1));

  resp = Run({"ft.search", "i1", "all", "limit", "0", "0"});
  EXPECT_THAT(resp, IntArg(20));

  resp = Run({"ft.search", "i1", "all", "limit", "0", "5"});
  EXPECT_THAT(resp, ArrLen(5 * 2 + 1));

  resp = Run({"ft.search", "i1", "all", "limit", "17", "5"});
  EXPECT_THAT(resp, ArrLen(3 * 2 + 1));
}

string_view FloatSV(const float* f) {
  return {reinterpret_cast<const char*>(f), sizeof(float)};
}

auto MatchEntry = [](string key, auto... fields) { return IsMapWithSize(key, IsMap(fields...)); };

TEST_F(SearchFamilyTest, ReturnOption) {
  for (unsigned i = 0; i < 20; i++) {
    const float score = i;
    Run({"hset", "k"s + to_string(i), "longA", to_string(i), "longB", to_string(i + 1), "longC",
         to_string(i + 2), "secret", to_string(i + 3), "vector", FloatSV(&score)});
  }

  Run({"ft.create", "i1",     "SCHEMA", "longA",   "AS",    "justA", "TEXT",
       "longB",     "AS",     "justB",  "NUMERIC", "longC", "AS",    "justC",
       "NUMERIC",   "vector", "VECTOR", "FLAT",    "2",     "DIM",   "1"});

  // Check all fields are returned
  auto resp = Run({"ft.search", "i1", "@justA:0"});
  EXPECT_THAT(resp, MatchEntry("k0", "longA", "0", "longB", "1", "longC", "2", "secret", "3",
                               "vector", "[0]"));

  // Check no fields are returned
  resp = Run({"ft.search", "i1", "@justA:0", "return", "0"});
  EXPECT_THAT(resp, IsArray(IntArg(1), "k0"));

  resp = Run({"ft.search", "i1", "@justA:0", "nocontent"});
  EXPECT_THAT(resp, IsArray(IntArg(1), "k0"));

  // Check only one field is returned (and with original identifier)
  resp = Run({"ft.search", "i1", "@justA:0", "return", "1", "longA"});
  EXPECT_THAT(resp, MatchEntry("k0", "longA", "0"));

  // Check only one field is returned with right alias
  resp = Run({"ft.search", "i1", "@justA:0", "return", "1", "longB", "as", "madeupname"});
  EXPECT_THAT(resp, MatchEntry("k0", "madeupname", "1"));

  // Check two fields
  resp = Run({"ft.search", "i1", "@justA:0", "return", "2", "longB", "as", "madeupname", "longC"});
  EXPECT_THAT(resp, MatchEntry("k0", "madeupname", "1", "longC", "2"));

  // Check non-existing field
  resp = Run({"ft.search", "i1", "@justA:0", "return", "1", "nothere"});
  EXPECT_THAT(resp, MatchEntry("k0"));

  // Checl implcit __vector_score is provided
  float score = 20;
  resp = Run({"ft.search", "i1", "@justA:0 => [KNN 20 @vector $vector]", "SORTBY", "__vector_score",
              "DESC", "RETURN", "1", "longA", "PARAMS", "2", "vector", FloatSV(&score)});
  EXPECT_THAT(resp, MatchEntry("k0", "longA", "0"));

  // Check sort doesn't shadow knn return alias
  score = 20;
  resp = Run({"ft.search", "i1", "@justA:0 => [KNN 20 @vector $vector AS vec_return]", "SORTBY",
              "vec_return", "DESC", "RETURN", "1", "vec_return", "PARAMS", "2", "vector",
              FloatSV(&score)});
  EXPECT_THAT(resp, MatchEntry("k0", "vec_return", "20"));
}

TEST_F(SearchFamilyTest, ReturnOptionJson) {
  const string_view j =
      R"({"actions":["fly","sleep"],"name":"dragon","not_indexed":true,"size":3})";
  Run({"json.set", "k1", ".", j});
  Run({"ft.create", "i1", "on", "json", "schema", "$.name", "as", "name", "text", "$.actions[0]",
       "as", "primary_action", "tag", "$.size", "as", "size", "numeric"});

  // Return whole document as a single field by default
  EXPECT_THAT(Run({"ft.search", "i1", "*"}), MatchEntry("k1", "$", j));

  // RETURN 0
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "0"}), IsArray(IntArg(1), "k1"));

  // RETURN by full path
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "$.name"}),
              MatchEntry("k1", "$.name", "dragon"));
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "$.actions"}),
              MatchEntry("k1", "$.actions", "[\"fly\",\"sleep\"]"));

  // RETURN by full path with alias
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "$.name", "as", "n"}),
              MatchEntry("k1", "n", "dragon"));

  // RETURN by schema alias
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "name"}),
              MatchEntry("k1", "name", "dragon"));
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "primary_action"}),
              MatchEntry("k1", "primary_action", "fly"));

  // RETURN by schema alias with new alias
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "name", "as", "n"}),
              MatchEntry("k1", "n", "dragon"));
  EXPECT_THAT(Run({"ft.search", "i1", "*", "return", "1", "primary_action", "as", "pa"}),
              MatchEntry("k1", "pa", "fly"));

  // Whole document with SORTBY includes sortable field as return field
  EXPECT_THAT(Run({"ft.search", "i1", "*", "sortby", "size"}),
              MatchEntry("k1", "$", j, "size", "3"));

  // RETURN with SORTBY doesn't include sortable field
  EXPECT_THAT(Run({"ft.search", "i1", "*", "sortby", "size", "return", "1", "name"}),
              MatchEntry("k1", "name", "dragon"));
}

TEST_F(SearchFamilyTest, TestStopWords) {
  Run({"ft.create", "i1", "STOPWORDS", "3", "red", "green", "blue", "SCHEMA", "title", "TEXT"});

  Run({"hset", "d:1", "title", "ReD? parrot flies away"});
  Run({"hset", "d:2", "title", "GrEEn crocodile eats you"});
  Run({"hset", "d:3", "title", "BLUe. Whale surfes the sea"});

  EXPECT_THAT(Run({"ft.search", "i1", "red"}), kNoResults);
  EXPECT_THAT(Run({"ft.search", "i1", "green"}), kNoResults);
  EXPECT_THAT(Run({"ft.search", "i1", "blue"}), kNoResults);

  EXPECT_THAT(Run({"ft.search", "i1", "parrot"}), AreDocIds("d:1"));
  EXPECT_THAT(Run({"ft.search", "i1", "crocodile"}), AreDocIds("d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "whale"}), AreDocIds("d:3"));
}

TEST_F(SearchFamilyTest, SimpleUpdates) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");

  Run({"hset", "d:1", "title", "Dragonfly article", "visits", "100"});
  Run({"hset", "d:2", "title", "Butterfly observations", "visits", "50"});
  Run({"hset", "d:3", "title", "Bumblebee studies", "visits", "30"});

  // Check values above were added to the index
  EXPECT_THAT(Run({"ft.search", "i1", "article | observations | studies"}),
              AreDocIds("d:1", "d:2", "d:3"));

  // Update title - text value
  {
    Run({"hset", "d:2", "title", "Butterfly studies"});
    EXPECT_THAT(Run({"ft.search", "i1", "observations"}), kNoResults);
    EXPECT_THAT(Run({"ft.search", "i1", "studies"}), AreDocIds("d:2", "d:3"));

    Run({"hset", "d:1", "title", "Upcoming Dragonfly presentation"});
    EXPECT_THAT(Run({"ft.search", "i1", "article"}), kNoResults);
    EXPECT_THAT(Run({"ft.search", "i1", "upcoming presentation"}), AreDocIds("d:1"));

    Run({"hset", "d:3", "title", "Secret bumblebee research"});
    EXPECT_THAT(Run({"ft.search", "i1", "studies"}), AreDocIds("d:2"));
    EXPECT_THAT(Run({"ft.search", "i1", "secret research"}), AreDocIds("d:3"));
  }

  // Update visits - numeric value
  {
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[50 1000]"}), AreDocIds("d:1", "d:2"));

    Run({"hset", "d:3", "visits", "75"});
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[0 49]"}), kNoResults);
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[50 1000]"}), AreDocIds("d:1", "d:2", "d:3"));

    Run({"hset", "d:1", "visits", "125"});
    Run({"hset", "d:2", "visits", "150"});
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[100 1000]"}), AreDocIds("d:1", "d:2"));

    Run({"hset", "d:3", "visits", "175"});
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[0 100]"}), kNoResults);
    EXPECT_THAT(Run({"ft.search", "i1", "@visits:[150 1000]"}), AreDocIds("d:2", "d:3"));
  }

  // Delete documents
  {
    Run({"del", "d:2", "d:3"});
    EXPECT_THAT(Run({"ft.search", "i1", "dragonfly"}), AreDocIds("d:1"));
    EXPECT_THAT(Run({"ft.search", "i1", "butterfly | bumblebee"}), kNoResults);
  }
}

TEST_F(SearchFamilyTest, Unicode) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");

  // Explicitly using screaming uppercase to check utf-8 to lowercase functionality
  Run({"hset", "d:1", "title", "Веселая СТРЕКОЗА Иван", "visits", "400"});
  Run({"hset", "d:2", "title", "Die fröhliche Libelle Günther", "visits", "300"});
  Run({"hset", "d:3", "title", "השפירית המהירה יעקב", "visits", "200"});
  Run({"hset", "d:4", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας", "visits", "100"});

  // Check we find our dragonfly in all languages
  EXPECT_THAT(Run({"ft.search", "i1", "стРекоЗа|liBellE|השפירית|λΙβελλοΎλη"}),
              AreDocIds("d:1", "d:2", "d:3", "d:4"));

  // Check the result is valid
  auto resp = Run({"ft.search", "i1", "λιβελλούλη"});
  EXPECT_THAT(resp,
              IsMapWithSize("d:4", IsMap("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας")));

  // Repeat with tags
  Run({"ft.create", "i2", "schema", "color", "tag", "separator", "/"});

  Run({"hset", "d:5", "color", "зеЛеный/żółtY"});
  Run({"hset", "d:6", "color", "κόκκινος/Білий"});

  auto tagvals = Run({"ft.tagvals", "i2", "color"});
  EXPECT_THAT(tagvals.GetVec(), UnorderedElementsAre("зеленый", "żółty", "κόκκινος", "білий"));
  EXPECT_THAT(Run({"ft.search", "i2", "@color:{зеленый|білий}"}), AreDocIds("d:5", "d:6"));
}

TEST_F(SearchFamilyTest, UnicodeWords) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text"}), "OK");

  Run({"hset", "d:1", "title",
       "WORD!!! Одно слово? Zwei Wörter. Comma before ,sentence, "
       "Τρεις λέξεις: χελώνα-σκύλου-γάτας. !זה עובד",
       "visits", "400"});

  // Make sure it includes ALL those words
  EXPECT_THAT(Run({"ft.search", "i1", "word слово wörter sentence λέξεις γάτας עובד"}),
              AreDocIds("d:1"));
}

TEST_F(SearchFamilyTest, PrefixSuffixInfixTrie) {
  Run({"ft.create", "i1", "schema", "title", "text", "withsuffixtrie"});

  Run({"hset", "d:1", "title", "CaspIAn SeA"});
  Run({"hset", "d:2", "title", "GreAt LakEs"});
  Run({"hset", "d:3", "title", "Lake VictorIA"});
  Run({"hset", "d:4", "title", "LaKE Como"});

  EXPECT_THAT(Run({"ft.search", "i1", "*ea*"}), AreDocIds("d:1", "d:2"));
  EXPECT_THAT(Run({"ft.search", "i1", "*ia*"}), AreDocIds("d:1", "d:3"));
  EXPECT_THAT(Run({"ft.search", "i1", "lake*"}), AreDocIds("d:2", "d:3", "d:4"));
  EXPECT_THAT(Run({"ft.search", "i1", "*lake"}), AreDocIds("d:3", "d:4"));
}

struct SortTest : SearchFamilyTest, public testing::WithParamInterface<bool /* sortable */> {};

TEST_P(SortTest, BasicSort) {
  auto AreRange = [](size_t total, size_t l, size_t r, string_view prefix) {
    vector<string> out;
    for (size_t i = min(l, r); i < max(l, r); i++)
      out.push_back(absl::StrCat(prefix, i));
    if (l > r)
      reverse(out.begin(), out.end());
    return DocIds(total, out);
  };

  vector<string_view> params{"ft.create", "i1", "prefix", "1", "d:", "schema", "ord", "numeric"};
  if (GetParam())
    params.emplace_back("sortable");
  Run(params);

  size_t num_docs = 100;
  for (size_t i = 0; i < num_docs; i++)
    Run({"hset", absl::StrCat("d:", i), "ord", absl::StrCat(i)});

  // Check SORTBY in ASC and DESC mode with different LIMIT parameters
  for (int take = 17; take < 35; take += 7) {
    for (size_t i = 0; i < num_docs - take; i++)
      EXPECT_THAT(
          Run({"ft.search", "i1", "*", "SORTBY", "ord", "LIMIT", to_string(i), to_string(take)}),
          AreRange(num_docs, i, i + take, "d:"));

    for (size_t i = 0; i < num_docs - take; i++)
      EXPECT_THAT(Run({"ft.search", "i1", "*", "SORTBY", "ord", "DESC", "LIMIT", to_string(i),
                       to_string(take)}),
                  AreRange(num_docs, num_docs - i, num_docs - i - take, "d:"));
  }

  params = {"ft.create", "i2", "prefix", "1", "d2:", "schema", "name", "text"};
  if (GetParam())
    params.emplace_back("sortable");
  Run(params);

  absl::InsecureBitGen gen;
  vector<string> random_strs;
  for (size_t i = 0; i < 10; i++)
    random_strs.emplace_back(GetRandomHex(gen, 7));
  sort(random_strs.begin(), random_strs.end());

  for (size_t i = 0; i < 10; i++)
    Run({"hset", absl::StrCat("d2:", i), "name", random_strs[i]});

  for (size_t i = 0; i < 7; i++)
    EXPECT_THAT(Run({"ft.search", "i2", "*", "SORTBY", "name", "DESC", "LIMIT", to_string(i), "3"}),
                AreRange(10, 10 - i, 10 - i - 3, "d2:"));
}

INSTANTIATE_TEST_SUITE_P(Sortable, SortTest, testing::Values(true));
INSTANTIATE_TEST_SUITE_P(NotSortable, SortTest, testing::Values(false));

TEST_F(SearchFamilyTest, FtProfile) {
  Run({"ft.create", "i1", "schema", "name", "text"});

  auto resp = Run({"ft.profile", "i1", "search", "query", "(a | b) c d"});
  ASSERT_ARRAY_OF_TWO_ARRAYS(resp);

  const auto& top_level = resp.GetVec();
  EXPECT_THAT(top_level[0], IsMapWithSize());

  const auto& profile_result = top_level[1].GetVec();
  EXPECT_EQ(profile_result.size(), shard_set->size() + 1);

  EXPECT_THAT(profile_result[0].GetVec(), ElementsAre("took", _, "hits", _, "serialized", _));

  for (size_t sid = 0; sid < shard_set->size(); sid++) {
    const auto& shard_resp = profile_result[sid + 1].GetVec();
    EXPECT_THAT(shard_resp, ElementsAre("took", _, "tree", _));

    const auto& tree = shard_resp[3].GetVec();
    EXPECT_EQ(tree[3].GetString() /* operation */, "Logical{n=3,o=and}"s);
    EXPECT_GT(tree[1].GetInt() /* total time*/, tree[5].GetInt() /* self time */);
    EXPECT_EQ(tree[7].GetInt() /* processed */, 0);
  }

  // Test LIMITED throws no errors
  resp = Run({"ft.profile", "i1", "search", "limited", "query", "(a | b) c d"});
  ASSERT_ARRAY_OF_TWO_ARRAYS(resp);
}

TEST_F(SearchFamilyTest, FtProfileInvalidQuery) {
  Run({"json.set", "j1", ".", R"({"id":"1"})"});
  Run({"ft.create", "i1", "on", "json", "schema", "$.id", "as", "id", "tag"});

  auto resp = Run({"ft.profile", "i1", "search", "query", "@id:[1 1]"});
  ASSERT_ARRAY_OF_TWO_ARRAYS(resp);

  EXPECT_THAT(resp.GetVec()[0], IsMapWithSize());

  resp = Run({"ft.profile", "i1", "search", "query", "@{invalid13289}"});
  EXPECT_THAT(resp, ErrArg("query syntax error"));
}

TEST_F(SearchFamilyTest, FtProfileErrorReply) {
  Run({"ft.create", "i1", "schema", "name", "text"});

  auto resp = Run({"ft.profile", "i1", "not_search", "query", "(a | b) c d"});
  EXPECT_THAT(resp, ErrArg("no `SEARCH` or `AGGREGATE` provided"));

  resp = Run({"ft.profile", "i1", "search", "not_query", "(a | b) c d"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  resp = Run({"ft.profile", "non_existent_key", "search", "query", "(a | b) c d"});
  EXPECT_THAT(resp, ErrArg("non_existent_key: no such index"));
}

TEST_F(SearchFamilyTest, SimpleExpiry) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");

  Run({"hset", "d:1", "title", "never to expire", "expires-in", "100500"});

  Run({"hset", "d:2", "title", "first to expire", "expires-in", "50"});
  Run({"pexpire", "d:2", "50"});

  Run({"hset", "d:3", "title", "second to expire", "expires-in", "100"});
  Run({"pexpire", "d:3", "100"});

  EXPECT_THAT(Run({"ft.search", "i1", "*"}), AreDocIds("d:1", "d:2", "d:3"));

  AdvanceTime(60);
  ThisFiber::SleepFor(5ms);  // Give heartbeat time to delete expired doc
  EXPECT_THAT(Run({"ft.search", "i1", "*"}), AreDocIds("d:1", "d:3"));

  AdvanceTime(60);
  Run({"HGETALL", "d:3"});  // Trigger expiry by access
  EXPECT_THAT(Run({"ft.search", "i1", "*"}), AreDocIds("d:1"));

  Run({"flushall"});
}

TEST_F(SearchFamilyTest, DocsEditing) {
  auto resp = Run({"JSON.SET", "k1", ".", R"({"a":"1"})"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.a", "AS", "a", "TEXT"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, IsMapWithSize("k1", IsMap("$", R"({"a":"1"})")));

  // Test dump and restore
  resp = Run({"DUMP", "k1"});
  auto dump = resp.GetBuf();

  resp = Run({"DEL", "k1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"RESTORE", "k1", "0", ToSV(dump)});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, IsMapWithSize("k1", IsMap("$", R"({"a":"1"})")));

  // Test renaming a key
  EXPECT_EQ(Run({"RENAME", "k1", "new_k1"}), "OK");

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, IsMapWithSize("new_k1", IsMap("$", R"({"a":"1"})")));

  EXPECT_EQ(Run({"RENAME", "new_k1", "k1"}), "OK");

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, IsMapWithSize("k1", IsMap("$", R"({"a":"1"})")));
}

TEST_F(SearchFamilyTest, AggregateGroupBy) {
  auto resp = Run(
      {"ft.create", "i1", "ON", "HASH", "SCHEMA", "word", "TAG", "foo", "NUMERIC", "text", "TEXT"});
  EXPECT_EQ(resp, "OK");

  Run({"hset", "key:1", "word", "item1", "foo", "10", "text", "\"first key\"", "non_indexed_value",
       "1"});
  Run({"hset", "key:2", "word", "item2", "foo", "20", "text", "\"second key\"", "non_indexed_value",
       "2"});
  Run({"hset", "key:3", "word", "item1", "foo", "40", "text", "\"third key\"", "non_indexed_value",
       "3"});

  resp = Run(
      {"ft.aggregate", "i1", "*", "GROUPBY", "1", "@word", "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("count", "2", "word", "item1"),
                                         IsMap("word", "item2", "count", "1")));

  resp = Run({"ft.aggregate", "i1", "*", "GROUPBY", "1", "@word", "REDUCE", "SUM", "1", "@foo",
              "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("foo_total", "50", "word", "item1"),
                                         IsMap("foo_total", "20", "word", "item2")));

  resp = Run({"ft.aggregate", "i1", "*", "GROUPBY", "1", "@word", "REDUCE", "AVG", "1", "@foo",
              "AS", "foo_average"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("foo_average", "20", "word", "item2"),
                                         IsMap("foo_average", "25", "word", "item1")));

  resp = Run({"ft.aggregate", "i1", "*", "GROUPBY", "2", "@word", "@text", "REDUCE", "SUM", "1",
              "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(
                        IsMap("foo_total", "10", "word", "item1", "text", "\"first key\""),
                        IsMap("foo_total", "40", "word", "item1", "text", "\"third key\""),
                        IsMap("foo_total", "20", "word", "item2", "text", "\"second key\"")));

  resp = Run({"ft.aggregate", "i1", "*", "LOAD", "2", "foo", "word", "GROUPBY", "1", "@word",
              "REDUCE", "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("foo_total", "20", "word", "item2"),
                                         IsMap("foo_total", "50", "word", "item1")));

  resp = Run({"ft.aggregate", "i1", "*", "LOAD", "2", "foo", "text", "GROUPBY", "2", "@word",
              "@text", "REDUCE", "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(
                        IsMap("foo_total", "40", "word", "item1", "text", "\"third key\""),
                        IsMap("foo_total", "20", "word", "item2", "text", "\"second key\""),
                        IsMap("foo_total", "10", "word", "item1", "text", "\"first key\"")));
}

TEST_F(SearchFamilyTest, JsonAggregateGroupBy) {
  auto resp =
      Run({"FT.CREATE", "json_index", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT",
           "$.price", "AS", "price", "NUMERIC", "$.quantity", "AS", "quantity", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  Run({"JSON.SET", "product:1", "$", R"({"name": "Product A", "price": 10, "quantity": 2})"});
  Run({"JSON.SET", "product:2", "$", R"({"name": "Product B", "price": 20, "quantity": 3})"});
  Run({"JSON.SET", "product:3", "$", R"({"name": "Product C", "price": 30, "quantity": 5})"});

  resp = Run({"FT.AGGREGATE", "json_index", "*", "GROUPBY", "0", "REDUCE", "SUM", "1", "price",
              "AS", "total_price"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("total_price", "60")));

  resp = Run({"FT.AGGREGATE", "json_index", "*", "GROUPBY", "0", "REDUCE", "AVG", "1", "price",
              "AS", "avg_price"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("avg_price", "20")));
}

TEST_F(SearchFamilyTest, JsonAggregateGroupByWithoutAtSign) {
  auto resp =
      Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "group", "TAG", "value", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  absl::FlagSaver fs;
  Run({"HSET", "h1", "group", "first", "value", "1"});
  Run({"HSET", "h2", "group", "second", "value", "2"});
  Run({"HSET", "h3", "group", "first", "value", "3"});

  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  resp = Run({"FT.AGGREGATE", "index", "*", "GROUPBY", "1", "group", "REDUCE", "COUNT", "0", "AS",
              "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("count", "2", "group", "first"),
                                         IsMap("group", "second", "count", "1")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);
  resp = Run({"FT.AGGREGATE", "index", "*", "GROUPBY", "1", "group", "REDUCE", "COUNT", "0", "AS",
              "count"});
  EXPECT_THAT(resp, ErrArg("bad arguments: Field name should start with '@'"));
}

TEST_F(SearchFamilyTest, AggregateGroupByReduceSort) {
  Run({"ft.create", "i1", "schema", "even", "tag", "sortable", "value", "numeric", "sortable"});
  for (size_t i = 0; i < 101; i++) {  // 51 even, 50 odd
    Run({"hset", absl::StrCat("k", i), "even", (i % 2 == 0) ? "true" : "false", "value",
         absl::StrCat(i)});
  }

  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  // clang-format off
  auto resp = Run({"ft.aggregate", "i1", "*",
                  "GROUPBY", "1", "@even",
                      "REDUCE", "count", "0", "as", "count",
                      "REDUCE", "count_distinct", "1", "even", "as", "distinct_tags",
                      "REDUCE", "count_distinct", "1", "value", "as", "distinct_vals",
                      "REDUCE", "max", "1", "value", "as", "max_val",
                      "REDUCE", "min", "1", "value", "as", "min_val",
                  "SORTBY", "1", "count"});
  // clang-format on

  EXPECT_THAT(resp,
              IsUnordArrayWithSize(IsMap("even", "false", "count", "50", "distinct_tags", "1",
                                         "distinct_vals", "50", "max_val", "99", "min_val", "1"),
                                   IsMap("even", "true", "count", "51", "distinct_tags", "1",
                                         "distinct_vals", "51", "max_val", "100", "min_val", "0")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);
  // clang-format off
  resp = Run({"ft.aggregate", "i1", "*",
                  "GROUPBY", "1", "@even",
                      "REDUCE", "count", "0", "as", "count",
                      "REDUCE", "count_distinct", "1", "even", "as", "distinct_tags",
                      "REDUCE", "count_distinct", "1", "value", "as", "distinct_vals",
                      "REDUCE", "max", "1", "value", "as", "max_val",
                      "REDUCE", "min", "1", "value", "as", "min_val",
                  "SORTBY", "1", "count"});
  // clang-format on

  EXPECT_THAT(resp, ErrArg("SORTBY field name 'count' must start with '@'"));
}

TEST_F(SearchFamilyTest, AggregateLoadGroupBy) {
  for (size_t i = 0; i < 101; i++) {  // 51 even, 50 odd
    Run({"hset", absl::StrCat("k", i), "even", (i % 2 == 0) ? "true" : "false", "value",
         absl::StrCat(i)});
  }
  Run({"ft.create", "i1", "schema", "value", "numeric", "sortable"});

  // clang-format off
  auto resp = Run({"ft.aggregate", "i1", "*",
                  "LOAD", "1", "even",
                  "GROUPBY", "1", "@even"});
  // clang-format on

  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("even", "false"), IsMap("even", "true")));
}

TEST_F(SearchFamilyTest, AggregateLoad) {
  Run({"hset", "key:1", "word", "item1", "foo", "10"});
  Run({"hset", "key:2", "word", "item2", "foo", "20"});
  Run({"hset", "key:3", "word", "item1", "foo", "30"});

  auto resp = Run({"ft.create", "index", "ON", "HASH", "SCHEMA", "word", "TAG", "foo", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  // ft.aggregate index "*" LOAD 1 @word LOAD 1 @foo
  resp = Run({"ft.aggregate", "index", "*", "LOAD", "1", "@word", "LOAD", "1", "@foo"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("word", "item1", "foo", "30"),
                                         IsMap("word", "item2", "foo", "20"),
                                         IsMap("word", "item1", "foo", "10")));

  // ft.aggregate index "*" GROUPBY 1 @word REDUCE SUM 1 @foo AS foo_total LOAD 1 foo_total
  resp = Run({"ft.aggregate", "index", "*", "GROUPBY", "1", "@word", "REDUCE", "SUM", "1", "@foo",
              "AS", "foo_total", "LOAD", "1", "foo_total"});
  EXPECT_THAT(resp, ErrArg("LOAD cannot be applied after projectors or reducers"));
}

TEST_F(SearchFamilyTest, Vector) {
  auto resp = Run({"ft.create", "ann", "ON", "HASH", "SCHEMA", "vector", "VECTOR", "HNSW", "8",
                   "TYPE", "FLOAT32", "DIM", "100", "distance_metric", "cosine", "M", "64"});
  EXPECT_EQ(resp, "OK");
}

TEST_F(SearchFamilyTest, EscapedSymbols) {
  Run({"ft.create", "i1", "ON", "HASH", "SCHEMA", "color", "tag"});

  // TODO ',' is separator, we need to check should next request work or not
  // In redis it works for JSON but not for HASH
  // Run({"hset", "i1", "color", R"(blue,1\$+)"});
  // EXPECT_THAT(Run({"ft.search", "i1", R"(@color:{blue\,1\\\$\+})"}), AreDocIds("i1"));
  // EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue.1\"%="});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\.1\\\"\\%\\=}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue<1'^~"});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\<1\\'\\^\\~}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue>1:&/"});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\>1\\:\\&\\/}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue{1;* "});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\{1\\;\\*\\ }"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue}1!("});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\}1\\!\\(}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue[1@)"});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\[1\\@\\)}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);

  Run({"hset", "i1", "color", "blue]1#-"});
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue\\]1\\#\\-}"}), AreDocIds("i1"));
  EXPECT_THAT(Run({"ft.search", "i1", "@color:{blue}"}), kNoResults);
}

TEST_F(SearchFamilyTest, FlushSearchIndices) {
  auto resp =
      Run({"FT.CREATE", "json", "ON", "JSON", "SCHEMA", "$.nested.value", "AS", "value", "TEXT"});
  EXPECT_EQ(resp, "OK");

  EXPECT_EQ(Run({"FLUSHALL"}), "OK");

  // Test that the index was removed
  resp = Run({"FT.CREATE", "json", "ON", "JSON", "SCHEMA", "$.another.nested.value", "AS", "value",
              "TEXT"});
  EXPECT_EQ(resp, "OK");

  EXPECT_EQ(Run({"FLUSHDB"}), "OK");

  // Test that the index was removed
  resp = Run({"FT.CREATE", "json", "ON", "JSON", "SCHEMA", "$.another.nested.value", "AS", "value",
              "TEXT"});
  EXPECT_EQ(resp, "OK");

  EXPECT_EQ(Run({"select", "1"}), "OK");
  EXPECT_EQ(Run({"FLUSHDB"}), "OK");
  EXPECT_EQ(Run({"select", "0"}), "OK");

  // Test that index was not removed
  resp = Run({"FT.CREATE", "json", "ON", "JSON", "SCHEMA", "$.another.nested.value", "AS", "value",
              "TEXT"});
  EXPECT_THAT(resp, ErrArg("ERR Index already exists"));
}

TEST_F(SearchFamilyTest, AggregateWithLoadOptionHard) {
  // Test HASH
  Run({"HSET", "h1", "word", "item1", "foo", "10", "text", "first key"});
  Run({"HSET", "h2", "word", "item2", "foo", "20", "text", "second key"});

  auto resp = Run(
      {"FT.CREATE", "i1", "ON", "HASH", "SCHEMA", "word", "TAG", "foo", "NUMERIC", "text", "TEXT"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "2", "foo", "text", "GROUPBY", "2", "@word",
              "@text", "REDUCE", "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp,
              IsUnordArrayWithSize(IsMap("foo_total", "20", "word", "item2", "text", "second key"),
                                   IsMap("foo_total", "10", "word", "item1", "text", "first key")));

  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "1", "@word", "GROUPBY", "1", "@word", "REDUCE",
              "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("foo_total", "20", "word", "item2"),
                                         IsMap("foo_total", "10", "word", "item1")));

  resp = Run({"FT.CREATE", "i2", "ON", "JSON", "SCHEMA", "$.word", "AS", "word", "TAG", "$.foo",
              "AS", "foo", "NUMERIC", "$.text", "AS", "text", "TEXT"});
  EXPECT_EQ(resp, "OK");

  // Test JSON
  Run({"JSON.SET", "j1", ".", R"({"word":"item1","foo":10,"text":"first key"})"});
  Run({"JSON.SET", "j2", ".", R"({"word":"item2","foo":20,"text":"second key"})"});

  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "2", "foo", "text", "GROUPBY", "2", "@word",
              "@text", "REDUCE", "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp,
              IsUnordArrayWithSize(IsMap("foo_total", "20", "word", "item2", "text", "second key"),
                                   IsMap("foo_total", "10", "word", "item1", "text", "first key")));

  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "1", "@word", "GROUPBY", "1", "@word", "REDUCE",
              "SUM", "1", "@foo", "AS", "foo_total"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("foo_total", "20", "word", "item2"),
                                         IsMap("foo_total", "10", "word", "item1")));
}

TEST_F(SearchFamilyTest, WrongFieldTypeJson) {
  EXPECT_EQ(Run({"FT.CREATE", "i1", "ON", "JSON", "SCHEMA", "$.value", "AS", "value", "NUMERIC",
                 "SORTABLE"}),
            "OK");

  EXPECT_EQ(Run({"FT.CREATE", "i2", "ON", "JSON", "SCHEMA", "$.value", "AS", "value", "NUMERIC"}),
            "OK");

  auto resp =
      Run({"FT.CREATE", "i3", "ON", "JSON", "SCHEMA", "$.arr[*].id", "AS", "id", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i4", "ON", "JSON", "SCHEMA", "$.arr[*].id", "AS", "id", "NUMERIC",
              "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  // Test simple
  Run({"JSON.SET", "j1", ".", R"({"value":"one"})"});
  Run({"JSON.SET", "j2", ".", R"({"value":1})"});

  resp = Run({"FT.SEARCH", "i1", "*"});
  EXPECT_THAT(resp, AreDocIds("j2"));

  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "1", "$.value"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("$.value", "1")));

  // Test with two fields. One is loading
  Run({"JSON.SET", "j3", ".", R"({"value":"two","another_value":1})"});
  Run({"JSON.SET", "j4", ".", R"({"value":2,"another_value":2})"});

  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "2", "$.value", "$.another_value", "GROUPBY", "2",
              "$.value", "$.another_value", "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp,
              IsUnordArrayWithSize(
                  IsMap("$.value", "1", "$.another_value", ArgType(RespExpr::NIL), "count", "1"),
                  IsMap("$.value", "2", "$.another_value", "2", "count", "1")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);

  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "2", "$.value", "$.another_value", "GROUPBY", "2",
              "$.value", "$.another_value", "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, ErrArg("bad arguments: Field name should start with '@'"));

  // Test multiple field values
  Run({"JSON.SET", "j5", ".", R"({"arr":[{"id":1},{"id":"two"}]})"});
  Run({"JSON.SET", "j6", ".", R"({"arr":[{"id":1},{"id":2}]})"});
  Run({"JSON.SET", "j7", ".", R"({"arr":[]})"});

  resp = Run({"FT.SEARCH", "i3", "*"});
  EXPECT_THAT(resp, AreDocIds("j1", "j2", "j3", "j4", "j6", "j7"));  // Only j5 fails

  resp = Run({"FT.SEARCH", "i4", "*"});
  EXPECT_THAT(resp, AreDocIds("j1", "j2", "j3", "j4", "j6", "j7"));  // Only j5 fails
}

TEST_F(SearchFamilyTest, WrongFieldTypeHash) {
  EXPECT_EQ(Run({"FT.CREATE", "i1", "ON", "HASH", "SCHEMA", "value", "NUMERIC", "SORTABLE"}), "OK");
  EXPECT_EQ(Run({"FT.CREATE", "i2", "ON", "HASH", "SCHEMA", "value", "NUMERIC"}), "OK");

  // Test simple
  Run({"HSET", "h1", "value", "one"});
  Run({"HSET", "h2", "value", "1"});

  auto resp = Run({"FT.SEARCH", "i1", "*"});
  EXPECT_THAT(resp, IsMapWithSize("h2", IsMap("value", "1")));

  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "1", "@value"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("value", "1")));

  // Test with two fields. One is loading
  Run({"HSET", "h3", "value", "two", "another_value", "1"});
  Run({"HSET", "h4", "value", "2", "another_value", "2"});

  resp = Run({"FT.SEARCH", "i2", "*", "LOAD", "1", "@another_value"});
  EXPECT_THAT(resp, IsMapWithSize("h2", IsMap("value", "1"), "h4",
                                  IsMap("value", "2", "another_value", "2")));

  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "2", "@value", "@another_value", "GROUPBY", "2",
              "@value", "@another_value", "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(
                        IsMap("value", "1", "another_value", ArgType(RespExpr::NIL), "count", "1"),
                        IsMap("value", "2", "another_value", "2", "count", "1")));
}

TEST_F(SearchFamilyTest, WrongFieldTypeHardJson) {
  auto resp = Run({"FT.CREATE", "i1", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  resp = Run(
      {"FT.CREATE", "i2", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "NUMERIC", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i3", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "TAG"});
  EXPECT_EQ(resp, "OK");

  resp =
      Run({"FT.CREATE", "i4", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "TAG", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i5", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "TEXT"});
  EXPECT_EQ(resp, "OK");

  resp =
      Run({"FT.CREATE", "i6", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "TEXT", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i7", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "VECTOR", "FLAT",
              "6", "TYPE", "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "L2"});
  EXPECT_EQ(resp, "OK");

  Run({"JSON.SET", "j1", ".", R"({"data":1,"name":"doc_with_int"})"});
  Run({"JSON.SET", "j2", ".", R"({"data":"1","name":"doc_with_int_as_string"})"});
  Run({"JSON.SET", "j3", ".", R"({"data":"string","name":"doc_with_string"})"});
  Run({"JSON.SET", "j4", ".",
       R"({"data":["first", "second", "third"],"name":"doc_with_strings"})"});
  Run({"JSON.SET", "j5", ".", R"({"name":"no_data"})"});
  Run({"JSON.SET", "j6", ".", R"({"data":[5,4,3],"name":"doc_with_vector"})"});
  Run({"JSON.SET", "j7", ".", R"({"data":"[5,4,3]","name":"doc_with_vector_as_string"})"});
  Run({"JSON.SET", "j8", ".", R"({"data":null,"name":"doc_with_null"})"});
  Run({"JSON.SET", "j9", ".", R"({"data":[null, null, null],"name":"doc_with_nulls"})"});
  Run({"JSON.SET", "j10", ".", R"({"data":true,"name":"doc_with_boolean"})"});
  Run({"JSON.SET", "j11", ".", R"({"data":[true, false, true],"name":"doc_with_booleans"})"});

  resp = Run({"FT.SEARCH", "i1", "*"});
  EXPECT_THAT(resp, AreDocIds("j1", "j5", "j6", "j8", "j9"));

  resp = Run({"FT.SEARCH", "i2", "*"});
  EXPECT_THAT(resp, AreDocIds("j1", "j5", "j6", "j8", "j9"));

  resp = Run({"FT.SEARCH", "i3", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9", "j10", "j11"));

  resp = Run({"FT.SEARCH", "i4", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9", "j10", "j11"));

  resp = Run({"FT.SEARCH", "i5", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9"));

  resp = Run({"FT.SEARCH", "i6", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9"));

  resp = Run({"FT.SEARCH", "i7", "*"});
  EXPECT_THAT(resp, AreDocIds("j5", "j6", "j8"));
}

TEST_F(SearchFamilyTest, WrongFieldTypeHardHash) {
  auto resp = Run({"FT.CREATE", "i1", "ON", "HASH", "SCHEMA", "data", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i2", "ON", "HASH", "SCHEMA", "data", "NUMERIC", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i3", "ON", "HASH", "SCHEMA", "data", "TAG"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i4", "ON", "HASH", "SCHEMA", "data", "TAG", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i5", "ON", "HASH", "SCHEMA", "data", "TEXT"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i6", "ON", "HASH", "SCHEMA", "data", "TEXT", "SORTABLE"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i7", "ON", "HASH", "SCHEMA", "data", "VECTOR", "FLAT", "6", "TYPE",
              "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "L2"});
  EXPECT_EQ(resp, "OK");

  Run({"HSET", "j1", "data", "1", "name", "doc_with_int"});
  Run({"HSET", "j2", "data", "1", "name", "doc_with_int_as_string"});
  Run({"HSET", "j3", "data", "string", "name", "doc_with_string"});
  Run({"HSET", "j4", "name", "no_data"});
  Run({"HSET", "j5", "data", "5,4,3", "name", "doc_with_fake_vector"});
  Run({"HSET", "j6", "data", "[5,4,3]", "name", "doc_with_fake_vector_as_string"});

  // Vector [1, 2, 3]
  std::string vector = std::string("\x3f\x80\x00\x00\x40\x00\x00\x00\x40\x40\x00\x00", 12);
  Run({"HSET", "j7", "data", vector, "name", "doc_with_vector [1, 2, 3]"});

  resp = Run({"FT.SEARCH", "i1", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j1", "j4"));

  resp = Run({"FT.SEARCH", "i2", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j1", "j4"));

  resp = Run({"FT.SEARCH", "i3", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j7", "j3", "j6", "j1", "j4", "j5"));

  resp = Run({"FT.SEARCH", "i4", "*"});
  EXPECT_THAT(resp, AreDocIds("j2", "j7", "j3", "j6", "j1", "j4", "j5"));

  resp = Run({"FT.SEARCH", "i5", "*"});
  EXPECT_THAT(resp, AreDocIds("j4", "j2", "j7", "j3", "j6", "j1", "j5"));

  resp = Run({"FT.SEARCH", "i6", "*"});
  EXPECT_THAT(resp, AreDocIds("j4", "j2", "j7", "j3", "j6", "j1", "j5"));

  resp = Run({"FT.SEARCH", "i7", "*"});
  EXPECT_THAT(resp, AreDocIds("j4", "j7"));
}

TEST_F(SearchFamilyTest, WrongVectorFieldType) {
  auto resp =
      Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.vector_field", "AS", "vector_field",
           "VECTOR", "FLAT", "6", "TYPE", "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "L2"});
  EXPECT_EQ(resp, "OK");

  Run({"JSON.SET", "j1", ".",
       R"({"vector_field": [0.1, 0.2, 0.3], "name": "doc_with_correct_dim"})"});
  Run({"JSON.SET", "j2", ".", R"({"vector_field": [0.1, 0.2], "name": "doc_with_small_dim"})"});
  Run({"JSON.SET", "j3", ".",
       R"({"vector_field": [0.1, 0.2, 0.3, 0.4], "name": "doc_with_large_dim"})"});
  Run({"JSON.SET", "j4", ".", R"({"vector_field": [1, 2, 3], "name": "doc_with_int_values"})"});
  Run({"JSON.SET", "j5", ".",
       R"({"vector_field":"not_vector", "name":"doc_with_incorrect_field_type"})"});
  Run({"JSON.SET", "j6", ".", R"({"name":"doc_with_no_field"})"});
  Run({"JSON.SET", "j7", ".",
       R"({"vector_field": [999999999999999999999999999999999999999, -999999999999999999999999999999999999999, 500000000000000000000000000000000000000], "name": "doc_with_out_of_range_values"})"});
  Run({"JSON.SET", "j8", ".", R"({"vector_field":null, "name": "doc_with_null"})"});
  Run({"JSON.SET", "j9", ".", R"({"vector_field":[null, null, null], "name": "doc_with_nulls"})"});
  Run({"JSON.SET", "j10", ".", R"({"vector_field":true, "name": "doc_with_boolean"})"});
  Run({"JSON.SET", "j11", ".",
       R"({"vector_field":[true, false, true], "name": "doc_with_booleans"})"});
  Run({"JSON.SET", "j12", ".", R"({"vector_field":1, "name": "doc_with_int"})"});

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, AreDocIds("j6", "j7", "j1", "j4", "j8"));
}

// Test that FT.AGGREGATE prints only needed fields
TEST_F(SearchFamilyTest, AggregateResultFields) {
  auto resp = Run({"FT.CREATE", "i1", "ON", "JSON", "SCHEMA", "$.a", "AS", "a", "TEXT", "SORTABLE",
                   "$.b", "AS", "b", "TEXT", "$.c", "AS", "c", "TEXT"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.CREATE", "i2", "ON", "JSON", "SCHEMA", "$.id", "AS", "id", "NUMERIC", "$.number",
              "AS", "number", "NUMERIC"});
  EXPECT_EQ(resp, "OK");

  Run({"JSON.SET", "j1", ".", R"({"a":"1","b":"2","c":"3"})"});
  Run({"JSON.SET", "j2", ".", R"({"a":"4","b":"5","c":"6"})"});
  Run({"JSON.SET", "j3", ".", R"({"a":"7","b":"8","c":"9"})"});

  resp = Run({"FT.AGGREGATE", "i1", "*"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap(), IsMap(), IsMap()));

  absl::FlagSaver fs;
  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  resp = Run({"FT.AGGREGATE", "i1", "*", "SORTBY", "1", "a"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("a", "1"), IsMap("a", "4"), IsMap("a", "7")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);
  resp = Run({"FT.AGGREGATE", "i1", "*", "SORTBY", "1", "a"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'a' must start with '@'"));

  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "1", "@b", "SORTBY", "1", "a"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("b", "2", "a", "1"), IsMap("b", "5", "a", "4"),
                                         IsMap("b", "8", "a", "7")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);
  resp = Run({"FT.AGGREGATE", "i1", "*", "LOAD", "1", "@b", "SORTBY", "1", "a"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'a' must start with '@'"));

  absl::SetFlag(&FLAGS_search_reject_legacy_field, false);
  resp = Run({"FT.AGGREGATE", "i1", "*", "SORTBY", "1", "a", "GROUPBY", "2", "@b", "@a", "REDUCE",
              "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("b", "8", "a", "7", "count", "1"),
                                         IsMap("b", "2", "a", "1", "count", "1"),
                                         IsMap("b", "5", "a", "4", "count", "1")));
  absl::SetFlag(&FLAGS_search_reject_legacy_field, true);
  resp = Run({"FT.AGGREGATE", "i1", "*", "SORTBY", "1", "a", "GROUPBY", "2", "@b", "@a", "REDUCE",
              "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'a' must start with '@'"));

  Run({"JSON.SET", "j4", ".", R"({"id":1, "number":4})"});
  Run({"JSON.SET", "j5", ".", R"({"id":2})"});

  resp = Run({"FT.AGGREGATE", "i2", "*", "LOAD", "2", "@id", "@number"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("id", "1", "number", "4"), IsMap("id", "2"), IsMap(),
                                         IsMap(), IsMap()));
}

TEST_F(SearchFamilyTest, AggregateSortByJson) {
  Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT", "$.number",
       "AS", "number", "NUMERIC", "$.group", "AS", "group", "TAG"});
  Run({"JSON.SET", "j1", "$", R"({"name": "first", "number": 1200, "group": "first"})"});
  Run({"JSON.SET", "j2", "$", R"({"name": "second", "number": 800, "group": "first"})"});
  Run({"JSON.SET", "j3", "$", R"({"name": "third", "number": 300, "group": "first"})"});
  Run({"JSON.SET", "j4", "$", R"({"name": "fourth", "number": 400, "group": "second"})"});
  Run({"JSON.SET", "j5", "$", R"({"name": "fifth", "number": 900, "group": "second"})"});
  Run({"JSON.SET", "j6", "$", R"({"name": "sixth", "number": 300, "group": "first"})"});
  Run({"JSON.SET", "j7", "$", R"({"name": "seventh", "number": 400, "group": "second"})"});
  Run({"JSON.SET", "j8", "$", R"({"name": "eighth", "group": "first"})"});
  Run({"JSON.SET", "j9", "$", R"({"name": "ninth", "group": "second"})"});

  // Test sorting by name (DESC) and number (ASC)
  auto resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "4", "@name", "DESC", "@number", "ASC"});
  EXPECT_THAT(
      resp, IsUnordArrayWithSize(
                IsMap("name", "third", "number", "300"), IsMap("name", "sixth", "number", "300"),
                IsMap("name", "seventh", "number", "400"), IsMap("name", "second", "number", "800"),
                IsMap("name", "ninth"), IsMap("name", "fourth", "number", "400"),
                IsMap("name", "first", "number", "1200"), IsMap("name", "fifth", "number", "900"),
                IsMap("name", "eighth")));

  // Test sorting by name (ASC) and number (DESC)
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "4", "@name", "ASC", "@number", "DESC"});
  EXPECT_THAT(
      resp, IsUnordArrayWithSize(
                IsMap("name", "eighth"), IsMap("name", "fifth", "number", "900"),
                IsMap("name", "first", "number", "1200"), IsMap("name", "fourth", "number", "400"),
                IsMap("name", "ninth"), IsMap("name", "second", "number", "800"),
                IsMap("name", "seventh", "number", "400"), IsMap("name", "sixth", "number", "300"),
                IsMap("name", "third", "number", "300")));

  // Test sorting by group (ASC), number (DESC), and name
  resp = Run(
      {"FT.AGGREGATE", "index", "*", "SORTBY", "5", "@group", "ASC", "@number", "DESC", "@name"});
  EXPECT_THAT(resp,
              IsUnordArrayWithSize(IsMap("group", "first", "number", "1200", "name", "first"),
                                   IsMap("group", "first", "number", "800", "name", "second"),
                                   IsMap("group", "first", "number", "300", "name", "sixth"),
                                   IsMap("group", "first", "number", "300", "name", "third"),
                                   IsMap("group", "first", "name", "eighth"),
                                   IsMap("group", "second", "number", "900", "name", "fifth"),
                                   IsMap("group", "second", "number", "400", "name", "fourth"),
                                   IsMap("group", "second", "number", "400", "name", "seventh"),
                                   IsMap("group", "second", "name", "ninth")));

  // Test sorting by number (ASC), group (DESC), and name
  resp = Run(
      {"FT.AGGREGATE", "index", "*", "SORTBY", "5", "@number", "ASC", "@group", "DESC", "@name"});
  EXPECT_THAT(resp,
              IsUnordArrayWithSize(IsMap("number", "300", "group", "first", "name", "sixth"),
                                   IsMap("number", "300", "group", "first", "name", "third"),
                                   IsMap("number", "400", "group", "second", "name", "fourth"),
                                   IsMap("number", "400", "group", "second", "name", "seventh"),
                                   IsMap("number", "800", "group", "first", "name", "second"),
                                   IsMap("number", "900", "group", "second", "name", "fifth"),
                                   IsMap("number", "1200", "group", "first", "name", "first"),
                                   IsMap("group", "second", "name", "ninth"),
                                   IsMap("group", "first", "name", "eighth")));

  // Test sorting with MAX 3
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@number", "MAX", "3"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("number", "300"), IsMap("number", "300"),
                                         IsMap("number", "400")));

  // Test sorting with MAX 3
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "2", "@number", "DESC", "MAX", "3"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("number", "1200"), IsMap("number", "900"),
                                         IsMap("number", "800")));

  // Test sorting by number (ASC) with MAX 999
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@number", "MAX", "999"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("number", "300"), IsMap("number", "300"),
                                         IsMap("number", "400"), IsMap("number", "400"),
                                         IsMap("number", "800"), IsMap("number", "900"),
                                         IsMap("number", "1200"), IsMap(), IsMap()));

  // Test sorting by name and number (DESC)
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "3", "@name", "@number", "DESC"});
  EXPECT_THAT(
      resp, IsUnordArrayWithSize(
                IsMap("name", "eighth"), IsMap("name", "fifth", "number", "900"),
                IsMap("name", "first", "number", "1200"), IsMap("name", "fourth", "number", "400"),
                IsMap("name", "ninth"), IsMap("name", "second", "number", "800"),
                IsMap("name", "seventh", "number", "400"), IsMap("name", "sixth", "number", "300"),
                IsMap("name", "third", "number", "300")));

  // Test SORTBY with MAX, GROUPBY, and REDUCE COUNT
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@name", "MAX", "3", "GROUPBY", "1",
              "@number", "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("number", "900", "count", "1"),
                                         IsMap("number", ArgType(RespExpr::NIL), "count", "1"),
                                         IsMap("number", "1200", "count", "1")));

  // Test SORTBY with MAX, GROUPBY (0 fields), and REDUCE COUNT
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@name", "MAX", "3", "GROUPBY", "0",
              "REDUCE", "COUNT", "0", "AS", "count"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("count", "3")));
}

TEST_F(SearchFamilyTest, AggregateSortByParsingErrors) {
  Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT", "$.number",
       "AS", "number", "NUMERIC", "$.group", "AS", "group", "TAG"});
  Run({"JSON.SET", "j1", "$", R"({"name": "first", "number": 1200, "group": "first"})"});
  Run({"JSON.SET", "j2", "$", R"({"name": "second", "number": 800, "group": "first"})"});
  Run({"JSON.SET", "j3", "$", R"({"name": "third", "number": 300, "group": "first"})"});
  Run({"JSON.SET", "j4", "$", R"({"name": "fourth", "number": 400, "group": "second"})"});
  Run({"JSON.SET", "j5", "$", R"({"name": "fifth", "number": 900, "group": "second"})"});
  Run({"JSON.SET", "j6", "$", R"({"name": "sixth", "number": 300, "group": "first"})"});
  Run({"JSON.SET", "j7", "$", R"({"name": "seventh", "number": 400, "group": "second"})"});
  Run({"JSON.SET", "j8", "$", R"({"name": "eighth", "group": "first"})"});
  Run({"JSON.SET", "j9", "$", R"({"name": "ninth", "group": "second"})"});

  // Test SORTBY with invalid argument count
  auto resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "999", "@name", "@number", "DESC"});
  EXPECT_THAT(resp, ErrArg("bad arguments for SORTBY: specified invalid number of strings"));

  // Test SORTBY with negative argument count
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "-3", "@name", "@number", "DESC"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test MAX with invalid value
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@name", "MAX", "-10"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test MAX without a value
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@name", "MAX"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  // Test SORTBY with a non-existing field
  /* Temporary unsupported
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "@nonexistingfield"});
  EXPECT_THAT(resp, ErrArg("Property `nonexistingfield` not loaded nor in schema")); */

  // Test SORTBY with an invalid value
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "notvalue", "@name"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));
}

TEST_F(SearchFamilyTest, AggregateSortByParsingErrorsWithoutAt) {
  Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT", "$.number",
       "AS", "number", "NUMERIC", "$.group", "AS", "group", "TAG"});

  Run({"JSON.SET", "j1", "$", R"({"name": "first", "number": 1200, "group": "first"})"});

  // Test SORTBY with field name without '@'
  auto resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "name"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'name' must start with '@'"));

  // Test SORTBY with field name without '@' and multiple sort fields
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "3", "name", "@number", "DESC"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'name' must start with '@'"));

  // Test SORTBY with field name without '@' and MAX option
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "1", "name", "MAX", "1"});
  EXPECT_THAT(resp, ErrArg("SORTBY field name 'name' must start with '@'"));

  // Check that the old error still works for wrong number of args
  resp = Run({"FT.AGGREGATE", "index", "*", "SORTBY", "2", "@name"});
  EXPECT_THAT(resp, ErrArg("bad arguments for SORTBY: specified invalid number of strings"));
}

TEST_F(SearchFamilyTest, InvalidSearchOptions) {
  Run({"FT.CREATE", "idx", "ON", "JSON", "SCHEMA", "$.field1", "AS", "field1", "TEXT", "$.field2",
       "AS", "field2", "TEXT"});

  Run({"JSON.SET", "j1", ".", R"({"field1":"first","field2":"second"})"});

  /* Test with an empty query and LOAD. TODO: Add separate test for query syntax
  auto resp = Run({"FT.SEARCH", "idx", "", "LOAD", "1", "@field1"});
  EXPECT_THAT(resp, IsMapWithSize()); */

  // Test with LIMIT missing arguments
  auto resp = Run({"FT.SEARCH", "idx", "*", "LIMIT", "0"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  // Test with LIMIT exceeding the maximum allowed value
  resp = Run({"FT.SEARCH", "idx", "*", "LIMIT", "0", "100000000000000000000"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test with LIMIT and negative arguments
  resp = Run({"FT.SEARCH", "idx", "*", "LIMIT", "-1", "10"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test with LIMIT and invalid argument types
  resp = Run({"FT.SEARCH", "idx", "*", "LIMIT", "start", "count"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test with invalid RETURN syntax (missing count)
  resp = Run({"FT.SEARCH", "idx", "*", "RETURN", "@field1", "@field2"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test with RETURN having duplicate fields
  resp = Run({"FT.SEARCH", "idx", "*", "RETURN", "4", "field1", "field1", "field2", "field2"});
  EXPECT_THAT(resp, IsMapWithSize("j1", IsMap("field1", "first", "field2", "second")));

  // Test with RETURN exceeding maximum allowed count
  resp = Run({"FT.SEARCH", "idx", "*", "RETURN", "100000000000000000000", "@field1", "@field2"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test with NOCONTENT and RETURN
  resp = Run({"FT.SEARCH", "idx", "*", "NOCONTENT", "RETURN", "2", "@field1", "@field2"});
  EXPECT_THAT(resp, IsArray(IntArg(1), "j1"));
}

TEST_F(SearchFamilyTest, KnnSearchOptions) {
  auto resp = Run({"FT.CREATE", "my_index", "ON",  "JSON",   "PREFIX",          "1",     "doc:",
                   "SCHEMA",    "$.vector", "AS",  "vector", "VECTOR",          "FLAT",  "6",
                   "TYPE",      "FLOAT32",  "DIM", "4",      "DISTANCE_METRIC", "COSINE"});
  EXPECT_EQ(resp, "OK");

  Run({"JSON.SET", "doc:1", ".", R"({"vector": [0.1, 0.2, 0.3, 0.4]})"});
  Run({"JSON.SET", "doc:2", ".", R"({"vector": [0.5, 0.6, 0.7, 0.8]})"});
  Run({"JSON.SET", "doc:3", ".", R"({"vector": [0.9, 0.1, 0.4, 0.3]})"});

  std::string query_vector("\x00\x00\x00\x3f\x00\x00\x00\x40\x00\x00\x00\x41\x00\x00\x80\x42", 16);

  // KNN 2
  resp = Run({"FT.SEARCH", "my_index", "*=>[KNN 2 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // KNN 11929939
  resp = Run({"FT.SEARCH", "my_index", "*=>[KNN 11929939 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));

  // KNN 11929939, LIMIT 4 2
  resp = Run({"FT.SEARCH", "my_index", "*=>[KNN 11929939 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector, "LIMIT", "4", "2"});
  EXPECT_THAT(resp, IntArg(3));

  // KNN 11929939, LIMIT 0 10
  resp = Run({"FT.SEARCH", "my_index", "*=>[KNN 11929939 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector, "LIMIT", "0", "10"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));

  // KNN 1, LIMIT 0 2
  resp = Run({"FT.SEARCH", "my_index", "*=>[KNN 1 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector, "LIMIT", "0", "2"});
  EXPECT_THAT(resp, AreDocIds("doc:1"));

  // Parenthesized star - used by LangChain for KNN queries (issue #6342)
  resp = Run({"FT.SEARCH", "my_index", "(*)=>[KNN 2 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // Double parenthesized star
  resp = Run({"FT.SEARCH", "my_index", "((*))=>[KNN 2 @vector $query_vector]", "PARAMS", "2",
              "query_vector", query_vector});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));
}

TEST_F(SearchFamilyTest, KnnWithSortBy) {
  Run({"FT.CREATE", "i1",      "ON",     "JSON", "PREFIX",          "1",    "d:",
       "SCHEMA",    "$.v",     "AS",     "v",    "VECTOR",          "FLAT", "6",
       "TYPE",      "FLOAT32", "DIM",    "1",    "DISTANCE_METRIC", "L2",   "$.d",
       "AS",        "d",       "NUMERIC"});

  vector<string> doc_ids(100);
  for (size_t i = 0; i < doc_ids.size(); i++) {
    doc_ids[i] = absl::StrCat("d:", i);
    auto v = absl::StrFormat(R"({"v": [%d.0], "d": %d})", i, i);
    Run({"JSON.SET", doc_ids[i], ".", v});
  }

  // We first select knn_limit closest values and then sort in REVERSE by distance
  // on a non-sortable field. The result should be first cut off by knn_limit and then sorted
  for (size_t knn_limit = 8; knn_limit < 47; knn_limit += 3) {
    vector<string> expect_ids(doc_ids.begin() + knn_limit - min<size_t>(knn_limit, 10u),
                              doc_ids.begin() + knn_limit);
    reverse(expect_ids.begin(), expect_ids.end());

    const float qpoint = 0.0f;
    std::string q = absl::StrFormat("*=>[KNN %d @v $query_vector]", knn_limit);
    auto resp = Run({"ft.search", "i1", q, "SORTBY", "d", "DESC", "PARAMS", "2", "query_vector",
                     FloatSV(&qpoint), "LIMIT", "0", "10", "RETURN", "1", "d"});
    EXPECT_THAT(resp, DocIds(knn_limit, expect_ids)) << knn_limit;
  }
}

TEST_F(SearchFamilyTest, InvalidAggregateOptions) {
  Run({"FT.CREATE", "idx", "ON", "JSON", "SCHEMA", "$.field1", "AS", "field1", "TEXT", "$.field2",
       "AS", "field2", "TEXT"});

  Run({"JSON.SET", "j1", ".", R"({"field1":"first","field2":"second"})"});

  // Test GROUPBY with no arguments
  auto resp = Run({"FT.AGGREGATE", "idx", "*", "GROUPBY"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  // Test GROUPBY with invalid count
  resp = Run({"FT.AGGREGATE", "idx", "*", "GROUPBY", "-1", "@field1"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp =
      Run({"FT.AGGREGATE", "idx", "*", "GROUPBY", "100000000000000000000", "@field1", "@field2"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test REDUCE with no REDUCE function
  resp = Run({"FT.AGGREGATE", "idx", "*", "GROUPBY", "1", "@field1", "REDUCE"});
  EXPECT_THAT(resp, ErrArg("reducer function  not found"));

  /* // Test REDUCE with COUNT function
  resp = Run({"FT.AGGREGATE", "idx", "*", "GROUPBY", "1", "@field1", "REDUCE", "COUNT", "0"});
  EXPECT_THAT(resp, IsMapWithSize("__generated_aliascount", "1", "field1", "first")); */

  // Test REDUCE with invalid function
  resp = Run({"FT.AGGREGATE", "idx", "*", "GROUPBY", "1", "@field1", "REDUCE", "INVALIDFUNC", "0",
              "AS", "result"});
  EXPECT_THAT(resp, ErrArg("reducer function INVALIDFUNC not found"));

  // Test SORTBY with no arguments
  resp = Run({"FT.AGGREGATE", "idx", "*", "SORTBY"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  // Test SORTBY with invalid count
  resp = Run({"FT.AGGREGATE", "idx", "*", "SORTBY", "-1", "@field1"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.AGGREGATE", "idx", "*", "SORTBY", "100000000000000000000", "@field1"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test LIMIT with invalid arguments
  resp = Run({"FT.AGGREGATE", "idx", "*", "LIMIT", "0"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  resp = Run({"FT.AGGREGATE", "idx", "*", "LIMIT", "-1", "10"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.AGGREGATE", "idx", "*", "LIMIT", "0", "100000000000000000000"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // Test LOAD with invalid arguments
  resp = Run({"FT.AGGREGATE", "idx", "*", "LOAD", "@field1", "@field2"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.AGGREGATE", "idx", "*", "LOAD", "-1", "@field1"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.AGGREGATE", "idx", "*", "LOAD", "100000000000000000000", "@field1", "@field2"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));
}

TEST_F(SearchFamilyTest, InvalidCreateOptions) {
  // Test with a duplicate field in the schema
  auto resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "title", "TEXT", "title", "TEXT"});
  EXPECT_THAT(resp, ErrArg("Duplicate field in schema - title"));

  // Test with no fields in the schema
  resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA"});
  EXPECT_THAT(resp, ErrArg("Fields arguments are missing"));

  // Test with an invalid field type
  resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "title", "UNKNOWN_TYPE"});
  EXPECT_THAT(resp, ErrArg("Field type UNKNOWN_TYPE is not supported"));

  // Test with an invalid STOPWORDS argument
  resp = Run({"FT.CREATE", "index", "ON", "HASH", "STOPWORDS", "10", "the", "and", "of", "SCHEMA",
              "title", "TEXT"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  resp = Run({"FT.CREATE", "index", "ON", "HASH", "STOPWORDS", "99999999999999999999", "the", "and",
              "of", "SCHEMA", "title", "TEXT"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.CREATE", "index", "ON", "HASH", "STOPWORDS", "-1", "the", "and", "of", "SCHEMA",
              "title", "TEXT"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"FT.CREATE", "index", "ON", "HASH", "STOPWORDS", "not_a_number", "the", "and", "of",
              "SCHEMA", "title", "TEXT"});
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));
}

TEST_F(SearchFamilyTest, SynonymManagement) {
  // Create index with prefix
  EXPECT_EQ(
      Run({"FT.CREATE", "my_idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "title", "TEXT"}),
      "OK");

  // Add first group of synonyms
  EXPECT_EQ(Run({"FT.SYNUPDATE", "my_idx", "1", "cat", "feline", "kitty"}), "OK");

  // Add second group of synonyms
  EXPECT_EQ(Run({"FT.SYNUPDATE", "my_idx", "2", "kitty", "cute", "adorable"}), "OK");

  // Add third group of synonyms
  EXPECT_EQ(Run({"FT.SYNUPDATE", "my_idx", "3", "kitty", "tiger", "cub"}), "OK");

  // Check the dump output
  auto resp = Run({"FT.SYNDUMP", "my_idx"});
  EXPECT_THAT(resp, IsUnordArray("cub", IsArray("3"), "cute", IsArray("2"), "adorable",
                                 IsArray("2"), "kitty", IsArray("1", "2", "3"), "feline",
                                 IsArray("1"), "tiger", IsArray("3"), "cat", IsArray("1")));
}

TEST_F(SearchFamilyTest, SynonymsSearch) {
  // Create search index
  auto resp =
      Run({"FT.CREATE", "myIndex", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "title", "TEXT"});
  EXPECT_EQ(resp, "OK");

  // Add documents
  EXPECT_THAT(Run({"HSET", "doc:1", "title", "car"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:2", "title", "automobile"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:3", "title", "vehicle"}), IntArg(1));

  // Add synonyms "car" and "automobile" to group 1
  resp = Run({"FT.SYNUPDATE", "myIndex", "1", "car", "automobile"});
  EXPECT_EQ(resp, "OK");

  // Check synonyms list
  resp = Run({"FT.SYNDUMP", "myIndex"});
  ASSERT_THAT(resp, ArrLen(4));

  // Search for "car" (should find both "car" and "automobile")
  resp = Run({"FT.SEARCH", "myIndex", "car"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // Search for "automobile" (should find both "car" and "automobile")
  resp = Run({"FT.SEARCH", "myIndex", "automobile"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // Add "vehicle" to the synonym group
  resp = Run({"FT.SYNUPDATE", "myIndex", "1", "vehicle"});
  EXPECT_EQ(resp, "OK");

  // Search for "vehicle" (should find all three documents)
  resp = Run({"FT.SEARCH", "myIndex", "vehicle"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));
}

// Test for case-insensitive synonyms
TEST_F(SearchFamilyTest, CaseInsensitiveSynonyms) {
  // Create an index
  EXPECT_EQ(Run({"FT.CREATE", "case_idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "title",
                 "TEXT"}),
            "OK");

  // Add documents with different case words
  EXPECT_THAT(Run({"HSET", "doc:1", "title", "The cat is sleeping"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:2", "title", "A feline hunter"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:3", "title", "The dog is barking"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:4", "title", "A Canine friend"}), IntArg(1));

  // Add synonym groups with text IDs
  EXPECT_EQ(Run({"FT.SYNUPDATE", "case_idx", "my_synonyms_group0", "cat", "feline"}), "OK");
  EXPECT_EQ(Run({"FT.SYNUPDATE", "case_idx", "my_synonyms_group1", "dog", "canine"}), "OK");

  // Check synonym output
  auto resp = Run({"FT.SYNDUMP", "case_idx"});
  EXPECT_THAT(resp, ArrLen(8));  // 4 terms, each with a list of groups

  // Synonym search is case-insensitive
  // Search for "cat" should find "cat" and "feline"
  resp = Run({"FT.SEARCH", "case_idx", "cat"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // Search for "feline" should find "feline" and "cat"
  resp = Run({"FT.SEARCH", "case_idx", "feline"});
  EXPECT_THAT(resp, AreDocIds("doc:2", "doc:1"));

  // Search for "dog" should find "dog" and "canine"
  resp = Run({"FT.SEARCH", "case_idx", "dog"});
  EXPECT_THAT(resp, AreDocIds("doc:3", "doc:4"));

  // Search for "canine" should find "canine" and "dog"
  resp = Run({"FT.SEARCH", "case_idx", "canine"});
  EXPECT_THAT(resp, AreDocIds("doc:4", "doc:3"));

  // Search with different case
  // Search for "Cat" (uppercase) should find "cat" and "feline"
  resp = Run({"FT.SEARCH", "case_idx", "Cat"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2"));

  // Search for "FELINE" (uppercase) should find "feline" and "cat"
  resp = Run({"FT.SEARCH", "case_idx", "FELINE"});
  EXPECT_THAT(resp, AreDocIds("doc:2", "doc:1"));

  // Search for "DoG" (mixed case) should find "dog" and "canine"
  resp = Run({"FT.SEARCH", "case_idx", "DoG"});
  EXPECT_THAT(resp, AreDocIds("doc:3", "doc:4"));

  // Search for "cAnInE" (mixed case) should find "canine" and "dog"
  resp = Run({"FT.SEARCH", "case_idx", "cAnInE"});
  EXPECT_THAT(resp, AreDocIds("doc:4", "doc:3"));
}

TEST_F(SearchFamilyTest, SynonymsWithSpaces) {
  EXPECT_EQ(Run({"FT.CREATE", "my_index", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "field",
                 "TEXT"}),
            "OK");

  EXPECT_EQ(Run({"FT.SYNUPDATE", "my_index", "syn_group", "word1", "word2"}), "OK");

  EXPECT_THAT(Run({"HSET", "doc:1", "field", " syn_group"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:2", "field", "syn_group"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:3", "field", "word1"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:4", "field", "word2"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:5", "field", R"(\ syn_group)"}), IntArg(1));

  auto resp = Run({"FT.SEARCH", "my_index", "word1"});
  EXPECT_THAT(resp, AreDocIds("doc:3", "doc:4"));

  resp = Run({"FT.SEARCH", "my_index", "word2"});
  EXPECT_THAT(resp, AreDocIds("doc:4", "doc:3"));

  resp = Run({"FT.SEARCH", "my_index", "syn_group"});
  EXPECT_THAT(resp, AreDocIds("doc:2", "doc:1", "doc:5"));

  // FT.SEARCH my_index "\ syn_group"
  // FT.SEARCH my_index " syn_group"
  // The both transform to " syn_group" after syntax analysis
  // " syn_group" passes to query_str in FtSearch
  resp = Run({"FT.SEARCH", "my_index", " syn_group"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:5"));
}

TEST_F(SearchFamilyTest, SynonymsWithLeadingSpaces) {
  EXPECT_EQ(Run({"FT.CREATE", "my_index", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "title",
                 "TEXT"}),
            "OK");

  EXPECT_EQ(Run({"FT.SYNUPDATE", "my_index", "group1", "word", "    several_spaces_synonym"}),
            "OK");

  auto resp = Run({"FT.SYNDUMP", "my_index"});
  EXPECT_THAT(resp, IsUnordArray("    several_spaces_synonym", IsArray("group1"), "word",
                                 IsArray("group1")));

  EXPECT_THAT(Run({"HSET", "doc:1", "title", "word"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:2", "title", "several_spaces_synonym"}), IntArg(1));

  resp = Run({"FT.SEARCH", "my_index", "word"});
  EXPECT_THAT(resp, AreDocIds("doc:1"));

  resp = Run({"FT.SEARCH", "my_index", "several_spaces_synonym"});
  EXPECT_THAT(resp, AreDocIds("doc:2"));

  EXPECT_THAT(Run({"HSET", "doc:3", "title", "    several_spaces_synonym"}), IntArg(1));

  resp = Run({"FT.SEARCH", "my_index", "word"});
  EXPECT_THAT(resp, AreDocIds("doc:1"));
}

// Test to verify prefix search works correctly with synonyms
TEST_F(SearchFamilyTest, PrefixSearchWithSynonyms) {
  // Create search index
  EXPECT_EQ(Run({"FT.CREATE", "prefix_index", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA",
                 "title", "TEXT"}),
            "OK");

  // Add documents with words that start with the same prefix
  EXPECT_THAT(Run({"HSET", "doc:1", "title", "apple"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:2", "title", "application"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:3", "title", "banana"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:4", "title", "appetizer"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:5", "title", "pineapple"}), IntArg(1));
  EXPECT_THAT(Run({"HSET", "doc:6", "title", "macintosh"}), IntArg(1));

  // Check prefix search before adding synonyms
  auto resp = Run({"FT.SEARCH", "prefix_index", "app*"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:4"));

  // Add synonym: apple <-> macintosh
  EXPECT_EQ(Run({"FT.SYNUPDATE", "prefix_index", "1", "apple", "macintosh"}), "OK");

  // Verify prefix search still works after adding synonyms
  resp = Run({"FT.SEARCH", "prefix_index", "app*"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:4"));

  // Check exact term search for terms that are now synonyms
  resp = Run({"FT.SEARCH", "prefix_index", "apple"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:6"));  // Should find both apple and macintosh

  resp = Run({"FT.SEARCH", "prefix_index", "macintosh"});
  EXPECT_THAT(resp, AreDocIds("doc:6", "doc:1"));  // Should find both macintosh and apple

  // Check that prefix search for mac* only finds macintosh, not apple
  resp = Run({"FT.SEARCH", "prefix_index", "mac*"});
  EXPECT_THAT(resp, AreDocIds("doc:6"));  // Should only find macintosh
}

TEST_F(SearchFamilyTest, SearchSortByOptionNonSortableFieldJson) {
  Run({"JSON.SET", "json1", "$", R"({"text":"2"})"});
  Run({"JSON.SET", "json2", "$", R"({"text":"1"})"});

  auto resp = Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.text", "AS", "text", "TEXT"});
  EXPECT_EQ(resp, "OK");

  auto expect_expr = [](std::string_view text_field) {
    return IsArray(2, "json2", IsMap(text_field, "1", "$", R"({"text":"1"})"), "json1",
                   IsMap(text_field, "2", "$", R"({"text":"2"})"));
  };

  resp = Run({"FT.SEARCH", "index", "*", "SORTBY", "text"});
  EXPECT_THAT(resp, expect_expr("text"sv));
}

TEST_F(SearchFamilyTest, SearchNonNullFields) {
  // Basic schema with text, tag, and numeric fields
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "tags", "tag", "score", "numeric",
                 "sortable"}),
            "OK");

  EXPECT_EQ(Run({"ft.create", "i2", "on", "json", "schema", "$.title", "as", "title", "text",
                 "$.meta.tags", "as", "tags", "tag", "$.meta.score", "as", "score", "numeric"}),
            "OK");

  EXPECT_EQ(Run({"ft.create", "text_idx", "ON", "HASH", "PREFIX", "1", "text:", "SCHEMA", "content",
                 "TEXT"}),
            "OK");

  EXPECT_EQ(Run({"ft.create", "tag_idx", "ON", "HASH", "PREFIX", "1", "tag:", "SCHEMA",
                 "categories", "TAG", "SEPARATOR", ","}),
            "OK");

  EXPECT_EQ(Run({"ft.create", "num_idx", "ON", "HASH", "PREFIX", "1", "num:", "SCHEMA", "price",
                 "NUMERIC", "SORTABLE"}),
            "OK");

  Run({"hset", "d:1", "title", "Document with title and tags", "tags", "tag1,tag2"});
  Run({"hset", "d:2", "title", "Document with title and score", "score", "75"});
  Run({"hset", "d:3", "title", "Document with all fields", "tags", "tag2,tag3", "score", "100"});
  Run({"hset", "d:4", "tags", "Document with only tags", "score", "50"});

  // Testing non-null field searches with @field:* syntax
  EXPECT_THAT(Run({"ft.search", "i1", "@title:*"}), AreDocIds("d:1", "d:2", "d:3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@tags:*"}), AreDocIds("d:1", "d:3", "d:4"));
  EXPECT_THAT(Run({"ft.search", "i1", "@score:*"}), AreDocIds("d:2", "d:3", "d:4"));

  // Testing combinations of non-null field searches
  EXPECT_THAT(Run({"ft.search", "i1", "@title:* @tags:*"}), AreDocIds("d:1", "d:3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@title:* @score:*"}), AreDocIds("d:2", "d:3"));
  EXPECT_THAT(Run({"ft.search", "i1", "@tags:* @score:*"}), AreDocIds("d:3", "d:4"));
  EXPECT_THAT(Run({"ft.search", "i1", "@title:* @tags:* @score:*"}), AreDocIds("d:3"));

  // Testing non-null field searches with sorting
  auto result = Run({"ft.search", "i1", "@score:*", "SORTBY", "score", "DESC"});
  ASSERT_EQ(result.GetVec().size(), 7);
  EXPECT_EQ(result.GetVec()[1].GetString(), "d:3");  // Highest score (100) first
  EXPECT_EQ(result.GetVec()[3].GetString(), "d:2");  // Middle score (75)
  EXPECT_EQ(result.GetVec()[5].GetString(), "d:4");  // Lowest score (50) last

  // Testing non-null field searches with JSON
  Run({"json.set", "j:1", ".",
       R"({"title": "JSON document", "meta": {"tags": ["tag1", "tag2"]}})"});
  Run({"json.set", "j:2", ".", R"({"meta": {"score": 100}})"});
  Run({"json.set", "j:3", ".",
       R"({"title": "Full JSON", "meta": {"tags": ["tag3"], "score": 80}})"});

  EXPECT_THAT(Run({"ft.search", "i2", "@title:*"}), AreDocIds("j:1", "j:3"));
  EXPECT_THAT(Run({"ft.search", "i2", "@tags:*"}), AreDocIds("j:1", "j:3"));
  EXPECT_THAT(Run({"ft.search", "i2", "@score:*"}), AreDocIds("j:2", "j:3"));
  EXPECT_THAT(Run({"ft.search", "i2", "@title:* @tags:* @score:*"}), AreDocIds("j:3"));

  // Testing text indices with star query
  Run({"hset", "text:1", "content", "apple banana"});
  Run({"hset", "text:2", "content", "cherry date"});
  Run({"hset", "text:3", "content", "elephant fig"});

  EXPECT_THAT(Run({"ft.search", "text_idx", "*"}), AreDocIds("text:1", "text:2", "text:3"));

  // Testing tag indices with star query
  Run({"hset", "tag:1", "categories", "fruit,food"});
  Run({"hset", "tag:2", "categories", "drink,beverage"});
  Run({"hset", "tag:3", "categories", "tech,gadget"});

  EXPECT_THAT(Run({"ft.search", "tag_idx", "*"}), AreDocIds("tag:1", "tag:2", "tag:3"));

  // Testing numeric indices with star query
  Run({"hset", "num:1", "price", "10.5"});
  Run({"hset", "num:2", "price", "20.75"});
  Run({"hset", "num:3", "price", "30.99"});

  EXPECT_THAT(Run({"ft.search", "num_idx", "*"}), AreDocIds("num:1", "num:2", "num:3"));

  // Testing vector indices with star query
  string vector1 = R"(\x00\x00\x80\x3f\x00\x00\x00\x00\x00\x00\x00\x00)";  // [1,0,0]
  string vector2 = R"(\x00\x00\x00\x00\x00\x00\x80\x3f\x00\x00\x00\x00)";  // [0,1,0]
  string vector3 = R"(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x3f)";  // [0,0,1]

  Run({"hset", "vec:1", "embedding", vector1});
  Run({"hset", "vec:2", "embedding", vector2});
  Run({"hset", "vec:3", "embedding", vector3});

  // Testing star query with result limit
  auto limit_result = Run({"ft.search", "text_idx", "*", "LIMIT", "0", "2"});

  // No sorting, so results returned are in random order (implementation-dependent).
  EXPECT_THAT(limit_result, RespElementsAre(IntArg(3), _, _, _, _));

  // Testing star query with sorting
  auto price_desc_result = Run({"ft.search", "num_idx", "*", "SORTBY", "price", "DESC"});
  ASSERT_EQ(price_desc_result.GetVec().size(), 7);
  EXPECT_EQ(price_desc_result.GetVec()[1].GetString(), "num:3");  // Most expensive item first
  EXPECT_EQ(price_desc_result.GetVec()[3].GetString(), "num:2");
  EXPECT_EQ(price_desc_result.GetVec()[5].GetString(), "num:1");  // Cheapest item last

  auto price_asc_result = Run({"ft.search", "num_idx", "*", "SORTBY", "price", "ASC"});
  ASSERT_EQ(price_asc_result.GetVec().size(), 7);
  EXPECT_EQ(price_asc_result.GetVec()[1].GetString(), "num:1");  // Cheapest item first
  EXPECT_EQ(price_asc_result.GetVec()[3].GetString(), "num:2");
  EXPECT_EQ(price_asc_result.GetVec()[5].GetString(), "num:3");  // Most expensive item last
}

TEST_F(SearchFamilyTest, SortIndexBasicOperations) {
  // Create an index with a numeric field and a text field, both SORTABLE
  EXPECT_EQ(Run({"ft.create", "sort_idx", "SCHEMA", "num_field", "NUMERIC", "SORTABLE", "str_field",
                 "TEXT", "SORTABLE"}),
            "OK");

  // Add documents with different field values - only with both fields for test simplification
  Run({"hset", "doc:1", "num_field", "10", "str_field", "apple"});
  Run({"hset", "doc:2", "num_field", "20", "str_field", "banana"});
  Run({"hset", "doc:3", "num_field", "5", "str_field", "cherry"});
  Run({"hset", "doc:4", "num_field", "15", "str_field", "date"});

  // Test search with star (* - all documents)
  EXPECT_THAT(Run({"ft.search", "sort_idx", "*"}), AreDocIds("doc:1", "doc:2", "doc:3", "doc:4"));

  // Test search by field presence
  EXPECT_THAT(Run({"ft.search", "sort_idx", "@num_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:3", "doc:4"));
  EXPECT_THAT(Run({"ft.search", "sort_idx", "@str_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:3", "doc:4"));

  // Test sorting by numeric field (ascending)
  auto num_asc_result = Run({"ft.search", "sort_idx", "*", "SORTBY", "num_field", "ASC"});

  // Check the overall order, not specific indices
  ASSERT_GE(num_asc_result.GetVec().size(), 9);  // 4 documents * 2 + 1

  // Collect document IDs in the order they appear in the result
  std::vector<std::string> sorted_ids;
  for (size_t i = 1; i < num_asc_result.GetVec().size(); i += 2) {
    sorted_ids.push_back(num_asc_result.GetVec()[i].GetString());
  }

  // Verify that the numeric field sorting order is correct
  ASSERT_EQ(sorted_ids.size(), 4);
  EXPECT_EQ(sorted_ids[0], "doc:3");  // 5
  EXPECT_EQ(sorted_ids[1], "doc:1");  // 10
  EXPECT_EQ(sorted_ids[2], "doc:4");  // 15
  EXPECT_EQ(sorted_ids[3], "doc:2");  // 20

  // Sorting by text field (descending)
  auto str_desc_result = Run({"ft.search", "sort_idx", "*", "SORTBY", "str_field", "DESC"});

  // Check the overall order of text sorting
  sorted_ids.clear();
  for (size_t i = 1; i < str_desc_result.GetVec().size(); i += 2) {
    sorted_ids.push_back(str_desc_result.GetVec()[i].GetString());
  }

  ASSERT_EQ(sorted_ids.size(), 4);
  EXPECT_EQ(sorted_ids[0], "doc:4");  // date
  EXPECT_EQ(sorted_ids[1], "doc:3");  // cherry
  EXPECT_EQ(sorted_ids[2], "doc:2");  // banana
  EXPECT_EQ(sorted_ids[3], "doc:1");  // apple

  // Update a document
  Run({"hset", "doc:3", "num_field", "30"});  // 5 -> 30

  // Check the updated sorting
  auto updated_result = Run({"ft.search", "sort_idx", "*", "SORTBY", "num_field", "ASC"});
  sorted_ids.clear();
  for (size_t i = 1; i < updated_result.GetVec().size(); i += 2) {
    sorted_ids.push_back(updated_result.GetVec()[i].GetString());
  }

  ASSERT_EQ(sorted_ids.size(), 4);
  EXPECT_EQ(sorted_ids[0], "doc:1");  // 10
  EXPECT_EQ(sorted_ids[1], "doc:4");  // 15
  EXPECT_EQ(sorted_ids[2], "doc:2");  // 20
  EXPECT_EQ(sorted_ids[3], "doc:3");  // 30

  // Test document deletion
  Run({"del", "doc:2"});
  auto after_delete_result = Run({"ft.search", "sort_idx", "*"});
  EXPECT_THAT(after_delete_result, AreDocIds("doc:1", "doc:3", "doc:4"));
}

// Separate test for documents with missing fields during sorting
TEST_F(SearchFamilyTest, SortIndexWithNullFields) {
  EXPECT_EQ(Run({"ft.create", "null_sort_idx", "SCHEMA", "num_field", "NUMERIC", "SORTABLE"}),
            "OK");

  // Documents with and without numeric field
  Run({"hset", "doc:1", "num_field", "10"});
  Run({"hset", "doc:2", "num_field", "20"});
  Run({"hset", "doc:3", "other_field", "value"});  // no numeric field

  // Verify that all documents are indexed
  EXPECT_THAT(Run({"ft.search", "null_sort_idx", "*"}), AreDocIds("doc:1", "doc:2", "doc:3"));

  // Verify that only documents with numeric field are found by @num_field:* query
  EXPECT_THAT(Run({"ft.search", "null_sort_idx", "@num_field:*"}), AreDocIds("doc:1", "doc:2"));

  // When sorting, documents without the field should be at the end (but exact order may vary)
  auto sort_result = Run({"ft.search", "null_sort_idx", "*", "SORTBY", "num_field", "ASC"});

  // Collect results
  std::vector<std::string> sorted_ids;
  for (size_t i = 1; i < sort_result.GetVec().size(); i += 2) {
    sorted_ids.push_back(sort_result.GetVec()[i].GetString());
  }

  // Verify that documents with numeric fields are in the correct order,
  // and the document without a numeric field is either at the end or not included (depends on
  // implementation)
  ASSERT_GE(sorted_ids.size(), 2);

  // Check only documents with known field values
  auto doc1_pos = std::find(sorted_ids.begin(), sorted_ids.end(), "doc:1");
  auto doc2_pos = std::find(sorted_ids.begin(), sorted_ids.end(), "doc:2");

  ASSERT_NE(doc1_pos, sorted_ids.end());
  ASSERT_NE(doc2_pos, sorted_ids.end());

  // doc:1 (10) should be before doc:2 (20) in ascending sort
  EXPECT_LT(std::distance(sorted_ids.begin(), doc1_pos),
            std::distance(sorted_ids.begin(), doc2_pos));
}

TEST_F(SearchFamilyTest, VectorIndexOperations) {
  // Create an index with a vector field
  EXPECT_EQ(Run({"ft.create", "vector_idx", "SCHEMA", "vec", "VECTOR", "FLAT", "6", "TYPE",
                 "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "L2", "name", "TEXT"}),
            "OK");

  // Function to convert float vectors to binary representation
  auto FloatsToBytes = [](const std::vector<float>& floats) -> std::string {
    return std::string(reinterpret_cast<const char*>(floats.data()), floats.size() * sizeof(float));
  };

  // Prepare vector data in binary format
  std::string vec1 = FloatsToBytes({1.0f, 0.0f, 0.0f});
  std::string vec2 = FloatsToBytes({0.0f, 1.0f, 0.0f});
  std::string vec3 = FloatsToBytes({0.0f, 0.0f, 1.0f});
  std::string vec4 = FloatsToBytes({0.5f, 0.5f, 0.0f});
  std::string vec5 = FloatsToBytes({0.3f, 0.3f, 0.3f});

  // Add documents with vector data in binary format
  Run({"hset", "vec:1", "vec", vec1, "name", "vector1"});
  Run({"hset", "vec:2", "vec", vec2, "name", "vector2"});
  Run({"hset", "vec:3", "vec", vec3, "name", "vector3"});
  Run({"hset", "vec:4", "vec", vec4, "name", "vector4"});
  Run({"hset", "vec:5", "vec", vec5, "name", "vector5"});

  // Basic star search
  auto star_search = Run({"ft.search", "vector_idx", "*"});
  EXPECT_THAT(star_search, AreDocIds("vec:1", "vec:2", "vec:3", "vec:4", "vec:5"));

  // Search by vector field presence
  auto vec_field_search = Run({"ft.search", "vector_idx", "@vec:*"});
  EXPECT_THAT(vec_field_search, AreDocIds("vec:1", "vec:2", "vec:3", "vec:4", "vec:5"));
}

// Test to verify that @field:* syntax works with sortable fields
TEST_F(SearchFamilyTest, SortIndexGetAllResults) {
  // Create an index with a numeric field that is SORTABLE but not indexed as a regular field
  EXPECT_EQ(Run({"ft.create", "sort_only_idx", "SCHEMA", "sort_field", "NUMERIC", "SORTABLE"}),
            "OK");

  // Add documents with and without the sortable field
  Run({"hset", "doc:1", "sort_field", "10", "other_field", "value1"});
  Run({"hset", "doc:2", "sort_field", "20", "other_field", "value2"});
  Run({"hset", "doc:3", "sort_field", "30", "other_field", "value3"});
  Run({"hset", "doc:4", "other_field", "value4"});  // no sort_field
  Run({"hset", "doc:5", "other_field", "value5"});  // no sort_field

  // Test that all documents are indexed
  EXPECT_THAT(Run({"ft.search", "sort_only_idx", "*"}),
              AreDocIds("doc:1", "doc:2", "doc:3", "doc:4", "doc:5"));

  // Test that @field:* search works for sortable field
  // This should only return documents that have the sort_field
  EXPECT_THAT(Run({"ft.search", "sort_only_idx", "@sort_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:3"));

  // Test sorting with @field:* query
  auto sort_result =
      Run({"ft.search", "sort_only_idx", "@sort_field:*", "SORTBY", "sort_field", "DESC"});

  // Collect document IDs in order
  std::vector<std::string> sorted_ids;
  for (size_t i = 1; i < sort_result.GetVec().size(); i += 2) {
    sorted_ids.push_back(sort_result.GetVec()[i].GetString());
  }

  // Verify correct order
  ASSERT_EQ(sorted_ids.size(), 3);
  EXPECT_EQ(sorted_ids[0], "doc:3");  // 30
  EXPECT_EQ(sorted_ids[1], "doc:2");  // 20
  EXPECT_EQ(sorted_ids[2], "doc:1");  // 10
}

TEST_F(SearchFamilyTest, JsonWithNullFields) {
  // Create indices for text, tag, and numeric fields (non-sortable)
  EXPECT_EQ(Run({"FT.CREATE", "idx:regular", "ON", "JSON", "SCHEMA", "$.text_field", "AS",
                 "text_field", "TEXT", "$.tag_field", "AS", "tag_field", "TAG", "$.num_field", "AS",
                 "num_field", "NUMERIC"}),
            "OK");

  // Create indices for text, tag, and numeric fields (sortable)
  EXPECT_EQ(Run({"FT.CREATE",    "idx:sortable", "ON",         "JSON",    "SCHEMA",
                 "$.text_field", "AS",           "text_field", "TEXT",    "SORTABLE",
                 "$.tag_field",  "AS",           "tag_field",  "TAG",     "SORTABLE",
                 "$.num_field",  "AS",           "num_field",  "NUMERIC", "SORTABLE"}),
            "OK");

  // Create JSON documents with null values in different field types
  Run({"JSON.SET", "doc:1", ".",
       R"({"text_field": "sample text", "tag_field": "tag1,tag2", "num_field": 100})"});
  Run({"JSON.SET", "doc:2", ".", R"({"text_field": null, "tag_field": "tag3", "num_field": 200})"});
  Run({"JSON.SET", "doc:3", ".",
       R"({"text_field": "another text", "tag_field": null, "num_field": 300})"});
  Run({"JSON.SET", "doc:4", ".",
       R"({"text_field": "more text", "tag_field": "tag4,tag5", "num_field": null})"});
  Run({"JSON.SET", "doc:5", ".", R"({"text_field": null, "tag_field": null, "num_field": null})"});
  Run({"JSON.SET", "doc:6", ".", R"({"other_field": "not indexed field"})"});

  // Test @field:* searches on non-sortable index
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@text_field:*"}),
              AreDocIds("doc:1", "doc:3", "doc:4"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@tag_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:4"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@num_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:3"));

  // Test @field:* searches on sortable index
  EXPECT_THAT(Run({"FT.SEARCH", "idx:sortable", "@text_field:*"}),
              AreDocIds("doc:1", "doc:3", "doc:4"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:sortable", "@tag_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:4"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:sortable", "@num_field:*"}),
              AreDocIds("doc:1", "doc:2", "doc:3"));

  // Test search for documents with non-null values for all fields
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@text_field:* @tag_field:* @num_field:*"}),
              AreDocIds("doc:1"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:sortable", "@text_field:* @tag_field:* @num_field:*"}),
              AreDocIds("doc:1"));

  // Test combined queries
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@text_field:* @tag_field:*"}),
              AreDocIds("doc:1", "doc:4"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@text_field:* @num_field:*"}),
              AreDocIds("doc:1", "doc:3"));
  EXPECT_THAT(Run({"FT.SEARCH", "idx:regular", "@tag_field:* @num_field:*"}),
              AreDocIds("doc:1", "doc:2"));
}

TEST_F(SearchFamilyTest, TestHsetDeleteDocumentHnswSchemaCrash) {
  EXPECT_EQ(Run({"FT.CREATE", "idx", "SCHEMA", "n", "NUMERIC", "v", "VECTOR", "HNSW", "8", "TYPE",
                 "FLOAT16", "DIM", "4", "DISTANCE_METRIC", "L2", "M", "65536"}),
            "OK");

  auto res = Run({"HSET", "doc", "n", "0"});
  EXPECT_EQ(res, 1);

  res = Run({"DEL", "doc"});
  EXPECT_EQ(res, 1);
}

TEST_F(SearchFamilyTest, RenameDocumentBetweenIndices) {
  absl::FlagSaver fs;

  SetTestFlag("cluster_mode", "emulated");
  ResetService();

  EXPECT_EQ(Run({"ft.create", "idx1", "prefix", "1", "idx1", "filter", "@index==\"yes\"", "schema",
                 "t", "text"}),
            "OK");
  EXPECT_EQ(Run({"ft.create", "idx2", "prefix", "1", "idx2", "filter", "@index==\"yes\"", "schema",
                 "t", "text"}),
            "OK");

  Run({"hset", "idx1:{doc}1", "t", "foo1", "index", "yes"});

  EXPECT_EQ(Run({"rename", "idx1:{doc}1", "idx2:{doc}1"}), "OK");
  EXPECT_EQ(Run({"rename", "idx2:{doc}1", "idx1:{doc}1"}), "OK");
}

TEST_F(SearchFamilyTest, JsonSetIndexesBug) {
  auto resp = Run(
      {"FT.CREATE", "index", "ON", "json", "SCHEMA", "$.text", "AS", "text", "TEXT", "SORTABLE"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "j1", "$", R"({"text":"some text"})"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "j1", "$", R"({"asd}"})"});
  EXPECT_THAT(resp, ErrArg("ERR failed to parse JSON"));

  resp = Run({"FT.AGGREGATE", "index", "*", "GROUPBY", "1", "@text"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("text", "some text")));
}

TEST_F(SearchFamilyTest, SearchReindexWriteSearchRace) {
  const std::string kIndexName = "myRaceIdx";
  const int kWriterOps = 200;
  const int kSearcherOps = 200;
  const int kReindexerOps = 200;

  auto writer_fiber = pp_->at(0)->LaunchFiber([&] {
    for (int i = 1; i <= kWriterOps; ++i) {
      std::string doc_key = absl::StrCat("doc:", i);
      std::string content = absl::StrCat("text data item ", i, " for race condition test");
      std::string tags_val = absl::StrCat("tagA,tagB,", (i % 10));
      std::string numeric_field_val = std::to_string(i);
      Run({"hset", doc_key, "content", content, "tags", tags_val, "numeric_field",
           numeric_field_val});
    }
  });

  auto searcher_fiber = pp_->at(1)->LaunchFiber([&] {
    for (int i = 1; i <= kSearcherOps; ++i) {
      int random_val_content = 1 + (i % kWriterOps);
      std::string query_content = absl::StrCat("@content:item", random_val_content);
      Run({"ft.search", kIndexName, query_content});
    }
  });

  auto reindexer_fiber = pp_->at(2)->LaunchFiber([&] {
    for (int i = 1; i <= kReindexerOps; ++i) {
      Run({"ft.create", kIndexName, "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "content",
           "TEXT", "SORTABLE", "tags", "TAG", "SORTABLE", "numeric_field", "NUMERIC", "SORTABLE"});
      Run({"ft.dropindex", kIndexName});
    }
  });

  // Join fibers
  writer_fiber.Join();
  searcher_fiber.Join();
  reindexer_fiber.Join();

  ASSERT_FALSE(service_->IsShardSetLocked());
}

TEST_F(SearchFamilyTest, IgnoredOptionsInFtCreate) {
  GTEST_SKIP() << "The usage of ignored options is now wrong - it skips supported ones!";

  // Create an index with various options, some of which should be ignored
  // INDEXMISSING and INDEXEMPTY are supported by default
  auto resp = Run({"FT.CREATE",
                   "idx",
                   "ON",
                   "HASH",
                   "SCHEMA",
                   "title",
                   "TEXT",
                   "UNF",
                   "NOSTEM",
                   "CASESENSITIVE",
                   "WITHSUFFIXTRIE",
                   "INDEXMISSING",
                   "INDEXEMPTY",
                   "WEIGHT",
                   "1",
                   "SEPARATOR",
                   "|",
                   "PHONETIC",
                   "dm:en",
                   "SORTABLE"});

  // Check that the response is OK, indicating the index was created successfully
  EXPECT_THAT(resp, "OK");

  Run({"HSET", "doc:1", "title", "Test Document"});

  // Verify that the index was created correctly
  resp = Run({"FT.SEARCH", "idx", "*"});
  EXPECT_THAT(resp, AreDocIds("doc:1"));
}

TEST_F(SearchFamilyTest, JsonDelIndexesBug) {
  auto resp = Run(
      {"FT.CREATE", "index", "ON", "json", "SCHEMA", "$.text", "AS", "text", "TEXT", "SORTABLE"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.SET", "j1", "$", R"({"text":"some text"})"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"JSON.DEL", "j1", "$.text"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"FT.AGGREGATE", "index", "*", "GROUPBY", "1", "@text"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("text", ArgType(RespExpr::NIL))));
}

TEST_F(SearchFamilyTest, SearchStatsInfoRace) {
  auto index_ops_fiber = pp_->at(0)->LaunchFiber([&] {
    for (int i = 1; i <= 5; ++i) {
      std::string idx_name = absl::StrCat("idx", i);
      std::string prefix = absl::StrCat("prefix", i, ":");
      Run({"FT.CREATE", idx_name, "ON", "HASH", "PREFIX", "1", prefix});
      Run({"FT.DROPINDEX", idx_name});
    }
  });

  auto info_ops_fiber = pp_->at(1)->LaunchFiber([&] {
    for (int i = 1; i <= 10; ++i) {
      Run({"INFO"});
    }
  });

  index_ops_fiber.Join();
  info_ops_fiber.Join();

  ASSERT_FALSE(service_->IsShardSetLocked());
}

TEST_F(SearchFamilyTest, EmptyKeyBug) {
  auto resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "field", "TEXT"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"HSET", "", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, AreDocIds(""));
}

TEST_F(SearchFamilyTest, SetDoesNotUpdateIndexesBug) {
  auto resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "field", "TEXT"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"HSET", "k1", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  // Here we are changing the type of k1 from HASH to STRING.
  // This should affect the index, the hset value should not be indexed anymore.
  resp = Run({"SET", "k1", "anothervalue"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"RENAME", "k1", "anotherkey"});
  EXPECT_EQ(resp, "OK");

  /* Here we should see that the value is indexed again.
     We have checks in indexes that prove that the key was not present in the index.
     The bug was, that this check was failing for this operation because it was not removed from the
     index during the SET operation */
  resp = Run({"HSET", "k1", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, AreDocIds("k1"));
}

TEST_F(SearchFamilyTest, SortStoreDoesNotUpdateIndexesBug) {
  // Create an index over HASH
  auto resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "field", "TEXT"});
  EXPECT_THAT(resp, "OK");

  // Index a HASH document under k1
  resp = Run({"HSET", "k1", "field", "value"});
  EXPECT_THAT(resp, IntArg(1));

  // Prepare a source list to sort and store into k1 (overwriting k1 to LIST)
  EXPECT_THAT(Run({"RPUSH", "lst", "b", "a"}), IntArg(2));
  // SORT lst STORE k1 -> changes type of k1 from HASH to LIST
  Run({"SORT", "lst", "ALPHA", "STORE", "k1"});

  // Rename away and recreate k1 as HASH again
  EXPECT_EQ(Run({"RENAME", "k1", "anotherkey"}), "OK");
  EXPECT_THAT(Run({"HSET", "k1", "field", "value"}), IntArg(1));

  // If SORT/STORE failed to remove k1 from indexes, the re-index here should crash.
  // Successful run should contain only the new k1 document in the index.
  resp = Run({"FT.SEARCH", "index", "*"});
  EXPECT_THAT(resp, AreDocIds("k1"));
}

TEST_F(SearchFamilyTest, BlockSizeOptionFtCreate) {
  // Create an index with a block size option
  auto resp = Run({"FT.CREATE", "index", "ON", "HASH", "SCHEMA", "number1", "NUMERIC", "BLOCKSIZE",
                   "2", "number2", "NUMERIC", "BLOCKSIZE", "1024"});
  EXPECT_THAT(resp, "OK");

  // Verify that the index was created successfully
  resp = Run({"FT.INFO", "index"});
  EXPECT_THAT(resp, IsArray(_, _, _, _, _, _, "attributes",
                            IsUnordArray(IsArray("identifier", "number1", "attribute", "number1",
                                                 "type", "NUMERIC", "blocksize", "2"),
                                         IsArray("identifier", "number2", "attribute", "number2",
                                                 "type", "NUMERIC", "blocksize", "1024")),
                            "num_docs", IntArg(0), _, _, _, _));

  // Add a document to the index
  for (int i = 1; i <= 5; ++i) {
    Run({"HSET", absl::StrCat("doc:", i), "number1", std::to_string(i), "number2",
         std::to_string(i * 10)});
  }

  // Search the index
  resp = Run({"FT.SEARCH", "index", "@number1:[1 3] @number2:[10 30]", "SORTBY", "number1", "ASC"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromJoinSimple) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "num2", "NUMERIC"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "num4", "NUMERIC"});

  Run({"hset", "k1", "num1", "0", "num2", "1"});
  Run({"hset", "k2", "num1", "1", "num2", "2"});

  Run({"hset", "k3", "num3", "0", "num4", "3"});
  Run({"hset", "k4", "num3", "1", "num4", "4"});

  auto resp = Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.num2", "idx2.num3",
                   "idx2.num4", "LOAD_FROM", "idx2", "1", "idx2.num3=idx1.num1"});

  EXPECT_THAT(resp,
              IsUnordArrayWithSize(
                  IsMap("idx1.num1", "1", "idx1.num2", "2", "idx2.num3", "1", "idx2.num4", "4"),
                  IsMap("idx1.num1", "0", "idx1.num2", "1", "idx2.num3", "0", "idx2.num4", "3")));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromJoinMultipleJoins) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TAG"});
  Run({"ft.create", "idx3", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "str3", "TAG"});
  Run({"ft.create", "idx4", "ON", "HASH", "SCHEMA", "num4", "NUMERIC", "str4", "TEXT"});

  Run({"hset", "k1", "num1", "0", "str1", "value1"});
  Run({"hset", "k2", "num1", "1", "str1", "value2"});

  Run({"hset", "k3", "num2", "0", "str2", "value3"});
  Run({"hset", "k4", "num2", "1", "str2", "value4"});

  Run({"hset", "k5", "num3", "2", "str3", "value1"});
  Run({"hset", "k6", "num3", "3", "str3", "value2"});

  Run({"hset", "k7", "num4", "2", "str4", "value3"});
  Run({"hset", "k8", "num4", "3", "str4", "value4"});

  auto resp = Run({"ft.aggregate",
                   "idx1",
                   "*",
                   "LOAD",
                   "8",
                   "idx1.num1",
                   "idx1.str1",
                   "idx2.num2",
                   "idx2.str2",
                   "idx3.num3",
                   "idx3.str3",
                   "idx4.num4",
                   "idx4.str4",
                   "LOAD_FROM",
                   "idx2",
                   "1",
                   "idx2.num2=idx1.num1",
                   "LOAD_FROM",
                   "idx3",
                   "1",
                   "idx3.str3=idx1.str1",
                   "LOAD_FROM",
                   "idx4",
                   "1",
                   "idx4.str4=idx2.str2"});

  EXPECT_THAT(
      resp,
      IsUnordArrayWithSize(
          IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx2.num2", "1", "idx2.str2", "value4",
                "idx3.num3", "3", "idx3.str3", "value2", "idx4.num4", "3", "idx4.str4", "value4"),
          IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx2.num2", "0", "idx2.str2", "value3",
                "idx3.num3", "2", "idx3.str3", "value1", "idx4.num4", "2", "idx4.str4", "value3")));

  // Simple requests
  resp = Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "idx2.num2",
              "idx2.str2", "LOAD_FROM", "idx2", "1", "idx2.num2=idx1.num1"});
  EXPECT_THAT(
      resp,
      IsUnordArrayWithSize(
          IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx2.num2", "1", "idx2.str2", "value4"),
          IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx2.num2", "0", "idx2.str2", "value3")));

  resp = Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "idx3.num3",
              "idx3.str3", "LOAD_FROM", "idx3", "1", "idx3.str3=idx1.str1"});
  EXPECT_THAT(
      resp,
      IsUnordArrayWithSize(
          IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx3.num3", "3", "idx3.str3", "value2"),
          IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx3.num3", "2", "idx3.str3", "value1")));

  resp = Run({"ft.aggregate", "idx2", "*", "LOAD", "4", "idx2.num2", "idx2.str2", "idx4.num4",
              "idx4.str4", "LOAD_FROM", "idx4", "1", "idx4.str4=idx2.str2"});
  EXPECT_THAT(
      resp,
      IsUnordArrayWithSize(
          IsMap("idx2.num2", "1", "idx2.str2", "value4", "idx4.num4", "3", "idx4.str4", "value4"),
          IsMap("idx2.num2", "0", "idx2.str2", "value3", "idx4.num4", "2", "idx4.str4", "value3")));

  resp = Run({"ft.aggregate", "idx3", "*", "LOAD", "4", "idx3.num3", "idx3.str3", "idx4.num4",
              "idx4.str4", "LOAD_FROM", "idx4", "1", "idx3.num3=idx4.num4"});
  EXPECT_THAT(
      resp,
      IsUnordArrayWithSize(
          IsMap("idx3.num3", "3", "idx3.str3", "value2", "idx4.num4", "3", "idx4.str4", "value4"),
          IsMap("idx3.num3", "2", "idx3.str3", "value1", "idx4.num4", "2", "idx4.str4", "value3")));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromMultipleFields) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT", "num2",
       "NUMERIC"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TAG", "num3",
       "NUMERIC"});
  Run({"ft.create", "idx3", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "str3", "TEXT", "num4",
       "NUMERIC"});

  Run({"hset", "k1", "num1", "0", "str1", "value1", "num2", "5"});
  Run({"hset", "k2", "num1", "1", "str1", "value2", "num2", "10"});

  Run({"hset", "k3", "num2", "1", "str2", "value3", "num3", "10"});
  Run({"hset", "k4", "num2", "0", "str2", "value4", "num3", "5"});

  Run({"hset", "k5", "num3", "2", "str3", "value4", "num4", "5"});
  Run({"hset", "k6", "num3", "3", "str3", "value3", "num4", "10"});

  auto resp = Run({"ft.aggregate",
                   "idx1",
                   "*",
                   "LOAD",
                   "9",
                   "idx1.num1",
                   "idx1.str1",
                   "idx1.num2",
                   "idx2.num2",
                   "idx2.str2",
                   "idx2.num3",
                   "idx3.num3",
                   "idx3.str3",
                   "idx3.num4",
                   "LOAD_FROM",
                   "idx2",
                   "2",
                   "idx1.num1=idx2.num2",
                   "idx1.num2=idx2.num3",
                   "LOAD_FROM",
                   "idx3",
                   "3",
                   "idx1.num2=idx3.num4",
                   "idx2.num3=idx3.num4",
                   "idx2.str2=idx3.str3"});

  EXPECT_THAT(
      resp, IsUnordArrayWithSize(IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx1.num2", "10",
                                       "idx2.num2", "1", "idx2.str2", "value3", "idx2.num3", "10",
                                       "idx3.num3", "3", "idx3.str3", "value3", "idx3.num4", "10"),
                                 IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx1.num2", "5",
                                       "idx2.num2", "0", "idx2.str2", "value4", "idx2.num3", "5",
                                       "idx3.num3", "2", "idx3.str3", "value4", "idx3.num4", "5")));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromSeveralCopiesOfSameKey) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT", "num2",
       "NUMERIC"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TAG", "num3",
       "NUMERIC"});
  Run({"ft.create", "idx3", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "str3", "TEXT", "num4",
       "NUMERIC"});

  Run({"hset", "k1", "num1", "0", "str1", "value1", "num2", "5"});
  Run({"hset", "k2", "num1", "1", "str1", "value2", "num2", "10"});

  Run({"hset", "k3", "num2", "1", "str2", "value3", "num3", "10"});
  Run({"hset", "k4", "num2", "0", "str2", "value4", "num3", "5"});

  Run({"hset", "k5", "num3", "2", "str3", "value1", "num4", "15"});
  Run({"hset", "k6", "num3", "3", "str3", "value1", "num4", "20"});
  Run({"hset", "k7", "num3", "4", "str3", "value2", "num4", "25"});
  Run({"hset", "k8", "num3", "5", "str3", "value2", "num4", "30"});

  auto resp = Run({"ft.aggregate",
                   "idx1",
                   "*",
                   "LOAD",
                   "9",
                   "idx1.num1",
                   "idx1.str1",
                   "idx1.num2",
                   "idx2.num2",
                   "idx2.str2",
                   "idx2.num3",
                   "idx3.num3",
                   "idx3.str3",
                   "idx3.num4",
                   "LOAD_FROM",
                   "idx2",
                   "2",
                   "idx1.num1=idx2.num2",
                   "idx1.num2=idx2.num3",
                   "LOAD_FROM",
                   "idx3",
                   "1",  // Multiple copies of the same key
                   "idx1.str1=idx3.str3"});

  EXPECT_THAT(resp, IsUnordArrayWithSize(
                        IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx1.num2", "5",
                              "idx2.num2", "0", "idx2.str2", "value4", "idx2.num3", "5",
                              "idx3.num3", "2", "idx3.str3", "value1", "idx3.num4", "15"),
                        IsMap("idx1.num1", "0", "idx1.str1", "value1", "idx1.num2", "5",
                              "idx2.num2", "0", "idx2.str2", "value4", "idx2.num3", "5",
                              "idx3.num3", "3", "idx3.str3", "value1", "idx3.num4", "20"),
                        IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx1.num2", "10",
                              "idx2.num2", "1", "idx2.str2", "value3", "idx2.num3", "10",
                              "idx3.num3", "4", "idx3.str3", "value2", "idx3.num4", "25"),
                        IsMap("idx1.num1", "1", "idx1.str1", "value2", "idx1.num2", "10",
                              "idx2.num2", "1", "idx2.str2", "value3", "idx2.num3", "10",
                              "idx3.num3", "5", "idx3.str3", "value2", "idx3.num4", "30")));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromNoMatches) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TEXT"});

  Run({"hset", "k1", "num1", "0", "str1", "value1"});
  Run({"hset", "k2", "num1", "1", "str1", "value2"});

  Run({"hset", "k3", "num2", "0", "str2", "value3"});
  Run({"hset", "k4", "num2", "1", "str2", "value4"});

  auto resp =
      Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "idx2.num2",
           "idx2.str2", "LOAD_FROM", "idx2", "2", "idx2.num2=idx1.num1", "idx2.str2=idx1.str1"});

  EXPECT_THAT(resp, IntArg(0));  // No matches, so result should be empty
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromQueries) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TAG"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TEXT"});

  // Another case
  Run({"ft.create", "idx3", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "str3", "TAG"});
  Run({"ft.create", "idx4", "ON", "HASH", "SCHEMA", "num4", "NUMERIC", "str4", "TAG"});

  std::vector<::testing::Matcher<RespExpr>> matchers;
  for (int i = 0; i < 100; ++i) {
    // For even i str1 and str2 should match, for odd i they should not
    std::string str1 = absl::StrCat("tag", i);
    std::string str2 = i % 2 == 0 ? str1 : absl::StrCat("text", i);
    Run({"hset", absl::StrCat("k1:", i), "num1", std::to_string(i), "str1", str1});
    Run({"hset", absl::StrCat("k2:", i), "num2", std::to_string(i), "str2", str2});

    if (i % 2 == 0 && i >= 35 && i <= 57) {
      matchers.emplace_back(IsMap("idx1.num1", std::to_string(i), "idx1.str1", str1, "idx2.num2",
                                  std::to_string(i), "idx2.str2", str2));
    }
  }
  matchers.insert(matchers.begin(), IntArg(matchers.size()));

  auto resp = Run({"ft.aggregate", "idx1", "@num1:[35 57]", "LOAD", "4", "idx1.num1", "idx1.str1",
                   "idx2.num2", "idx2.str2", "LOAD_FROM", "idx2", "1", "idx2.str2=idx1.str1",
                   "QUERY", "@num2:[35 57]"});

  EXPECT_THAT(resp.GetVec(), UnorderedElementsAreArray(matchers));

  size_t num3 = 1;
  size_t num4 = 5;

  std::vector<std::string> tag_values = {"tag1", "tag2", "tag3", "tag4"};
  matchers.clear();
  for (size_t i = 0; i < 100; ++i) {
    std::string str = tag_values[i % tag_values.size()];
    const size_t num3_actual = i * 100 + num3;
    const size_t num4_actual = i * 100 + num4;

    Run({"hset", absl::StrCat("k3:", i), "num3", std::to_string(num3_actual), "str3", str});
    Run({"hset", absl::StrCat("k4:", i), "num4", std::to_string(num4_actual), "str4", str});

    if ((str == "tag1" || str == "tag4") && num3 == num4) {
      matchers.emplace_back(IsMap("idx3.num3", std::to_string(num3_actual), "idx3.str3", str,
                                  "idx4.num4", std::to_string(num4_actual), "idx4.str4", str));
    }

    num3 = (num3 + 3) % 12;
    num4 = (num4 + 7) % 12;
  }
  DCHECK(!matchers.empty());
  matchers.insert(matchers.begin(), IntArg(matchers.size()));

  resp = Run({"ft.aggregate", "idx3", "@str3:{tag1|tag4}", "LOAD", "4", "idx3.num3", "idx3.str3",
              "idx4.num4", "idx4.str4", "LOAD_FROM", "idx4", "1", "idx4.num4=idx3.num3", "QUERY",
              "@str4:{tag1|tag4}"});
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAreArray(matchers));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromSyntaxErrors) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TEXT"});
  Run({"ft.create", "idx3", "ON", "HASH", "SCHEMA", "num3", "NUMERIC", "str3", "TEXT"});

  Run({"hset", "k1", "num1", "0", "str1", "str"});
  Run({"hset", "k2", "num2", "0", "str2", "str"});
  Run({"hset", "k3", "num3", "0", "str3", "str"});

  // Test when index does not exist
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM",
                   "idx4", "1", "idx4.num2=idx1.num1"}),
              IntArg(0));

  // Test when index exists but no LOAD_FROM is specified
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM",
                   "idx3", "1", "idx3.num3=idx2.num2"}),
              ErrArg("bad arguments for LOAD_FROM: unknown index 'idx2'"));

  // Test when index exists but was specified after it was used
  EXPECT_THAT(
      Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM", "idx2",
           "1", "idx2.num2=idx3.num3", "LOAD_FROM", "idx3", "1", "idx3.str3=idx1.str1"}),
      ErrArg("bad arguments for LOAD_FROM: unknown index 'idx3'"));

  // Test when LOAD_FROM is not using fields of current index
  EXPECT_THAT(
      Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM", "idx2",
           "1", "idx2.str2=idx1.str1", "LOAD_FROM", "idx3", "1", "idx2.str2=idx1.str1"}),
      ErrArg("bad arguments for LOAD_FROM: one of the field must be from the current index 'idx3'. "
             "Got 'idx2.str2' and 'idx1.str1'"));

  // Test when field of index does not exist
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM",
                   "idx2", "1", "idx2.num2=idx1.nonexistent_field"}),
              IntArg(0));
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM",
                   "idx2", "1", "idx2.nonexistent_field=idx1.num1"}),
              IntArg(0));

  // Test when field in QUERY does not exist in index
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.str1", "LOAD_FROM",
                   "idx2", "1", "idx2.num2=idx1.num1", "QUERY", "@nonexistent_tag:{tag1|tag2}"}),
              IntArg(0));

  // Test when field in LOAD does not exist in index
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "2", "idx1.num1", "idx1.non_existent_field",
                   "LOAD_FROM", "idx2", "1", "idx2.num2=idx1.num1"}),
              IsUnordArrayWithSize(
                  IsMap("idx1.num1", "0", "idx1.non_existent_field", ArgType(RespExpr::NIL))));

  // Test index aliases
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "alias.num2",
                   "alias.str2", "LOAD_FROM", "idx2", "AS", "alias", "1", "alias.num2=idx1.num1"}),
              IsUnordArrayWithSize(IsMap("idx1.num1", "0", "idx1.str1", "str", "alias.num2", "0",
                                         "alias.str2", "str")));
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "idx2.num2",
                   "idx2.str2", "LOAD_FROM", "idx2", "AS", "alias", "1", "alias.num2=idx1.num1"}),
              ErrArg("Unknown index alias 'idx2' in the LOAD option. Field: 'num2'"));

  // Test same index used multiple times
  EXPECT_THAT(Run({"ft.aggregate", "idx1", "*", "LOAD", "4", "idx1.num1", "idx1.str1", "idx2.num2",
                   "idx2.str2", "LOAD_FROM", "idx2", "1", "idx2.num2=idx1.num1", "LOAD_FROM",
                   "idx2", "1", "idx2.str2=idx1.str1"}),
              ErrArg("Duplicate index alias in LOAD_FROM: 'idx2'"));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromSortingAndLimiting) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TEXT"});

  std::vector<::testing::Matcher<RespExpr>> matchers;
  for (int i = 0; i < 100; ++i) {
    const std::string num_value = std::to_string(i);
    const std::string str_value = absl::StrCat("value", i);
    Run({"hset", absl::StrCat("k1:", i), "num1", num_value, "str1", str_value});
    Run({"hset", absl::StrCat("k2:", i), "num2", num_value, "str2", str_value});

    if (i > 79 && i <= 89) {
      // Insert to beginning because we will sort DESCENDING
      matchers.emplace(matchers.begin(), IsMap("idx1.num1", num_value, "idx1.str1", str_value,
                                               "idx2.num2", num_value, "idx2.str2", str_value));
    }
  }
  DCHECK_EQ(matchers.size(), 10u);
  matchers.insert(matchers.begin(), IntArg(10));

  auto resp = Run({"ft.aggregate",
                   "idx1",
                   "*",
                   "LOAD",
                   "4",
                   "idx1.num1",
                   "idx1.str1",
                   "idx2.num2",
                   "idx2.str2",
                   "LOAD_FROM",
                   "idx2",
                   "1",
                   "idx2.num2=idx1.num1",
                   "SORTBY",
                   "2",
                   "@idx1.num1",
                   "DESC",
                   "LIMIT",
                   "10",
                   "10"});

  EXPECT_THAT(resp.GetVec(), ElementsAreArray(matchers));
}

TEST_F(SearchFamilyTest, AggregateWithLoadFromSortBySeveralFields) {
  Run({"ft.create", "idx1", "ON", "HASH", "SCHEMA", "num1", "NUMERIC", "str1", "TEXT", "num3",
       "NUMERIC"});
  Run({"ft.create", "idx2", "ON", "HASH", "SCHEMA", "num2", "NUMERIC", "str2", "TEXT", "num4",
       "NUMERIC"});

  std::vector<std::pair<int, std::string>> expected;
  for (int i = 0; i < 100; ++i) {
    const std::string num_value = std::to_string(i % 10);  // Only 10 distinct values
    const std::string str_value = absl::StrCat("value", i);
    Run({"hset", absl::StrCat("k1:", i), "num1", num_value, "str1", str_value, "num3",
         std::to_string(i)});
    Run({"hset", absl::StrCat("k2:", i), "num2", num_value, "str2", str_value, "num4",
         std::to_string(i)});

    expected.emplace_back(i % 10, str_value);
  }

  // Sort by num1 ASC, str1 DESC
  std::sort(expected.begin(), expected.end(), [](const auto& a, const auto& b) {
    if (a.first != b.first) {
      return a.first < b.first;  // Ascending order for num1
    }
    return a.second > b.second;  // Descending order for str1
  });

  std::vector<::testing::Matcher<RespExpr>> matchers;
  matchers.push_back(IntArg(20));
  for (size_t i = 50; i < 70; ++i) {
    const auto& [num, str] = expected[i];
    matchers.emplace_back(IsMap("idx1.num1", std::to_string(num), "idx1.str1", str, "idx2.num2",
                                std::to_string(num), "idx2.str2", str));
  }

  auto resp = Run({"ft.aggregate",
                   "idx1",
                   "*",
                   "LOAD",
                   "4",
                   "idx1.num1",
                   "idx1.str1",
                   "idx2.num2",
                   "idx2.str2",
                   "LOAD_FROM",
                   "idx2",
                   "1",
                   "idx2.num4=idx1.num3",
                   "SORTBY",
                   "4",
                   "@idx1.num1",
                   "ASC",
                   "@idx1.str1",
                   "DESC",
                   "LIMIT",
                   "50",
                   "20"});

  EXPECT_THAT(resp.GetVec(), ElementsAreArray(matchers));
}

TEST_F(SearchFamilyTest, NumericFilter) {
  // Index name, age, height
  Run({"FT.CREATE", "i1", "ON", "HASH", "SCHEMA", "name", "TEXT", "age", "NUMERIC", "height",
       "NUMERIC"});

  // Index name, age
  Run({"FT.CREATE", "i2", "ON", "HASH", "SCHEMA", "name", "TEXT", "age", "NUMERIC"});

  Run({"HSET", "id:1", "name", "John", "age", "28", "height", "184"});
  Run({"HSET", "id:2", "name", "Ivan", "age", "30", "height", "180"});
  Run({"HSET", "id:3", "name", "Jon", "age", "25", "height", "182"});
  Run({"HSET", "id:4", "name", "Juan", "age", "32", "height", "186"});
  Run({"HSET", "id:5", "name", "Ioan", "age", "35", "height", "181"});

  // Filter with non-star query
  auto res = Run({"FT.SEARCH", "i1", "I*", "FILTER", "age", "31", "40"});
  EXPECT_THAT(res, AreDocIds("id:5"));

  // Filter on ONE NUMERIC index
  res = Run({"FT.SEARCH", "i1", "*", "FILTER", "age", "25", "28"});
  EXPECT_THAT(res, AreDocIds("id:1", "id:3"));

  // Filter on TWO NUMERIC indexes
  res =
      Run({"FT.SEARCH", "i1", "*", "FILTER", "age", "25", "28", "FILTER", "height", "180", "182"});
  EXPECT_THAT(res, AreDocIds("id:3"));

  // Filter on TWO NUMERIC indexes where second filtering produce empty result
  res =
      Run({"FT.SEARCH", "i1", "*", "FILTER", "age", "25", "28", "FILTER", "height", "200", "300"});
  EXPECT_THAT(res, AreDocIds());

  // Filter on index which doesn't exists
  res = Run({"FT.SEARCH", "i2", "*", "FILTER", "height", "180", "190"});
  EXPECT_THAT(res, ErrArg("Invalid field: height"));

  // Two filters on same field
  res = Run({"FT.SEARCH", "i1", "J*", "FILTER", "age", "25", "30", "FILTER", "age", "28", "32"});
  EXPECT_THAT(res, AreDocIds("id:1"));

  Run({"FLUSHALL"});
}

TEST_F(SearchFamilyTest, MAXSEARCHRESULTS) {
  EXPECT_EQ(Run({"HSET", "s1", "phrase", "hello world"}), 1);
  EXPECT_EQ(Run({"HSET", "s2", "phrase", "hello simple world"}), 1);
  EXPECT_EQ(Run({"HSET", "s3", "phrase", "hello somewhat less simple world"}), 1);
  EXPECT_EQ(Run({"FT.CREATE", "memes", "SCHEMA", "phrase", "TEXT"}), "OK");

  auto resp = Run({"FT.CONFIG", "GET", "MAXSEARCHRESULTS"});
  EXPECT_THAT(resp, IsArray("MAXSEARCHRESULTS", "1000000"));

  resp = Run({"FT.SEARCH", "memes", "@phrase:(hello world)", "NOCONTENT"});
  EXPECT_THAT(resp, RespElementsAre(IntArg(3), _, _, _));

  resp = Run({"FT.CONFIG", "SET", "MAXSEARCHRESULTS", "1"});
  EXPECT_EQ(resp, "OK");

  resp = Run({"FT.SEARCH", "memes", "@phrase:(hello world)", "NOCONTENT"});
  EXPECT_THAT(resp, RespElementsAre(IntArg(3), _));

  resp = Run({"FT.SEARCH", "memes", "@phrase:(hello world)", "NOCONTENT", "LIMIT", "0", "1"});
  EXPECT_THAT(resp, RespElementsAre(IntArg(3), _));

  resp = Run({"FT.SEARCH", "memes", "@phrase:(hello world)", "NOCONTENT", "LIMIT", "0", "3"});
  EXPECT_THAT(resp, ErrArg("LIMIT exceeds maximum of 1"));

  resp = Run({"FT.CONFIG", "GET", "MAXSEARCHRESULTS"});
  EXPECT_THAT(resp, IsArray("MAXSEARCHRESULTS", "1"));

  resp = Run({"FT.CONFIG", "HELP", "MAXSEARCHRESULTS"});
  EXPECT_THAT(resp, IsArray("MAXSEARCHRESULTS", "Description",
                            "Maximum number of results from ft.search command", "Value", "1"));

  resp = Run({"FT.CONFIG", "GET", "*"});
  // Should contain MAXSEARCHRESULTS among other search config parameters
  EXPECT_THAT(resp, RespArray(Contains("MAXSEARCHRESULTS")));
  EXPECT_THAT(resp, RespArray(Contains("1")));

  resp = Run({"FT.CONFIG", "HELP", "*"});
  // Should contain MAXSEARCHRESULTS description among other search configs
  EXPECT_THAT(resp.GetVec(),
              Contains(IsArray("MAXSEARCHRESULTS", "Description",
                               "Maximum number of results from ft.search command", "Value", "1")));

  // restore normal value for other tests
  Run({"FT.CONFIG", "SET", "MAXSEARCHRESULTS", "1000000"});
}

TEST_F(SearchFamilyTest, InvalidConfigOptions) {
  // Test with an invalid argument
  auto resp = Run({"FT.CONFIG", "INVALIDARG", "INVLIDARG"});
  EXPECT_THAT(resp, ErrArg("Unknown subcommand"));

  // Test with an invalid argument
  resp = Run({"FT.CONFIG", "GET", "INVALIDARG"});
  EXPECT_THAT(resp, IsArray());

  // Test with an invalid argument
  resp = Run({"FT.CONFIG", "SET", "INVALIDARG"});
  EXPECT_THAT(resp, ErrArg(kSyntaxErr));

  // Test with an invalid argument
  resp = Run({"FT.CONFIG", "SET", "INVALIDARG", "5"});
  EXPECT_THAT(resp, ErrArg("Invalid option"));

  // Test with an invalid value
  resp = Run({"FT.CONFIG", "SET", "MAXSEARCHRESULTS", "not_a_number"});
  EXPECT_THAT(resp, ErrArg("ERR FT.CONFIG SET failed (possibly related to argument "
                           "'MAXSEARCHRESULTS') - argument can not be set"));

  // Test with an invalid argument
  resp = Run({"FT.CONFIG", "HELP", "INVALIDARG"});
  EXPECT_THAT(resp, IsArray());
}

TEST_F(SearchFamilyTest, DropIndexWithDD) {
  // Create an index on HASH documents
  Run({"FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "name", "TEXT"});

  // Add some documents
  Run({"HSET", "doc:1", "name", "Alice"});
  Run({"HSET", "doc:2", "name", "Bob"});
  Run({"HSET", "doc:3", "name", "Charlie"});

  // Verify documents exist
  auto resp = Run({"EXISTS", "doc:1", "doc:2", "doc:3"});
  EXPECT_THAT(resp, IntArg(3));

  // Verify index works
  resp = Run({"FT.SEARCH", "idx", "*"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));

  // Drop index WITHOUT DD - documents should remain
  Run({"FT.DROPINDEX", "idx"});
  resp = Run({"EXISTS", "doc:1", "doc:2", "doc:3"});
  EXPECT_THAT(resp, IntArg(3));

  // Create index again
  Run({"FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "name", "TEXT"});
  ThisFiber::Yield();

  // Verify index works again
  resp = Run({"FT.SEARCH", "idx", "*"});
  EXPECT_THAT(resp, AreDocIds("doc:1", "doc:2", "doc:3"));

  // Drop index WITH DD - documents should be deleted
  Run({"FT.DROPINDEX", "idx", "DD"});
  resp = Run({"EXISTS", "doc:1", "doc:2", "doc:3"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(SearchFamilyTest, DropIndexWithDDJson) {
  // Create an index on JSON documents
  Run({"FT.CREATE", "jidx", "ON", "JSON", "PREFIX", "1", "jdoc:", "SCHEMA", "$.name", "AS", "name",
       "TEXT"});

  // Add some JSON documents
  Run({"JSON.SET", "jdoc:1", "$", R"({"name": "Alice"})"});
  Run({"JSON.SET", "jdoc:2", "$", R"({"name": "Bob"})"});
  Run({"JSON.SET", "jdoc:3", "$", R"({"name": "Charlie"})"});

  // Verify documents exist
  auto resp = Run({"EXISTS", "jdoc:1", "jdoc:2", "jdoc:3"});
  EXPECT_THAT(resp, IntArg(3));

  // Verify index works
  resp = Run({"FT.SEARCH", "jidx", "*"});
  EXPECT_THAT(resp, AreDocIds("jdoc:1", "jdoc:2", "jdoc:3"));

  // Drop index WITH DD - documents should be deleted
  Run({"FT.DROPINDEX", "jidx", "DD"});
  resp = Run({"EXISTS", "jdoc:1", "jdoc:2", "jdoc:3"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(SearchFamilyTest, DropIndexWithInvalidOption) {
  // Create an index
  Run({"FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "name", "TEXT"});
  Run({"HSET", "doc:1", "name", "test"});

  // Drop with unrecognized option (should be ignored, index dropped but documents remain)
  auto resp = Run({"FT.DROPINDEX", "idx", "INVALID"});
  EXPECT_THAT(resp, "OK");

  // Document should still exist
  resp = Run({"EXISTS", "doc:1"});
  EXPECT_THAT(resp, IntArg(1));

  // Clean up
  Run({"DEL", "doc:1"});
}

TEST_F(SearchFamilyTest, ZsetStoreCommandsOverwriteIndexedHash) {
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "field", "TEXT"});
  EXPECT_THAT(Run({"ZADD", "zset1", "1", "a", "2", "b"}), IntArg(2));
  EXPECT_THAT(Run({"ZADD", "zset2", "1.5", "a", "3", "c"}), IntArg(2));

  // Test ZINTERSTORE
  EXPECT_THAT(Run({"HSET", "dest", "field", "value"}), IntArg(1));
  EXPECT_THAT(Run({"ZINTERSTORE", "dest", "2", "zset1", "zset2"}), IntArg(1));
  EXPECT_EQ(Run({"RENAME", "dest", "x"}), "OK");

  // Test ZUNIONSTORE
  EXPECT_THAT(Run({"HSET", "dest", "field", "value"}), IntArg(1));
  EXPECT_THAT(Run({"ZUNIONSTORE", "dest", "2", "zset1", "zset2"}), IntArg(3));
  EXPECT_EQ(Run({"RENAME", "dest", "y"}), "OK");
}

TEST_F(SearchFamilyTest, SetStoreCommandsOverwriteIndexedHash) {
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "field", "TEXT"});
  EXPECT_THAT(Run({"SADD", "set1", "a", "b", "c"}), IntArg(3));
  EXPECT_THAT(Run({"SADD", "set2", "b", "c", "d"}), IntArg(3));

  // Test SINTERSTORE
  EXPECT_THAT(Run({"HSET", "dest", "field", "value"}), IntArg(1));
  EXPECT_THAT(Run({"SINTERSTORE", "dest", "set1", "set2"}), IntArg(2));
  EXPECT_EQ(Run({"RENAME", "dest", "x"}), "OK");

  // Test SUNIONSTORE
  EXPECT_THAT(Run({"HSET", "dest", "field", "value"}), IntArg(1));
  EXPECT_THAT(Run({"SUNIONSTORE", "dest", "set1", "set2"}), IntArg(4));
  EXPECT_EQ(Run({"RENAME", "dest", "y"}), "OK");

  // Test SDIFFSTORE
  EXPECT_THAT(Run({"HSET", "dest", "field", "value"}), IntArg(1));
  EXPECT_THAT(Run({"SDIFFSTORE", "dest", "set1", "set2"}), IntArg(1));
  EXPECT_EQ(Run({"RENAME", "dest", "z"}), "OK");
}

TEST_F(SearchFamilyTest, HsetOnDifferentDatabasesCrash) {
  // This test verifies that creating documents with the same key on different databases
  // doesn't crash. Only database 0 is indexed.
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "field1", "TEXT"});

  // Create document on database 0 - should be indexed
  EXPECT_THAT(Run({"HSET", "hash1", "field1", "value1"}), IntArg(1));
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "value1"}), AreDocIds("hash1"));

  // Switch to database 1
  EXPECT_THAT(Run({"SELECT", "1"}), "OK");

  // Create document with same key on database 1 - should NOT crash
  EXPECT_THAT(Run({"HSET", "hash1", "field1", "another_value"}), IntArg(1));

  // Search on database 1 should return no results (only db 0 is indexed)
  auto resp = Run({"FT.SEARCH", "idx", "another_value"});
  EXPECT_THAT(resp, IntArg(0));

  // Switch back to database 0
  EXPECT_THAT(Run({"SELECT", "0"}), "OK");

  // Search on database 0 should still find the original document
  EXPECT_THAT(Run({"FT.SEARCH", "idx", "value1"}), AreDocIds("hash1"));
}

TEST_F(SearchFamilyTest, QueryStringBytesLimit) {
  EXPECT_EQ(Run({"ft.create", "idx", "ON", "HASH", "SCHEMA", "name", "TEXT", "age", "NUMERIC"}),
            "OK");

  Run({"hset", "doc1", "name", "alice", "age", "30"});
  Run({"hset", "doc2", "name", "bob", "age", "25"});

  absl::FlagSaver fs;

  string query = "@name:alice @age:[25 30]";
  size_t query_len = query.size();

  // Set limit to query_len - 1 (just below query length)
  absl::SetFlag(&FLAGS_search_query_string_bytes, query_len - 1);

  auto resp = Run({"ft.search", "idx", query});
  EXPECT_THAT(resp, ErrArg(absl::StrCat("Query string is too long, max length is ", query_len - 1,
                                        " bytes")));

  absl::SetFlag(&FLAGS_search_query_string_bytes, query_len);

  resp = Run({"ft.search", "idx", query});
  EXPECT_THAT(resp, AreDocIds("doc1"));

  // Test FT.AGGREGATE with same query
  absl::SetFlag(&FLAGS_search_query_string_bytes, query_len - 1);

  resp = Run({"ft.aggregate", "idx", query, "LOAD", "1", "name"});
  EXPECT_THAT(resp, ErrArg(absl::StrCat("Query string is too long, max length is ", query_len - 1,
                                        " bytes")));

  absl::SetFlag(&FLAGS_search_query_string_bytes, query_len);

  resp = Run({"ft.aggregate", "idx", query, "LOAD", "1", "name"});
  EXPECT_THAT(resp, IsUnordArrayWithSize(IsMap("name", "alice")));
}

TEST_F(SearchFamilyTest, KnnHnsw) {
  // Create an index with a vector field using HASH documents
  auto resp = Run({"FT.CREATE", "knn_idx", "ON", "HASH", "SCHEMA", "even", "TAG", "pos", "VECTOR",
                   "HNSW", "6", "TYPE", "FLOAT32", "DIM", "1", "DISTANCE_METRIC", "L2"});
  EXPECT_EQ(resp, "OK");

  // Helper to convert float to binary format
  auto FloatToBytes = [](float f) -> string {
    return string(reinterpret_cast<const char*>(&f), sizeof(float));
  };

  // Add some test documents with vector data
  Run({"HSET", "doc1", "even", "yes", "pos", FloatToBytes(1.0f)});
  Run({"HSET", "doc2", "even", "no", "pos", FloatToBytes(2.0f)});
  Run({"HSET", "doc3", "even", "yes", "pos", FloatToBytes(3.0f)});

  // Add documents without the vector field
  Run({"HSET", "doc4", "even", "yes"});
  Run({"HSET", "doc5", "even", "maybe"});

  // Query vector (2.0f - should find doc2 closest, but filtered to "yes" docs)
  string query_vec = FloatToBytes(2.0f);

  // Perform KNN search with tag filter
  resp = Run({"FT.SEARCH", "knn_idx", "@even:{yes} => [KNN 3 @pos $vec]", "PARAMS", "2", "vec",
              query_vec});
  // Should return documents with "even": "yes" sorted by vector distance to 2.0
  EXPECT_THAT(resp, AreDocIds("doc3", "doc1"));

  // Verify that document without field is added to tag but not in hnsw vector index
  resp = Run({"FT.SEARCH", "knn_idx", "@even:{maybe}"});
  EXPECT_THAT(resp, AreDocIds("doc5"));

  resp = Run({"FT.SEARCH", "knn_idx", "@even:{maybe} => [KNN 3 @pos $vec]", "PARAMS", "2", "vec",
              query_vec});
  EXPECT_THAT(resp, IntArg(0));

  // Verify that empty prefilter return zero results
  resp = Run({"FT.SEARCH", "knn_idx", "@even:{non_existing} => [KNN 3 @pos $vec]", "PARAMS", "2",
              "vec", query_vec});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(SearchFamilyTest, KnnHnswCosineDistanceCalculation) {
  // Create index with 3D vectors using COSINE distance metric with HNSW
  auto resp = Run({"FT.CREATE", "cosine_idx", "ON", "HASH", "SCHEMA", "vec", "VECTOR", "HNSW", "6",
                   "TYPE", "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "COSINE"});
  EXPECT_EQ(resp, "OK");

  // Query vector will be [1, 0, 0]
  // Cosine distance = 1 - cosine_similarity = 1 - (dot_product / (norm1 * norm2))

  // doc1: [1, 0, 0] - identical to query, distance = 0
  Run({"HSET", "doc1", "vec", Vec3ToBytes(1.0f, 0.0f, 0.0f)});

  // doc2: [0, 1, 0] - orthogonal (y-axis), distance = 1
  Run({"HSET", "doc2", "vec", Vec3ToBytes(0.0f, 1.0f, 0.0f)});

  // doc3: [0, 0, 1] - orthogonal (z-axis), distance = 1
  Run({"HSET", "doc3", "vec", Vec3ToBytes(0.0f, 0.0f, 1.0f)});

  // doc4: [-1, 0, 0] - opposite direction, distance = 2
  Run({"HSET", "doc4", "vec", Vec3ToBytes(-1.0f, 0.0f, 0.0f)});

  // doc5: [2, 0, 0] - same direction, 2x magnitude, distance = 0 (cosine is magnitude-invariant)
  Run({"HSET", "doc5", "vec", Vec3ToBytes(2.0f, 0.0f, 0.0f)});

  // doc6: [0, 0, 0] - EDGE CASE: zero vector (undefined cosine, implementation-dependent)
  Run({"HSET", "doc6", "vec", Vec3ToBytes(0.0f, 0.0f, 0.0f)});

  // doc7: [1, 1, 0] - 45° angle in xy-plane, cos_sim = 1/√2 ≈ 0.707, distance ≈ 0.293
  Run({"HSET", "doc7", "vec", Vec3ToBytes(1.0f, 1.0f, 0.0f)});

  // doc8: [1, 1, 1] - equal components, cos_sim = 1/√3 ≈ 0.577, distance ≈ 0.423
  Run({"HSET", "doc8", "vec", Vec3ToBytes(1.0f, 1.0f, 1.0f)});

  // doc9: [0.1, 0, 0] - EDGE CASE: very small magnitude, same direction, distance = 0
  Run({"HSET", "doc9", "vec", Vec3ToBytes(0.1f, 0.0f, 0.0f)});

  // doc10: [10, 0, 0] - EDGE CASE: very large magnitude, same direction, distance = 0
  Run({"HSET", "doc10", "vec", Vec3ToBytes(10.0f, 0.0f, 0.0f)});

  // Query with [1, 0, 0]
  string query_vec = Vec3ToBytes(1.0f, 0.0f, 0.0f);

  // Test: Verify all distance scores
  resp = Run({"FT.SEARCH", "cosine_idx", "*=>[KNN 10 @vec $query_vec AS score]", "PARAMS", "2",
              "query_vec", query_vec, "RETURN", "1", "score", "SORTBY", "score"});

  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto results = resp.GetVec();
  ASSERT_GE(results.size(), 3);  // At least count + 1 doc

  // Gather all scores
  std::map<string, double> doc_scores;
  for (size_t i = 1; i < results.size(); i += 2) {
    string doc_id = results[i].GetString();
    double score = vector_score("score", results[i + 1].GetVec());
    doc_scores[doc_id] = score;
  }

  // Verify expected distances (with tolerance for floating-point)
  // doc1, doc5, doc9, doc10 should all have distance ≈ 0 (same direction, magnitude-invariant)
  if (doc_scores.contains("doc1")) {
    EXPECT_LT(doc_scores["doc1"], 0.01);
  }

  if (doc_scores.contains("doc5")) {
    EXPECT_LT(doc_scores["doc5"], 0.01);
  }

  if (doc_scores.contains("doc9")) {
    EXPECT_LT(doc_scores["doc9"], 0.01);
  }

  if (doc_scores.contains("doc10")) {
    EXPECT_LT(doc_scores["doc10"], 0.01);
  }

  // doc7: 45° angle, distance ≈ 1 - 1/√2 ≈ 0.293
  if (doc_scores.contains("doc7")) {
    EXPECT_GT(doc_scores["doc7"], 0.25);
    EXPECT_LT(doc_scores["doc7"], 0.35);
  }

  // doc8: distance ≈ 1 - 1/√3 ≈ 0.423
  if (doc_scores.contains("doc8")) {
    EXPECT_GT(doc_scores["doc8"], 0.38);
    EXPECT_LT(doc_scores["doc8"], 0.47);
  }

  // doc2, doc3: orthogonal, distance = 1
  if (doc_scores.contains("doc2")) {
    EXPECT_GT(doc_scores["doc2"], 0.95);
    EXPECT_LT(doc_scores["doc2"], 1.05);
  }

  if (doc_scores.contains("doc3")) {
    EXPECT_GT(doc_scores["doc3"], 0.95);
    EXPECT_LT(doc_scores["doc3"], 1.05);
  }

  // doc4: opposite direction, distance = 2
  if (doc_scores.contains("doc4")) {
    EXPECT_GT(doc_scores["doc4"], 1.95);
    EXPECT_LT(doc_scores["doc4"], 2.05);
  }

  // doc6: zero vector - EDGE CASE, behavior is implementation-dependent
  // Most implementations treat it as maximum distance or handle specially
}

TEST_F(SearchFamilyTest, KnnHnswL2DistanceCalculation) {
  // Create index with 3D vectors using L2 (Euclidean) distance metric with HNSW
  auto resp = Run({"FT.CREATE", "l2_idx", "ON", "HASH", "SCHEMA", "vec", "VECTOR", "HNSW", "6",
                   "TYPE", "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "L2"});
  EXPECT_EQ(resp, "OK");

  // Query vector will be [1, 0, 0]
  // L2_distance = sqrt(sum((a[i] - b[i])^2))

  // doc1: [1, 0, 0] - identical to query, distance = 0
  Run({"HSET", "doc1", "vec", Vec3ToBytes(1.0f, 0.0f, 0.0f)});

  // doc2: [0, 1, 0] - orthogonal, distance = sqrt(1 + 1 + 0) = √2 ≈ 1.414
  Run({"HSET", "doc2", "vec", Vec3ToBytes(0.0f, 1.0f, 0.0f)});

  // doc3: [0, 0, 1] - orthogonal, distance = sqrt(1 + 0 + 1) = √2 ≈ 1.414
  Run({"HSET", "doc3", "vec", Vec3ToBytes(0.0f, 0.0f, 1.0f)});

  // doc4: [-1, 0, 0] - opposite direction, distance = sqrt(4 + 0 + 0) = 2
  Run({"HSET", "doc4", "vec", Vec3ToBytes(-1.0f, 0.0f, 0.0f)});

  // doc5: [2, 0, 0] - same direction, 2x magnitude, distance = sqrt(1 + 0 + 0) = 1
  Run({"HSET", "doc5", "vec", Vec3ToBytes(2.0f, 0.0f, 0.0f)});

  // doc6: [0, 0, 0] - EDGE CASE: zero vector, distance = sqrt(1 + 0 + 0) = 1
  Run({"HSET", "doc6", "vec", Vec3ToBytes(0.0f, 0.0f, 0.0f)});

  // doc7: [1, 1, 0] - distance = sqrt(0 + 1 + 0) = 1
  Run({"HSET", "doc7", "vec", Vec3ToBytes(1.0f, 1.0f, 0.0f)});

  // doc8: [1, 1, 1] - distance = sqrt(0 + 1 + 1) = √2 ≈ 1.414
  Run({"HSET", "doc8", "vec", Vec3ToBytes(1.0f, 1.0f, 1.0f)});

  // doc9: [0.1, 0, 0] - EDGE CASE: very small magnitude, distance = sqrt(0.81 + 0 + 0) = 0.9
  Run({"HSET", "doc9", "vec", Vec3ToBytes(0.1f, 0.0f, 0.0f)});

  // doc10: [10, 0, 0] - EDGE CASE: very large magnitude, distance = sqrt(81 + 0 + 0) = 9
  Run({"HSET", "doc10", "vec", Vec3ToBytes(10.0f, 0.0f, 0.0f)});

  // Query with [1, 0, 0]
  string query_vec = Vec3ToBytes(1.0f, 0.0f, 0.0f);

  // Test: Verify all distance scores
  resp = Run({"FT.SEARCH", "l2_idx", "*=>[KNN 10 @vec $query_vec AS score]", "PARAMS", "2",
              "query_vec", query_vec, "RETURN", "1", "score", "SORTBY", "score"});

  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto results = resp.GetVec();
  ASSERT_GE(results.size(), 3);  // At least count + 1 doc

  // Gather all scores
  std::map<string, double> doc_scores;
  for (size_t i = 1; i < results.size(); i += 2) {
    string doc_id = results[i].GetString();
    double score = vector_score("score", results[i + 1].GetVec());
    doc_scores[doc_id] = score;
  }

  // Verify expected distances (with tolerance for floating-point)
  // doc1: distance = 0 (identical)
  if (doc_scores.contains("doc1")) {
    EXPECT_LT(doc_scores["doc1"], 0.01);
  }

  // doc9: distance = 0.9 (small magnitude, same direction)
  if (doc_scores.contains("doc9")) {
    EXPECT_GT(doc_scores["doc9"], 0.85);
    EXPECT_LT(doc_scores["doc9"], 0.95);
  }

  // doc5, doc6, doc7: distance = 1
  if (doc_scores.contains("doc5")) {
    EXPECT_GT(doc_scores["doc5"], 0.95);
    EXPECT_LT(doc_scores["doc5"], 1.05);
  }

  if (doc_scores.contains("doc6")) {
    EXPECT_GT(doc_scores["doc6"], 0.95);
    EXPECT_LT(doc_scores["doc6"], 1.05);
  }

  if (doc_scores.contains("doc7")) {
    EXPECT_GT(doc_scores["doc7"], 0.95);
    EXPECT_LT(doc_scores["doc7"], 1.05);
  }

  // doc2, doc3, doc8: distance = √2 ≈ 1.414
  if (doc_scores.contains("doc2")) {
    EXPECT_GT(doc_scores["doc2"], 1.37);
    EXPECT_LT(doc_scores["doc2"], 1.46);
  }

  if (doc_scores.contains("doc3")) {
    EXPECT_GT(doc_scores["doc3"], 1.37);
    EXPECT_LT(doc_scores["doc3"], 1.46);
  }

  if (doc_scores.contains("doc8")) {
    EXPECT_GT(doc_scores["doc8"], 1.37);
    EXPECT_LT(doc_scores["doc8"], 1.46);
  }

  // doc4: distance = 2 (opposite direction)
  if (doc_scores.contains("doc4")) {
    EXPECT_GT(doc_scores["doc4"], 1.95);
    EXPECT_LT(doc_scores["doc4"], 2.05);
  }

  // doc10: distance = 9 (large magnitude, same direction)
  if (doc_scores.contains("doc10")) {
    EXPECT_GT(doc_scores["doc10"], 8.95);
    EXPECT_LT(doc_scores["doc10"], 9.05);
  }
}

TEST_F(SearchFamilyTest, KnnHnswIPDistanceCalculation) {
  // Create index with 3D vectors using IP (Inner Product) distance metric with HNSW
  auto resp = Run({"FT.CREATE", "ip_idx", "ON", "HASH", "SCHEMA", "vec", "VECTOR", "HNSW", "6",
                   "TYPE", "FLOAT32", "DIM", "3", "DISTANCE_METRIC", "IP"});
  EXPECT_EQ(resp, "OK");

  // Comprehensive test cases with edge cases - SAME VECTORS as other tests
  // Query vector will be [1, 0, 0]
  // IP_distance = 1 - dot_product(a, b)

  // doc1: [1, 0, 0] - dot = 1, distance = 0
  Run({"HSET", "doc1", "vec", Vec3ToBytes(1.0f, 0.0f, 0.0f)});

  // doc2: [0, 1, 0] - dot = 0, distance = 1
  Run({"HSET", "doc2", "vec", Vec3ToBytes(0.0f, 1.0f, 0.0f)});

  // doc3: [0, 0, 1] - dot = 0, distance = 1
  Run({"HSET", "doc3", "vec", Vec3ToBytes(0.0f, 0.0f, 1.0f)});

  // doc4: [-1, 0, 0] - dot = -1, distance = 2
  Run({"HSET", "doc4", "vec", Vec3ToBytes(-1.0f, 0.0f, 0.0f)});

  // doc5: [2, 0, 0] - dot = 2, distance = -1 (NOT magnitude-invariant like cosine)
  Run({"HSET", "doc5", "vec", Vec3ToBytes(2.0f, 0.0f, 0.0f)});

  // doc6: [0, 0, 0] - EDGE CASE: zero vector, dot = 0, distance = 1
  Run({"HSET", "doc6", "vec", Vec3ToBytes(0.0f, 0.0f, 0.0f)});

  // doc7: [1, 1, 0] - dot = 1, distance = 0
  Run({"HSET", "doc7", "vec", Vec3ToBytes(1.0f, 1.0f, 0.0f)});

  // doc8: [1, 1, 1] - dot = 1, distance = 0
  Run({"HSET", "doc8", "vec", Vec3ToBytes(1.0f, 1.0f, 1.0f)});

  // doc9: [0.1, 0, 0] - EDGE CASE: dot = 0.1, distance = 0.9
  Run({"HSET", "doc9", "vec", Vec3ToBytes(0.1f, 0.0f, 0.0f)});

  // doc10: [10, 0, 0] - EDGE CASE: dot = 10, distance = -9
  Run({"HSET", "doc10", "vec", Vec3ToBytes(10.0f, 0.0f, 0.0f)});

  // Query with [1, 0, 0]
  string query_vec = Vec3ToBytes(1.0f, 0.0f, 0.0f);

  // Test: Verify all distance scores
  // For IP, lower distance means higher dot product (better match)
  resp = Run({"FT.SEARCH", "ip_idx", "*=>[KNN 10 @vec $query_vec AS score]", "PARAMS", "2",
              "query_vec", query_vec, "RETURN", "1", "score", "SORTBY", "score"});

  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto results = resp.GetVec();
  ASSERT_GE(results.size(), 3);  // At least count + 1 doc

  // Gather all scores
  std::map<string, double> doc_scores;
  for (size_t i = 1; i < results.size(); i += 2) {
    string doc_id = results[i].GetString();
    double score = vector_score("score", results[i + 1].GetVec());
    doc_scores[doc_id] = score;
  }

  // Verify expected distances (with tolerance for floating-point)
  // doc10: distance = -9 (dot = 10, EDGE CASE: large magnitude advantage)
  if (doc_scores.contains("doc10")) {
    EXPECT_GT(doc_scores["doc10"], -9.05);
    EXPECT_LT(doc_scores["doc10"], -8.95);
  }

  // doc5: distance = -1 (dot = 2, shows magnitude matters for IP unlike cosine)
  if (doc_scores.contains("doc5")) {
    EXPECT_GT(doc_scores["doc5"], -1.05);
    EXPECT_LT(doc_scores["doc5"], -0.95);
  }

  // doc1, doc7, doc8: distance = 0 (dot = 1)
  if (doc_scores.contains("doc1")) {
    EXPECT_GT(doc_scores["doc1"], -0.05);
    EXPECT_LT(doc_scores["doc1"], 0.05);
  }

  if (doc_scores.contains("doc7")) {
    EXPECT_GT(doc_scores["doc7"], -0.05);
    EXPECT_LT(doc_scores["doc7"], 0.05);
  }

  if (doc_scores.contains("doc8")) {
    EXPECT_GT(doc_scores["doc8"], -0.05);
    EXPECT_LT(doc_scores["doc8"], 0.05);
  }

  // doc9: distance = 0.9 (dot = 0.1, EDGE CASE: small magnitude penalty)
  if (doc_scores.contains("doc9")) {
    EXPECT_GT(doc_scores["doc9"], 0.85);
    EXPECT_LT(doc_scores["doc9"], 0.95);
  }

  // doc2, doc3, doc6: distance = 1 (dot = 0)
  if (doc_scores.contains("doc2")) {
    EXPECT_GT(doc_scores["doc2"], 0.95);
    EXPECT_LT(doc_scores["doc2"], 1.05);
  }

  if (doc_scores.contains("doc3")) {
    EXPECT_GT(doc_scores["doc3"], 0.95);
    EXPECT_LT(doc_scores["doc3"], 1.05);
  }

  if (doc_scores.contains("doc6")) {
    EXPECT_GT(doc_scores["doc6"], 0.95);
    EXPECT_LT(doc_scores["doc6"], 1.05);
  }

  // doc4: distance = 2 (dot = -1, opposite direction is worst)
  if (doc_scores.contains("doc4")) {
    EXPECT_GT(doc_scores["doc4"], 1.95);
    EXPECT_LT(doc_scores["doc4"], 2.05);
  }
}

TEST_F(SearchFamilyTest, ParseCSSResponse) {
  using Fields = std::map<std::string, std::string>;
  using Docs = std::map<std::string, Fields>;

  std::string msg1 =
      "*17\r\n:8\r\n$2\r\ns0\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "0\r\n$2\r\ns3\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "3\r\n$2\r\ns7\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "7\r\n$2\r\ns8\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "8\r\n$2\r\ns4\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "4\r\n$2\r\ns9\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 9\r\n";

  std::string msg2 =
      "$2\r\ns1\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest "
      "1\r\n$2\r\ns5\r\n*2\r\n$5\r\ntitle\r\n$6\r\ntest 5\r\n";

  RESPParser reader;
  auto reply = reader.Feed(msg1.c_str(), msg1.size());
  ASSERT_TRUE(reply->Empty());

  reply = reader.Feed(msg2.c_str(), msg2.size());
  ASSERT_FALSE(reply->Empty());

  EXPECT_EQ(reply->GetType(), RESPObj::Type::ARRAY);
  auto array = *reply->As<RESPArray>();
  EXPECT_GE(array.Size(), 1);
  EXPECT_EQ(array[0].GetType(), RESPObj::Type::INTEGER);

  Docs search_results;
  for (size_t i = 1; i < array.Size(); i += 2) {
    auto& fields = search_results[*array[i].As<std::string>()];

    auto field_array = *array[i + 1].As<RESPArray>();

    for (size_t j = 0; j < field_array.Size(); j += 2) {
      std::string field_name = *field_array[j].As<std::string>();
      std::string field_value = *field_array[j + 1].As<std::string>();

      fields[field_name] = field_value;
    }
  }

  EXPECT_EQ(search_results.size(), 8);

  EXPECT_EQ(search_results["s0"]["title"], "test 0");
  EXPECT_EQ(search_results["s1"]["title"], "test 1");
  EXPECT_EQ(search_results["s3"]["title"], "test 3");
  EXPECT_EQ(search_results["s4"]["title"], "test 4");
  EXPECT_EQ(search_results["s5"]["title"], "test 5");
  EXPECT_EQ(search_results["s7"]["title"], "test 7");
  EXPECT_EQ(search_results["s8"]["title"], "test 8");
  EXPECT_EQ(search_results["s9"]["title"], "test 9");
}

TEST_F(SearchFamilyTest, WithSortKeysOption) {
  EXPECT_EQ(Run({"ft.create", "users", "SCHEMA", "first_name", "TEXT", "SORTABLE", "last_name",
                 "TEXT", "age", "NUMERIC", "SORTABLE"}),
            "OK");

  Run({"HSET", "user1", "first_name", "alice", "last_name", "jones", "age", "35"});
  Run({"HSET", "user2", "first_name", "bob", "last_name", "jones", "age", "36"});

  EXPECT_THAT(Run({"FT.SEARCH", "users", "jones", "SORTBY", "age", "WITHSORTKEYS", "NOCONTENT"}),
              IsArray(IntArg(2), "user1", "#35", "user2", "#36"));

  EXPECT_THAT(
      Run({"FT.SEARCH", "users", "jones", "SORTBY", "first_name", "WITHSORTKEYS", "NOCONTENT"}),
      IsArray(IntArg(2), "user1", "$alice", "user2", "$bob"));

  EXPECT_THAT(Run({"FT.SEARCH", "users", "jones", "WITHSORTKEYS", "NOCONTENT"}),
              IsArray(IntArg(2), "user1", ArgType(RespExpr::NIL), "user2", ArgType(RespExpr::NIL)));

  EXPECT_THAT(
      Run({"FT.SEARCH", "users", "jones", "SORTBY", "last_name", "WITHSORTKEYS"}),
      IsUnordArray(IntArg(2),

                   "user2", "$jones", IsMap("last_name", "jones", "first_name", "bob", "age", "36"),
                   "user1", "$jones",
                   IsMap("last_name", "jones", "first_name", "alice", "age", "35")));
}

// GEO index tests for FT.SEARCH with HASH and JSON documents

TEST_F(SearchFamilyTest, GeoSearchHash) {
  auto resp =
      Run({"FT.CREATE", "geo_idx", "ON", "HASH", "SCHEMA", "name", "TEXT", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Add documents with geo coordinates as "lon,lat" or "lon lat" format
  Run({"HSET", "city:1", "name", "Mountain View", "location", "-122.08, 37.386"});
  Run({"HSET", "city:2", "name", "Palo Alto", "location", "-122.143, 37.444"});
  Run({"HSET", "city:3", "name", "San Jose", "location", "-121.886, 37.338"});
  Run({"HSET", "city:4", "name", "San Francisco", "location", "-122.419, 37.774"});

  // Search within 30 miles of Mountain View - should find nearby cities
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 30 mi]"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3"));

  // Search within 50 miles - should include San Francisco
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 50 mi]"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3", "city:4"));

  // Search with very small radius - only exact match
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 1 km]"});
  EXPECT_THAT(resp, AreDocIds("city:1"));

  // Search with wildcard - return all geo indexed docs
  resp = Run({"FT.SEARCH", "geo_idx", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3", "city:4"));

  // Combine geo search with text search
  resp = Run({"FT.SEARCH", "geo_idx", "San* @location:[-122.08 37.386 50 mi]"});
  EXPECT_THAT(resp, AreDocIds("city:3", "city:4"));
}

TEST_F(SearchFamilyTest, GeoSearchJson) {
  auto resp = Run({"FT.CREATE", "geo_idx", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT",
                   "$.location", "AS", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Add JSON documents with geo coordinates
  Run({"JSON.SET", "city:1", ".", R"({"name":"Mountain View","location":"-122.08, 37.386"})"});
  Run({"JSON.SET", "city:2", ".", R"({"name":"Palo Alto","location":"-122.143, 37.444"})"});
  Run({"JSON.SET", "city:3", ".", R"({"name":"San Jose","location":"-121.886, 37.338"})"});
  Run({"JSON.SET", "city:4", ".", R"({"name":"San Francisco","location":"-122.419, 37.774"})"});

  // Search within 30 miles of Mountain View
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 30 mi]"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3"));

  // Search within 50 miles - should include San Francisco
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 50 mi]"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3", "city:4"));

  // Search with kilometers
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 50 km]"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3"));

  // Search with wildcard
  resp = Run({"FT.SEARCH", "geo_idx", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("city:1", "city:2", "city:3", "city:4"));
}

TEST_F(SearchFamilyTest, GeoSearchInvalidValues) {
  auto resp =
      Run({"FT.CREATE", "geo_idx", "ON", "HASH", "SCHEMA", "name", "TEXT", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Test documents with invalid geo values are excluded from index
  Run({"HSET", "d:1", "name", "valid", "location", "-122.08, 37.386"});
  Run({"HSET", "d:2", "name", "invalid_text", "location", "not a coordinate"});
  Run({"HSET", "d:3", "name", "missing_lon", "location", ", 37.386"});
  Run({"HSET", "d:4", "name", "missing_lat", "location", "-122.08,"});
  Run({"HSET", "d:7", "name", "empty", "location", ""});
  Run({"HSET", "d:8", "name", "no_location"});
  Run({"HSET", "d:9", "name", "space_format", "location", "-122.08,  37.386"});

  // Only valid coordinates should be indexed (d:1 and d:9)
  resp = Run({"FT.SEARCH", "geo_idx", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("d:1", "d:9"));

  // Search should only find valid documents
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[-122.08 37.386 100 mi]"});
  EXPECT_THAT(resp, AreDocIds("d:1", "d:9"));

  // All documents should still be searchable by other fields
  // TODO: failed to add - silent skip?
  // resp = Run({"FT.SEARCH", "geo_idx", "*"});
  // EXPECT_THAT(resp, AreDocIds("d:1", "d:2", "d:3", "d:4", "d:5", "d:6", "d:7", "d:8", "d:9"));
}

TEST_F(SearchFamilyTest, GeoSearchInvalidValuesJson) {
  auto resp = Run({"FT.CREATE", "geo_idx", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT",
                   "$.location", "AS", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Test JSON documents with various invalid geo values
  Run({"JSON.SET", "j:1", ".", R"({"name":"valid","location":"-122.08, 37.386"})"});
  Run({"JSON.SET", "j:2", ".", R"({"name":"invalid_text","location":"not a coordinate"})"});
  Run({"JSON.SET", "j:3", ".", R"({"name":"number","location":12345})"});
  Run({"JSON.SET", "j:4", ".", R"({"name":"null_value","location":null})"});
  Run({"JSON.SET", "j:5", ".", R"({"name":"array","location":["-122.08", "37.386"]})"});
  Run({"JSON.SET", "j:6", ".", R"({"name":"no_location"})"});
  Run({"JSON.SET", "j:7", ".", R"({"name":"empty_string","location":""})"});
  Run({"JSON.SET", "j:8", ".", R"({"name":"valid 2","location":"-122.08, 37.386"})"});

  // Only valid coordinates should be indexed
  resp = Run({"FT.SEARCH", "geo_idx", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:8"));

  // All documents should still be searchable via full-text
  // TODO: failed to add - silent skip?
  // resp = Run({"FT.SEARCH", "geo_idx", "*"});
  // EXPECT_THAT(resp, AreDocIds("j:1", "j:2", "j:3", "j:4", "j:5", "j:6", "j:7", "j:8"));
}

TEST_F(SearchFamilyTest, GeoSearchUnits) {
  auto resp = Run({"FT.CREATE", "geo_idx", "ON", "HASH", "SCHEMA", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Test different distance units: m, km, mi, ft
  // TODO: support lowercase
  // TODO: support query with without dot for coord (i.e.) 0.0 0.0
  Run({"HSET", "p:1", "location", "0, 0"});      // Origin
  Run({"HSET", "p:2", "location", "0.001, 0"});  // ~111 meters east
  Run({"HSET", "p:3", "location", "0.01, 0"});   // ~1.11 km east
  Run({"HSET", "p:4", "location", "0.1, 0"});    // ~11.1 km east

  // Test meters
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[0.0 0.0 200 M]"});
  EXPECT_THAT(resp, AreDocIds("p:1", "p:2"));

  // Test kilometers
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[0.0 0.0 2 KM]"});
  EXPECT_THAT(resp, AreDocIds("p:1", "p:2", "p:3"));

  // Test miles
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[0.0 0.0 10 MI]"});
  EXPECT_THAT(resp, AreDocIds("p:1", "p:2", "p:3", "p:4"));

  // Test feet
  resp = Run({"FT.SEARCH", "geo_idx", "@location:[0.0 0.0 500 FT]"});
  EXPECT_THAT(resp, AreDocIds("p:1", "p:2"));
}

TEST_F(SearchFamilyTest, HnswVectorRange) {
  auto FloatToBytes = [](float f) -> string {
    return string(reinterpret_cast<const char*>(&f), sizeof(float));
  };

  // 1-D HNSW index with an extra numeric field for SORTBY testing
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "pos", "VECTOR", "HNSW", "6", "TYPE", "FLOAT32",
       "DIM", "1", "DISTANCE_METRIC", "L2", "val", "NUMERIC"});

  // 10 docs at positions 0..9, val = i*10
  for (int i = 0; i < 10; i++) {
    Run({"HSET", absl::StrFormat("k%d", i), "pos", FloatToBytes(static_cast<float>(i)), "val",
         absl::StrFormat("%d", i * 10)});
  }

  string query_vec = FloatToBytes(5.0f);

  // Basic range: query at 5.0, radius 1.5 → k4 (dist=1), k5 (dist=0), k6 (dist=1)
  auto resp = Run({"FT.SEARCH", "idx", "@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: dist}",
                   "PARAMS", "2", "vec", query_vec, "LIMIT", "0", "10"});
  EXPECT_THAT(resp, AreDocIds("k4", "k5", "k6"));

  // Score alias is returned in each document by default
  resp = Run({"FT.SEARCH", "idx", "@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: dist}",
              "PARAMS", "2", "vec", query_vec, "RETURN", "1", "dist"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  // Response: [total, key1, [field, val, ...], ...]
  // Each doc should have "dist" in its fields
  auto& arr = resp.GetVec();
  ASSERT_GE(arr.size(), 3u);
  for (size_t i = 2; i < arr.size(); i += 2) {
    auto fields = arr[i].GetVec();
    ASSERT_GE(fields.size(), 2u);
    EXPECT_EQ(fields[0].GetString(), "dist");
  }

  // Large radius — all 10 docs returned
  resp = Run({"FT.SEARCH", "idx", "@pos:[VECTOR_RANGE 100 $vec]=>{$YIELD_DISTANCE_AS: dist}",
              "PARAMS", "2", "vec", query_vec, "LIMIT", "0", "20"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_EQ(resp.GetVec()[0].GetInt(), 10);

  // SORTBY val ASC — tests that sort_score is populated for non-score SORTBY
  resp = Run({"FT.SEARCH", "idx", "@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: dist}",
              "PARAMS", "2", "vec", query_vec, "SORTBY", "val", "ASC", "RETURN", "1", "val",
              "LIMIT", "0", "10"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto& asc_arr = resp.GetVec();
  // Extract val values from response: [total, key, [val, v1], key, [val, v2], ...]
  vector<int> vals_asc;
  for (size_t i = 2; i < asc_arr.size(); i += 2) {
    auto fields = asc_arr[i].GetVec();
    ASSERT_GE(fields.size(), 2u);
    vals_asc.push_back(stoi(fields[1].GetString()));
  }
  EXPECT_THAT(vals_asc, ElementsAre(40, 50, 60));

  // SORTBY val DESC
  resp = Run({"FT.SEARCH", "idx", "@pos:[VECTOR_RANGE 1.5 $vec]=>{$YIELD_DISTANCE_AS: dist}",
              "PARAMS", "2", "vec", query_vec, "SORTBY", "val", "DESC", "RETURN", "1", "val",
              "LIMIT", "0", "10"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto& desc_arr = resp.GetVec();
  vector<int> vals_desc;
  for (size_t i = 2; i < desc_arr.size(); i += 2) {
    auto fields = desc_arr[i].GetVec();
    ASSERT_GE(fields.size(), 2u);
    vals_desc.push_back(stoi(fields[1].GetString()));
  }
  EXPECT_THAT(vals_desc, ElementsAre(60, 50, 40));
}

TEST_F(SearchFamilyTest, GeoIndexFieldValidation) {
  // Test 1: Correct geo field definition and usage with HASH
  auto resp =
      Run({"FT.CREATE", "idx_hash", "ON", "HASH", "SCHEMA", "name", "TEXT", "coords", "GEO"});
  EXPECT_EQ(resp, "OK");

  // Documents with correct geo fields
  Run({"HSET", "h:1", "name", "Location_A", "coords", "-122.4194, 37.7749"});
  Run({"HSET", "h:2", "name", "Location_B", "coords", "-118.2437, 34.0522"});

  // Verify correct geo fields are indexed properly
  resp = Run({"FT.SEARCH", "idx_hash", "@coords:*"});
  EXPECT_THAT(resp, AreDocIds("h:1", "h:2"));

  // Test geo search with correct fields
  resp = Run({"FT.SEARCH", "idx_hash", "@coords:[-122.4194 37.7749 50 mi]"});
  EXPECT_THAT(resp, AreDocIds("h:1"));

  // Test 2: Missing geo fields
  Run({"HSET", "h:3", "name", "No_Coords"});  // Missing coords field entirely

  // Documents with missing geo fields should not appear in geo queries
  resp = Run({"FT.SEARCH", "idx_hash", "@coords:*"});
  EXPECT_THAT(resp, AreDocIds("h:1", "h:2"));

  // But should still be searchable by text fields
  resp = Run({"FT.SEARCH", "idx_hash", "@name:No_Coords"});
  EXPECT_THAT(resp, AreDocIds("h:3"));

  // Test 3: Incorrect geo field formats
  Run({"HSET", "h:4", "name", "Empty_Coords", "coords", ""});  // Empty coords field
  Run({"HSET", "h:5", "name", "Invalid_Text", "coords", "not a coordinate"});
  Run({"HSET", "h:6", "name", "Out_of_Range_Lat", "coords", "-122.0, 91.0"});  // Lat > 90
  Run({"HSET", "h:7", "name", "Out_of_Range_Lon", "coords", "181.0, 45.0"});   // Lon > 180
  Run({"HSET", "h:8", "name", "Missing_Lon", "coords", ", 37.7749"});
  Run({"HSET", "h:9", "name", "Missing_Lat", "coords", "-122.4194,"});
  Run({"HSET", "h:10", "name", "Single_Value", "coords", "-122.4194"});
  Run({"HSET", "h:11", "name", "Too_Many_Values", "coords", "-122.4194, 37.7749, 100"});
  Run({"HSET", "h:12", "name", "Special_Chars", "coords", "abc#@!, xyz!@#"});

  // Verify incorrect formats are not indexed in geo field
  resp = Run({"FT.SEARCH", "idx_hash", "@coords:*"});
  EXPECT_THAT(resp, AreDocIds("h:1", "h:2"));

  // Verify incorrect formats are not indexed at all
  resp = Run({"FT.SEARCH", "idx_hash", "*"});
  EXPECT_THAT(resp, AreDocIds("h:1", "h:2", "h:3"));

  // Test 4: Correct geo field definition with JSON
  resp = Run({"FT.CREATE", "idx_json", "ON", "JSON", "SCHEMA", "$.name", "AS", "name", "TEXT",
              "$.location", "AS", "location", "GEO"});
  EXPECT_EQ(resp, "OK");

  // JSON documents with correct geo fields
  Run({"JSON.SET", "j:1", ".", R"({"name":"City_A","location":"-122.4194, 37.7749"})"});
  Run({"JSON.SET", "j:2", ".", R"({"name":"City_B","location":"-118.2437, 34.0522"})"});

  // Verify correct geo fields are indexed
  resp = Run({"FT.SEARCH", "idx_json", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2"));

  // Test 5: JSON documents with missing geo fields
  Run({"JSON.SET", "j:3", ".", R"({"name":"No_Location"})"});  // Missing location field
  Run({"JSON.SET", "j:4", ".", R"({"name":"Null_Location","location":null})"});  // Null value

  // Missing/null geo fields should not appear in geo queries
  resp = Run({"FT.SEARCH", "idx_json", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2"));

  // But should be searchable by text
  resp = Run({"FT.SEARCH", "idx_json", "@name:*Location"});
  EXPECT_THAT(resp, AreDocIds("j:3", "j:4"));

  // Test 6: JSON documents with incorrect geo field types/formats
  Run({"JSON.SET", "j:5", ".", R"({"name":"Empty_Location","location":""})"});  // Empty string
  Run({"JSON.SET", "j:6", ".", R"({"name":"Number_Type","location":12345})"});
  Run({"JSON.SET", "j:7", ".", R"({"name":"Boolean_Type","location":true})"});
  Run({"JSON.SET", "j:8", ".", R"({"name":"Array_Type","location":["-122.4", "37.7"]})"});
  Run({"JSON.SET", "j:9", ".", R"({"name":"Object_Type","location":{"lon":-122.4,"lat":37.7}})"});
  Run({"JSON.SET", "j:10", ".", R"({"name":"Invalid_Format","location":"invalid coords"})"});
  Run({"JSON.SET", "j:11", ".", R"({"name":"Out_of_Range","location":"200, 100"})"});

  // Verify incorrect types/formats are not indexed as geo
  resp = Run({"FT.SEARCH", "idx_json", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2"));

  // Documents with incorrect geo formats should still be searchable by text
  resp = Run({"FT.SEARCH", "idx_json", "@name:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2", "j:3", "j:4"));

  // Test 7: Adding multiple locations for same document should index all locations
  Run({"JSON.SET", "j:12", ".",
       R"({"name":"Multi_Locations","location":["-123.00, 12.00", "-124.0, 12.0"]})"});

  resp = Run({"FT.SEARCH", "idx_json", "@location:[-123.00 12.00 1 m]"});
  EXPECT_THAT(resp, AreDocIds("j:12"));

  resp = Run({"FT.SEARCH", "idx_json", "@location:[-124.00 12.00 1 m]"});
  EXPECT_THAT(resp, AreDocIds("j:12"));

  // Check that we return only one document even if multiple locations match
  resp = Run({"FT.SEARCH", "idx_json", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2", "j:12"));

  resp = Run({"FT.SEARCH", "idx_json", "@location:[-124.00 12.00 1000 km]"});
  EXPECT_THAT(resp, AreDocIds("j:12"));

  // Deleting multi location document should remove all locations
  Run({"JSON.DEL", "j:12"});
  resp = Run({"FT.SEARCH", "idx_json", "@location:*"});
  EXPECT_THAT(resp, AreDocIds("j:1", "j:2"));
}

TEST_F(SearchFamilyTest, VectorFieldWrongSizeDoesNotCrash) {
  // DIM=1 FLOAT32 expects exactly 4 bytes per value.
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "pos", "VECTOR", "HNSW", "6", "TYPE", "FLOAT32",
       "DIM", "1", "DISTANCE_METRIC", "L2"});

  // Insert values with wrong byte lengths (6 and 7 bytes instead of 4).
  Run({"HSET", "k1", "pos", "AAAAAAA"});  // 7 bytes
  Run({"HSET", "k2", "pos", "AQAAAA"});   // 6 bytes
  Run({"HSET", "k3", "pos", "AgAAAA"});   // 6 bytes

  // FT.SEARCH must not crash when serializing the wrong-sized vector fields.
  auto resp = Run({"FT.SEARCH", "idx", "*", "PARAMS", "2", "vec", "AQAAAA", "LIMIT", "0", "10"});
  EXPECT_THAT(resp, Not(ErrArg("")));

  // Same scenario with 10-byte values and multiple keys.
  Run({"FT.CREATE", "idx2", "ON", "HASH", "SCHEMA", "v", "VECTOR", "HNSW", "6", "TYPE", "FLOAT32",
       "DIM", "1", "DISTANCE_METRIC", "L2"});
  Run({"HSET", "a1", "v", "aaaaaaaaaa"});  // 10 bytes
  Run({"HSET", "a2", "v", "bbbbbbbbbb"});
  Run({"HSET", "a3", "v", "cccccccccc"});
  Run({"HSET", "a4", "v", "dddddddddd"});
  Run({"HSET", "a5", "v", "eeeeeeeeee"});

  resp = Run({"FT.SEARCH", "idx2", "*", "PARAMS", "2", "vec", "aaaaaaaaaa", "LIMIT", "0", "100"});
  EXPECT_THAT(resp, Not(ErrArg("")));
}

TEST_F(SearchFamilyTest, SortBySkipsDocsWithoutSortField) {
  // KeepTopKSorted skips docs that don't have the sort field, returning fewer sort scores
  // than result.ids.size(). The loop then accesses sort_scores[i] out-of-bounds.
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "val", "NUMERIC"});

  Run({"HSET", "valid:1", "val", "123"});
  Run({"HSET", "valid:2", "val", "456"});
  Run({"HSET", "valid:3", "val", "789"});

  // These docs are indexed (no prefix restriction) but lack the sort field.
  // They appear in '*' search results but are skipped by KeepTopKSorted.
  for (int i = 0; i < 97; i++)
    Run({"HSET", absl::StrCat("nofield:", i), "txt", "garbage"});

  auto resp = Run({"FT.SEARCH", "idx", "*", "SORTBY", "val", "LIMIT", "0", "100"});
  auto vec = resp.GetVec();

  // Extract doc keys from the response (indices 1, 3, 5, ...).
  vector<string> keys;
  for (size_t i = 1; i < vec.size(); i += 2)
    keys.push_back(vec[i].GetString());

  EXPECT_THAT(keys, ElementsAre("valid:1", "valid:2", "valid:3"));
}

TEST_F(SearchFamilyTest, NumericIndexRejectsNonFiniteValues) {
  // Regression test: HSET with inf/nan values on a NUMERIC field used to crash with
  // DCHECK(std::isfinite(value)) in RangeTree::Add, because absl::SimpleAtod accepts
  // "inf", "-inf", "nan" etc. as valid doubles.
  Run({"FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "val", "NUMERIC"});

  Run({"HSET", "doc:1", "val", "inf"});
  Run({"HSET", "doc:2", "val", "-inf"});
  Run({"HSET", "doc:3", "val", "+inf"});
  Run({"HSET", "doc:4", "val", "nan"});
  Run({"HSET", "doc:5", "val", "42"});  // finite — must still be indexed

  // Non-finite docs are not in the numeric index; only doc:5 should match the range query.
  auto resp = Run({"FT.SEARCH", "idx", "@val:[-inf +inf]"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1), "doc:5", _)));
}

}  // namespace dfly


================================================
FILE: src/server/serializer_base.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/serializer_base.h"

#include "base/logging.h"

namespace dfly {

SerializerBase::SerializerBase(DbSlice* slice) : db_slice_(slice) {
}

SerializerBase::~SerializerBase() {
}

uint64_t SerializerBase::RegisterChangeListener() {
  DCHECK(db_slice_);
  auto cb = [this](DbIndex db_index, const DbSlice::ChangeReq& req) {
    HandleChangeReq(db_index, req);
  };
  snapshot_version_ = db_slice_->RegisterOnChange(std::move(cb));
  return snapshot_version_;
}

void SerializerBase::UnregisterChangeListener() {
  if (snapshot_version_ == 0)
    return;
  DCHECK(db_slice_);
  db_slice_->UnregisterOnChange(snapshot_version_);
  snapshot_version_ = 0;
}

void SerializerBase::MarkBucketSerializing(BucketIdentity bid) {
  DCHECK(!bucket_states_.contains(bid)) << "Bucket already in transient state";
  bucket_states_.emplace(bid, BucketState{BucketPhase::kSerializing, {}});
}

void SerializerBase::FinishBucketIteration(BucketIdentity bid,
                                           std::vector<TieredDelayedEntry> delayed) {
  auto it = bucket_states_.find(bid);
  DCHECK(it != bucket_states_.end() && it->second.phase == BucketPhase::kSerializing);

  if (delayed.empty()) {
    // Serializing -> Covered
    bucket_states_.erase(it);
  } else {
    // Serializing -> DelayedPending
    it->second.phase = BucketPhase::kDelayedPending;
    it->second.delayed = std::move(delayed);
  }
}

void SerializerBase::CompleteBucketDelayed(BucketIdentity bid) {
  auto it = bucket_states_.find(bid);
  DCHECK(it != bucket_states_.end() && it->second.phase == BucketPhase::kDelayedPending);
  bucket_states_.erase(it);
}

void SerializerBase::OnChange(DbIndex db_index, PrimeTable::bucket_iterator it) {
  std::lock_guard guard(big_value_mu_);

  if (it.is_done() || it.GetVersion() >= snapshot_version_) {
    ++stats_.buckets_skipped;
    return;
  }

  BucketIdentity bid = it.bucket_address();
  if (bucket_states_.contains(bid)) {
    ++stats_.change_during_serialization;
    return;
  }

  it.SetVersion(snapshot_version_);
  MarkBucketSerializing(bid);
  DoSerializeBucket(db_index, it);
  FinishBucketIteration(bid, {});
  ++stats_.buckets_on_change;
}

void SerializerBase::OnInsert(DbIndex db_index, std::string_view key) {
  DCHECK(db_slice_);
  PrimeTable* table = db_slice_->GetTables(db_index).first;
  table->CVCUponInsert(snapshot_version_, key, [this, db_index](PrimeTable::bucket_iterator bit) {
    DCHECK_LT(bit.GetVersion(), snapshot_version_);
    OnChange(db_index, bit);
  });
}

void SerializerBase::HandleChangeReq(DbIndex db_index, const DbSlice::ChangeReq& req) {
  if (auto update = req.update(); update) {
    OnChange(db_index, *update);
  } else {
    OnInsert(db_index, std::get<std::string_view>(req.change));
  }
}

}  // namespace dfly


================================================
FILE: src/server/serializer_base.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <vector>

#include "server/db_slice.h"
#include "server/synchronization.h"
#include "server/table.h"
#include "server/tiered_storage.h"

namespace dfly {

// Opaque identity for a physical DashTable bucket — its memory address.
// Unique across all databases/segments for the lifetime of a serialization.
using BucketIdentity = uintptr_t;

// SerializerBase owns the DbSlice change-listener registration and a per-bucket
// state machine that tracks each bucket through:
//
//   NotVisited  ->  Serializing  ->  (DelayedPending  ->)  Covered
//
// NotVisited and Covered are implicit (bucket absent from the map).
// Only transient states (Serializing, DelayedPending) are stored in the map.
//
// State tracking is purely observational in early PRs: it drives DCHECKs and
// stats but does not alter the serialization control flow.
class SerializerBase {
 public:
  // Aggregated counters for observability.
  struct Stats {
    uint64_t buckets_loop = 0;       // main traversal loop
    uint64_t buckets_on_change = 0;  // OnChange callback fired
    uint64_t buckets_skipped = 0;    // already Covered when seen
    uint64_t keys_serialized = 0;
    uint64_t change_during_serialization = 0;  // change hit an in-flight bucket
  };

  explicit SerializerBase(DbSlice* slice);
  virtual ~SerializerBase();

  // Registers a ChangeCallback with DbSlice.  Returns the snapshot version
  // (version upper-bound for entries that must be saved).
  uint64_t RegisterChangeListener();

  // Unregisters the callback.  Safe to call if already unregistered.
  void UnregisterChangeListener();

  uint64_t snapshot_version() const {
    return snapshot_version_;
  }

  const Stats& GetStats() const {
    return stats_;
  }

 protected:
  // Phase of an in-flight bucket (only stored while transient).
  enum class BucketPhase : uint8_t {
    kSerializing,     // bucket is being iterated by the main loop / OnChange
    kDelayedPending,  // all entries serialized but tiered reads still in-flight
  };

  struct BucketState {
    BucketPhase phase;
    std::vector<TieredDelayedEntry> delayed;
  };

  // --- Bucket state machine ---

  // Transition bucket from NotVisited -> Serializing.
  // Must be called before DoSerializeBucket.  Caller is responsible for
  // stamping the bucket version to snapshot_version_ first.
  void MarkBucketSerializing(BucketIdentity bid);

  // Transition bucket from Serializing -> Covered (empty delayed) or
  // Serializing -> DelayedPending (non-empty delayed).
  // Takes ownership of the delayed entries.
  void FinishBucketIteration(BucketIdentity bid, std::vector<TieredDelayedEntry> delayed);

  // Transition bucket from DelayedPending -> Covered.
  void CompleteBucketDelayed(BucketIdentity bid);

  // --- Subclass serialization hook ---

  // Serialize a single bucket.  Returns the number of entries serialized.
  // Called while big_value_mu_ is held.
  virtual unsigned DoSerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator it) = 0;

  // --- Change callbacks ---

  // Called when an existing bucket is about to be mutated.
  // Default: if unvisited, stamps version, MarkBucketSerializing, DoSerializeBucket,
  //          FinishBucketIteration.
  //          If in-flight, increments change_during_serialization (mutex barrier
  //          preserves the existing serialization behaviour).
  // Holds big_value_mu_ while running.
  virtual void OnChange(DbIndex db_index, PrimeTable::bucket_iterator it);

  // Called when a new key is about to be inserted.
  // Default: CVCUponInsert -> OnChange for every touched bucket.
  virtual void OnInsert(DbIndex db_index, std::string_view key);

  // --- Shared members (to be moved from subclasses in later PRs) ---

  DbSlice* db_slice_;
  DbTableArray db_array_;
  uint64_t snapshot_version_ = 0;
  ThreadLocalMutex big_value_mu_;
  Stats stats_;

 private:
  // Called by DbSlice when a change is detected.
  void HandleChangeReq(DbIndex db_index, const DbSlice::ChangeReq& req);

  absl::flat_hash_map<BucketIdentity, BucketState> bucket_states_;
  uint64_t change_cb_id_ = 0;

  // For unit-test only.
  size_t BucketStateCountForTesting() const {
    return bucket_states_.size();
  }
  friend class SerializerBaseTest;
};

}  // namespace dfly


================================================
FILE: src/server/serializer_base_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/serializer_base.h"

#include "base/gtest.h"
#include "base/logging.h"
#include "server/test_utils.h"

namespace dfly {

class SerializerBaseTest : public BaseFamilyTest, public SerializerBase {
 public:
  SerializerBaseTest() : SerializerBase(nullptr) {
  }

 protected:
  using SerializerBase::BucketPhase;
  using SerializerBase::CompleteBucketDelayed;
  using SerializerBase::FinishBucketIteration;
  using SerializerBase::MarkBucketSerializing;

  size_t BucketCount() const {
    return BucketStateCountForTesting();
  }

  unsigned DoSerializeBucket(DbIndex /*db_index*/, PrimeTable::bucket_iterator /*it*/) override {
    return 0;
  }
};

// --- State-machine tests ---

TEST_F(SerializerBaseTest, MarkThenFinishNoneDelayed) {
  constexpr BucketIdentity bid = 0x1000;

  EXPECT_EQ(0u, BucketCount());
  MarkBucketSerializing(bid);
  EXPECT_EQ(1u, BucketCount());

  FinishBucketIteration(bid, {});
  EXPECT_EQ(0u, BucketCount());
}

TEST_F(SerializerBaseTest, MarkThenFinishWithDelayedThenComplete) {
  constexpr BucketIdentity bid = 0x2000;

  MarkBucketSerializing(bid);
  EXPECT_EQ(1u, BucketCount());

  // Simulate one delayed (tiered) entry.
  std::vector<TieredDelayedEntry> delayed;
  delayed.push_back({});
  FinishBucketIteration(bid, std::move(delayed));

  EXPECT_EQ(1u, BucketCount());

  CompleteBucketDelayed(bid);
  EXPECT_EQ(0u, BucketCount());
}

TEST_F(SerializerBaseTest, MultipleBucketsIndependent) {
  constexpr BucketIdentity bid1 = 0x1000;
  constexpr BucketIdentity bid2 = 0x2000;
  constexpr BucketIdentity bid3 = 0x3000;

  MarkBucketSerializing(bid1);
  MarkBucketSerializing(bid2);
  MarkBucketSerializing(bid3);
  EXPECT_EQ(3u, BucketCount());

  FinishBucketIteration(bid2, {});
  EXPECT_EQ(2u, BucketCount());

  std::vector<TieredDelayedEntry> d;
  d.push_back({});
  FinishBucketIteration(bid1, std::move(d));
  EXPECT_EQ(2u, BucketCount());

  FinishBucketIteration(bid3, {});
  EXPECT_EQ(1u, BucketCount());

  CompleteBucketDelayed(bid1);
  EXPECT_EQ(0u, BucketCount());
}

}  // namespace dfly


================================================
FILE: src/server/serializer_commons.cc
================================================
#include "server/serializer_commons.h"

extern "C" {
#include "redis/rdb.h"
}

#include <absl/base/internal/endian.h>

#include <system_error>

#include "base/logging.h"

using namespace std;

namespace dfly {

int PackedUIntMeta::Type() const {
  return (first_byte & 0xC0) >> 6;
}

unsigned PackedUIntMeta::ByteSize() const {
  switch (Type()) {
    case RDB_ENCVAL:
    case RDB_6BITLEN:
      return 0;
    case RDB_14BITLEN:
      return 1;
  };
  switch (first_byte) {
    case RDB_32BITLEN:
      return 4;
    case RDB_64BITLEN:
      return 8;
  };
  return 0;
}

/* Saves an encoded unsigned integer. The first two bits in the first byte are used to
 * hold the encoding type. See the RDB_* definitions for more information
 * on the types of encoding. buf must be at least 9 bytes.
 * */
unsigned WritePackedUInt(uint64_t value, io::MutableBytes buf) {
  if (value < (1 << 6)) {
    /* Save a 6 bit value */
    buf[0] = (value & 0xFF) | (RDB_6BITLEN << 6);
    return 1;
  }

  if (value < (1 << 14)) {
    /* Save a 14 bit value */
    buf[0] = ((value >> 8) & 0xFF) | (RDB_14BITLEN << 6);
    buf[1] = value & 0xFF;
    return 2;
  }

  if (value <= UINT32_MAX) {
    /* Save a 32 bit value */
    buf[0] = RDB_32BITLEN;
    absl::big_endian::Store32(buf.data() + 1, value);
    return 1 + 4;
  }

  /* Save a 64 bit value */
  buf[0] = RDB_64BITLEN;
  absl::big_endian::Store64(buf.data() + 1, value);
  return 1 + 8;
}

io::Result<uint64_t> ReadPackedUInt(PackedUIntMeta meta, io::Bytes bytes) {
  DCHECK(meta.ByteSize() <= bytes.size());
  switch (meta.Type()) {
    case RDB_ENCVAL:
    case RDB_6BITLEN:
      return meta.first_byte & 0x3F;
    case RDB_14BITLEN:
      return ((meta.first_byte & 0x3F) << 8) | bytes[0];
  };
  switch (meta.first_byte) {
    case RDB_32BITLEN:
      return absl::big_endian::Load32(bytes.data());
    case RDB_64BITLEN:
      return absl::big_endian::Load64(bytes.data());
  };
  return make_unexpected(make_error_code(errc::illegal_byte_sequence));
}

}  // namespace dfly


================================================
FILE: src/server/serializer_commons.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <nonstd/expected.hpp>
#include <system_error>

#include "io/io.h"
#include "server/error.h"

namespace dfly {

using nonstd::make_unexpected;

#define SET_OR_RETURN(expr, dest)              \
  do {                                         \
    auto exp_val = (expr);                     \
    if (!exp_val) {                            \
      VLOG(1) << "Error while calling " #expr; \
      return exp_val.error();                  \
    }                                          \
    dest = std::move(exp_val.value());         \
  } while (0)

#define SET_OR_UNEXPECT(expr, dest)            \
  {                                            \
    auto exp_res = (expr);                     \
    if (!exp_res)                              \
      return make_unexpected(exp_res.error()); \
    dest = std::move(exp_res.value());         \
  }

// Represents meta information for an encoded packed unsigned integer.
struct PackedUIntMeta {
  // Initialize by first byte in sequence.
  PackedUIntMeta(uint8_t first_byte) : first_byte{first_byte} {
  }

  // Get underlying RDB type.
  int Type() const;

  // Get additional size in bytes (excluding first one).
  unsigned ByteSize() const;

  uint8_t first_byte;
};

// Saves an packed unsigned integer. The first two bits in the first byte are used to
// hold the encoding type. See the RDB_* definitions for more information
// on the types of encoding. buf must be at least 9 bytes.
unsigned WritePackedUInt(uint64_t value, io::MutableBytes dest);

// Deserialize packed unsigned integer.
io::Result<uint64_t> ReadPackedUInt(PackedUIntMeta meta, io::Bytes source);

}  // namespace dfly


================================================
FILE: src/server/server_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/server_family.h"

#include <absl/cleanup/cleanup.h>
#include <absl/random/random.h>  // for master_replid_ generation.
#include <absl/strings/match.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_replace.h>
#include <absl/strings/strip.h>
#include <croncpp.h>  // cron::cronexpr
#include <fcntl.h>    // for mkstemp
#include <hdr/hdr_histogram.h>
#include <sys/resource.h>
#include <sys/stat.h>  // for fchmod
#include <sys/utsname.h>
#include <unistd.h>  // for getpid(), write(), close(), unlink(), fsync()

#include <algorithm>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <optional>
#include <unordered_map>
#include <unordered_set>

#include "absl/strings/ascii.h"
#include "core/detail/gen_utils.h"
#include "facade/error.h"
#include "server/common.h"
#include "server/slowlog.h"

extern "C" {
#include "redis/redis_aux.h"
}

#include "base/flags.h"
#include "base/histogram.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/dense_set.h"
#include "facade/cmd_arg_parser.h"
#include "facade/dragonfly_connection.h"
#include "facade/dragonfly_listener.h"
#include "facade/reply_builder.h"
#include "io/file_util.h"
#include "io/proc_reader.h"
#include "search/doc_index.h"
#include "server/acl/acl_commands_def.h"
#include "server/acl/user_registry.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/debugcmd.h"
#include "server/detail/save_stages_controller.h"
#include "server/detail/snapshot_storage.h"
#include "server/dflycmd.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/generic_family.h"
#include "server/journal/journal.h"
#include "server/main_service.h"
#include "server/memory_cmd.h"
#include "server/multi_command_squasher.h"
#include "server/namespaces.h"
#include "server/rdb_load.h"
#include "server/rdb_save.h"
#include "server/replica.h"
#include "server/script_mgr.h"
#include "server/search/search_family.h"
#include "server/server_state.h"
#include "server/snapshot.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"
#include "server/version.h"
#include "strings/human_readable.h"
#include "util/accept_server.h"
#include "util/aws/aws.h"

using namespace std;

struct ReplicaOfFlag {
  string host;
  string port;

  bool has_value() const {
    return !host.empty() && !port.empty();
  }
};

static bool AbslParseFlag(std::string_view in, ReplicaOfFlag* flag, std::string* err);
static std::string AbslUnparseFlag(const ReplicaOfFlag& flag);

struct CronExprFlag {
  static constexpr std::string_view kCronPrefix = "0 "sv;
  std::optional<cron::cronexpr> cron_expr;
};

static bool AbslParseFlag(std::string_view in, CronExprFlag* flag, std::string* err);
static std::string AbslUnparseFlag(const CronExprFlag& flag);

ABSL_FLAG(string, dir, "", "working directory");
ABSL_FLAG(string, dbfilename, "dump-{timestamp}",
          "the filename to save/load the DB, instead of/with {timestamp} can be used {Y}, {m}, and "
          "{d} macros");
ABSL_FLAG(string, requirepass, "",
          "password for AUTH authentication. "
          "If empty can also be set with DFLY_PASSWORD environment variable.");
ABSL_FLAG(uint32_t, maxclients, 64000, "Maximum number of concurrent clients allowed.");

ABSL_FLAG(string, save_schedule, "", "the flag is deprecated, please use snapshot_cron instead");
ABSL_FLAG(CronExprFlag, snapshot_cron, {},
          "cron expression for the time to save a snapshot, crontab style");
ABSL_FLAG(bool, df_snapshot_format, true,
          "if true, save in dragonfly-specific snapshotting format");
ABSL_FLAG(int, epoll_file_threads, 0,
          "thread size for file workers when running in epoll mode, default is hardware concurrent "
          "threads");
ABSL_FLAG(ReplicaOfFlag, replicaof, ReplicaOfFlag{},
          "Specifies a host and port which point to a target master "
          "to replicate. "
          "Format should be <IPv4>:<PORT> or host:<PORT> or [<IPv6>]:<PORT>");
ABSL_FLAG(int32_t, slowlog_log_slower_than, 10000,
          "Add commands slower than this threshold to slow log. The value is expressed in "
          "microseconds and if it's negative - disables the slowlog.");
ABSL_FLAG(uint32_t, slowlog_max_len, 20, "Slow log maximum length.");

ABSL_FLAG(uint32_t, pause_wait_timeout, 1,
          "Timeout in seconds, to set up the pause for all connections for CLIENT PAUSE command "
          "and cluster slot migration finalization procedure.");

ABSL_FLAG(string, s3_endpoint, "", "endpoint for s3 snapshots, default uses aws regional endpoint");
ABSL_FLAG(bool, s3_use_https, true, "whether to use https for s3 endpoints");
// Disable EC2 metadata by default, or if a users credentials are invalid the
// AWS client will spent 30s trying to connect to inaccessable EC2 endpoints
// to load the credentials.
ABSL_FLAG(bool, s3_ec2_metadata, false,
          "whether to load credentials and configuration from EC2 metadata");
// Enables S3 payload signing over HTTP. This reduces the latency and resource
// usage when writing snapshots to S3, at the expense of security.
ABSL_FLAG(bool, s3_sign_payload, true,
          "whether to sign the s3 request payload when uploading snapshots");

ABSL_FLAG(bool, info_replication_valkey_compatible, true,
          "when true - output valkey compatible values for info-replication");

ABSL_FLAG(bool, managed_service_info, false,
          "Hides some implementation details from users when true (i.e. in managed service env)");

ABSL_FLAG(string, availability_zone, "",
          "server availability zone, used by clients to read from local-zone replicas");

ABSL_FLAG(bool, keep_legacy_memory_metrics, true, "legacy metrics format");
// TODO deprecate when flipped in production
ABSL_FLAG(bool, replicaof_no_one_start_journal, true,
          "when set, preserves journal offsets after REPLICAOF NO ONE");

ABSL_DECLARE_FLAG(int32_t, port);
ABSL_DECLARE_FLAG(bool, cache_mode);
ABSL_DECLARE_FLAG(int32_t, hz);
ABSL_DECLARE_FLAG(bool, tls);
ABSL_DECLARE_FLAG(string, tls_ca_cert_file);
ABSL_DECLARE_FLAG(string, tls_ca_cert_dir);
ABSL_DECLARE_FLAG(int, replica_priority);
ABSL_DECLARE_FLAG(double, rss_oom_deny_ratio);
ABSL_DECLARE_FLAG(bool, experimental_replicaof_v2);

bool AbslParseFlag(std::string_view in, ReplicaOfFlag* flag, std::string* err) {
#define RETURN_ON_ERROR(cond, m)                                           \
  do {                                                                     \
    if ((cond)) {                                                          \
      *err = m;                                                            \
      LOG(WARNING) << "Error in parsing arguments for --replicaof: " << m; \
      return false;                                                        \
    }                                                                      \
  } while (0)

  if (in.empty()) {  // on empty flag "parse" nothing. If we return false then DF exists.
    *flag = ReplicaOfFlag{};
    return true;
  }

  auto pos = in.find_last_of(':');
  RETURN_ON_ERROR(pos == string::npos, "missing ':'.");

  string_view ip = in.substr(0, pos);
  flag->port = in.substr(pos + 1);

  RETURN_ON_ERROR(ip.empty() || flag->port.empty(), "IP/host or port are empty.");

  // For IPv6: ip1.front == '[' AND ip1.back == ']'
  // For IPv4: ip1.front != '[' AND ip1.back != ']'
  // Together, this ip1.front == '[' iff ip1.back == ']', which can be implemented as XNOR (NOT XOR)
  RETURN_ON_ERROR(((ip.front() == '[') ^ (ip.back() == ']')), "unclosed brackets.");

  if (ip.front() == '[') {
    // shortest possible IPv6 is '::1' (loopback)
    RETURN_ON_ERROR(ip.length() <= 2, "IPv6 host name is too short");

    flag->host = ip.substr(1, ip.length() - 2);
  } else {
    flag->host = ip;
  }

  VLOG(1) << "--replicaof: Received " << flag->host << " :  " << flag->port;
  return true;
#undef RETURN_ON_ERROR
}

std::string AbslUnparseFlag(const ReplicaOfFlag& flag) {
  return (flag.has_value()) ? absl::StrCat(flag.host, ":", flag.port) : "";
}

bool AbslParseFlag(std::string_view in, CronExprFlag* flag, std::string* err) {
  if (in.empty()) {
    flag->cron_expr = std::nullopt;
    return true;
  }
  if (absl::StartsWith(in, "\"")) {
    *err = absl::StrCat("Could it be that you put quotes in the flagfile?");

    return false;
  }

  std::string raw_cron_expr = absl::StrCat(CronExprFlag::kCronPrefix, in);
  try {
    VLOG(1) << "creating cron from: '" << raw_cron_expr << "'";
    flag->cron_expr = cron::make_cron(raw_cron_expr);
    return true;
  } catch (const cron::bad_cronexpr& ex) {
    *err = ex.what();
  }
  return false;
}

std::string AbslUnparseFlag(const CronExprFlag& flag) {
  if (flag.cron_expr) {
    auto str_expr = to_cronstr(*flag.cron_expr);
    DCHECK(absl::StartsWith(str_expr, CronExprFlag::kCronPrefix));
    return str_expr.substr(CronExprFlag::kCronPrefix.size());
  }
  return "";
}

namespace dfly {

using absl::GetFlag;
using absl::StrCat;
using namespace facade;
using namespace util;
using detail::SaveStagesController;
using http::StringResponse;
using strings::HumanReadableNumBytes;

using EngineFunc = void (ServerFamily::*)(CmdArgList args, CommandContext*);

inline CommandId::Handler HandlerFunc(ServerFamily* se, EngineFunc f) {
  return [=](CmdArgList args, CommandContext* cntx) { return (se->*f)(args, cntx); };
}

namespace {

// TODO these should be configurable as command line flag and at runtime via config set
constexpr std::array<double, 3> kLatencyPercentiles = {50.0, 99.0, 99.9};

bool is_histogram_empty(const hdr_histogram* h) {
  return hdr_min(h) == std::numeric_limits<int64_t>::max();
}

const auto kRedisVersion = "7.4.0";

// Captured memory peaks
struct {
  std::atomic<size_t> used = 0;
  std::atomic<size_t> rss = 0;
} glob_memory_peaks;

size_t FetchRssMemory(const io::StatusData& sdata) {
  return sdata.vm_rss + sdata.hugetlb_pages;
}

using CI = CommandId;

struct CmdArgListFormatter {
  void operator()(std::string* out, MutableSlice arg) const {
    out->append(absl::StrCat("`", std::string_view(arg.data(), arg.size()), "`"));
  }
};

string UnknownCmd(string cmd, CmdArgList args) {
  return absl::StrCat("unknown command '", cmd, "' with args beginning with: ",
                      absl::StrJoin(args.begin(), args.end(), ", ", CmdArgListFormatter()));
}

std::shared_ptr<detail::SnapshotStorage> CreateCloudSnapshotStorage(std::string_view uri) {
  if (detail::IsS3Path(uri)) {
#ifdef WITH_AWS
    shard_set->pool()->GetNextProactor()->Await([&] { util::aws::Init(); });
    return std::make_shared<detail::AwsS3SnapshotStorage>(
        absl::GetFlag(FLAGS_s3_endpoint), absl::GetFlag(FLAGS_s3_use_https),
        absl::GetFlag(FLAGS_s3_ec2_metadata), absl::GetFlag(FLAGS_s3_sign_payload));
#else
    LOG(ERROR) << "Compiled without AWS support";
    exit(1);
#endif
  } else if (detail::IsGCSPath(uri)) {
#ifdef WITH_GCP
    auto gcs = std::make_shared<detail::GcsSnapshotStorage>();
    auto ec = shard_set->pool()->GetNextProactor()->Await([&] { return gcs->Init(3000); });
    if (ec) {
      LOG(ERROR) << "Failed to initialize GCS snapshot storage: " << ec.message();
      exit(1);
    }
    return gcs;
#else
    LOG(ERROR) << "Compiled without GCP support";
    exit(1);
#endif
  } else {
    LOG(ERROR) << "Uknown cloud storage " << uri;
    exit(1);
  }
}

// Check that if TLS is used at least one form of client authentication is
// enabled. That means either using a password or giving a root
// certificate for authenticating client certificates which will
// be required.
bool ValidateServerTlsFlags() {
  if (!absl::GetFlag(FLAGS_tls)) {
    return true;
  }

  bool has_auth = false;

  if (!dfly::GetPassword().empty()) {
    has_auth = true;
  }

  if (!(absl::GetFlag(FLAGS_tls_ca_cert_file).empty() &&
        absl::GetFlag(FLAGS_tls_ca_cert_dir).empty())) {
    has_auth = true;
  }

  if (!has_auth) {
    LOG(ERROR) << "TLS configured but no authentication method is used!";
    return false;
  }

  return true;
}

template <typename T> void UpdateMax(T* maxv, T current) {
  *maxv = std::max(*maxv, current);
}

void SetMasterFlagOnAllThreads(bool is_master) {
  auto cb = [is_master](unsigned, auto*) { ServerState::tlocal()->is_master = is_master; };
  shard_set->pool()->AwaitBrief(cb);
}

std::optional<cron::cronexpr> InferSnapshotCronExpr() {
  string save_time = GetFlag(FLAGS_save_schedule);
  auto cron_expr = GetFlag(FLAGS_snapshot_cron);

  if (!save_time.empty()) {
    LOG(ERROR) << "save_schedule flag is deprecated, please use snapshot_cron instead";
    exit(1);
  }

  if (cron_expr.cron_expr) {
    return std::move(cron_expr.cron_expr);
  }

  return std::nullopt;
}

void ClientSetName(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() == 1) {
    cmd_cntx->conn()->SetName(string{ArgS(args, 0)});
    return cmd_cntx->rb()->SendOk();
  }
  return cmd_cntx->SendError(facade::kSyntaxErr);
}

void ClientGetName(CmdArgList args, CommandContext* cmd_cntx) {
  if (!args.empty()) {
    return cmd_cntx->SendError(facade::kSyntaxErr);
  }
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (auto name = cmd_cntx->conn()->GetName(); !name.empty()) {
    return rb->SendBulkString(name);
  } else {
    return rb->SendNull();
  }
}

void ClientInfo(CmdArgList args, CommandContext* cmd_cntx) {
  if (!args.empty()) {
    return cmd_cntx->SendError(facade::kSyntaxErr);
  }
  auto* conn = cmd_cntx->conn();
  string info = conn->GetClientInfo();

  // redis-py (5expects these fields. We append dummy values to keep the output parsable.
  absl::StrAppend(&info, " db=", cmd_cntx->server_conn_cntx()->db_index(), "\r\n");
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  return rb->SendBulkString(info);
}

void ClientList(CmdArgList args, absl::Span<facade::Listener*> listeners,
                CommandContext* cmd_cntx) {
  if (!args.empty()) {
    return cmd_cntx->SendError(facade::kSyntaxErr);
  }

  vector<string> client_info;
  absl::base_internal::SpinLock mu;

  // we can not preempt the connection traversal, so we need to use a spinlock.
  // alternatively we could lock when mutating the connection list, but it seems not important.
  auto cb = [&](unsigned thread_index, util::Connection* conn) {
    facade::Connection* dcon = static_cast<facade::Connection*>(conn);
    string info = dcon->GetClientInfo(thread_index);
    absl::base_internal::SpinLockHolder l(&mu);
    client_info.push_back(std::move(info));
  };

  for (auto* listener : listeners) {
    listener->TraverseConnections(cb);
  }

  string result = absl::StrJoin(client_info, "\n");
  result.append("\n");
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  return rb->SendVerbatimString(result);
}

void ClientTracking(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!rb->IsResp3())
    return cmd_cntx->SendError(
        "Client tracking is currently not supported for RESP2. Please use RESP3.");

  CmdArgParser parser{args};
  if (!parser.HasAtLeast(1) || args.size() > 3)
    return cmd_cntx->SendError(kSyntaxErr);

  bool is_on = false;
  using Tracking = ConnectionState::ClientTracking;
  Tracking::Options option = Tracking::NONE;
  if (parser.Check("ON")) {
    is_on = true;
  } else if (!parser.Check("OFF")) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  bool noloop = false;

  if (parser.HasNext()) {
    if (parser.Check("OPTIN")) {
      option = Tracking::OPTIN;
    } else if (parser.Check("OPTOUT")) {
      option = Tracking::OPTOUT;
    } else if (parser.Check("NOLOOP")) {
      noloop = true;
    } else {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  if (parser.HasNext()) {
    if (!noloop && parser.Check("NOLOOP")) {
      noloop = true;
    } else {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  auto* conn_cntx = cmd_cntx->server_conn_cntx();
  if (is_on) {
    ++conn_cntx->subscriptions;
  }

  conn_cntx->conn_state.tracking_info_.SetClientTracking(is_on);
  conn_cntx->conn_state.tracking_info_.SetOption(option);
  conn_cntx->conn_state.tracking_info_.SetNoLoop(noloop);
  return cmd_cntx->rb()->SendOk();
}

void ClientCaching(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!rb->IsResp3())
    return cmd_cntx->SendError(
        "Client caching is currently not supported for RESP2. Please use RESP3.");

  if (args.size() != 1) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  auto* cntx = cmd_cntx->server_conn_cntx();
  if (!cntx->conn_state.tracking_info_.IsTrackingOn()) {
    return cmd_cntx->SendError(
        "CLIENT CACHING can be called only when the client is in tracking mode with OPTIN or "
        "OPTOUT mode enabled");
  }

  using Tracking = ConnectionState::ClientTracking;
  CmdArgParser parser{args};

  if (parser.Check("YES")) {
    if (!cntx->conn_state.tracking_info_.HasOption(Tracking::OPTIN)) {
      return cmd_cntx->SendError(
          "CLIENT CACHING YES is only valid when tracking is enabled in OPTIN mode");
    }
  } else if (parser.Check("NO")) {
    if (!cntx->conn_state.tracking_info_.HasOption(Tracking::OPTOUT)) {
      return cmd_cntx->SendError(
          "CLIENT CACHING NO is only valid when tracking is enabled in OPTOUT mode");
    }
    cntx->conn_state.tracking_info_.ResetCachingSequenceNumber();
  } else {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  bool is_multi = cmd_cntx->tx() && cmd_cntx->tx()->IsMulti();
  cntx->conn_state.tracking_info_.SetCachingSequenceNumber(is_multi);
  cmd_cntx->rb()->SendOk();
}

void ClientSetInfo(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() != 2) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  auto* conn = cmd_cntx->conn();
  if (conn == nullptr) {
    return cmd_cntx->SendError("No connection");
  }

  string type = absl::AsciiStrToUpper(ArgS(args, 0));
  string_view val = ArgS(args, 1);

  if (type == "LIB-NAME") {
    conn->SetLibName(string(val));
  } else if (type == "LIB-VER") {
    conn->SetLibVersion(string(val));
  } else {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  cmd_cntx->rb()->SendOk();
}

void ClientId(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() != 0) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  return cmd_cntx->rb()->SendLong(cmd_cntx->conn()->GetClientId());
}

void ClientKill(CmdArgList args, absl::Span<facade::Listener*> listeners,
                CommandContext* cmd_cntx) {
  std::function<bool(facade::Connection * conn)> evaluator;

  if (args.size() == 1) {
    string_view ip_port = ArgS(args, 0);
    if (ip_port.find(':') != ip_port.npos) {
      evaluator = [ip_port](facade::Connection* conn) {
        return conn->RemoteEndpointStr() == ip_port;
      };
    }
  } else if (args.size() == 2) {
    string filter_type = absl::AsciiStrToUpper(ArgS(args, 0));
    string_view filter_value = ArgS(args, 1);
    if (filter_type == "ADDR") {
      evaluator = [filter_value](facade::Connection* conn) {
        return conn->RemoteEndpointStr() == filter_value;
      };
    } else if (filter_type == "LADDR") {
      evaluator = [filter_value](facade::Connection* conn) {
        return conn->LocalBindStr() == filter_value;
      };
    } else if (filter_type == "ID") {
      uint32_t id;
      if (absl::SimpleAtoi(filter_value, &id)) {
        evaluator = [id](facade::Connection* conn) { return conn->GetClientId() == id; };
      }
    }
    // TODO: Add support for KILL USER/TYPE/SKIPME
  }

  if (!evaluator) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  const bool is_admin_request = cmd_cntx->conn()->IsPrivileged();

  atomic<uint32_t> killed_connections = 0;
  atomic<uint32_t> kill_errors = 0;

  auto cb = [&](unsigned idx, ProactorBase* p) mutable {
    // Step 1 aggregate the per thread connections from all listeners
    std::vector<facade::Connection::WeakRef> connections;
    auto traverse_cb = [&](unsigned idx, util::Connection* conn) {
      facade::Connection* dconn = static_cast<facade::Connection*>(conn);
      if (evaluator(dconn)) {
        if (is_admin_request || !dconn->IsPrivileged()) {
          connections.push_back(dconn->Borrow());
        } else {
          kill_errors.fetch_add(1);
        }
      }
    };
    for (auto* listener : listeners) {
      listener->TraverseConnectionsOnThread(traverse_cb, UINT32_MAX, nullptr);
    }

    // Step 2 kill the clients
    for (auto& tcon : connections) {
      facade::Connection* conn = tcon.Get();
      if (conn && conn->socket()->proactor()->GetPoolIndex() == p->GetPoolIndex()) {
        conn->ShutdownSelfBlocking();
        killed_connections.fetch_add(1);
      }
    }
  };

  shard_set->pool()->AwaitFiberOnAll(cb);

  if (kill_errors.load() == 0) {
    return cmd_cntx->rb()->SendLong(killed_connections.load());
  } else {
    return cmd_cntx->SendError(absl::StrCat("Killed ", killed_connections.load(),
                                            " client(s), but unable to kill ", kill_errors.load(),
                                            " admin client(s)."));
  }
}

void ClientMigrate(CmdArgList args, absl::Span<facade::Listener*> listeners,
                   CommandContext* cmd_cntx) {
  if (args.size() != 2) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  uint32_t id;
  if (!absl::SimpleAtoi(args[0], &id)) {
    return cmd_cntx->SendError("Invalid client id");
  }

  uint32_t tid = 0;
  if (!absl::SimpleAtoi(args[1], &tid) || tid >= shard_set->pool()->size()) {
    return cmd_cntx->SendError("Invalid thread id");
  }

  unsigned migrated = 0;
  auto cb_brief = [&](unsigned current_tid, ProactorBase* p) {
    if (current_tid == tid) {
      return;  // we should not migrate to the same thread
    }

    auto traverse_cb = [&](unsigned, util::Connection* conn) {
      facade::Connection* dconn = static_cast<facade::Connection*>(conn);
      if (dconn->GetClientId() == id) {
        ++migrated;
        dconn->RequestAsyncMigration(shard_set->pool()->at(tid), true /* force */);
      }
    };

    for (auto* listener : listeners) {
      if (listener->IsPrivilegedInterface())
        continue;  // skip privileged interfaces

      listener->TraverseConnectionsOnThread(traverse_cb, UINT32_MAX, nullptr);
    }
  };

  shard_set->pool()->AwaitBrief(cb_brief);

  return cmd_cntx->rb()->SendLong(migrated);
}

std::string_view GetOSString() {
  // Call uname() only once since it can be expensive. Cache the final result in a static string.
  static string os_string = []() {
    utsname os_name;
    uname(&os_name);
    return StrCat(os_name.sysname, " ", os_name.release, " ", os_name.machine);
  }();

  return os_string;
}

string_view GetRedisMode() {
  return IsClusterEnabledOrEmulated() ? "cluster"sv : "standalone"sv;
}

struct ReplicaOfArgs {
  string host;
  uint16_t port;
  std::optional<cluster::SlotRange> slot_range;
  static nonstd::expected<ReplicaOfArgs, ErrorReply> FromCmdArgs(CmdArgList args);
  bool IsReplicaOfNoOne() const {
    return port == 0;
  }
  friend std::ostream& operator<<(std::ostream& os, const ReplicaOfArgs& args) {
    if (args.IsReplicaOfNoOne()) {
      return os << "NO ONE";
    }
    os << args.host << ":" << args.port;
    if (args.slot_range.has_value()) {
      os << " SLOTS [" << args.slot_range.value().start << "-" << args.slot_range.value().end
         << "]";
    }
    return os;
  }
};

nonstd::expected<ReplicaOfArgs, ErrorReply> ReplicaOfArgs::FromCmdArgs(CmdArgList args) {
  ReplicaOfArgs replicaof_args;
  CmdArgParser parser(args);

  if (parser.Check("NO")) {
    parser.ExpectTag("ONE");
    replicaof_args.port = 0;
  } else {
    replicaof_args.host = parser.Next<string>();
    replicaof_args.port = parser.Next<uint16_t>();
    if (auto err = parser.TakeError(); err || replicaof_args.port < 1) {
      return nonstd::make_unexpected(ErrorReply("port is out of range"));
    }
    if (parser.HasNext()) {
      auto [slot_start, slot_end] = parser.Next<SlotId, SlotId>();
      replicaof_args.slot_range = cluster::SlotRange{slot_start, slot_end};
      if (auto err = parser.TakeError(); err || !replicaof_args.slot_range->IsValid()) {
        return nonstd::make_unexpected(ErrorReply("Invalid slot range"));
      }
    }
  }

  if (auto err = parser.TakeError(); err) {
    return nonstd::make_unexpected(err.MakeReply());
  }
  return replicaof_args;
}

uint64_t GetDelayMs(uint64_t ts) {
  uint64_t now_ns = fb2::ProactorBase::GetMonotonicTimeNs();
  uint64_t delay_ns = 0;
  if (ts < now_ns - 1000000) {  // if more than 1ms has passed between ts and now_ns
    delay_ns = (now_ns - ts) / 1000000;
  }
  return delay_ns;
}

bool ReadProcStats(io::StatusData* sdata) {
#ifdef __linux__
  io::Result<io::StatusData> sdata_res = io::ReadStatusInfo();
  if (!sdata_res) {
    LOG_FIRST_N(ERROR, 10) << "Error fetching /proc/self/status stats. error "
                           << sdata_res.error().message();
    return false;
  }

  size_t total_rss = FetchRssMemory(*sdata_res);
  rss_mem_current.store(total_rss, memory_order_relaxed);
  if (total_rss > glob_memory_peaks.rss.load(memory_order_relaxed))
    glob_memory_peaks.rss.store(total_rss, memory_order_relaxed);

  *sdata = *sdata_res;
  return true;
#else
  return false;
#endif
}

// Rewrite the configuration file with runtime modified settings
GenericError RewriteConfigFile() {
  absl::CommandLineFlag* flagfile_flag = absl::FindCommandLineFlag("flagfile");
  if (!flagfile_flag || flagfile_flag->CurrentValue().empty()) {
    return GenericError("The server is running without a config file");
  }

  std::string config_file_path = flagfile_flag->CurrentValue();

  // Read original config file
  std::ifstream file(config_file_path);
  if (!file.is_open()) {
    return GenericError("Cannot read config file");
  }

  std::string original_content;
  std::string line;
  std::unordered_set<std::string> existing_flags;
  std::vector<std::string> updated_lines;
  bool in_generated_section = false;
  bool had_generated_section = false;

  // Get only runtime modified flag values (not startup config)
  std::unordered_map<std::string, std::string> current_flags;
  auto all_flags = absl::GetAllFlags();
  for (const auto& [flag_name, flag_ptr] : all_flags) {
    // Only include flags that were modified at runtime via CONFIG SET
    // We exclude 'flagfile' and other startup-only configs
    if (flag_ptr->CurrentValue() != flag_ptr->DefaultValue() && flag_name != "flagfile") {
      // Additional check: only include if the config is known to ConfigRegistry
      // This ensures we only write configs that can be modified at runtime
      auto config_names = config_registry.List(flag_name);
      if (!config_names.empty()) {
        current_flags[std::string(flag_name)] = flag_ptr->CurrentValue();
      }
    }
  }

  // Process original file line by line
  while (std::getline(file, line)) {
    std::string trimmed = line;
    trimmed.erase(0, trimmed.find_first_not_of(" \t"));

    // Skip generated section from previous rewrites
    if (trimmed == "# Generated by CONFIG REWRITE") {
      in_generated_section = true;
      had_generated_section = true;
      break;
    }

    if (!in_generated_section) {
      // Check if this line is a flag definition
      if (!trimmed.empty() && trimmed[0] == '-' && trimmed[1] == '-') {
        size_t eq_pos = trimmed.find('=');
        if (eq_pos != std::string::npos) {
          std::string flag_name = trimmed.substr(2, eq_pos - 2);
          if (current_flags.count(flag_name)) {
            // Update existing flag with current value
            updated_lines.push_back(absl::StrCat("--", flag_name, "=", current_flags[flag_name]));
            existing_flags.insert(flag_name);
          } else {
            // Keep original line if flag is not in current active flags
            updated_lines.push_back(line);
          }
        } else {
          // Keep original line as-is
          updated_lines.push_back(line);
        }
      } else {
        // Keep comments and other lines as-is
        updated_lines.push_back(line);
      }
    }
  }
  file.close();

  // Collect new flags that weren't in original config
  std::vector<std::string> new_flags;
  for (const auto& [flag_name, flag_value] : current_flags) {
    if (existing_flags.find(flag_name) == existing_flags.end()) {
      new_flags.push_back(absl::StrCat("--", flag_name, "=", flag_value));
    }
  }

  // Build final content
  std::string final_content;
  for (const auto& line : updated_lines) {
    final_content += line + "\n";
  }

  // Add new flags section if there are any
  if (!new_flags.empty()) {
    if (!final_content.empty() && final_content.back() != '\n') {
      final_content += "\n";
    }
    // Only add extra spacing if this is the first time adding generated section
    if (!had_generated_section) {
      final_content += "\n# Generated by CONFIG REWRITE\n";
    } else {
      final_content += "# Generated by CONFIG REWRITE\n";
    }
    for (const auto& new_flag : new_flags) {
      final_content += new_flag + "\n";
    }
  }

  // Atomic write using mkstemp + rename
  std::string tmp_template = config_file_path + ".tmpXXXXXX";
  int fd = mkstemp(tmp_template.data());
  if (fd == -1) {
    return GenericError("Failed to create temporary file");
  }

  size_t off = 0;
  while (off < final_content.size()) {
    ssize_t n = write(fd, final_content.c_str() + off, final_content.size() - off);
    if (n <= 0) {
      close(fd);
      unlink(tmp_template.data());
      return GenericError("Failed to write config file");
    }
    off += n;
  }

  fsync(fd);
  fchmod(fd, 0644);
  close(fd);

  if (rename(tmp_template.data(), config_file_path.c_str()) == -1) {
    unlink(tmp_template.data());
    return GenericError("Failed to rewrite config file");
  }

  return {};
}

bool IsMaster() {
  // We call this function on startup where tlocal() == nullptr. We handle
  // this case below.
  if (!ServerState::tlocal()) {
    return true;
  }
  return ServerState::tlocal()->is_master;
}

}  // namespace

void SlowLogGet(dfly::CmdArgList args, std::string_view sub_cmd, util::ProactorPool* pp,
                CommandContext* cmd_cntx) {
  size_t requested_slow_log_length = UINT32_MAX;
  size_t argc = args.size();
  if (argc >= 3) {
    return cmd_cntx->SendError(facade::UnknownSubCmd(sub_cmd, "SLOWLOG"), facade::kSyntaxErrType);
  } else if (argc == 2) {
    string_view length = facade::ArgS(args, 1);
    int64_t num;
    if ((!absl::SimpleAtoi(length, &num)) || (num < -1)) {
      return cmd_cntx->SendError("count should be greater than or equal to -1",
                                 facade::kSyntaxErrType);
    }
    if (num >= 0) {
      requested_slow_log_length = num;
    }
  }

  // gather all the individual slowlogs from all the fibers and sort them by their timestamp
  std::vector<boost::circular_buffer<SlowLogEntry>> entries(pp->size());
  pp->AwaitFiberOnAll([&](auto index, auto* context) {
    auto shard_entries = ServerState::tlocal()->GetSlowLog().Entries();
    entries[index] = shard_entries;
  });

  std::vector<std::pair<SlowLogEntry, unsigned>> merged_slow_log;
  for (size_t i = 0; i < entries.size(); ++i) {
    for (const auto& log_item : entries[i]) {
      merged_slow_log.emplace_back(log_item, i);
    }
  }

  std::sort(merged_slow_log.begin(), merged_slow_log.end(), [](const auto& e1, const auto& e2) {
    return e1.first.unix_ts_usec > e2.first.unix_ts_usec;
  });

  requested_slow_log_length = std::min(merged_slow_log.size(), requested_slow_log_length);

  auto* rb = static_cast<facade::RedisReplyBuilder*>(cmd_cntx->rb());
  rb->StartArray(requested_slow_log_length);
  for (size_t i = 0; i < requested_slow_log_length; ++i) {
    const auto& entry = merged_slow_log[i].first;
    const auto& args = entry.cmd_args;

    rb->StartArray(6);

    rb->SendLong(entry.entry_id * pp->size() + merged_slow_log[i].second);
    rb->SendLong(entry.unix_ts_usec / 1000000);
    rb->SendLong(entry.exec_time_usec);

    // if we truncated the args, there is one pseudo-element containing the number of truncated
    // args that we must add, so the result length is increased by 1
    size_t len = args.size() + int(args.size() < entry.original_length);

    rb->StartArray(len);

    for (const auto& arg : args) {
      if (arg.second > 0) {
        auto suffix = absl::StrCat("... (", arg.second, " more bytes)");
        auto cmd_arg = arg.first.substr(0, kMaximumSlowlogArgLength - suffix.length());
        rb->SendBulkString(absl::StrCat(cmd_arg, suffix));
      } else {
        rb->SendBulkString(arg.first);
      }
    }
    // if we truncated arguments - add a special string to indicate that.
    if (args.size() < entry.original_length) {
      rb->SendBulkString(
          absl::StrCat("... (", entry.original_length - args.size(), " more arguments)"));
    }

    rb->SendBulkString(entry.client_ip);
    rb->SendBulkString(entry.client_name);
  }
}

std::optional<fb2::Fiber> Pause(std::vector<facade::Listener*> listeners, Namespace* ns,
                                facade::Connection* conn, ClientPause pause_state,
                                std::function<bool()> is_pause_in_progress,
                                std::function<void()> maybe_cleanup) {
  // Track connections and set pause state to be able to wait untill all running transactions read
  // the new pause state. Exlude already paused commands from the busy count. Exlude tracking
  // blocked connections because: a) If the connection is blocked it is puased. b) We read pause
  // state after waking from blocking so if the trasaction was waken by another running
  //    command that did not pause on the new state yet we will pause after waking up.
  DispatchTracker tracker{listeners, conn, true /* ignore paused commands */,
                          true /*ignore blocking*/};
  shard_set->pool()->AwaitFiberOnAll([&tracker, pause_state](unsigned, util::ProactorBase*) {
    // Commands don't suspend before checking the pause state, so
    // it's impossible to deadlock on waiting for a command that will be paused.
    tracker.TrackOnThread();
    ServerState::tlocal()->SetPauseState(pause_state, true);
  });

  // Wait for all busy commands to finish running before replying to guarantee
  // that no more (write) operations will occur.
  const absl::Duration kDispatchTimeout = absl::Seconds(absl::GetFlag(FLAGS_pause_wait_timeout));
  if (!tracker.Wait(kDispatchTimeout)) {
    LOG(WARNING) << "Couldn't wait for commands to finish dispatching in " << kDispatchTimeout;
    shard_set->pool()->AwaitBrief([pause_state](unsigned, util::ProactorBase*) {
      ServerState::tlocal()->SetPauseState(pause_state, false);
    });
    return std::nullopt;
  }

  // We should not expire/evict keys while clients are paused.
  shard_set->RunBriefInParallel(
      [ns](EngineShard* shard) { ns->GetDbSlice(shard->shard_id()).SetExpireAllowed(false); });

  return fb2::Fiber("client_pause",
                    [is_pause_in_progress, pause_state, ns, maybe_cleanup]() mutable {
                      // On server shutdown we sleep 10ms to make sure all running task finish,
                      // therefore 10ms steps ensure this fiber will not left hanging .
                      constexpr auto step = 10ms;
                      while (is_pause_in_progress()) {
                        ThisFiber::SleepFor(step);
                      }

                      ServerState& etl = *ServerState::tlocal();
                      if (etl.gstate() != GlobalState::SHUTTING_DOWN) {
                        shard_set->pool()->AwaitFiberOnAll([pause_state](util::ProactorBase* pb) {
                          ServerState::tlocal()->SetPauseState(pause_state, false);
                        });
                        shard_set->RunBriefInParallel([ns](EngineShard* shard) {
                          ns->GetDbSlice(shard->shard_id()).SetExpireAllowed(true);
                        });
                      }
                      if (maybe_cleanup) {
                        maybe_cleanup();
                      }
                    });
}

ServerFamily::ServerFamily(Service* service) : service_(*service) {
  start_time_ = time(NULL);
  thread_safe_save_info_.Update([this](SaveInfoData* data) { data->save_time = start_time_; });
  script_mgr_.reset(new ScriptMgr());

  {
    absl::InsecureBitGen eng;
    master_replid_ = GetRandomHex(eng, CONFIG_RUN_ID_SIZE);
    DCHECK_EQ(CONFIG_RUN_ID_SIZE, master_replid_.size());
  }

  if (auto ec =
          detail::ValidateFilename(GetFlag(FLAGS_dbfilename), GetFlag(FLAGS_df_snapshot_format));
      ec) {
    LOG(ERROR) << ec.Format();
    exit(1);
  }

  if (!ValidateServerTlsFlags()) {
    exit(1);
  }
  ValidateClientTlsFlags();
  dfly_cmd_ = make_unique<DflyCmd>(this);
  legacy_format_metrics_ = GetFlag(FLAGS_keep_legacy_memory_metrics);
}

ServerFamily::~ServerFamily() {
}

void SetMaxClients(std::vector<facade::Listener*>& listeners, uint32_t maxclients) {
  for (auto* listener : listeners) {
    if (!listener->IsPrivilegedInterface()) {
      listener->socket()->proactor()->Await(
          [listener, maxclients]() { listener->SetMaxClients(maxclients); });
    }
  }
}

void SetSlowLogMaxLen(util::ProactorPool& pool, uint32_t val) {
  pool.AwaitFiberOnAll(
      [&val](auto index, auto* context) { ServerState::tlocal()->GetSlowLog().ChangeLength(val); });
}

void SetSlowLogThreshold(util::ProactorPool& pool, int32_t val) {
  pool.AwaitFiberOnAll([val](auto index, auto* context) {
    ServerState::tlocal()->log_slower_than_usec = val < 0 ? UINT32_MAX : uint32_t(val);
  });
}

void ServerFamily::Init(util::AcceptServer* acceptor, std::vector<facade::Listener*> listeners) {
  CHECK(acceptor_ == nullptr);
  acceptor_ = acceptor;
  listeners_ = std::move(listeners);

  auto os_string = GetOSString();
  LOG_FIRST_N(INFO, 1) << "Host OS: " << os_string << " with " << shard_set->pool()->size()
                       << " threads";
  SetMaxClients(listeners_, absl::GetFlag(FLAGS_maxclients));
  config_registry.RegisterSetter<uint32_t>(
      "maxclients", [this](uint32_t val) { SetMaxClients(listeners_, val); });

  SetSlowLogThreshold(service_.proactor_pool(), absl::GetFlag(FLAGS_slowlog_log_slower_than));
  config_registry.RegisterMutable("slowlog_log_slower_than",
                                  [this](const absl::CommandLineFlag& flag) {
                                    auto res = flag.TryGet<int32_t>();
                                    if (res.has_value())
                                      SetSlowLogThreshold(service_.proactor_pool(), res.value());
                                    return res.has_value();
                                  });
  SetSlowLogMaxLen(service_.proactor_pool(), absl::GetFlag(FLAGS_slowlog_max_len));
  config_registry.RegisterSetter<uint32_t>(
      "slowlog_max_len", [this](uint32_t val) { SetSlowLogMaxLen(service_.proactor_pool(), val); });

  // We only reconfigure TLS when the 'tls' config key changes. Therefore to
  // update TLS certs, first update tls_cert_file, then set 'tls true'.
  config_registry.RegisterMutable("tls", [this](const absl::CommandLineFlag& flag) {
    if (!ValidateServerTlsFlags()) {
      return false;
    }
    for (facade::Listener* l : listeners_) {
      // Must reconfigure in the listener proactor to avoid a race.
      if (!l->socket()->proactor()->Await([l] { return l->ReconfigureTLS(); })) {
        return false;
      }
    }
    return true;
  });
  config_registry.RegisterMutable("tls_cert_file");
  config_registry.RegisterMutable("tls_key_file");
  config_registry.RegisterMutable("tls_ca_cert_file");
  config_registry.RegisterMutable("tls_ca_cert_dir");
  config_registry.RegisterMutable("replica_priority");
  config_registry.RegisterMutable("lua_undeclared_keys_shas");
  config_registry.RegisterMutable("lua_float_as_int_shas");
  config_registry.RegisterMutable("point_in_time_snapshot");

  pb_task_ = shard_set->pool()->GetNextProactor();
  if (pb_task_->GetKind() == ProactorBase::EPOLL) {
    fq_threadpool_.reset(new fb2::FiberQueueThreadPool(absl::GetFlag(FLAGS_epoll_file_threads)));
  }

  string flag_dir = GetFlag(FLAGS_dir);

  if (detail::IsCloudPath(flag_dir)) {
    snapshot_storage_ = CreateCloudSnapshotStorage(flag_dir);
  } else if (fq_threadpool_) {
    snapshot_storage_ = std::make_shared<detail::FileSnapshotStorage>(fq_threadpool_.get());
  } else {
    snapshot_storage_ = std::make_shared<detail::FileSnapshotStorage>(nullptr);
  }

  // check for '--replicaof' before loading anything
  if (ReplicaOfFlag flag = GetFlag(FLAGS_replicaof); flag.has_value()) {
    service_.proactor_pool().GetNextProactor()->Await(
        [this, &flag]() { this->Replicate(flag.host, flag.port); });
  } else {  // load from snapshot only if --replicaof is empty
    LoadFromSnapshot();
  }

  const auto create_snapshot_schedule_fb = [this] {
    snapshot_schedule_fb_ =
        service_.proactor_pool().GetNextProactor()->LaunchFiber([this] { SnapshotScheduling(); });
  };
  config_registry.RegisterMutable(
      "snapshot_cron", [this, create_snapshot_schedule_fb](const absl::CommandLineFlag& flag) {
        JoinSnapshotSchedule();
        create_snapshot_schedule_fb();
        return true;
      });
  create_snapshot_schedule_fb();
}

void ServerFamily::LoadFromSnapshot() {
  {
    util::fb2::LockGuard lk{loading_stats_mu_};
    loading_stats_.restore_count++;
  }

  const auto load_path_result =
      snapshot_storage_->LoadPath(GetFlag(FLAGS_dir), GetFlag(FLAGS_dbfilename));

  if (load_path_result) {
    const std::string& load_path = *load_path_result;
    if (!load_path.empty()) {
      auto future = Load(load_path, LoadExistingKeys::kFail);
      load_fiber_ = service_.proactor_pool().GetNextProactor()->LaunchFiber([future]() mutable {
        // Wait for load to finish in a dedicated fiber.
        // Failure to load on start causes Dragonfly to exit with an error code.
        if (!future.has_value() || future->Get()) {
          // Error was already printed to log at this point.
          exit(1);
        }
      });
    }
  } else {
    if (std::error_code(load_path_result.error()) == std::errc::no_such_file_or_directory) {
      LOG(WARNING) << "Load snapshot: No snapshot found";
    } else {
      loading_stats_mu_.lock();
      loading_stats_.failed_restore_count++;
      loading_stats_mu_.unlock();
      LOG(ERROR) << "Failed to load snapshot with error: " << load_path_result.error().Format();
      exit(1);
    }
  }
}

void ServerFamily::JoinSnapshotSchedule() {
  schedule_done_.Notify();
  snapshot_schedule_fb_.JoinIfNeeded();
  schedule_done_.Reset();
}

void ServerFamily::Shutdown() {
  VLOG(1) << "ServerFamily::Shutdown";

  load_fiber_.JoinIfNeeded();

  JoinSnapshotSchedule();

  bg_save_fb_.JoinIfNeeded();

  if (save_on_shutdown_ && !absl::GetFlag(FLAGS_dbfilename).empty()) {
    shard_set->pool()->GetNextProactor()->Await([this]() ABSL_LOCKS_EXCLUDED(loading_stats_mu_) {
      GenericError ec = DoSave();

      util::fb2::LockGuard lk{loading_stats_mu_};
      loading_stats_.backup_count++;

      if (ec) {
        loading_stats_.failed_backup_count++;
        LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
      }
    });
  }

  client_pause_ec_.await([this] { return active_pauses_.load() == 0; });

  pb_task_->Await([this] {
    auto ec = journal::Close();
    LOG_IF(ERROR, ec) << "Error closing journal " << ec;

    util::fb2::LockGuard lk(replicaof_mu_);
    if (replica_) {
      replica_->Stop();
    }
    StopAllClusterReplicas();

    dfly_cmd_->Shutdown();
    DebugCmd::Shutdown();
#ifdef WITH_SEARCH
    SearchFamily::Shutdown();
#endif
  });
}

bool ServerFamily::HasPrivilegedInterface() {
  return any_of(listeners_.begin(), listeners_.end(),
                [](auto* l) { return l->IsPrivilegedInterface(); });
}

void ServerFamily::UpdateMemoryGlobalStats() {
  // Called from all shards, but one updates global stats below
  if (EngineShard::tlocal()->shard_id() > 0)
    return;

  // Update used memory peak
  uint64_t mem_current = used_mem_current.load(std::memory_order_relaxed);
  if (mem_current > glob_memory_peaks.used.load(memory_order_relaxed))
    glob_memory_peaks.used.store(mem_current, memory_order_relaxed);

  io::StatusData status_data;
  bool success = ReadProcStats(&status_data);  // updates glob_memory_peaks.rss
  if (!success)
    return;

  size_t total_rss = FetchRssMemory(status_data);

  // Decide on stopping or accepting new connections based on oom deny ratio
  double rss_oom_deny_ratio = ServerState::tlocal()->rss_oom_deny_ratio;
  if (rss_oom_deny_ratio > 0) {
    size_t memory_limit = max_memory_limit.load(memory_order_relaxed) * rss_oom_deny_ratio;
    if (total_rss > memory_limit && accepting_connections_ && HasPrivilegedInterface()) {
      LOG_EVERY_T(WARNING, 10)
          << "Accepting connections stopped, used memory over limit: total_rss " << total_rss
          << " > memory_limit " << memory_limit;
      ChangeConnectionAccept(false);
    } else if (total_rss < memory_limit && !accepting_connections_) {
      LOG_EVERY_T(INFO, 10) << "Accepting connections again, used memory below limit";
      ChangeConnectionAccept(true);
    }
  }
}

struct AggregateLoadResult {
  AggregateError first_error;
  std::atomic<size_t> keys_read;
};

void ServerFamily::FlushAll(Namespace* ns) {
  const CommandId* cid = service_.FindCmd("FLUSHALL");
  boost::intrusive_ptr<Transaction> flush_trans(new Transaction{cid});
  flush_trans->InitByArgs(ns, 0, {});
  VLOG(1) << "Performing flush";
  Drakarys(flush_trans.get(), DbSlice::kDbAll, false);
}

// Load starts as many fibers as there are files to load each one separately.
// It starts one more fiber that waits for all load fibers to finish and returns the first
// error (if any occured) with a future.
std::optional<fb2::Future<GenericError>> ServerFamily::Load(const std::string& path,
                                                            LoadExistingKeys existing_keys) {
  DCHECK(!path.empty());
  DCHECK_GT(shard_count(), 0u);

  // TODO: to move it to helio.
  auto immediate = [](auto val) {
    fb2::Future<GenericError> future;
    future.Resolve(std::move(val));
    return future;
  };

  if (!IsMaster()) {
    return immediate(string("Replica cannot load data"));
  }

  auto expand_result = snapshot_storage_->ExpandSnapshot(path);
  if (!expand_result) {
    LOG(ERROR) << "Failed to load snapshot: " << expand_result.error().Format();

    return immediate(expand_result.error());
  }

  auto prev_state = service_.SwitchState(GlobalState::ACTIVE, GlobalState::LOADING);
  if (prev_state != GlobalState::ACTIVE) {
    LOG(WARNING) << prev_state << " in progress, ignored";
    return {};
  }

  // Reset state on error
  absl::Cleanup reset_state{
      [this]() { service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE); }};

  auto& pool = service_.proactor_pool();

  const vector<string>& paths = *expand_result;

  LOG(INFO) << "Loading " << path;

  vector<fb2::Fiber> load_fibers;
  load_fibers.reserve(paths.size());

  LoadOptions load_opts;
  auto load_context = std::make_unique<RdbLoadContext>();
  if (absl::EndsWith(path, "summary.dfs")) {
    // we read summary first to get snapshot_id and load data correctly
    error_code load_ec = pool.GetNextProactor()->Await(
        [&] { return LoadRdb(path, existing_keys, &load_opts, load_context.get()); });
    if (load_ec)
      return immediate(load_ec);
  }

  auto aggregated_result = std::make_shared<AggregateLoadResult>();

  for (const auto& file : paths) {
    // we have already read summary so we skip it now
    if (absl::EndsWith(file, "summary.dfs"))
      continue;

    // For single file, choose thread that does not handle shards if possible.
    // This will balance out the CPU during the load.
    ProactorBase* proactor;
    if (paths.size() == 1 && shard_count() < pool.size()) {
      proactor = pool.at(shard_count());
    } else {
      proactor = pool.GetNextProactor();
    }

    auto load_func = [file, existing_keys, load_opts, aggregated_result,
                      load_context = load_context.get(), this]() mutable {
      error_code load_ec = LoadRdb(file, existing_keys, &load_opts, load_context);
      if (load_ec) {
        aggregated_result->first_error = load_ec;
      } else {
        aggregated_result->keys_read.fetch_add(load_opts.num_loaded_keys, memory_order_relaxed);
      }
    };
    load_fibers.push_back(proactor->LaunchFiber(std::move(load_func)));
  }

  fb2::Future<GenericError> future;

  // Run fiber that empties the channel and sets ec_promise.
  auto load_join_func = [this, aggregated_result, load_fibers = std::move(load_fibers),
                         load_context = std::move(load_context), future]() mutable {
    for (auto& fiber : load_fibers) {
      fiber.Join();
    }

    if (aggregated_result->first_error) {
      load_context->PerformPostLoad(&service_, true);
      LOG(ERROR) << "Rdb load failed: " << (*aggregated_result->first_error).message();
    } else {
      load_context->PerformPostLoad(&service_);
      LOG(INFO) << "Load finished, num keys read: " << aggregated_result->keys_read;
    }

    service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
    future.Resolve(*(aggregated_result->first_error));
  };
  pool.GetNextProactor()->Dispatch(std::move(load_join_func));

  std::move(reset_state).Cancel();  // load_join_func resets state after loading
  return future;
}

void ServerFamily::SnapshotScheduling() {
  const std::optional<cron::cronexpr> cron_expr = InferSnapshotCronExpr();
  if (!cron_expr) {
    return;
  }

  ServerState* ss = ServerState::tlocal();
  do {
    if (schedule_done_.WaitFor(100ms)) {
      return;
    }
  } while (ss->gstate() == GlobalState::LOADING);

  while (true) {
    const std::chrono::time_point now = std::chrono::system_clock::now();
    const std::chrono::time_point next = cron::cron_next(cron_expr.value(), now);

    if (schedule_done_.WaitFor(next - now)) {
      break;
    };

    GenericError ec = DoSave();

    util::fb2::LockGuard lk{loading_stats_mu_};
    loading_stats_.backup_count++;

    if (ec) {
      loading_stats_.failed_backup_count++;
      LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
    }
  }
}

std::error_code ServerFamily::LoadRdb(const std::string& rdb_file, LoadExistingKeys existing_keys,
                                      LoadOptions* load_opts, RdbLoadContext* load_context) {
  DCHECK(load_opts);
  VLOG(1) << "Loading data from " << rdb_file;
  CHECK(fb2::ProactorBase::IsProactorThread()) << "must be called from proactor thread";

  const std::string& filt_snapshot_id = load_opts->snapshot_id;

  ProactorBase* proactor = fb2::ProactorBase::me();
  error_code result;
  auto fb = proactor->LaunchFiber([&] {
    io::ReadonlyFileOrError res = snapshot_storage_->OpenReadFile(rdb_file);
    if (!res) {
      result = res.error();
      return;
    }

    io::FileSource fs(*res);

    RdbLoader loader{&service_, load_context, filt_snapshot_id};
    loader.SetShardCount(load_opts->shard_count);
    if (existing_keys == LoadExistingKeys::kOverride) {
      loader.SetOverrideExistingKeys(true);
    }

    auto ec = loader.Load(&fs);
    if (ec) {
      // We ignore incorrect_snapshot_id, it means we try to load file from incorrect snapshot.
      if (ec.value() != rdb::errc::incorrect_snapshot_id)
        result = ec;
    } else {
      VLOG(1) << "Done loading RDB from " << rdb_file << ", keys loaded: " << loader.keys_loaded();
      VLOG(1) << "Loading finished after " << strings::HumanReadableElapsedTime(loader.load_time());
      load_opts->num_loaded_keys = loader.keys_loaded();
      load_opts->snapshot_id = loader.GetSnapshotId();
      load_opts->shard_count = loader.shard_count();
    }
  });

  fb.Join();
  return result;
}

enum class MetricType : uint8_t { COUNTER, GAUGE, SUMMARY, HISTOGRAM };

const char* MetricTypeName(MetricType type) {
  switch (type) {
    case MetricType::COUNTER:
      return "counter";
    case MetricType::GAUGE:
      return "gauge";
    case MetricType::SUMMARY:
      return "summary";
    case MetricType::HISTOGRAM:
      return "histogram";
  }
  return "unknown";
}

inline string GetMetricFullName(string_view metric_name) {
  return StrCat("dragonfly_", metric_name);
}

void AppendMetricHeader(string_view metric_name, string_view metric_help, MetricType type,
                        string* dest) {
  const auto full_metric_name = GetMetricFullName(metric_name);
  absl::StrAppend(dest, "# HELP ", full_metric_name, " ", metric_help, "\n");
  absl::StrAppend(dest, "# TYPE ", full_metric_name, " ", MetricTypeName(type), "\n");
}

void AppendLabelTupple(absl::Span<const string_view> label_names,
                       absl::Span<const string_view> label_values, string* dest) {
  if (label_names.empty())
    return;

  absl::StrAppend(dest, "{");
  for (size_t i = 0; i < label_names.size(); ++i) {
    if (i > 0) {
      absl::StrAppend(dest, ", ");
    }
    absl::StrAppend(dest, label_names[i], "=\"", label_values[i], "\"");
  }

  absl::StrAppend(dest, "}");
}

void AppendMetricValue(string_view metric_name, const absl::AlphaNum& value,
                       absl::Span<const string_view> label_names,
                       absl::Span<const string_view> label_values, string* dest) {
  absl::StrAppend(dest, GetMetricFullName(metric_name));
  AppendLabelTupple(label_names, label_values, dest);
  absl::StrAppend(dest, " ", value, "\n");
}

void AppendMetricWithoutLabels(string_view name, string_view help, const absl::AlphaNum& value,
                               MetricType type, string* dest) {
  AppendMetricHeader(name, help, type, dest);
  AppendMetricValue(name, value, {}, {}, dest);
}

void AppendPipelineLatencySummary(string_view name, string_view help, const base::Histogram& hist,
                                  uint64_t total_count, double total_sum_usec, string* dest) {
  AppendMetricHeader(name, help, MetricType::SUMMARY, dest);
  const string full_name = GetMetricFullName(name);
  if (hist.count() > 0) {
    auto [p95, p99] = hist.Percentiles(95, 99);
    AppendMetricValue(name, p95 * 1e-6, {"quantile"}, {"0.95"}, dest);
    AppendMetricValue(name, p99 * 1e-6, {"quantile"}, {"0.99"}, dest);
  }
  // Use monotonically increasing counters for _sum/_count so that Prometheus
  // rate()/irate() functions work correctly even though the histogram is decayed.
  absl::StrAppend(dest, full_name, "_sum ", total_sum_usec * 1e-6, "\n");
  absl::StrAppend(dest, full_name, "_count ", total_count, "\n");
}

void PrintPrometheusMetrics(uint64_t uptime, const Metrics& m, DflyCmd* dfly_cmd,
                            StringResponse* resp, bool legacy) {
  // Server metrics
  AppendMetricHeader("version", "", MetricType::GAUGE, &resp->body());
  AppendMetricValue("version", 1, {"version"}, {GetVersion()}, &resp->body());

  AppendMetricWithoutLabels("master", "1 if master 0 if replica", IsMaster() ? 1 : 0,
                            MetricType::GAUGE, &resp->body());
  AppendMetricWithoutLabels("uptime_in_seconds", "", uptime, MetricType::COUNTER, &resp->body());

  // Clients metrics
  const auto& conn_stats = m.facade_stats.conn_stats;
  AppendMetricWithoutLabels("max_clients", "Maximal number of clients", GetFlag(FLAGS_maxclients),
                            MetricType::GAUGE, &resp->body());
  AppendMetricHeader("connected_clients", "", MetricType::GAUGE, &resp->body());
  AppendMetricValue("connected_clients", conn_stats.num_conns_main, {"listener"}, {"main"},
                    &resp->body());
  AppendMetricValue("connected_clients", conn_stats.num_conns_other, {"listener"}, {"other"},
                    &resp->body());
  AppendMetricWithoutLabels("blocked_clients", "", conn_stats.num_blocked_clients,
                            MetricType::GAUGE, &resp->body());
  AppendMetricWithoutLabels("pipeline_queue_length", "", conn_stats.pipeline_queue_entries,
                            MetricType::GAUGE, &resp->body());
  AppendMetricWithoutLabels("send_delay_seconds", "",
                            double(GetDelayMs(m.oldest_pending_send_ts)) / 1000.0,
                            MetricType::GAUGE, &resp->body());

  AppendMetricWithoutLabels("pipeline_throttle_total", "", conn_stats.pipeline_throttle_count,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("pipeline_commands_total", "", conn_stats.pipelined_cmd_cnt,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("pipeline_dispatch_calls_total", "", conn_stats.pipeline_dispatch_calls,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("pipeline_dispatch_commands_total", "",
                            conn_stats.pipeline_dispatch_commands, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("pipeline_dispatch_skip_flush_total", "",
                            conn_stats.skip_pipeline_flushing, MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("pipeline_dispatch_flush_duration_seconds", "",
                            conn_stats.pipeline_dispatch_flush_usec * 1e-6, MetricType::COUNTER,
                            &resp->body());

  AppendMetricWithoutLabels("pipeline_commands_duration_seconds", "",
                            conn_stats.pipelined_cmd_latency * 1e-6, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("pipeline_queue_wait_duration_seconds", "",
                            conn_stats.pipelined_wait_latency * 1e-6, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("pipeline_blocking_commands_total", "",
                            m.coordinator_stats.blocking_commands_in_pipelines, MetricType::COUNTER,
                            &resp->body());

  // pipelined_cmd_cnt/pipelined_cmd_latency are monotonically increasing counters used for
  // Prometheus _count/_sum; the histogram is decayed and therefore not monotonic.
  AppendPipelineLatencySummary("pipeline_latency_seconds", "Pipeline command latency distribution",
                               conn_stats.pipelined_latency_hist, conn_stats.pipelined_cmd_cnt,
                               conn_stats.pipelined_cmd_latency, &resp->body());

  AppendMetricWithoutLabels("cmd_squash_stats_ignored_total", "",
                            m.coordinator_stats.squash_stats_ignored, MetricType::COUNTER,
                            &resp->body());

  AppendMetricWithoutLabels("cmd_squash_hop_total", "", m.coordinator_stats.multi_squash_hops,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("cmd_squash_commands_total", "", m.coordinator_stats.squashed_commands,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("cmd_squash_hop_duration_seconds", "",
                            m.coordinator_stats.multi_squash_exec_hop_usec * 1e-6,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("cmd_squash_hop_reply_seconds", "",
                            m.coordinator_stats.multi_squash_exec_reply_usec * 1e-6,
                            MetricType::COUNTER, &resp->body());

  string connections_libs;
  AppendMetricHeader("connections_libs", "Total number of connections by libname:ver",
                     MetricType::GAUGE, &connections_libs);
  for (const auto& [lib, count] : m.connections_lib_name_ver_map) {
    AppendMetricValue("connections_libs", count, {"lib"}, {lib}, &connections_libs);
  }
  absl::StrAppend(&resp->body(), connections_libs);

  // Memory metrics
  io::StatusData sdata;
  bool success = ReadProcStats(&sdata);
  AppendMetricWithoutLabels("memory_used_bytes", "", m.heap_used_bytes, MetricType::GAUGE,
                            &resp->body());
  AppendMetricWithoutLabels("memory_used_peak_bytes", "", m.used_mem_peak, MetricType::GAUGE,
                            &resp->body());
  AppendMetricWithoutLabels("fibers_count", "", m.worker_fiber_count, MetricType::GAUGE,
                            &resp->body());
  AppendMetricWithoutLabels("blocked_tasks", "", m.blocked_tasks, MetricType::GAUGE, &resp->body());

  AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit.load(memory_order_relaxed),
                            MetricType::GAUGE, &resp->body());

  if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) {
    AppendMetricHeader("oom_errors_total", "Rejected requests due to out of memory errors",
                       MetricType::COUNTER, &resp->body());
    AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"},
                      &resp->body());
    AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"},
                      &resp->body());
  }
  if (success) {
    size_t rss = FetchRssMemory(sdata);
    AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body());
    AppendMetricWithoutLabels("swap_memory_bytes", "", sdata.vm_swap, MetricType::GAUGE,
                              &resp->body());
  }

  DbStats total;
  for (const auto& db_stats : m.db_stats) {
    total += db_stats;
  }

  {
    string type_used_memory_metric;
    bool added = false;
    AppendMetricHeader("type_used_memory", "Memory used per type", MetricType::GAUGE,
                       &type_used_memory_metric);

    for (unsigned type = 0; type < total.memory_usage_by_type.size(); type++) {
      size_t mem = total.memory_usage_by_type[type];
      if (mem > 0) {
        AppendMetricValue("type_used_memory", mem, {"type"}, {ObjTypeToString(type)},
                          &type_used_memory_metric);
        added = true;
      }
    }
    if (added)
      absl::StrAppend(&resp->body(), type_used_memory_metric);
  }

  // Stats metrics
  AppendMetricWithoutLabels("connections_received_total", "", conn_stats.conn_received_cnt,
                            MetricType::COUNTER, &resp->body());

  AppendMetricHeader("commands_processed_total", "", MetricType::COUNTER, &resp->body());
  AppendMetricValue("commands_processed_total", conn_stats.command_cnt_main, {"listener"}, {"main"},
                    &resp->body());
  AppendMetricValue("commands_processed_total", conn_stats.command_cnt_other, {"listener"},
                    {"other"}, &resp->body());
  AppendMetricWithoutLabels("keyspace_hits_total", "", m.events.hits, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("keyspace_misses_total", "", m.events.misses, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("keyspace_mutations_total", "", m.events.mutations, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("lua_interpreter_cnt", "", m.lua_stats.interpreter_cnt,
                            MetricType::GAUGE, &resp->body());

  AppendMetricWithoutLabels("freed_memory_lua", "", m.lua_stats.gc_freed_memory,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("lua_blocked_total", "", m.lua_stats.blocked_cnt, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("lua_gc_interpreter_return", "", m.lua_stats.interpreter_return,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("lua_force_gc_calls", "", m.lua_stats.force_gc_calls,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("lua_gc_duration_total_sec", "", m.lua_stats.gc_duration_ns * 1e-9,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("backups_total", "", m.loading_stats.backup_count, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("failed_backups_total", "", m.loading_stats.failed_backup_count,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("restores_total", "", m.loading_stats.restore_count,
                            MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("failed_restores_total", "", m.loading_stats.failed_restore_count,
                            MetricType::COUNTER, &resp->body());

  // Net metrics
  AppendMetricWithoutLabels("net_input_recv_total", "", conn_stats.io_read_cnt, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("net_read_yields_total", "", conn_stats.num_read_yields,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("net_input_bytes_total", "", conn_stats.io_read_bytes,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("net_output_bytes_total", "", m.facade_stats.reply_stats.io_write_bytes,
                            MetricType::COUNTER, &resp->body());
  {
    AppendMetricWithoutLabels("reply_duration_seconds", "",
                              m.facade_stats.reply_stats.send_stats.total_duration * 1e-9,
                              MetricType::COUNTER, &resp->body());
    AppendMetricWithoutLabels("reply_total", "", m.facade_stats.reply_stats.send_stats.count,
                              MetricType::COUNTER, &resp->body());
  }

  AppendMetricWithoutLabels("script_error_total", "", m.facade_stats.reply_stats.script_error_count,
                            MetricType::COUNTER, &resp->body());

  AppendMetricHeader("listener_accept_error_total", "Listener accept errors", MetricType::COUNTER,
                     &resp->body());
  AppendMetricValue("listener_accept_error_total", m.refused_conn_max_clients_reached_count,
                    {"reason"}, {"limit_reached"}, &resp->body());
  AppendMetricValue("listener_accept_error_total", m.facade_stats.conn_stats.tls_accept_disconnects,
                    {"reason"}, {"tls_error"}, &resp->body());

  // Per-DB expired/evicted totals
  {
    string exp_str, evict_str;
    for (size_t i = 0; i < m.db_stats.size(); ++i) {
      const auto& s = m.db_stats[i];
      if (s.events.expired_keys > 0)
        AppendMetricValue("expired_keys_total", s.events.expired_keys, {"db"}, {StrCat("db", i)},
                          &exp_str);
      if (s.events.evicted_keys > 0)
        AppendMetricValue("evicted_keys_total", s.events.evicted_keys, {"db"}, {StrCat("db", i)},
                          &evict_str);
    }
    AppendMetricHeader("expired_keys_total", "", MetricType::COUNTER, &resp->body());
    absl::StrAppend(&resp->body(), exp_str);
    AppendMetricHeader("evicted_keys_total", "", MetricType::COUNTER, &resp->body());
    absl::StrAppend(&resp->body(), evict_str);
  }

  // Memory stats
  if (legacy) {
    AppendMetricWithoutLabels("memory_fiberstack_vms_bytes",
                              "virtual memory size used by all the fibers",
                              m.worker_fiber_stack_size, MetricType::GAUGE, &resp->body());

    AppendMetricWithoutLabels(
        "commands_squashing_replies_bytes", "",
        m.facade_stats.reply_stats.squashing_current_reply_size.load(memory_order_relaxed),
        MetricType::GAUGE, &resp->body());

    AppendMetricWithoutLabels("tls_bytes", "", m.tls_bytes, MetricType::GAUGE, &resp->body());
    AppendMetricWithoutLabels("snapshot_serialization_bytes", "", m.serialization_bytes,
                              MetricType::GAUGE, &resp->body());

    AppendMetricWithoutLabels("used_memory_lua", "", m.lua_stats.used_bytes, MetricType::GAUGE,
                              &resp->body());

    AppendMetricWithoutLabels("client_read_buffer_bytes", "", conn_stats.read_buf_capacity,
                              MetricType::GAUGE, &resp->body());
    AppendMetricWithoutLabels("dispatch_queue_bytes", "", conn_stats.dispatch_queue_bytes,
                              MetricType::GAUGE, &resp->body());
    AppendMetricWithoutLabels("pipeline_queue_bytes", "", conn_stats.pipeline_queue_bytes,
                              MetricType::GAUGE, &resp->body());
    AppendMetricWithoutLabels("pipeline_cmd_cache_bytes", "", conn_stats.pipeline_cmd_cache_bytes,
                              MetricType::GAUGE, &resp->body());
  }

  string memory_by_class_bytes;
  AppendMetricHeader("memory_by_class_bytes", "Memory metrics", MetricType::GAUGE,
                     &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.lua_stats.used_bytes, {"class"}, {"used_lua"},
                    &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.worker_fiber_stack_size, {"class"},
                    {"fiberstack_vms"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.tls_bytes, {"class"}, {"tls"},
                    &memory_by_class_bytes);

  const size_t squashed =
      m.facade_stats.reply_stats.squashing_current_reply_size.load(memory_order_relaxed);

  AppendMetricValue("memory_by_class_bytes", squashed, {"class"}, {"commands_squashing_replies"},
                    &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", conn_stats.pipeline_cmd_cache_bytes, {"class"},
                    {"pipeline_cmd_cache"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", conn_stats.pipeline_queue_bytes, {"class"},
                    {"pipeline_queue"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", conn_stats.dispatch_queue_bytes, {"class"},
                    {"dispatch_queue"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", conn_stats.read_buf_capacity, {"class"},
                    {"client_read_buffer"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", total.table_mem_usage, {"class"}, {"table_used"},
                    &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", total.obj_memory_usage, {"class"}, {"object_used"},
                    &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.coordinator_stats.stored_cmd_bytes, {"class"},
                    {"conn_stored_commands"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.search_stats.used_memory, {"class"}, {"search_used"},
                    &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.interned_string_stats.pool_bytes, {"class"},
                    {"interned_string_pool"}, &memory_by_class_bytes);

  AppendMetricValue("memory_by_class_bytes", m.interned_string_stats.pool_table_bytes, {"class"},
                    {"interned_string_table"}, &memory_by_class_bytes);

  // Interned string stats
  AppendMetricWithoutLabels("interned_string_entries", "Number of unique interned strings",
                            m.interned_string_stats.pool_entries, MetricType::GAUGE, &resp->body());
  AppendMetricWithoutLabels("interned_string_hits_total", "Interned string pool hits",
                            m.interned_string_stats.hits, MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("interned_string_misses_total", "Interned string pool misses",
                            m.interned_string_stats.misses, MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("interned_string_entries_dedup_factor",
                            "Deduplication achieved by interned strings",
                            m.interned_string_stats.pool_entries == 0
                                ? 0.0
                                : static_cast<double>(m.interned_string_stats.live_references) /
                                      static_cast<double>(m.interned_string_stats.pool_entries),
                            MetricType::GAUGE, &resp->body());

  // Command stats
  if (!m.cmd_stats_map.empty()) {
    string command_metrics;

    AppendMetricHeader("commands_total", "Total number of commands executed", MetricType::COUNTER,
                       &command_metrics);
    for (const auto& [name, stat] : m.cmd_stats_map) {
      const auto calls = stat.first;
      AppendMetricValue("commands_total", calls, {"cmd"}, {name}, &command_metrics);
    }

    AppendMetricHeader("commands_duration_seconds", "Duration of commands in seconds",
                       MetricType::COUNTER, &command_metrics);
    for (const auto& [name, stat] : m.cmd_stats_map) {
      const double duration_seconds = stat.second * 1e-6;
      AppendMetricValue("commands_duration_seconds", duration_seconds, {"cmd"}, {name},
                        &command_metrics);
    }

    absl::StrAppend(&resp->body(), command_metrics);
  }

  if (m.replica_side_info) {  // replica side
    const auto reconnect_count = m.replica_side_info->summary.reconnect_count;
    AppendMetricWithoutLabels("replica_reconnect_count", "Number of replica reconnects",
                              reconnect_count, MetricType::COUNTER, &resp->body());
  } else {  // Master side
    string replication_lag_metrics;
    vector<ReplicaRoleInfo> replicas_info = dfly_cmd->GetReplicasRoleInfo();
    ReplicationMemoryStats repl_mem;
    dfly_cmd->GetReplicationMemoryStats(&repl_mem);
    if (legacy) {
      AppendMetricWithoutLabels(
          "replication_streaming_bytes", "Stable sync replication memory usage",
          repl_mem.streamer_buf_capacity_bytes, MetricType::GAUGE, &resp->body());
      AppendMetricWithoutLabels("replication_full_sync_bytes", "Full sync memory usage",
                                repl_mem.full_sync_buf_bytes, MetricType::GAUGE, &resp->body());
    }
    AppendMetricValue("memory_by_class_bytes", repl_mem.streamer_buf_capacity_bytes, {"class"},
                      {"replication_streaming"}, &memory_by_class_bytes);
    AppendMetricValue("memory_by_class_bytes", repl_mem.full_sync_buf_bytes, {"class"},
                      {"replication_full_sync"}, &memory_by_class_bytes);

    AppendMetricWithoutLabels("replication_psync_count", "Pync count",
                              m.coordinator_stats.psync_requests_total, MetricType::COUNTER,
                              &resp->body());
    AppendMetricHeader("connected_replica_lag_records", "Lag in records of a connected replica.",
                       MetricType::GAUGE, &replication_lag_metrics);
    for (const auto& replica : replicas_info) {
      AppendMetricValue("connected_replica_lag_records", replica.lsn_lag,
                        {"replica_ip", "replica_port", "replica_state"},
                        {replica.address, absl::StrCat(replica.listening_port), replica.state},
                        &replication_lag_metrics);
    }
    absl::StrAppend(&resp->body(), replication_lag_metrics);
  }

  AppendMetricWithoutLabels("fiber_switch_total", "", m.fiber_switch_cnt, MetricType::COUNTER,
                            &resp->body());
  double delay_seconds = m.fiber_switch_delay_usec * 1e-6;
  AppendMetricWithoutLabels("fiber_switch_delay_seconds_total", "", delay_seconds,
                            MetricType::COUNTER, &resp->body());

  AppendMetricWithoutLabels("fiber_longrun_total", "", m.fiber_longrun_cnt, MetricType::COUNTER,
                            &resp->body());
  double longrun_seconds = m.fiber_longrun_usec * 1e-6;
  AppendMetricWithoutLabels("fiber_longrun_seconds", "", longrun_seconds, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("tx_queue_len", "", m.tx_queue_len, MetricType::GAUGE, &resp->body());

  {
    bool added = false;
    string str;
    AppendMetricHeader("transaction_widths_total", "Transaction counts by their widths",
                       MetricType::COUNTER, &str);

    for (unsigned width = 0; width < shard_set->size(); ++width) {
      uint64_t count = m.coordinator_stats.tx_width_freq_arr[width];

      if (count > 0) {
        AppendMetricValue("transaction_widths_total", count, {"width"}, {StrCat("w", width + 1)},
                          &str);
        added = true;
      }
    }
    if (added)
      absl::StrAppend(&resp->body(), str);
  }

  if (IsClusterEnabled()) {
    string migration_errors_str;
    AppendMetricHeader("migration_errors_total", "Total error numbers of current migrations",
                       MetricType::GAUGE, &migration_errors_str);
    AppendMetricValue("migration_errors_total", m.migration_errors_total, {"num"},
                      {"migration errors"}, &migration_errors_str);
    absl::StrAppend(&resp->body(), migration_errors_str);

    string moved_errors_str;
    uint64_t moved_total_errors = 0;
    if (m.facade_stats.reply_stats.err_count.contains("MOVED")) {
      moved_total_errors = m.facade_stats.reply_stats.err_count.at("MOVED");
    }
    AppendMetricHeader("moved_errors_total", "Total number of moved slot errors",
                       MetricType::COUNTER, &moved_errors_str);
    AppendMetricValue("moved_errors_total", moved_total_errors, {"num"}, {"moved errors"},
                      &moved_errors_str);
    absl::StrAppend(&resp->body(), moved_errors_str);
  }

  string db_key_metrics, db_key_expire_metrics, db_capacity_metrics;

  AppendMetricHeader("db_keys", "Total number of keys by DB", MetricType::GAUGE, &db_key_metrics);
  AppendMetricHeader("db_capacity", "Table capacity by DB", MetricType::GAUGE,
                     &db_capacity_metrics);

  AppendMetricHeader("db_keys_expiring", "Total number of expiring keys by DB", MetricType::GAUGE,
                     &db_key_expire_metrics);

  for (size_t i = 0; i < m.db_stats.size(); ++i) {
    AppendMetricValue("db_keys", m.db_stats[i].key_count, {"db"}, {StrCat("db", i)},
                      &db_key_metrics);
    AppendMetricValue("db_capacity", m.db_stats[i].prime_capacity, {"db"}, {StrCat("db", i)},
                      &db_capacity_metrics);

    AppendMetricValue("db_keys_expiring", m.db_stats[i].expire_count, {"db"}, {StrCat("db", i)},
                      &db_key_expire_metrics);

    AppendMetricValue("keyspace_hits_total", m.db_stats[i].events.hits, {"db"}, {StrCat("db", i)},
                      &resp->body());
    AppendMetricValue("keyspace_misses_total", m.db_stats[i].events.misses, {"db"},
                      {StrCat("db", i)}, &resp->body());
  }

  absl::StrAppend(&resp->body(), db_key_metrics, db_key_expire_metrics, db_capacity_metrics,
                  memory_by_class_bytes);

  AppendMetricWithoutLabels("defrag_invocations", "Defrag invocations",
                            m.shard_stats.defrag_task_invocation_total, MetricType::COUNTER,
                            &resp->body());
  AppendMetricWithoutLabels("defrag_attempts", "Objects examined",
                            m.shard_stats.defrag_attempt_total, MetricType::COUNTER, &resp->body());
  AppendMetricWithoutLabels("defrag_objects_moved", "Objects moved",
                            m.shard_stats.defrag_realloc_total, MetricType::COUNTER, &resp->body());

  AppendMetricHeader("defrag_skipped_total", "Defrag tasks skipped", MetricType::COUNTER,
                     &resp->body());
  AppendMetricValue("defrag_skipped_total", m.shard_stats.defrag_skipped_mem_under_threshold,
                    {"reason"}, {"mem_under_threshold"}, &resp->body());
  AppendMetricValue("defrag_skipped_total", m.shard_stats.defrag_skipped_within_check_interval,
                    {"reason"}, {"within_check_interval"}, &resp->body());
  AppendMetricValue("defrag_skipped_total", m.shard_stats.defrag_skipped_not_enough_fragmentation,
                    {"reason"}, {"not_enough_fragmentation"}, &resp->body());

  AppendMetricWithoutLabels("huffman_tables_built", "Huffman tables built",
                            m.shard_stats.huffman_tables_built, MetricType::COUNTER, &resp->body());

  AppendMetricHeader("list_reads", "List Reads Patterns", MetricType::COUNTER, &resp->body());
  AppendMetricValue("list_reads", m.qlist_stats.total_node_reads, {"type"}, {"total"},
                    &resp->body());
  AppendMetricValue("list_reads", m.qlist_stats.interior_node_reads, {"type"}, {"interior"},
                    &resp->body());

  // Tiered metrics
  {
    AppendMetricWithoutLabels("tiered_entries", "Tiered entries", total.tiered_entries,
                              MetricType::GAUGE, &resp->body());

    // Bytes: used, allocated, capacity
    AppendMetricHeader("tiered_bytes", "Tiered bytes", MetricType::GAUGE, &resp->body());
    AppendMetricValue("tiered_bytes", total.tiered_used_bytes, {"type"}, {"used"}, &resp->body());
    AppendMetricValue("tiered_bytes", m.tiered_stats.cold_storage_bytes, {"type"}, {"cold"},
                      &resp->body());
    AppendMetricValue("tiered_bytes", m.tiered_stats.allocated_bytes, {"type"}, {"allocated"},
                      &resp->body());
    AppendMetricValue("tiered_bytes", m.tiered_stats.capacity_bytes, {"type"}, {"capacity"},
                      &resp->body());

    // Events: stash, fetch, upload, cancel
    AppendMetricHeader("tiered_events", "Tiered events", MetricType::COUNTER, &resp->body());
    AppendMetricValue("tiered_events", m.tiered_stats.total_stashes, {"type"}, {"stash"},
                      &resp->body());
    AppendMetricValue("tiered_events", m.tiered_stats.total_fetches, {"type"}, {"fetch"},
                      &resp->body());
    AppendMetricValue("tiered_events", m.tiered_stats.total_uploads, {"type"}, {"upload"},
                      &resp->body());
    AppendMetricValue("tiered_events", m.tiered_stats.total_cancels, {"type"}, {"cancel"},
                      &resp->body());
    AppendMetricValue("tiered_events", m.tiered_stats.total_deletes, {"type"}, {"delete"},
                      &resp->body());

    // Hits: ram, cool, missed
    AppendMetricHeader("tiered_hits", "Tiered hits", MetricType::COUNTER, &resp->body());
    AppendMetricValue("tiered_hits", m.events.ram_hits, {"type"}, {"ram"}, &resp->body());
    AppendMetricValue("tiered_hits", m.events.ram_cool_hits, {"type"}, {"cool"}, &resp->body());
    AppendMetricValue("tiered_hits", m.events.ram_misses, {"type"}, {"disk"}, &resp->body());

    // Potential problems due to overloading system
    AppendMetricHeader("tiered_overload", "Potential problems due to overloading",
                       MetricType::COUNTER, &resp->body());
    AppendMetricValue("tiered_overload", m.tiered_stats.total_clients_throttled, {"type"},
                      {"client throttling"}, &resp->body());
    AppendMetricValue("tiered_overload", m.tiered_stats.total_stash_overflows, {"type"},
                      {"stash overflows"}, &resp->body());

    AppendMetricHeader("tiered_list_events", "Tiered List Events", MetricType::COUNTER,
                       &resp->body());
    AppendMetricValue("tiered_list_events", m.qlist_stats.offload_requests, {"type"}, {"offload"},
                      &resp->body());
    AppendMetricValue("tiered_list_events", m.qlist_stats.onload_requests, {"type"}, {"onload"},
                      &resp->body());
  }

  // Stream access pattern metrics
  if (m.shard_stats.stream_sequential_accesses || m.shard_stats.stream_random_accesses ||
      m.shard_stats.stream_fetch_all_accesses) {
    AppendMetricHeader("stream_accesses_total", "Total stream accesses by type",
                       MetricType::COUNTER, &resp->body());
    AppendMetricValue("stream_accesses_total", m.shard_stats.stream_sequential_accesses,
                      {"access_type"}, {"sequential"}, &resp->body());
    AppendMetricValue("stream_accesses_total", m.shard_stats.stream_random_accesses,
                      {"access_type"}, {"random"}, &resp->body());
    AppendMetricValue("stream_accesses_total", m.shard_stats.stream_fetch_all_accesses,
                      {"access_type"}, {"fetch_all"}, &resp->body());
  }
}

void ServerFamily::ConfigureMetrics(util::HttpListenerBase* http_base) {
  // The naming of the metrics should be compatible with redis_exporter, see
  // https://github.com/oliver006/redis_exporter/blob/master/exporter/exporter.go#L111

  auto cb = [this](const util::http::QueryArgs& args, util::HttpContext* send) {
    StringResponse resp = util::http::MakeStringResponse(boost::beast::http::status::ok);
    util::http::SetMime(util::http::kTextMime, &resp);
    uint64_t uptime = time(NULL) - start_time_;
    PrintPrometheusMetrics(uptime, GetMetrics(&namespaces->GetDefaultNamespace()), dfly_cmd_.get(),
                           &resp, legacy_format_metrics_);
    return send->Invoke(std::move(resp));
  };

  http_base->RegisterCb("/metrics", cb);
}

void ServerFamily::PauseReplication(bool pause) {
  util::fb2::LockGuard lk(replicaof_mu_);

  // Switch to primary mode.
  if (!IsMaster()) {
    auto repl_ptr = replica_;
    CHECK(repl_ptr);
    repl_ptr->Pause(pause);
  }
}

std::optional<ReplicaOffsetInfo> ServerFamily::GetReplicaOffsetInfo() {
  util::fb2::LockGuard lk(replicaof_mu_);

  // Switch to primary mode.
  if (!IsMaster()) {
    auto repl_ptr = replica_;
    CHECK(repl_ptr);
    return ReplicaOffsetInfo{repl_ptr->GetSyncId(), repl_ptr->GetReplicaOffset()};
  }
  return nullopt;
}

vector<facade::Listener*> ServerFamily::GetNonPriviligedListeners() const {
  std::vector<facade::Listener*> listeners;
  listeners.reserve(listeners.size());
  for (facade::Listener* listener : listeners_) {
    if (!listener->IsPrivilegedInterface()) {
      listeners.push_back(listener);
    }
  }
  return listeners;
}

bool ServerFamily::AreAllReplicasInStableSync() const {
  auto roles = dfly_cmd_->GetReplicasRoleInfo();
  if (roles.empty()) {
    return true;
  }
  auto match = SyncStateName(DflyCmd::SyncState::STABLE_SYNC);
  return std::all_of(roles.begin(), roles.end(),
                     [&match](auto& elem) { return elem.state == match; });
}

optional<Metrics::ReplicaInfo> ServerFamily::GetReplicaSummary() const {
  util::fb2::LockGuard lk(replicaof_mu_);
  if (replica_ == nullptr) {
    return nullopt;
  }

  Metrics::ReplicaInfo info;
  info.summary = replica_->GetSummary();
  for (const auto& cl_repl : cluster_replicas_) {
    info.cl_repl_summary.push_back(cl_repl->GetSummary());
  }

  return info;
}

void ServerFamily::OnClose(ConnectionContext* cntx) {
  dfly_cmd_->OnClose(cntx->conn_state.replication_info.repl_session_id);
}

void ServerFamily::StatsMC(std::string_view section, CommandContext* cmd_ctx) {
  if (!section.empty()) {
    return cmd_ctx->SendError("");
  }
  string info;

#define ADD_LINE(name, val) absl::StrAppend(&info, "STAT " #name " ", val, "\r\n")

  time_t now = time(NULL);
  struct rusage ru;
  getrusage(RUSAGE_SELF, &ru);

  auto dbl_time = [](const timeval& tv) -> double {
    return tv.tv_sec + double(tv.tv_usec) / 1000000.0;
  };

  double utime = dbl_time(ru.ru_utime);
  double systime = dbl_time(ru.ru_stime);
  auto kind = ProactorBase::me()->GetKind();
  const char* multiplex_api = (kind == ProactorBase::IOURING) ? "iouring" : "epoll";

  Metrics m = GetMetrics(&namespaces->GetDefaultNamespace());
  uint64_t uptime = time(NULL) - start_time_;

  const uint32_t total_conns =
      m.facade_stats.conn_stats.num_conns_main + m.facade_stats.conn_stats.num_conns_other;
  ADD_LINE(pid, getpid());
  ADD_LINE(uptime, uptime);
  ADD_LINE(time, now);
  ADD_LINE(version, kGitTag);
  ADD_LINE(libevent, multiplex_api);
  ADD_LINE(pointer_size, sizeof(void*));
  ADD_LINE(rusage_user, utime);
  ADD_LINE(rusage_system, systime);
  ADD_LINE(max_connections, -1);
  ADD_LINE(curr_connections, total_conns);
  ADD_LINE(total_connections, -1);
  ADD_LINE(rejected_connections, -1);
  ADD_LINE(bytes_read, m.facade_stats.conn_stats.io_read_bytes);
  ADD_LINE(bytes_written, m.facade_stats.reply_stats.io_write_bytes);
  ADD_LINE(limit_maxbytes, -1);

  absl::StrAppend(&info, "END\r\n");

  MCReplyBuilder* mc_builder = static_cast<MCReplyBuilder*>(cmd_ctx->rb());
  mc_builder->SendRaw(info);

#undef ADD_LINE
}

GenericError ServerFamily::DoSave(bool ignore_state) {
  const CommandId* cid = service().FindCmd("SAVE");
  CHECK_NOTNULL(cid);
  boost::intrusive_ptr<Transaction> trans(new Transaction{cid});
  trans->InitByArgs(&namespaces->GetDefaultNamespace(), 0, {});
  return DoSave(SaveCmdOptions{absl::GetFlag(FLAGS_df_snapshot_format), {}, {}}, trans.get(),
                ignore_state);
}

GenericError ServerFamily::DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_opts,
                                               Transaction* trans, DoSaveCheckAndStartOpts opts) {
  auto [ignore_state, bg_save] = opts;
  auto state = ServerState::tlocal()->gstate();

  // In some cases we want to create a snapshot even if server is not active, f.e in takeover
  if (!ignore_state && (state != GlobalState::ACTIVE && state != GlobalState::SHUTTING_DOWN)) {
    return GenericError{make_error_code(errc::operation_in_progress),
                        StrCat(GlobalStateName(state), " - can not save database")};
  }

  std::shared_ptr<SaveStagesController> controller;
  {
    util::fb2::LockGuard lk(save_mu_);
    if (save_controller_) {
      return GenericError{make_error_code(errc::operation_in_progress),
                          "SAVING - can not save database"};
    }

    auto snapshot_storage = save_cmd_opts.cloud_uri.empty()
                                ? snapshot_storage_
                                : CreateCloudSnapshotStorage(save_cmd_opts.cloud_uri);

    controller = make_shared<SaveStagesController>(detail::SaveStagesInputs{
        save_cmd_opts.new_version, save_cmd_opts.cloud_uri, save_cmd_opts.basename, trans,
        &service_, fq_threadpool_.get(), snapshot_storage, opts.bg_save});
    save_controller_ = controller;
  }

  // Initialize resources outside of mutex (this may take time for S3 operations)
  auto res = controller->Init();
  if (res) {
    DCHECK_EQ(res->error, true);
    thread_safe_save_info_.Update([&](SaveInfoData* data) {
      data->last_error = res->error;
      data->last_error_time = res->save_time;
      data->failed_duration_sec = res->duration_sec;
      if (bg_save) {
        data->last_bgsave_status = false;
      }
    });

    // Reset the controller under lock if initialization failed.
    util::fb2::LockGuard lk(save_mu_);
    if (save_controller_ == controller) {
      save_controller_.reset();
    }
    return res->error;
  }

  // Success - update state
  controller->Start();
  thread_safe_save_info_.Update(
      [bg_save](SaveInfoData* data) { data->bgsave_in_progress = bg_save; });

  return {};
}

GenericError ServerFamily::WaitUntilSaveFinished(Transaction* trans, bool ignore_state) {
  std::shared_ptr<SaveStagesController> controller;
  {
    util::fb2::LockGuard lk(save_mu_);
    controller = save_controller_;
  }

  if (!controller) {
    return GenericError{make_error_code(errc::operation_not_supported), "Save not in progress"};
  }

  controller->WaitAllSnapshots();
  detail::SaveInfo save_info;

  VLOG(1) << "Before WaitUntilSaveFinished::Finalize";
  bool is_bg_save;
  {
    util::fb2::LockGuard lk(save_mu_);
    // It's possible that another save was initiated and the controller has changed.
    // We only finalize and reset if it's still the same one we were waiting for.
    if (save_controller_ == controller) {
      save_info = save_controller_->Finalize();
      is_bg_save = save_controller_->IsBgSave();
      save_controller_.reset();
    } else {
      // Another save has started. The old one is already finalized by the new one.
      // We just need to get the info.
      return GenericError("Save operation was superseded by another save");
    }
  }

  thread_safe_save_info_.Update([&](SaveInfoData* data) {
    if (is_bg_save) {
      data->bgsave_in_progress = false;
      data->last_bgsave_status = !save_info.error;
    }

    if (save_info.error) {
      data->last_error = save_info.error;
      data->last_error_time = save_info.save_time;
      data->failed_duration_sec = save_info.duration_sec;
    } else {
      data->save_time = save_info.save_time;
      data->success_duration_sec = save_info.duration_sec;
      data->file_name = save_info.file_name;
      data->freq_map = save_info.freq_map;
    }
  });

  return save_info.error;
}

GenericError ServerFamily::DoSave(const SaveCmdOptions& save_cmd_opts, Transaction* trans,
                                  bool ignore_state) {
  DoSaveCheckAndStartOpts opts{.ignore_state = ignore_state};
  if (auto ec = DoSaveCheckAndStart(save_cmd_opts, trans, opts); ec) {
    return ec;
  }

  return WaitUntilSaveFinished(trans, ignore_state);
}

bool ServerFamily::TEST_IsSaving() const {
  std::atomic_bool is_saving{false};
  shard_set->pool()->AwaitFiberOnAll([&](auto*) {
    if (SliceSnapshot::IsSnaphotInProgress())
      is_saving.store(true, std::memory_order_relaxed);
  });
  return is_saving.load(std::memory_order_relaxed);
}

void ServerFamily::Drakarys(Transaction* transaction, DbIndex db_ind, bool wait) {
  VLOG(1) << "Drakarys";

  vector<fb2::Fiber> fibers(shard_set->size());
  transaction->Execute(
      [db_ind, &fibers](Transaction* t, EngineShard* shard) {
        fibers[shard->shard_id()] = t->GetDbSlice(shard->shard_id()).FlushDb(db_ind);
        return OpStatus::OK;
      },
      true);

  auto action = wait ? &fb2::Fiber::JoinIfNeeded : &fb2::Fiber::Detach;
  for (auto& f : fibers)
    (f.*action)();
}

SaveInfoData ServerFamily::GetLastSaveInfo() const {
  return thread_safe_save_info_.Get();
}

void ServerFamily::DbSize(CmdArgList args, CommandContext* cmd_cntx) {
  atomic_ulong num_keys{0};

  auto* cntx = cmd_cntx->server_conn_cntx();
  shard_set->RunBriefInParallel(
      [&](EngineShard* shard) {
        auto db_size = cntx->ns->GetDbSlice(shard->shard_id()).DbSize(cntx->conn_state.db_index);
        num_keys.fetch_add(db_size, memory_order_relaxed);
      },
      [](ShardId) { return true; });

  return cmd_cntx->rb()->SendLong(num_keys.load(memory_order_relaxed));
}

void ServerFamily::CancelBlockingOnThread(std::function<OpStatus(ArgSlice)> status_cb) {
  auto cb = [status_cb](unsigned thread_index, util::Connection* conn) {
    if (auto fcntx = static_cast<facade::Connection*>(conn)->cntx(); fcntx) {
      auto* cntx = static_cast<ConnectionContext*>(fcntx);
      if (cntx->transaction) {
        cntx->transaction->CancelBlocking(status_cb);
      }
    }
  };

  for (auto* listener : listeners_) {
    listener->TraverseConnectionsOnThread(cb, UINT32_MAX, nullptr);
  }
}

string GetPassword() {
  string flag = GetFlag(FLAGS_requirepass);
  if (!flag.empty()) {
    return flag;
  }

  const char* env_var = getenv("DFLY_PASSWORD");
  if (env_var) {
    return env_var;
  }

  return "";
}

void ServerFamily::SendInvalidationMessages() const {
  // send invalidation message (caused by flushdb) to all the clients which
  // turned on client tracking
  auto cb = [](unsigned thread_index, util::Connection* conn) {
    facade::ConnectionContext* fc = static_cast<facade::Connection*>(conn)->cntx();
    if (fc) {
      ConnectionContext* cntx = static_cast<ConnectionContext*>(fc);
      if (cntx->conn_state.tracking_info_.IsTrackingOn()) {
        facade::Connection::InvalidationMessage x;
        x.invalidate_due_to_flush = true;
        cntx->conn()->SendInvalidationMessageAsync(x);
      }
    }
  };
  for (auto* listener : listeners_) {
    listener->TraverseConnections(cb);
  }
}

void ServerFamily::FlushDb(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() > 1)
    return cmd_cntx->SendError(kSyntaxErr);

  bool sync = CmdArgParser{args}.Check("SYNC");
  string_view cmd_name = cmd_cntx->tx()->GetCId()->name();
  DbIndex index = cmd_name == "FLUSHALL" ? DbSlice::kDbAll : cmd_cntx->tx()->GetDbIndex();
  Drakarys(cmd_cntx->tx(), index, sync);
  SendInvalidationMessages();
  cmd_cntx->rb()->SendOk();
}

bool ServerFamily::DoAuth(ConnectionContext* cntx, std::string_view username,
                          std::string_view password) {
  const auto* registry = ServerState::tlocal()->user_registry;
  CHECK(registry);
  const bool is_authorized = registry->AuthUser(username, password);
  if (is_authorized) {
    cntx->authed_username = username;
    auto cred = registry->GetCredentials(username);
    cntx->acl_commands = cred.acl_commands;
    cntx->keys = std::move(cred.keys);
    cntx->pub_sub = std::move(cred.pub_sub);
    cntx->ns = &namespaces->GetOrInsert(cred.ns);
    cntx->authenticated = true;
    cntx->acl_db_idx = cred.db;
    if (cred.db == std::numeric_limits<size_t>::max()) {
      cntx->conn_state.db_index = 0;
    } else {
      auto cb = [ns = cntx->ns, index = cred.db](EngineShard* shard) {
        auto& db_slice = ns->GetDbSlice(shard->shard_id());
        db_slice.ActivateDb(index);
        return OpStatus::OK;
      };
      shard_set->RunBriefInParallel(std::move(cb));
      cntx->conn_state.db_index = cred.db;
    }
  }
  return is_authorized;
}

void ServerFamily::Auth(CmdArgList args, CommandContext* cmd_cntx) {
  if (args.size() > 2) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  // non admin port auth
  if (!cntx->conn()->IsPrivileged()) {
    const bool one_arg = args.size() == 1;
    std::string_view username = one_arg ? "default" : facade::ToSV(args[0]);
    const size_t index = one_arg ? 0 : 1;
    std::string_view password = facade::ToSV(args[index]);
    if (DoAuth(cntx, username, password)) {
      return cmd_cntx->rb()->SendOk();
    }
    auto& log = ServerState::tlocal()->acl_log;
    using Reason = acl::AclLog::Reason;
    log.Add(*cntx, "AUTH", Reason::AUTH, std::string(username));
    return cmd_cntx->SendError(facade::kAuthRejected, facade::kNoAuthErrType);
  }

  if (!cntx->req_auth) {
    return cmd_cntx->SendError(
        "AUTH <password> called without any password configured for "
        "admin port. Are you sure your configuration is correct?");
  }

  string_view pass = ArgS(args, 0);
  if (pass == GetPassword()) {
    cntx->authenticated = true;
    cmd_cntx->rb()->SendOk();
  } else {
    return cmd_cntx->SendError(facade::kAuthRejected, facade::kNoAuthErrType);
  }
}

void ServerFamily::ClientUnPauseCmd(CmdArgList args, CommandContext* cmd_cntx) {
  if (!args.empty()) {
    return cmd_cntx->SendError(facade::kSyntaxErr);
  }
  is_c_pause_in_progress_.store(false, std::memory_order_relaxed);
  cmd_cntx->rb()->SendOk();
}

void ServerFamily::ChangeConnectionAccept(bool accept) {
  DCHECK_NE(accept, accepting_connections_);
  auto h = accept ? &ListenerInterface::resume_accepting : &ListenerInterface::pause_accepting;
  for (auto* listener : GetNonPriviligedListeners())
    listener->socket()->proactor()->Await([listener, h]() { (listener->*h)(); });
  accepting_connections_ = accept;
}

void ClientHelp(SinkReplyBuilder* builder) {
  string_view help_arr[] = {
      "CLIENT <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
      "CACHING (YES|NO)",
      "    Enable/disable tracking of the keys for next command in OPTIN/OPTOUT modes.",
      "GETNAME",
      "    Return the name of the current connection.",
      "ID",
      "    Return the ID of the current connection.",
      "KILL <ip:port>",
      "    Kill connection made from <ip:port>.",
      "KILL <option> <value> [<option> <value> [...]]",
      "    Kill connections. Options are:",
      "    * ADDR (<ip:port>|<unixsocket>:0)",
      "      Kill connections made from the specified address",
      "    * LADDR (<ip:port>|<unixsocket>:0)",
      "      Kill connections made to specified local address",
      "    * ID <client-id>",
      "      Kill connections by client id.",
      "INFO",
      "    Return information about the current client connection.",
      "LIST",
      "    Return information about client connections.",
      "UNPAUSE",
      "    Stop the current client pause, resuming traffic.",
      "PAUSE <timeout> [WRITE|ALL]",
      "    Suspend all, or just write, clients for <timeout> milliseconds.",
      "SETNAME <name>",
      "    Assign the name <name> to the current connection.",
      "SETINFO <option> <value>",
      "Set client meta attr. Options are:",
      "    * LIB-NAME: the client lib name.",
      "    * LIB-VER: the client lib version.",
      "TRACKING (ON|OFF) [OPTIN] [OPTOUT] [NOLOOP]",
      "    Control server assisted client side caching.",
      "MIGRATE <client-id> <tid>",
      "    Migrates connection specified by client-id to the specified thread id.",
      "HELP",
      "    Print this help."};
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  return rb->SendSimpleStrArr(help_arr);
}

void ServerFamily::Client(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));
  CmdArgList sub_args = args.subspan(1);
  auto* builder = cmd_cntx->rb();

  if (sub_cmd == "SETNAME") {
    return ClientSetName(sub_args, cmd_cntx);
  } else if (sub_cmd == "GETNAME") {
    return ClientGetName(sub_args, cmd_cntx);
  } else if (sub_cmd == "INFO") {
    return ClientInfo(sub_args, cmd_cntx);
  } else if (sub_cmd == "LIST") {
    return ClientList(sub_args, absl::MakeSpan(listeners_), cmd_cntx);
  } else if (sub_cmd == "PAUSE") {
    return ClientPauseCmd(sub_args, cmd_cntx);
  } else if (sub_cmd == "UNPAUSE") {
    return ClientUnPauseCmd(sub_args, cmd_cntx);
  } else if (sub_cmd == "TRACKING") {
    return ClientTracking(sub_args, cmd_cntx);
  } else if (sub_cmd == "KILL") {
    return ClientKill(sub_args, absl::MakeSpan(listeners_), cmd_cntx);
  } else if (sub_cmd == "CACHING") {
    return ClientCaching(sub_args, cmd_cntx);
  } else if (sub_cmd == "SETINFO") {
    return ClientSetInfo(sub_args, cmd_cntx);
  } else if (sub_cmd == "ID") {
    return ClientId(sub_args, cmd_cntx);
  } else if (sub_cmd == "MIGRATE") {
    return ClientMigrate(sub_args, absl::MakeSpan(listeners_), cmd_cntx);
  } else if (sub_cmd == "HELP") {
    return ClientHelp(builder);
  }

  return cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "CLIENT"), kSyntaxErrType);
}

void ServerFamily::Config(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  auto* builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (sub_cmd == "HELP") {
    string_view help_arr[] = {
        "CONFIG <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        "GET <pattern>",
        "    Return parameters matching the glob-like <pattern> and their values.",
        "SET <directive> <value>",
        "    Set the configuration <directive> to <value>.",
        "RESETSTAT",
        "    Reset statistics reported by the INFO command.",
        "REWRITE",
        "    Rewrite the configuration file with the current configuration.",
        "HELP",
        "    Prints this help.",
    };

    return builder->SendSimpleStrArr(help_arr);
  }

  if (sub_cmd == "SET") {
    if (args.size() != 3) {
      return cmd_cntx->SendError(WrongNumArgsError("config|set"), kConfigErrType);
    }

    string param = absl::AsciiStrToLower(ArgS(args, 1));

    ConfigRegistry::SetResult result = config_registry.Set(param, ArgS(args, 2));

    const char kErrPrefix[] = "CONFIG SET failed (possibly related to argument '";
    switch (result) {
      case ConfigRegistry::SetResult::OK:
        return builder->SendOk();
      case ConfigRegistry::SetResult::UNKNOWN:
        return cmd_cntx->SendError(
            absl::StrCat("Unknown option or number of arguments for CONFIG SET - '", param, "'"),
            kConfigErrType);

      case ConfigRegistry::SetResult::READONLY:
        return cmd_cntx->SendError(
            absl::StrCat(kErrPrefix, param, "') - can't set immutable config"), kConfigErrType);
      case ConfigRegistry::SetResult::INVALID:
        return cmd_cntx->SendError(absl::StrCat(kErrPrefix, param, "') - argument can not be set"),
                                   kConfigErrType);
    }
    ABSL_UNREACHABLE();
  }

  if (sub_cmd == "GET" && args.size() == 2) {
    vector<string> res;
    string_view param = ArgS(args, 1);

    // Support 'databases' for backward compatibility.
    if (param == "databases") {
      res.emplace_back(param);
      res.push_back(absl::StrCat(absl::GetFlag(FLAGS_dbnum)));
    } else {
      vector<string> names = config_registry.List(param);

      for (const auto& name : names) {
        auto value = config_registry.Get(name);
        DCHECK(value.has_value());
        if (value.has_value()) {
          // Convert internal name (search_query_string_bytes) back to user-facing format
          // (search.query-string-bytes)
          string display_name = DenormalizeConfigName(name);
          res.push_back(display_name);
          res.push_back(*value);
        }
      }
    }
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    return rb->SendBulkStrArr(res, CollectionType::MAP);
  }

  if (sub_cmd == "REWRITE") {
    if (auto ec = RewriteConfigFile(); ec) {
      return cmd_cntx->SendError(ec.Format(), kConfigErrType);
    }
    return builder->SendOk();
  }

  if (sub_cmd == "RESETSTAT") {
    ResetStat(cmd_cntx->server_conn_cntx()->ns);
    return builder->SendOk();
  } else {
    return cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "CONFIG"), kSyntaxErrType);
  }
}

void ServerFamily::Debug(CmdArgList args, CommandContext* cmd_cntx) {
  DebugCmd dbg_cmd{this, &service_.cluster_family(), cmd_cntx->server_conn_cntx()};

  return dbg_cmd.Run(args, cmd_cntx);
}

void ServerFamily::Memory(CmdArgList args, CommandContext* cmd_cntx) {
  MemoryCmd mem_cmd{this, cmd_cntx};

  return mem_cmd.Run(args);
}

void ServerFamily::Shrink(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  auto cb = [key](Transaction* t, EngineShard* shard) -> OpResult<int64_t> {
    auto& db_slice = t->GetDbSlice(shard->shard_id());
    auto it = db_slice.FindReadOnly(t->GetDbContext(), key).it;
    if (!IsValid(it)) {
      return OpStatus::KEY_NOTFOUND;
    }

    const PrimeValue& pv = it->second;
    unsigned encoding = pv.Encoding();
    unsigned obj_type = pv.ObjType();

    // Only DenseSet-based structures (set or hash with kEncodingStrMap2)
    if (encoding != kEncodingStrMap2 || (obj_type != OBJ_SET && obj_type != OBJ_HASH)) {
      return OpStatus::WRONG_TYPE;
    }

    DenseSet* ds = static_cast<DenseSet*>(pv.RObjPtr());
    ds->set_time(MemberTimeSeconds(t->GetDbContext().time_now_ms));
    size_t current_size = ds->UpperBoundSize();
    size_t bucket_count = ds->BucketCount();

    if (current_size == 0 || bucket_count == 0) {
      return 0;
    }

    size_t optimal_size = std::max(size_t(8), absl::bit_ceil(current_size));
    if (optimal_size >= bucket_count) {
      return 0;
    }

    size_t bucket_bytes_before = bucket_count * sizeof(void*);
    ds->Shrink(optimal_size);
    size_t bucket_bytes_after = ds->BucketCount() * sizeof(void*);

    return bucket_bytes_before - bucket_bytes_after;
  };

  OpResult<int64_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result.status() == OpStatus::KEY_NOTFOUND) {
    return rb->SendNull();
  }
  if (result.status() == OpStatus::WRONG_TYPE) {
    return cmd_cntx->SendError("WRONGTYPE Key is not a set or hash with DenseSet encoding");
  }
  if (!result) {
    return cmd_cntx->SendError(result.status());
  }

  rb->SendLong(*result);
}

void ServerFamily::BgSaveFb(boost::intrusive_ptr<Transaction> trans) {
  GenericError ec = WaitUntilSaveFinished(trans.get());
  if (ec) {
    LOG(INFO) << "Error in BgSaveFb: " << ec.Format();
  }
}

std::optional<SaveCmdOptions> ServerFamily::GetSaveCmdOpts(CmdArgList args,
                                                           CommandContext* cmd_cntx) {
  if (args.size() > 3) {
    cmd_cntx->SendError(kSyntaxErr);
    return {};
  }

  SaveCmdOptions save_cmd_opts;
  save_cmd_opts.new_version = absl::GetFlag(FLAGS_df_snapshot_format);

  if (args.size() >= 1) {
    string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));
    if (sub_cmd == "DF") {
      save_cmd_opts.new_version = true;
    } else if (sub_cmd == "RDB") {
      save_cmd_opts.new_version = false;
    } else {
      cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "SAVE"), kSyntaxErrType);
      return {};
    }
  }

  if (args.size() >= 2) {
    if (detail::IsS3Path(ArgS(args, 1))) {
#ifdef WITH_AWS
      save_cmd_opts.cloud_uri = ArgS(args, 1);
#else
      LOG(ERROR) << "Compiled without AWS support";
      exit(1);
#endif
    } else if (detail::IsGCSPath(ArgS(args, 1))) {
      save_cmd_opts.cloud_uri = ArgS(args, 1);
    } else {
      // no cloud_uri get basename and return
      save_cmd_opts.basename = ArgS(args, 1);
      return save_cmd_opts;
    }
    // cloud_uri is set so get basename if provided
    if (args.size() == 3) {
      save_cmd_opts.basename = ArgS(args, 2);
    }
  }

  return save_cmd_opts;
}

// SAVE [DF|RDB] [CLOUD_URI] [BASENAME]
// TODO add missing [SCHEDULE]
void ServerFamily::BgSave(CmdArgList args, CommandContext* cmd_cntx) {
  auto maybe_res = GetSaveCmdOpts(args, cmd_cntx);
  if (!maybe_res) {
    return;
  }

  DoSaveCheckAndStartOpts opts{.bg_save = true};
  if (auto ec = DoSaveCheckAndStart(*maybe_res, cmd_cntx->tx(), opts); ec) {
    return cmd_cntx->SendError(ec.Format());
  }
  bg_save_fb_.JoinIfNeeded();
  bg_save_fb_ = fb2::Fiber("bg_save_fiber", &ServerFamily::BgSaveFb, this,
                           boost::intrusive_ptr<Transaction>(cmd_cntx->tx()));
  cmd_cntx->rb()->SendOk();
}

// SAVE [DF|RDB] [CLOUD_URI] [BASENAME]
// Allows saving the snapshot of the dataset on disk, potentially overriding the format
// and the snapshot name.
void ServerFamily::Save(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  auto maybe_res = GetSaveCmdOpts(args, cmd_cntx);
  if (!maybe_res) {
    return;
  }

  GenericError ec = DoSave(*maybe_res, cmd_cntx->tx());
  if (ec) {
    return cmd_cntx->SendError(ec.Format());
  } else {
    rb->SendOk();
  }
}

static void MergeDbSliceStats(const DbSlice::Stats& src, Metrics* dest) {
  if (src.db_stats.size() > dest->db_stats.size())
    dest->db_stats.resize(src.db_stats.size());

  for (size_t i = 0; i < src.db_stats.size(); ++i)
    dest->db_stats[i] += src.db_stats[i];

  dest->events += src.events;
  dest->small_string_bytes += src.small_string_bytes;
}

void ServerFamily::ResetStat(Namespace* ns) {
  shard_set->pool()->AwaitBrief(
      [registry = service_.mutable_registry(), ns](unsigned index, auto*) {
        registry->ResetCallStats(index);
        EngineShard* shard = EngineShard::tlocal();
        if (shard) {
          auto& db_slice = ns->GetDbSlice(shard->shard_id());
          db_slice.ResetEvents();
        }
        facade::ResetStats();
        ServerState::tlocal()->exec_freq_count.clear();

        auto reset_cb = [](uint64_t) -> uint64_t { return 0u; };
        ServerState::tlocal()->stats.tx_width_freq_arr.apply(reset_cb);
        ServerState::tlocal()->stats.squash_width_freq_arr.apply(reset_cb);
      });
}

Metrics ServerFamily::GetMetrics(Namespace* ns) const {
  Metrics result;
  util::fb2::Mutex mu;

  uint64_t start = absl::GetCurrentTimeNanos();

  auto cmd_stat_cb = [&dest = result.cmd_stats_map](string_view name, const CmdCallStats& stat) {
    auto& [calls, sum] = dest[absl::AsciiStrToLower(name)];
    calls += stat.first;
    sum += stat.second;
  };

  auto cb = [&](unsigned index, ProactorBase* pb) {
    EngineShard* shard = EngineShard::tlocal();
    ServerState* ss = ServerState::tlocal();

    lock_guard lk(mu);

    result.fiber_switch_cnt += fb2::FiberSwitchEpoch();
    result.fiber_switch_delay_usec += fb2::FiberSwitchDelayUsec();
    result.fiber_longrun_cnt += fb2::FiberLongRunCnt();
    result.fiber_longrun_usec += fb2::FiberLongRunSumUsec();
    result.worker_fiber_stack_size += fb2::WorkerFibersStackSize();
    result.worker_fiber_count += fb2::WorkerFibersCount();
    result.blocked_tasks += TaskQueue::blocked_submitters();

    result.coordinator_stats.Add(ss->stats);

    result.qps += uint64_t(ss->MovingSum6());
    result.facade_stats += *tl_facade_stats;
    result.serialization_bytes += SliceSnapshot::GetThreadLocalMemoryUsage();

    if (shard) {
      result.heap_used_bytes += shard->UsedMemory();
      MergeDbSliceStats(ns->GetDbSlice(shard->shard_id()).GetStats(), &result);
      result.shard_stats += shard->stats();

      if (shard->tiered_storage()) {
        result.tiered_stats += shard->tiered_storage()->GetStats();
      }

      if (shard->search_indices()) {
        result.search_stats += shard->search_indices()->GetStats();
      }

      result.qlist_stats += QList::stats;

      result.traverse_ttl_per_sec += shard->GetMovingSum6(EngineShard::TTL_TRAVERSE);
      result.delete_ttl_per_sec += shard->GetMovingSum6(EngineShard::TTL_DELETE);
      if (result.tx_queue_len < shard->txq()->size())
        result.tx_queue_len = shard->txq()->size();

      if (shard->journal()) {
        result.lsn_buffer_size += journal::LsnBufferSize();
        result.lsn_buffer_bytes += journal::LsnBufferBytes();
      }
    }  // if (shard)

    result.tls_bytes += Listener::TLSUsedMemoryThreadLocal();
    result.refused_conn_max_clients_reached_count += Listener::RefusedConnectionMaxClientsCount();

    result.lua_stats += InterpreterManager::tl_stats();

    auto connections_lib_name_ver_map = facade::Connection::GetLibStatsTL();
    for (auto& [k, v] : connections_lib_name_ver_map) {
      result.connections_lib_name_ver_map[k] += v;
    }

    auto& send_list = facade::SinkReplyBuilder::pending_list;
    if (!send_list.empty()) {
      DCHECK(std::is_sorted(send_list.begin(), send_list.end(),
                            [](const auto& left, const auto& right) {
                              return left.timestamp_ns < right.timestamp_ns;
                            }));

      auto& oldest_member = send_list.front();
      result.oldest_pending_send_ts =
          min<uint64_t>(result.oldest_pending_send_ts, oldest_member.timestamp_ns);
    }
    service_.mutable_registry()->MergeCallStats(index, cmd_stat_cb);
    result.interned_string_stats += GetInternedStringStats();
  };  // cb

  service_.proactor_pool().AwaitFiberOnAll(std::move(cb));

  uint64_t after_cb = absl::GetCurrentTimeNanos();

  // Normalize moving average stats
  result.qps /= 6;
  result.traverse_ttl_per_sec /= 6;
  result.delete_ttl_per_sec /= 6;

  if (!IsMaster()) {
    result.replica_side_info = GetReplicaSummary();
  }

  {
    util::fb2::LockGuard lk{loading_stats_mu_};
    result.loading_stats = loading_stats_;
  }

  result.migration_errors_total = service_.cluster_family().MigrationsErrorsCount();

  // Update peak stats. We rely on the fact that GetMetrics is called frequently enough to
  // update peak_stats_ from it.
  {
    util::fb2::LockGuard lk{peak_stats_mu_};
    // Note: PeakStats::conn_dispatch_queue_bytes is a legacy name. It now tracks the combined
    // server-wide total of dispatch_queue_bytes and pipeline_queue_bytes for ALL connections.
    UpdateMax(&peak_stats_.conn_dispatch_queue_bytes,
              result.facade_stats.conn_stats.dispatch_queue_bytes +
                  result.facade_stats.conn_stats.pipeline_queue_bytes);
    UpdateMax(&peak_stats_.conn_read_buf_capacity,
              result.facade_stats.conn_stats.read_buf_capacity);
    result.peak_stats = peak_stats_;
  }

  result.peak_stats = peak_stats_;
  result.cmd_latency_map = service_.mutable_registry()->LatencyMap();
  result.used_mem_peak = glob_memory_peaks.used.load(memory_order_relaxed);
  result.used_mem_rss_peak = glob_memory_peaks.rss.load(memory_order_relaxed);

  uint64_t delta_ms = (absl::GetCurrentTimeNanos() - start) / 1'000'000;
  if (delta_ms > 30) {
    uint64_t cb_dur = (after_cb - start) / 1'000'000;
    LOG(INFO) << "GetMetrics took " << delta_ms << " ms, out of which callback took " << cb_dur
              << " ms";
  }
  return result;
}

string ServerFamily::FormatInfoMetrics(const Metrics& m, std::string_view section,
                                       bool priveleged) const {
  string info;
  DbStats total;

  for (const auto& db_stats : m.db_stats)
    total += db_stats;

  auto should_enter = [&](string_view name, bool hidden = false) {
    if ((!hidden && section.empty()) || section == "ALL" || section == name) {
      auto normalized_name = string{name.substr(0, 1)} + absl::AsciiStrToLower(name.substr(1));
      absl::StrAppend(&info, info.empty() ? "" : "\r\n", "# ", normalized_name, "\r\n");
      return true;
    }
    return false;
  };

  auto append = [&info](const absl::AlphaNum& a1, const absl::AlphaNum& a2) {
    absl::StrAppend(&info, a1, ":", a2, "\r\n");
  };

  bool show_managed_info = priveleged || !absl::GetFlag(FLAGS_managed_service_info);

  // For some reason on some distributions (like Fedora and OpenSuse) each call to append
  // increase the stack usage of this function. So we use the lambda trick to avoid this.
  // Also, it's more readable.
  auto add_server_info = [&] {
    ProactorBase* proactor = ProactorBase::me();

    // proactor might be null in tests.
    auto kind = proactor ? ProactorBase::me()->GetKind() : ProactorBase::EPOLL;
    const char* multiplex_api = (kind == ProactorBase::IOURING) ? "iouring" : "epoll";

    append("redis_version", kRedisVersion);
    append("dragonfly_version", GetVersion());
    append("redis_mode", GetRedisMode());
    append("arch_bits", 64);
    // Add process_id for Redis compatibility (same order as Redis INFO output).
    append("process_id", getpid());

    if (show_managed_info) {
      append("os", GetOSString());
      append("thread_count", service_.proactor_pool().size());
    }
    append("multiplexing_api", multiplex_api);
    append("tcp_port", GetFlag(FLAGS_port));

    // Add availability_zone if it's not empty
    const auto& az = GetFlag(FLAGS_availability_zone);
    if (!az.empty()) {
      append("availability_zone", az);
    }

    uint64_t uptime = time(NULL) - start_time_;
    append("uptime_in_seconds", uptime);
    append("uptime_in_days", uptime / (3600 * 24));

    append("hz", GetFlag(FLAGS_hz));
    append("executable", base::kProgramName);
    absl::CommandLineFlag* flagfile_flag = absl::FindCommandLineFlag("flagfile");
    append("config_file", flagfile_flag->CurrentValue());
  };

  auto add_clients_info = [&] {
    append("connected_clients",
           m.facade_stats.conn_stats.num_conns_main + m.facade_stats.conn_stats.num_conns_other);
    append("max_clients", GetFlag(FLAGS_maxclients));
    append("client_read_buffer_bytes", m.facade_stats.conn_stats.read_buf_capacity);
    append("blocked_clients", m.facade_stats.conn_stats.num_blocked_clients);
    append("pipeline_queue_length", m.facade_stats.conn_stats.pipeline_queue_entries);
    append("send_delay_ms", GetDelayMs(m.oldest_pending_send_ts));
    append("timeout_disconnects", m.coordinator_stats.conn_timeout_events);
  };

  auto add_mem_info = [&] {
    append("used_memory", m.heap_used_bytes);
    append("used_memory_human", HumanReadableNumBytes(m.heap_used_bytes));
    append("used_memory_peak", m.used_mem_peak);
    append("used_memory_peak_human", HumanReadableNumBytes(m.used_mem_peak));

    // Virtual memory size, upper bound estimation on the RSS memory used by the fiber stacks.
    append("fibers_stack_vms", m.worker_fiber_stack_size);
    append("fibers_count", m.worker_fiber_count);

    io::StatusData sdata;
    bool success = ReadProcStats(&sdata);
    size_t rss = FetchRssMemory(sdata);
    if (success) {
      append("used_memory_rss", rss);
      append("used_memory_rss_human", HumanReadableNumBytes(rss));
    }
    append("used_memory_peak_rss", glob_memory_peaks.used.load(memory_order_relaxed));

    size_t limit = max_memory_limit.load(memory_order_relaxed);
    append("maxmemory", limit);
    append("maxmemory_human", HumanReadableNumBytes(limit));

    append("used_memory_lua", m.lua_stats.used_bytes);

    // Blob - all these cases where the key/objects are represented by a single blob allocated on
    // heap. For example, strings or intsets. members of lists, sets, zsets etc
    // are not accounted for to avoid complex computations. In some cases, when number of members
    // is known we approximate their allocations by taking 16 bytes per member.
    append("object_used_memory", total.obj_memory_usage);

    for (unsigned type = 0; type < total.memory_usage_by_type.size(); type++) {
      size_t mem = total.memory_usage_by_type[type];
      if (mem > 0) {
        append(absl::StrCat("type_used_memory_", ObjTypeToString(type)), mem);
      }
    }
    append("table_used_memory", total.table_mem_usage);
    append("prime_capacity", total.prime_capacity);
    append("num_entries", total.key_count);
    append("inline_keys", total.inline_keys);
    append("small_string_bytes", m.small_string_bytes);
    append("pipeline_cache_bytes", m.facade_stats.conn_stats.pipeline_cmd_cache_bytes);
    append("dispatch_queue_bytes", m.facade_stats.conn_stats.dispatch_queue_bytes);
    append("pipeline_queue_bytes", m.facade_stats.conn_stats.pipeline_queue_bytes);
    append("dispatch_queue_subscriber_bytes",
           m.facade_stats.conn_stats.dispatch_queue_subscriber_bytes);
    append("dispatch_queue_peak_bytes", m.peak_stats.conn_dispatch_queue_bytes);
    append("client_read_buffer_peak_bytes", m.peak_stats.conn_read_buf_capacity);
    append("tls_bytes", m.tls_bytes);
    append("snapshot_serialization_bytes", m.serialization_bytes);
    append("commands_squashing_replies_bytes",
           m.facade_stats.reply_stats.squashing_current_reply_size.load(memory_order_relaxed));
    append("psync_buffer_size", m.lsn_buffer_size);
    append("psync_buffer_bytes", m.lsn_buffer_bytes);

    if (GetFlag(FLAGS_cache_mode)) {
      append("cache_mode", "cache");
      // PHP Symphony needs this field to work.
      append("maxmemory_policy", "eviction");
    } else {
      append("cache_mode", "store");
      // Compatible with redis based frameworks.
      append("maxmemory_policy", "noeviction");
    }

    // master
    if (!m.replica_side_info) {
      ReplicationMemoryStats repl_mem;
      dfly_cmd_->GetReplicationMemoryStats(&repl_mem);
      append("replication_streaming_buffer_bytes", repl_mem.streamer_buf_capacity_bytes);
      append("replication_full_sync_buffer_bytes", repl_mem.full_sync_buf_bytes);
    }

    if (auto controller_copy = GetSaveController()) {
      append("save_buffer_bytes", controller_copy->GetSaveBuffersSize());
    }
  };

  auto add_stats_info = [&] {
    auto& conn_stats = m.facade_stats.conn_stats;
    auto& reply_stats = m.facade_stats.reply_stats;

    append("total_connections_received", conn_stats.conn_received_cnt);
    append("total_handshakes_started", conn_stats.handshakes_started);
    append("total_handshakes_completed", conn_stats.handshakes_completed);
    append("total_commands_processed", conn_stats.command_cnt_main + conn_stats.command_cnt_other);
    append("instantaneous_ops_per_sec", m.qps);
    append("total_pipelined_commands", conn_stats.pipelined_cmd_cnt);
    append("pipeline_throttle_total", conn_stats.pipeline_throttle_count);
    append("pipelined_latency_usec", conn_stats.pipelined_cmd_latency);
    append("total_net_input_bytes", conn_stats.io_read_bytes);
    append("connection_migrations", conn_stats.num_migrations);
    append("connection_recv_provided_calls", conn_stats.num_recv_provided_calls);
    append("total_net_output_bytes", reply_stats.io_write_bytes);
    append("rdb_save_usec", m.coordinator_stats.rdb_save_usec);
    append("rdb_save_count", m.coordinator_stats.rdb_save_count);
    append("big_value_preemptions", m.coordinator_stats.big_value_preemptions);
    append("compressed_blobs", m.coordinator_stats.compressed_blobs);
    append("instantaneous_input_kbps", -1);
    append("instantaneous_output_kbps", -1);
    append("rejected_connections", -1);
    append("expired_keys", m.events.expired_keys);
    append("evicted_keys", m.events.evicted_keys);
    append("total_heartbeat_expired_keys", m.shard_stats.total_heartbeat_expired_keys);
    append("total_heartbeat_expired_bytes", m.shard_stats.total_heartbeat_expired_bytes);
    append("total_heartbeat_expired_calls", m.shard_stats.total_heartbeat_expired_calls);
    append("hard_evictions", m.events.hard_evictions);
    append("garbage_checked", m.events.garbage_checked);
    append("garbage_collected", m.events.garbage_collected);
    append("bump_ups", m.events.bumpups);
    append("stash_unloaded", m.events.stash_unloaded);
    append("oom_rejections", m.events.insertion_rejections + m.coordinator_stats.oom_error_cmd_cnt);
    append("traverse_ttl_sec", m.traverse_ttl_per_sec);
    append("delete_ttl_sec", m.delete_ttl_per_sec);
    append("keyspace_hits", m.events.hits);
    append("keyspace_misses", m.events.misses);
    append("keyspace_mutations", m.events.mutations);
    append("total_reads_processed", conn_stats.io_read_cnt);
    append("total_writes_processed", reply_stats.io_write_cnt);
    append("huffenc_attempt_total", m.events.huff_encode_total);
    append("huffenc_success_total", m.events.huff_encode_success);
    append("defrag_attempt_total", m.shard_stats.defrag_attempt_total);
    append("defrag_realloc_total", m.shard_stats.defrag_realloc_total);
    append("defrag_task_invocation_total", m.shard_stats.defrag_task_invocation_total);

    // Number of connections that are currently blocked on grabbing interpreter.
    append("blocked_on_interpreter", m.coordinator_stats.blocked_on_interpreter);
    append("lua_interpreter_cnt", m.lua_stats.interpreter_cnt);

    // Total number of events of when a connection was blocked on grabbing interpreter.
    append("lua_blocked_total", m.lua_stats.blocked_cnt);

    append("lua_interpreter_return", m.lua_stats.interpreter_return);
    append("lua_force_gc_calls", m.lua_stats.force_gc_calls);
    append("lua_gc_freed_memory_total", m.lua_stats.gc_freed_memory);
    append("lua_gc_duration_total_sec", m.lua_stats.gc_duration_ns * 1e-9);
  };

  auto add_tiered_info = [&] {
    append("tiered_entries", total.tiered_entries);
    append("tiered_entries_bytes", total.tiered_used_bytes);
    append("tiered_entries_bytes_human", HumanReadableNumBytes(total.tiered_used_bytes));

    append("tiered_total_stashes", m.tiered_stats.total_stashes);
    append("tiered_total_fetches", m.tiered_stats.total_fetches);
    append("tiered_total_cancels", m.tiered_stats.total_cancels);
    append("tiered_total_deletes", m.tiered_stats.total_deletes);
    append("tiered_total_uploads", m.tiered_stats.total_uploads);
    append("tiered_total_stash_overflows", m.tiered_stats.total_stash_overflows);
    append("tiered_heap_buf_allocations", m.tiered_stats.total_heap_buf_allocs);
    append("tiered_registered_buf_allocations", m.tiered_stats.total_registered_buf_allocs);

    append("tiered_allocated_bytes", m.tiered_stats.allocated_bytes);
    append("tiered_capacity_bytes", m.tiered_stats.capacity_bytes);

    append("tiered_pending_read_cnt", m.tiered_stats.pending_read_cnt);
    append("tiered_pending_stash_cnt", m.tiered_stats.pending_stash_cnt);

    append("tiered_small_bins_cnt", m.tiered_stats.small_bins_cnt);
    append("tiered_small_bins_entries_cnt", m.tiered_stats.small_bins_entries_cnt);
    append("tiered_small_bins_filling_bytes", m.tiered_stats.small_bins_filling_bytes);
    append("tiered_cold_storage_bytes", m.tiered_stats.cold_storage_bytes);
    append("tiered_offloading_steps", m.tiered_stats.total_offloading_steps);
    append("tiered_offloading_stashes", m.tiered_stats.total_offloading_stashes);
    append("tiered_ram_hits", m.events.ram_hits);
    append("tiered_ram_cool_hits", m.events.ram_cool_hits);
    append("tiered_ram_misses", m.events.ram_misses);

    append("tiered_clients_throttled", m.tiered_stats.clients_throttled);
    append("tiered_total_clients_throttled", m.tiered_stats.total_clients_throttled);
  };

  auto add_persistence_info = [&] {
    size_t current_snap_keys = 0;
    size_t total_snap_keys = 0;
    double perc = 0;
    bool is_saving = false;
    uint32_t curent_durration_sec = 0;
    if (auto controller_copy = GetSaveController()) {
      is_saving = true;
      curent_durration_sec = controller_copy->GetCurrentSaveDuration();
      auto res = controller_copy->GetCurrentSnapshotProgress();
      if (res.total_keys != 0) {
        current_snap_keys = res.current_keys;
        total_snap_keys = res.total_keys;
        perc = (static_cast<double>(current_snap_keys) / total_snap_keys) * 100;
      }
    }

    append("current_snapshot_perc", perc);
    append("current_save_keys_processed", current_snap_keys);
    append("current_save_keys_total", total_snap_keys);

    auto save_info = GetLastSaveInfo();
    // when last success save
    append("last_success_save", save_info.save_time);
    append("last_saved_file", save_info.file_name);
    append("last_success_save_duration_sec", save_info.success_duration_sec);

    ServerState* ss = ServerState::tlocal();

    // ss can be null in tests.
    unsigned is_loading = ss && (ss->gstate() == GlobalState::LOADING);
    append("loading", is_loading);
    append("saving", is_saving);
    append("current_save_duration_sec", curent_durration_sec);

    for (const auto& k_v : save_info.freq_map) {
      append(StrCat("rdb_", k_v.first), k_v.second);
    }
    append("rdb_changes_since_last_success_save", m.events.update);

    append("rdb_bgsave_in_progress", static_cast<int>(save_info.bgsave_in_progress));
    std::string val = save_info.last_bgsave_status ? "ok" : "err";
    append("rdb_last_bgsave_status", val);

    // when last failed save
    append("last_failed_save", save_info.last_error_time);
    append("last_error", save_info.last_error.Format());
    append("last_failed_save_duration_sec", save_info.failed_duration_sec);
  };

  auto add_tx_info = [&] {
    append("tx_shard_polls", m.shard_stats.poll_execution_total);
    append("tx_shard_optimistic_total", m.shard_stats.tx_optimistic_total);
    append("tx_shard_ooo_total", m.shard_stats.tx_ooo_total);
    append("tx_global_total", m.coordinator_stats.tx_global_cnt);
    append("tx_normal_total", m.coordinator_stats.tx_normal_cnt);
    append("tx_inline_runs_total", m.coordinator_stats.tx_inline_runs);
    append("tx_schedule_cancel_total", m.coordinator_stats.tx_schedule_cancel_cnt);
    append("tx_batch_scheduled_items_total", m.shard_stats.tx_batch_scheduled_items_total);
    append("tx_batch_schedule_calls_total", m.shard_stats.tx_batch_schedule_calls_total);
    append("tx_with_freq", absl::StrJoin(m.coordinator_stats.tx_width_freq_arr, ","));
    append("squash_with_freq", absl::StrJoin(m.coordinator_stats.squash_width_freq_arr, ","));
    append("tx_queue_len", m.tx_queue_len);

    append("eval_io_coordination_total", m.coordinator_stats.eval_io_coordination_cnt);
    append("eval_shardlocal_coordination_total",
           m.coordinator_stats.eval_shardlocal_coordination_cnt);
    append("eval_squashed_flushes", m.coordinator_stats.eval_squashed_flushes);
  };

  auto add_repl_info = [&] {
    if (!m.replica_side_info) {
      vector<ReplicaRoleInfo> replicas_info = dfly_cmd_->GetReplicasRoleInfo();
      append("role", "master");
      append("connected_slaves", replicas_info.size());

      if (show_managed_info) {
        for (size_t i = 0; i < replicas_info.size(); i++) {
          auto& r = replicas_info[i];
          // e.g. slave0:ip=172.19.0.3,port=6379,state=full_sync
          append(StrCat("slave", i), StrCat("ip=", r.address, ",port=", r.listening_port,
                                            ",state=", r.state, ",lag=", r.lsn_lag));
        }
      }
      append("master_replid", master_replid_);
    } else {
      append("role", GetFlag(FLAGS_info_replication_valkey_compatible) ? "slave" : "replica");

      auto replication_info_cb = [&](const Replica::Summary& rinfo) {
        append("master_host", rinfo.host);
        append("master_port", rinfo.port);

        const char* link = rinfo.master_link_established ? "up" : "down";
        append("master_link_status", link);
        append("master_last_io_seconds_ago", rinfo.master_last_io_sec);
        append("master_sync_in_progress", rinfo.full_sync_in_progress);
        append("master_replid", rinfo.master_id);
        if (rinfo.full_sync_done || (rinfo.passed_full_sync && !rinfo.master_link_established))
          append("slave_repl_offset", rinfo.repl_offset_sum);
        append("slave_priority", GetFlag(FLAGS_replica_priority));
        append("slave_read_only", 1);
        append("psync_attempts", rinfo.psync_attempts);
        append("psync_successes", rinfo.psync_successes);
      };

      const auto& info = *m.replica_side_info;

      replication_info_cb(info.summary);
      // Special case, when multiple masters replicate to a single replica.
      for (const auto& summary : info.cl_repl_summary) {
        replication_info_cb(summary);
      }
    }
  };

  auto add_cmdstats = [&] {
    auto append_sorted = [&append](string_view prefix, auto display) {
      sort(display.begin(), display.end());
      for (const auto& k_v : display) {
        append(StrCat(prefix, k_v.first), k_v.second);
      }
    };

    vector<pair<string_view, string>> commands;
    for (const auto& [name, stats] : m.cmd_stats_map) {
      const auto calls = stats.first, sum = stats.second;
      commands.push_back(
          {name, absl::StrJoin({absl::StrCat("calls=", calls), absl::StrCat("usec=", sum),
                                absl::StrCat("usec_per_call=", static_cast<double>(sum) / calls)},
                               ",")});
    }

    auto unknown_cmd = service_.UknownCmdMap();

    append_sorted("cmdstat_", std::move(commands));
    append_sorted("unknown_",
                  vector<pair<string_view, uint64_t>>(unknown_cmd.cbegin(), unknown_cmd.cend()));
  };

  if (should_enter("SERVER")) {
    add_server_info();
  }

  if (should_enter("CLIENTS")) {
    add_clients_info();
  }

  if (should_enter("MEMORY")) {
    add_mem_info();
  }

  if (should_enter("STATS")) {
    add_stats_info();
  }

  if (should_enter("TIERED", true)) {
    add_tiered_info();
  }

  if (should_enter("PERSISTENCE", true)) {
    add_persistence_info();
  }

  if (should_enter("TRANSACTION", true)) {
    add_tx_info();
  }

  if (should_enter("REPLICATION")) {
    add_repl_info();
  }

  if (should_enter("COMMANDSTATS", true)) {
    add_cmdstats();
  }

  if (should_enter("MODULES")) {
    append("module",
           "name=ReJSON,ver=20000,api=1,filters=0,usedby=[search],using=[],options=[handle-io-"
           "errors]");
    append("module",
           "name=search,ver=20000,api=1,filters=0,usedby=[],using=[ReJSON],options=[handle-io-"
           "errors]");
  }

#ifdef WITH_SEARCH
  if (should_enter("SEARCH", true)) {
    append("search_memory", m.search_stats.used_memory);
    append("search_num_indices", m.search_stats.num_indices);
    append("search_num_entries", m.search_stats.num_entries);
  }
#endif

  if (should_enter("ERRORSTATS", true)) {
    for (const auto& k_v : m.facade_stats.reply_stats.err_count) {
      append(k_v.first, k_v.second);
    }
  }

  if (should_enter("KEYSPACE")) {
    for (size_t i = 0; i < m.db_stats.size(); ++i) {
      const auto& stats = m.db_stats[i];
      bool show = (i == 0) || (stats.key_count > 0);
      if (show) {
        size_t total = stats.events.hits + stats.events.misses;
        double hit_ratio =
            (total > 0) ? static_cast<double>(stats.events.hits) / (total)*100.0 : 0.0;
        string val = StrCat("keys=", stats.key_count, ",expires=", stats.expire_count,
                            ",hits=", stats.events.hits, ",misses=", stats.events.misses,
                            ",hit_ratio=", absl::StrFormat("%.2f", hit_ratio),
                            ",avg_ttl=-1");  // TODO
        append(StrCat("db", i), val);
      }
    }
  }

#ifndef __APPLE__
  if (should_enter("CPU")) {
    struct rusage ru, cu, tu;
    getrusage(RUSAGE_SELF, &ru);
    getrusage(RUSAGE_CHILDREN, &cu);
    getrusage(RUSAGE_THREAD, &tu);
    append("used_cpu_sys", StrCat(ru.ru_stime.tv_sec, ".", ru.ru_stime.tv_usec));
    append("used_cpu_user", StrCat(ru.ru_utime.tv_sec, ".", ru.ru_utime.tv_usec));
    append("used_cpu_sys_children", StrCat(cu.ru_stime.tv_sec, ".", cu.ru_stime.tv_usec));
    append("used_cpu_user_children", StrCat(cu.ru_utime.tv_sec, ".", cu.ru_utime.tv_usec));
    append("used_cpu_sys_main_thread", StrCat(tu.ru_stime.tv_sec, ".", tu.ru_stime.tv_usec));
    append("used_cpu_user_main_thread", StrCat(tu.ru_utime.tv_sec, ".", tu.ru_utime.tv_usec));
  }
#endif

  if (should_enter("CLUSTER")) {
    append("cluster_enabled", IsClusterEnabledOrEmulated());
    append("migration_errors_total", service_.cluster_family().MigrationsErrorsCount());
    append("total_migrated_keys", m.shard_stats.total_migrated_keys);
  }

  if (should_enter("LATENCYSTATS")) {
    for (const auto& [cmd_name, hist] : m.cmd_latency_map) {
      if (!hist) {
        continue;
      }

      if (is_histogram_empty(hist)) {
        continue;
      }

      absl::InlinedVector<std::string, 4> stats;
      for (const auto percentile : kLatencyPercentiles) {
        const auto value = hdr_value_at_percentile(hist, percentile);
        // If the percentile is an integer, print it as an integer, otherwise print it as a double
        if (std::trunc(percentile) == percentile) {
          stats.emplace_back(absl::StrFormat("p%d=%d", static_cast<int64_t>(percentile), value));
        } else {
          stats.emplace_back(absl::StrFormat("p%g=%d", percentile, value));
        }
      }

      append(absl::StrFormat("latency_percentiles_usec_%s", cmd_name), absl::StrJoin(stats, ","));
    }
  }

  return info;
}

void ServerFamily::Info(CmdArgList args, CommandContext* cmd_cntx) {
  std::vector<std::string> sections;
  bool need_metrics{false};  // Save time - do not fetch metrics if we don't need them.
  Metrics metrics;

  sections.reserve(args.size());
  for (const auto& arg : args) {
    sections.emplace_back(absl::AsciiStrToUpper(arg));
    const auto& section = sections.back();
    if (!need_metrics && (section != "SERVER") && (section != "REPLICATION")) {
      need_metrics = true;
    }
  }

  if (need_metrics || sections.empty()) {
    metrics = GetMetrics(cmd_cntx->server_conn_cntx()->ns);
  } else if (!IsMaster()) {
    metrics.replica_side_info = GetReplicaSummary();
  }

  std::string info;
  bool is_priveleged = cmd_cntx->conn()->IsPrivileged();
  // For multiple requested sections, invalid section names are ignored (not included in the
  // output). The command does not abort or return an error if some sections are invalid. This
  // matches Valkey behavior.
  if (sections.empty()) {  // No sections: default to all sections.
    info = FormatInfoMetrics(metrics, "", is_priveleged);
  } else if (sections.size() == 1) {  // Single section
    info = FormatInfoMetrics(metrics, sections[0], is_priveleged);
  } else {  // Multiple sections: concatenate results for each requested section.
    for (const auto& section : sections) {
      const std::string section_str = FormatInfoMetrics(metrics, section, is_priveleged);
      if (!section_str.empty()) {
        if (!info.empty()) {
          absl::StrAppend(&info, "\r\n", section_str);
        } else {
          info = section_str;
        }
      }
    }
  }

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  rb->SendVerbatimString(info);
}

void ServerFamily::Hello(CmdArgList args, CommandContext* cmd_cntx) {
  // If no arguments are provided default to RESP2.
  bool is_resp3 = false;
  bool has_auth = false;
  bool has_setname = false;
  string_view username;
  string_view password;
  string_view clientname;

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!args.empty()) {
    string_view proto_version = ArgS(args, 0);
    is_resp3 = proto_version == "3";
    bool valid_proto_version = proto_version == "2" || is_resp3;
    if (!valid_proto_version) {
      cmd_cntx->SendError(UnknownCmd("HELLO", args));
      return;
    }

    for (uint32_t i = 1; i < args.size(); i++) {
      auto sub_cmd = ArgS(args, i);
      auto moreargs = args.size() - 1 - i;
      if (absl::EqualsIgnoreCase(sub_cmd, "AUTH") && moreargs >= 2) {
        has_auth = true;
        username = ArgS(args, i + 1);
        password = ArgS(args, i + 2);
        i += 2;
      } else if (absl::EqualsIgnoreCase(sub_cmd, "SETNAME") && moreargs > 0) {
        has_setname = true;
        clientname = ArgS(args, i + 1);
        i += 1;
      } else {
        cmd_cntx->SendError(kSyntaxErr);
        return;
      }
    }
  }

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  if (has_auth && !DoAuth(cntx, username, password)) {
    return cmd_cntx->SendError(facade::kAuthRejected, facade::kNoAuthErrType);
  }

  if (cntx->req_auth && !cntx->authenticated) {
    cmd_cntx->SendError(
        "-NOAUTH HELLO must be called with the client already "
        "authenticated, otherwise the HELLO <proto> AUTH <user> <pass> "
        "option can be used to authenticate the client and "
        "select the RESP protocol version at the same time",
        facade::kNoAuthErrType);
    return;
  }

  if (has_setname) {
    cntx->conn()->SetName(string{clientname});
  }

  int proto_version = 2;
  if (is_resp3) {
    proto_version = 3;
    rb->SetRespVersion(RespVersion::kResp3);
  } else {
    // Issuing hello 2 again is valid and should switch back to RESP2
    rb->SetRespVersion(RespVersion::kResp2);
  }

  // Define number of fields in the response - add availability_zone if flag is not empty
  const auto& az = GetFlag(FLAGS_availability_zone);
  const int fields_count = az.empty() ? 7 : 8;

  SinkReplyBuilder::ReplyAggregator agg(rb);
  rb->StartCollection(fields_count, CollectionType::MAP);
  rb->SendBulkString("server");
  rb->SendBulkString("redis");
  rb->SendBulkString("version");
  rb->SendBulkString(kRedisVersion);
  rb->SendBulkString("dragonfly_version");
  rb->SendBulkString(GetVersion());
  rb->SendBulkString("proto");
  rb->SendLong(proto_version);
  rb->SendBulkString("id");
  rb->SendLong(cntx->conn()->GetClientId());
  rb->SendBulkString("mode");
  rb->SendBulkString(GetRedisMode());
  rb->SendBulkString("role");
  rb->SendBulkString(IsMaster() ? "master" : "slave");

  // Add availability_zone to the response if flag is explicitly set and not empty
  if (!az.empty()) {
    rb->SendBulkString("availability_zone");
    rb->SendBulkString(az);
  }
}

void ServerFamily::AddReplicaOf(CmdArgList args, CommandContext* cmd_cntx) {
  util::fb2::LockGuard lk(replicaof_mu_);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (IsMaster()) {
    return cmd_cntx->SendError(
        "Calling ADDREPLICAOFF allowed only after server is already a replica");
  }
  CHECK(replica_);

  auto replicaof_args = ReplicaOfArgs::FromCmdArgs(args);
  if (!replicaof_args.has_value()) {
    return cmd_cntx->SendError(replicaof_args.error());
  }
  if (replicaof_args->IsReplicaOfNoOne()) {
    return cmd_cntx->SendError("ADDREPLICAOF does not support no one");
  }
  LOG(INFO) << "Add Replica " << *replicaof_args;

  auto add_replica = make_unique<Replica>(replicaof_args->host, replicaof_args->port, &service_,
                                          master_replid(), replicaof_args->slot_range);
  GenericError ec = add_replica->Start();
  if (ec) {
    return cmd_cntx->SendError(ec.Format());
  }
  add_replica->StartMainReplicationFiber(nullopt);
  cluster_replicas_.push_back(std::move(add_replica));
  rb->SendOk();
}

void ServerFamily::ReplicaOfInternal(CmdArgList args, CommandContext* cmd_cntx,
                                     ActionOnConnectionFail on_err) {
  std::shared_ptr<Replica> new_replica;
  std::optional<Replica::LastMasterSyncData> last_master_data;
  {
    util::fb2::LockGuard lk(replicaof_mu_);  // Only one REPLICAOF command can run at a time

    // We should not execute replica of command while loading from snapshot.
    ServerState* ss = ServerState::tlocal();
    if (ss->is_master && ss->gstate() == GlobalState::LOADING) {
      cmd_cntx->SendError(kLoadingErr);
      return;
    }

    auto replicaof_args = ReplicaOfArgs::FromCmdArgs(args);
    if (!replicaof_args.has_value()) {
      cmd_cntx->SendError(replicaof_args.error());
      return;
    }

    LOG(INFO) << "Replicating " << *replicaof_args;

    // If NO ONE was supplied, just stop the current replica (if it exists)
    if (replicaof_args->IsReplicaOfNoOne()) {
      if (!ss->is_master) {
        CHECK(replica_);

        SetMasterFlagOnAllThreads(true);  // Flip flag before clearing replica
        // No partial sync for NO ONE flow
        replica_->Stop();
        replica_.reset();

        StopAllClusterReplicas();
      }

      // May not switch to ACTIVE if the process is, for example, shutting down at the same time.
      service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);

      return cmd_cntx->rb()->SendOk();
    }

    // If any replication is in progress, stop it, cancellation should kick in immediately

    if (replica_)
      last_master_data = replica_->Stop();
    StopAllClusterReplicas();

    const GlobalState gstate = ServerState::tlocal()->gstate();
    if (gstate == GlobalState::TAKEN_OVER) {
      service_.SwitchState(GlobalState::TAKEN_OVER, GlobalState::LOADING);
    } else if (auto prev_state = service_.SwitchState(GlobalState::ACTIVE, GlobalState::LOADING);
               prev_state != GlobalState::ACTIVE) {
      LOG(WARNING) << prev_state << " in progress, ignored";
      cmd_cntx->SendError("Invalid state");
      return;
    }

    // Create a new replica and assign it
    new_replica = make_shared<Replica>(replicaof_args->host, replicaof_args->port, &service_,
                                       master_replid(), replicaof_args->slot_range);

    replica_ = new_replica;

    // TODO: disconnect pending blocked clients (pubsub, blocking commands)
    SetMasterFlagOnAllThreads(false);  // Flip flag after assiging replica

  }  // release the lock, lk.unlock()
  // We proceed connecting below without the lock to allow interrupting the replica immediately.
  // From this point and onward, it should be highly responsive.

  GenericError ec{};
  switch (on_err) {
    case ActionOnConnectionFail::kReturnOnError:
      ec = new_replica->Start();
      break;
    case ActionOnConnectionFail::kContinueReplication:
      new_replica->EnableReplication();
      break;
  };

  // If the replication attempt failed, clean up global state. The replica should have stopped
  // internally.
  util::fb2::LockGuard lk(replicaof_mu_);  // Only one REPLICAOF command can run at a time

  // If there was an error above during Start we must not start the main replication fiber.
  // However, it could be the case that Start() above connected succefully and by the time
  // we acquire the lock, the context got cancelled because another ReplicaOf command
  // executed and acquired the replicaof_mu_ before us.
  const bool cancelled = new_replica->IsContextCancelled();
  if (ec || cancelled) {
    if (replica_ == new_replica) {
      service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
      SetMasterFlagOnAllThreads(true);
      replica_.reset();
    }
    cmd_cntx->SendError(ec ? ec.Format() : "replication cancelled");
    return;
  }
  // Successfully connected now we flush
  // If we are called by "Replicate", tx will be null but we do not need
  // to flush anything.
  if (on_err == ActionOnConnectionFail::kReturnOnError) {
    new_replica->StartMainReplicationFiber(last_master_data);
  }
  cmd_cntx->rb()->SendOk();
}

void ServerFamily::StopAllClusterReplicas() {
  // Stop all cluster replication.
  for (auto& replica : cluster_replicas_) {
    replica->Stop();
    replica.reset();
  }
  cluster_replicas_.clear();
}

void ServerFamily::ReplicaOf(CmdArgList args, CommandContext* cmd_cntx) {
  const bool use_replica_of_v2 = absl::GetFlag(FLAGS_experimental_replicaof_v2);
  if (use_replica_of_v2) {
    ReplicaOfInternalV2(args, cmd_cntx, ActionOnConnectionFail::kReturnOnError);
    return;
  }
  ReplicaOfInternal(args, cmd_cntx, ActionOnConnectionFail::kReturnOnError);
}

void ServerFamily::Replicate(string_view host, string_view port) {
  StringVec replicaof_params{string(host), string(port)};

  CmdArgVec args_vec;
  for (auto& s : replicaof_params) {
    args_vec.emplace_back(MutableSlice{s.data(), s.size()});
  }
  CmdArgList args_list = absl::MakeSpan(args_vec);
  io::NullSink sink;
  facade::RedisReplyBuilder rb(&sink);
  const bool use_replica_of_v2 = absl::GetFlag(FLAGS_experimental_replicaof_v2);
  CommandContext cmd_cntx{&rb, nullptr};
  if (use_replica_of_v2) {
    ReplicaOfInternalV2(args_list, &cmd_cntx, ActionOnConnectionFail::kContinueReplication);
    return;
  }
  ReplicaOfInternal(args_list, &cmd_cntx, ActionOnConnectionFail::kContinueReplication);
}

void ServerFamily::StartJournalInShardThreads(Replica* repl_ptr) {
  shard_set->RunBriefInParallel([this, repl_ptr](auto* shard) {
    size_t index = shard->shard_id();
    auto flow_map = repl_ptr->GetFlowMapAtIndex(index);
    size_t rec_executed = repl_ptr->GetRecCountExecutedPerShard(flow_map);
    LOG(INFO) << "Shard " << index << " starts journal at: " << rec_executed;
    journal::StartInThreadAtLsn(rec_executed);
  });
}

void ServerFamily::ReplicaOfNoOne(SinkReplyBuilder* builder) {
  util::fb2::LockGuard lk(replicaof_mu_);

  if (!IsMaster()) {
    CHECK(replica_);

    auto repl_ptr = replica_;
    if (absl::GetFlag(FLAGS_replicaof_no_one_start_journal)) {
      // Start journal and keep offsets.
      StartJournalInShardThreads(repl_ptr.get());
    }
    // flip flag before clearing replica_
    SetMasterFlagOnAllThreads(true);

    last_master_data_ = replica_->Stop();
    replica_.reset();
    StopAllClusterReplicas();
  }

  // May not switch to ACTIVE if the process is, for example, shutting down at the same time.
  service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);

  return builder->SendOk();
}

void ServerFamily::ReplicaOfInternalV2(CmdArgList args, CommandContext* cmd_cntx,
                                       ActionOnConnectionFail on_error)
    ABSL_LOCKS_EXCLUDED(replicaof_mu_) {
  auto replicaof_args = ReplicaOfArgs::FromCmdArgs(args);
  if (!replicaof_args.has_value()) {
    return cmd_cntx->SendError(replicaof_args.error());
  }

  LOG(INFO) << "Initiate replication with: " << *replicaof_args;
  // This is a "weak" check. For example, if the node is already a replica,
  // it could be the case that one of the flows disconnects. The MainReplicationFiber
  // will then loop and if it can't partial sync it will enter LOADING state because of
  // full sync. Note that the fiber is not aware of the replicaof_mu_ so even
  // if that mutex is locked below before any state check we can't really enforce
  // that the old replication fiber won't try to full sync and update the state to LOADING.
  // What is more here is that we always call `replica->Stop()`. So even if we end up in the
  // scenario described, the semantics are well defined. First, cancel the old replica and
  // move on with the new one. Cancelation will be slower and ReplicaOf() will
  // induce higher latency -- but that's ok because it's an highly improbable flow with
  // well defined semantics.
  ServerState* ss = ServerState::tlocal();

  if (IsMaster() && ss->gstate() == GlobalState::LOADING) {
    return cmd_cntx->SendError(kLoadingErr);
  }

  // replicaof no one
  if (replicaof_args->IsReplicaOfNoOne()) {
    return ReplicaOfNoOne(cmd_cntx->rb());
  }

  auto new_replica = make_shared<Replica>(replicaof_args->host, replicaof_args->port, &service_,
                                          master_replid(), replicaof_args->slot_range);
  GenericError ec;
  switch (on_error) {
    case ActionOnConnectionFail::kReturnOnError:
      ec = new_replica->Start();
      break;
    case ActionOnConnectionFail::kContinueReplication:
      new_replica->EnableReplication();
      break;
  };

  if (ec || new_replica->IsContextCancelled()) {
    return cmd_cntx->SendError(ec ? ec.Format() : "replication cancelled");
  }

  // Critical section.
  // 1. Stop the old replica_ if it exists
  // 2. Update all the pointers to the new replica and update master flag
  // 3. Start the main replication fiber
  // 4. Send OK
  util::fb2::LockGuard lk(replicaof_mu_);
  std::optional<Replica::LastMasterSyncData> last_master_data;
  if (replica_)
    last_master_data = replica_->Stop();

  StopAllClusterReplicas();

  if (ServerState::tlocal()->gstate() == GlobalState::TAKEN_OVER)
    service_.SwitchState(GlobalState::TAKEN_OVER, GlobalState::LOADING);

  // TODO Update thread locals. That way INFO never blocks
  replica_ = new_replica;
  SetMasterFlagOnAllThreads(false);

  if (on_error == ActionOnConnectionFail::kReturnOnError) {
    replica_->StartMainReplicationFiber(last_master_data);
  }

  cmd_cntx->rb()->SendOk();
}

// REPLTAKEOVER <seconds> [SAVE]
// SAVE is used only by tests.
void ServerFamily::ReplTakeOver(CmdArgList args, CommandContext* cmd_cntx) {
  VLOG(1) << "ReplTakeOver start";

  CmdArgParser parser{args};

  int timeout_sec = parser.Next<int>();
  bool save_flag = static_cast<bool>(parser.Check("SAVE"));

  auto* builder = cmd_cntx->rb();
  if (parser.HasNext())
    return cmd_cntx->SendError(absl::StrCat("Unsupported option:", string_view(parser.Next())));

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  // We allow zero timeouts for tests.
  if (timeout_sec < 0) {
    return cmd_cntx->SendError("timeout is negative");
  }

  // We return OK, to support idempotency semantics.
  if (IsMaster())
    return builder->SendOk();

  util::fb2::LockGuard lk(replicaof_mu_);

  auto repl_ptr = replica_;
  CHECK(repl_ptr);

  // Start journal to allow partial sync from same source master
  StartJournalInShardThreads(repl_ptr.get());

  auto info = replica_->GetSummary();
  if (!info.full_sync_done) {
    return cmd_cntx->SendError("Full sync not done");
  }

  std::error_code res = replica_->TakeOver(timeout_sec, save_flag);
  if (res) {
    LOG(WARNING) << "Takeover failed with error: " << res << " - " << res.message();
    return cmd_cntx->SendError(absl::StrCat("Couldn't execute takeover: ", res.message()));
  }

  LOG(INFO) << "Takeover successful, promoting this instance to master.";

  if (IsClusterEnabled()) {
    service().cluster_family().ReconcileReplicaSlots();
  }

  last_master_data_ = replica_->Stop();
  replica_.reset();

  SetMasterFlagOnAllThreads(true);
  return builder->SendOk();
}

void ServerFamily::ReplConf(CmdArgList args, CommandContext* cmd_cntx) {
  auto* builder = cmd_cntx->rb();
  {
    util::fb2::LockGuard lk(replicaof_mu_);
    if (!IsMaster()) {
      return cmd_cntx->SendError("Replicating a replica is unsupported");
    }
  }

  auto err_cb = [&]() mutable {
    LOG(ERROR) << "Error in receiving command: " << args;
    cmd_cntx->SendError(kSyntaxErr);
  };

  if (args.size() % 2 == 1)
    return err_cb();

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  for (unsigned i = 0; i < args.size(); i += 2) {
    DCHECK_LT(i + 1, args.size());

    string cmd = absl::AsciiStrToUpper(ArgS(args, i));
    std::string_view arg = ArgS(args, i + 1);
    if (cmd == "CAPA") {
      if (arg == "dragonfly" && args.size() == 2 && i == 0) {
        auto [sid, flow_count] = dfly_cmd_->CreateSyncSession(&cntx->conn_state);
        cntx->conn()->SetName(absl::StrCat("repl_ctrl_", sid));

        string sync_id = absl::StrCat("SYNC", sid);
        cntx->conn_state.replication_info.repl_session_id = sid;

        cntx->replica_conn = true;

        // The response for 'capa dragonfly' is: <masterid> <syncid> <numthreads> <version>
        auto* rb = static_cast<RedisReplyBuilder*>(builder);
        rb->StartArray(4);
        rb->SendSimpleString(master_replid_);
        rb->SendSimpleString(sync_id);
        rb->SendLong(flow_count);
        rb->SendLong(unsigned(DflyVersion::CURRENT_VER));
        return;
      }
    } else if (cmd == "LISTENING-PORT") {
      uint32_t replica_listening_port;
      if (!absl::SimpleAtoi(arg, &replica_listening_port)) {
        return cmd_cntx->SendError(kInvalidIntErr);
      }
      cntx->conn_state.replication_info.repl_listening_port = replica_listening_port;
      // We set a default value of ip_address here, because LISTENING-PORT is a mandatory field
      // but IP-ADDRESS is optional
      if (cntx->conn_state.replication_info.repl_ip_address.empty()) {
        cntx->conn_state.replication_info.repl_ip_address = cntx->conn()->RemoteEndpointAddress();
      }
    } else if (cmd == "IP-ADDRESS") {
      cntx->conn_state.replication_info.repl_ip_address = arg;
    } else if (cmd == "CLIENT-ID" && args.size() == 2) {
      auto info = dfly_cmd_->GetReplicaInfoFromConnection(&cntx->conn_state);
      DCHECK(info != nullptr);
      if (info) {
        info->id = arg;
      }
    } else if (cmd == "CLIENT-VERSION" && args.size() == 2) {
      unsigned version;
      if (!absl::SimpleAtoi(arg, &version)) {
        return cmd_cntx->SendError(kInvalidIntErr);
      }
      dfly_cmd_->SetDflyClientVersion(&cntx->conn_state, DflyVersion(version));
    } else if (cmd == "ACK" && args.size() == 2) {
      // Don't send error/Ok back through the socket, because we don't want to interleave with
      // the journal writes that we write into the same socket.

      if (!cntx->master_repl_flow) {
        LOG(ERROR) << "No replication flow assigned";
        return;
      }

      uint64_t ack;
      if (!absl::SimpleAtoi(arg, &ack)) {
        LOG(ERROR) << "Bad int in REPLCONF ACK command! arg=" << arg;
        return;
      }
      VLOG(2) << "Received client ACK=" << ack;
      cntx->master_repl_flow->last_acked_lsn = ack;
      return;
    } else {
      VLOG(1) << "Error " << cmd << " " << arg << " " << args.size();
      return err_cb();
    }
  }

  return builder->SendOk();
}

void ServerFamily::Role(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  util::fb2::LockGuard lk(replicaof_mu_);
  // Thread local var is_master is updated under mutex replicaof_mu_ together with replica_,
  // ensuring eventual consistency of is_master. When determining if the server is a replica and
  // accessing the replica_ object, we must lock replicaof_mu_. Using is_master alone is
  // insufficient in this scenario.
  if (!replica_) {
    rb->StartArray(2);
    rb->SendBulkString("master");
    auto vec = dfly_cmd_->GetReplicasRoleInfo();
    rb->StartArray(vec.size());
    for (auto& data : vec) {
      rb->StartArray(3);
      rb->SendBulkString(data.address);
      rb->SendBulkString(absl::StrCat(data.listening_port));
      rb->SendBulkString(data.state);
    }

  } else {
    rb->StartArray(4 + cluster_replicas_.size() * 3);
    rb->SendBulkString(GetFlag(FLAGS_info_replication_valkey_compatible) ? "slave" : "replica");

    auto send_replica_info = [rb](const Replica::Summary& rinfo) {
      rb->SendBulkString(rinfo.host);
      rb->SendBulkString(absl::StrCat(rinfo.port));
      if (rinfo.full_sync_done) {
        rb->SendBulkString(GetFlag(FLAGS_info_replication_valkey_compatible) ? "online"
                                                                             : "stable_sync");
      } else if (rinfo.full_sync_in_progress) {
        rb->SendBulkString("full_sync");
      } else if (rinfo.master_link_established) {
        rb->SendBulkString("preparation");
      } else {
        rb->SendBulkString("connecting");
      }
    };
    send_replica_info(replica_->GetSummary());
    for (const auto& replica : cluster_replicas_) {
      send_replica_info(replica->GetSummary());
    }
  }
}

void ServerFamily::Script(CmdArgList args, CommandContext* cmd_cntx) {
  script_mgr_->Run(args, cmd_cntx->tx(), cmd_cntx->rb(), cmd_cntx->server_conn_cntx());
}

void ServerFamily::LastSave(CmdArgList args, CommandContext* cmd_cntx) {
  auto info = thread_safe_save_info_.Get();
  cmd_cntx->rb()->SendLong(info.save_time);
}

void ServerFamily::Latency(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (sub_cmd == "LATEST" || sub_cmd == "HISTOGRAM") {
    return rb->SendEmptyArray();
  }

  return cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "LATENCY"), kSyntaxErrType);
}

void ServerFamily::ShutdownCmd(CmdArgList args, CommandContext* cmd_cntx) {
  // Supported options (case-insensitive):
  // SAVE | NOSAVE, NOW, FORCE, ABORT, SAFE (Valkey-specific, the same as SAVE in Dragonfly)
  enum ShutBits : uint32_t {
    SB_SAVE = 1u << 0,
    SB_NOSAVE = 1u << 1,
    SB_NOW = 1u << 2,
    SB_FORCE = 1u << 3,
    SB_ABORT = 1u << 4,
  };

  uint32_t sb = 0;

  CmdArgParser parser(args);
  while (parser.HasNext()) {
    // Map SAFE to SAVE directly (fallthrough behavior)
    ShutBits opt = parser.MapNext("SAVE", SB_SAVE, "NOSAVE", SB_NOSAVE, "NOW", SB_NOW, "FORCE",
                                  SB_FORCE, "ABORT", SB_ABORT, "SAFE", SB_SAVE);
    sb |= static_cast<uint32_t>(opt);
  }

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  // Conflicting toggles
  if ((sb & SB_SAVE) && (sb & SB_NOSAVE)) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  if (sb & SB_ABORT) {
    // We currently do not support aborting an in-progress shutdown sequence.
    return cmd_cntx->SendError("SHUTDOWN ABORT is not supported");
  }

  // Configure save behavior on shutdown according to options.
  if (sb & SB_FORCE) {
    // FORCE implies no snapshot on shutdown regardless of SAVE/SAFE
    save_on_shutdown_ = false;
  } else if (sb & SB_NOSAVE) {
    save_on_shutdown_ = false;
  } else if (sb & SB_SAVE) {
    save_on_shutdown_ = true;
  }

  // Wire NOW/FORCE to a single fast-shutdown flag for listeners.
  facade::g_shutdown_fast.store((sb & (SB_NOW | SB_FORCE)) != 0, std::memory_order_seq_cst);

  CHECK_NOTNULL(acceptor_)->Stop();
  cmd_cntx->rb()->SendOk();

  // Reset flag for any subsequent restarts (mainly for tests).
  facade::g_shutdown_fast.store(false, std::memory_order_seq_cst);
}

void ServerFamily::Dfly(CmdArgList args, CommandContext* cmd_cntx) {
  dfly_cmd_->Run(args, cmd_cntx);
}

void ServerFamily::SlowLog(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (sub_cmd == "HELP") {
    string_view help[] = {
        "SLOWLOG <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        "GET [<count>]",
        "    Return top <count> entries from the slowlog (default: 10, -1 mean all).",
        "    Entries are made of:",
        "    id, timestamp, time in microseconds, arguments array, client IP and port,",
        "    client name",
        "LEN",
        "    Return the length of the slowlog.",
        "RESET",
        "    Reset the slowlog.",
        "HELP",
        "    Prints this help.",
    };

    rb->SendSimpleStrArr(help);
    return;
  }

  if (sub_cmd == "LEN") {
    vector<int> lengths(service_.proactor_pool().size());
    service_.proactor_pool().AwaitFiberOnAll([&lengths](auto index, auto* context) {
      lengths[index] = ServerState::tlocal()->GetSlowLog().Length();
    });
    int sum = std::accumulate(lengths.begin(), lengths.end(), 0);
    return rb->SendLong(sum);
  }

  if (sub_cmd == "RESET") {
    service_.proactor_pool().AwaitFiberOnAll(
        [](auto index, auto* context) { ServerState::tlocal()->GetSlowLog().Reset(); });
    return rb->SendOk();
  }

  if (sub_cmd == "GET") {
    return SlowLogGet(args, sub_cmd, &service_.proactor_pool(), cmd_cntx);
  }
  cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "SLOWLOG"), kSyntaxErrType);
}

void ServerFamily::Module(CmdArgList args, CommandContext* cmd_cntx) {
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (sub_cmd != "LIST")
    return cmd_cntx->SendError(kSyntaxErr);

  rb->StartArray(2);

  // Json
  rb->StartCollection(2, CollectionType::MAP);
  rb->SendSimpleString("name");
  rb->SendSimpleString("ReJSON");
  rb->SendSimpleString("ver");
  rb->SendLong(20'808);

  // Search
  rb->StartCollection(2, CollectionType::MAP);
  rb->SendSimpleString("name");
  rb->SendSimpleString("search");
  rb->SendSimpleString("ver");
  rb->SendLong(21'015);  // we target v2
}

void ServerFamily::ClientPauseCmd(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  auto listeners = GetNonPriviligedListeners();

  auto timeout = parser.Next<uint64_t>();
  ClientPause pause_state = ClientPause::ALL;
  if (parser.HasNext()) {
    pause_state = parser.MapNext("WRITE", ClientPause::WRITE, "ALL", ClientPause::ALL);
  }
  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  const auto timeout_ms = timeout * 1ms;
  auto is_pause_in_progress = [this, end_time = chrono::steady_clock::now() + timeout_ms] {
    return ServerState::tlocal()->gstate() != GlobalState::SHUTTING_DOWN &&
           chrono::steady_clock::now() < end_time && is_c_pause_in_progress_.load();
  };

  auto cleanup = [this] {
    active_pauses_.fetch_sub(1);
    client_pause_ec_.notify();
  };

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  if (auto pause_fb_opt = Pause(listeners, cntx->ns, cntx->conn(), pause_state,
                                std::move(is_pause_in_progress), cleanup);
      pause_fb_opt) {
    is_c_pause_in_progress_.store(true);
    active_pauses_.fetch_add(1);
    pause_fb_opt->Detach();
    return cmd_cntx->rb()->SendOk();
  }
  cmd_cntx->SendError("Failed to pause all running clients");
}

#define HFUNC(x) SetHandler(HandlerFunc(this, &ServerFamily::x))

namespace acl {
constexpr uint32_t kAuth = FAST | CONNECTION;
constexpr uint32_t kBGSave = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kClient = SLOW | CONNECTION;
constexpr uint32_t kConfig = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kDbSize = KEYSPACE | READ | FAST;
constexpr uint32_t kDebug = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kFlushDB = KEYSPACE | WRITE | SLOW | DANGEROUS;
constexpr uint32_t kFlushAll = KEYSPACE | WRITE | SLOW | DANGEROUS;
constexpr uint32_t kInfo = SLOW | DANGEROUS;
constexpr uint32_t kHello = FAST | CONNECTION;
constexpr uint32_t kLastSave = ADMIN | FAST | DANGEROUS;
constexpr uint32_t kLatency = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kMemory = READ | SLOW;
constexpr uint32_t kSave = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kShutDown = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kSlaveOf = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kReplicaOf = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kReplTakeOver = DANGEROUS;
constexpr uint32_t kReplConf = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kRole = ADMIN | FAST | DANGEROUS;
constexpr uint32_t kSlowLog = ADMIN | SLOW | DANGEROUS;
constexpr uint32_t kScript = SLOW | SCRIPTING;
constexpr uint32_t kModule = ADMIN | SLOW | DANGEROUS;
// TODO(check this)
constexpr uint32_t kDfly = ADMIN;
}  // namespace acl

void ServerFamily::Register(CommandRegistry* registry) {
  constexpr auto kReplicaOpts = CO::LOADING | CO::ADMIN | CO::GLOBAL_TRANS;
  constexpr auto kMemOpts = CO::LOADING | CO::READONLY | CO::FAST;
  registry->StartFamily();
  *registry
      << CI{"AUTH", CO::NOSCRIPT | CO::FAST | CO::LOADING, -2, 0, 0, acl::kAuth}.HFUNC(Auth)
      << CI{"BGSAVE", CO::ADMIN | CO::GLOBAL_TRANS, -1, 0, 0, acl::kBGSave}.HFUNC(BgSave)
      << CI{"CLIENT", CO::NOSCRIPT | CO::LOADING, -2, 0, 0, acl::kClient}.HFUNC(Client)
      << CI{"CONFIG", CO::ADMIN | CO::LOADING | CO::DANGEROUS, -2, 0, 0, acl::kConfig}.HFUNC(Config)
      << CI{"DBSIZE", CO::READONLY | CO::FAST | CO::LOADING, 1, 0, 0, acl::kDbSize}.HFUNC(DbSize)
      << CI{"DEBUG", CO::ADMIN | CO::LOADING, -2, 0, 0, acl::kDebug}.HFUNC(Debug)
      << CI{"FLUSHDB", CO::JOURNALED | CO::GLOBAL_TRANS | CO::DANGEROUS, -1, 0, 0, acl::kFlushDB}
             .HFUNC(FlushDb)
      << CI{"FLUSHALL", CO::JOURNALED | CO::GLOBAL_TRANS | CO::DANGEROUS, -1, 0, 0, acl::kFlushAll}
             .HFUNC(FlushDb)
      << CI{"INFO", CO::LOADING, -1, 0, 0, acl::kInfo}.HFUNC(Info)
      << CI{"HELLO", CO::LOADING, -1, 0, 0, acl::kHello}.HFUNC(Hello)
      << CI{"LASTSAVE", CO::LOADING | CO::FAST, 1, 0, 0, acl::kLastSave}.HFUNC(LastSave)
      << CI{"LATENCY", CO::NOSCRIPT | CO::LOADING | CO::FAST, -2, 0, 0, acl::kLatency}.HFUNC(
             Latency)
      << CI{"MEMORY", kMemOpts, -2, 0, 0, acl::kMemory}.HFUNC(Memory)
      << CI{"SHRINK", CO::JOURNALED | CO::FAST, 2, 1, 1, acl::kMemory}.HFUNC(Shrink)
      << CI{"SAVE", CO::ADMIN | CO::GLOBAL_TRANS, -1, 0, 0, acl::kSave}.HFUNC(Save)
      << CI{"SHUTDOWN",    CO::ADMIN | CO::NOSCRIPT | CO::LOADING | CO::DANGEROUS, -1, 0, 0,
            acl::kShutDown}
             .HFUNC(ShutdownCmd)
      << CI{"SLAVEOF", kReplicaOpts, 3, 0, 0, acl::kSlaveOf}.HFUNC(ReplicaOf)
      << CI{"REPLICAOF", kReplicaOpts, -3, 0, 0, acl::kReplicaOf}.HFUNC(ReplicaOf)
      << CI{"ADDREPLICAOF", kReplicaOpts, 5, 0, 0, acl::kReplicaOf}.HFUNC(AddReplicaOf)
      << CI{"REPLTAKEOVER", CO::ADMIN | CO::GLOBAL_TRANS, -2, 0, 0, acl::kReplTakeOver}.HFUNC(
             ReplTakeOver)
      << CI{"REPLCONF", CO::ADMIN | CO::LOADING, -1, 0, 0, acl::kReplConf}.HFUNC(ReplConf)
      << CI{"ROLE", CO::LOADING | CO::FAST | CO::NOSCRIPT, 1, 0, 0, acl::kRole}.HFUNC(Role)
      << CI{"SLOWLOG", CO::ADMIN | CO::FAST, -2, 0, 0, acl::kSlowLog}.HFUNC(SlowLog)
      << CI{"SCRIPT", CO::NOSCRIPT | CO::NO_KEY_TRANSACTIONAL, -2, 0, 0, acl::kScript}.HFUNC(Script)
      << CI{"DFLY", CO::ADMIN | CO::GLOBAL_TRANS | CO::HIDDEN, -2, 0, 0, acl::kDfly}.HFUNC(Dfly)
      << CI{"MODULE", CO::ADMIN, 2, 0, 0, acl::kModule}.HFUNC(Module);
}

}  // namespace dfly


================================================
FILE: src/server/server_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <functional>
#include <memory>
#include <optional>
#include <string>

#include "core/qlist.h"
#include "facade/facade_stats.h"
#include "facade/facade_types.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/replica_types.h"
#include "server/server_state.h"
#include "server/stats.h"
#include "util/fibers/fiberqueue_threadpool.h"
#include "util/fibers/future.h"

struct hdr_histogram;

namespace facade {
class Listener;
}  // namespace facade

namespace util {

class AcceptServer;
class HttpListenerBase;

}  // namespace util

namespace dfly {

namespace detail {

struct SaveStagesController;
class SnapshotStorage;

}  // namespace detail

std::string GetPassword();

class CommandContext;
class CommandRegistry;
class DflyCmd;
class Replica;
class Service;
class ScriptMgr;
class RdbLoadContext;

struct ReplicaRoleInfo {
  std::string id;
  std::string address;
  uint32_t listening_port;
  std::string_view state;
  uint64_t lsn_lag;
};

struct ReplicationMemoryStats {
  size_t streamer_buf_capacity_bytes = 0;  // total capacities of streamer buffers
  size_t full_sync_buf_bytes = 0;          // total bytes used for full sync buffers
};

struct LoadingStats {
  size_t restore_count = 0;
  size_t failed_restore_count = 0;

  size_t backup_count = 0;
  size_t failed_backup_count = 0;
};

// Global peak stats recorded after aggregating metrics over all shards.
// Note that those values are only updated during GetMetrics calls.
struct PeakStats {
  size_t conn_dispatch_queue_bytes = 0;  // peak value of conn_stats.dispatch_queue_bytes
  size_t conn_read_buf_capacity = 0;     // peak of total read buf capcacities
};

// Aggregated metrics over multiple sources on all shards
struct Metrics {
  SliceEvents events;              // general keyspace stats
  std::vector<DbStats> db_stats;   // dbsize stats
  EngineShard::Stats shard_stats;  // per-shard stats

  facade::FacadeStats facade_stats;  // client stats and buffer sizes
  TieredStats tiered_stats;

  SearchStats search_stats;
  ServerState::Stats coordinator_stats;  // stats on transaction running
  PeakStats peak_stats;
  QList::Stats qlist_stats;

  size_t qps = 0;

  size_t used_mem_peak = 0;
  size_t used_mem_rss_peak = 0;

  size_t heap_used_bytes = 0;
  size_t small_string_bytes = 0;
  uint32_t traverse_ttl_per_sec = 0;
  uint32_t delete_ttl_per_sec = 0;
  uint64_t hoffman_encode_total = 0, hoffman_encode_success = 0;
  uint64_t fiber_switch_cnt = 0;
  uint64_t fiber_switch_delay_usec = 0;
  uint64_t tls_bytes = 0;
  uint64_t refused_conn_max_clients_reached_count = 0;
  uint64_t serialization_bytes = 0;

  // Statistics about fibers running for a long time (more than 1ms).
  uint64_t fiber_longrun_cnt = 0;
  uint64_t fiber_longrun_usec = 0;

  // Max length of the all the tx shard-queues.
  uint32_t tx_queue_len = 0;
  uint32_t worker_fiber_count = 0;
  uint32_t blocked_tasks = 0;
  size_t worker_fiber_stack_size = 0;

  size_t lsn_buffer_size = 0;
  size_t lsn_buffer_bytes = 0;

  // monotonic timestamp (ProactorBase::GetMonotonicTimeNs) of the connection stuck on send
  // for longest time.
  uint64_t oldest_pending_send_ts = uint64_t(-1);

  InterpreterManager::Stats lua_stats;

  // command call frequencies (count, aggregated latency in usec).
  std::map<std::string, std::pair<uint64_t, uint64_t>> cmd_stats_map;

  absl::flat_hash_map<std::string, uint64_t> connections_lib_name_ver_map;

  struct ReplicaInfo {
    ReplicaSummary summary;

    // cluster
    std::vector<ReplicaSummary> cl_repl_summary;
  };

  // Replica reconnect stats on the replica side. Undefined for master
  std::optional<ReplicaInfo> replica_side_info;

  size_t migration_errors_total;

  LoadingStats loading_stats;

  absl::flat_hash_map<std::string, hdr_histogram*> cmd_latency_map;

  InternedStringStats interned_string_stats;
};

// Contains the state of the last save operation.
// This object is immutable.
struct SaveInfoData {
  time_t save_time = 0;  // epoch time in seconds.
  uint32_t success_duration_sec = 0;
  std::string file_name;
  std::vector<std::pair<std::string_view, size_t>> freq_map;  // RDB_TYPE_xxx -> count mapping.

  // last error save info
  GenericError last_error;
  time_t last_error_time = 0;      // epoch time in seconds.
  time_t failed_duration_sec = 0;  // epoch time in seconds.

  // false if last attempt failed
  bool last_bgsave_status = true;
  bool bgsave_in_progress = false;
};

// A thread-safe wrapper for SaveInfoData using the Copy-on-Write pattern.
class ThreadSafeSaveInfo {
 public:
  // Returns a snapshot of the current save info.
  SaveInfoData Get() const {
    std::lock_guard<util::fb2::Mutex> lock(data_mutex_);
    return data_;
  }

  // The modifier function is called under a lock.
  void Update(std::function<void(SaveInfoData*)> modifier) {
    std::lock_guard<util::fb2::Mutex> lock(writer_mutex_);
    SaveInfoData new_data(Get());
    modifier(&new_data);
    UpdateData(new_data);
  }

 private:
  void UpdateData(const SaveInfoData& new_data) {
    std::lock_guard<util::fb2::Mutex> lock(data_mutex_);
    data_ = new_data;
  }

  mutable util::fb2::Mutex writer_mutex_;
  mutable util::fb2::Mutex data_mutex_;
  SaveInfoData data_;
};

struct SnapshotSpec {
  std::string hour_spec;
  std::string minute_spec;
};

struct ReplicaOffsetInfo {
  std::string sync_id;
  std::vector<uint64_t> flow_offsets;
};

struct SaveCmdOptions {
  // if new_version is true, saves DF specific, non redis compatible snapshot.
  bool new_version;
  // cloud storage URI
  std::string_view cloud_uri;
  // if basename is not empty it will override dbfilename flag
  std::string_view basename;
};

class ServerFamily {
  using SinkReplyBuilder = facade::SinkReplyBuilder;

 public:
  explicit ServerFamily(Service* service);
  ~ServerFamily();

  void Init(util::AcceptServer* acceptor, std::vector<facade::Listener*> listeners);
  void Register(CommandRegistry* registry);
  void Shutdown() ABSL_LOCKS_EXCLUDED(replicaof_mu_);

  // Public because is used by DflyCmd.
  void ShutdownCmd(CmdArgList args, CommandContext* cmd_cntx);

  Service& service() {
    return service_;
  }

  void ResetStat(Namespace* ns);

  Metrics GetMetrics(Namespace* ns) const;

  std::string FormatInfoMetrics(const Metrics& metrics, std::string_view section,
                                bool priveleged) const;

  ScriptMgr* script_mgr() {
    return script_mgr_.get();
  }

  const ScriptMgr* script_mgr() const {
    return script_mgr_.get();
  }

  void StatsMC(std::string_view section, CommandContext* cmd_ctx);

  GenericError DoSave(const SaveCmdOptions& save_cmd_opts, Transaction* transaction,
                      bool ignore_state = false);

  // Calls DoSave with a default generated transaction and with the format
  // specified in --df_snapshot_format
  GenericError DoSave(bool ignore_state = false);

  // Burns down and destroy all the data from the database.
  // if kDbAll is passed, burns all the databases to the ground.
  // `wait` makes it wait for all fibers to finish and decommit
  void Drakarys(Transaction* transaction, DbIndex db_ind, bool wait);

  SaveInfoData GetLastSaveInfo() const;

  void FlushAll(Namespace* ns);

  // Load snapshot from file (.rdb file or summary.dfs file) and return
  // future with error_code.
  enum class LoadExistingKeys : uint8_t { kFail, kOverride };
  std::optional<util::fb2::Future<GenericError>> Load(const std::string& file_name,
                                                      LoadExistingKeys existing_keys);

  bool TEST_IsSaving() const;

  void ConfigureMetrics(util::HttpListenerBase* listener);

  void PauseReplication(bool pause) ABSL_LOCKS_EXCLUDED(replicaof_mu_);
  std::optional<ReplicaOffsetInfo> GetReplicaOffsetInfo() ABSL_LOCKS_EXCLUDED(replicaof_mu_);

  const std::string& master_replid() const {
    return master_replid_;
  }

  DflyCmd* GetDflyCmd() const {
    return dfly_cmd_.get();
  }

  std::optional<LastMasterSyncData> GetLastMasterData() const {
    return last_master_data_;
  }

  absl::Span<facade::Listener* const> GetListeners() const {
    return listeners_;
  }

  std::vector<facade::Listener*> GetNonPriviligedListeners() const;

  // Replica-side method. Returns replication summary if this server is a replica,
  // nullopt otherwise.
  std::optional<Metrics::ReplicaInfo> GetReplicaSummary() const;

  void OnClose(ConnectionContext* cntx);

  void CancelBlockingOnThread(std::function<facade::OpStatus(facade::ArgSlice)> = {});

  // Sets the server to replicate another instance. Does not flush the database beforehand!
  void Replicate(std::string_view host, std::string_view port);

  void UpdateMemoryGlobalStats();

  // Return true if no replicas are registered or if all replicas reached stable sync
  // Used in debug populate to DCHECK insocsistent flows that violate transaction gurantees
  bool AreAllReplicasInStableSync() const;

 private:
  // Helper to safely get save controller copy
  std::shared_ptr<detail::SaveStagesController> GetSaveController() const {
    util::fb2::LockGuard lk{save_mu_};
    return save_controller_;
  }

  bool HasPrivilegedInterface();
  void JoinSnapshotSchedule();
  void LoadFromSnapshot() ABSL_LOCKS_EXCLUDED(loading_stats_mu_);

  uint32_t shard_count() const {
    return shard_set->size();
  }

  void Auth(CmdArgList args, CommandContext* cmd_cntx);
  void Client(CmdArgList args, CommandContext* cmd_cntx);
  void Config(CmdArgList args, CommandContext* cmd_cntx);
  void DbSize(CmdArgList args, CommandContext* cmd_cntx);
  void Debug(CmdArgList args, CommandContext* cmd_cntx);
  void Dfly(CmdArgList args, CommandContext* cmd_cntx);
  void Memory(CmdArgList args, CommandContext* cmd_cntx);
  void Shrink(CmdArgList args, CommandContext* cmd_cntx);
  void FlushDb(CmdArgList args, CommandContext* cmd_cntx);
  void Info(CmdArgList args, CommandContext* cmd_cntx) ABSL_LOCKS_EXCLUDED(replicaof_mu_);
  void Hello(CmdArgList args, CommandContext* cmd_cntx);
  void LastSave(CmdArgList args, CommandContext* cmd_cntx);
  void Latency(CmdArgList args, CommandContext* cmd_cntx);
  void ReplicaOf(CmdArgList args, CommandContext* cmd_cntx);
  void AddReplicaOf(CmdArgList args, CommandContext* cmd_cntx);
  void ReplTakeOver(CmdArgList args, CommandContext* cmd_cntx) ABSL_LOCKS_EXCLUDED(replicaof_mu_);
  void ReplConf(CmdArgList args, CommandContext* cmd_cntx);
  void Role(CmdArgList args, CommandContext* cmd_cntx) ABSL_LOCKS_EXCLUDED(replicaof_mu_);
  void Save(CmdArgList args, CommandContext* cmd_cntx);
  void BgSave(CmdArgList args, CommandContext* cmd_cntx);
  void Script(CmdArgList args, CommandContext* cmd_cntx);
  void SlowLog(CmdArgList args, CommandContext* cmd_cntx);
  void Module(CmdArgList args, CommandContext* cmd_cntx);

  void SyncGeneric(std::string_view repl_master_id, uint64_t offs, ConnectionContext* cntx);

  enum ActionOnConnectionFail {
    kReturnOnError,        // if we fail to connect to master, return to err
    kContinueReplication,  // continue attempting to connect to master, regardless of initial
                           // failure
  };

  // REPLICAOF implementation. See arguments above
  void ReplicaOfInternal(CmdArgList args, CommandContext* cmnd_cntx,
                         ActionOnConnectionFail on_error) ABSL_LOCKS_EXCLUDED(replicaof_mu_);

  void StartJournalInShardThreads(Replica* repl_ptr);

  void ReplicaOfNoOne(SinkReplyBuilder* builder) ABSL_LOCKS_EXCLUDED(replicaof_mu_);

  // REPLICAOF implementation without two phase locking.
  void ReplicaOfInternalV2(CmdArgList args, CommandContext* cmnd_cntx,
                           ActionOnConnectionFail on_error) ABSL_LOCKS_EXCLUDED(replicaof_mu_);

  struct LoadOptions {
    std::string snapshot_id;
    uint32_t shard_count = 0;      // Shard count of the snapshot being loaded.
    uint64_t num_loaded_keys = 0;  // Number of keys loaded.
  };

  // Updates LoadOptions if successful. If snapshot_id and shard_count are passed in,
  // may use them for consistency checks.
  std::error_code LoadRdb(const std::string& rdb_file, LoadExistingKeys existing_keys,
                          LoadOptions* load_opts, RdbLoadContext* load_context);

  void SnapshotScheduling() ABSL_LOCKS_EXCLUDED(loading_stats_mu_);

  void SendInvalidationMessages() const;

  std::optional<SaveCmdOptions> GetSaveCmdOpts(CmdArgList args, CommandContext* cmd_cntx);

  void BgSaveFb(boost::intrusive_ptr<Transaction> trans);

  struct DoSaveCheckAndStartOpts {
    bool ignore_state = false;
    bool bg_save = false;
  };

  GenericError DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_opts, Transaction* trans,
                                   DoSaveCheckAndStartOpts opts) ABSL_LOCKS_EXCLUDED(save_mu_);

  GenericError WaitUntilSaveFinished(Transaction* trans,
                                     bool ignore_state = false) ABSL_NO_THREAD_SAFETY_ANALYSIS;
  void StopAllClusterReplicas() ABSL_EXCLUSIVE_LOCKS_REQUIRED(replicaof_mu_);

  static bool DoAuth(ConnectionContext* cntx, std::string_view username, std::string_view password);

  void ClientPauseCmd(CmdArgList args, CommandContext* cmd_cntx);
  void ClientUnPauseCmd(CmdArgList args, CommandContext* cmd_cntx);

  // Set accepting_connections_ and update listners according to it
  void ChangeConnectionAccept(bool accept);

  util::fb2::Fiber snapshot_schedule_fb_;
  util::fb2::Fiber load_fiber_;

  Service& service_;

  util::AcceptServer* acceptor_ = nullptr;
  std::vector<facade::Listener*> listeners_;
  bool accepting_connections_ = true;  // reject connections near oom
  util::ProactorBase* pb_task_ = nullptr;

  mutable util::fb2::Mutex replicaof_mu_, save_mu_;
  std::shared_ptr<Replica> replica_ ABSL_GUARDED_BY(replicaof_mu_);
  std::vector<std::unique_ptr<Replica>> cluster_replicas_
      ABSL_GUARDED_BY(replicaof_mu_);  // used to replicating multiple nodes to single dragonfly

  std::unique_ptr<ScriptMgr> script_mgr_;
  std::unique_ptr<DflyCmd> dfly_cmd_;

  std::string master_replid_;
  std::optional<LastMasterSyncData> last_master_data_;

  time_t start_time_ = 0;  // in seconds, epoch time.

  ThreadSafeSaveInfo thread_safe_save_info_;
  std::shared_ptr<detail::SaveStagesController> save_controller_ ABSL_GUARDED_BY(save_mu_);

  // Used to override save on shutdown behavior that is usually set
  // be --dbfilename.
  bool save_on_shutdown_{true};

  util::fb2::Done schedule_done_;
  std::unique_ptr<util::fb2::FiberQueueThreadPool> fq_threadpool_;
  std::shared_ptr<detail::SnapshotStorage> snapshot_storage_;

  std::atomic<bool> is_c_pause_in_progress_ = false;
  // We need this because if dragonfly shuts down during pause, ServerState will destruct
  // before the dettached fiber Pause() causing a seg fault.
  std::atomic<size_t> active_pauses_ = 0;
  util::fb2::EventCount client_pause_ec_;

  // protected by save_mu_
  util::fb2::Fiber bg_save_fb_;

  mutable util::fb2::Mutex peak_stats_mu_;
  mutable PeakStats peak_stats_;

  mutable util::fb2::Mutex loading_stats_mu_;
  LoadingStats loading_stats_ ABSL_GUARDED_BY(loading_stats_mu_);

  bool legacy_format_metrics_ = true;
};

// Reusable CLIENT PAUSE implementation that blocks while polling is_pause_in_progress
std::optional<util::fb2::Fiber> Pause(std::vector<facade::Listener*> listeners, Namespace* ns,
                                      facade::Connection* conn, ClientPause pause_state,
                                      std::function<bool()> is_pause_in_progress,
                                      std::function<void()> maybe_cleanup = {});

}  // namespace dfly


================================================
FILE: src/server/server_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/server_family.h"

#include <absl/strings/match.h>

#include "absl/strings/str_cat.h"
#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "facade/socket_utils.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;
using namespace boost;

ABSL_DECLARE_FLAG(string, cluster_mode);

namespace dfly {

class ServerFamilyTest : public BaseFamilyTest {
 protected:
};

#ifdef __linux__
TEST_F(ServerFamilyTest, ReadTcpInfo) {
  // Create a TCP socket
  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
  ASSERT_GT(sockfd, 0) << "Failed to create socket";

  // We'll create a socket in LISTEN state
  struct sockaddr_in server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_addr.s_addr = INADDR_ANY;
  server_addr.sin_port = 0;  // Let the system choose a free port

  // Bind to the port
  ASSERT_EQ(bind(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)), 0)
      << "Failed to bind socket: " << strerror(errno);

  // Start listening
  ASSERT_EQ(listen(sockfd, 1), 0) << "Failed to listen on socket: " << strerror(errno);

  // Get socket info
  std::string socket_info = GetSocketInfo(sockfd);
  std::cout << "Socket info for valid socket: " << socket_info << std::endl;
  EXPECT_FALSE(socket_info.empty()) << "Socket info should not be empty";

  // The socket info should contain some recognizable patterns
  // For a listening socket, it should contain information about the local address
  EXPECT_NE(socket_info.find("State: LISTEN"), std::string::npos)
      << "Socket info doesn't contain expected local address pattern";

  // Close the socket
  close(sockfd);

  // Test invalid socket
  socket_info = GetSocketInfo(-1);
  EXPECT_EQ(socket_info, "invalid socket");
}

TEST_F(ServerFamilyTest, GetTcpSocketInfoIPv6) {
  // Create an IPv6 TCP socket
  int sockfd = socket(AF_INET6, SOCK_STREAM, 0);
  ASSERT_GT(sockfd, 0) << "Failed to create IPv6 socket";

  // We'll create a socket in LISTEN state
  struct sockaddr_in6 server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin6_family = AF_INET6;
  server_addr.sin6_addr = in6addr_any;
  server_addr.sin6_port = 0;  // Let the system choose a free port

  // Bind to the port
  ASSERT_EQ(bind(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)), 0)
      << "Failed to bind IPv6 socket: " << strerror(errno);

  // Start listening
  ASSERT_EQ(listen(sockfd, 1), 0) << "Failed to listen on IPv6 socket: " << strerror(errno);

  // Get socket info
  std::string socket_info = GetSocketInfo(sockfd);
  std::cout << "Socket info for valid IPv6 socket: " << socket_info << std::endl;
  EXPECT_FALSE(socket_info.empty()) << "IPv6 socket info should not be empty";

  // The socket info should contain some recognizable patterns
  // For a listening IPv6 socket, it should contain information about the local address
  EXPECT_NE(socket_info.find("State: LISTEN"), std::string::npos)
      << "IPv6 socket info doesn't contain expected LISTEN state";

  // If IPv6 support works correctly, the socket info should indicate an IPv6 address format
  EXPECT_NE(socket_info.find("Local: ["), std::string::npos)
      << "IPv6 socket info doesn't use IPv6 address format";

  // Close the socket
  close(sockfd);
}
#endif

TEST_F(ServerFamilyTest, SlowLogTruncation) {
  auto resp = Run({"config", "set", "slowlog_max_len", "3"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"config", "set", "slowlog_log_slower_than", "0"});
  EXPECT_THAT(resp.GetString(), "OK");

  // Test args count truncation: 32 args (no truncation) vs 33 args (truncated)
  std::vector<std::string> cmd_args = {"LPUSH", "mykey"};
  for (int i = 1; i <= 30; ++i) {
    cmd_args.push_back(std::to_string(i));
  }
  resp = Run(absl::Span<std::string>(cmd_args));
  EXPECT_THAT(resp.GetInt(), 30);
  resp = Run({"slowlog", "get"});
  auto slowlog = resp.GetVec();
  EXPECT_THAT(slowlog[0].GetVec()[3].GetVec(), ElementsAreArray(cmd_args));

  cmd_args.push_back("31");
  resp = Run(absl::Span<std::string>(cmd_args));
  EXPECT_THAT(resp.GetInt(), 61);
  resp = Run({"slowlog", "get"});
  slowlog = resp.GetVec();
  auto commands = slowlog[0].GetVec()[3].GetVec();
  EXPECT_THAT(commands.size(), 32);
  EXPECT_THAT(commands[31].GetString(), "... (2 more arguments)");

  // Test args length truncation: 128 bytes (no truncation) vs 129 bytes (truncated)
  std::string at_limit = std::string(128, 'A');
  resp = Run({"lpush", "key1", at_limit});
  resp = Run({"slowlog", "get"});
  slowlog = resp.GetVec();
  EXPECT_THAT(slowlog[0].GetVec()[3].GetVec()[2].GetString(), at_limit);

  std::string over_limit = std::string(129, 'A');
  resp = Run({"lpush", "key2", over_limit});
  resp = Run({"slowlog", "get"});
  slowlog = resp.GetVec();
  auto truncated = slowlog[0].GetVec()[3].GetVec()[2].GetString();
  EXPECT_THAT(truncated, std::string(110, 'A') + "... (1 more bytes)");
}

TEST_F(ServerFamilyTest, SlowLogMaxLengthZero) {
  auto resp = Run({"config", "set", "slowlog_max_len", "0"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"config", "set", "slowlog_log_slower_than", "0"});
  EXPECT_THAT(resp.GetString(), "OK");
  Run({"slowlog", "reset"});

  // issue an arbitrary command
  resp = Run({"set", "foo", "bar"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"slowlog", "get"});

  // slowlog should be empty since max_len is 0
  EXPECT_THAT(resp.GetVec().size(), 0);
}

TEST_F(ServerFamilyTest, SlowLogGetLen) {
  auto resp = Run({"config", "set", "slowlog_max_len", "3"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"config", "set", "slowlog_log_slower_than", "0"});
  EXPECT_THAT(resp.GetString(), "OK");

  for (int i = 1; i <= 3; ++i) {
    resp = Run({"lpush", "mykey", std::to_string(i)});
    EXPECT_THAT(resp.GetInt(), i);
  }

  // Test GET 0 - returns empty
  resp = Run({"slowlog", "get", "0"});
  EXPECT_THAT(resp.GetVec().size(), 0);

  // Test GET -1 - returns all entries
  resp = Run({"slowlog", "get", "-1"});
  EXPECT_THAT(resp.GetVec().size(), 3);

  // Test GET < -1 - returns error
  resp = Run({"slowlog", "get", "-2"});
  EXPECT_THAT(resp.GetString(), "ERR count should be greater than or equal to -1");
}

TEST_F(ServerFamilyTest, SlowLogLen) {
  auto resp = Run({"config", "set", "slowlog_max_len", "3"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"config", "set", "slowlog_log_slower_than", "0"});
  EXPECT_THAT(resp.GetString(), "OK");
  Run({"slowlog", "reset"});

  for (int i = 1; i < 4; ++i) {
    resp = Run({"lpush", "mykey", std::to_string(i)});
    EXPECT_THAT(resp.GetInt(), i);
  }

  resp = Run({"slowlog", "len"});
  EXPECT_THAT(resp.GetInt(), 3);
}

TEST_F(ServerFamilyTest, SlowLogMinusOneDisabled) {
  auto resp = Run({"config", "set", "slowlog_max_len", "3"});
  EXPECT_THAT(resp.GetString(), "OK");
  resp = Run({"config", "set", "slowlog_log_slower_than", "-1"});
  EXPECT_THAT(resp.GetString(), "OK");
  Run({"slowlog", "reset"});

  // issue some commands
  for (int i = 1; i < 4; ++i) {
    resp = Run({"lpush", "mykey", std::to_string(i)});
    EXPECT_THAT(resp.GetInt(), i);
  }

  // slowlog is still empty
  resp = Run({"slowlog", "get"});
  EXPECT_THAT(resp.GetVec().size(), 0);
  resp = Run({"slowlog", "len"});
  EXPECT_THAT(resp.GetInt(), 0);
}

// Test how slowlog captures additional information about heavy commands
TEST_F(ServerFamilyTest, SlowLogExecEval) {
  Run({"config", "set", "slowlog_max_len", "20"});
  Run({"config", "set", "slowlog_log_slower_than", "0"});

  // Run EXEC
  {
    Run({"multi"});
    Run({"set", "first", "ok"});
    Run({"set", "second2", "ok"});
    Run({"get", "third3"});
    Run({"exec"});
  }

  // Run EVAL
  {
    const std::string_view script = R"(
for i, key in ipairs(KEYS) do
  redis.call('GET', key)
end
for i, key in ipairs(KEYS) do
  redis.call('SET', key, 'some-data')
end
return 'OK';
    )";
    auto resp = Run({"EVAL", script, "3", "first", "second2", "third3", "second2"});
    EXPECT_EQ(resp, "OK");
  }

  size_t found = 0;
  auto resp = Run({"slowlog", "get"});
  for (const auto& entry : resp.GetVec()) {
    const auto& args = entry.GetVec()[3].GetVec();
    if (args[0] == "EXEC") {
      EXPECT_THAT(args, ElementsAreArray({"EXEC", "num_cmds: 3", "is_write: 1"}));
      found++;
    } else if (args[0] == "EVAL") {
      const auto sha = "41e84cf7973712deda6c1737a69bd1365eeb060f";
      EXPECT_THAT(args, ElementsAreArray({"EVAL", sha, "num_cmds: 6", "slow_cmds: 6", "tx_mode: 2",
                                          "tx_shards: 2", "is_write: 1", "lock_tags: 3", "3",
                                          "first", "second2", "third3", "second2"}));
      found++;
    }
  }

  EXPECT_EQ(found, 2);
}

TEST_F(ServerFamilyTest, ClientPause) {
  auto start = absl::Now();
  Run({"CLIENT", "PAUSE", "50"});

  Run({"get", "key"});
  EXPECT_GT((absl::Now() - start), absl::Milliseconds(50));

  start = absl::Now();

  Run({"CLIENT", "PAUSE", "50", "WRITE"});

  Run({"get", "key"});
  EXPECT_LT((absl::Now() - start), absl::Milliseconds(10));
  Run({"set", "key", "value2"});
  EXPECT_GT((absl::Now() - start), absl::Milliseconds(50));
}

TEST_F(ServerFamilyTest, ClientTrackingOnAndOff) {
  // case 1. can't use the feature for resp2
  auto resp = Run({"CLIENT", "TRACKING", "ON"});
  EXPECT_THAT(resp.GetString(),
              "ERR Client tracking is currently not supported for RESP2. Please use RESP3.");

  // case 2. allows when resp3 is used
  Run({"HELLO", "3"});
  resp = Run({"CLIENT", "TRACKING", "ON"});
  EXPECT_THAT(resp.GetString(), "OK");

  resp = Run({"CLIENT", "CACHING", "YES"});
  EXPECT_THAT(
      resp, ErrArg("ERR CLIENT CACHING YES is only valid when tracking is enabled in OPTIN mode"));

  resp = Run({"CLIENT", "CACHING", "NO"});
  EXPECT_THAT(
      resp, ErrArg("ERR CLIENT CACHING NO is only valid when tracking is enabled in OPTOUT mode"));

  // case 3. turn off client tracking
  resp = Run({"CLIENT", "TRACKING", "OFF"});
  EXPECT_THAT(resp.GetString(), "OK");

  resp = Run({"CLIENT", "CACHING", "YES"});
  EXPECT_THAT(
      resp,
      ErrArg("CLIENT CACHING can be called only when the client is in tracking mode with OPTIN or "
             "OPTOUT mode enabled"));
}

TEST_F(ServerFamilyTest, ToggleTrackingOnAndOff) {
  Run("HELLO 3");
  // seq = 0
  auto resp = Run("CLIENT TRACKING ON OPTIN");
  // seq = 1
  EXPECT_THAT(resp.GetString(), "OK");

  resp = Run("CLIENT CACHING YES");
  // seq = 2, caching = 1
  EXPECT_THAT(resp.GetString(), "OK");

  resp = Run("CLIENT TRACKING OFF");
  resp = Run("CLIENT TRACKING ON OPTIN");
  // seq = 3, caching = 1
  EXPECT_THAT(resp.GetString(), "OK");
  // seq(3) != (caching(1) + 1)
  resp = Run("GET foo");
  resp = Run("SET foo tmp");
  EXPECT_THAT(resp.GetString(), "OK");

  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);
}

TEST_F(ServerFamilyTest, ClientTrackingReadKey) {
  // case 1. only read the keys doesn't trigger any notification.
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});

  Run({"SET", "FOO", "10"});
  Run({"GET", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);

  Run({"GET", "BAR"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);
}

TEST_F(ServerFamilyTest, ClientTrackingOptin) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON", "OPTIN"});

  Run({"GET", "FOO"});
  Run({"SET", "FOO", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);
  Run({"GET", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);

  Run({"CLIENT", "CACHING", "YES"});
  // Start tracking once
  Run({"GET", "FOO"});
  Run({"SET", "FOO", "20"});
  Run({"GET", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);

  Run({"GET", "BAR"});
  Run({"SET", "BAR", "20"});
  Run({"GET", "BAR"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);

  // Start tracking once
  Run({"CLIENT", "CACHING", "YES"});
  Run({"GET", "BAR"});
  Run({"SET", "BAR", "20"});
  Run({"GET", "BAR"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
}

TEST_F(ServerFamilyTest, ClientTrackingMulti) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});
  Run({"MULTI"});
  Run({"GET", "FOO"});
  Run({"SET", "TMP", "10"});
  Run({"GET", "FOOBAR"});
  Run({"EXEC"});

  Run({"SET", "FOO", "10"});
  Run({"SET", "FOOBAR", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
}

TEST_F(ServerFamilyTest, ClientTrackingCompatibilityMulti) {
  // Compatibility Test, all CLIENT commands should be allowed in MULTI
  Run({"HELLO", "3"});
  Run({"MULTI"});
  auto resp = Run({"CLIENT", "TRACKING", "ON"});
  EXPECT_THAT(resp.GetString(), "QUEUED");
  // Used by sentinel in MULTI/EXEC blocks
  resp = Run({"CLIENT", "KILL", "127.0.0.1:6380"});
  EXPECT_THAT(resp.GetString(), "QUEUED");
  resp = Run({"CLIENT", "SETNAME", "YO"});
  EXPECT_THAT(resp.GetString(), "QUEUED");
  resp = Run({"CLIENT", "GETNAME"});
  EXPECT_THAT(resp.GetString(), "QUEUED");
  Run({"EXEC"});

  Run({"GET", "FOO"});
  Run({"SET", "FOO", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);

  Run({"MULTI"});
  resp = Run({"CLIENT", "PAUSE", "0", "WRITE"});
  EXPECT_THAT(resp.GetString(), "QUEUED");
  Run({"EXEC"});
}

TEST_F(ServerFamilyTest, ClientTrackingMultiOptin) {
  Run({"HELLO", "3"});
  // Check stickiness
  Run({"CLIENT", "TRACKING", "ON", "OPTIN"});
  Run({"CLIENT", "CACHING", "YES"});
  Run({"MULTI"});
  Run({"GET", "FOO"});
  Run({"SET", "TMP", "10"});
  Run({"GET", "FOOBAR"});
  Run({"DISCARD"});

  Run({"SET", "FOO", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 0);

  Run({"CLIENT", "CACHING", "YES"});
  Run({"MULTI"});
  Run({"GET", "FOO"});
  Run({"SET", "TMP", "10"});
  Run({"GET", "FOOBAR"});
  Run({"EXEC"});

  Run({"SET", "FOO", "10"});
  Run({"SET", "FOOBAR", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);

  // CACHING enclosed in MULTI
  Run({"MULTI"});
  Run({"GET", "TMP"});
  Run({"GET", "TMP_TMP"});
  Run({"SET", "TMP", "10"});
  Run({"CLIENT", "CACHING", "YES"});
  Run({"GET", "FOO"});
  Run({"GET", "FOOBAR"});
  Run({"EXEC"});

  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
  Run({"SET", "TMP", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
  Run({"SET", "FOO", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 3);
  Run({"SET", "FOOBAR", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 4);

  // CACHING enclosed in MULTI, ON/OFF
  Run({"MULTI"});
  Run({"GET", "TMP"});
  Run({"SET", "TMP", "10"});
  Run({"CLIENT", "CACHING", "YES"});
  Run({"GET", "FOO"});
  Run({"GET", "BAR"});
  Run({"EXEC"});

  EXPECT_EQ(InvalidationMessagesLen("IO0"), 4);
  Run({"SET", "FOO", "10"});
  Run({"GET", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 5);
  Run({"SET", "BAR", "10"});
  Run({"GET", "BAR"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 6);
}

TEST_F(ServerFamilyTest, ClientTrackingOptout) {
  Run({"HELLO", "3"});
  // Check stickiness
  Run({"CLIENT", "TRACKING", "ON", "OPTOUT"});
  Run({"GET", "FOO"});
  Run({"SET", "FOO", "BAR"});
  Run({"GET", "BAR"});
  Run({"SET", "BAR", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);

  // Switch off tracking for a single command
  Run({"CLIENT", "CACHING", "NO"});
  Run({"GET", "FOO"});
  Run({"SET", "FOO", "BAR"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
}

TEST_F(ServerFamilyTest, ClientTrackingMultiOptout) {
  Run({"HELLO", "3"});
  // Check stickiness
  Run({"CLIENT", "TRACKING", "ON", "OPTOUT"});

  Run({"MULTI"});
  Run({"GET", "FOO"});
  Run({"SET", "TMP", "10"});
  Run({"GET", "FOOBAR"});
  Run({"EXEC"});

  Run({"SET", "FOO", "10"});
  Run({"SET", "FOOBAR", "10"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);

  // CACHING enclosed in MULTI
  Run({"MULTI"});
  Run({"CLIENT", "CACHING", "NO"});
  Run({"GET", "TMP"});
  Run({"GET", "TMP_TMP"});
  Run({"SET", "TMP", "10"});
  Run({"SET", "TMP_TMP", "10"});
  Run({"EXEC"});

  EXPECT_EQ(InvalidationMessagesLen("IO0"), 2);
}

TEST_F(ServerFamilyTest, ClientTrackingUpdateKey) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});

  Run({"GET", "FOO"});
  Run({"SET", "FOO", "10"});
  const auto& msg = GetInvalidationMessage("IO0", 0);
  EXPECT_EQ(msg.key, "FOO");

  // make sure invalidation message only gets sent once.
  Run({"GET", "FOO"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);

  // update string from another connection
  // need to do another read to re-initialize the tracking of the key.
  Run({"GET", "FOO"});
  pp_->at(1)->Await([&] { return Run({"SET", "FOO", "30"}); });
  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});
  const auto& msg2 = GetInvalidationMessage("IO0", 1);
  EXPECT_EQ(msg2.key, "FOO");

  // case 4. test multi command
  Run({"MGET", "X1", "X2", "X3", "X4", "Y1", "Y2", "Y3", "Y4", "Z1", "Z2", "Z3", "Z4"});
  pp_->at(1)->Await([&] { return Run({"MSET", "X1", "1", "Y3", "2", "Z2", "3", "Z4", "5"}); });
  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 6);
  std::vector<std::string_view> keys_invalidated;
  for (unsigned int i = 2; i < 6; ++i)
    keys_invalidated.push_back(GetInvalidationMessage("IO0", i).key);
  ASSERT_THAT(keys_invalidated, UnorderedElementsAre("X1", "Y3", "Z2", "Z4"));

  Run({"FLUSHDB"});
}

TEST_F(ServerFamilyTest, ClientTrackingDeleteKey) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});
  Run({"SET", "FOO", "10"});
  Run({"GET", "FOO"});
  pp_->at(1)->Await([&] { return Run({"DEL", "FOO"}); });
  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});
  EXPECT_EQ(GetInvalidationMessage("IO0", 0).key, "FOO");
}

TEST_F(ServerFamilyTest, ClientTrackingRenameKey) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});
  Run({"SET", "FOO", "10"});
  Run({"GET", "FOO"});
  pp_->at(1)->Await([&] { return Run({"RENAME", "FOO", "BAR"}); });
  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});
  EXPECT_EQ(GetInvalidationMessage("IO0", 0).key, "FOO");
}

TEST_F(ServerFamilyTest, ClientTrackingExpireKey) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});
  Run({"SET", "C", "10"});
  Run({"GET", "C"});
  Run({"EXPIRE", "C", "1"});
  AdvanceTime(1000);
  auto resp = Run({"GET", "C"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);
  EXPECT_EQ(GetInvalidationMessage("IO0", 0).key, "C");
}

TEST_F(ServerFamilyTest, ClientTrackingSelectDB) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});
  Run({"SET", "C", "10"});
  Run({"GET", "C"});
  pp_->at(1)->Await([&] { return Run({"SELECT", "2"}); });
  pp_->at(1)->Await([&] { return Run({"SET", "C", "1000"}); });
  pp_->AwaitFiberOnAll([](ProactorBase* pb) {});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);
  EXPECT_EQ(GetInvalidationMessage("IO0", 0).key, "C");
}

TEST_F(ServerFamilyTest, ClientTrackingNonTransactionalBug) {
  Run({"HELLO", "3"});
  Run({"CLIENT", "TRACKING", "ON"});

  Run({"CLUSTER", "SLOTS"});
}

TEST_F(ServerFamilyTest, ClientTrackingLuaBug) {
  Run({"HELLO", "3"});
  // Check stickiness
  Run({"CLIENT", "TRACKING", "ON"});
  using namespace std::string_literals;
  std::string eval = R"(redis.call('get', 'foo'); redis.call('set', 'foo', 'bar'); )";
  Run({"EVAL", absl::StrCat(eval, "return 1"), "1", "foo"});
  Run({"PING"});

  EXPECT_EQ(InvalidationMessagesLen("IO0"), 1);
  absl::StrAppend(&eval, R"(redis.call('get', 'oof'); redis.call('set', 'oof', 'bar'); return 1)");
  Run({"EVAL", eval, "2", "foo", "oof"});
  Run({"PING"});
  EXPECT_EQ(InvalidationMessagesLen("IO0"), 3);
}

TEST_F(ServerFamilyTest, ConfigNormalization) {
  // TODO: Ideally we'd also test that INFO REPLICATION returns the value set in the config, but
  // there is no way currently to setup a mock replica in unit tests.

  absl::FlagSaver fs;  // Restores the flag to default value after test finishes

  // Default value
  EXPECT_THAT(Run({"config", "get", "replica-priority"}),
              RespArray(ElementsAre("replica_priority", "100")));
  EXPECT_THAT(Run({"config", "get", "replica_priority"}),
              RespArray(ElementsAre("replica_priority", "100")));

  // Set with dash
  EXPECT_THAT(Run({"config", "set", "replica-priority", "7"}), "OK");

  EXPECT_THAT(Run({"config", "get", "replica-priority"}),
              RespArray(ElementsAre("replica_priority", "7")));
  EXPECT_THAT(Run({"config", "get", "replica_priority"}),
              RespArray(ElementsAre("replica_priority", "7")));

  // Set with underscore
  EXPECT_THAT(Run({"config", "set", "replica_priority", "13"}), "OK");

  EXPECT_THAT(Run({"config", "get", "replica-priority"}),
              RespArray(ElementsAre("replica_priority", "13")));
  EXPECT_THAT(Run({"config", "get", "replica_priority"}),
              RespArray(ElementsAre("replica_priority", "13")));
}

// Verify CONFIG GET returns numeric bytes for memory configs (Redis/Valkey compatibility).
TEST_F(ServerFamilyTest, ConfigGetMemoryBytes) {
  absl::FlagSaver fs;

  // Set maxmemory using human-readable format
  EXPECT_THAT(Run({"config", "set", "maxmemory", "1GB"}), "OK");

  // CONFIG GET should return numeric bytes, not human-readable format
  EXPECT_THAT(Run({"config", "get", "maxmemory"}),
              RespArray(ElementsAre("maxmemory", "1073741824")));

  // Test another value
  EXPECT_THAT(Run({"config", "set", "maxmemory", "512MB"}), "OK");
  EXPECT_THAT(Run({"config", "get", "maxmemory"}),
              RespArray(ElementsAre("maxmemory", "536870912")));
}

TEST_F(ServerFamilyTest, CommandDocsOk) {
  EXPECT_THAT(Run({"command", "docs"}), ErrArg("COMMAND DOCS Not Implemented"));
}

TEST_F(ServerFamilyTest, PubSubCommandErr) {
  // Check conditions only in non cluster mode
  if (auto cluster_mode = absl::GetFlag(FLAGS_cluster_mode); cluster_mode == "") {
    EXPECT_THAT(Run({"PUBSUB", "SHARDCHANNELS"}),
                ErrArg("PUBSUB SHARDCHANNELS is not supported in non cluster mode"));
    EXPECT_THAT(Run({"PUBSUB", "SHARDNUMSUB"}),
                ErrArg("PUBSUB SHARDNUMSUB is not supported in non cluster mode"));
  }
  EXPECT_THAT(Run({"PUBSUB", "INVALIDSUBCOMMAND"}),
              ErrArg("Unknown subcommand or wrong number of arguments for 'INVALIDSUBCOMMAND'. Try "
                     "PUBSUB HELP."));
}

TEST_F(ServerFamilyTest, InfoMultipleSections) {
  // Check that when querying multiple valid sections, both are returned non empty.
  Run({"set", "foo", "bar"});  // set some data
  auto resp = Run({"info", "replication", "persistence"});
  auto info = resp.GetString();
  EXPECT_NE(info.find("# Replication"), std::string::npos);
  EXPECT_NE(info.find("# Persistence"), std::string::npos);
}

TEST_F(ServerFamilyTest, InfoMultipleSectionsInvalid) {
  // Check that when querying a valid and an invalid section, only the valid section is returned.
  Run({"set", "foo", "bar"});  // set some data
  auto resp = Run({"info", "replication", "invalidsection"});
  auto info = resp.GetString();
  EXPECT_NE(info.find("# Replication"), std::string::npos);
  EXPECT_EQ(info.find("# invalidsection"), std::string::npos);
}

// DEBUG POPULATE with val_size=0 caused SIGFPE (division by zero) in DoPopulateBatch.
TEST_F(ServerFamilyTest, DebugPopulateZeroValSize) {
  // val_size=0 with the default element count (1) must not crash the server.
  auto resp = Run({"DEBUG", "POPULATE", "1", "key", "0"});
  EXPECT_THAT(resp, ErrArg("val_size must be positive"));
}

TEST_F(ServerFamilyTest, MemoryArenaSummary) {
  auto resp = Run({"MEMORY", "ARENA", "SUMMARY"});
  const auto response = resp.GetString();

  EXPECT_THAT(response, HasSubstr("BlockSize"));

  for (const auto shard_id : std::views::iota(0UL, shard_set->size())) {
    EXPECT_THAT(response, HasSubstr("Arena statistics for thread " + std::to_string(shard_id)));
  }

  EXPECT_THAT(response, HasSubstr("Arena statistics for machine"));

  resp = Run({"MEMORY", "ARENA", "SUMMARY", "0"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"MEMORY", "ARENA", "SUMMARY", "X"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"MEMORY", "ARENA", "SUMMARY", "BACKING"});
  EXPECT_THAT(resp.GetString(), HasSubstr("BlockSize"));

  resp = Run({"MEMORY", "ARENA", "SUMMARY", "BACKING", "0"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"MEMORY", "ARENA"});
  EXPECT_THAT(resp.GetString(), HasSubstr("Count"));
}

}  // namespace dfly


================================================
FILE: src/server/server_state.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/server_state.h"

#include <mimalloc.h>

extern "C" {
#include "redis/zmalloc.h"
}

#include "base/flag_utils.h"
#include "base/flags.h"
#include "base/logging.h"
#include "facade/conn_context.h"
#include "facade/dragonfly_connection.h"
#include "facade/facade_stats.h"
#include "server/common.h"
#include "server/journal/journal.h"
#include "util/listener_interface.h"

using facade::operator""_KB;

ABSL_FLAG(uint32_t, interpreter_per_thread, 10, "Lua interpreters per thread");
ABSL_FLAG(uint32_t, timeout, 0,
          "Close the connection after it is idle for N seconds (0 to disable)");
ABSL_FLAG(uint32_t, send_timeout, 0,
          "Close the connection after it is stuck on send for N seconds (0 to disable)");

ABSL_FLAG(double, rss_oom_deny_ratio, 1.25,
          "When the ratio between maxmemory and RSS memory exceeds this value, commands marked as "
          "DENYOOM will fail with OOM error and new connections to non-admin port will be "
          "rejected. Negative value disables this feature.");

ABSL_FLAG(size_t, serialization_max_chunk_size, 64_KB,
          "Maximum size of a value that may be serialized at once during snapshotting or full "
          "sync. Values bigger than this threshold will be serialized using streaming "
          "serialization. 0 - to disable streaming mode");
ABSL_FLAG(uint32_t, max_squashed_cmd_num, 100,
          "Max number of commands squashed in a single shard during squash optimizaiton");

namespace dfly {

using namespace std;
using namespace std::chrono_literals;

__thread ServerState* ServerState::state_ = nullptr;

facade::ConnectionStats* ServerState::tl_connection_stats() {
  return &facade::tl_facade_stats->conn_stats;
}

ServerState::Stats::Stats(unsigned num_shards)
    : tx_width_freq_arr(num_shards), squash_width_freq_arr(num_shards) {
}

ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
  static_assert(sizeof(Stats) == 26 * 8, "Stats size mismatch");

#define ADD(x) this->x += (other.x)

  ADD(eval_io_coordination_cnt);

  ADD(eval_shardlocal_coordination_cnt);
  ADD(eval_squashed_flushes);

  ADD(tx_global_cnt);
  ADD(tx_normal_cnt);
  ADD(tx_inline_runs);
  ADD(tx_schedule_cancel_cnt);

  ADD(multi_squash_hops);
  ADD(multi_squash_exec_hop_usec);
  ADD(multi_squash_exec_reply_usec);
  ADD(squashed_commands);
  ADD(squash_stats_ignored);
  ADD(blocking_commands_in_pipelines);
  ADD(blocked_on_interpreter);
  ADD(rdb_save_usec);
  ADD(rdb_save_count);

  ADD(big_value_preemptions);
  ADD(compressed_blobs);

  ADD(oom_error_cmd_cnt);
  ADD(conn_timeout_events);
  ADD(psync_requests_total);

  if (this->tx_width_freq_arr.size() > 0) {
    DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size());
    this->tx_width_freq_arr += other.tx_width_freq_arr;
  } else {
    this->tx_width_freq_arr = other.tx_width_freq_arr;
  }
  if (this->squash_width_freq_arr.size() > 0) {
    DCHECK_EQ(this->squash_width_freq_arr.size(), other.squash_width_freq_arr.size());
    this->squash_width_freq_arr += other.squash_width_freq_arr;
  } else {
    this->squash_width_freq_arr = other.squash_width_freq_arr;
  }

  ADD(stored_cmd_bytes);
  return *this;
#undef ADD
}

void MonitorsRepo::Add(facade::Connection* connection) {
  VLOG(1) << "register connection "
          << " at address 0x" << std::hex << (const void*)connection << " for thread "
          << util::ProactorBase::me()->GetPoolIndex();

  monitors_.push_back(connection);
}

void MonitorsRepo::Remove(const facade::Connection* conn) {
  auto it = std::find_if(monitors_.begin(), monitors_.end(),
                         [&conn](const auto& val) { return val == conn; });
  if (it != monitors_.end()) {
    VLOG(1) << "removing connection 0x" << std::hex << conn << " releasing token";
    monitors_.erase(it);
  } else {
    VLOG(1) << "no connection 0x" << std::hex << conn << " found in the registered list here";
  }
}

void MonitorsRepo::NotifyChangeCount(bool added) {
  if (added) {
    ++global_count_;
  } else {
    DCHECK(global_count_ > 0);
    --global_count_;
  }
}

ServerState::ServerState() : interpreter_mgr_{absl::GetFlag(FLAGS_interpreter_per_thread)} {
  CHECK(mi_heap_get_backing() == mi_heap_get_default());

  mi_heap_t* tlh = mi_heap_new();
  init_zmalloc_threadlocal(tlh);
  data_heap_ = tlh;

  UpdateFromFlags();
}

ServerState::~ServerState() {
  watcher_fiber_.JoinIfNeeded();
}

void ServerState::Init(uint32_t thread_index, uint32_t num_shards,
                       util::ListenerInterface* main_listener, acl::UserRegistry* registry) {
  state_ = new ServerState();
  state_->gstate_ = GlobalState::ACTIVE;
  state_->thread_index_ = thread_index;
  state_->user_registry = registry;
  state_->stats = Stats(num_shards);
  if (main_listener) {
    state_->watcher_fiber_ = util::fb2::Fiber(
        util::fb2::Launch::post, "ConnectionsWatcher",
        [state = state_, main_listener] { state->ConnectionsWatcherFb(main_listener); });
  }
}

void ServerState::Destroy() {
  delete state_;
  state_ = nullptr;
}

void ServerState::EnterLameDuck() {
  gstate_ = GlobalState::SHUTTING_DOWN;
  watcher_cv_.notify_all();
}

ServerState::MemoryUsageStats ServerState::GetMemoryUsage(uint64_t now_ns) {
  static constexpr uint64_t kCacheEveryNs = 1000;
  if (now_ns > used_mem_last_update_ + kCacheEveryNs) {
    used_mem_last_update_ = now_ns;
    memory_stats_cached_.used_mem = used_mem_current.load(std::memory_order_relaxed);
    memory_stats_cached_.rss_mem = rss_mem_current.load(std::memory_order_relaxed);
  }
  return memory_stats_cached_;
}

bool ServerState::AllowInlineScheduling() const {
  // We can't allow inline scheduling during a full sync, because then journaling transactions
  // will be scheduled before RdbLoader::LoadItemsBuffer is finished. We can't use the regular
  // locking mechanism because RdbLoader is not using transactions.
  if (gstate_ == GlobalState::LOADING)
    return false;

  // Journal callbacks can preempt; This means we have to disallow inline scheduling
  // because then we might interleave the callbacks loop from an inlined-scheduled command
  // and a normally-scheduled command.
  // The problematic loop is in JournalSlice::AddLogRecord, going over all the callbacks.

  if (journal::HasRegisteredCallbacks())
    return false;

  return true;
}

void ServerState::SetPauseState(ClientPause state, bool start) {
  client_pauses_[int(state)] += (start ? 1 : -1);
  if (!client_pauses_[int(state)]) {
    client_pause_ec_.notifyAll();
  }
}

void ServerState::AwaitPauseState(bool is_write) {
  client_pause_ec_.await([is_write, this]() {
    return client_pauses_[int(ClientPause::ALL)] == 0 &&
           (!is_write || client_pauses_[int(ClientPause::WRITE)] == 0);
  });
}

void ServerState::DecommitMemory(uint8_t flags) {
  if (flags & kDataHeap) {
    mi_heap_collect(data_heap(), true);
  }
  if (flags & kBackingHeap) {
    mi_heap_collect(mi_heap_get_backing(), true);
  }

  if (flags & kGlibcmalloc) {
    // trims the memory (reduces RSS usage) from the malloc allocator. Does not present in
    // MUSL lib.
#ifdef __GLIBC__
// There is an issue with malloc_trim and sanitizers because the asan replace malloc but is not
// aware of malloc_trim which causes malloc_trim to segfault because it's not initialized properly
#ifndef ABSL_HAVE_ADDRESS_SANITIZER
    malloc_trim(0);
#endif
#endif
  }
}

void ServerState::UpdateFromFlags() {
  rss_oom_deny_ratio = absl::GetFlag(FLAGS_rss_oom_deny_ratio);
  serialization_max_chunk_size = absl::GetFlag(FLAGS_serialization_max_chunk_size);
  max_squash_cmd_num = absl::GetFlag(FLAGS_max_squashed_cmd_num);
}

vector<string> ServerState::GetMutableFlagNames() {
  return base::GetFlagNames(FLAGS_rss_oom_deny_ratio, FLAGS_serialization_max_chunk_size,
                            FLAGS_max_squashed_cmd_num);
}

Interpreter* ServerState::BorrowInterpreter() {
  stats.blocked_on_interpreter++;
  auto* ptr = interpreter_mgr_.Get();
  stats.blocked_on_interpreter--;
  return ptr;
}

void ServerState::ReturnInterpreter(Interpreter* ir) {
  interpreter_mgr_.Return(ir);
}

void ServerState::FlushScriptCache() {
  cached_script_params_.clear();
  interpreter_mgr_.Reset();
}

void ServerState::AlterInterpreters(std::function<void(Interpreter*)> modf) {
  interpreter_mgr_.Alter(std::move(modf));
}

ServerState* ServerState::SafeTLocal() {
  // https://stackoverflow.com/a/75622732
  asm volatile("");
  return state_;
}

bool ServerState::ShouldLogSlowCmd(unsigned latency_usec) const {
  return slow_log_shard_.IsEnabled() && latency_usec >= log_slower_than_usec;
}

void ServerState::ConnectionsWatcherFb(util::ListenerInterface* main) {
  optional<facade::Connection::WeakRef> last_reference;

  while (true) {
    util::fb2::NoOpLock noop;
    if (watcher_cv_.wait_for(noop, 1s, [this] { return gstate_ == GlobalState::SHUTTING_DOWN; })) {
      break;
    }

    const uint32_t timeout = absl::GetFlag(FLAGS_timeout);
    const uint32_t send_timeout = absl::GetFlag(FLAGS_send_timeout);
    VLOG(1) << "ConnectionsWatcherFb: timeout=" << timeout << ", send_timeout=" << send_timeout;

    if (timeout == 0 && send_timeout == 0) {
      continue;
    }

    facade::Connection* from = nullptr;
    if (last_reference && !last_reference->IsExpired()) {
      from = last_reference->Get();
    }

    // We use weak refs, because ShutdownSelf below can potentially block the fiber,
    // and during this time some of the connections might be destroyed. Weak refs allow checking
    // validity of each connection.
    vector<facade::Connection::WeakRef> conn_refs;

    auto cb = [&](unsigned thread_index, util::Connection* conn) {
      facade::Connection* dfly_conn = static_cast<facade::Connection*>(conn);
      using Phase = facade::Connection::Phase;
      auto phase = dfly_conn->phase();
      bool is_replica = true;
      if (dfly_conn->cntx()) {
        is_replica = dfly_conn->cntx()->replica_conn;
      }

      bool idle_read = timeout != 0 && !is_replica && phase == Phase::READ_SOCKET &&
                       dfly_conn->idle_time() > timeout;
      bool stuck_sending = send_timeout != 0 && !is_replica && dfly_conn->IsSending() &&
                           dfly_conn->GetSendWaitTimeSec() > send_timeout;

      VLOG(2) << "Connection check: " << dfly_conn->GetClientInfo()
              << ", phase=" << static_cast<int>(phase) << ", idle_time=" << dfly_conn->idle_time()
              << ", is_replica=" << is_replica << ", is_sending=" << dfly_conn->IsSending()
              << ", idle_read=" << idle_read << ", stuck_sending=" << stuck_sending;

      if (idle_read || stuck_sending) {
        conn_refs.push_back(dfly_conn->Borrow());
      }
    };

    util::Connection* next = main->TraverseConnectionsOnThread(cb, 100, from);
    if (next) {
      last_reference = static_cast<facade::Connection*>(next)->Borrow();
    } else {
      last_reference.reset();
    }

    VLOG(1) << "Found " << conn_refs.size() << " connections to close due to timeout";
    for (auto& ref : conn_refs) {
      facade::Connection* conn = ref.Get();
      if (conn) {
        VLOG(1) << "Closing connection due to timeout: " << conn->GetClientInfo();
        conn->ShutdownSelfBlocking();
        stats.conn_timeout_events++;
      }
    }
  }
}

void ServerState::UnsubscribeSlotsAndUpdateChannelStore(const ChannelStore::ChannelsSubMap& sub_map,
                                                        ChannelStore* replacement) {
  channel_store_->UnsubscribeConnectionsFromDeletedSlots(sub_map, thread_index_);
  channel_store_ = replacement;
}

void ServerState::RecordCmd(bool is_main_conn) {
  if (is_main_conn) {
    ++tl_connection_stats()->command_cnt_main;
  } else {
    ++tl_connection_stats()->command_cnt_other;
  }
  qps_.Inc();
}
}  // end of namespace dfly


================================================
FILE: src/server/server_state.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <optional>
#include <valarray>
#include <vector>

#include "base/histogram.h"
#include "core/interpreter.h"
#include "server/acl/acl_log.h"
#include "server/channel_store.h"
#include "server/common_types.h"
#include "server/script_mgr.h"
#include "server/slowlog.h"
#include "util/sliding_counter.h"

typedef struct mi_heap_s mi_heap_t;

namespace facade {
class Connection;
struct ConnectionStats;
}  // namespace facade

namespace util {
class ListenerInterface;
}

namespace dfly {

namespace acl {
class UserRegistry;
}  // namespace acl

// This would be used as a thread local storage of sending
// monitor messages.
// Each thread will have its own list of all the connections that are
// used for monitoring. When a connection is set to monitor it would register
// itself to this list on all i/o threads. When a new command is dispatched,
// and this list is not empty, it would send in the same thread context as then
// thread that registered here the command.
// Note about performance: we are assuming that we would not have many connections
// that are registered here. This is not pub sub where it must be high performance
// and may support many to many with tens or more of connections. It is assumed that
// since monitoring is for debugging only, we would have less than 1 in most cases.
// Also note that we holding this list on the thread level since this is the context
// at which this would run. It also minimized the number of copied for this list.
class MonitorsRepo {
 public:
  using MonitorVec = std::vector<facade::Connection*>;

  // This function adds a new connection to be monitored. This function only add
  // new connection that belong to this thread! Must not be called outside of this
  // thread context
  void Add(facade::Connection* conn);

  // This function remove a connection what was monitored. This function only removes
  // a connection that belong to this thread! Must not be called outside of this
  // thread context
  void Remove(const facade::Connection* conn);

  // We have for each thread the total number of monitors in the application.
  // So this call is thread safe since we hold a copy of this for each thread.
  // If this return true, then we don't need to run the monitor operation at all.
  bool Empty() const {
    return global_count_ == 0u;
  }

  // This function is run on all threads to either increment or decrement the "shared" counter
  // of the monitors - it must be called as part of removing a monitor (for example
  // when a connection is closed).
  void NotifyChangeCount(bool added);

  std::size_t Size() const {
    return monitors_.size();
  }

  const MonitorVec& monitors() const {
    return monitors_;
  }

 private:
  MonitorVec monitors_;            // save connections belonging to this thread only!
  unsigned int global_count_ = 0;  // by global its means that we count the monitor for all threads
};

enum class ClientPause { WRITE, ALL };

// Present in every server thread. This class differs from EngineShard. The latter manages
// state around engine shards while the former represents coordinator/connection state.
// There may be threads that handle engine shards but not IO, there may be threads that handle IO
// but not engine shards and there can be threads that handle both.
// Instances of ServerState are present only for threads that handle
// IO and manage incoming connections.
class ServerState {  // public struct - to allow initialization.
  ServerState(const ServerState&) = delete;
  void operator=(const ServerState&) = delete;

 public:
  struct Stats {
    Stats(unsigned num_shards = 0);  // Default initialization should be valid for Add()

    Stats(Stats&& other) = default;
    Stats& operator=(Stats&& other) = default;
    Stats(const Stats&) = delete;
    Stats& operator=(const Stats& other) = delete;

    Stats& Add(const Stats& other);

    uint64_t tx_global_cnt = 0;
    uint64_t tx_normal_cnt = 0;
    uint64_t tx_inline_runs = 0;
    uint64_t tx_schedule_cancel_cnt = 0;

    uint64_t eval_io_coordination_cnt = 0;
    uint64_t eval_shardlocal_coordination_cnt = 0;
    uint64_t eval_squashed_flushes = 0;

    uint64_t multi_squash_hops = 0;
    uint64_t multi_squash_exec_hop_usec = 0;
    uint64_t multi_squash_exec_reply_usec = 0;
    uint64_t squashed_commands = 0;
    uint64_t squash_stats_ignored = 0;
    uint64_t blocking_commands_in_pipelines = 0;
    uint64_t blocked_on_interpreter = 0;

    uint64_t rdb_save_usec = 0;
    uint64_t rdb_save_count = 0;

    uint64_t big_value_preemptions = 0;
    uint64_t compressed_blobs = 0;

    // Number of times we rejected command dispatch due to OOM condition.
    uint64_t oom_error_cmd_cnt = 0;
    uint32_t conn_timeout_events = 0;
    uint64_t psync_requests_total = 0;
    std::valarray<uint64_t> tx_width_freq_arr, squash_width_freq_arr;

    // Memory size of stored commands during multi-exec in connections
    size_t stored_cmd_bytes = 0;
  };

  // Unsafe version.
  // Do not use after fiber migration because it can cause a data race.
  static ServerState* tlocal() {
    return state_;
  }

  // Safe version.
  // Calls to tlocal() before and after a fiber migrates to a different thread may both
  // return the thread local of the thread that run the fiber before the migration. Use this
  // function to avoid this and access the correct thread local after the migration.
  static ServerState* __attribute__((noinline)) SafeTLocal();

  static facade::ConnectionStats* tl_connection_stats();

  ServerState();
  ~ServerState();

  static void Init(uint32_t thread_index, uint32_t num_shards,
                   util::ListenerInterface* main_listener, acl::UserRegistry* registry);
  static void Destroy();

  void EnterLameDuck();

  void TxCountInc() {
    ++live_transactions_;
  }

  void TxCountDec() {
    --live_transactions_;  // can go negative since we can start on one thread and end on another.
  }

  int64_t live_transactions() const {
    return live_transactions_;
  }

  GlobalState gstate() const {
    return gstate_;
  }

  void set_gstate(GlobalState s) {
    gstate_ = s;
  }

  struct MemoryUsageStats {
    uint64_t used_mem = 0;
    uint64_t rss_mem = 0;
  };

  MemoryUsageStats GetMemoryUsage(uint64_t now_ns);

  bool AllowInlineScheduling() const;

  // Borrow interpreter from interpreter pool, return it with ReturnInterpreter.
  // Will block if no interpreters are aviable. Use with caution!
  Interpreter* BorrowInterpreter();

  // Return interpreter to internal manager to be re-used.
  void ReturnInterpreter(Interpreter*);

  void FlushScriptCache();

  // Invoke function on all free interpreters. They are marked atomically as
  // used and the function is allowed to suspend.
  void AlterInterpreters(std::function<void(Interpreter*)> modf);

  // Returns sum of all requests in the last 6 seconds
  // (not including the current one).
  uint32_t MovingSum6() const {
    return qps_.SumTail();
  }

  void RecordCmd(bool is_main_conn);

  // data heap used by zmalloc and shards.
  mi_heap_t* data_heap() {
    return data_heap_;
  }

  constexpr MonitorsRepo& Monitors() {
    return monitors_;
  }

  const absl::flat_hash_map<std::string, base::Histogram>& call_latency_histos() const {
    return call_latency_histos_;
  }

  void RecordCallLatency(std::string_view sha, uint64_t latency_usec) {
    call_latency_histos_[sha].Add(latency_usec);
  }

  void SetScriptParams(const ScriptMgr::ScriptKey& key, ScriptMgr::ScriptParams params) {
    cached_script_params_[key] = params;
  }

  std::optional<ScriptMgr::ScriptParams> GetScriptParams(const ScriptMgr::ScriptKey& key) {
    auto it = cached_script_params_.find(key);
    return it != cached_script_params_.end() ? std::optional{it->second} : std::nullopt;
  }

  uint32_t thread_index() const {
    return thread_index_;
  }

  ChannelStore* channel_store() const {
    return channel_store_;
  }

  void UpdateChannelStore(ChannelStore* replacement) {
    channel_store_ = replacement;
  }

  void UnsubscribeSlotsAndUpdateChannelStore(const ChannelStore::ChannelsSubMap& sub_map,
                                             ChannelStore* replacement);

  bool ShouldLogSlowCmd(unsigned latency_usec) const;

  Stats stats;

  bool is_master = true;
  uint32_t log_slower_than_usec = UINT32_MAX;
  uint32_t max_squash_cmd_num = 32;

  acl::UserRegistry* user_registry;

  acl::AclLog acl_log;

  // Starts or ends a `CLIENT PAUSE` command. @state controls whether
  // this is pausing only writes or every command, @start controls
  // whether this is starting or ending the pause.
  void SetPauseState(ClientPause state, bool start);

  // Awaits until the pause is over and the command can execute.
  // @is_write controls whether the command is a write command or not.
  void AwaitPauseState(bool is_write);

  bool IsPaused() const {
    return (client_pauses_[0] + client_pauses_[1]) > 0;
  }

  SlowLogShard& GetSlowLog() {
    return slow_log_shard_;
  };

  // Tries to returns as much RSS memory as possible to the OS.
  // Decommits 3 possible heaps according to the flags.
  // For decommit_glibcmalloc the heap is global for the process, for others it's specific only
  // for this thread.
  enum : uint8_t {
    kDataHeap = 1,
    kBackingHeap = 2,
    kGlibcmalloc = 4,
    kAllMemory = kDataHeap | kBackingHeap | kGlibcmalloc
  };
  void DecommitMemory(uint8_t flags);

  void UpdateFromFlags();                                 // Update configration from flags
  static std::vector<std::string> GetMutableFlagNames();  // Dependencies of UpdateFromFlags

  // Exec descriptor frequency count for this thread.
  absl::flat_hash_map<std::string, unsigned> exec_freq_count;
  double rss_oom_deny_ratio;
  size_t serialization_max_chunk_size;

 private:
  // A fiber constantly watching connections on the main listener.
  void ConnectionsWatcherFb(util::ListenerInterface* main);

  int64_t live_transactions_ = 0;
  SlowLogShard slow_log_shard_;
  mi_heap_t* data_heap_;

  InterpreterManager interpreter_mgr_;
  absl::flat_hash_map<ScriptMgr::ScriptKey, ScriptMgr::ScriptParams> cached_script_params_;

  ChannelStore* channel_store_;

  GlobalState gstate_ = GlobalState::ACTIVE;

  // To support concurrent `CLIENT PAUSE commands` correctly, we store the amount
  // of current CLIENT PAUSE commands that are in effect. Blocked execution fibers
  // should subscribe to `client_pause_ec_` through `AwaitPauseState` to be
  // notified when the break is over.
  int client_pauses_[2] = {};
  util::fb2::EventCount client_pause_ec_;

  // Monitors connections. Currently responsible for closing timed out connections.
  util::fb2::Fiber watcher_fiber_;
  util::fb2::CondVarAny watcher_cv_;

  using Counter = util::SlidingCounter<7>;
  Counter qps_;

  MonitorsRepo monitors_;

  absl::flat_hash_map<std::string, base::Histogram> call_latency_histos_;
  uint32_t thread_index_ = 0;

  uint64_t used_mem_last_update_ = 0;
  MemoryUsageStats memory_stats_cached_;  // thread local cache of used and rss memory current

  static __thread ServerState* state_;
};

}  // namespace dfly


================================================
FILE: src/server/set_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/set_family.h"

#include "server/family_utils.h"

extern "C" {
#include "redis/intset.h"
#include "redis/redis_aux.h"
#include "redis/util.h"  // for string2ll
}

#include "base/cycle_clock.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "core/detail/listpack_wrap.h"
#include "core/string_set.h"
#include "facade/cmd_arg_parser.h"
#include "server/acl/acl_commands_def.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/journal/journal.h"
#include "server/transaction.h"

namespace dfly {

using namespace facade;

using namespace std;

using ResultStringVec = vector<OpResult<StringVec>>;
using ResultSetView = OpResult<absl::flat_hash_set<std::string_view>>;
using SvArray = vector<std::string_view>;
using SetType = pair<void*, unsigned>;

namespace {

// Possible sources of new set entries
using NewEntries = std::variant<ArgSlice, absl::flat_hash_set<std::string_view>>;

auto EntriesRange(const NewEntries& entries) {
  return base::it::Wrap(cmn::kToSV, entries);
}

constexpr uint32_t kMaxIntSetEntries = 256;

bool IsDenseEncoding(const CompactObj& co) {
  return co.Encoding() == kEncodingStrMap2;
}

intset* IntsetAddSafe(string_view val, intset* is, bool* success, bool* added) {
  long long llval;
  *added = false;
  if (!string2ll(val.data(), val.size(), &llval)) {
    *success = false;
    return is;
  }

  uint8_t inserted = 0;
  is = intsetAdd(is, llval, &inserted);
  if (inserted) {
    *added = true;
    *success = intsetLen(is) <= kMaxIntSetEntries;
  } else {
    *added = false;
    *success = true;
  }

  return is;
}

struct StringSetWrapper {
  StringSetWrapper(const CompactObj& obj, const DbContext& db_cntx)
      : StringSetWrapper(obj.RObjPtr(), db_cntx.time_now_ms) {
    DCHECK(IsDenseEncoding(obj));
  }

  StringSetWrapper(const SetType& st, const DbContext& db_cntx)
      : StringSetWrapper(st.first, db_cntx.time_now_ms) {
    DCHECK_EQ(st.second, kEncodingStrMap2);
  }

  static void Init(CompactObj* obj) {
    obj->InitRobj(OBJ_SET, kEncodingStrMap2, CompactObj::AllocateMR<StringSet>());
  }

  unsigned Add(const NewEntries& entries, uint32_t ttl_sec, bool keepttl) const {
    unsigned res = 0;
    string_view members[StringSet::kMaxBatchLen];
    size_t entries_len = std::visit([](const auto& e) { return e.size(); }, entries);
    unsigned len = 0;
    if (ss->BucketCount() < entries_len) {
      ss->Reserve(entries_len);
    }
    for (string_view member : EntriesRange(entries)) {
      members[len++] = member;
      if (len == StringSet::kMaxBatchLen) {
        res += ss->AddMany(absl::MakeSpan(members, StringSet::kMaxBatchLen), ttl_sec, keepttl);
        len = 0;
      }
    }

    if (len) {
      res += ss->AddMany(absl::MakeSpan(members, len), ttl_sec, keepttl);
    }

    return res;
  }

  pair<unsigned, bool> Remove(const facade::ArgRange& entries) const {
    unsigned removed = 0;
    for (string_view member : entries)
      removed += ss->Erase(member);
    return {removed, ss->Empty()};
  }

  uint64_t Scan(uint64_t curs, const ScanOpts& scan_op, StringVec* res) const {
    uint32_t count = scan_op.limit;
    long maxiterations = count * 10;

    const auto start_cycles = base::CycleClock::Now();
    // Approximately 100usec
    const uint64_t timeout_cycles = base::CycleClock::Now() + base::CycleClock::Frequency() / 10000;

    do {
      auto scan_callback = [&](sds ptr) {
        if (string_view str{ptr, sdslen(ptr)}; scan_op.Matches(str))
          res->emplace_back(str);
      };
      curs = ss->Scan(curs, scan_callback);
    } while (curs && maxiterations-- && res->size() < count &&
             (base::CycleClock::Now() - start_cycles) < timeout_cycles);
    return curs;
  }

  explicit operator StringSet*() const {
    return ss;
  }

  StringSet* operator->() const {
    return ss;
  }

  auto Range() const {
    auto transform = [](sds ptr) { return string_view{ptr, sdslen(ptr)}; };
    return base::it::Transform(transform, base::it::Range(ss->begin(), ss->end()));
  }

 private:
  StringSetWrapper(void* robj_ptr, uint64_t now_ms) : ss(static_cast<StringSet*>(robj_ptr)) {
    ss->set_time(MemberTimeSeconds(now_ms));
  }

  StringSet* const ss;
};

// returns (removed, isempty)
pair<unsigned, bool> RemoveSet(const DbContext& db_context, const facade::ArgRange& vals,
                               CompactObj* set) {
  if (set->Encoding() == kEncodingIntSet) {
    intset* is = (intset*)set->RObjPtr();
    long long llval;

    unsigned removed = 0;
    for (string_view val : vals) {
      if (!string2ll(val.data(), val.size(), &llval)) {
        continue;
      }

      int is_removed = 0;
      is = intsetRemove(is, llval, &is_removed);
      removed += is_removed;
    }
    set->SetRObjPtr(is);

    return {removed, intsetLen(is) == 0};
  } else {
    return StringSetWrapper{*set, db_context}.Remove(vals);
  }
}

void InitSet(const NewEntries& vals, CompactObj* set) {
  bool int_set = true;
  long long intv;

  for (string_view v : EntriesRange(vals)) {
    if (!string2ll(v.data(), v.size(), &intv)) {
      int_set = false;
      break;
    }
  }

  if (int_set) {
    intset* is = intsetNew();
    set->InitRobj(OBJ_SET, kEncodingIntSet, is);
  } else {
    StringSetWrapper::Init(set);
  }
}

uint32_t SetTypeLen(const DbContext& db_context, const SetType& set) {
  if (set.second == kEncodingIntSet) {
    return intsetLen((const intset*)set.first);
  } else {
    return StringSetWrapper(set, db_context)->UpperBoundSize();
  }
}

bool IsInSet(const DbContext& db_context, const SetType& st, int64_t val) {
  if (st.second == kEncodingIntSet)
    return intsetFind((intset*)st.first, val);

  char buf[32];
  char* next = absl::numbers_internal::FastIntToBuffer(val, buf);
  string_view str{buf, size_t(next - buf)};

  return StringSetWrapper(st, db_context)->Contains(str);
}

bool IsInSet(const DbContext& db_context, const SetType& st, string_view member) {
  if (st.second == kEncodingIntSet) {
    long long llval;
    if (!string2ll(member.data(), member.size(), &llval))
      return false;

    return intsetFind((intset*)st.first, llval);
  } else {
    return StringSetWrapper(st, db_context)->Contains(member);
  }
}

// returns -3 if member is not found, -1 if no ttl is associated with this member.
int32_t GetExpiry(const DbContext& db_context, const SetType& st, string_view member) {
  if (st.second == kEncodingIntSet) {
    long long llval;
    if (!string2ll(member.data(), member.size(), &llval))
      return -3;

    return -1;
  } else {
    StringSetWrapper ss{st, db_context};
    auto it = ss->Find(member);
    if (it == ss->end())
      return -3;

    return it.HasExpiry() ? it.ExpiryTime() : -1;
  }
}

// Removes arg from result.
void DiffStrSet(const DbContext& db_context, const SetType& st,
                absl::flat_hash_set<string>* result) {
  for (string_view entry : StringSetWrapper{st, db_context}.Range())
    result->erase(entry);
}

void InterStrSet(const DbContext& db_context, const vector<SetType>& vec, StringVec* result) {
  for (string_view str : StringSetWrapper{vec.front(), db_context}.Range()) {
    size_t j = 1;
    for (j = 1; j < vec.size(); ++j) {
      if (vec[j].first != vec.front().first && !IsInSet(db_context, vec[j], str)) {
        break;
      }
    }

    if (j == vec.size()) {
      result->emplace_back(str);
    }
  }
}

template <typename C = absl::flat_hash_set<string>>
StringVec RandMemberStrSetPicky(StringSet* strset, size_t count) {
  C picks;
  picks.reserve(count);

  size_t tries = 0;
  while (picks.size() < count && tries++ < count * 2) {
    auto member = *strset->GetRandomMember();
    picks.insert(picks.end(), {member, sdslen(member)});
  }

  if constexpr (is_same_v<StringVec, C>)
    return picks;
  return StringVec{make_move_iterator(picks.begin()), make_move_iterator(picks.end())};
}

StringVec RandMemberStrSet(const DbContext& db_context, const CompactObj& co,
                           PicksGenerator& generator, size_t picks_count) {
  CHECK(IsDenseEncoding(co));
  StringSetWrapper strset{co, db_context};

  // If the set is small, extract entries with StringSet::GetRandomMember
  if (picks_count * 5 < strset->UpperBoundSize()) {
    StringSet* ss(strset);
    if (bool unique = (dynamic_cast<UniquePicksGenerator*>(&generator) != nullptr); unique)
      return RandMemberStrSetPicky(ss, picks_count);
    else
      return RandMemberStrSetPicky<StringVec>(ss, picks_count);
  }

  std::unordered_map<RandomPick, std::uint32_t> times_index_is_picked;
  for (std::size_t i = 0; i < picks_count; i++) {
    times_index_is_picked[generator.Generate()]++;
  }

  StringVec result;
  result.reserve(picks_count);

  std::uint32_t ss_entry_index = 0;
  for (string_view str : strset.Range()) {
    auto it = times_index_is_picked.find(ss_entry_index++);
    if (it != times_index_is_picked.end()) {
      while (it->second--)
        result.emplace_back(str);
    }
  }
  /* Equal elements in the result are always successive. So, it is necessary to shuffle them */
  absl::BitGen gen;
  std::shuffle(result.begin(), result.end(), gen);

  return result;
}

StringVec RandMemberSet(const DbContext& db_context, const CompactObj& co,
                        PicksGenerator& generator, std::size_t picks_count) {
  if (co.Encoding() == kEncodingIntSet) {
    intset* is = static_cast<intset*>(co.RObjPtr());

    StringVec result;
    result.reserve(picks_count);

    for (std::size_t i = 0; i < picks_count; i++) {
      const std::size_t picked_index = generator.Generate();

      int64_t value = 0;
      CHECK_GT(intsetGet(is, picked_index, &value), std::uint8_t(0));

      result.push_back(absl::StrCat(value));
    }
    return result;
  }
  return RandMemberStrSet(db_context, co, generator, picks_count);
}

vector<string> ToVec(absl::flat_hash_set<string>&& set) {
  vector<string> result(set.size());
  size_t i = 0;

  // extract invalidates current iterator. therefore, we increment it first before extracting.
  // hence the weird loop.
  for (auto it = set.begin(); it != set.end();) {
    result[i] = std::move(set.extract(it++).value());
    ++i;
  }

  return result;
}

ResultSetView UnionResultVec(const ResultStringVec& result_vec) {
  absl::flat_hash_set<std::string_view> uniques;

  for (const auto& val : result_vec) {
    if (val || val.status() == OpStatus::SKIPPED) {
      for (const string& s : val.value()) {
        uniques.emplace(s);
      }
      continue;
    }

    if (val.status() != OpStatus::KEY_NOTFOUND) {
      return val.status();
    }
  }

  return uniques;
}

ResultSetView DiffResultVec(const ResultStringVec& result_vec, ShardId src_shard) {
  for (const auto& res : result_vec) {
    if (res.status() == OpStatus::WRONG_TYPE)
      return res.status();
  }

  absl::flat_hash_set<std::string_view> uniques;

  for (const auto& val : result_vec[src_shard].value()) {
    uniques.emplace(val);
  }

  for (unsigned i = 0; i < result_vec.size(); ++i) {
    if (i == src_shard)
      continue;

    if (result_vec[i]) {
      for (const string& s : result_vec[i].value()) {
        uniques.erase(s);
      }
    }
  }
  return uniques;
}

OpResult<SvArray> InterResultVec(const ResultStringVec& result_vec, unsigned required_shard_cnt,
                                 unsigned limit = 0) {
  absl::flat_hash_map<std::string_view, unsigned> uniques;

  for (const auto& res : result_vec) {
    if (!res && !base::_in(res.status(), {OpStatus::SKIPPED, OpStatus::KEY_NOTFOUND}))
      return res.status();
  }

  for (const auto& res : result_vec) {
    if (res.status() == OpStatus::KEY_NOTFOUND)
      return OpStatus::OK;  // empty set.
  }

  std::vector<const StringVec*> sorted_vec;
  for (const auto& res : result_vec) {
    if (res.status() == OpStatus::SKIPPED)
      continue;
    DCHECK(res);  // we handled it above.
    sorted_vec.push_back(&res.value());
  }

  // Sort the per shard-sorted sets
  if (!sorted_vec.empty()) {
    std::sort(sorted_vec.begin(), sorted_vec.end(),
              [](const auto* lhs, const auto* rhs) { return lhs->size() < rhs->size(); });

    for (const string& s : *sorted_vec[0]) {
      uniques.emplace(s, 1);
    }
    // Remove the smallest
    sorted_vec.erase(sorted_vec.begin());

    for (const auto& res : sorted_vec) {
      for (const string& s : *res) {
        auto it = uniques.find(s);
        if (it != uniques.end()) {
          ++it->second;
        }
      }
    }
  }

  SvArray result;
  result.reserve(uniques.size());

  for (const auto& k_v : uniques) {
    if (k_v.second == required_shard_cnt) {
      if (limit != 0 && result.size() >= limit)
        return result;
      result.push_back(k_v.first);
    }
  }

  return result;
}

SvArray ToSvArray(const absl::flat_hash_set<std::string_view>& set) {
  SvArray result;
  result.reserve(set.size());
  copy(set.begin(), set.end(), back_inserter(result));
  return result;
}

// if overwrite is true then OpAdd writes vals into the key and discards its previous value.
OpResult<uint32_t> OpAdd(const OpArgs& op_args, std::string_view key, const NewEntries& vals,
                         bool overwrite, bool journal_update) {
  auto& db_slice = op_args.GetDbSlice();
  auto vals_it = EntriesRange(vals);

  VLOG(2) << "OpAdd(" << key << ")";

  // overwrite - meaning we run in the context of 2-hop operation and we want
  // to overwrite the key. However, if the set is empty it means we should delete the
  // key if it exists.
  if (overwrite && (vals_it.begin() == vals_it.end())) {
    auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_SET);
    if (res_it) {
      db_slice.DelMutable(op_args.db_cntx, std::move(*res_it));
      if (journal_update && op_args.shard->journal()) {
        RecordJournal(op_args, "DEL"sv, ArgSlice{key});
      }
    }
    return OpStatus::OK;
  }

  // We can use std::nullopt here because we check the type later.
  // If the overwrite is true, we will call InitSet that calles SetMeta
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, std::nullopt);
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  PrimeValue& co = add_res.it->second;

  if (!add_res.is_new) {
    // for non-overwrite case it must be set.
    if (!overwrite && co.ObjType() != OBJ_SET)
      return OpStatus::WRONG_TYPE;

    if (overwrite)  // Overwriting the value removes expiration
      db_slice.RemoveExpire(op_args.db_cntx.db_index, add_res.it);
  }

  if (add_res.is_new || overwrite) {
    // If we're overwriting an existing key (not a new one), we need to remove it from
    // search indexes first. This prevents crashes when the key is indexed (e.g., HASH or JSON).
    if (!add_res.is_new && overwrite) {
      RemoveKeyFromIndexesIfNeeded(key, op_args.db_cntx, co, op_args.shard);
    }

    // does not store the values, merely sets the encoding.
    // TODO: why not store the values as well?
    InitSet(vals, &co);
  }

  uint32_t res = 0;

  if (co.Encoding() == kEncodingIntSet) {
    intset* is = (intset*)co.RObjPtr();
    bool success = true;

    for (auto val : vals_it) {
      bool added = false;
      is = IntsetAddSafe(val, is, &success, &added);
      res += added;

      if (!success) {
        co.SetRObjPtr(is);

        StringSet* ss = SetFamily::ConvertToStrSet(is, intsetLen(is));
        if (!ss) {
          return OpStatus::OUT_OF_MEMORY;
        }

        // frees 'is' on a way.
        co.InitRobj(OBJ_SET, kEncodingStrMap2, ss);
        break;
      }
    }

    if (success)
      co.SetRObjPtr(is);
  }

  if (co.Encoding() != kEncodingIntSet) {
    res = StringSetWrapper{co, op_args.db_cntx}.Add(vals, UINT32_MAX, false);
  }

  // TODO: consider optimization to record real command if the replica is in stable_sync state
  // and there is no slot migration process going on.
  if (journal_update && op_args.shard->journal()) {
    if (overwrite) {
      RecordJournal(op_args, "DEL"sv, ArgSlice{key});
    }
    size_t size = visit([](auto& c) { return c.size(); }, vals);
    vector<string_view> mapped(size + 1);
    mapped[0] = key;
    std::copy(vals_it.begin(), vals_it.end(), mapped.begin() + 1);
    RecordJournal(op_args, "SADD"sv, mapped);
  }
  return res;
}

OpResult<uint32_t> OpAddEx(const OpArgs& op_args, string_view key, uint32_t ttl_sec,
                           const NewEntries& vals, bool keepttl) {
  auto& db_slice = op_args.GetDbSlice();

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_SET);
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  CompactObj& co = add_res.it->second;

  if (add_res.is_new) {
    StringSetWrapper::Init(&co);
  } else {
    // Update stats and trigger any handle the old value if needed.
    if (co.Encoding() == kEncodingIntSet) {
      intset* is = (intset*)co.RObjPtr();
      StringSet* ss = SetFamily::ConvertToStrSet(is, intsetLen(is));
      if (!ss) {
        return OpStatus::OUT_OF_MEMORY;
      }
      co.InitRobj(OBJ_SET, kEncodingStrMap2, ss);
    }

    CHECK(IsDenseEncoding(co));
  }

  return StringSetWrapper{co, op_args.db_cntx}.Add(vals, ttl_sec, keepttl);
}

OpResult<uint32_t> OpRem(const OpArgs& op_args, string_view key, const facade::ArgRange& vals,
                         bool journal_rewrite) {
  auto& db_slice = op_args.GetDbSlice();
  auto find_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_SET);
  if (!find_res) {
    return find_res.status();
  }

  CompactObj& co = find_res->it->second;
  auto [removed, isempty] = RemoveSet(op_args.db_cntx, vals, &co);

  find_res->post_updater.Run();

  if (isempty) {
    db_slice.Del(op_args.db_cntx, find_res->it);
  }
  if (removed && journal_rewrite && op_args.shard->journal()) {
    vector<string_view> mapped(vals.Size() + 1);
    mapped[0] = key;
    std::copy(vals.begin(), vals.end(), mapped.begin() + 1);
    RecordJournal(op_args, "SREM"sv, mapped);
  }

  return removed;
}

// For SMOVE. Comprised of 2 transactional steps: Find and Commit.
// After Find Mover decides on the outcome of the operation, applies it in commit
// and reports the result.
class Mover {
 public:
  Mover(string_view src, string_view dest, string_view member, bool journal_rewrite)
      : src_(src), dest_(dest), member_(member), journal_rewrite_(journal_rewrite) {
  }

  void Find(Transaction* t);
  OpResult<unsigned> Commit(Transaction* t);

 private:
  OpStatus OpFind(Transaction* t, EngineShard* es);
  OpStatus OpMutate(Transaction* t, EngineShard* es);

  string_view src_, dest_, member_;
  OpResult<bool> found_[2];
  bool journal_rewrite_;
};

OpStatus Mover::OpFind(Transaction* t, EngineShard* es) {
  auto& db_slice = t->GetDbSlice(es->shard_id());
  ShardArgs largs = t->GetShardArgs(es->shard_id());

  // In case both src and dest are in the same shard, largs size will be 2.
  DCHECK_LE(largs.Size(), 2u);

  for (auto k : largs) {
    unsigned index = (k == src_) ? 0 : 1;
    auto res = db_slice.FindReadOnly(t->GetDbContext(), k, OBJ_SET);
    if (res && index == 0) {  // successful src find.
      DCHECK(!res->is_done());
      const CompactObj& val = res.value()->second;
      SetType st{val.RObjPtr(), val.Encoding()};
      found_[0] = IsInSet(t->GetDbContext(), st, member_);
    } else {
      found_[index] = res.status();
    }
  }

  return OpStatus::OK;
}

OpStatus Mover::OpMutate(Transaction* t, EngineShard* es) {
  ShardArgs largs = t->GetShardArgs(es->shard_id());
  DCHECK_LE(largs.Size(), 2u);

  OpArgs op_args = t->GetOpArgs(es);
  for (auto k : largs) {
    if (k == src_) {
      CHECK_EQ(1u,
               OpRem(op_args, k, ArgSlice{member_}, journal_rewrite_).value());  // must succeed.
    } else {
      DCHECK_EQ(k, dest_);
      OpAdd(op_args, k, ArgSlice(&member_, 1), false, journal_rewrite_);
    }
  }

  return OpStatus::OK;
}

void Mover::Find(Transaction* t) {
  // non-concluding step.
  t->Execute([this](Transaction* t, EngineShard* es) { return this->OpFind(t, es); }, false);
}

OpResult<unsigned> Mover::Commit(Transaction* t) {
  OpResult<unsigned> res;
  bool noop = false;

  if (found_[0].status() == OpStatus::WRONG_TYPE || found_[1].status() == OpStatus::WRONG_TYPE) {
    res = OpStatus::WRONG_TYPE;
    noop = true;
  } else if (!found_[0].value_or(false)) {
    res = 0;
    noop = true;
  } else {
    res = 1;
    noop = (src_ == dest_);
  }

  if (noop) {
    t->Conclude();
  } else {
    t->Execute([this](Transaction* t, EngineShard* es) { return this->OpMutate(t, es); }, true);
  }

  return res;
}

// Read-only OpUnion op on sets.
OpResult<StringVec> OpUnion(const OpArgs& op_args, ShardArgs::Iterator start,
                            ShardArgs::Iterator end) {
  DCHECK(start != end);
  absl::flat_hash_set<string> uniques;

  for (; start != end; ++start) {
    auto find_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, *start, OBJ_SET);
    if (find_res) {
      const PrimeValue& pv = find_res.value()->second;
      if (IsDenseEncoding(pv)) {
        StringSet* ss = (StringSet*)pv.RObjPtr();
        ss->set_time(MemberTimeSeconds(op_args.db_cntx.time_now_ms));
      }
      container_utils::IterateSet(pv, [&uniques](container_utils::ContainerEntry ce) {
        uniques.emplace(ce.ToString());
        return true;
      });
      continue;
    }

    if (find_res.status() != OpStatus::KEY_NOTFOUND) {
      return find_res.status();
    }
  }

  return ToVec(std::move(uniques));
}

// Read-only OpDiff op on sets.
OpResult<StringVec> OpDiff(const OpArgs& op_args, ShardArgs::Iterator start,
                           ShardArgs::Iterator end) {
  auto& db_slice = op_args.GetDbSlice();
  DCHECK(start != end);
  DVLOG(1) << "OpDiff from " << *start;
  auto find_res = db_slice.FindReadOnly(op_args.db_cntx, *start, OBJ_SET);

  if (!find_res) {
    return find_res.status();
  }

  absl::flat_hash_set<string> uniques;
  const PrimeValue& pv = find_res.value()->second;
  if (IsDenseEncoding(pv)) {
    StringSet* ss = (StringSet*)pv.RObjPtr();
    ss->set_time(MemberTimeSeconds(op_args.db_cntx.time_now_ms));
  }

  container_utils::IterateSet(pv, [&uniques](container_utils::ContainerEntry ce) {
    uniques.emplace(ce.ToString());
    return true;
  });

  DCHECK(!uniques.empty());  // otherwise the key would not exist.

  for (++start; start != end; ++start) {
    auto diff_res = db_slice.FindReadOnly(op_args.db_cntx, *start, OBJ_SET);
    if (!diff_res) {
      if (diff_res.status() == OpStatus::WRONG_TYPE) {
        return OpStatus::WRONG_TYPE;
      }
      continue;  // KEY_NOTFOUND
    }

    SetType st2{diff_res.value()->second.RObjPtr(), diff_res.value()->second.Encoding()};
    if (st2.second == kEncodingIntSet) {
      int ii = 0;
      intset* is = (intset*)st2.first;
      int64_t intele;
      char buf[32];

      while (intsetGet(is, ii++, &intele)) {
        char* next = absl::numbers_internal::FastIntToBuffer(intele, buf);
        uniques.erase(string_view{buf, size_t(next - buf)});
      }
    } else {
      DiffStrSet(op_args.db_cntx, st2, &uniques);
    }
  }

  return ToVec(std::move(uniques));
}

// Read-only OpInter op on sets.
OpResult<StringVec> OpInter(const Transaction* t, EngineShard* es, bool remove_first) {
  auto& db_slice = t->GetDbSlice(es->shard_id());
  ShardArgs args = t->GetShardArgs(es->shard_id());
  auto it = args.begin();
  if (remove_first) {
    ++it;
  }
  DCHECK(it != args.end());

  StringVec result;
  if (args.Size() == 1 + unsigned(remove_first)) {
    auto find_res = db_slice.FindReadOnly(t->GetDbContext(), *it, OBJ_SET);
    if (!find_res)
      return find_res.status();

    const PrimeValue& pv = find_res.value()->second;
    if (IsDenseEncoding(pv)) {
      StringSet* ss = (StringSet*)pv.RObjPtr();
      ss->set_time(MemberTimeSeconds(t->GetDbContext().time_now_ms));
    }

    result.reserve(pv.Size());
    container_utils::IterateSet(find_res.value()->second,
                                [&result](container_utils::ContainerEntry ce) {
                                  result.push_back(ce.ToString());
                                  return true;
                                });
    return result;
  }

  vector<SetType> sets(args.Size() - int(remove_first));

  OpStatus status = OpStatus::OK;
  unsigned index = 0;
  for (; it != args.end(); ++it) {
    auto& dest = sets[index++];
    auto find_res = db_slice.FindReadOnly(t->GetDbContext(), *it, OBJ_SET);
    if (!find_res) {
      if (status == OpStatus::OK || status == OpStatus::KEY_NOTFOUND ||
          find_res.status() != OpStatus::KEY_NOTFOUND) {
        status = find_res.status();
      }
      continue;
    }
    const PrimeValue& pv = find_res.value()->second;
    void* ptr = pv.RObjPtr();
    dest = make_pair(ptr, pv.Encoding());
  }

  if (status != OpStatus::OK)
    return status;

  auto comp = [db_contx = t->GetDbContext()](const SetType& left, const SetType& right) {
    return SetTypeLen(db_contx, left) < SetTypeLen(db_contx, right);
  };

  std::sort(sets.begin(), sets.end(), comp);

  int encoding = sets.front().second;
  result.reserve(SetTypeLen(t->GetDbContext(), sets.front()));
  if (encoding == kEncodingIntSet) {
    int ii = 0;
    intset* is = (intset*)sets.front().first;
    int64_t intele;

    while (intsetGet(is, ii++, &intele)) {
      size_t j = 1;
      for (j = 1; j < sets.size(); j++) {
        if (sets[j].first != is && !IsInSet(t->GetDbContext(), sets[j], intele))
          break;
      }

      /* Only take action when all sets contain the member */
      if (j == sets.size()) {
        result.push_back(absl::StrCat(intele));
      }
    }
  } else {
    InterStrSet(t->GetDbContext(), sets, &result);
  }

  return result;
}

OpResult<StringVec> OpRandMember(const OpArgs& op_args, std::string_view key, int count) {
  auto find_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_SET);
  if (!find_res)
    return find_res.status();

  const CompactObj& co = find_res.value()->second;

  const std::uint32_t size = co.Size();
  const bool picks_are_unique = count >= 0;
  const std::uint32_t picks_count =
      picks_are_unique ? std::min(static_cast<std::uint32_t>(count), size) : std::abs(count);

  auto generator = [picks_are_unique, picks_count, size]() -> std::unique_ptr<PicksGenerator> {
    if (picks_are_unique) {
      return std::make_unique<UniquePicksGenerator>(picks_count, size);
    } else {
      return std::make_unique<NonUniquePicksGenerator>(size);
    }
  }();

  return RandMemberSet(op_args.db_cntx, co, *generator, picks_count);
}

// count - how many elements to pop.
OpResult<StringVec> OpPop(const OpArgs& op_args, string_view key, unsigned count) {
  auto& db_cntx = op_args.db_cntx;
  auto& db_slice = op_args.GetDbSlice();
  auto find_res = db_slice.FindMutable(db_cntx, key, OBJ_SET);
  if (!find_res) {
    return find_res.status();
  }

  PrimeValue& co = find_res->it->second;

  const std::uint32_t size = co.Size();
  const std::uint32_t picks_count = std::min(count, size);

  /* CASE 1:
   * The number of requested elements is greater than or equal to
   * the number of elements inside the set: simply return the whole set. */
  if (count >= size) {
    if (IsDenseEncoding(co)) {
      StringSet* ss = (StringSet*)co.RObjPtr();
      ss->set_time(MemberTimeSeconds(op_args.db_cntx.time_now_ms));
    }

    StringVec result;
    result.reserve(picks_count);

    container_utils::IterateSet(co, [&result](container_utils::ContainerEntry ce) {
      result.push_back(ce.ToString());
      return true;
    });

    // Delete the set as it is now empty
    db_slice.DelMutable(op_args.db_cntx, std::move(*find_res));

    // Replicate as DEL.
    if (op_args.shard->journal()) {
      RecordJournal(op_args, "DEL"sv, ArgSlice{key});
    }
    return result;
  }

  /* CASE 2:
   * The number of requested elements is less than the number of elements inside the set.
   * In this case, we need to select random members from the set and then remove them. */
  UniquePicksGenerator generator{picks_count, size};

  // Select random members
  StringVec result = RandMemberSet(db_cntx, co, generator, picks_count);

  // Remove selected members
  auto [removed, is_empty] = RemoveSet(db_cntx, result, &co);
  find_res->post_updater.Run();

  CHECK(!is_empty);

  // Replicate as SREM with removed keys, because SPOP is not deterministic.
  if (removed && op_args.shard->journal()) {
    vector<string_view> mapped(result.size() + 1);
    mapped[0] = key;
    copy(result.begin(), result.end(), mapped.begin() + 1);
    RecordJournal(op_args, "SREM"sv, mapped);
  }

  return result;
}

OpResult<StringVec> OpScan(const OpArgs& op_args, string_view key, uint64_t* cursor,
                           const ScanOpts& scan_op) {
  auto find_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_SET);

  if (!find_res) {
    *cursor = 0;
    return find_res.status();
  }

  auto it = find_res.value();
  StringVec res;

  if (it->second.Encoding() == kEncodingIntSet) {
    intset* is = (intset*)it->second.RObjPtr();
    int64_t intele;
    uint32_t pos = 0;
    while (intsetGet(is, pos++, &intele)) {
      std::string int_str = absl::StrCat(intele);
      if (scan_op.Matches(int_str)) {
        res.push_back(int_str);
      }
    }
    *cursor = 0;
  } else {
    *cursor = StringSetWrapper{it->second, op_args.db_cntx}.Scan(*cursor, scan_op, &res);
  }

  return res;
}

void SendNumeric(OpResult<uint32_t> result, CommandContext* cmd_cntx) {
  switch (result.status()) {
    case OpStatus::OK:
      return cmd_cntx->SendLong(result.value());
    case OpStatus::WRONG_TYPE:
      return cmd_cntx->SendError(kWrongTypeErr);
    default:
      return cmd_cntx->SendLong(0);
  }
}

struct SetReplies {
  explicit SetReplies(CommandContext* cntx)
      : cmd_cntx(cntx), script(cntx->server_conn_cntx()->conn_state.script_info) {
  }

  template <typename T> void Send(vector<T> sv) {
    if (script)  // output is sorted under scripts
      sort(sv.begin(), sv.end());
    auto replier = [vec = std::move(sv)](facade::SinkReplyBuilder* builder) {
      auto* rb = static_cast<RedisReplyBuilder*>(builder);
      rb->SendBulkStrArr(vec, CollectionType::SET);
    };
    cmd_cntx->ReplyWith(std::move(replier));
  }

  void Send(const ResultSetView& rsv) {
    if (!rsv)
      return cmd_cntx->SendError(rsv.status());

    SvArray arr = ToSvArray(rsv.value());
    Send(std::move(arr));
  }

  CommandContext* cmd_cntx;
  bool script;
};

void CmdSAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto values = args.subspan(1);

  auto cb = [key, values](Transaction* t, EngineShard* shard) {
    return OpAdd(t->GetOpArgs(shard), key, values, false, false);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    return cmd_cntx->SendLong(result.value());
  }

  cmd_cntx->SendError(result.status());
}

void CmdSIsMember(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view val = ArgS(args, 1);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    auto find_res = t->GetDbSlice(shard->shard_id()).FindReadOnly(t->GetDbContext(), key, OBJ_SET);

    if (find_res) {
      SetType st{find_res.value()->second.RObjPtr(), find_res.value()->second.Encoding()};
      return IsInSet(t->GetDbContext(), st, val) ? OpStatus::OK : OpStatus::KEY_NOTFOUND;
    }

    return find_res.status();
  };

  OpResult<void> result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  SendNumeric(result ? OpResult<uint32_t>(1) : result.status(), cmd_cntx);
}

void CmdSMIsMember(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto members = args.subspan(1);

  vector<int32_t> memberships(members.size());

  auto cb = [&](Transaction* t, EngineShard* shard) {
    DbContext db_cntx = t->GetDbContext();
    auto find_res = t->GetDbSlice(shard->shard_id()).FindReadOnly(db_cntx, key, OBJ_SET);
    if (find_res) {
      SetType st{(*find_res)->second.RObjPtr(), find_res.value()->second.Encoding()};
      for (size_t i = 0; i < members.size(); ++i)
        memberships[i] = IsInSet(db_cntx, st, ToSV(members[i]));
      ;
      return OpStatus::OK;
    }
    return find_res.status();
  };

  OpResult<void> result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  auto replier = [result, memberships = std::move(memberships)](facade::SinkReplyBuilder* builder) {
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    if (result || result == OpStatus::KEY_NOTFOUND) {
      rb->SendLongArr(absl::MakeConstSpan(memberships));
    } else {
      rb->SendError(result.status());
    }
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

void CmdSMove(CmdArgList args, CommandContext* cmd_cntx) {
  string_view src = ArgS(args, 0);
  string_view dest = ArgS(args, 1);
  string_view member = ArgS(args, 2);

  Mover mover{src, dest, member, true};
  mover.Find(cmd_cntx->tx());

  OpResult<unsigned> result = mover.Commit(cmd_cntx->tx());
  if (!result) {
    return cmd_cntx->SendError(result.status());
  }

  cmd_cntx->SendLong(result.value());
}

void CmdSRem(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto vals = args.subspan(1);

  auto cb = [key, vals](Transaction* t, EngineShard* shard) {
    return OpRem(t->GetOpArgs(shard), key, vals, false);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  SendNumeric(result, cmd_cntx);
}

void CmdSCard(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<uint32_t> {
    auto find_res = t->GetDbSlice(shard->shard_id()).FindReadOnly(t->GetDbContext(), key, OBJ_SET);
    if (!find_res) {
      return find_res.status();
    }

    return find_res.value()->second.Size();
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  SendNumeric(result, cmd_cntx);
}

void CmdSPop(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  unsigned count = 1;
  if (args.size() > 1) {
    string_view arg = ArgS(args, 1);
    if (!absl::SimpleAtoi(arg, &count)) {
      cmd_cntx->SendError(kInvalidIntErr);
      return;
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpPop(t->GetOpArgs(shard), key, count);
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  auto replier = [result = std::move(result),
                  pop_single = (args.size() == 1)](facade::SinkReplyBuilder* builder) {
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    if (result || result.status() == OpStatus::KEY_NOTFOUND) {
      if (pop_single) {  // SPOP key
        if (result.status() == OpStatus::KEY_NOTFOUND) {
          rb->SendNull();
        } else {
          DCHECK_EQ(1u, result.value().size());
          rb->SendBulkString(result.value().front());
        }
      } else {  // SPOP key cnt
        rb->SendBulkStrArr(*result, CollectionType::SET);
      }
      return;
    }

    rb->SendError(result.status());
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

void CmdSDiff(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);
  string_view src_key = ArgS(args, 0);
  ShardId src_shard = Shard(src_key, result_set.size());

  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    if (shard->shard_id() == src_shard) {
      CHECK_EQ(src_key, largs.Front());
      result_set[shard->shard_id()] = OpDiff(t->GetOpArgs(shard), largs.begin(), largs.end());
    } else {
      result_set[shard->shard_id()] = OpUnion(t->GetOpArgs(shard), largs.begin(), largs.end());
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  ResultSetView rsv = DiffResultVec(result_set, src_shard);
  SetReplies{cmd_cntx}.Send(rsv);
}

void CmdSDiffStore(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);
  string_view dest_key = ArgS(args, 0);
  ShardId dest_shard = Shard(dest_key, result_set.size());
  string_view src_key = ArgS(args, 1);
  ShardId src_shard = Shard(src_key, result_set.size());

  VLOG(1) << "SDiffStore " << src_key << " " << src_shard;

  // read-only op
  auto diff_cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    OpArgs op_args = t->GetOpArgs(shard);
    DCHECK(!largs.Empty());
    ShardArgs::Iterator start = largs.begin();
    ShardArgs::Iterator end = largs.end();
    if (shard->shard_id() == dest_shard) {
      CHECK_EQ(*start, dest_key);
      ++start;
      if (start == end)
        return OpStatus::OK;
    }

    if (shard->shard_id() == src_shard) {
      CHECK_EQ(src_key, *start);
      result_set[shard->shard_id()] = OpDiff(op_args, start, end);  // Diff
    } else {
      result_set[shard->shard_id()] = OpUnion(op_args, start, end);  // Union
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(diff_cb), false);
  ResultSetView rsv = DiffResultVec(result_set, src_shard);
  if (!rsv) {
    cmd_cntx->tx()->Conclude();
    cmd_cntx->SendError(rsv.status());
    return;
  }

  size_t result_size = rsv.value().size();
  auto store_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() == dest_shard) {
      OpAdd(t->GetOpArgs(shard), dest_key, std::move(rsv.value()), true, true);
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(store_cb), true);
  cmd_cntx->SendLong(result_size);
}

void CmdSMembers(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [](Transaction* t, EngineShard* shard) { return OpInter(t, shard, false); };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result || result.status() == OpStatus::KEY_NOTFOUND) {
    SetReplies{cmd_cntx}.Send(std::move(*result));
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdSRandMember(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();

  bool is_count = parser.HasNext();
  int count = is_count ? parser.Next<int>() : 1;

  if (parser.HasNext())
    return cmd_cntx->SendError(WrongNumArgsError("SRANDMEMBER"));

  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  const auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<StringVec> {
    return OpRandMember(t->GetOpArgs(shard), key, count);
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);

  auto replier = [is_count, result = std::move(result)](facade::SinkReplyBuilder* builder) {
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    if (result || result == OpStatus::KEY_NOTFOUND) {
      if (is_count) {
        rb->SendBulkStrArr(*result, CollectionType::SET);
      } else if (result->size()) {
        rb->SendBulkString(result->front());
      } else {
        rb->SendNull();
      }
      return;
    }
    rb->SendError(result.status());
  };
  cmd_cntx->ReplyWith(std::move(replier));
}

void CmdSInter(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    result_set[shard->shard_id()] = OpInter(t, shard, false);

    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  OpResult<SvArray> result = InterResultVec(result_set, cmd_cntx->tx()->GetUniqueShardCnt());
  if (result) {
    SetReplies{cmd_cntx}.Send(std::move(*result));
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdSInterStore(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);
  string_view dest_key = ArgS(args, 0);
  ShardId dest_shard = Shard(dest_key, result_set.size());
  atomic_uint32_t inter_shard_cnt{0};

  auto inter_cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    if (shard->shard_id() == dest_shard) {
      CHECK_EQ(largs.Front(), dest_key);
      if (largs.Size() == 1)
        return OpStatus::OK;
    }
    inter_shard_cnt.fetch_add(1, memory_order_relaxed);
    result_set[shard->shard_id()] = OpInter(t, shard, shard->shard_id() == dest_shard);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(inter_cb), false);

  OpResult<SvArray> result = InterResultVec(result_set, inter_shard_cnt.load(memory_order_relaxed));
  if (!result) {
    cmd_cntx->tx()->Conclude();
    cmd_cntx->SendError(result.status());
    return;
  }

  auto store_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() == dest_shard) {
      OpAdd(t->GetOpArgs(shard), dest_key, result.value(), true, true);
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(store_cb), true);
  cmd_cntx->SendLong(result->size());
}

void CmdSInterCard(CmdArgList args, CommandContext* cmd_cntx) {
  unsigned num_keys;
  if (!absl::SimpleAtoi(ArgS(args, 0), &num_keys))
    return cmd_cntx->SendError(kSyntaxErr);

  unsigned limit = 0;
  if (args.size() == (num_keys + 3) && ArgS(args, 1 + num_keys) == "LIMIT") {
    if (!absl::SimpleAtoi(ArgS(args, num_keys + 2), &limit))
      return cmd_cntx->SendError("limit can't be negative");
  } else if (args.size() > (num_keys + 1))
    return cmd_cntx->SendError(kSyntaxErr);

  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    result_set[shard->shard_id()] = OpInter(t, shard, false);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  OpResult<SvArray> result = InterResultVec(result_set, cmd_cntx->tx()->GetUniqueShardCnt(), limit);

  if (result) {
    return cmd_cntx->SendLong(result->size());
  }
  cmd_cntx->SendError(result.status());
}

void CmdSUnion(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size());

  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    result_set[shard->shard_id()] = OpUnion(t->GetOpArgs(shard), largs.begin(), largs.end());
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  ResultSetView unionset = UnionResultVec(result_set);
  SetReplies{cmd_cntx}.Send(unionset);
}

void CmdSUnionStore(CmdArgList args, CommandContext* cmd_cntx) {
  ResultStringVec result_set(shard_set->size(), OpStatus::SKIPPED);
  string_view dest_key = ArgS(args, 0);
  ShardId dest_shard = Shard(dest_key, result_set.size());

  auto union_cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs largs = t->GetShardArgs(shard->shard_id());
    ShardArgs::Iterator start = largs.begin(), end = largs.end();
    if (shard->shard_id() == dest_shard) {
      CHECK_EQ(*start, dest_key);
      ++start;
      if (start == end)
        return OpStatus::OK;
    }
    result_set[shard->shard_id()] = OpUnion(t->GetOpArgs(shard), start, end);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(union_cb), false);

  ResultSetView unionset = UnionResultVec(result_set);
  if (!unionset) {
    cmd_cntx->tx()->Conclude();
    cmd_cntx->SendError(unionset.status());
    return;
  }

  size_t result_size = unionset.value().size();
  auto store_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() == dest_shard) {
      OpAdd(t->GetOpArgs(shard), dest_key, std::move(unionset.value()), true, true);
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(store_cb), true);
  cmd_cntx->SendLong(result_size);
}

void CmdSScan(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view token = ArgS(args, 1);

  uint64_t cursor = 0;

  if (!absl::SimpleAtoi(token, &cursor)) {
    return cmd_cntx->SendError("invalid cursor");
  }

  // SSCAN key cursor [MATCH pattern] [COUNT count]
  if (args.size() > 6) {
    DVLOG(1) << "got " << args.size() << " this is more than it should be";
    return cmd_cntx->SendError(kSyntaxErr);
  }

  OpResult<ScanOpts> ops = ScanOpts::TryFrom(args.subspan(2));
  if (!ops) {
    DVLOG(1) << "SScan invalid args - return " << ops << " to the user";
    return cmd_cntx->SendError(ops.status());
  }

  const ScanOpts& scan_op = ops.value();

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpScan(t->GetOpArgs(shard), key, &cursor, scan_op);
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() != OpStatus::WRONG_TYPE) {
    auto replier = [cursor, result = std::move(result)](facade::SinkReplyBuilder* builder) {
      auto* rb = static_cast<RedisReplyBuilder*>(builder);
      RedisReplyBuilder::ArrayScope scope{rb, 2};
      rb->SendBulkString(absl::StrCat(cursor));
      rb->SendBulkStrArr(*result);
    };
    cmd_cntx->ReplyWith(std::move(replier));
  } else {
    cmd_cntx->SendError(result.status());
  }
}

// Syntax: saddex key [KEEPTTL] ttl_sec member [member...]
void CmdSAddEx(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);

  const std::string_view key = parser.Next<std::string_view>();
  const bool keepttl = parser.Check("KEEPTTL");
  const uint32_t ttl_sec = parser.Next<uint32_t>();

  if (auto err = parser.TakeError(); err) {
    return cmd_cntx->SendError(err.MakeReply());
  }
  constexpr uint32_t kMaxTtl = (1UL << 26);
  if (ttl_sec == 0 || ttl_sec > kMaxTtl) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  CmdArgList vals = parser.Tail();
  if (vals.empty()) {
    return cmd_cntx->SendError(WrongNumArgsError("SADDEX"));
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAddEx(t->GetOpArgs(shard), key, ttl_sec, vals, keepttl);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result) {
    return cmd_cntx->SendLong(result.value());
  }

  cmd_cntx->SendError(result.status());
}

}  // namespace

auto SetFamily::LoadIntSetBlob(std::string_view blob, PrimeValue* pv) -> LoadBlobResult {
  if (!intsetValidateIntegrity((const uint8_t*)blob.data(), blob.size(), 0)) {
    LOG(ERROR) << "Intset integrity check failed.";
    return LoadBlobResult::kCorrupted;
  }

  const intset* is = (const intset*)blob.data();

  unsigned len = intsetLen(is);

  if (len > SetFamily::MaxIntsetEntries()) {
    StringSet* set = SetFamily::ConvertToStrSet(is, len);

    if (!set) {
      LOG(ERROR) << "OOM in ConvertToStrSet " << len;
      return LoadBlobResult::kOutOfMemory;
    }
    pv->InitRobj(OBJ_SET, kEncodingStrMap2, set);
  } else {
    intset* mine = reinterpret_cast<intset*>(CompactObj::memory_resource()->allocate(blob.size()));
    ::memcpy(mine, blob.data(), blob.size());
    pv->InitRobj(OBJ_SET, kEncodingIntSet, mine);
  }

  return LoadBlobResult::kSuccess;
}

auto SetFamily::LoadLPSetBlob(std::string_view blob, PrimeValue* pv) -> LoadBlobResult {
  if (!lpValidateIntegrity((uint8_t*)blob.data(), blob.size(), 0, nullptr, nullptr)) {
    LOG(ERROR) << "ListPack integrity check failed.";
    return LoadBlobResult::kCorrupted;
  }

  unsigned char* lp = (unsigned char*)blob.data();
  StringSet* set = CompactObj::AllocateMR<StringSet>();
  for (unsigned char* cur = lpFirst(lp); cur != nullptr; cur = lpNext(lp, cur)) {
    unsigned char field_buf[LP_INTBUF_SIZE];
    string_view elem = detail::ListpackWrap::GetView(cur, field_buf);
    if (!set->Add(elem)) {
      LOG(ERROR) << "Duplicate member " << elem;
      CompactObj::DeleteMR<StringSet>(set);
      return LoadBlobResult::kCorrupted;
    }
  }
  pv->InitRobj(OBJ_SET, kEncodingStrMap2, set);
  return LoadBlobResult::kSuccess;
}

StringSet* SetFamily::ConvertToStrSet(const intset* is, size_t expected_len) {
  int64_t intele;
  char buf[32];
  int ii = 0;

  StringSet* ss = CompactObj::AllocateMR<StringSet>();
  if (expected_len) {
    ss->Reserve(expected_len);
  }

  while (intsetGet(const_cast<intset*>(is), ii++, &intele)) {
    char* next = absl::numbers_internal::FastIntToBuffer(intele, buf);
    string_view str{buf, size_t(next - buf)};
    CHECK(ss->Add(str));
  }

  return ss;
}

using CI = CommandId;

#define HFUNC(x) SetHandler(&Cmd##x)

void SetFamily::Register(CommandRegistry* registry) {
  registry->StartFamily(acl::SET);
  *registry << CI{"SADD", CO::JOURNALED | CO::FAST | CO::DENYOOM, -3, 1, 1}.HFUNC(SAdd)
            << CI{"SDIFF", CO::READONLY, -2, 1, -1}.HFUNC(SDiff)
            << CI{"SDIFFSTORE", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, -3, 1, -1}.HFUNC(
                   SDiffStore)
            << CI{"SINTER", CO::READONLY, -2, 1, -1}.HFUNC(SInter)
            << CI{"SINTERSTORE", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, -3, 1, -1}.HFUNC(
                   SInterStore)
            << CI{"SINTERCARD", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2}.HFUNC(SInterCard)
            << CI{"SMEMBERS", CO::READONLY, 2, 1, 1}.HFUNC(SMembers)
            << CI{"SISMEMBER", CO::FAST | CO::READONLY, 3, 1, 1}.HFUNC(SIsMember)
            << CI{"SMISMEMBER", CO::FAST | CO::READONLY, -3, 1, 1}.HFUNC(SMIsMember)
            << CI{"SMOVE", CO::FAST | CO::JOURNALED | CO::NO_AUTOJOURNAL, 4, 1, 2}.HFUNC(SMove)
            << CI{"SREM", CO::JOURNALED | CO::FAST, -3, 1, 1}.HFUNC(SRem)
            << CI{"SCARD", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(SCard)
            << CI{"SPOP", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -2, 1, 1}.HFUNC(SPop)
            << CI{"SRANDMEMBER", CO::READONLY, -2, 1, 1}.HFUNC(SRandMember)
            << CI{"SUNION", CO::READONLY, -2, 1, -1}.HFUNC(SUnion)
            << CI{"SUNIONSTORE", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, -3, 1, -1}.HFUNC(
                   SUnionStore)
            << CI{"SSCAN", CO::READONLY, -3, 1, 1}.HFUNC(SScan)
            << CI{"SADDEX", CO::JOURNALED | CO::FAST | CO::DENYOOM, -4, 1, 1}.HFUNC(SAddEx);
}

uint32_t SetFamily::MaxIntsetEntries() {
  return kMaxIntSetEntries;
}

int32_t SetFamily::FieldExpireTime(const DbContext& db_context, const PrimeValue& pv,
                                   std::string_view field) {
  DCHECK_EQ(OBJ_SET, pv.ObjType());

  SetType st{pv.RObjPtr(), pv.Encoding()};
  return GetExpiry(db_context, st, field);
}

vector<long> SetFamily::SetFieldsExpireTime(const OpArgs& op_args, uint32_t ttl_sec,
                                            CmdArgList values, PrimeValue* pv) {
  DCHECK_EQ(OBJ_SET, pv->ObjType());

  if (pv->Encoding() == kEncodingIntSet) {
    // a valid result can never be a intset, since it doesnt keep ttl
    intset* is = (intset*)pv->RObjPtr();
    StringSet* ss = SetFamily::ConvertToStrSet(is, intsetLen(is));
    if (!ss) {
      std::vector<long> out(values.size(), -2);
      return out;
    }
    pv->InitRobj(OBJ_SET, kEncodingStrMap2, ss);
  }

  auto ss = static_cast<StringSet*>(pv->RObjPtr());
  ss->set_time(MemberTimeSeconds(op_args.db_cntx.time_now_ms));
  return ExpireElements(ss, values, ttl_sec);
}

}  // namespace dfly


================================================
FILE: src/server/set_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "facade/facade_types.h"
#include "server/table.h"
#include "server/tx_base.h"

typedef struct intset intset;

namespace dfly {

using facade::OpResult;

class StringSet;

class SetFamily {
 public:
  static void Register(CommandRegistry* registry);

  static LoadBlobResult LoadIntSetBlob(std::string_view blob, PrimeValue* pv);
  static LoadBlobResult LoadLPSetBlob(std::string_view blob, PrimeValue* pv);

  static uint32_t MaxIntsetEntries();

  // Returns nullptr on OOM.
  static StringSet* ConvertToStrSet(const intset* is, size_t expected_len);

  // returns expiry time in seconds since kMemberExpiryBase date.
  // returns -3 if field was not found, -1 if no ttl is associated with the item.
  static int32_t FieldExpireTime(const DbContext& db_context, const PrimeValue& pv,
                                 std::string_view field);

  static std::vector<long> SetFieldsExpireTime(const OpArgs& op_args, uint32_t ttl_sec,
                                               facade::CmdArgList values, PrimeValue* pv);
};

}  // namespace dfly


================================================
FILE: src/server/set_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/set_family.h"

#include "absl/flags/flag.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

extern "C" {
#include "redis/intset.h"
#include "redis/zmalloc.h"
}

ABSL_DECLARE_FLAG(std::string, shard_round_robin_prefix);

using namespace testing;
using namespace std;
using namespace util;
using namespace boost;

namespace dfly {

class SetFamilyTest : public BaseFamilyTest {
 protected:
};

MATCHER_P(ConsistsOfMatcher, elements, "") {
  auto vec = arg.GetVec();
  for (const auto& x : vec) {
    if (elements.find(x.GetString()) == elements.end()) {
      return false;
    }
  }
  return true;
}

auto ConsistsOf(std::initializer_list<std::string> elements) {
  return ConsistsOfMatcher(std::unordered_set<std::string>{elements});
}

TEST_F(SetFamilyTest, SAdd) {
  auto resp = Run({"sadd", "x", "1", "2", "3"});
  EXPECT_THAT(resp, IntArg(3));
  resp = Run({"sadd", "x", "2", "3"});
  EXPECT_THAT(resp, IntArg(0));
  Run({"set", "a", "foo"});
  resp = Run({"sadd", "a", "b"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE "));
  resp = Run({"type", "x"});
  EXPECT_EQ(resp, "set");
}

TEST_F(SetFamilyTest, IntConv) {
  auto resp = Run({"sadd", "x", "134"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"sadd", "x", "abc"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"sadd", "x", "134"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(SetFamilyTest, SUnionStore) {
  auto resp = Run({"sadd", "b", "1", "2", "3"});
  Run({"sadd", "c", "10", "11"});
  Run({"set", "a", "foo"});
  resp = Run({"sunionstore", "a", "b", "c"});

  EXPECT_THAT(resp, IntArg(5));
  resp = Run({"type", "a"});
  ASSERT_EQ(resp, "set");

  resp = Run({"smembers", "a"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("11", "10", "1", "2", "3"));
}

// Check that SUNIONSTORE overwrites a value including resetting its expiration
TEST_F(SetFamilyTest, SUnionStoreExpiration) {
  Run({"sadd", "s1", "a", "b"});
  Run({"sadd", "s2", "c", "d"});

  Run({"set", "target", "some-value"});
  EXPECT_THAT(Run({"expire", "target", "1010"}), IntArg(1));
  EXPECT_THAT(Run({"ttl", "target"}), IntArg(1010));

  EXPECT_THAT(Run({"sunionstore", "target", "s1", "s2"}), IntArg(4));
  EXPECT_THAT(Run({"scard", "target"}), IntArg(4));
  EXPECT_THAT(Run({"ttl", "target"}), IntArg(-1));
}

TEST_F(SetFamilyTest, SDiff) {
  auto resp = Run({"sadd", "b", "1", "2", "3"});
  Run({"sadd", "c", "10", "11"});
  Run({"set", "a", "foo"});

  resp = Run({"sdiff", "b", "c"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("1", "2", "3"));

  resp = Run({"sdiffstore", "a", "b", "c"});
  EXPECT_THAT(resp, IntArg(3));

  Run({"set", "str", "foo"});
  EXPECT_THAT(Run({"sdiff", "b", "str"}), ErrArg("WRONGTYPE "));

  Run({"sadd", "bar", "x", "a", "b", "c"});
  Run({"sadd", "foo", "c"});
  Run({"sadd", "car", "a", "d"});
  EXPECT_EQ(2, CheckedInt({"SDIFFSTORE", "tar", "bar", "foo", "car"}));
}

TEST_F(SetFamilyTest, SInter) {
  auto resp = Run({"sadd", "a", "1", "2", "3", "4"});
  Run({"sadd", "b", "3", "5", "6", "2"});
  resp = Run({"sinterstore", "d", "a", "b"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"smembers", "d"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("3", "2"));

  Run({"set", "y", ""});
  resp = Run({"sinter", "x", "y"});
  ASSERT_EQ(1, GetDebugInfo("IO0").shards_count);
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key"));
  resp = Run({"sinterstore", "none1", "none2"});
  EXPECT_THAT(resp, IntArg(0));

  EXPECT_THAT(Run({"sinter"}), ErrArg("wrong number of arguments"));
}

TEST_F(SetFamilyTest, SInterCard) {
  Run({"sadd", "s1", "2", "b", "1", "a"});
  Run({"sadd", "s2", "3", "c", "2", "b"});
  Run({"sadd", "s3", "2", "b", "3", "c"});

  EXPECT_EQ(2, CheckedInt({"sintercard", "2", "s1", "s2"}));
  EXPECT_EQ(0, CheckedInt({"sintercard", "2", "s1", "s4"}));
  EXPECT_EQ(2, CheckedInt({"sintercard", "2", "s2", "s3", "LIMIT", "2"}));
  EXPECT_EQ(4, CheckedInt({"sintercard", "1", "s1"}));

  auto resp = Run({"sintercard", "a", "s1", "s2"});
  // redis does not throw this message, but SimpleAtoi does
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));
  resp = Run({"sintercard", "2", "s1", "s2", "LIMIT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
  resp = Run({"sintercard", "2", "s1", "s2", "LIMIT", "a"});
  EXPECT_THAT(resp, ErrArg("limit can't be negative"));
  resp = Run({"sintercard", "2", "s1", "s2", "LIMIT", "-1"});
  EXPECT_THAT(resp, ErrArg("limit can't be negative"));
  resp = Run({"sintercard", "2", "s1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
  resp = Run({"sintercard", "-1", "s1"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));
}

TEST_F(SetFamilyTest, SMove) {
  auto resp = Run({"sadd", "a", "1", "2", "3", "4"});
  Run({"sadd", "b", "3", "5", "6", "2"});
  resp = Run({"smove", "a", "b", "1"});
  EXPECT_THAT(resp, IntArg(1));

  Run({"sadd", "x", "a", "b", "c"});
  Run({"sadd", "y", "c"});
  EXPECT_THAT(Run({"smove", "x", "y", "c"}), IntArg(1));
}

TEST_F(SetFamilyTest, SPop) {
  auto resp = Run({"sadd", "x", "1", "2", "3"});
  resp = Run({"spop", "x", "3"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("1", "2", "3"));
  resp = Run({"type", "x"});
  EXPECT_EQ(resp, "none");

  Run({"sadd", "x", "1", "2", "3"});
  resp = Run({"spop", "x", "2"});

  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), IsSubsetOf({"1", "2", "3"}));

  resp = Run({"scard", "x"});
  EXPECT_THAT(resp, IntArg(1));

  Run({"sadd", "y", "a", "b", "c"});
  resp = Run({"spop", "y", "1"});
  EXPECT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, testing::AnyOf("a", "b", "c"));

  resp = Run({"smembers", "y"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), IsSubsetOf({"a", "b", "c"}));

  // Test POP on large set with small pop count
  vector<string> xlarge{"sadd", "xlarge"};
  for (size_t i = 0; i < 100; i++)
    xlarge.push_back(to_string(i));
  Run(absl::MakeSpan(xlarge));

  resp = Run({"spop", "xlarge", "2"});
  {
    auto elems = resp.GetVec();
    EXPECT_NE(elems[0].GetString(), elems[1].GetString());
  }

  resp = Run({"scard", "xlarge"});
  EXPECT_THAT(resp, IntArg(98));
}

TEST_F(SetFamilyTest, SRandMember) {
  // Test IntSet
  Run({"sadd", "x", "1", "2", "3"});

  // Test if count > 0 (IntSet)
  auto resp = Run({"SRandMember", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("1", "2", "3"));

  resp = Run({"SRandMember", "x", "1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("1", "2", "3"));

  resp = Run({"SRandMember", "x", "2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), IsSubsetOf({"1", "2", "3"}));

  resp = Run({"SRandMember", "x", "3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("1", "2", "3"));

  // Test if count is larger than the size of the IntSet
  resp = Run({"SRandMember", "x", "25"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("1", "2", "3"));

  // Test if count < 0 (IntSet)
  resp = Run({"SRandMember", "x", "-1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("1", "2", "3"));

  resp = Run({"SRandMember", "x", "-2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp, ConsistsOf({"1", "2", "3"}));

  resp = Run({"SRandMember", "x", "-3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp, ConsistsOf({"1", "2", "3"}));

  // Test if count < 0, but the absolute value is larger than the size of the IntSet
  resp = Run({"SRandMember", "x", "-25"});
  ASSERT_THAT(resp, ArrLen(25));
  EXPECT_THAT(resp, ConsistsOf({"1", "2", "3"}));

  // Test StrSet
  Run({"sadd", "y", "a", "b", "c"});

  // Test if count > 0 (StrSet)
  resp = Run({"SRandMember", "y"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"SRandMember", "y", "1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"SRandMember", "y", "2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), IsSubsetOf({"a", "b", "c"}));

  resp = Run({"SRandMember", "y", "3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("a", "b", "c"));

  // Test if count is larger than the size of the StrSet
  resp = Run({"SRandMember", "y", "25"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("a", "b", "c"));

  // Test if count < 0 (StrSet)
  resp = Run({"SRandMember", "y", "-1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"SRandMember", "y", "-2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  resp = Run({"SRandMember", "y", "-3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  // Test if count < 0, but the absolute value is larger than the size of the StrSet
  resp = Run({"SRandMember", "y", "-25"});
  ASSERT_THAT(resp, ArrLen(25));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  // Test if count is 0
  ASSERT_THAT(Run({"SRandMember", "x", "0"}), ArrLen(0));

  // Test if set is empty
  EXPECT_THAT(Run({"SAdd", "empty::set", "1"}), IntArg(1));
  EXPECT_THAT(Run({"SRem", "empty::set", "1"}), IntArg(1));
  ASSERT_THAT(Run({"SRandMember", "empty::set", "0"}), ArrLen(0));
  ASSERT_THAT(Run({"SRandMember", "empty::set", "3"}), ArrLen(0));
  ASSERT_THAT(Run({"SRandMember", "empty::set", "-4"}), ArrLen(0));

  // Test if key does not exist
  ASSERT_THAT(Run({"SRandMember", "unknown::set"}), ArgType(RespExpr::NIL));
  ASSERT_THAT(Run({"SRandMember", "unknown::set", "0"}), ArrLen(0));

  // Test wrong arguments
  resp = Run({"SRandMember", "x", "5", "3"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
}

TEST_F(SetFamilyTest, SMIsMember) {
  Run({"sadd", "foo", "a"});
  Run({"sadd", "foo", "b"});

  auto resp = Run({"smismember", "foo"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"smismember", "foo1", "a", "b"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), IntArg(0))));

  resp = Run({"smismember", "foo", "a", "c"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1), IntArg(0))));

  resp = Run({"smismember", "foo", "a", "b"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(1), IntArg(1))));

  resp = Run({"smismember", "foo", "d", "e"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), IntArg(0))));

  resp = Run({"smismember", "foo", "b"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"smismember", "foo", "x"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(SetFamilyTest, Empty) {
  auto resp = Run({"smembers", "x"});
  ASSERT_THAT(resp, ArrLen(0));
}

TEST_F(SetFamilyTest, SScan) {
  auto resp = Run("sscan non-existing-key 100 count 5");
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::ARRAY)));
  EXPECT_EQ(ToSV(resp.GetVec()[0].GetBuf()), "0");
  EXPECT_EQ(StrArray(resp.GetVec()[1]).size(), 0);

  // Test for int set
  for (int i = 0; i < 15; i++) {
    Run({"sadd", "myintset", absl::StrCat(i)});
  }

  // Note that even though this limit by 4, it would return more because
  // all fields are on intlist
  resp = Run({"sscan", "myintset", "0", "count", "4"});
  auto vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec.size(), 15);

  resp = Run({"sscan", "myintset", "0", "match", "1*"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec, UnorderedElementsAre("1", "10", "11", "12", "13", "14"));

  // test string set
  for (int i = 0; i < 15; i++) {
    Run({"sadd", "mystrset", absl::StrCat("str-", i)});
  }

  resp = Run({"sscan", "mystrset", "0", "count", "5"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec.size(), 5);

  resp = Run({"sscan", "mystrset", "0", "match", "str-1*"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec, UnorderedElementsAre("str-1", "str-10", "str-11", "str-12", "str-13", "str-14"));

  resp = Run({"sscan", "mystrset", "0", "match", "str-1*", "count", "3"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec, IsSubsetOf({"str-1", "str-10", "str-11", "str-12", "str-13", "str-14"}));
  EXPECT_EQ(vec.size(), 3);

  // nothing should match this
  resp = Run({"sscan", "mystrset", "0", "match", "1*"});
  vec = StrArray(resp.GetVec()[1]);
  EXPECT_THAT(vec.size(), 0);
}

TEST_F(SetFamilyTest, HugeSScan) {
  for (int i = 0; i < 60000; i += 5) {
    Run({"sadd", "myintset", absl::StrCat(i), absl::StrCat(i + 1), absl::StrCat(i + 2),
         absl::StrCat(i + 3), absl::StrCat(i + 4)});
  }

  auto resp = Run({"sscan", "myintset", "0", "count", "50000"});
  auto vec = StrArray(resp.GetVec()[1]);
  EXPECT_GE(vec.size(), 50000);
}

TEST_F(SetFamilyTest, IntSetMemcpy) {
  // This logic is used in CompactObject::DefragIntSet
  intset* original = intsetNew();
  uint8_t success = 0;
  for (int i = 0; i < 250; ++i) {
    original = intsetAdd(original, i, &success);
    ASSERT_THAT(success, 1);
  }
  const size_t blob_len = intsetBlobLen(original);
  intset* replacement = (intset*)zmalloc(blob_len);
  memcpy(replacement, original, blob_len);

  ASSERT_THAT(original->encoding, replacement->encoding);
  ASSERT_THAT(original->length, replacement->length);

  for (int i = 0; i < 250; ++i) {
    int64_t value;
    ASSERT_THAT(intsetGet(replacement, i, &value), 1);
    ASSERT_THAT(value, i);
  }

  zfree(original);
  zfree(replacement);
}

TEST_F(SetFamilyTest, SAddEx) {
  TEST_current_time_ms = kMemberExpiryBase * 1000;
  EXPECT_THAT(Run({"saddex", "key", "2", "val"}), IntArg(1));
  AdvanceTime(1500);
  EXPECT_THAT(Run({"saddex", "key", "2", "val"}), IntArg(0));
  AdvanceTime(1000);
  EXPECT_EQ(1, CheckedInt({"sismember", "key", "val"}));

  auto resp = Run({"saddex", "k", "one", "v"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // KEEPTTL support. add field orig with TTL=10
  EXPECT_THAT(Run({"saddex", "key", "10", "orig"}), IntArg(1));

  // add fields new and orig with TTL=1 and KEEPTTL=true. orig ttl should be preserved
  EXPECT_THAT(Run({"saddex", "key", "KEEPTTL", "1", "orig", "new"}), IntArg(1));
  EXPECT_LE(CheckedInt({"fieldttl", "key", "new"}), 1);

  // The expiry for orig should be unchanged, at least greater than 5 at this point given some time
  // has passed since we set it to 10
  EXPECT_GT(CheckedInt({"fieldttl", "key", "orig"}), 5);

  // without KEEPTTL the TTL should be overwritten
  EXPECT_THAT(Run({"saddex", "key", "2", "orig", "new"}), IntArg(0));
  EXPECT_LE(CheckedInt({"fieldttl", "key", "orig"}), 2);

  // At least one arg is expected
  EXPECT_THAT(Run({"saddex", "key", "KEEPTTL", "2"}), ErrArg("wrong number of arguments"));
}

TEST_F(SetFamilyTest, CheckSetLinkExpiryTransfer) {
  for (int i = 0; i < 10; i++) {
    EXPECT_THAT(Run(absl::StrCat("SADDEX key 5 ", i)), IntArg(1));
  }
  for (int i = 0; i < 9; i++) {
    Run(absl::StrCat("SREM key ", i));
  }
  EXPECT_THAT(Run("SCARD key"), IntArg(1));
  AdvanceTime(6000);
  Run("SMEMBERS key");
  EXPECT_THAT(Run("SCARD key"), IntArg(0));
}

TEST_F(SetFamilyTest, SetInter_5590) {
  absl::FlagSaver fs;
  SetTestFlag("num_shards", "2");
  num_threads_ = 3;
  SetTestFlag("shard_round_robin_prefix", "prefix-");
  ResetService();

  Run("DEBUG POPULATE 1 prefix- 5 RAND ELEMENTS 5000 TYPE SET");
  Run("SADD prefix-:0 common");
  // shard 0 has 1 key
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 1)));

  Run("SADD prefix-foo bar hello common");
  // shard 1 has 1 key
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(0, 1)));
  EXPECT_THAT(GetShardKeyCount(), Contains(Pair(1, 1)));

  int64_t start = absl::GetCurrentTimeNanos();
  Run("SINTER prefix-foo prefix-:0");
  int64_t end = absl::GetCurrentTimeNanos();
  // Less than 100 ms. Before the fix it took 3seconds.
  EXPECT_LE(end - start, 100000000);
}

}  // namespace dfly


================================================
FILE: src/server/sharding.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/sharding.h"

#include <xxhash.h>

#include "absl/strings/match.h"
#include "base/flags.h"
#include "base/logging.h"
#include "server/cluster_support.h"
#include "server/common.h"
#include "util/fibers/synchronization.h"

using namespace std;

ABSL_FLAG(string, shard_round_robin_prefix, "", "Deprecated -- will be removed");

namespace dfly {
namespace {
// RoundRobinSharder implements a way to distribute keys that begin with some prefix.
// Round-robin is disabled by default. It is not a general use-case optimization, but instead only
// reasonable when there are a few highly contended keys, which we'd like to spread between the
// shards evenly.
// When enabled, the distribution is done via hash table: the hash of the key is used to look into
// a pre-allocated vector. This means that collisions are possible, but are very unlikely if only
// a few keys are used.
// Thread safe.
class RoundRobinSharder {
 public:
  static void Init(uint32_t shard_set_size) {
    round_robin_prefix_ = absl::GetFlag(FLAGS_shard_round_robin_prefix);
    shard_set_size_ = shard_set_size;

    if (IsEnabled()) {
      LOG(WARNING) << "shard_round_robin_prefix is deprecated and will be removed in new versions";
      // ~100k entries will consume 200kb per thread, and will allow 100 keys with < 2.5% collision
      // probability. Since this has a considerable footprint, we only allocate when enabled. We're
      // using a prime number close to 100k for better utilization.
      constexpr size_t kRoundRobinSize = 100'003;
      round_robin_shards_tl_cache_.resize(kRoundRobinSize);
      std::fill(round_robin_shards_tl_cache_.begin(), round_robin_shards_tl_cache_.end(),
                kInvalidSid);

      util::fb2::LockGuard guard(mutex_);
      if (round_robin_shards_.empty()) {
        round_robin_shards_ = round_robin_shards_tl_cache_;
      }
    }
  }

  static bool IsEnabled() {
    return !round_robin_prefix_.empty();
  }

  static optional<ShardId> TryGetShardId(string_view key, XXH64_hash_t key_hash) {
    DCHECK(!round_robin_shards_tl_cache_.empty());

    if (!absl::StartsWith(key, round_robin_prefix_)) {
      return nullopt;
    }

    size_t index = key_hash % round_robin_shards_tl_cache_.size();
    ShardId sid = round_robin_shards_tl_cache_[index];

    if (sid == kInvalidSid) {
      util::fb2::LockGuard guard(mutex_);
      sid = round_robin_shards_[index];
      if (sid == kInvalidSid) {
        sid = next_shard_;
        round_robin_shards_[index] = sid;
        next_shard_ = (next_shard_ + 1) % shard_set_size_;
      }
      round_robin_shards_tl_cache_[index] = sid;
    }

    return sid;
  }

 private:
  static thread_local string round_robin_prefix_;
  static thread_local vector<ShardId> round_robin_shards_tl_cache_;
  static thread_local uint32_t shard_set_size_;
  static vector<ShardId> round_robin_shards_ ABSL_GUARDED_BY(mutex_);
  static ShardId next_shard_ ABSL_GUARDED_BY(mutex_);
  static util::fb2::Mutex mutex_;
};

}  // namespace

thread_local string RoundRobinSharder::round_robin_prefix_;
thread_local uint32_t RoundRobinSharder::shard_set_size_;
thread_local vector<ShardId> RoundRobinSharder::round_robin_shards_tl_cache_;
vector<ShardId> RoundRobinSharder::round_robin_shards_;
ShardId RoundRobinSharder::next_shard_;
util::fb2::Mutex RoundRobinSharder::mutex_;

ShardId Shard(string_view v, ShardId shard_num) {
  // This cluster sharding is not necessary and may degrade keys distribution among shard threads.
  // For example, if we have 3 shards, then no single-char keys will be assigned to shard 2 and
  // 32 single char keys in range ['_' - '~'] will be assigned to shard 0.
  // Yes, SlotId function does not have great distribution properties.
  // On the other side, slot based sharding may help with pipeline squashing optimizations,
  // because they rely on commands being single-sharded.
  // TODO: once we improve our squashing logic, we can remove this.
  if (IsClusterShardedBySlot()) {
    return KeySlot(v) % shard_num;
  }

  if (IsClusterShardedByTag()) {
    v = LockTagOptions::instance().Tag(v);
  }

  XXH64_hash_t hash = XXH64(v.data(), v.size(), 120577240643ULL);

  if (RoundRobinSharder::IsEnabled()) {
    auto round_robin = RoundRobinSharder::TryGetShardId(v, hash);
    if (round_robin.has_value()) {
      return *round_robin;
    }
  }

  return hash % shard_num;
}

namespace sharding {
void InitThreadLocals(uint32_t shard_set_size) {
  RoundRobinSharder::Init(shard_set_size);
}
}  // namespace sharding

}  // namespace dfly


================================================
FILE: src/server/sharding.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string_view>

#include "server/common_types.h"

namespace dfly {

ShardId Shard(std::string_view v, ShardId shard_num);

namespace sharding {
void InitThreadLocals(uint32_t shard_set_size);
}

}  // namespace dfly


================================================
FILE: src/server/slowlog.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/slowlog.h"

#include "base/logging.h"
#include "facade/facade_types.h"

namespace dfly {

using namespace std;

void SlowLogShard::ChangeLength(const size_t new_length) {
  log_entries_.set_capacity(new_length);
}

void SlowLogShard::Reset() {
  log_entries_.clear();
}

void SlowLogShard::Add(const string_view command_name, CmdArgList args,
                       const string_view client_name, const string_view client_ip,
                       uint64_t exec_time_usec, uint64_t unix_ts_usec) {
  DCHECK_GT(log_entries_.capacity(), 0u);

  vector<pair<string, uint32_t>> slowlog_args;
  size_t slowlog_effective_length = args.size();
  if (args.size() > kMaximumSlowlogArgCount) {
    // we store one argument fewer because the last argument is "wasted"
    // for telling how many further arguments there are
    slowlog_effective_length = kMaximumSlowlogArgCount - 1;
  }
  slowlog_args.reserve(slowlog_effective_length);
  slowlog_args.emplace_back(command_name, 0);

  for (size_t i = 0; i < slowlog_effective_length; ++i) {
    string_view arg = facade::ArgS(args, i);
    size_t extra_bytes = 0;
    // If any of the arguments is deemed too long, it will be truncated
    // and the truncated string will be suffixed by the number of truncated bytes in
    // this format: "... (n more bytes)"
    size_t extra_bytes_suffix_length = 0;
    if (arg.size() > kMaximumSlowlogArgLength) {
      extra_bytes = arg.size() - kMaximumSlowlogArgLength;
    }
    slowlog_args.emplace_back(arg.substr(0, kMaximumSlowlogArgLength - extra_bytes_suffix_length),
                              extra_bytes);
  }

  log_entries_.push_back(SlowLogEntry{slowlog_entry_id_++, unix_ts_usec, exec_time_usec,
                                      /* +1 for the command */ args.size() + 1,
                                      std::move(slowlog_args), string(client_ip),
                                      string(client_name)});
}

}  // namespace dfly


================================================
FILE: src/server/slowlog.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <boost/circular_buffer.hpp>
#include <string>
#include <vector>

#include "base/integral_types.h"
#include "facade/facade_types.h"

namespace dfly {

using facade::CmdArgList;

constexpr size_t kMaximumSlowlogArgCount = 31;  // 32 - 1 for the command name
constexpr size_t kMaximumSlowlogArgLength = 128;

struct SlowLogEntry {
  uint32_t entry_id;
  uint64_t unix_ts_usec;
  uint64_t exec_time_usec;
  size_t original_length;
  // a vector of pairs of argument and extra bytes if the argument was truncated
  std::vector<std::pair<std::string, uint32_t>> cmd_args;
  std::string client_ip;
  std::string client_name;
};

class SlowLogShard {
 public:
  boost::circular_buffer<SlowLogEntry>& Entries() {
    return log_entries_;
  }

  void Add(const std::string_view command_name, CmdArgList args, const std::string_view client_name,
           const std::string_view client_ip, uint64_t exec_time_usec, uint64_t unix_ts_usec);
  void Reset();
  void ChangeLength(size_t new_length);

  size_t Length() const {
    return log_entries_.size();
  }

  size_t IsEnabled() const {
    return log_entries_.capacity() > 0;
  }

 private:
  uint32_t slowlog_entry_id_ = 0;

  // TODO: to replace with base::RingBuffer because circular_buffer does not seem to support
  // move semantics.
  boost::circular_buffer<SlowLogEntry> log_entries_;
};
}  // namespace dfly


================================================
FILE: src/server/snapshot.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/snapshot.h"

#include <absl/strings/str_cat.h>

#include <mutex>

#include "base/cycle_clock.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/search/base.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/execution_state.h"
#include "server/journal/journal.h"
#include "server/rdb_extensions.h"
#include "server/rdb_save.h"
#include "server/search/global_hnsw_index.h"
#include "server/server_state.h"
#include "server/tiered_storage.h"
#include "util/fibers/stacktrace.h"
#include "util/fibers/synchronization.h"

ABSL_FLAG(bool, point_in_time_snapshot, true, "If true replication uses point in time snapshoting");
ABSL_FLAG(bool, background_snapshotting, false, "Whether to run snapshot as a background fiber");
ABSL_FLAG(bool, serialize_hnsw_index, false, "Serialize HNSW vector index graph structure");

namespace dfly {

using namespace std;
using namespace util;
using namespace chrono_literals;

using facade::operator""_KB;

namespace {
thread_local absl::flat_hash_set<SliceSnapshot*> tl_slice_snapshots;

// Controls the chunks size for pushing serialized data. The larger the chunk the more CPU
// it may require (especially with compression), and less responsive the server may be.
constexpr size_t kMinBlobSize = 8_KB;

}  // namespace

SliceSnapshot::SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
                             SnapshotDataConsumerInterface* consumer, ExecutionState* cntx,
                             DflyVersion replica_dfly_version)
    : db_slice_(slice),
      db_array_(slice->databases()),
      compression_mode_(compression_mode),
      replica_dfly_version_(replica_dfly_version),
      consumer_(consumer),
      cntx_(cntx) {
  tl_slice_snapshots.insert(this);
}

SliceSnapshot::~SliceSnapshot() {
  DCHECK(db_slice_->shard_owner()->IsMyThread());
  tl_slice_snapshots.erase(this);
}

size_t SliceSnapshot::GetThreadLocalMemoryUsage() {
  size_t mem = 0;
  for (SliceSnapshot* snapshot : tl_slice_snapshots) {
    mem += snapshot->GetBufferCapacity();
  }
  return mem;
}

bool SliceSnapshot::IsSnaphotInProgress() {
  return !tl_slice_snapshots.empty();
}

void SliceSnapshot::Start(bool stream_journal, SnapshotFlush allow_flush) {
  DCHECK(!snapshot_fb_.IsJoinable());

  auto db_cb = [this](DbIndex db_index, const DbSlice::ChangeReq& req) {
    OnDbChange(db_index, req);
  };

  use_background_mode_ = absl::GetFlag(FLAGS_background_snapshotting);
  snapshot_version_ = db_slice_->RegisterOnChange(std::move(db_cb));

  if (stream_journal) {
    use_snapshot_version_ = absl::GetFlag(FLAGS_point_in_time_snapshot);
    journal_cb_id_ = journal::RegisterConsumer(this);
    if (!use_snapshot_version_) {
      auto moved_cb = [this](DbIndex db_index, const DbSlice::MovedItemsVec& items) {
        OnMoved(db_index, items);
      };
      moved_cb_id_ = db_slice_->RegisterOnMove(std::move(moved_cb));
    }
  }

  size_t flush_threshold = 0;
  RdbSerializer::ConsumeFun consume_fun;
  if (allow_flush == SnapshotFlush::kAllow) {
    flush_threshold = ServerState::tlocal()->serialization_max_chunk_size;
    if (flush_threshold != 0) {
      // The callback receives data directly from the serializer, no need to call back into it.
      consume_fun = [this](std::string data) {
        HandleFlushData(std::move(data));
        VLOG(2) << "HandleFlushData via callback";
        ++ServerState::tlocal()->stats.big_value_preemptions;
      };
    }
  }
  serializer_ = std::make_unique<RdbSerializer>(compression_mode_, consume_fun, flush_threshold);

  VLOG(1) << "DbSaver::Start - saving entries with version less than " << snapshot_version_;

  fb2::Fiber::Opts opts{.priority = use_background_mode_ ? fb2::FiberPriority::BACKGROUND
                                                         : fb2::FiberPriority::NORMAL,
                        .name = absl::StrCat("SliceSnapshot-", ProactorBase::me()->GetPoolIndex())};
  snapshot_fb_ = fb2::Fiber(opts, [this, stream_journal] {
    // TODO add error processing for index serialization
    SerializeIndexMappings();
    SerializeGlobalHnswIndices();
    this->IterateBucketsFb(stream_journal);
    db_slice_->UnregisterOnChange(snapshot_version_);
    if (!use_snapshot_version_) {
      db_slice_->UnregisterOnMoved(moved_cb_id_);
    }
    consumer_->Finalize();
    VLOG(1) << "Serialization peak bytes: " << serializer_->GetSerializationPeakBytes();
  });
}

// Called only for replication use-case.
void SliceSnapshot::FinalizeJournalStream(bool cancel) {
  VLOG(1) << "FinalizeJournalStream";
  DCHECK(db_slice_->shard_owner()->IsMyThread());
  if (!journal_cb_id_) {  // Finalize only once.
    // In case of incremental snapshotting in StartIncremental, if an error is encountered,
    // journal_cb_id_ may not be set, but the snapshot fiber is still running.
    snapshot_fb_.JoinIfNeeded();
    return;
  }
  uint32_t cb_id = journal_cb_id_;
  journal_cb_id_ = 0;

  // Wait for serialization to finish in any case.
  snapshot_fb_.JoinIfNeeded();

  journal::UnregisterConsumer(cb_id);
  if (!cancel) {
    // always succeeds because serializer_ flushes to string.
    VLOG(1) << "FinalizeJournalStream lsn: " << journal::GetLsn();
    std::ignore = serializer_->SendJournalOffset(journal::GetLsn());
    PushSerialized(true);
  }
}

// The algorithm is to go over all the buckets and serialize those with
// version < snapshot_version_. In order to serialize each physical bucket exactly once we update
// bucket version to snapshot_version_ once it has been serialized.
// We handle serialization at physical bucket granularity.
// To further complicate things, Table::Traverse covers a logical bucket that may comprise of
// several physical buckets in dash table. For example, items belonging to logical bucket 0
// can reside in buckets 0,1 and stash buckets 56-59.
// PrimeTable::Traverse guarantees an atomic traversal of a single logical bucket,
// it also guarantees 100% coverage of all items that exists when the traversal started
// and survived until it finished.

void SliceSnapshot::SerializeIndexMapping(
    uint32_t shard_id, std::string_view index_name,
    const std::vector<std::pair<std::string, search::DocId>>& mappings) {
  // Format: [RDB_OPCODE_SHARD_DOC_INDEX, shard_id, index_name, mapping_count,
  //          then for each mapping: key_string, doc_id]
  if (auto ec = serializer_->WriteOpcode(RDB_OPCODE_SHARD_DOC_INDEX); ec)
    return;
  if (auto ec = serializer_->SaveLen(shard_id); ec)
    return;
  if (auto ec = serializer_->SaveString(index_name); ec)
    return;
  if (auto ec = serializer_->SaveLen(mappings.size()); ec)
    return;

  for (const auto& [key, doc_id] : mappings) {
    if (auto ec = serializer_->SaveString(key); ec)
      return;
    if (auto ec = serializer_->SaveLen(doc_id); ec)
      return;
  }
  PushSerialized(false);
}

void SliceSnapshot::SerializeIndexMappings() {
#ifdef WITH_SEARCH
  if (SaveMode() == dfly::SaveMode::RDB || !absl::GetFlag(FLAGS_serialize_hnsw_index) ||
      replica_dfly_version_ < DflyVersion::VER6) {
    return;
  }

  // Get all HNSW index names from the global registry
  absl::flat_hash_set<std::string> hnsw_index_names =
      GlobalHnswIndexRegistry::Instance().GetIndexNames();

  auto* indices = db_slice_->shard_owner()->search_indices();
  uint32_t shard_id = db_slice_->shard_owner()->shard_id();

  for (const auto& index_name : hnsw_index_names) {
    auto* index = indices->GetIndex(index_name);
    if (!index) {
      continue;
    }

    auto mappings = index->SerializeKeyIndex();
    if (mappings.empty()) {
      continue;
    }

    SerializeIndexMapping(shard_id, index_name, mappings);
  }
#endif
}

void SliceSnapshot::SerializeGlobalHnswIndices() {
#ifdef WITH_SEARCH
  // Serialize HNSW global indices for shard 0 only
  if (db_slice_->shard_owner()->shard_id() != 0 || SaveMode() == dfly::SaveMode::RDB ||
      !absl::GetFlag(FLAGS_serialize_hnsw_index) || replica_dfly_version_ < DflyVersion::VER6) {
    return;
  }

  auto all_indices = GlobalHnswIndexRegistry::Instance().GetAll();

  // Preallocate buffer for HNSW entry serialization.
  std::vector<uint8_t> tmp_buf;

  for (const auto& [index_key, index] : all_indices) {
    {
      // Acquire a read lock to ensure a consistent snapshot of the graph.
      // While held, Add/Remove calls will defer into the adapter's internal list
      // and will be replayed automatically on the next write operation.
      auto read_lock = index->GetReadLock();

      // Format: [RDB_OPCODE_VECTOR_INDEX, index_name, elements_number,
      //          then for each node: binary encoded entry via SaveHNSWEntry]
      if (auto ec = serializer_->WriteOpcode(RDB_OPCODE_VECTOR_INDEX); ec) {
        continue;
      }
      if (auto ec = serializer_->SaveString(index_key); ec) {
        continue;
      }

      size_t node_count = index->GetNodeCount();
      if (auto ec = serializer_->SaveLen(node_count); ec) {
        continue;
      }

      constexpr size_t kBatchSize = 1000;
      for (size_t i = 0; i < node_count; i += kBatchSize) {
        size_t batch_end = std::min(i + kBatchSize, node_count);
        auto nodes = index->GetNodesRange(i, batch_end);
        for (const auto& node : nodes) {
          tmp_buf.resize(node.TotalSize());
          if (auto ec = serializer_->SaveHNSWEntry(node, absl::MakeSpan(tmp_buf)); ec)
            break;
        }
      }
    }  // read_lock released here

    // Flush after completing entire index to avoid splitting HNSW data across compressed blobs.
    // The HNSW loader expects all nodes for an index to be readable in one pass.
    PushSerialized(false);
  }
#endif
}

// Serializes all the entries with version less than snapshot_version_.
void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
  const uint64_t kCyclesPerJiffy = base::CycleClock::Frequency() >> 16;  // ~15usec.

  for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
    stats_.keys_total += db_slice_->DbSize(db_indx);
  }

  for (DbIndex snapshot_db_index_ = 0; snapshot_db_index_ < db_array_.size();
       ++snapshot_db_index_) {
    if (!cntx_->IsRunning())
      return;

    if (!db_array_[snapshot_db_index_])
      continue;

    PrimeTable* pt = &db_array_[snapshot_db_index_]->prime;
    VLOG(1) << "Start traversing " << pt->size() << " items for index " << snapshot_db_index_;

    do {
      if (!cntx_->IsRunning()) {
        return;
      }

      snapshot_cursor_ = pt->TraverseBuckets(
          snapshot_cursor_,
          [this, &snapshot_db_index_](auto it) { return BucketSaveCb(snapshot_db_index_, it); });

      if (use_background_mode_) {
        // Yielding for background fibers has low overhead if the time slice isn't used up.
        // Do it after every bucket for maximum responsiveness.
        DCHECK(ThisFiber::Priority() == fb2::FiberPriority::BACKGROUND);
        ThisFiber::Yield();
        PushSerialized(false);
      } else {
        if (!PushSerialized(false)) {
          if (!use_background_mode_ && ThisFiber::GetRunningTimeCycles() > kCyclesPerJiffy) {
            ThisFiber::Yield();
          }
        }
      }
    } while (snapshot_cursor_);

    DVLOG(2) << "after loop " << ThisFiber::GetName();
    // Wait for all the outstanding delayed entries and serialize them as well.
    PushDelayedEntries(true, nullptr);
    PushSerialized(true);
  }  // for (dbindex)

  CHECK(!serialize_bucket_running_);
  if (send_full_sync_cut) {
    CHECK(!serializer_->SendFullSyncCut());
    PushSerialized(true);
  }

  // serialized + side_saved must be equal to the total saved.
  VLOG(1) << "Exit SnapshotSerializer loop_serialized: " << stats_.loop_serialized
          << ", side_saved " << stats_.side_saved << ", cbcalls " << stats_.savecb_calls
          << ", journal_saved " << stats_.jounal_changes << ", moved_saved " << stats_.moved_saved
          << ", flushed_under_lock " << stats_.flushed_under_lock;
}

bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
  std::lock_guard guard(big_value_mu_);

  ++stats_.savecb_calls;

  if (use_snapshot_version_) {
    if (it.GetVersion() >= snapshot_version_) {
      // either has been already serialized or added after snapshotting started.
      DVLOG(3) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << " at "
               << it.GetVersion();
      ++stats_.skipped;
      return false;
    }

    db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
                                             snapshot_version_);
  }

  auto* latch = db_slice_->GetLatch();

  // Locking this never preempts. We merely just increment the underline counter such that
  // if SerializeBucket preempts, Heartbeat() won't run because the blocking counter is not
  // zero.
  std::lock_guard latch_guard(*latch);

  stats_.loop_serialized += SerializeBucket(db_index, it, false);

  return false;
}

unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator it,
                                        bool push_tiered) {
  if (use_snapshot_version_) {
    DCHECK_LT(it.GetVersion(), snapshot_version_);
    it.SetVersion(snapshot_version_);
  }

  // traverse physical bucket and write it into string file.
  serialize_bucket_running_ = true;

  unsigned result = 0;

  std::vector<TieredDelayEntryKey> bucket_tiered_keys;
  const bool tiering_enabled = EngineShard::tlocal()->tiered_storage() != nullptr;
  const bool track_tiered_keys = push_tiered && tiering_enabled;

  for (it.AdvanceIfNotOccupied(); !it.is_done(); ++it) {
    ++result;
    // might preempt due to big value serialization.
    SerializeEntry(db_index, it->first, it->second);
    // Track tiered keys to push them with priority after the loop, but only for callbacks.
    if (track_tiered_keys && it->second.IsExternal()) {
      bucket_tiered_keys.emplace_back(db_index, it->first.ToString());
    }
  }

  if (tiering_enabled) {
    // Push tracked tiered keys forcefully. If there are too many delayed entries
    // accumulated we should also push them forcefully.
    const size_t kMaxDelayedEntries = 512;
    PushDelayedEntries(delayed_entries_.size() > kMaxDelayedEntries,
                       track_tiered_keys ? &bucket_tiered_keys : nullptr);
  }

  serialize_bucket_running_ = false;
  return result;
}

void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
  if (pv.IsExternal() && pv.IsCool())
    return SerializeEntry(db_indx, pk, pv.GetCool().record->value);

  time_t expire_time = pk.GetExpireTime();
  uint32_t mc_flags = pv.HasFlag() ? db_slice_->GetMCFlag(db_indx, pk) : 0;

  if (pv.IsExternal()) {
    // TODO: we loose the stickiness attribute by cloning like this PrimeKey.
    SerializeExternal(db_indx, PrimeKey{pk.ToString()}, pv, expire_time, mc_flags);
  } else {
    io::Result<uint8_t> res = serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
    CHECK(res);
    ++type_freq_map_[*res];
  }
}

void SliceSnapshot::HandleFlushData(std::string data) {
  if (data.empty())
    return;

  if (big_value_mu_.is_locked()) {
    ++stats_.flushed_under_lock;
  }
  size_t serialized = data.size();
  uint64_t id = rec_id_++;

  if (use_background_mode_) {
    // Yield after possibly long cpu slice due to compression and serialization
    // before possbile suspension of ConsumeData resets the cpu time of the last slice
    if (ThisFiber::Priority() == fb2::FiberPriority::BACKGROUND)
      ThisFiber::Yield();
    // else: This function is invoked from the journal with regular priority as well.
    // TODO: Mavbe Sleep() to provide write backpressure in advance?
  }

  uint64_t running_cycles = ThisFiber::GetRunningTimeCycles();

  fb2::NoOpLock lk;
  // We create a critical section here that ensures that records are pushed in sequential order.
  // As a result, it is not possible for two fiber producers to push concurrently.
  // If A.id = 5, and then B.id = 6, and both are blocked here, it means that last_pushed_id_ < 4.
  // Once last_pushed_id_ = 4, A will be unblocked, while B will wait until A finishes pushing and
  // update last_pushed_id_ to 5.
  seq_cond_.wait(lk, [&] { return id == this->last_pushed_id_ + 1; });

  // Blocking point.
  consumer_->ConsumeData(std::move(data), cntx_);

  DCHECK_EQ(last_pushed_id_ + 1, id);
  last_pushed_id_ = id;
  seq_cond_.notify_all();

  if (!use_background_mode_) {
    // serializer_->Flush can be quite slow for large values or due to compression, therefore
    // we counter-balance CPU over-usage by sleeping.
    // We measure running_cycles before the preemption points, because they reset the counter.
    uint64_t sleep_usec = (running_cycles * 1000'000 / base::CycleClock::Frequency()) / 2;
    ThisFiber::SleepFor(chrono::microseconds(std::min<uint64_t>(sleep_usec, 2000ul)));
  }

  VLOG(2) << "Pushed with Serialize() " << serialized;
}

size_t SliceSnapshot::FlushSerialized() {
  std::string blob = serializer_->Flush(RdbSerializerBase::FlushState::kFlushEndEntry);
  size_t serialized = blob.size();
  HandleFlushData(std::move(blob));
  return serialized;
}

bool SliceSnapshot::PushSerialized(bool force) {
  if (!force && serializer_->SerializedLen() < kMinBlobSize)
    return false;
  return FlushSerialized();
}

void SliceSnapshot::PushDelayedEntries(bool force,
                                       std::vector<TieredDelayEntryKey>* bucket_tiered_keys) {
  using DelayedEntryIt = decltype(delayed_entries_)::iterator;

  // Serializes a single delayed entry. Resolves the tiered read future, write the
  // key/value and removes the entry from the map.
  auto serialize_entry = [this](DelayedEntryIt it) {
    auto& entry = it->second;
    auto value = entry->value.Get();

    if (!value.has_value()) {
      cntx_->ReportError(make_error_code(errc::io_error),
                         absl::StrCat("Failed to read tiered key: ", entry->key.ToString()));
      return;
    }

    PrimeValue pv{*value};
    auto res = serializer_->SaveEntry(entry->key, pv, entry->expire, entry->mc_flags, entry->dbid);
    CHECK(res);

    delayed_entries_.erase(it);

    // If we have serialized enough data we should push it to avoid building
    // up a large blob in memory.
    PushSerialized(false);
  };

  // When tiered_keys are provided, we should serialize the entries matching the keys.
  if (bucket_tiered_keys) {
    for (const auto& key : *bucket_tiered_keys) {
      if (auto it = delayed_entries_.find(key); it != delayed_entries_.end())
        serialize_entry(it);
    }
  }

  // Serialize the delayed entries that are resolved, or all if force it true.
  for (auto it = delayed_entries_.begin(); it != delayed_entries_.end();) {
    if (!force && !it->second->value.IsResolved()) {
      ++it;
      continue;
    }
    serialize_entry(it++);
  }

  // If we need to serialize all entries (force=true), we should push
  // leftover serialized data after the loop.
  PushSerialized(force);
}

void SliceSnapshot::SerializeExternal(DbIndex db_index, PrimeKey pk, const PrimeValue& pv,
                                      time_t expire_time, uint32_t mc_flags) {
  // We prefer avoid blocking, so we just schedule a tiered read and append
  // it to the delayed entries.
  auto key = pk.ToString();
  auto future = ReadTieredString(db_index, key, pv, EngineShard::tlocal()->tiered_storage());
  auto entry = std::make_unique<TieredDelayedEntry>(db_index, std::move(pk), std::move(future),
                                                    expire_time, mc_flags);
  delayed_entries_.emplace(std::make_pair(db_index, key), std::move(entry));
  ++type_freq_map_[RDB_TYPE_STRING];
}

// Ordering invariant (both modes):
//   For any key K, the replica must receive K's baseline value strictly before any journal entry
//   that mutates K. This is required for baseline-dependent journal entries (e.g., HSET, LPUSH)
//   which cannot be replayed without the prior value.
//
// PIT mode: enforced by serialize-before-mutate. OnDbChange serializes the bucket before the
//   mutation commits; ConsumeJournalChange runs after the mutation on the same fiber, so the
//   baseline is always first. big_value_mu_ prevents interleaving with the traversal fiber's
//   SerializeBucket (which can preempt via consume_fun_).
//
// Non-PIT mode: OnDbChange only acquires big_value_mu_ as a barrier — no serialization. The
//   mutex prevents journaling mutations from slipping in the middle of bucket serialization
//   on the traversal fiber — see ConsumeJournalChange for details. OnMoved handles items
//   displaced across the traversal cursor.
void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
  std::lock_guard guard(big_value_mu_);
  if (use_snapshot_version_) {
    PrimeTable* table = db_slice_->GetTables(db_index).first;
    const PrimeTable::bucket_iterator* bit = req.update();

    if (bit) {
      if (!bit->is_done() && bit->GetVersion() < snapshot_version_) {
        stats_.side_saved += SerializeBucket(db_index, *bit, true);
      }
    } else {
      string_view key = get<string_view>(req.change);
      table->CVCUponInsert(snapshot_version_, key,
                           [this, db_index](PrimeTable::bucket_iterator it) {
                             DCHECK_LT(it.GetVersion(), snapshot_version_);
                             stats_.side_saved += SerializeBucket(db_index, it, true);
                           });
    }
  }
}

bool SliceSnapshot::IsPositionSerialized(DbIndex id, PrimeTable::Cursor cursor) {
  uint8_t depth = db_slice_->GetTables(id).first->depth();

  return id < snapshot_db_index_ ||
         (id == snapshot_db_index_ &&
          (cursor.bucket_id() < snapshot_cursor_.bucket_id() ||
           (cursor.bucket_id() == snapshot_cursor_.bucket_id() &&
            cursor.segment_id(depth) < snapshot_cursor_.segment_id(depth))));
}

void SliceSnapshot::OnMoved(DbIndex id, const DbSlice::MovedItemsVec& items) {
  std::lock_guard barrier(big_value_mu_);
  DCHECK(!use_snapshot_version_);
  for (const auto& item_cursors : items) {
    // If item was moved from a bucket that was serialized to a bucket that was not serialized
    // serialize the moved item.
    const PrimeTable::Cursor& dest = item_cursors.second;
    const PrimeTable::Cursor& source = item_cursors.first;
    if (IsPositionSerialized(id, dest) && !IsPositionSerialized(id, source)) {
      PrimeTable::bucket_iterator bit = db_slice_->GetTables(id).first->CursorToBucketIt(dest);
      ++stats_.moved_saved;
      SerializeBucket(id, bit, true);
    }
  }
}

// big_value_mu_ prevents expiry/eviction DEL journal entries from interleaving with an
// in-progress SaveEntry for a large value. SaveEntry may yield mid-entry (emitting chunks
// across multiple scheduler turns); expiry paths emit DEL via RecordDelete directly,
// bypassing OnDbChange. Without the lock, such a DEL could be written between two chunks
// of the same entry, producing an invalid wire format for the downstream consumer.
//
// Note: even if the protocol were extended to support interleaved chunks, the lock would
// still be required semantically: a DEL journal entry must not be applied on the replica
// while the entry's baseline is still being loaded. The delayed deletion queue proposal
// in the design doc addresses this without a shard-wide lock.
//
// Note: for transaction-driven mutations, baseline-before-journal ordering is already
// guaranteed by call order on the mutation fiber (OnDbChange precedes ConsumeJournalChange);
// big_value_mu_ is not needed for that ordering.
void SliceSnapshot::ConsumeJournalChange(const journal::JournalChangeItem& item) {
  std::lock_guard barrier(big_value_mu_);

  // remove when we support interleaving chunks.
  LOG_IF(DFATAL, serialize_bucket_running_)
      << "Internal error: can not run interleave journal and bucket serialization";
  std::ignore = serializer_->WriteJournalEntry(item.journal_item.data);
  ++stats_.jounal_changes;
}

void SliceSnapshot::ThrottleIfNeeded() {
  PushSerialized(false);
}

size_t SliceSnapshot::GetBufferCapacity() const {
  if (serializer_ == nullptr) {
    return 0;
  }

  return serializer_->GetBufferCapacity();
}

size_t SliceSnapshot::GetTempBuffersSize() const {
  if (serializer_ == nullptr) {
    return 0;
  }

  return serializer_->GetTempBufferSize();
}

RdbSaver::SnapshotStats SliceSnapshot::GetCurrentSnapshotProgress() const {
  return {stats_.loop_serialized + stats_.side_saved, stats_.keys_total};
}

}  // namespace dfly


================================================
FILE: src/server/snapshot.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <deque>

#include "server/db_slice.h"
#include "server/rdb_save.h"
#include "server/synchronization.h"
#include "server/table.h"
#include "server/tiered_storage.h"

namespace dfly {

class ExecutionState;

namespace journal {
struct Entry;
}  // namespace journal

namespace search {
using DocId = uint32_t;
}  // namespace search

// ┌────────────────┐   ┌─────────────┐
// │IterateBucketsFb│   │  OnDbChange │
// └──────┬─────────┘   └─┬───────────┘
//        │               │            OnDbChange forces whole bucket to be
//        ▼               ▼            serialized if iterate didn't reach it yet
// ┌──────────────────────────┐
// │     SerializeBucket      │        Both might fall back to a temporary serializer
// └────────────┬─────────────┘        if default is used on another db index
//              │
//              |                      Socket is left open in journal streaming mode
//              ▼
// ┌──────────────────────────┐          ┌─────────────────────────┐
// │     SerializeEntry       │          │  ConsumeJournalChange   │
// └─────────────┬────────────┘          └────────────┬────────────┘
//               │                                    │
//         PushBytes                                  │   into serializer buffer)
//               │                                    ▼
//               ▼                        ┌──────────────────────────┐
//               ▼                        │     WriteJournalEntry    │
// ┌──────────────────────────────┐       │  (appends journal entry  │
// │     push_cb(buffer)          │       │   into serializer buffer)│
// └──────────────────────────────┘       └──────────────────────────┘

// SliceSnapshot is used for iterating over a shard at a specified point-in-time
// and submitting all values to an output sink.
// In journal streaming mode, the snapshot continues submitting changes
// over the sink until explicitly stopped.
class SliceSnapshot : public journal::JournalConsumerInterface {
 public:
  // Represents a target sink for receiving snapshot data. Specifically designed
  // to send data to RdbSaver wrapping up a file shard or a socket.
  struct SnapshotDataConsumerInterface {
    virtual ~SnapshotDataConsumerInterface() = default;

    // Receives a chunk of snapshot data for processing
    virtual void ConsumeData(std::string data, ExecutionState* cntx) = 0;

    // Finalizes the snapshot writing
    virtual void Finalize() = 0;
  };

  SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
                SnapshotDataConsumerInterface* consumer, ExecutionState* cntx,
                DflyVersion replica_dfly_version);
  ~SliceSnapshot();

  static size_t GetThreadLocalMemoryUsage();
  static bool IsSnaphotInProgress();

  // Initialize snapshot, start bucket iteration fiber, register listeners.
  // In journal streaming mode it needs to be stopped by either Stop or Cancel.
  enum class SnapshotFlush : uint8_t { kAllow, kDisallow };

  void Start(bool stream_journal, SnapshotFlush allow_flush = SnapshotFlush::kDisallow);

  // Finalizes journal streaming writes. Only called for replication.
  // Blocking. Must be called from the Snapshot thread.
  void FinalizeJournalStream(bool cancel);

  // Waits for a regular, non journal snapshot to finish.
  // Called only for non-replication, backups usecases.
  void WaitSnapshotting() {
    snapshot_fb_.JoinIfNeeded();
  }

  const RdbTypeFreqMap& freq_map() const {
    return type_freq_map_;
  }

  // Get different sizes, in bytes. All disjoint.
  size_t GetBufferCapacity() const;
  size_t GetTempBuffersSize() const;

  RdbSaver::SnapshotStats GetCurrentSnapshotProgress() const;

  // Journal listener
  void ConsumeJournalChange(const journal::JournalChangeItem& item);
  void ThrottleIfNeeded();

 private:
  [[maybe_unused]] void SerializeIndexMapping(
      uint32_t shard_id, std::string_view index_name,
      const std::vector<std::pair<std::string, search::DocId>>& mappings);

  // Serialize ShardDocIndex key-to-DocId mappings for all search indices on this shard
  void SerializeIndexMappings();

  // Serialize HNSW global indices for shard 0 only
  void SerializeGlobalHnswIndices();

  // Main snapshotting fiber that iterates over all buckets in the db slice
  // and submits them to SerializeBucket.
  void IterateBucketsFb(bool send_full_sync_cut);

  // Called on traversing cursor by IterateBucketsFb.
  bool BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it);

  // Serialize single bucket.
  // Returns number of serialized entries, updates bucket version to snapshot version.
  unsigned SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator bucket_it,
                           bool push_tracked_tiered_keys);

  // Serialize entry into passed serializer.
  void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv);

  // DbChange listener
  void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);

  // DbSlice moved listener
  void OnMoved(DbIndex db_index, const DbSlice::MovedItemsVec& items);
  bool IsPositionSerialized(DbIndex db_index, PrimeTable::Cursor cursor);

  // Push serializer's internal buffer.
  // Push regardless of buffer size if force is true.
  // Return true if pushed. Can block. Is called from the snapshot thread.
  bool PushSerialized(bool force);
  void SerializeExternal(DbIndex db_index, PrimeKey pk, const PrimeValue& pv, time_t expire_time,
                         uint32_t mc_flags);

  // Handles data provided by RdbSerializer when its internal buffer exceeds the threshold
  // during big value serialization (e.g. huge sets/lists or large strings).
  // The data has already been extracted from the serializer and is owned here, ensuring correct
  // plumbing and making it safe to move.
  void HandleFlushData(std::string data);

  // Used for explicit flushes at safe points (e.g. between entries). Can block.
  size_t FlushSerialized();

  // Tuple <db_index, key> is used as a key to uniquely identify tiered entry on shard.
  using TieredDelayEntryKey = std::pair<DbIndex, std::string>;

  // Serialize delayed entries.
  // If bucket_tiered_keys is provided we should serialize these keys forcefully.
  // Other entries can be serialized if they are resolved, but we don't wait for them unless force
  // is true.
  void PushDelayedEntries(bool force, std::vector<TieredDelayEntryKey>* bucket_tiered_keys);

  DbSlice* db_slice_;
  const DbTableArray db_array_;
  PrimeTable::Cursor snapshot_cursor_;
  DbIndex snapshot_db_index_ = 0;

  std::unique_ptr<RdbSerializer> serializer_;

  // Delayed entries that are waiting for tiered storage reads to complete before they can be
  // serialized.
  absl::flat_hash_map<TieredDelayEntryKey, std::unique_ptr<TieredDelayedEntry>> delayed_entries_;

  // Used for sanity checks.
  bool serialize_bucket_running_ = false;

  util::fb2::Fiber snapshot_fb_;  // IterateEntriesFb
  util::fb2::CondVarAny seq_cond_;

  const CompressionMode compression_mode_;
  RdbTypeFreqMap type_freq_map_;

  // version upper bound for entries that should be saved (not included).
  uint64_t snapshot_version_;
  uint64_t moved_cb_id_ = 0;
  uint32_t journal_cb_id_ = 0;
  uint32_t moved_cb_id = 0;

  bool use_background_mode_ = false;
  bool use_snapshot_version_ = true;
  DflyVersion replica_dfly_version_ = DflyVersion::CURRENT_VER;

  uint64_t rec_id_ = 1, last_pushed_id_ = 0;

  struct Stats {
    size_t loop_serialized = 0;
    size_t skipped = 0;
    size_t side_saved = 0;
    size_t savecb_calls = 0;
    size_t keys_total = 0;
    size_t jounal_changes = 0;
    size_t moved_saved = 0;
    size_t flushed_under_lock = 0;
  } stats_;

  ThreadLocalMutex big_value_mu_;

  SnapshotDataConsumerInterface* consumer_;
  ExecutionState* cntx_;
};

}  // namespace dfly


================================================
FILE: src/server/stats.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/stats.h"

#include <algorithm>

namespace dfly {

#define ADD(x) (x) += o.x

TieredStats& TieredStats::operator+=(const TieredStats& o) {
  static_assert(sizeof(TieredStats) == 168);

  ADD(total_stashes);
  ADD(total_fetches);
  ADD(total_cancels);
  ADD(total_deletes);
  ADD(total_defrags);
  ADD(total_uploads);
  ADD(total_heap_buf_allocs);
  ADD(total_registered_buf_allocs);

  ADD(allocated_bytes);
  ADD(capacity_bytes);

  ADD(pending_read_cnt);
  ADD(pending_stash_cnt);

  ADD(small_bins_cnt);
  ADD(small_bins_entries_cnt);
  ADD(small_bins_filling_bytes);
  ADD(small_bins_filling_entries_cnt);

  ADD(total_stash_overflows);
  ADD(cold_storage_bytes);
  ADD(total_offloading_steps);
  ADD(total_offloading_stashes);

  ADD(clients_throttled);
  ADD(total_clients_throttled);
  return *this;
}

SearchStats& SearchStats::operator+=(const SearchStats& o) {
  static_assert(sizeof(SearchStats) == 24);
  ADD(used_memory);
  ADD(num_entries);

  // Different shards could have inconsistent num_indices values during concurrent operations.
  // This can happen on concurrent index creation.
  // We use max to ensure that the total num_indices is the maximum of all shards.
  num_indices = std::max(num_indices, o.num_indices);
  return *this;
}

#undef ADD

}  // namespace dfly


================================================
FILE: src/server/stats.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstddef>
#include <cstdint>

namespace dfly {

struct TieredStats {
  uint64_t total_stashes = 0;
  uint64_t total_fetches = 0;
  uint64_t total_cancels = 0;
  uint64_t total_deletes = 0;
  uint64_t total_defrags = 0;
  uint64_t total_uploads = 0;
  uint64_t total_registered_buf_allocs = 0;
  uint64_t total_heap_buf_allocs = 0;

  // How many times the system did not perform Stash call due to overloaded disk write queue
  // (disjoint with total_stashes).
  uint64_t total_stash_overflows = 0;
  uint64_t total_offloading_steps = 0;
  uint64_t total_offloading_stashes = 0;

  size_t allocated_bytes = 0;
  size_t capacity_bytes = 0;

  uint32_t pending_read_cnt = 0;
  uint32_t pending_stash_cnt = 0;

  uint64_t small_bins_cnt = 0;
  uint64_t small_bins_entries_cnt = 0;
  size_t small_bins_filling_bytes = 0;
  size_t small_bins_filling_entries_cnt = 0;
  size_t cold_storage_bytes = 0;

  uint64_t clients_throttled = 0;        // current number of throttled clients
  uint64_t total_clients_throttled = 0;  // total number of throttles

  TieredStats& operator+=(const TieredStats&);
};

struct SearchStats {
  size_t used_memory = 0;
  size_t num_indices = 0;
  size_t num_entries = 0;

  SearchStats& operator+=(const SearchStats&);
};

}  // namespace dfly


================================================
FILE: src/server/stream_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/stream_family.h"

#include <absl/cleanup/cleanup.h>
#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>

extern "C" {
#include "redis/redis_aux.h"
#include "redis/stream.h"
#include "redis/zmalloc.h"
}

#include "base/logging.h"
#include "facade/cmd_arg_parser.h"
#include "server/acl/acl_commands_def.h"
#include "server/blocking_controller.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/execution_state.h"
#include "server/family_utils.h"
#include "server/namespaces.h"
#include "server/transaction.h"

namespace dfly {

using namespace facade;
using namespace std;

StreamMemTracker::StreamMemTracker() {
  start_size_ = zmalloc_used_memory_tl;
}

void StreamMemTracker::UpdateStreamSize(PrimeValue& pv) const {
  const size_t current = zmalloc_used_memory_tl;
  int64_t diff = static_cast<int64_t>(current) - static_cast<int64_t>(start_size_);
  pv.AddStreamSize(diff);
  // Under any flow we must not end up with this special value.
  DCHECK(pv.MallocUsed() != 0);
}

namespace {

// ---------------------------------------------------------------------------
// Stream helper functions (only used within stream_family)
// ---------------------------------------------------------------------------

/* Set 'id' to be its successor stream ID.
 * If 'id' is the maximal possible id, it is wrapped around to 0-0 and C_ERR
 * is returned. */
int StreamIncrID(streamID* id) {
  int ret = C_OK;
  if (id->seq == UINT64_MAX) {
    if (id->ms == UINT64_MAX) {
      id->ms = id->seq = 0;
      ret = C_ERR;
    } else {
      id->ms++;
      id->seq = 0;
    }
  } else {
    id->seq++;
  }
  return ret;
}

/* Set 'id' to be its predecessor stream ID.
 * If 'id' is the minimal possible id, it resets to UINT64_MAX and C_ERR is returned. */
int StreamDecrID(streamID* id) {
  int ret = C_OK;
  if (id->seq == 0) {
    if (id->ms == 0) {
      id->ms = id->seq = UINT64_MAX;
      ret = C_ERR;
    } else {
      id->ms--;
      id->seq = UINT64_MAX;
    }
  } else {
    id->seq--;
  }
  return ret;
}

/* Returns non-zero if the ID is 0-0. */
int StreamIDEqZero(streamID* id) {
  return !(id->ms || id->seq);
}

/* Returns non-zero if the range from 'start' to 'end' contains a tombstone. */
int StreamRangeHasTombstones(stream* s, streamID* start, streamID* end) {
  streamID start_id, end_id;

  if (!s->length || StreamIDEqZero(&s->max_deleted_entry_id)) {
    return 0;
  }

  if (start) {
    start_id = *start;
  } else {
    start_id.ms = 0;
    start_id.seq = 0;
  }

  if (end) {
    end_id = *end;
  } else {
    end_id.ms = UINT64_MAX;
    end_id.seq = UINT64_MAX;
  }

  if (streamCompareID(&start_id, &s->max_deleted_entry_id) <= 0 &&
      streamCompareID(&s->max_deleted_entry_id, &end_id) <= 0) {
    return 1;
  }
  return 0;
}

int64_t StreamTrim(stream* s, streamAddTrimArgs* args);  // defined below

/* Trims a stream by length. Returns the number of deleted items. */
int64_t StreamTrimByLength(stream* s, long long maxlen, int approx) {
  streamAddTrimArgs args = {};
  args.trim_strategy = TRIM_STRATEGY_MAXLEN;
  args.approx_trim = approx;
  args.limit = approx ? 100 * server.stream_node_max_entries : 0;
  args.maxlen = maxlen;
  return StreamTrim(s, &args);
}

/* Trims a stream by minimum ID. Returns the number of deleted items. */
int64_t StreamTrimByID(stream* s, streamID minid, int approx) {
  streamAddTrimArgs args = {};
  args.trim_strategy = TRIM_STRATEGY_MINID;
  args.approx_trim = approx;
  args.limit = approx ? 100 * server.stream_node_max_entries : 0;
  args.minid = minid;
  return StreamTrim(s, &args);
}

/* Return 1 if 'id' exists in 's' (and not marked as deleted). */
int StreamEntryExists(stream* s, streamID* id) {
  streamIterator si;
  streamIteratorStart(&si, s, id, id, 0);
  streamID myid;
  int64_t numfields;
  int found = streamIteratorGetID(&si, &myid, &numfields);
  streamIteratorStop(&si);
  if (!found)
    return 0;
  serverAssert(streamCompareID(id, &myid) == 0);
  return 1;
}

int64_t LpGetInteger(uint8_t* ele) {
  int64_t v = 0;
  int res = lpGetInteger(ele, &v);
  DCHECK(res != 0);
  return v;
}

void StreamIteratorRemoveEntry(streamIterator* si, streamID* current) {
  uint8_t* lp = static_cast<uint8_t*>(si->lp);
  int64_t aux;

  int64_t flags = LpGetInteger(si->lp_flags);
  flags |= STREAM_ITEM_FLAG_DELETED;
  lp = lpReplaceInteger(lp, &si->lp_flags, flags);

  uint8_t* p = lpFirst(lp);
  aux = LpGetInteger(p);

  if (aux == 1) {
    lpFree(lp);
    checkedRaxRemove(si->stream->rax, si->ri.key, si->ri.key_len, NULL);
  } else {
    lp = lpReplaceInteger(lp, &p, aux - 1);
    p = lpNext(lp, p);
    aux = LpGetInteger(p);
    lp = lpReplaceInteger(lp, &p, aux + 1);
    if (si->lp != lp)
      raxInsert(si->stream->rax, si->ri.key, si->ri.key_len, lp, NULL);
    CHECK_GT(lpBytes(lp), 0u);
  }

  si->stream->length--;

  streamID start, end;
  if (si->rev) {
    streamDecodeID(si->start_key, &start);
    end = *current;
  } else {
    start = *current;
    streamDecodeID(si->end_key, &end);
  }
  streamIteratorStop(si);
  streamIteratorStart(si, si->stream, &start, &end, si->rev);
}

/* Delete the specified item ID from the stream, returning 1 if deleted. */
int StreamDeleteItem(stream* s, streamID* id) {
  int deleted = 0;
  streamIterator si;
  streamIteratorStart(&si, s, id, id, 0);
  streamID myid;
  int64_t numfields;
  if (streamIteratorGetID(&si, &myid, &numfields)) {
    StreamIteratorRemoveEntry(&si, &myid);
    deleted = 1;
  }
  streamIteratorStop(&si);
  return deleted;
}

/* Get the last valid (non-tombstone) streamID of 's'. */
void StreamLastValidID(stream* s, streamID* maxid) {
  streamIterator si;
  streamIteratorStart(&si, s, NULL, NULL, 1);
  int64_t numfields;
  if (!streamIteratorGetID(&si, maxid, &numfields) && s->length)
    serverPanic("Corrupt stream, length is %llu, but no max id", (unsigned long long)s->length);
  streamIteratorStop(&si);
}

/* Calculate the lag for a consumer group. */
long long StreamCGLag(stream* s, streamCG* cg) {
  int valid = 0;
  long long lag = 0;

  if (!s->entries_added) {
    lag = 0;
    valid = 1;
  } else if (cg->entries_read != SCG_INVALID_ENTRIES_READ &&
             !StreamRangeHasTombstones(s, &cg->last_id, NULL)) {
    lag = (long long)s->entries_added - cg->entries_read;
    valid = 1;
  } else {
    long long entries_read = streamEstimateDistanceFromFirstEverEntry(s, &cg->last_id);
    if (entries_read != SCG_INVALID_ENTRIES_READ) {
      lag = (long long)s->entries_added - entries_read;
      valid = 1;
    }
  }

  if (valid) {
    return lag;
  }
  return SCG_INVALID_LAG;
}

/* Lookup the consumer group in the specified stream. */
streamCG* StreamLookupCG(stream* s, sds groupname) {
  if (s->cgroups == NULL)
    return NULL;
  void* cg = NULL;
  raxFind(s->cgroups, (unsigned char*)groupname, sdslen(groupname), &cg);
  return static_cast<streamCG*>(cg);
}

/* Lookup a consumer by name in the group 'cg'. */
streamConsumer* StreamLookupConsumer(streamCG* cg, sds name) {
  if (cg == NULL)
    return NULL;
  void* consumer = NULL;
  raxFind(cg->consumers, (unsigned char*)name, sdslen(name), &consumer);
  return static_cast<streamConsumer*>(consumer);
}

/* Delete the specified consumer from consumer group 'cg'. */
void StreamDelConsumer(streamCG* cg, streamConsumer* consumer) {
  raxIterator ri;
  raxStart(&ri, consumer->pel);
  raxSeek(&ri, "^", NULL, 0);
  while (raxNext(&ri)) {
    streamNACK* nack = static_cast<streamNACK*>(ri.data);
    raxRemove(cg->pel, ri.key, ri.key_len, NULL);
    streamFreeNACK(nack);
  }
  raxStop(&ri);

  raxRemove(cg->consumers, (unsigned char*)consumer->name, sdslen(consumer->name), NULL);
  raxFree(consumer->pel);
  sdsfree(consumer->name);
  zfree(consumer);
}

/* Get the stream ID of the edge (first or last) entry in a listpack node.
 * Returns 1 if found, 0 if the listpack is empty or invalid. */
int LpGetEdgeStreamID(uint8_t* lp, int first, streamID* master_id, streamID* edge_id) {
  if (lp == NULL)
    return 0;

  uint8_t* lp_ele;
  if (first) {
    lp_ele = lpFirst(lp);
    lp_ele = lpNext(lp, lp_ele);  // skip entry count
    lp_ele = lpNext(lp, lp_ele);  // skip deleted count
    int64_t master_fields_count = LpGetInteger(lp_ele);
    lp_ele = lpNext(lp, lp_ele);  // seek first field
    for (int64_t i = 0; i < master_fields_count; i++)
      lp_ele = lpNext(lp, lp_ele);
    lp_ele = lpNext(lp, lp_ele);
    if (lp_ele == NULL)
      return 0;
  } else {
    lp_ele = lpLast(lp);
    int64_t lp_count = LpGetInteger(lp_ele);
    if (lp_count == 0)
      return 0;
    while (lp_count--)
      lp_ele = lpPrev(lp, lp_ele);
  }

  lp_ele = lpNext(lp, lp_ele);  // seek ID (lp_ele points to 'flags')
  streamID id = *master_id;
  id.ms += LpGetInteger(lp_ele);
  lp_ele = lpNext(lp, lp_ele);
  id.seq += LpGetInteger(lp_ele);
  *edge_id = id;
  return 1;
}

/* Trim the stream 's' according to args->trim_strategy, and return the
 * number of elements removed from the stream. The 'approx' option, if non-zero,
 * specifies that the trimming must be performed in a approximated way in
 * order to maximize performances. This means that the stream may contain
 * entries with IDs < 'id' in case of MINID (or more elements than 'maxlen'
 * in case of MAXLEN), and elements are only removed if we can remove
 * a *whole* node of the radix tree. The elements are removed from the head
 * of the stream (older elements).
 *
 * The function may return zero if:
 *
 * 1) The minimal entry ID of the stream is already < 'id' (MINID); or
 * 2) The stream is already shorter or equal to the specified max length (MAXLEN); or
 * 3) The 'approx' option is true and the head node did not have enough elements
 *    to be deleted.
 *
 * args->limit is the maximum number of entries to delete. The purpose is to
 * prevent this function from taking to long.
 * If 'limit' is 0 then we do not limit the number of deleted entries.
 * Much like the 'approx', if 'limit' is smaller than the number of entries
 * that should be trimmed, there is a chance we will still have entries with
 * IDs < 'id' (or number of elements >= maxlen in case of MAXLEN).
 */
int64_t StreamTrim(stream* s, streamAddTrimArgs* args) {
  size_t maxlen = args->maxlen;
  streamID* id = &args->minid;
  int approx = args->approx_trim;
  int64_t limit = args->limit;
  int trim_strategy = args->trim_strategy;

  if (trim_strategy == TRIM_STRATEGY_NONE)
    return 0;

  raxIterator ri;
  raxStart(&ri, s->rax);
  raxSeek(&ri, "^", NULL, 0);

  int64_t deleted = 0;
  while (raxNext(&ri)) {
    if (trim_strategy == TRIM_STRATEGY_MAXLEN && s->length <= maxlen)
      break;

    uint8_t* lp = static_cast<uint8_t*>(ri.data);
    CHECK_GT(lpBytes(lp), 0u);
    uint8_t* p = lpFirst(lp);
    int64_t entries = LpGetInteger(p);

    if (limit && (deleted + entries) > limit)
      break;

    int remove_node;
    streamID master_id = {0, 0};
    if (trim_strategy == TRIM_STRATEGY_MAXLEN) {
      remove_node = s->length - entries >= maxlen;
    } else {
      streamDecodeID(ri.key, &master_id);
      streamID last_id = {0, 0};
      LpGetEdgeStreamID(lp, 0, &master_id, &last_id);
      remove_node = streamCompareID(&last_id, id) < 0;
    }

    if (remove_node) {
      lpFree(lp);
      checkedRaxRemove(s->rax, ri.key, ri.key_len, NULL);
      raxSeek(&ri, ">=", ri.key, ri.key_len);
      s->length -= entries;
      deleted += entries;
      continue;
    }

    if (approx)
      break;

    int64_t deleted_from_lp = 0;
    p = lpNext(lp, p);  // skip deleted field
    p = lpNext(lp, p);  // skip num-of-fields

    int64_t master_fields_count = LpGetInteger(p);
    p = lpNext(lp, p);
    for (int64_t j = 0; j < master_fields_count; j++)
      p = lpNext(lp, p);
    p = lpNext(lp, p);  // skip zero master entry terminator

    while (p) {
      uint8_t* pcopy = p;
      int64_t flags = LpGetInteger(p);
      p = lpNext(lp, p);
      int64_t to_skip;

      int64_t ms_delta = LpGetInteger(p);
      p = lpNext(lp, p);
      int64_t seq_delta = LpGetInteger(p);
      p = lpNext(lp, p);

      streamID currid = {0, 0};
      if (trim_strategy == TRIM_STRATEGY_MINID) {
        currid.ms = master_id.ms + ms_delta;
        currid.seq = master_id.seq + seq_delta;
      }

      int stop;
      if (trim_strategy == TRIM_STRATEGY_MAXLEN) {
        stop = s->length <= maxlen;
      } else {
        stop = streamCompareID(&currid, id) >= 0;
      }
      if (stop)
        break;

      if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) {
        to_skip = master_fields_count;
      } else {
        to_skip = LpGetInteger(p);
        p = lpNext(lp, p);
        to_skip *= 2;
      }

      while (to_skip--)
        p = lpNext(lp, p);
      p = lpNext(lp, p);

      if (!(flags & STREAM_ITEM_FLAG_DELETED)) {
        intptr_t delta = p - lp;
        flags |= STREAM_ITEM_FLAG_DELETED;
        lp = lpReplaceInteger(lp, &pcopy, flags);
        deleted_from_lp++;
        s->length--;
        p = lp + delta;
      }
    }
    deleted += deleted_from_lp;

    p = lpFirst(lp);
    lp = lpReplaceInteger(lp, &p, entries - deleted_from_lp);
    p = lpNext(lp, p);
    int64_t marked_deleted = LpGetInteger(p);
    lp = lpReplaceInteger(lp, &p, marked_deleted + deleted_from_lp);

    raxInsert(s->rax, ri.key, ri.key_len, lp, NULL);
    CHECK_GT(lpBytes(lp), 0u);
    break;
  }
  raxStop(&ri);

  if (s->length == 0) {
    s->first_id.ms = 0;
    s->first_id.seq = 0;
  } else if (deleted) {
    streamGetEdgeID(s, 1, 1, &s->first_id);
  }

  return deleted;
}

void FreeConsumerVoid(void* sc_) {
  streamConsumer* sc = static_cast<streamConsumer*>(sc_);
  raxFree(sc->pel);
  sdsfree(sc->name);
  zfree(sc);
}

void StreamFreeCG(streamCG* cg) {
  raxFreeWithCallback(cg->pel, zfree);
  raxFreeWithCallback(cg->consumers, FreeConsumerVoid);
  zfree(cg);
}

// ---------------------------------------------------------------------------

struct Record {
  streamID id;
  vector<pair<string, string>> kv_arr;
  uint64_t delivery_time = 0;
};

using RecordVec = vector<Record>;

using nonstd::make_unexpected;

template <typename T> using ParseResult = io::Result<T, ErrorReply>;

nonstd::unexpected_type<ErrorReply> CreateSyntaxError(std::string_view message) {
  return make_unexpected(ErrorReply{message, kSyntaxErrType});
}

struct ParsedStreamId {
  streamID val;

  // Was an ID different than "ms-*" specified? for XADD only.
  bool has_seq = false;
  // Was an ID different than "*" specified? for XADD only.
  bool id_given = false;

  // Whether to lookup messages after the last ID in the stream. Used for XREAD
  // when using ID '$'.
  bool resolve_last_id = false;
};

struct RangeId {
  ParsedStreamId parsed_id;
  bool exclude = false;
};

struct TrimOpts {
  static constexpr int32_t kNoTrimLimit = -1;

  bool HasLimit() const {
    return limit != kNoTrimLimit;
  }

  bool IsMaxLen() const {
    return std::holds_alternative<uint32_t>(length_or_id);
  }

  uint32_t AsMaxLen() const {
    return std::get<uint32_t>(length_or_id);
  }

  const ParsedStreamId& AsMinId() const {
    return std::get<ParsedStreamId>(length_or_id);
  }

  // First is MaxLen, second is MinId.
  std::variant<uint32_t, ParsedStreamId> length_or_id;
  int32_t limit = kNoTrimLimit;
  bool approx = false;
};

struct AddOpts {
  std::optional<TrimOpts> trim_opts;
  ParsedStreamId parsed_id;
  bool no_mkstream = false;
};

/* Used to journal the XADD command.
   The actual stream ID assigned after adding may differ from the one specified in the command.
   So, for the replica, we need to specify the exact ID that was actually added. */
struct AddArgsJournaler {
  void SetStreamId(std::string_view stream_id) {
    add_args[stream_id_index] = stream_id;
  }

  CmdArgVec add_args;
  size_t stream_id_index;
};

struct NACKInfo {
  streamID pel_id;
  string consumer_name;
  size_t delivery_time;
  size_t delivery_count;
};

struct ConsumerInfo {
  string name;
  mstime_t seen_time;
  mstime_t active_time;
  size_t pel_count;
  vector<NACKInfo> pending;
  size_t idle;
};

struct GroupInfo {
  string name;
  size_t consumer_size;
  size_t pending_size;
  streamID last_id;
  int64_t entries_read;
  int64_t lag;
  vector<NACKInfo> stream_nack_vec;
  vector<ConsumerInfo> consumer_info_vec;
};

using GroupInfoVec = vector<GroupInfo>;

struct StreamInfo {
  size_t length;
  size_t radix_tree_keys;
  size_t radix_tree_nodes;
  size_t groups;
  streamID recorded_first_entry_id;
  streamID last_generated_id;
  streamID max_deleted_entry_id;
  size_t entries_added;
  Record first_entry;
  Record last_entry;
  vector<Record> entries;
  GroupInfoVec cgroups;
};

enum class StreamAccessKind { kNone, kSequential, kRandom, kFetchAll };

struct RangeOpts {
  ParsedStreamId start;
  ParsedStreamId end;
  bool is_rev = false;
  uint32_t count = kuint32max;

  // readgroup range fields
  streamCG* group = nullptr;
  streamConsumer* consumer = nullptr;
  bool noack = false;

  StreamAccessKind access_kind = StreamAccessKind::kRandom;
};

void RecordStreamAccess(const OpArgs& op_args, StreamAccessKind kind) {
  auto& stats = op_args.shard->stats();
  switch (kind) {
    case StreamAccessKind::kNone:
      // No-op: skip metrics recording for internal calls
      break;
    case StreamAccessKind::kSequential:
      stats.stream_sequential_accesses++;
      break;
    case StreamAccessKind::kRandom:
      stats.stream_random_accesses++;
      break;
    case StreamAccessKind::kFetchAll:
      stats.stream_fetch_all_accesses++;
      break;
  }
}

struct StreamIDsItem {
  ParsedStreamId id;

  // Readgroup fields - id and group-consumer pair is exclusive.
  streamCG* group = nullptr;
  streamConsumer* consumer = nullptr;
  bool serve_history = false;
  bool is_consumer_new = false;
};

struct ReadOpts {
  // Contains a mapping from stream name to the starting stream ID.
  unordered_map<string_view, StreamIDsItem> stream_ids;
  // Contains the maximum number of entries to return for each stream.
  uint32_t count = kuint32max;
  // Contains the time to block waiting for entries, or -1 if should not block.
  int64_t timeout = -1;
  size_t streams_arg = 0;

  // readgroup fields
  bool read_group = false;
  string_view group_name;
  string_view consumer_name;
  bool noack = false;
};

const char kTrimOptionConflictErr[] =
    "MAXLEN and MINID options at the same time are not compatible";
const char kInvalidStreamId[] = "Invalid stream ID specified as stream command argument";
const char kXGroupKeyNotFound[] =
    "The XGROUP subcommand requires the key to exist. "
    "Note that for CREATE you may want to use the MKSTREAM option to create "
    "an empty stream automatically.";
const char kSameStreamFound[] = "Same stream specified multiple time";

const uint32_t STREAM_LISTPACK_MAX_SIZE = 1 << 30;
const uint32_t kStreamNodeMaxBytes = 4096;
const uint32_t kStreamNodeMaxEntries = 100;
const uint32_t STREAM_LISTPACK_MAX_PRE_ALLOCATE = 4096;

string StreamIdRepr(const streamID& id) {
  return absl::StrCat(id.ms, "-", id.seq);
};

facade::ErrorReply NoGroupError(string_view key, string_view cgroup) {
  return facade::ErrorReply(
      absl::StrCat("-NOGROUP No such consumer group '", cgroup, "' for key name '", key, "'"),
      kNoGroupErrType);
}

facade::ErrorReply NoGroupOrKey(string_view key, string_view cgroup, string_view suffix = "") {
  return facade::ErrorReply(
      absl::StrCat("-NOGROUP No such key '", key, "'", " or consumer group '", cgroup, "'", suffix),
      kNoGroupErrType);
}

string LeqTopIdError(string_view cmd_name) {
  return absl::StrCat("The ID specified in ", cmd_name,
                      " is equal or smaller than the target stream top item");
}

inline const uint8_t* SafePtr(MutableSlice field) {
  return field.empty() ? reinterpret_cast<const uint8_t*>("")
                       : reinterpret_cast<const uint8_t*>(field.data());
}

bool ParseID(string_view strid, bool strict, uint64_t missing_seq, ParsedStreamId* dest) {
  if (strid.empty() || strid.size() > 127)
    return false;

  if (strid == "*")
    return true;

  dest->id_given = true;
  dest->has_seq = true;

  /* Handle the "-" and "+" special cases. */
  if (strid == "-" || strid == "+") {
    if (strict)
      return false;

    if (strid == "-") {
      dest->val.ms = 0;
      dest->val.seq = 0;
      return true;
    }

    dest->val.ms = UINT64_MAX;
    dest->val.seq = UINT64_MAX;
    return true;
  }

  /* Parse <ms>-<seq> form. */
  streamID result{.ms = 0, .seq = missing_seq};

  size_t dash_pos = strid.find('-');
  if (!absl::SimpleAtoi(strid.substr(0, dash_pos), &result.ms))
    return false;

  if (dash_pos != string_view::npos) {
    if (dash_pos + 1 == strid.size())
      return false;

    if (dash_pos + 2 == strid.size() && strid[dash_pos + 1] == '*') {
      result.seq = 0;
      dest->has_seq = false;
    } else if (!absl::SimpleAtoi(strid.substr(dash_pos + 1), &result.seq)) {
      return false;
    }
  }

  dest->val = result;

  return true;
}

enum class RangeBoundary { kStart, kEnd };
bool ParseRangeId(string_view id, RangeBoundary type, RangeId* dest) {
  if (id.empty())
    return false;
  if (id[0] == '(') {
    dest->exclude = true;
    id.remove_prefix(1);
  }
  uint64 missing_seq = type == RangeBoundary::kStart ? 0 : -1;
  return ParseID(id, dest->exclude, missing_seq, &dest->parsed_id);
}

/* This is a wrapper function for lpGet() to directly get an integer value
 * from the listpack (that may store numbers as a string), converting
 * the string if needed.
 * The `valid` argument is an optional output parameter to get an indication
 * if the record was valid, when this parameter is NULL, the function will
 * fail with an assertion. */
static inline int64_t lpGetIntegerIfValid(unsigned char* ele, int* valid) {
  int64_t v;
  unsigned char* e = lpGet(ele, &v, NULL);
  if (e == NULL) {
    if (valid)
      *valid = 1;
    return v;
  }
  /* The following code path should never be used for how listpacks work:
   * they should always be able to store an int64_t value in integer
   * encoded form. However the implementation may change. */
  long long ll;
  int ret = string2ll((char*)e, v, &ll);
  if (valid)
    *valid = ret;
  else
    serverAssert(ret != 0);
  v = ll;
  return v;
}

int64_t lpGetInteger(unsigned char* ele) {
  return lpGetIntegerIfValid(ele, NULL);
}

/* Generate the next stream item ID given the previous one. If the current
 * milliseconds Unix time is greater than the previous one, just use this
 * as time part and start with sequence part of zero. Otherwise we use the
 * previous time (and never go backward) and increment the sequence. */
void StreamNextID(uint64_t now_ms, const streamID* last_id, streamID* new_id) {
  if (now_ms > last_id->ms) {
    new_id->ms = now_ms;
    new_id->seq = 0;
  } else {
    *new_id = *last_id;
    StreamIncrID(new_id);
  }
}

/* Convert the specified stream entry ID as a 128 bit big endian number, so
 * that the IDs can be sorted lexicographically. */
inline void StreamEncodeID(uint8_t* buf, const streamID& id) {
  absl::big_endian::Store64(buf, id.ms);
  absl::big_endian::Store64(buf + 8, id.seq);
}

/* Adds a new item into the stream 's' having the specified number of
 * field-value pairs as specified in 'numfields' and stored into 'argv'.
 * Returns the new entry ID populating the 'added_id' structure.
 *
 * If 'use_id' is not NULL, the ID is not auto-generated by the function,
 * but instead the passed ID is used to add the new entry. In this case
 * adding the entry may fail as specified later in this comment.
 *
 * When 'use_id' is used alongside with a zero 'seq-given', the sequence
 * part of the passed ID is ignored and the function will attempt to use an
 * auto-generated sequence.
 *
 * The function returns 0 if the item was added, this is always true
 * if the ID was generated by the function. However the function may return
 * errors in several cases:
 * 1. If an ID was given via 'use_id', but adding it failed since the
 *    current top ID is greater or equal, it returns EDOM.
 * 2. If a size of a single element or the sum of the elements is too big to
 *    be stored into the stream. it returns ERANGE. */
int StreamAppendItem(stream* s, CmdArgList fields, uint64_t now_ms, streamID* added_id,
                     streamID* use_id, int seq_given) {
  /* Generate the new entry ID. */
  streamID id;
  if (use_id) {
    if (seq_given) {
      id = *use_id;
    } else {
      /* The automatically generated sequence can be either zero (new
       * timestamps) or the incremented sequence of the last ID. In the
       * latter case, we need to prevent an overflow/advancing forward
       * in time. */
      if (s->last_id.ms == use_id->ms) {
        if (s->last_id.seq == UINT64_MAX) {
          return EDOM;
        }
        id = s->last_id;
        id.seq++;
      } else {
        id = *use_id;
      }
    }
  } else {
    StreamNextID(now_ms, &s->last_id, &id);
  }

  /* Check that the new ID is greater than the last entry ID
   * or return an error. Automatically generated IDs might
   * overflow (and wrap-around) when incrementing the sequence
     part. */
  if (streamCompareID(&id, &s->last_id) <= 0) {
    return EDOM;
  }

  /* Avoid overflow when trying to add an element to the stream (listpack
   * can only host up to 32bit length strings, and also a total listpack size
   * can't be bigger than 32bit length. */
  size_t totelelen = 0;
  for (size_t i = 0; i < fields.size(); i++) {
    totelelen += fields[i].size();
  }

  if (totelelen > STREAM_LISTPACK_MAX_SIZE) {
    return ERANGE;
  }

  /* Add the new entry. */
  raxIterator ri;
  raxStart(&ri, s->rax);
  raxSeek(&ri, "$", NULL, 0);

  size_t lp_bytes = 0;      /* Total bytes in the tail listpack. */
  unsigned char* lp = NULL; /* Tail listpack pointer. */

  /* We have to add the key into the radix tree in lexicographic order,
   * to do so we consider the ID as a single 128 bit number written in
   * big endian, so that the most significant bytes are the first ones. */
  uint8_t rax_key[16]; /* Key in the radix tree containing the listpack.*/
  streamID master_id;  /* ID of the master entry in the listpack. */

  if (!raxEOF(&ri)) {
    /* Get a reference to the tail node listpack. */
    lp = (uint8_t*)ri.data;
    lp_bytes = lpBytes(lp);
    CHECK_GT(lp_bytes, 0U);
    DCHECK(ri.key_len == sizeof(rax_key));
    memcpy(rax_key, ri.key, sizeof(rax_key));
  }

  /* Create a new listpack and radix tree node if needed. Note that when
   * a new listpack is created, we populate it with a "master entry". This
   * is just a set of fields that is taken as references in order to compress
   * the stream entries that we'll add inside the listpack.
   *
   * Note that while we use the first added entry fields to create
   * the master entry, the first added entry is NOT represented in the master
   * entry, which is a stand alone object. But of course, the first entry
   * will compress well because it's used as reference.
   *
   * The master entry is composed like in the following example:
   *
   * +-------+---------+------------+---------+--/--+---------+---------+-+
   * | count | deleted | num-fields | field_1 | field_2 | ... | field_N |0|
   * +-------+---------+------------+---------+--/--+---------+---------+-+
   *
   * count and deleted just represent respectively the total number of
   * entries inside the listpack that are valid, and marked as deleted
   * (deleted flag in the entry flags set). So the total number of items
   * actually inside the listpack (both deleted and not) is count+deleted.
   *
   * The real entries will be encoded with an ID that is just the
   * millisecond and sequence difference compared to the key stored at
   * the radix tree node containing the listpack (delta encoding), and
   * if the fields of the entry are the same as the master entry fields, the
   * entry flags will specify this fact and the entry fields and number
   * of fields will be omitted (see later in the code of this function).
   *
   * The "0" entry at the end is the same as the 'lp-count' entry in the
   * regular stream entries (see below), and marks the fact that there are
   * no more entries, when we scan the stream from right to left. */

  /* First of all, check if we can append to the current macro node or
   * if we need to switch to the next one. 'lp' will be set to NULL if
   * the current node is full. */
  if (lp != NULL) {
    int new_node = 0;
    size_t node_max_bytes = kStreamNodeMaxBytes;
    if (node_max_bytes == 0 || node_max_bytes > STREAM_LISTPACK_MAX_SIZE)
      node_max_bytes = STREAM_LISTPACK_MAX_SIZE;
    if (lp_bytes + totelelen >= node_max_bytes) {
      new_node = 1;
    } else if (kStreamNodeMaxEntries) {
      unsigned char* lp_ele = lpFirst(lp);
      /* Count both live entries and deleted ones. */
      int64_t count = lpGetInteger(lp_ele) + lpGetInteger(lpNext(lp, lp_ele));
      if (count >= kStreamNodeMaxEntries) {
        new_node = 1;
      }
    }

    if (new_node) {
      /* Shrink extra pre-allocated memory */
      lp = lpShrinkToFit(lp);
      if (ri.key_len != sizeof(rax_key) || memcmp(ri.key, rax_key, sizeof(rax_key)) != 0) {
        LOG(DFATAL) << "StreamAppendItem: Key mismatch";
      }
      if (ri.data != lp)
        raxInsert(s->rax, ri.key, ri.key_len, lp, NULL);
      lp = NULL;
    }
  }

  int flags = 0;
  unsigned numfields = fields.size() / 2;
  uint8_t* old_lp = lp;
  if (lp == NULL) {
    master_id = id;
    StreamEncodeID(rax_key, id);
    /* Create the listpack having the master entry ID and fields.
     * Pre-allocate some bytes when creating listpack to avoid realloc on
     * every XADD. Since listpack.c uses malloc_size, it'll grow in steps,
     * and won't realloc on every XADD.
     * When listpack reaches max number of entries, we'll shrink the
     * allocation to fit the data. */
    size_t prealloc = STREAM_LISTPACK_MAX_PRE_ALLOCATE;

    lp = lpNew(prealloc);
    lp = lpAppendInteger(lp, 1); /* One item, the one we are adding. */
    lp = lpAppendInteger(lp, 0); /* Zero deleted so far. */
    lp = lpAppendInteger(lp, numfields);
    for (int64_t i = 0; i < numfields; i++) {
      MutableSlice field = fields[i * 2];

      lp = lpAppend(lp, SafePtr(field), field.size());
    }
    lp = lpAppendInteger(lp, 0); /* Master entry zero terminator. */
    raxInsert(s->rax, (unsigned char*)&rax_key, sizeof(rax_key), lp, NULL);
    old_lp = lp;
    /* The first entry we insert, has obviously the same fields of the
     * master entry. */
    flags |= STREAM_ITEM_FLAG_SAMEFIELDS;
  } else {  // lp != NULL
    if (ri.key_len != sizeof(rax_key) || memcmp(ri.key, rax_key, sizeof(rax_key)) != 0) {
      LOG(DFATAL) << "StreamAppendItem: Key mismatch";
    }

    /* Read the master ID from the radix tree key. */
    streamDecodeID(rax_key, &master_id);
    unsigned char* lp_ele = lpFirst(lp);

    /* Update count and skip the deleted fields. */
    int64_t count = lpGetInteger(lp_ele);
    lp = lpReplaceInteger(lp, &lp_ele, count + 1);
    lp_ele = lpNext(lp, lp_ele); /* seek deleted. */
    lp_ele = lpNext(lp, lp_ele); /* seek master entry num fields. */

    /* Check if the entry we are adding, have the same fields
     * as the master entry. */
    int64_t master_fields_count = lpGetInteger(lp_ele);
    lp_ele = lpNext(lp, lp_ele);
    if (numfields == master_fields_count) {
      int64_t i;
      for (i = 0; i < master_fields_count; i++) {
        MutableSlice field = fields[i * 2];
        int64_t e_len;
        unsigned char buf[LP_INTBUF_SIZE];
        unsigned char* e = lpGet(lp_ele, &e_len, buf);
        /* Stop if there is a mismatch. */
        if (field.size() != (size_t)e_len || memcmp(e, field.data(), e_len) != 0)
          break;
        lp_ele = lpNext(lp, lp_ele);
      }
      /* All fields are the same! We can compress the field names
       * setting a single bit in the flags. */
      if (i == master_fields_count)
        flags |= STREAM_ITEM_FLAG_SAMEFIELDS;
    }
  }

  /* Populate the listpack with the new entry. We use the following
   * encoding:
   *
   * +-----+--------+----------+-------+-------+-/-+-------+-------+--------+
   * |flags|entry-id|num-fields|field-1|value-1|...|field-N|value-N|lp-count|
   * +-----+--------+----------+-------+-------+-/-+-------+-------+--------+
   *
   * However if the SAMEFIELD flag is set, we have just to populate
   * the entry with the values, so it becomes:
   *
   * +-----+--------+-------+-/-+-------+--------+
   * |flags|entry-id|value-1|...|value-N|lp-count|
   * +-----+--------+-------+-/-+-------+--------+
   *
   * The entry-id field is actually two separated fields: the ms
   * and seq difference compared to the master entry.
   *
   * The lp-count field is a number that states the number of listpack pieces
   * that compose the entry, so that it's possible to travel the entry
   * in reverse order: we can just start from the end of the listpack, read
   * the entry, and jump back N times to seek the "flags" field to read
   * the stream full entry. */
  lp = lpAppendInteger(lp, flags);
  lp = lpAppendInteger(lp, id.ms - master_id.ms);
  lp = lpAppendInteger(lp, id.seq - master_id.seq);
  if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS))
    lp = lpAppendInteger(lp, numfields);
  for (int64_t i = 0; i < numfields; i++) {
    MutableSlice field = fields[i * 2], value = fields[i * 2 + 1];
    if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS))
      lp = lpAppend(lp, SafePtr(field), field.size());
    lp = lpAppend(lp, SafePtr(value), value.size());
  }
  /* Compute and store the lp-count field. */
  int64_t lp_count = numfields;
  lp_count += 3; /* Add the 3 fixed fields flags + ms-diff + seq-diff. */
  if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) {
    /* If the item is not compressed, it also has the fields other than
     * the values, and an additional num-fields field. */
    lp_count += numfields + 1;
  }
  lp = lpAppendInteger(lp, lp_count);

  /* Insert back into the tree in order to update the listpack pointer. */
  if (old_lp != lp) {
    raxInsert(s->rax, (unsigned char*)&rax_key, sizeof(rax_key), lp, NULL);
  }
  s->length++;
  s->entries_added++;
  s->last_id = id;

  // Must find the last entry as we just inserted it.
  CHECK_EQ(1, raxSeek(&ri, "$", NULL, 0));
  lp_bytes = lpBytes((uint8_t*)ri.data);
  CHECK_GT(lp_bytes, 0U);
  raxStop(&ri);

  if (s->length == 1)
    s->first_id = id;
  if (added_id)
    *added_id = id;

  return 0;
}

/* Create a NACK entry setting the delivery count to 1 and the delivery
 * time to the current time or test-hooked time. The NACK consumer will be
 * set to the one specified as argument of the function. */
streamNACK* StreamCreateNACK(streamConsumer* consumer, uint64_t now_ms) {
  streamNACK* nack = reinterpret_cast<streamNACK*>(zmalloc(sizeof(*nack)));
  nack->delivery_time = now_ms;
  nack->delivery_count = 1;
  nack->consumer = consumer;
  return nack;
}

std::string StreamsIdToString(streamID id) {
  return absl::StrCat(id.ms, "-", id.seq);
}

/* Return value represents the number of deleted items. */
int64_t TrimStream(const TrimOpts& opts, stream* s) {
  if (!opts.HasLimit()) {
    if (opts.IsMaxLen()) {
      return StreamTrimByLength(s, opts.AsMaxLen(), opts.approx);
    } else {
      const auto& min_id = opts.AsMinId().val;
      return StreamTrimByID(s, min_id, opts.approx);
    }
  }

  streamAddTrimArgs trim_args = {};
  trim_args.approx_trim = opts.approx;
  trim_args.limit = opts.limit;

  if (opts.IsMaxLen()) {
    trim_args.trim_strategy = TRIM_STRATEGY_MAXLEN;
    trim_args.maxlen = opts.AsMaxLen();
  } else {
    trim_args.trim_strategy = TRIM_STRATEGY_MINID;
    trim_args.minid = opts.AsMinId().val;
  }

  return StreamTrim(s, &trim_args);
}

bool JournalAsMinId(const TrimOpts& opts) {
  return opts.approx || opts.IsMaxLen();
}

OpResult<streamID> OpAdd(const OpArgs& op_args, string_view key, const AddOpts& opts,
                         CmdArgList args, AddArgsJournaler journaler) {
  DCHECK(!args.empty() && args.size() % 2 == 0);

  auto& db_slice = op_args.GetDbSlice();

  DbSlice::ItAndUpdater add_res;
  if (opts.no_mkstream) {
    auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
    RETURN_ON_BAD_STATUS(res_it);
    add_res = std::move(*res_it);
  } else {
    auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_STREAM);
    RETURN_ON_BAD_STATUS(op_res);
    add_res = std::move(*op_res);
  }

  auto& it = add_res.it;

  StreamMemTracker mem_tracker;
  absl::Cleanup on_exit([it, &mem_tracker]() mutable { mem_tracker.UpdateStreamSize(it->second); });

  if (add_res.is_new) {
    stream* s = streamNew();
    it->second.InitRobj(OBJ_STREAM, OBJ_ENCODING_STREAM, s);
  }

  stream* stream_inst = (stream*)it->second.RObjPtr();

  streamID result_id;
  const auto& parsed_id = opts.parsed_id;
  streamID passed_id = parsed_id.val;
  int res = StreamAppendItem(stream_inst, args, op_args.db_cntx.time_now_ms, &result_id,
                             parsed_id.id_given ? &passed_id : nullptr, parsed_id.has_seq);

  if (res != 0) {
    if (add_res.is_new) {
      std::move(on_exit).Cancel();
      db_slice.DelMutable(op_args.db_cntx, std::move(add_res));
    }
    if (res == ERANGE)
      return OpStatus::OUT_OF_RANGE;
    if (res == EDOM)
      return OpStatus::STREAM_ID_SMALL;

    return OpStatus::OUT_OF_MEMORY;
  }

  if (opts.trim_opts) {
    int64_t deleted_items_number = TrimStream(opts.trim_opts.value(), stream_inst);
    VLOG(2) << "Trimmed " << deleted_items_number << " items from stream " << key
            << " during the XADD command";
  }

  if (op_args.shard->journal()) {
    std::string result_id_as_string = StreamsIdToString(result_id);
    const bool stream_is_empty = stream_inst->length == 0;

    if (opts.trim_opts && (stream_is_empty || JournalAsMinId(opts.trim_opts.value()))) {
      std::string last_id;

      CmdArgVec journal_args = {key};
      journal_args.reserve(args.size() + 4);

      if (stream_is_empty) {
        // We need remove the whole stream in replica
        journal_args.emplace_back("MAXLEN"sv);
        journal_args.emplace_back("0"sv);
      } else {
        // We need to set exact MinId in the journal.
        // For this we are using new first_id from the stream
        last_id = StreamsIdToString(stream_inst->first_id);
        journal_args.emplace_back("MINID"sv);
        journal_args.emplace_back(last_id);
      }

      if (opts.no_mkstream) {
        journal_args.emplace_back("NOMKSTREAM"sv);
      }

      journal_args.emplace_back(result_id_as_string);

      for (size_t i = 0; i < args.size(); i++) {
        journal_args.emplace_back(args[i]);
      }

      RecordJournal(op_args, "XADD"sv, journal_args);
    } else {
      journaler.SetStreamId(result_id_as_string);
      RecordJournal(op_args, "XADD"sv, journaler.add_args);
    }
  }

  RecordStreamAccess(op_args, StreamAccessKind::kSequential);

  auto blocking_controller = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
  if (blocking_controller) {
    blocking_controller->Awaken(op_args.db_cntx.db_index, key);
  }

  return result_id;
}

OpResult<RecordVec> OpRange(const OpArgs& op_args, string_view key, const RangeOpts& opts) {
  // It's write because we add a NACK. Relevant to XReadGroup only
  const bool is_write_command = opts.group;
  auto& db_slice = op_args.GetDbSlice();
  DbSlice::ItAndUpdater it;
  const CompactObj* cobj;
  if (is_write_command) {
    auto res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
    if (!res)
      return res.status();
    it = std::move(*res);
    cobj = &it.it->second;
  } else {
    auto res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STREAM);
    if (!res)
      return res.status();
    cobj = &(*res)->second;
  }

  RecordVec result;

  if (opts.count == 0)
    return result;

  streamIterator si;
  int64_t numfields;
  streamID id;
  stream* s = (stream*)cobj->RObjPtr();
  streamID sstart = opts.start.val, send = opts.end.val;

  // Classify access pattern: fetch-all if start <= first_id and end is MAX.
  StreamAccessKind effective_kind = opts.access_kind;
  if (effective_kind != StreamAccessKind::kNone && s->length > 0 &&
      streamCompareID(&sstart, &s->first_id) <= 0 && send.ms == UINT64_MAX &&
      send.seq == UINT64_MAX) {
    effective_kind = StreamAccessKind::kFetchAll;
  }
  RecordStreamAccess(op_args, effective_kind);

  streamIteratorStart(&si, s, &sstart, &send, opts.is_rev);
  while (streamIteratorGetID(&si, &id, &numfields)) {
    Record rec;
    rec.id = id;
    rec.kv_arr.reserve(numfields);
    if (opts.group && streamCompareID(&id, &opts.group->last_id) > 0) {
      if (opts.group->entries_read != SCG_INVALID_ENTRIES_READ &&
          streamCompareID(&opts.group->last_id, &s->first_id) >= 0 &&
          !StreamRangeHasTombstones(s, &opts.group->last_id, NULL)) {
        /* A valid counter and no tombstones in the group's last-delivered-id and the stream's
         * last-generated-id, we can increment the read counter to keep tracking the group's
         * progress. */
        opts.group->entries_read++;
      } else if (s->entries_added) {
        /* The group's counter may be invalid, so we try to obtain it. */
        opts.group->entries_read = streamEstimateDistanceFromFirstEverEntry(s, &id);
      }
      opts.group->last_id = id;
    }

    /* Emit the field-value pairs. */
    while (numfields--) {
      unsigned char *key, *value;
      int64_t key_len, value_len;
      streamIteratorGetField(&si, &key, &value, &key_len, &value_len);
      string skey(reinterpret_cast<char*>(key), key_len);
      string sval(reinterpret_cast<char*>(value), value_len);

      rec.kv_arr.emplace_back(std::move(skey), std::move(sval));
    }

    result.push_back(std::move(rec));

    // Only relevant for XREADGROUP flow. Should not trigger on XREAD which is READ only.
    if (is_write_command && !opts.noack) {
      StreamMemTracker mem_track;
      unsigned char buf[sizeof(streamID)];
      StreamEncodeID(buf, id);
      uint64_t now_ms = op_args.db_cntx.time_now_ms;

      /* Try to add a new NACK. Most of the time this will work and
       * will not require extra lookups. We'll fix the problem later
       * if we find that there is already an entry for this ID. */
      streamNACK* nack = StreamCreateNACK(opts.consumer, now_ms);
      int group_inserted = raxTryInsert(opts.group->pel, buf, sizeof(buf), nack, nullptr);

      int consumer_inserted = raxTryInsert(opts.consumer->pel, buf, sizeof(buf), nack, nullptr);

      /* Now we can check if the entry was already busy, and
       * in that case reassign the entry to the new consumer,
       * or update it if the consumer is the same as before. */
      if (group_inserted == 0) {
        streamFreeNACK(nack);
        int fres = raxFind(opts.group->pel, buf, sizeof(buf), (void**)&nack);
        DCHECK(fres);
        raxRemove(nack->consumer->pel, buf, sizeof(buf), NULL);
        LOG_IF(DFATAL, nack->consumer->pel->numnodes == 0) << "Invalid rax state";

        /* Update the consumer and NACK metadata. */
        nack->consumer = opts.consumer;
        nack->delivery_time = now_ms;
        nack->delivery_count = 1;
        /* Add the entry in the new consumer local PEL. */
        raxInsert(opts.consumer->pel, buf, sizeof(buf), nack, NULL);
      } else if (group_inserted == 1 && consumer_inserted == 0) {
        LOG(DFATAL) << "Internal error";
        return OpStatus::SKIPPED;  // ("NACK half-created. Should not be possible.");
      }
      opts.consumer->active_time = now_ms;
      result.back().delivery_time = now_ms;
      mem_track.UpdateStreamSize(it.it->second);
    }
    if (opts.count == result.size())
      break;
  }

  streamIteratorStop(&si);

  return result;
}

OpResult<RecordVec> OpRangeFromConsumerPEL(const OpArgs& op_args, string_view key,
                                           const RangeOpts& opts) {
  RecordVec result;

  if (opts.count == 0)
    return result;

  RecordStreamAccess(op_args, StreamAccessKind::kRandom);

  unsigned char start_key[sizeof(streamID)];
  unsigned char end_key[sizeof(streamID)];
  auto sstart = opts.start.val;
  auto send = opts.end.val;

  StreamEncodeID(start_key, sstart);
  StreamEncodeID(end_key, send);
  raxIterator ri;

  raxStart(&ri, opts.consumer->pel);
  raxSeek(&ri, ">=", start_key, sizeof(start_key));
  size_t ecount = 0;
  while (raxNext(&ri) && (!opts.count || ecount < opts.count)) {
    if (memcmp(ri.key, &send, ri.key_len) > 0)
      break;
    streamID id;

    streamDecodeID(ri.key, &id);
    RangeOpts ropts;
    ropts.start.val = id;
    ropts.end.val = id;
    ropts.access_kind =
        StreamAccessKind::kNone;  // Prevent per-entry counting; already recorded above
    auto op_result = OpRange(op_args, key, ropts);
    if (!op_result || !op_result.value().size()) {
      Record rec;
      rec.id = id;
      result.push_back(rec);
    } else {
      streamNACK* nack = static_cast<streamNACK*>(ri.data);
      nack->delivery_time = op_args.db_cntx.time_now_ms;
      nack->delivery_count++;
      result.push_back(std::move(op_result.value()[0]));
    }
    ecount++;
  }
  raxStop(&ri);
  return result;
}

namespace {
// Our C-API doesn't use const, so we have to const cast.
// Only intended for read-only functions.
stream* GetReadOnlyStream(const CompactObj& cobj) {
  return const_cast<stream*>((const stream*)cobj.RObjPtr());
}

// Reassigns a pending NACK entry to a new consumer, updating the PELs of both the old and new
// consumer. If the NACK already belongs to the target consumer, this is a no-op for the PELs.
void ReassignNACKToConsumer(streamNACK* nack, streamConsumer* consumer, uint8_t* key_buf,
                            size_t key_len, uint64_t now_ms) {
  if (nack->consumer != consumer) {
    if (nack->consumer) {
      raxRemove(nack->consumer->pel, key_buf, key_len, nullptr);
      LOG_IF(DFATAL, nack->consumer->pel->numnodes == 0) << "Invalid rax state";
    }
    raxInsert(consumer->pel, key_buf, key_len, nack, nullptr);
    nack->consumer = consumer;
  }
  consumer->active_time = now_ms;
}

}  // namespace
// Returns the range response for each stream on this shard in order of
// GetShardArgs.
vector<RecordVec> OpRead(const OpArgs& op_args, const ShardArgs& shard_args, const ReadOpts& opts) {
  DCHECK(!shard_args.Empty());

  RangeOpts range_opts;
  range_opts.count = opts.count;
  range_opts.end = ParsedStreamId{.val = streamID{
                                      .ms = UINT64_MAX,
                                      .seq = UINT64_MAX,
                                  }};

  vector<RecordVec> response(shard_args.Size());
  unsigned index = 0;
  for (string_view key : shard_args) {
    const auto& sitem = opts.stream_ids.at(key);
    auto& dest = response[index++];

    // We skip, group can be empty after waking up from a blocked read
    if (!sitem.group && opts.read_group) {
      continue;
    }

    range_opts.start = sitem.id;
    range_opts.group = sitem.group;
    range_opts.consumer = sitem.consumer;
    range_opts.noack = opts.noack;
    // XREAD/XREADGROUP new deliveries are sequential (fetch-all detected in OpRange).
    range_opts.access_kind = StreamAccessKind::kSequential;

    OpResult<RecordVec> range_res;

    if (sitem.serve_history)
      range_res = OpRangeFromConsumerPEL(op_args, key, range_opts);
    else
      range_res = OpRange(op_args, key, range_opts);
    if (range_res) {
      dest = std::move(range_res.value());
    }
  }

  return response;
}

OpResult<uint32_t> OpLen(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);
  const CompactObj& cobj = (*res_it)->second;
  stream* s = (stream*)cobj.RObjPtr();
  return s->length;
}

OpResult<vector<GroupInfo>> OpListGroups(const DbContext& db_cntx, string_view key,
                                         EngineShard* shard) {
  auto& db_slice = db_cntx.GetDbSlice(shard->shard_id());
  auto res_it = db_slice.FindReadOnly(db_cntx, key, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);

  vector<GroupInfo> result;
  const CompactObj& cobj = (*res_it)->second;
  stream* s = (stream*)cobj.RObjPtr();

  if (s->cgroups) {
    result.reserve(raxSize(s->cgroups));

    raxIterator ri;
    raxStart(&ri, s->cgroups);
    raxSeek(&ri, "^", NULL, 0);
    while (raxNext(&ri)) {
      streamCG* cg = (streamCG*)ri.data;
      GroupInfo ginfo;
      ginfo.name.assign(reinterpret_cast<char*>(ri.key), ri.key_len);
      ginfo.consumer_size = raxSize(cg->consumers);
      ginfo.pending_size = raxSize(cg->pel);
      ginfo.last_id = cg->last_id;
      ginfo.entries_read = cg->entries_read;
      ginfo.lag = StreamCGLag(s, cg);
      result.push_back(std::move(ginfo));
    }
    raxStop(&ri);
  }

  return result;
}

vector<Record> GetStreamRecords(stream* s, streamID start, streamID end, bool reverse,
                                size_t count) {
  streamIterator si;
  int64_t numfields;
  streamID id;
  size_t arraylen = 0;
  vector<Record> records;

  streamIteratorStart(&si, s, &start, &end, reverse);
  while (streamIteratorGetID(&si, &id, &numfields)) {
    Record rec;
    rec.id = id;
    rec.kv_arr.reserve(numfields);

    while (numfields--) {
      unsigned char *key, *value;
      int64_t key_len, value_len;
      streamIteratorGetField(&si, &key, &value, &key_len, &value_len);
      string skey(reinterpret_cast<char*>(key), key_len);
      string sval(reinterpret_cast<char*>(value), value_len);

      rec.kv_arr.emplace_back(std::move(skey), std::move(sval));
    }
    records.push_back(std::move(rec));
    arraylen++;
    if (count && count == arraylen)
      break;
  }

  streamIteratorStop(&si);

  return records;
}

void GetGroupPEL(stream* s, streamCG* cg, long long count, GroupInfo* ginfo) {
  vector<NACKInfo> nack_info_vec;
  long long arraylen_cg_pel = 0;
  raxIterator ri_cg_pel;
  raxStart(&ri_cg_pel, cg->pel);
  raxSeek(&ri_cg_pel, "^", NULL, 0);
  while (raxNext(&ri_cg_pel) && (!count || arraylen_cg_pel < count)) {
    streamNACK* nack = static_cast<streamNACK*>(ri_cg_pel.data);
    NACKInfo nack_info;

    streamID id;
    streamDecodeID(ri_cg_pel.key, &id);
    nack_info.pel_id = id;
    nack_info.consumer_name = nack->consumer->name;
    nack_info.delivery_time = nack->delivery_time;
    nack_info.delivery_count = nack->delivery_count;

    nack_info_vec.push_back(nack_info);
    arraylen_cg_pel++;
  }
  raxStop(&ri_cg_pel);
  ginfo->stream_nack_vec = std::move(nack_info_vec);
}

void GetConsumers(stream* s, streamCG* cg, long long count, GroupInfo* ginfo) {
  vector<ConsumerInfo> consumer_info_vec;
  raxIterator ri_consumers;
  raxStart(&ri_consumers, cg->consumers);
  raxSeek(&ri_consumers, "^", NULL, 0);
  while (raxNext(&ri_consumers)) {
    ConsumerInfo consumer_info;
    streamConsumer* consumer = static_cast<streamConsumer*>(ri_consumers.data);

    LOG_IF(DFATAL, consumer->pel->numnodes == 0) << "Invalid rax state";

    consumer_info.name = consumer->name;
    consumer_info.seen_time = consumer->seen_time;
    consumer_info.active_time = consumer->active_time;
    consumer_info.pel_count = raxSize(consumer->pel);

    /* Consumer PEL */
    long long arraylen_cpel = 0;
    raxIterator ri_cpel;
    raxStart(&ri_cpel, consumer->pel);
    raxSeek(&ri_cpel, "^", NULL, 0);
    vector<NACKInfo> consumer_pel_vec;
    while (raxNext(&ri_cpel) && (!count || arraylen_cpel < count)) {
      NACKInfo nack_info;
      streamNACK* nack = static_cast<streamNACK*>(ri_cpel.data);

      streamID id;
      streamDecodeID(ri_cpel.key, &id);
      nack_info.pel_id = id;
      nack_info.delivery_time = nack->delivery_time;
      nack_info.delivery_count = nack->delivery_count;

      consumer_pel_vec.push_back(nack_info);
      arraylen_cpel++;
    }
    consumer_info.pending = consumer_pel_vec;
    consumer_info_vec.push_back(consumer_info);
    raxStop(&ri_cpel);
  }
  raxStop(&ri_consumers);
  ginfo->consumer_info_vec = std::move(consumer_info_vec);
}

OpResult<StreamInfo> OpStreams(const DbContext& db_cntx, string_view key, EngineShard* shard,
                               int full, size_t count) {
  auto& db_slice = db_cntx.GetDbSlice(shard->shard_id());
  auto res_it = db_slice.FindReadOnly(db_cntx, key, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);

  // Record access only after successful key validation
  if (full) {
    shard->stats().stream_fetch_all_accesses++;
  } else {
    shard->stats().stream_sequential_accesses++;
  }

  vector<StreamInfo> result;
  const CompactObj& cobj = (*res_it)->second;
  stream* s = (stream*)cobj.RObjPtr();

  StreamInfo sinfo;
  sinfo.length = s->length;

  sinfo.radix_tree_keys = raxSize(s->rax);
  sinfo.radix_tree_nodes = s->rax->numnodes;
  sinfo.last_generated_id = s->last_id;
  sinfo.max_deleted_entry_id = s->max_deleted_entry_id;
  sinfo.entries_added = s->entries_added;
  sinfo.recorded_first_entry_id = s->first_id;
  sinfo.groups = s->cgroups ? raxSize(s->cgroups) : 0;
  sinfo.entries = GetStreamRecords(s, s->first_id, s->last_id, false, count);

  if (full) {
    if (s->cgroups) {
      GroupInfoVec group_info_vec;

      raxIterator ri_cgroups;
      raxStart(&ri_cgroups, s->cgroups);
      raxSeek(&ri_cgroups, "^", NULL, 0);
      while (raxNext(&ri_cgroups)) {
        streamCG* cg = (streamCG*)ri_cgroups.data;
        GroupInfo ginfo;
        ginfo.name.assign(reinterpret_cast<char*>(ri_cgroups.key), ri_cgroups.key_len);
        ginfo.last_id = cg->last_id;
        ginfo.consumer_size = raxSize(cg->consumers);
        ginfo.pending_size = raxSize(cg->pel);
        ginfo.entries_read = cg->entries_read;
        ginfo.lag = StreamCGLag(s, cg);
        GetGroupPEL(s, cg, count, &ginfo);
        GetConsumers(s, cg, count, &ginfo);

        group_info_vec.push_back(ginfo);
      }
      raxStop(&ri_cgroups);

      sinfo.cgroups = group_info_vec;
    }
  } else {
    vector<Record> first_entry_vector = GetStreamRecords(s, s->first_id, s->last_id, false, 1);
    if (first_entry_vector.size() != 0) {
      sinfo.first_entry = first_entry_vector.at(0);
    }
    vector<Record> last_entry_vector = GetStreamRecords(s, s->first_id, s->last_id, true, 1);
    if (last_entry_vector.size() != 0) {
      sinfo.last_entry = last_entry_vector.at(0);
    }
  }

  return sinfo;
}

OpResult<vector<ConsumerInfo>> OpConsumers(const DbContext& db_cntx, EngineShard* shard,
                                           string_view stream_name, string_view group_name) {
  auto& db_slice = db_cntx.GetDbSlice(shard->shard_id());
  auto res_it = db_slice.FindReadOnly(db_cntx, stream_name, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);

  vector<ConsumerInfo> result;
  const CompactObj& cobj = (*res_it)->second;
  stream* s = GetReadOnlyStream(cobj);
  streamCG* cg = StreamLookupCG(s, WrapSds(group_name));
  if (cg == NULL) {
    return OpStatus::INVALID_VALUE;
  }
  result.reserve(raxSize(s->cgroups));

  raxIterator ri;
  raxStart(&ri, cg->consumers);
  raxSeek(&ri, "^", NULL, 0);
  mstime_t now = db_cntx.time_now_ms;
  while (raxNext(&ri)) {
    ConsumerInfo consumer_info;
    streamConsumer* consumer = (streamConsumer*)ri.data;
    mstime_t idle = now - consumer->seen_time;
    if (idle < 0)
      idle = 0;

    consumer_info.name = consumer->name;
    consumer_info.pel_count = raxSize(consumer->pel);
    consumer_info.idle = idle;
    consumer_info.active_time = consumer->active_time;
    result.push_back(std::move(consumer_info));
  }
  raxStop(&ri);
  return result;
}

constexpr uint8_t kCreateOptMkstream = 1 << 0;

struct CreateOpts {
  string_view gname;
  string_view id;
  uint8_t flags = 0;
};

OpStatus OpCreate(const OpArgs& op_args, string_view key, const CreateOpts& opts) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
  int64_t entries_read = SCG_INVALID_ENTRIES_READ;
  StreamMemTracker mem_tracker;
  bool stream_created_by_mkstream = false;
  if (!res_it) {
    if (opts.flags & kCreateOptMkstream) {
      // MKSTREAM is enabled, so create the stream
      res_it = db_slice.AddNew(op_args.db_cntx, key, PrimeValue{}, 0);
      if (!res_it)
        return res_it.status();

      stream* s = streamNew();
      res_it->it->second.InitRobj(OBJ_STREAM, OBJ_ENCODING_STREAM, s);
      stream_created_by_mkstream = true;
    } else {
      return res_it.status();
    }
  }

  CompactObj& cobj = res_it->it->second;
  stream* s = (stream*)cobj.RObjPtr();

  streamID id;
  ParsedStreamId parsed_id;
  if (opts.id == "$") {
    id = s->last_id;
  } else {
    if (ParseID(opts.id, true, 0, &parsed_id)) {
      id = parsed_id.val;
    } else {
      if (stream_created_by_mkstream) {
        db_slice.DelMutable(op_args.db_cntx, std::move(*res_it));
      }
      return OpStatus::SYNTAX_ERR;
    }
  }

  streamCG* cg = streamCreateCG(s, opts.gname.data(), opts.gname.size(), &id, entries_read);
  mem_tracker.UpdateStreamSize(res_it->it->second);
  return cg ? OpStatus::OK : OpStatus::BUSY_GROUP;
}

struct FindGroupResult {
  stream* s = nullptr;
  streamCG* cg = nullptr;
  DbSlice::ItAndUpdater it;
};

OpResult<FindGroupResult> FindGroup(const OpArgs& op_args, string_view key, string_view gname,
                                    bool skip_group = true) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);

  CompactObj& cobj = res_it->it->second;
  auto* s = static_cast<stream*>(cobj.RObjPtr());
  auto* cg = StreamLookupCG(s, WrapSds(gname));
  if (skip_group && !cg)
    return OpStatus::SKIPPED;

  return FindGroupResult{s, cg, std::move(*res_it)};
}

// Try to get the consumer. If not found, create a new one.
streamConsumer* FindOrAddConsumer(string_view name, streamCG* cg, uint64_t now_ms,
                                  bool* is_consumer_new) {
  // Try to get the consumer. If not found, create a new one.
  auto cname = WrapSds(name);
  streamConsumer* consumer = StreamLookupConsumer(cg, cname);
  if (consumer) {
    consumer->seen_time = now_ms;
  } else {
    // TODO: notify xgroup-createconsumer event once we support stream events.
    if (is_consumer_new) {
      *is_consumer_new = true;
    }
    consumer = StreamCreateConsumer(cg, name, now_ms, SCC_DEFAULT);
  }

  return consumer;
}

constexpr uint8_t kClaimForce = 1 << 0;
constexpr uint8_t kClaimJustID = 1 << 1;
constexpr uint8_t kClaimLastID = 1 << 2;

struct ClaimOpts {
  string_view group;
  string_view consumer;
  int64 min_idle_time;
  int64 delivery_time = -1;
  int retry = -1;
  uint8_t flags = 0;
  int32_t count = 100;      // only for XAUTOCLAIM
  streamID start = {0, 0};  // only for XAUTOCLAIM
  streamID last_id;
};

struct ClaimInfo {
  bool justid = false;
  vector<streamID> ids;
  RecordVec records;
  streamID end_id = {0, 0};      // only for XAUTOCLAIM
  vector<streamID> deleted_ids;  // only for XAUTOCLAIM
};

void AppendClaimResultItem(ClaimInfo& result, stream* s, streamID id) {
  int64_t numfields;
  if (result.justid) {
    result.ids.push_back(id);
    return;
  }
  streamIterator it;
  streamID cid;
  streamIteratorStart(&it, s, &id, &id, 0);
  while (streamIteratorGetID(&it, &cid, &numfields)) {
    Record rec;
    rec.id = cid;
    rec.kv_arr.reserve(numfields);

    /* Emit the field-value pairs. */
    while (numfields--) {
      unsigned char *key, *value;
      int64_t key_len, value_len;
      streamIteratorGetField(&it, &key, &value, &key_len, &value_len);
      string skey(reinterpret_cast<char*>(key), key_len);
      string sval(reinterpret_cast<char*>(value), value_len);

      rec.kv_arr.emplace_back(std::move(skey), std::move(sval));
    }
    result.records.push_back(std::move(rec));
  }
  streamIteratorStop(&it);
}

// XCLAIM key group consumer min-idle-time id
OpResult<ClaimInfo> OpClaim(const OpArgs& op_args, string_view key, const ClaimOpts& opts,
                            absl::Span<streamID> ids) {
  auto cgr_res = FindGroup(op_args, key, opts.group);
  RETURN_ON_BAD_STATUS(cgr_res);
  RecordStreamAccess(op_args, StreamAccessKind::kRandom);

  uint64_t now_ms = op_args.db_cntx.time_now_ms;
  ClaimInfo result;
  result.justid = (opts.flags & kClaimJustID);

  streamID last_id = opts.last_id;
  if (opts.flags & kClaimLastID) {
    if (streamCompareID(&last_id, &cgr_res->cg->last_id) > 0) {
      cgr_res->cg->last_id = last_id;
    }
  }

  StreamMemTracker tracker;

  streamConsumer* consumer = FindOrAddConsumer(opts.consumer, cgr_res->cg, now_ms, nullptr);

  for (streamID id : ids) {
    std::array<uint8_t, sizeof(streamID)> buf;
    StreamEncodeID(buf.begin(), id);

    streamNACK* nack = nullptr;
    int fres = raxFind(cgr_res->cg->pel, buf.begin(), sizeof(buf), (void**)&nack);
    if (!StreamEntryExists(cgr_res->s, &id)) {
      if (fres) {
        /* Release the NACK */
        raxRemove(cgr_res->cg->pel, buf.begin(), sizeof(buf), nullptr);
        raxRemove(nack->consumer->pel, buf.begin(), sizeof(buf), nullptr);
        LOG_IF(DFATAL, nack->consumer->pel->numnodes == 0) << "Invalid rax state";
        streamFreeNACK(nack);
      }
      continue;
    }

    // We didn't find a nack but the FORCE option is given.
    // Create the NACK forcefully.
    if ((opts.flags & kClaimForce) && fres == 0) {
      /* Create the NACK. */
      nack = StreamCreateNACK(nullptr, now_ms);
      raxInsert(cgr_res->cg->pel, buf.begin(), sizeof(buf), nack, nullptr);
    }

    // We found the nack, continue.
    if (nack) {
      // First check if the entry id exceeds the `min_idle_time`.
      if (nack->consumer && opts.min_idle_time) {
        mstime_t this_idle = now_ms - nack->delivery_time;
        if (this_idle < opts.min_idle_time) {
          continue;
        }
      }

      // Set the delivery time for the entry.
      nack->delivery_time = opts.delivery_time;
      /* Set the delivery attempts counter if given, otherwise
       * autoincrement unless JUSTID option provided */
      if (opts.retry >= 0) {
        nack->delivery_count = opts.retry;
      } else if (!(opts.flags & kClaimJustID)) {
        nack->delivery_count++;
      }
      // Note: nack->consumer is NULL if we created the NACK above because of the FORCE option.
      ReassignNACKToConsumer(nack, consumer, buf.begin(), sizeof(buf), now_ms);

      /* Send the reply for this entry. */
      AppendClaimResultItem(result, cgr_res->s, id);
      // TODO: propagate this change with streamPropagateXCLAIM
    }
  }
  tracker.UpdateStreamSize(cgr_res->it.it->second);
  return result;
}

// XGROUP DESTROY key groupname
OpStatus OpDestroyGroup(const OpArgs& op_args, string_view key, string_view gname) {
  auto cgr_res = FindGroup(op_args, key, gname);
  RETURN_ON_BAD_STATUS(cgr_res);
  StreamMemTracker mem_tracker;

  raxRemove(cgr_res->s->cgroups, (uint8_t*)(gname.data()), gname.size(), NULL);
  StreamFreeCG(cgr_res->cg);

  mem_tracker.UpdateStreamSize(cgr_res->it.it->second);

  // Awake readers blocked on this group
  auto blocking_controller = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
  if (blocking_controller) {
    blocking_controller->Awaken(op_args.db_cntx.db_index, key);
  }

  return OpStatus::OK;
}

struct GroupConsumerPair {
  streamCG* group;
  streamConsumer* consumer;
};

struct GroupConsumerPairOpts {
  string_view group;
  string_view consumer;
};

// XGROUP CREATECONSUMER key groupname consumername
OpResult<uint32_t> OpCreateConsumer(const OpArgs& op_args, string_view key, string_view gname,
                                    string_view consumer_name) {
  auto cgroup_res = FindGroup(op_args, key, gname);
  RETURN_ON_BAD_STATUS(cgroup_res);

  StreamMemTracker mem_tracker;

  streamConsumer* consumer = StreamCreateConsumer(
      cgroup_res->cg, consumer_name, op_args.db_cntx.time_now_ms, SCC_NO_NOTIFY | SCC_NO_DIRTIFY);

  mem_tracker.UpdateStreamSize(cgroup_res->it.it->second);
  return consumer ? OpStatus::OK : OpStatus::KEY_EXISTS;
}

// XGROUP DELCONSUMER key groupname consumername
OpResult<uint32_t> OpDelConsumer(const OpArgs& op_args, string_view key, string_view gname,
                                 string_view consumer_name) {
  auto cgroup_res = FindGroup(op_args, key, gname);
  RETURN_ON_BAD_STATUS(cgroup_res);
  StreamMemTracker mem_tracker;

  long long pending = 0;
  streamConsumer* consumer = StreamLookupConsumer(cgroup_res->cg, WrapSds(consumer_name));
  if (consumer) {
    pending = raxSize(consumer->pel);
    StreamDelConsumer(cgroup_res->cg, consumer);
  }

  mem_tracker.UpdateStreamSize(cgroup_res->it.it->second);
  return pending;
}

OpStatus OpSetId(const OpArgs& op_args, string_view key, string_view gname, string_view id,
                 std::optional<int64_t> entries_read) {
  auto cgr_res = FindGroup(op_args, key, gname);
  RETURN_ON_BAD_STATUS(cgr_res);

  streamID sid;
  ParsedStreamId parsed_id;
  if (id == "$") {
    sid = cgr_res->s->last_id;
  } else {
    if (ParseID(id, true, 0, &parsed_id)) {
      sid = parsed_id.val;
    } else {
      return OpStatus::SYNTAX_ERR;
    }
  }
  cgr_res->cg->last_id = sid;
  if (entries_read) {
    cgr_res->cg->entries_read = *entries_read;
  }

  return OpStatus::OK;
}

ErrorReply OpXSetId(const OpArgs& op_args, string_view key, const streamID& sid) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
  if (!res_it)
    return res_it.status();

  StreamMemTracker mem_tracker;

  PrimeValue& pv = res_it->it->second;
  stream* stream_inst = (stream*)pv.RObjPtr();
  streamID max_xdel_id{0, 0};
  streamID id = sid;

  if (streamCompareID(&id, &stream_inst->max_deleted_entry_id) < 0) {
    return ErrorReply{"The ID specified in XSETID is smaller than current max_deleted_entry_id",
                      "stream_smaller_deleted"};
  }

  /* If the stream has at least one item, we want to check that the user
   * is setting a last ID that is equal or greater than the current top
   * item, otherwise the fundamental ID monotonicity assumption is violated. */
  if (stream_inst->length > 0) {
    streamID maxid;
    StreamLastValidID(stream_inst, &maxid);

    if (streamCompareID(&id, &maxid) < 0) {
      return OpStatus::STREAM_ID_SMALL;
    }
  }

  stream_inst->last_id = sid;

  raxIterator ri;
  raxStart(&ri, stream_inst->rax);
  raxSeek(&ri, "$", NULL, 0);

  if (!raxEOF(&ri)) {
    /* Get a reference to the tail node listpack. */
    size_t lp_bytes = lpBytes((uint8_t*)ri.data);
    CHECK_GT(lp_bytes, 0U);
  }
  raxStop(&ri);

  if (!StreamIDEqZero(&max_xdel_id))
    stream_inst->max_deleted_entry_id = max_xdel_id;

  RecordStreamAccess(op_args, StreamAccessKind::kSequential);

  mem_tracker.UpdateStreamSize(pv);

  return OpStatus::OK;
}

OpResult<uint32_t> OpDel(const OpArgs& op_args, string_view key, absl::Span<streamID> ids) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
  RETURN_ON_BAD_STATUS(res_it);

  PrimeValue& pv = res_it->it->second;
  stream* stream_inst = (stream*)pv.RObjPtr();

  uint32_t deleted = 0;
  bool first_entry = false;

  StreamMemTracker tracker;

  // Capture last_id before deletion loop for heuristic (deletion can change it)
  streamID original_last_id = stream_inst->last_id;

  for (size_t j = 0; j < ids.size(); j++) {
    streamID id = ids[j];
    if (!StreamDeleteItem(stream_inst, &id))
      continue;

    /* We want to know if the first entry in the stream was deleted
     * so we can later set the new one. */
    if (streamCompareID(&id, &stream_inst->first_id) == 0) {
      first_entry = 1;
    }
    /* Update the stream's maximal tombstone if needed. */
    if (streamCompareID(&id, &stream_inst->max_deleted_entry_id) > 0) {
      stream_inst->max_deleted_entry_id = id;
    }
    deleted++;
  }

  /* Update the stream's first ID. */
  if (deleted) {
    if (stream_inst->length == 0) {
      stream_inst->first_id.ms = 0;
      stream_inst->first_id.seq = 0;
    } else if (first_entry) {
      streamGetEdgeID(stream_inst, 1, 1, &stream_inst->first_id);
    }
    // Only update size tracking if we actually deleted something.
    // This avoids issues with memory tracking noise from other operations
    // in the same thread.
    tracker.UpdateStreamSize(pv);
  }

  // Heuristic: if any deleted ID shares ms with original last_id, it's a tail delete (sequential).
  bool is_sequential = false;
  for (size_t j = 0; j < ids.size(); j++) {
    if (ids[j].ms == original_last_id.ms) {
      is_sequential = true;
      break;
    }
  }
  RecordStreamAccess(op_args,
                     is_sequential ? StreamAccessKind::kSequential : StreamAccessKind::kRandom);

  return deleted;
}

// XACK key groupname id [id ...]
OpResult<uint32_t> OpAck(const OpArgs& op_args, string_view key, string_view gname,
                         absl::Span<streamID> ids) {
  auto res = FindGroup(op_args, key, gname, false);
  RETURN_ON_BAD_STATUS(res);

  if (res->cg == nullptr || res->s == nullptr) {
    return 0;
  }

  int acknowledged = 0;
  StreamMemTracker mem_tracker;
  for (auto& id : ids) {
    unsigned char buf[sizeof(streamID)];
    StreamEncodeID(buf, id);

    // From Redis' xackCommand's implemenation
    // Lookup the ID in the group PEL: it will have a reference to the
    // NACK structure that will have a reference to the consumer, so that
    // we are able to remove the entry from both PELs.
    streamNACK* nack = nullptr;
    int fres = raxFind(res->cg->pel, buf, sizeof(buf), (void**)&nack);
    if (fres) {
      raxRemove(res->cg->pel, buf, sizeof(buf), nullptr);
      raxRemove(nack->consumer->pel, buf, sizeof(buf), nullptr);
      streamFreeNACK(nack);
      acknowledged++;
    }
  }
  mem_tracker.UpdateStreamSize(res->it.it->second);
  return acknowledged;
}

OpResult<ClaimInfo> OpAutoClaim(const OpArgs& op_args, string_view key, const ClaimOpts& opts) {
  auto cgr_res = FindGroup(op_args, key, opts.group, false);
  RETURN_ON_BAD_STATUS(cgr_res);
  RecordStreamAccess(op_args, StreamAccessKind::kRandom);

  stream* stream = cgr_res->s;
  streamCG* group = cgr_res->cg;

  if (stream == nullptr || group == nullptr) {
    return OpStatus::KEY_NOTFOUND;
  }

  StreamMemTracker mem_tracker;

  // from Redis spec on XAutoClaim:
  // https://redis.io/commands/xautoclaim/
  // The maximum number of pending entries that the command scans is the product of
  // multiplying <count>'s value by 10 (hard-coded).
  int64_t attempts = opts.count * 10;

  unsigned char start_key[sizeof(streamID)];
  streamID start_id = opts.start;
  StreamEncodeID(start_key, start_id);
  raxIterator ri;
  raxStart(&ri, group->pel);
  raxSeek(&ri, ">=", start_key, sizeof(start_key));

  ClaimInfo result;
  result.justid = (opts.flags & kClaimJustID);

  uint64_t now_ms = op_args.db_cntx.time_now_ms;
  int count = opts.count;

  streamConsumer* consumer = FindOrAddConsumer(opts.consumer, group, now_ms, nullptr);

  while (attempts-- && count && raxNext(&ri)) {
    streamNACK* nack = (streamNACK*)ri.data;

    streamID id;
    streamDecodeID(ri.key, &id);

    if (!StreamEntryExists(stream, &id)) {
      // TODO: to propagate this change to replica as XCLAIM command
      // - since we delete it from NACK. See streamPropagateXCLAIM call.
      raxRemove(group->pel, ri.key, ri.key_len, nullptr);
      raxRemove(nack->consumer->pel, ri.key, ri.key_len, nullptr);
      streamFreeNACK(nack);
      result.deleted_ids.push_back(id);
      raxSeek(&ri, ">=", ri.key, ri.key_len);

      count--; /* Count is a limit of the command response size. */
      continue;
    }

    if (opts.min_idle_time) {
      mstime_t this_idle = now_ms - nack->delivery_time;
      if (this_idle < opts.min_idle_time)
        continue;
    }

    nack->delivery_time = now_ms;
    if (!result.justid) {
      nack->delivery_count++;
    }
    ReassignNACKToConsumer(nack, consumer, ri.key, ri.key_len, now_ms);
    AppendClaimResultItem(result, stream, id);
    count--;
    // TODO: propagate xclaim to replica
  }

  raxNext(&ri);
  streamID end_id;
  if (raxEOF(&ri)) {
    end_id.ms = end_id.seq = 0;
  } else {
    streamDecodeID(ri.key, &end_id);
  }
  raxStop(&ri);
  result.end_id = end_id;

  mem_tracker.UpdateStreamSize(cgr_res->it.it->second);

  return result;
}

struct PendingOpts {
  string_view group_name;
  string_view consumer_name;
  ParsedStreamId start;
  ParsedStreamId end;
  int64_t min_idle_time = 0;
  int64_t count = -1;
};

struct PendingReducedResult {
  uint64_t count = 0;
  streamID start;
  streamID end;
  vector<pair<string_view, uint64_t /* size of consumer pending list*/>> consumer_list;
};

struct PendingExtendedResult {
  streamID start;
  string_view consumer_name;
  uint64_t delivery_count;
  mstime_t elapsed;
};

using PendingExtendedResultList = std::vector<PendingExtendedResult>;
using PendingResult = std::variant<PendingReducedResult, PendingExtendedResultList>;

PendingReducedResult GetPendingReducedResult(streamCG* cg) {
  PendingReducedResult result;
  result.count = raxSize(cg->pel);
  if (!result.count) {
    return result;
  }

  raxIterator ri;

  raxStart(&ri, cg->pel);
  raxSeek(&ri, "^", nullptr, 0);
  raxNext(&ri);
  streamDecodeID(ri.key, &result.start);

  raxSeek(&ri, "$", nullptr, 0);
  raxNext(&ri);
  streamDecodeID(ri.key, &result.end);

  raxStart(&ri, cg->consumers);
  raxSeek(&ri, "^", nullptr, 0);
  while (raxNext(&ri)) {
    streamConsumer* consumer = static_cast<streamConsumer*>(ri.data);
    uint64_t pel_size = raxSize(consumer->pel);
    if (!pel_size)
      continue;

    pair<string_view, uint64_t> item;
    item.first = string_view{consumer->name, sdslen(consumer->name)};
    item.second = pel_size;
    result.consumer_list.push_back(item);
  }
  raxStop(&ri);
  return result;
}

PendingExtendedResultList GetPendingExtendedResult(uint64_t now_ms, streamCG* cg,
                                                   streamConsumer* consumer,
                                                   const PendingOpts& opts) {
  PendingExtendedResultList result;
  rax* pel = consumer ? consumer->pel : cg->pel;
  streamID sstart = opts.start.val, send = opts.end.val;
  unsigned char start_key[sizeof(streamID)];
  unsigned char end_key[sizeof(streamID)];
  raxIterator ri;

  StreamEncodeID(start_key, sstart);
  StreamEncodeID(end_key, send);
  raxStart(&ri, pel);
  raxSeek(&ri, ">=", start_key, sizeof(start_key));

  auto count = opts.count;
  while (count && raxNext(&ri)) {
    if (memcmp(ri.key, end_key, ri.key_len) > 0) {
      break;
    }
    streamNACK* nack = static_cast<streamNACK*>(ri.data);

    if (opts.min_idle_time) {
      mstime_t this_idle = now_ms - nack->delivery_time;
      if (this_idle < opts.min_idle_time) {
        continue;
      }
    }

    count--;

    /* Entry ID. */
    streamID id;
    streamDecodeID(ri.key, &id);

    /* Milliseconds elapsed since last delivery. */
    mstime_t elapsed = now_ms - nack->delivery_time;
    if (elapsed < 0) {
      elapsed = 0;
    }

    PendingExtendedResult item = {.start = id,
                                  .consumer_name = nack->consumer->name,
                                  .delivery_count = nack->delivery_count,
                                  .elapsed = elapsed};
    result.push_back(item);
  }
  raxStop(&ri);
  return result;
}

OpResult<PendingResult> OpPending(const OpArgs& op_args, string_view key, const PendingOpts& opts) {
  auto cgroup_res = FindGroup(op_args, key, opts.group_name);
  RETURN_ON_BAD_STATUS(cgroup_res);

  streamConsumer* consumer = nullptr;
  if (!opts.consumer_name.empty()) {
    consumer = StreamLookupConsumer(cgroup_res->cg, WrapSds(opts.consumer_name));
  }

  PendingResult result;

  if (opts.count == -1) {
    result = GetPendingReducedResult(cgroup_res->cg);
  } else {
    result = GetPendingExtendedResult(op_args.db_cntx.time_now_ms, cgroup_res->cg, consumer, opts);
  }
  return result;
}

void CreateGroup(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto key = parser->Next();

  CreateOpts opts;
  std::tie(opts.gname, opts.id) = parser->Next<string_view, string_view>();
  if (parser->Check("MKSTREAM")) {
    opts.flags |= kCreateOptMkstream;
  }

  RETURN_ON_PARSE_ERROR(*parser, cmd_cntx);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpCreate(t->GetOpArgs(shard), key, opts);
  };

  OpStatus result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  switch (result) {
    case OpStatus::KEY_NOTFOUND:
      return cmd_cntx->SendError(kXGroupKeyNotFound);
    default:
      cmd_cntx->SendError(result);
  }
}

void DestroyGroup(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto [key, gname] = parser->Next<string_view, string_view>();

  RETURN_ON_PARSE_ERROR(*parser, cmd_cntx);

  if (parser->HasNext())
    return cmd_cntx->SendError(UnknownSubCmd("DESTROY", "XGROUP"));

  auto cb = [&, &key = key, &gname = gname](Transaction* t, EngineShard* shard) {
    return OpDestroyGroup(t->GetOpArgs(shard), key, gname);
  };

  OpStatus result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  switch (result) {
    case OpStatus::OK:
      return cmd_cntx->SendLong(1);
    case OpStatus::SKIPPED:
      return cmd_cntx->SendLong(0);
    case OpStatus::KEY_NOTFOUND:
      return cmd_cntx->SendError(kXGroupKeyNotFound);
    default:
      cmd_cntx->SendError(result);
  }
}

void CreateConsumer(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto [key, gname, consumer] = parser->Next<string_view, string_view, string_view>();

  RETURN_ON_PARSE_ERROR(*parser, cmd_cntx);

  if (parser->HasNext())
    return cmd_cntx->SendError(UnknownSubCmd("CREATECONSUMER", "XGROUP"));

  auto cb = [&, &key = key, &gname = gname, &consumer = consumer](Transaction* t,
                                                                  EngineShard* shard) {
    return OpCreateConsumer(t->GetOpArgs(shard), key, gname, consumer);
  };
  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);

  switch (result.status()) {
    case OpStatus::OK:
      return cmd_cntx->SendLong(1);
    case OpStatus::KEY_EXISTS:
      return cmd_cntx->SendLong(0);
    case OpStatus::SKIPPED:
      return cmd_cntx->SendError(NoGroupError(key, gname));
    case OpStatus::KEY_NOTFOUND:
      return cmd_cntx->SendError(kXGroupKeyNotFound);
    default:
      cmd_cntx->SendError(result.status());
  }
}

void DelConsumer(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto [key, gname, consumer] = parser->Next<string_view, string_view, string_view>();

  RETURN_ON_PARSE_ERROR(*parser, cmd_cntx);

  if (parser->HasNext())
    return cmd_cntx->SendError(UnknownSubCmd("DELCONSUMER", "XGROUP"));

  auto cb = [&, &key = key, &gname = gname, &consumer = consumer](Transaction* t,
                                                                  EngineShard* shard) {
    return OpDelConsumer(t->GetOpArgs(shard), key, gname, consumer);
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);

  switch (result.status()) {
    case OpStatus::OK:
      return cmd_cntx->SendLong(*result);
    case OpStatus::SKIPPED:
      return cmd_cntx->SendError(NoGroupError(key, gname));
    case OpStatus::KEY_NOTFOUND:
      return cmd_cntx->SendError(kXGroupKeyNotFound);
    default:
      cmd_cntx->SendError(result.status());
  }
}

void SetId(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  auto [key, gname, id] = parser->Next<string_view, string_view, string_view>();
  std::optional<int64_t> entries_read;

  while (parser->HasNext()) {
    if (parser->Check("ENTRIESREAD") && parser->HasAtLeast(1)) {
      entries_read = parser->Next<int64>();
      if (parser->HasError() || *entries_read < SCG_INVALID_ENTRIES_READ) {
        return cmd_cntx->SendError(kSyntaxErr);
      }
    } else {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  RETURN_ON_PARSE_ERROR(*parser, cmd_cntx);

  auto cb = [&, &key = key, &gname = gname, &id = id](Transaction* t, EngineShard* shard) {
    return OpSetId(t->GetOpArgs(shard), key, gname, id, entries_read);
  };

  OpStatus result = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  switch (result) {
    case OpStatus::SKIPPED:
      return cmd_cntx->SendError(NoGroupError(key, gname));
    case OpStatus::KEY_NOTFOUND:
      return cmd_cntx->SendError(kXGroupKeyNotFound);
    default:
      cmd_cntx->SendError(result);
  }
}

void XGroupHelp(CmdArgList args, CommandContext* cmd_cntx) {
  string_view help_arr[] = {"XGROUP <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
                            "CREATE <key> <groupname> <id|$> [option]",
                            "    Create a new consumer group. Options are:",
                            "    * MKSTREAM",
                            "      Create the empty stream if it does not exist.",
                            "    * ENTRIESREAD entries_read",
                            "      Set the group's entries_read counter (internal use).",
                            "CREATECONSUMER <key> <groupname> <consumer>",
                            "    Create a new consumer in the specified group.",
                            "DELCONSUMER <key> <groupname> <consumer>",
                            "    Remove the specified consumer.",
                            "DESTROY <key> <groupname>",
                            "    Remove the specified group.",
                            "SETID <key> <groupname> <id|$> [ENTRIESREAD entries_read]",
                            "    Set the current group ID and entries_read counter.",
                            "HELP",
                            "    Print this help."};
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  return rb->SendSimpleStrArr(help_arr);
}

OpResult<int64_t> OpTrim(const OpArgs& op_args, std::string_view key, const TrimOpts& opts,
                         bool journal_as_minid) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
  if (!res_it) {
    if (res_it.status() == OpStatus::KEY_NOTFOUND) {
      return 0;
    }
    return res_it.status();
  }

  PrimeValue& pv = res_it->it->second;
  stream* s = (stream*)pv.RObjPtr();

  StreamMemTracker mem_tracker;

  int64_t deleted_items_number = TrimStream(opts, s);

  RecordStreamAccess(op_args, StreamAccessKind::kSequential);

  mem_tracker.UpdateStreamSize(pv);

  if (op_args.shard->journal() && journal_as_minid) {
    const bool stream_is_empty = s->length == 0;
    if (stream_is_empty) {
      // We need remove the whole stream in replica
      RecordJournal(op_args, "XTRIM"sv, ArgSlice{key, "MAXLEN"sv, "0"sv});
    } else {
      // We need to set exact MinId in the journal.
      // For this we are using new first_id from the stream
      std::string last_id = StreamsIdToString(s->first_id);
      RecordJournal(op_args, "XTRIM"sv, ArgSlice{key, "MINID"sv, last_id});
    }
  }

  return deleted_items_number;
}

ParseResult<TrimOpts> ParseTrimOpts(bool max_len, CmdArgParser* parser) {
  TrimOpts opts;
  opts.approx = parser->Check("~");
  if (!opts.approx) {
    parser->Check("=");
  }

  if (max_len) {
    opts.length_or_id = parser->Next<uint32_t>();
  } else {
    ParsedStreamId parsed_id;
    if (!ParseID(parser->Next(), false, 0, &parsed_id)) {
      return CreateSyntaxError(kSyntaxErr);
    }

    opts.length_or_id = parsed_id;  // trivial copy
  }

  if (parser->Check("LIMIT")) {
    if (!opts.approx) {
      return CreateSyntaxError(kSyntaxErr);
    }

    opts.limit = parser->Next<uint32_t>();
  }

  return opts;
}

ParseResult<TrimOpts> ParseTrimOpts(CmdArgParser* parser) {
  bool max_len = parser->Check("MAXLEN");
  if (!max_len) {
    parser->ExpectTag("MINID");
  }

  auto res = ParseTrimOpts(max_len, parser);

  if (parser->Check("MAXLEN") || parser->Check("MINID")) {
    return CreateSyntaxError(kTrimOptionConflictErr);
  }

  return res;
}

ParseResult<AddOpts> ParseAddOpts(CmdArgParser* parser) {
  AddOpts opts;
  while (parser->HasNext()) {
    if (parser->Check("NOMKSTREAM")) {
      opts.no_mkstream = true;
      continue;
    }

    bool max_len = parser->Check("MAXLEN");
    if (max_len || parser->Check("MINID")) {
      if (opts.trim_opts) {
        return CreateSyntaxError(kTrimOptionConflictErr);
      }

      auto trim_opts = ParseTrimOpts(max_len, parser);
      if (!trim_opts) {
        return make_unexpected(trim_opts.error());
      }

      opts.trim_opts = trim_opts.value();  // trivial copy
    } else {
      // It is StreamId
      std::string_view id = parser->Next();
      if (!ParseID(id, true, 0, &opts.parsed_id)) {
        return CreateSyntaxError(kInvalidStreamId);
      }
      break;
    }
  }

  return opts;
}

struct StreamReplies {
  explicit StreamReplies(SinkReplyBuilder* rb) : rb{static_cast<RedisReplyBuilder*>(rb)} {
    DCHECK(dynamic_cast<RedisReplyBuilder*>(rb));
  }

  void SendRecord(const Record& record) const {
    RedisReplyBuilder::ArrayScope scope{rb, 2};
    rb->SendBulkString(StreamIdRepr(record.id));
    rb->StartArray(record.kv_arr.size() * 2);
    for (const auto& k_v : record.kv_arr) {
      rb->SendBulkString(k_v.first);
      rb->SendBulkString(k_v.second);
    }
  }

  void SendIDs(absl::Span<const streamID> ids) const {
    RedisReplyBuilder::ArrayScope scope{rb, ids.size()};
    for (auto id : ids)
      rb->SendBulkString(StreamIdRepr(id));
  }

  void SendRecords(absl::Span<const Record> records) const {
    RedisReplyBuilder::ArrayScope scope{rb, records.size()};
    for (const auto& record : records)
      SendRecord(record);
  }

  void SendStreamRecords(string_view key, absl::Span<const Record> records) const {
    rb->SendBulkString(key);
    SendRecords(records);
  }

  void SendClaimInfo(const ClaimInfo& ci) const {
    if (ci.justid) {
      SendIDs(ci.ids);
    } else {
      SendRecords(ci.records);
    }
  }

  RedisReplyBuilder* rb;
};

std::optional<ReadOpts> ParseReadArgsOrReply(CmdArgList args, bool read_group,
                                             SinkReplyBuilder* builder) {
  size_t streams_count = 0;

  ReadOpts opts;
  opts.read_group = read_group;
  size_t id_indx = 0;

  if (opts.read_group) {
    string arg = absl::AsciiStrToUpper(ArgS(args, id_indx));

    if (arg.size() - 1 < 2) {
      builder->SendError(kSyntaxErr);
      return std::nullopt;
    }

    if (arg != "GROUP") {
      const auto m = "Missing 'GROUP' in 'XREADGROUP' command";
      builder->SendError(m, kSyntaxErr);
      return std::nullopt;
    }
    id_indx++;
    opts.group_name = ArgS(args, id_indx);
    opts.consumer_name = ArgS(args, ++id_indx);
    if (opts.consumer_name.empty()) {
      builder->SendError("consumer name can't be empty", kSyntaxErrType);
      return std::nullopt;
    }
    id_indx++;
  }

  for (; id_indx < args.size(); ++id_indx) {
    string arg = absl::AsciiStrToUpper(ArgS(args, id_indx));

    bool remaining_args = args.size() - id_indx - 1 > 0;
    if (arg == "BLOCK" && remaining_args) {
      id_indx++;
      arg = ArgS(args, id_indx);
      if (!absl::SimpleAtoi(arg, &opts.timeout)) {
        builder->SendError(kInvalidIntErr);
        return std::nullopt;
      }
    } else if (arg == "COUNT" && remaining_args) {
      id_indx++;
      arg = ArgS(args, id_indx);
      if (!absl::SimpleAtoi(arg, &opts.count)) {
        builder->SendError(kInvalidIntErr);
        return std::nullopt;
      }
    } else if (opts.read_group && arg == "NOACK") {
      opts.noack = true;
    } else if (arg == "STREAMS" && remaining_args) {
      opts.streams_arg = id_indx + 1;

      size_t pair_count = args.size() - opts.streams_arg;
      if ((pair_count % 2) != 0) {
        const char* cmd_name = read_group ? "xreadgroup" : "xread";
        const char* symbol = read_group ? ">" : "$";
        const auto msg = absl::StrCat("Unbalanced '", cmd_name,
                                      "' list of streams: for each stream key an ID or '", symbol,
                                      "' must be specified");
        builder->SendError(msg, kSyntaxErr);
        return std::nullopt;
      }
      streams_count = pair_count / 2;
      break;
    } else {
      builder->SendError(kSyntaxErr);
      return std::nullopt;
    }
  }

  // STREAMS option is required.
  if (opts.streams_arg == 0) {
    builder->SendError(kSyntaxErr);
    return std::nullopt;
  }

  // Parse the stream IDs.
  for (size_t i = opts.streams_arg + streams_count; i < args.size(); i++) {
    string_view key = ArgS(args, i - streams_count);
    string_view idstr = ArgS(args, i);

    StreamIDsItem sitem;
    ParsedStreamId id;

    if (idstr == "$") {
      // Set ID to 0 so if the ID cannot be resolved (when the stream doesn't
      // exist) it takes the first entry added.
      if (opts.read_group) {
        builder->SendError("The $ can be specified only when calling XREAD.", kSyntaxErr);
        return std::nullopt;
      }
      id.val.ms = 0;
      id.val.seq = 0;
      id.resolve_last_id = true;
      sitem.id = id;
      auto [_, is_inserted] = opts.stream_ids.emplace(key, sitem);
      if (!is_inserted) {
        builder->SendError(kSameStreamFound);
        return std::nullopt;
      }
      continue;
    }

    if (idstr == ">") {
      if (!opts.read_group) {
        builder->SendError(
            "The > ID can be specified only when calling XREADGROUP using the GROUP <group> "
            "<consumer> option.",
            kSyntaxErr);
        return std::nullopt;
      }
      id.val.ms = UINT64_MAX;
      id.val.seq = UINT64_MAX;
      sitem.id = id;
      auto [_, is_inserted] = opts.stream_ids.emplace(key, sitem);
      if (!is_inserted) {
        builder->SendError(kSameStreamFound);
        return std::nullopt;
      }
      continue;
    }

    if (!ParseID(idstr, true, 0, &id)) {
      builder->SendError(kInvalidStreamId, kSyntaxErrType);
      return std::nullopt;
    }

    // We only include messages with IDs greater than start so increment the
    // starting ID.
    StreamIncrID(&id.val);
    sitem.id = id;
    auto [_, is_inserted] = opts.stream_ids.emplace(key, sitem);
    if (!is_inserted) {
      builder->SendError(kSameStreamFound);
      return std::nullopt;
    }
  }
  return opts;
}

void XRangeGeneric(std::string_view key, std::string_view start, std::string_view end,
                   CmdArgList args, bool is_rev, CommandContext* cmd_cntx) {
  RangeOpts range_opts;
  RangeId rs, re;

  if (!ParseRangeId(start, RangeBoundary::kStart, &rs) ||
      !ParseRangeId(end, RangeBoundary::kEnd, &re)) {
    return cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
  }

  if (rs.exclude && StreamIncrID(&rs.parsed_id.val) != C_OK) {
    return cmd_cntx->SendError("invalid start ID for the interval", kSyntaxErrType);
  }

  if (re.exclude && StreamDecrID(&re.parsed_id.val) != C_OK) {
    return cmd_cntx->SendError("invalid end ID for the interval", kSyntaxErrType);
  }

  if (!args.empty()) {
    if (args.size() != 2) {
      return cmd_cntx->SendError(WrongNumArgsError("XRANGE"), kSyntaxErrType);
    }

    string opt = absl::AsciiStrToUpper(ArgS(args, 0));
    string_view val = ArgS(args, 1);

    if (opt != "COUNT" || !absl::SimpleAtoi(val, &range_opts.count)) {
      return cmd_cntx->SendError(kSyntaxErr);
    }
  }

  range_opts.start = rs.parsed_id;
  range_opts.end = re.parsed_id;
  range_opts.is_rev = is_rev;

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRange(t->GetOpArgs(shard), key, range_opts);
  };

  OpResult<RecordVec> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (result) {
    SinkReplyBuilder::ReplyAggregator agg(rb);
    StreamReplies{rb}.SendRecords(*result);
    return;
  }

  if (result.status() == OpStatus::KEY_NOTFOUND) {
    return rb->SendEmptyArray();
  }
  return cmd_cntx->SendError(result.status());
}

void JournalConsumerCreationIfNeeded(OpArgs op_args, const ReadOpts& opts, std::string_view key) {
  const bool is_consumer_new = opts.stream_ids.at(key).is_consumer_new;

  if (!op_args.shard->journal() || !is_consumer_new) {
    return;
  }

  CmdArgVec args = {"CREATECONSUMER", key, opts.group_name, opts.consumer_name};
  RecordJournal(op_args, "XGROUP"sv, args);
}

// Valkey 7.2.11:
// --------------
// If the consumer was created but nothing was read the consumer is *not* deleted
// and XINFO should show it. If NOACK is used, consumer creation is replicated
// but ignored when NOACK is omitted.
// Journal rewrites for when reading via `>`:
// * without noack -> xclaim + xgroup setid
// * with noack -> xgroup createconsumer +  xgroup setid
//
// Redis 7.0.15:
// --------------
// Redis deletes the consumer in case the stream is empty and nothing
// was read even if the command blocks. On the later case, after
// unblocking, the consumer is created again and its side effects are
// replicated similar to what described above.
//
// Dragonfly simply propagates consumer creation but does not roll back consumer
// creation.
void JournalXReadGroupIfNeeded(OpArgs op_args, const ReadOpts& opts, const RecordVec& records,
                               std::string_view key) {
  if (!op_args.shard->journal()) {
    return;
  }

  const bool serve_history = opts.stream_ids.at(key).serve_history;

  if (serve_history) {
    return;
  }

  // Reading from >
  auto journal_xgroup = [&opts, op_args](const auto& records, std::string_view key) {
    if (!records.empty()) {
      const auto& sitem = opts.stream_ids.at(key);
      auto id = absl::StrCat(records.back().id.ms, "-", records.back().id.seq);
      auto entries_read = absl::StrCat(sitem.group->entries_read);
      CmdArgVec journal_args = {"SETID", key, opts.group_name, id, "ENTRIESREAD", entries_read};
      RecordJournal(op_args, "XGROUP"sv, journal_args);
    }
  };

  // If NOACK is *not* set we add entries to PEL. Consumer is created as a side
  // effect of XCLAIM.
  if (!opts.noack) {
    for (auto& record : records) {
      auto id = absl::StrCat(record.id.ms, "-", record.id.seq);
      auto deliv_time = absl::StrCat(record.delivery_time);
      CmdArgVec journal_args = {
          key, opts.group_name, opts.consumer_name, "0",      id, "TIME", deliv_time, "RETRYCOUNT",
          "1", "FORCE",         "JUSTID",           "LASTID", id};

      RecordJournal(op_args, "XCLAIM"sv, journal_args);
    }
    journal_xgroup(records, key);
    return;
  }

  journal_xgroup(records, key);
}

// Set is_consumer_new to true if the consumer is created. Only relevant for,
// when XReadBlock is called from XREADGROUP command.
void XReadBlock(ReadOpts* opts, Transaction* tx, SinkReplyBuilder* builder,
                ConnectionContext* cntx) {
  // If BLOCK is not set just return an empty array as there are no resolvable
  // entries.
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  if (opts->timeout == -1 || tx->IsMulti()) {
    // Close the transaction and release locks.
    tx->Conclude();
    return rb->SendNullArray();
  }

  auto tp = (opts->timeout) ? chrono::steady_clock::now() + chrono::milliseconds(opts->timeout)
                            : Transaction::time_point::max();

  const auto key_checker = [opts](EngineShard* owner, const DbContext& context, Transaction* tx,
                                  std::string_view key) -> bool {
    auto& db_slice = context.GetDbSlice(owner->shard_id());
    auto res_it = db_slice.FindReadOnly(context, key, OBJ_STREAM);
    if (!res_it.ok())
      return false;

    StreamIDsItem& sitem = opts->stream_ids.at(key);
    if (sitem.id.val.ms != UINT64_MAX && sitem.id.val.seq != UINT64_MAX)
      return true;

    const CompactObj& cobj = (*res_it)->second;
    stream* s = GetReadOnlyStream(cobj);
    streamID last_id = s->last_id;
    if (s->length) {
      StreamLastValidID(s, &last_id);
    }

    // Update group pointer and check it's validity
    if (opts->read_group) {
      sitem.group = StreamLookupCG(s, WrapSds(opts->group_name));
      if (!sitem.group)
        return true;  // abort
    }

    return streamCompareID(&last_id, &sitem.group->last_id) > 0;
  };

  if (auto status =
          tx->WaitOnWatch(tp, Transaction::kShardArgs, key_checker, &cntx->blocked, &cntx->paused);
      status != OpStatus::OK)
    return rb->SendNullArray();

  // Resolve the entry in the woken key. Note this must not use OpRead since
  // only the shard that contains the woken key blocks for the awoken
  // transaction to proceed.
  OpResult<RecordVec> result;
  std::string key;
  auto range_cb = [&](Transaction* t, EngineShard* shard) {
    if (auto wake_key = t->GetWakeKey(shard->shard_id()); wake_key) {
      RangeOpts range_opts;
      range_opts.end = ParsedStreamId{.val = streamID{
                                          .ms = UINT64_MAX,
                                          .seq = UINT64_MAX,
                                      }};
      StreamIDsItem& sitem = opts->stream_ids.at(*wake_key);
      range_opts.start = sitem.id;

      // Expect group to exist? No guarantees from transactional framework
      if (opts->read_group && !sitem.group) {
        result = OpStatus::INVALID_VALUE;
        return OpStatus::OK;
      }

      if (sitem.id.val.ms == UINT64_MAX || sitem.id.val.seq == UINT64_MAX) {
        range_opts.start.val = sitem.group->last_id;  // only for '>'
        StreamIncrID(&range_opts.start.val);
      }

      range_opts.group = sitem.group;

      // Update consumer, only for XReadGroup path
      std::optional<StreamMemTracker> tracker;
      if (sitem.group) {
        tracker = StreamMemTracker{};
        sitem.is_consumer_new = false;
        range_opts.consumer = FindOrAddConsumer(opts->consumer_name, sitem.group,
                                                GetCurrentTimeMs(), &sitem.is_consumer_new);
        sitem.consumer = range_opts.consumer;
        if (!sitem.consumer) {
          return OpStatus::OUT_OF_MEMORY;
        }

        if (sitem.consumer->pel->numnodes == 0) {
          LOG(DFATAL) << "Internal error when accessing consumer data, seen_time "
                      << sitem.consumer->seen_time;
          result = OpStatus::CANCELLED;
          return OpStatus::OK;
        }
      }

      key = *wake_key;

      if (tracker) {
        auto op_args = t->GetOpArgs(shard);
        auto& db_slice = op_args.GetDbSlice();
        auto it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STREAM);
        DCHECK(it);
        if (it) {
          tracker->UpdateStreamSize(it->it->second);
        }
      }

      range_opts.noack = opts->noack;
      range_opts.access_kind = StreamAccessKind::kSequential;

      result = OpRange(t->GetOpArgs(shard), *wake_key, range_opts);
      if (result) {
        JournalConsumerCreationIfNeeded(t->GetOpArgs(shard), *opts, *wake_key);
        JournalXReadGroupIfNeeded(t->GetOpArgs(shard), *opts, *result, *wake_key);
      }
    }
    return OpStatus::OK;
  };
  tx->Execute(std::move(range_cb), true);

  if (result) {
    SinkReplyBuilder::ReplyAggregator agg(rb);
    if (opts->read_group && rb->IsResp3()) {
      rb->StartCollection(1, CollectionType::MAP);
    } else {
      rb->StartArray(1);
      rb->StartArray(2);
    }
    return StreamReplies{rb}.SendStreamRecords(key, *result);
  } else if (result.status() == OpStatus::INVALID_VALUE) {
    return rb->SendError("-NOGROUP the consumer group this client was blocked on no longer exists");
  }
  return rb->SendNullArray();
}

void XReadGeneric2(CmdArgList args, bool read_group, CommandContext* cmd_cntx) {
  optional<ReadOpts> opts = ParseReadArgsOrReply(args, read_group, cmd_cntx->rb());
  if (!opts)
    return;

  // Determine if streams have entries or any error occured
  AggregateValue<optional<facade::ErrorReply>> err;
  atomic_bool have_entries = false;
  auto* tx = cmd_cntx->tx();
  // With a single shard we can call OpRead in a single hop, falling back to
  // avoid concluding if no entries are available.
  const bool is_single_shard = tx->GetUniqueShardCnt() == 1;
  vector<RecordVec> fastread_prefetched;

  auto cb = [&](auto* tx, auto* es) -> Transaction::RunnableResult {
    auto op_args = tx->GetOpArgs(es);
    for (string_view skey : tx->GetShardArgs(es->shard_id())) {
      if (auto res = HasEntries2(op_args, skey, &*opts); holds_alternative<facade::ErrorReply>(res))
        err = get<facade::ErrorReply>(res);
      else if (holds_alternative<bool>(res) && get<bool>(res))
        have_entries.store(true, memory_order_relaxed);
    }

    if (is_single_shard) {
      if (have_entries.load(memory_order_relaxed)) {
        fastread_prefetched = OpRead(tx->GetOpArgs(es), tx->GetShardArgs(es->shard_id()), *opts);
        if (read_group) {
          size_t index = 0;
          for (auto key : tx->GetShardArgs(es->shard_id())) {
            // We can batch here to improve journal writes
            JournalConsumerCreationIfNeeded(op_args, *opts, key);
            JournalXReadGroupIfNeeded(op_args, *opts, fastread_prefetched[index++], key);
          }
        }
      } else {
        // We didn't read any entries but we might added new consumers
        for (auto key : tx->GetShardArgs(es->shard_id())) {
          JournalConsumerCreationIfNeeded(op_args, *opts, key);
        }
        return {OpStatus::OK, Transaction::RunnableResult::AVOID_CONCLUDING};
      }
    }
    return OpStatus::OK;
  };
  tx->Execute(cb, is_single_shard);

  if (err) {
    tx->Conclude();
    return cmd_cntx->SendError(**err);
  }

  if (!have_entries.load(memory_order_relaxed))
    return XReadBlock(&*opts, tx, cmd_cntx->rb(), cmd_cntx->server_conn_cntx());

  vector<vector<RecordVec>> xread_resp;
  if (is_single_shard && have_entries.load(memory_order_relaxed)) {
    xread_resp = {std::move(fastread_prefetched)};
  } else {
    xread_resp.resize(shard_set->size());
    auto read_cb = [&](Transaction* t, EngineShard* shard) {
      ShardId sid = shard->shard_id();
      auto op_args = tx->GetOpArgs(shard);
      xread_resp[sid] = OpRead(op_args, t->GetShardArgs(sid), *opts);
      if (read_group) {
        size_t index = 0;
        for (auto key : tx->GetShardArgs(sid)) {
          JournalConsumerCreationIfNeeded(op_args, *opts, key);
          JournalXReadGroupIfNeeded(op_args, *opts, xread_resp[sid][index++], key);
        }
      }
      return OpStatus::OK;
    };
    tx->Execute(std::move(read_cb), true);
  }

  // Count number of streams and merge final results in correct order
  int resolved_streams = 0;
  vector<RecordVec> results(opts->stream_ids.size());
  for (size_t i = 0; i < xread_resp.size(); i++) {
    vector<RecordVec>& sub_results = xread_resp[i];
    ShardId sid = xread_resp.size() < shard_set->size() ? tx->GetUniqueShard() : i;
    if (!tx->IsActive(sid)) {
      DCHECK(sub_results.empty());
      continue;
    }

    ShardArgs shard_args = tx->GetShardArgs(sid);
    DCHECK_EQ(shard_args.Size(), sub_results.size());

    auto shard_args_it = shard_args.begin();
    for (size_t j = 0; j < sub_results.size(); j++, ++shard_args_it) {
      if (sub_results[j].empty())
        continue;

      resolved_streams++;
      results[shard_args_it.index() - opts->streams_arg] = std::move(sub_results[j]);
    }
  }

  // Send all results back
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  SinkReplyBuilder::ReplyScope scope(rb);
  if (opts->read_group) {
    if (rb->IsResp3()) {
      rb->StartCollection(opts->stream_ids.size(), CollectionType::MAP);
      for (size_t i = 0; i < opts->stream_ids.size(); i++) {
        string_view key = ArgS(args, i + opts->streams_arg);
        StreamReplies{rb}.SendStreamRecords(key, results[i]);
      }
    } else {
      rb->StartArray(opts->stream_ids.size());
      for (size_t i = 0; i < opts->stream_ids.size(); i++) {
        string_view key = ArgS(args, i + opts->streams_arg);
        rb->StartArray(2);
        StreamReplies{rb}.SendStreamRecords(key, results[i]);
      }
    }
  } else {
    if (rb->IsResp3()) {
      rb->StartCollection(resolved_streams, CollectionType::MAP);
      for (size_t i = 0; i < results.size(); ++i) {
        if (results[i].empty()) {
          continue;
        }
        string_view key = ArgS(args, i + opts->streams_arg);
        StreamReplies{rb}.SendStreamRecords(key, results[i]);
      }
    } else {
      rb->StartArray(resolved_streams);
      for (size_t i = 0; i < results.size(); i++) {
        if (results[i].empty())
          continue;
        string_view key = ArgS(args, i + opts->streams_arg);
        rb->StartArray(2);
        StreamReplies{rb}.SendStreamRecords(key, results[i]);
      }
    }
  }
}

void HelpSubCmd(facade::CmdArgParser* parser, CommandContext* cmd_cntx) {
  XGroupHelp(parser->Tail(), cmd_cntx);
}

bool ParseXpendingOptions(CmdArgList& args, PendingOpts& opts, SinkReplyBuilder* builder) {
  size_t id_indx = 0;
  string arg = absl::AsciiStrToUpper(ArgS(args, id_indx));

  if (arg == "IDLE" && args.size() > 4) {
    id_indx++;
    if (!absl::SimpleAtoi(ArgS(args, id_indx), &opts.min_idle_time)) {
      builder->SendError(kInvalidIntErr, kSyntaxErrType);
      return false;
    }
    // Ignore negative min_idle_time
    opts.min_idle_time = std::max(opts.min_idle_time, static_cast<int64_t>(0));
    args.remove_prefix(2);
    id_indx = 0;
  }
  if (args.size() < 3) {
    builder->SendError(WrongNumArgsError("XPENDING"), kSyntaxErrType);
    return false;
  }

  // Parse start and end
  RangeId rs, re;
  string_view start = ArgS(args, id_indx);
  id_indx++;
  string_view end = ArgS(args, id_indx);
  if (!ParseRangeId(start, RangeBoundary::kStart, &rs) ||
      !ParseRangeId(end, RangeBoundary::kEnd, &re)) {
    builder->SendError(kInvalidStreamId, kSyntaxErrType);
    return false;
  }

  if (rs.exclude && StreamIncrID(&rs.parsed_id.val) != C_OK) {
    builder->SendError("invalid start ID for the interval", kSyntaxErrType);
    return false;
  }

  if (re.exclude && StreamDecrID(&re.parsed_id.val) != C_OK) {
    builder->SendError("invalid end ID for the interval", kSyntaxErrType);
    return false;
  }
  id_indx++;
  opts.start = rs.parsed_id;
  opts.end = re.parsed_id;

  // Parse count
  if (!absl::SimpleAtoi(ArgS(args, id_indx), &opts.count)) {
    builder->SendError(kInvalidIntErr, kSyntaxErrType);
    return false;
  }

  // Ignore negative count value
  opts.count = std::max(opts.count, static_cast<int64_t>(0));
  if (args.size() - id_indx - 1) {
    id_indx++;
    opts.consumer_name = ArgS(args, id_indx);
  }
  return true;
}

}  // namespace

void CmdXAdd(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  string_view key = parser.Next();

  auto parsed_add_opts = ParseAddOpts(&parser);

  if (auto err = parser.TakeError(); err || !parsed_add_opts) {
    cmd_cntx->SendError(!parsed_add_opts ? parsed_add_opts.error() : err.MakeReply());
    return;
  }

  // Save the index of the stream ID in the arguments list.
  // We need this during journaling
  // It is (parser.GetCurrentIndex() - 1) because the stream id is the last parsed argument in the
  // ParseAddOpts
  const size_t stream_id_index_in_args = parser.GetCurrentIndex() - 1;
  AddArgsJournaler journaler{{args.begin(), args.end()}, stream_id_index_in_args};

  CmdArgList fields = parser.Tail();
  if (fields.empty() || fields.size() % 2 != 0) {
    return rb->SendError(WrongNumArgsError("XADD"), kSyntaxErrType);
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAdd(t->GetOpArgs(shard), key, parsed_add_opts.value(), fields, journaler);
  };

  OpResult<streamID> add_result = cmd_cntx->tx()->ScheduleSingleHopT(cb);

  if (add_result) {
    rb->SendBulkString(StreamIdRepr(*add_result));
  } else {
    if (add_result == OpStatus::KEY_NOTFOUND) {
      rb->SendNull();
    } else if (add_result == OpStatus::STREAM_ID_SMALL) {
      cmd_cntx->SendError(LeqTopIdError("XADD"));
    } else {
      cmd_cntx->SendError(add_result.status());
    }
  }
}

absl::InlinedVector<streamID, 8> GetXclaimIds(CmdArgList& args) {
  size_t i;
  absl::InlinedVector<streamID, 8> ids;
  for (i = 0; i < args.size(); ++i) {
    ParsedStreamId parsed_id;
    string_view str_id = ArgS(args, i);
    if (!ParseID(str_id, true, 0, &parsed_id)) {
      if (i > 0) {
        break;
      }
      return ids;
    }
    ids.push_back(parsed_id.val);
  }
  args.remove_prefix(i);
  return ids;
}

bool ParseXclaimOptions(CmdArgList args, ClaimOpts& opts, CommandContext* cmd_cntx) {
  for (size_t i = 0; i < args.size(); ++i) {
    string arg = absl::AsciiStrToUpper(ArgS(args, i));
    bool remaining_args = args.size() - i - 1 > 0;

    if (remaining_args) {
      if (arg == "IDLE") {
        arg = ArgS(args, ++i);
        if (!absl::SimpleAtoi(arg, &opts.delivery_time)) {
          cmd_cntx->SendError(kInvalidIntErr);
          return false;
        }
        continue;
      } else if (arg == "TIME") {
        arg = ArgS(args, ++i);
        if (!absl::SimpleAtoi(arg, &opts.delivery_time)) {
          cmd_cntx->SendError(kInvalidIntErr);
          return false;
        }
        continue;
      } else if (arg == "RETRYCOUNT") {
        arg = ArgS(args, ++i);
        if (!absl::SimpleAtoi(arg, &opts.retry)) {
          cmd_cntx->SendError(kInvalidIntErr);
          return false;
        }
        continue;
      } else if (arg == "LASTID") {
        opts.flags |= kClaimLastID;
        arg = ArgS(args, ++i);
        ParsedStreamId parsed_id;
        if (ParseID(arg, true, 0, &parsed_id)) {
          opts.last_id = parsed_id.val;
        } else {
          cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
          return false;
        }
        continue;
      }
    }
    if (arg == "FORCE") {
      opts.flags |= kClaimForce;
    } else if (arg == "JUSTID") {
      opts.flags |= kClaimJustID;
    } else {
      cmd_cntx->SendError("Unknown argument given for XCLAIM command", kSyntaxErr);
      return false;
    }
  }
  return true;
}

void CmdXClaim(CmdArgList args, CommandContext* cmd_cntx) {
  ClaimOpts opts;
  string_view key = ArgS(args, 0);
  opts.group = ArgS(args, 1);
  opts.consumer = ArgS(args, 2);

  if (opts.group.empty() || opts.consumer.empty()) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  if (!absl::SimpleAtoi(ArgS(args, 3), &opts.min_idle_time)) {
    return cmd_cntx->SendError(kSyntaxErr);
  }
  // Ignore negative min-idle-time
  opts.min_idle_time = std::max(opts.min_idle_time, static_cast<int64>(0));
  args.remove_prefix(4);

  auto ids = GetXclaimIds(args);
  if (ids.empty()) {
    // No ids given.
    return cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
  }

  // parse the options
  if (!ParseXclaimOptions(args, opts, cmd_cntx))
    return;

  uint64_t now = cmd_cntx->tx()->GetDbContext().time_now_ms;
  DCHECK_GT(now, 0u);

  if (opts.delivery_time < 0 || static_cast<uint64_t>(opts.delivery_time) > now)
    opts.delivery_time = now;

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpClaim(t->GetOpArgs(shard), key, opts, absl::Span{ids.data(), ids.size()});
  };
  OpResult<ClaimInfo> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (!result) {
    if (result.status() == OpStatus::SKIPPED) {
      // Return empty result when operation is skipped
      StreamReplies{cmd_cntx->rb()}.SendClaimInfo(ClaimInfo{});
      return;
    }
    cmd_cntx->SendError(result.status());
    return;
  }

  StreamReplies{cmd_cntx->rb()}.SendClaimInfo(result.value());
}

void CmdXDel(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  args.remove_prefix(1);

  absl::InlinedVector<streamID, 8> ids(args.size());

  for (size_t i = 0; i < args.size(); ++i) {
    ParsedStreamId parsed_id;
    string_view str_id = ArgS(args, i);
    if (!ParseID(str_id, true, 0, &parsed_id)) {
      return cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
    }
    ids[i] = parsed_id.val;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpDel(t->GetOpArgs(shard), key, absl::Span{ids.data(), ids.size()});
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (result || result.status() == OpStatus::KEY_NOTFOUND) {
    return cmd_cntx->SendLong(*result);
  }

  cmd_cntx->SendError(result.status());
}

void CmdXGroup(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};

  auto sub_cmd_func = parser.MapNext("HELP", &HelpSubCmd, "CREATE", &CreateGroup, "DESTROY",
                                     &DestroyGroup, "CREATECONSUMER", &CreateConsumer,
                                     "DELCONSUMER", &DelConsumer, "SETID", &SetId);

  if (auto err = parser.TakeError(); err)
    return cmd_cntx->SendError(err.MakeReply());

  sub_cmd_func(&parser, cmd_cntx);
}

void CmdXInfo(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  string sub_cmd = absl::AsciiStrToUpper(ArgS(args, 0));

  if (sub_cmd == "HELP") {
    string_view help_arr[] = {"CONSUMERS <key> <groupname>",
                              "    Show consumers of <groupname>.",
                              "GROUPS <key>",
                              "    Show the stream consumer groups.",
                              "STREAM <key> [FULL [COUNT <count>]",
                              "    Show information about the stream.",
                              "HELP",
                              "    Prints this help."};
    return rb->SendSimpleStrArr(help_arr);
  }

  ConnectionContext* cntx = cmd_cntx->server_conn_cntx();
  if (args.size() >= 2) {
    string_view key = ArgS(args, 1);
    ShardId sid = Shard(key, shard_set->size());

    if (sub_cmd == "GROUPS") {
      // We do not use transactional xemantics for xinfo since it's informational command.
      auto cb = [&]() {
        EngineShard* shard = EngineShard::tlocal();
        DbContext db_context{cntx->ns, cntx->db_index(), GetCurrentTimeMs()};
        return OpListGroups(db_context, key, shard);
      };

      OpResult<vector<GroupInfo>> result = shard_set->Await(sid, std::move(cb));
      if (result) {
        rb->StartArray(result->size());
        for (const auto& ginfo : *result) {
          string last_id = StreamIdRepr(ginfo.last_id);

          rb->StartCollection(6, CollectionType::MAP);
          rb->SendBulkString("name");
          rb->SendBulkString(ginfo.name);
          rb->SendBulkString("consumers");
          rb->SendLong(ginfo.consumer_size);
          rb->SendBulkString("pending");
          rb->SendLong(ginfo.pending_size);
          rb->SendBulkString("last-delivered-id");
          rb->SendBulkString(last_id);
          rb->SendBulkString("entries-read");
          if (ginfo.entries_read != SCG_INVALID_ENTRIES_READ) {
            rb->SendLong(ginfo.entries_read);
          } else {
            rb->SendNull();
          }
          rb->SendBulkString("lag");
          if (ginfo.lag != SCG_INVALID_LAG) {
            rb->SendLong(ginfo.lag);
          } else {
            rb->SendNull();
          }
        }
        return;
      }
      return cmd_cntx->SendError(result.status());
    } else if (sub_cmd == "STREAM") {
      int full = 0;
      size_t count = 10;  // default count for xinfo streams

      if (args.size() == 4 || args.size() > 5) {
        return rb->SendError(
            "unknown subcommand or wrong number of arguments for 'STREAM'. Try XINFO HELP.");
      }

      if (args.size() >= 3) {
        full = 1;
        string full_arg = absl::AsciiStrToUpper(ArgS(args, 2));
        if (full_arg != "FULL") {
          return rb->SendError(
              "unknown subcommand or wrong number of arguments for 'STREAM'. Try XINFO HELP.");
        }
        if (args.size() > 3) {
          string count_arg = absl::AsciiStrToUpper(ArgS(args, 3));
          string_view count_value_arg = ArgS(args, 4);
          if (count_arg != "COUNT") {
            return rb->SendError(
                "unknown subcommand or wrong number of arguments for 'STREAM'. Try XINFO HELP.");
          }

          if (!absl::SimpleAtoi(count_value_arg, &count)) {
            return rb->SendError(kInvalidIntErr);
          }
        }
      }

      auto cb = [&]() {
        EngineShard* shard = EngineShard::tlocal();
        return OpStreams(DbContext{cntx->ns, cntx->db_index(), GetCurrentTimeMs()}, key, shard,
                         full, count);
      };

      OpResult<StreamInfo> sinfo = shard_set->Await(sid, std::move(cb));
      if (sinfo) {
        if (full) {
          rb->StartCollection(9, CollectionType::MAP);
        } else {
          rb->StartCollection(10, CollectionType::MAP);
        }

        rb->SendBulkString("length");
        rb->SendLong(sinfo->length);

        rb->SendBulkString("radix-tree-keys");
        rb->SendLong(sinfo->radix_tree_keys);

        rb->SendBulkString("radix-tree-nodes");
        rb->SendLong(sinfo->radix_tree_nodes);

        rb->SendBulkString("last-generated-id");
        rb->SendBulkString(StreamIdRepr(sinfo->last_generated_id));

        rb->SendBulkString("max-deleted-entry-id");
        rb->SendBulkString(StreamIdRepr(sinfo->max_deleted_entry_id));

        rb->SendBulkString("entries-added");
        rb->SendLong(sinfo->entries_added);

        rb->SendBulkString("recorded-first-entry-id");
        rb->SendBulkString(StreamIdRepr(sinfo->recorded_first_entry_id));

        if (full) {
          rb->SendBulkString("entries");
          StreamReplies{rb}.SendRecords(sinfo->entries);

          rb->SendBulkString("groups");
          rb->StartArray(sinfo->cgroups.size());
          for (const auto& ginfo : sinfo->cgroups) {
            rb->StartCollection(7, CollectionType::MAP);

            rb->SendBulkString("name");
            rb->SendBulkString(ginfo.name);

            rb->SendBulkString("last-delivered-id");
            rb->SendBulkString(StreamIdRepr(ginfo.last_id));

            rb->SendBulkString("entries-read");
            if (ginfo.entries_read != SCG_INVALID_ENTRIES_READ) {
              rb->SendLong(ginfo.entries_read);
            } else {
              rb->SendNull();
            }
            rb->SendBulkString("lag");
            if (ginfo.lag != SCG_INVALID_LAG) {
              rb->SendLong(ginfo.lag);
            } else {
              rb->SendNull();
            }

            rb->SendBulkString("pel-count");
            rb->SendLong(ginfo.pending_size);

            rb->SendBulkString("pending");
            rb->StartArray(ginfo.stream_nack_vec.size());
            for (const auto& pending_info : ginfo.stream_nack_vec) {
              rb->StartArray(4);
              rb->SendBulkString(StreamIdRepr(pending_info.pel_id));
              rb->SendBulkString(pending_info.consumer_name);
              rb->SendLong(pending_info.delivery_time);
              rb->SendLong(pending_info.delivery_count);
            }

            rb->SendBulkString("consumers");
            rb->StartArray(ginfo.consumer_info_vec.size());
            for (const auto& consumer_info : ginfo.consumer_info_vec) {
              rb->StartCollection(5, CollectionType::MAP);

              rb->SendBulkString("name");
              rb->SendBulkString(consumer_info.name);

              rb->SendBulkString("seen-time");
              rb->SendLong(consumer_info.seen_time);

              rb->SendBulkString("active-time");
              rb->SendLong(consumer_info.active_time);

              rb->SendBulkString("pel-count");
              rb->SendLong(consumer_info.pel_count);

              rb->SendBulkString("pending");
              if (consumer_info.pending.size() == 0) {
                rb->SendEmptyArray();
              } else {
                rb->StartArray(consumer_info.pending.size());
              }
              for (const auto& pending : consumer_info.pending) {
                rb->StartArray(3);

                rb->SendBulkString(StreamIdRepr(pending.pel_id));
                rb->SendLong(pending.delivery_time);
                rb->SendLong(pending.delivery_count);
              }
            }
          }
        } else {
          rb->SendBulkString("groups");
          rb->SendLong(sinfo->groups);

          rb->SendBulkString("first-entry");
          if (sinfo->first_entry.kv_arr.size() != 0) {
            StreamReplies{rb}.SendRecord(sinfo->first_entry);
          } else {
            rb->SendNullArray();
          }

          rb->SendBulkString("last-entry");
          if (sinfo->last_entry.kv_arr.size() != 0) {
            StreamReplies{rb}.SendRecord(sinfo->last_entry);
          } else {
            rb->SendNullArray();
          }
        }
        return;
      }
      return cmd_cntx->SendError(sinfo.status());
    } else if (sub_cmd == "CONSUMERS") {
      if (args.size() < 3) {
        return cmd_cntx->SendError(kSyntaxErr);
      }
      string_view stream_name = ArgS(args, 1);
      string_view group_name = ArgS(args, 2);
      auto cb = [&]() {
        return OpConsumers(DbContext{cntx->ns, cntx->db_index(), GetCurrentTimeMs()},
                           EngineShard::tlocal(), stream_name, group_name);
      };

      OpResult<vector<ConsumerInfo>> result = shard_set->Await(sid, std::move(cb));
      if (result) {
        rb->StartArray(result->size());
        int64_t now_ms = GetCurrentTimeMs();
        for (const auto& consumer_info : *result) {
          int64_t active = consumer_info.active_time;
          int64_t inactive = active != -1 ? now_ms - active : -1;

          rb->StartCollection(4, CollectionType::MAP);
          rb->SendBulkString("name");
          rb->SendBulkString(consumer_info.name);
          rb->SendBulkString("pending");
          rb->SendLong(consumer_info.pel_count);
          rb->SendBulkString("idle");
          rb->SendLong(consumer_info.idle);
          rb->SendBulkString("inactive");
          rb->SendLong(inactive);
        }
        return;
      }
      if (result.status() == OpStatus::INVALID_VALUE) {
        return rb->SendError(NoGroupError(stream_name, group_name));
      }
      return cmd_cntx->SendError(result.status());
    }
  }
  return cmd_cntx->SendError(UnknownSubCmd(sub_cmd, "XINFO"));
}

void CmdXLen(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto cb = [&](Transaction* t, EngineShard* shard) { return OpLen(t->GetOpArgs(shard), key); };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (result || result.status() == OpStatus::KEY_NOTFOUND) {
    return cmd_cntx->SendLong(*result);
  }

  return cmd_cntx->SendError(result.status());
}

void CmdXPending(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  PendingOpts opts;
  opts.group_name = ArgS(args, 1);
  args.remove_prefix(2);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (!args.empty() && !ParseXpendingOptions(args, opts, rb)) {
    return;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpPending(t->GetOpArgs(shard), key, opts);
  };
  OpResult<PendingResult> op_result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (!op_result) {
    if (op_result.status() == OpStatus::SKIPPED)
      return cmd_cntx->SendError(NoGroupError(key, opts.group_name));
    return cmd_cntx->SendError(op_result.status());
  }
  const PendingResult& result = op_result.value();

  SinkReplyBuilder::ReplyScope scope{rb};
  if (std::holds_alternative<PendingReducedResult>(result)) {
    const auto& res = std::get<PendingReducedResult>(result);
    rb->StartArray(4);
    rb->SendLong(res.count);
    if (res.count) {
      rb->SendBulkString(StreamIdRepr(res.start));
      rb->SendBulkString(StreamIdRepr(res.end));
      rb->StartArray(res.consumer_list.size());

      for (auto& [consumer_name, count] : res.consumer_list) {
        rb->StartArray(2);
        rb->SendBulkString(consumer_name);
        rb->SendLong(count);
      }
    } else {
      for (unsigned j = 0; j < 3; ++j)
        rb->SendNull();
    }
  } else {
    const auto& res = std::get<PendingExtendedResultList>(result);
    if (!res.size()) {
      return rb->SendEmptyArray();
    }

    rb->StartArray(res.size());
    for (auto& item : res) {
      rb->StartArray(4);
      rb->SendBulkString(StreamIdRepr(item.start));
      rb->SendBulkString(item.consumer_name);
      rb->SendLong(item.elapsed);
      rb->SendLong(item.delivery_count);
    }
  }
}

void CmdXRange(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = args[0];
  string_view start = args[1];
  string_view end = args[2];

  XRangeGeneric(key, start, end, args.subspan(3), false, cmd_cntx);
}

void CmdXRevRange(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = args[0];
  string_view start = args[1];
  string_view end = args[2];

  XRangeGeneric(key, end, start, args.subspan(3), true, cmd_cntx);
}

// If opts.read_group is true then this is a WRITE command. We don't however journal the consumer
// creation, only the side effects later on from the scheduled callbacks.
variant<bool, facade::ErrorReply> HasEntries2(const OpArgs& op_args, string_view skey,
                                              ReadOpts* opts) {
  const bool is_write_command = opts->read_group;
  auto& db_slice = op_args.GetDbSlice();

  DbSlice::ItAndUpdater it;
  const CompactObj* cobj;

  auto error = [&](auto res_it) -> variant<bool, facade::ErrorReply> {
    if (res_it.status() == OpStatus::WRONG_TYPE)
      return facade::ErrorReply{res_it.status()};
    else if (res_it.status() == OpStatus::KEY_NOTFOUND && opts->read_group)
      return facade::ErrorReply{
          NoGroupOrKey(skey, opts->group_name, " in XREADGROUP with GROUP option")};
    return false;
  };

  if (is_write_command) {
    auto res = db_slice.FindMutable(op_args.db_cntx, skey, OBJ_STREAM);
    if (!res)
      return error(std::move(res));
    it = std::move(*res);
    cobj = &it.it->second;
  } else {
    auto res = db_slice.FindReadOnly(op_args.db_cntx, skey, OBJ_STREAM);
    if (!res)
      return error(res);
    cobj = &(*res)->second;
  }

  stream* s = GetReadOnlyStream(*cobj);

  // Fetch last id
  streamID last_id = s->last_id;
  if (s->length)
    StreamLastValidID(s, &last_id);

  // Check requested
  auto& requested_sitem = opts->stream_ids.at(skey);

  // Look up group consumer if needed
  streamCG* group = nullptr;
  streamConsumer* consumer = nullptr;
  if (is_write_command) {
    group = StreamLookupCG(s, WrapSds(opts->group_name));
    if (!group)
      return facade::ErrorReply{
          NoGroupOrKey(skey, opts->group_name, " in XREADGROUP with GROUP option")};

    StreamMemTracker tracker;
    requested_sitem.is_consumer_new = false;
    consumer = FindOrAddConsumer(opts->consumer_name, group, op_args.db_cntx.time_now_ms,
                                 &requested_sitem.is_consumer_new);
    tracker.UpdateStreamSize(it.it->second);

    requested_sitem.group = group;
    requested_sitem.consumer = consumer;

    // If '>' is not provided, consumer PEL is used. So don't need to block.
    if (requested_sitem.id.val.ms != UINT64_MAX || requested_sitem.id.val.seq != UINT64_MAX) {
      requested_sitem.serve_history = true;
      return true;
    }

    // we know the requested last_id only when we already have it
    if (streamCompareID(&last_id, &requested_sitem.group->last_id) > 0) {
      requested_sitem.id.val = requested_sitem.group->last_id;
      StreamIncrID(&requested_sitem.id.val);
    }
  } else {
    // Resolve $ to the last ID in the stream.
    if (requested_sitem.id.resolve_last_id) {
      requested_sitem.id.val = last_id;
      StreamIncrID(&requested_sitem.id.val);  // include id's strictly greater
      requested_sitem.id.resolve_last_id = false;
      return false;
    }
  }

  return streamCompareID(&last_id, &requested_sitem.id.val) >= 0;
}

void CmdXRead(CmdArgList args, CommandContext* cmd_cntx) {
  XReadGeneric2(args, false, cmd_cntx);
}

void CmdXReadGroup(CmdArgList args, CommandContext* cmd_cntx) {
  XReadGeneric2(args, true, cmd_cntx);
}

void CmdXSetId(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view idstr = ArgS(args, 1);

  ParsedStreamId parsed_id;
  if (!ParseID(idstr, true, 0, &parsed_id)) {
    return cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
  }

  facade::ErrorReply reply(OpStatus::OK);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    reply = OpXSetId(t->GetOpArgs(shard), key, parsed_id.val);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));
  if (reply.status == OpStatus::STREAM_ID_SMALL) {
    return cmd_cntx->SendError(LeqTopIdError("XSETID"));
  }
  return cmd_cntx->SendError(reply);
}

void CmdXTrim(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  std::string_view key = parser.Next();

  auto parsed_trim_opts = ParseTrimOpts(&parser);
  if (!parser.Finalize() || !parsed_trim_opts) {
    auto err = parser.TakeError();
    cmd_cntx->SendError(!parsed_trim_opts ? parsed_trim_opts.error() : err.MakeReply());
    return;
  }

  auto& trim_opts = parsed_trim_opts.value();

  // We can auto-journal if we are not trimming approximately or by maxlen
  const bool enable_auto_journaling = !JournalAsMinId(trim_opts);
  if (enable_auto_journaling) {
    cmd_cntx->tx()->ReviveAutoJournal();
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpTrim(t->GetOpArgs(shard), key, trim_opts, !enable_auto_journaling);
  };

  OpResult<int64_t> trim_result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (trim_result) {
    rb->SendLong(*trim_result);
  } else {
    cmd_cntx->SendError(trim_result.status());
  }
}

void CmdXAck(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view group = ArgS(args, 1);
  args.remove_prefix(2);
  absl::InlinedVector<streamID, 8> ids(args.size());

  for (size_t i = 0; i < args.size(); ++i) {
    ParsedStreamId parsed_id;
    string_view str_id = ArgS(args, i);
    if (!ParseID(str_id, true, 0, &parsed_id)) {
      return cmd_cntx->SendError(kInvalidStreamId, kSyntaxErrType);
    }
    ids[i] = parsed_id.val;
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAck(t->GetOpArgs(shard), key, group, absl::Span{ids.data(), ids.size()});
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (result || result.status() == OpStatus::KEY_NOTFOUND) {
    return cmd_cntx->SendLong(*result);
  }

  cmd_cntx->SendError(result.status());
}

void CmdXAutoClaim(CmdArgList args, CommandContext* cmd_cntx) {
  ClaimOpts opts;
  string_view key = ArgS(args, 0);
  opts.group = ArgS(args, 1);
  opts.consumer = ArgS(args, 2);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (opts.group.empty() || opts.consumer.empty()) {
    return cmd_cntx->SendError(kSyntaxErr);
  }

  if (!absl::SimpleAtoi(ArgS(args, 3), &opts.min_idle_time)) {
    return rb->SendError(kSyntaxErr);
  }

  opts.min_idle_time = std::max((int64)0, opts.min_idle_time);

  string_view start = ArgS(args, 4);
  RangeId rs;

  if (!ParseRangeId(start, RangeBoundary::kStart, &rs)) {
    return rb->SendError(kSyntaxErr);
  }

  if (rs.exclude && StreamDecrID(&rs.parsed_id.val) != C_OK) {
    return rb->SendError("invalid start ID for the interval", kSyntaxErrType);
  }
  opts.start = rs.parsed_id.val;

  for (size_t i = 5; i < args.size(); ++i) {
    string arg = absl::AsciiStrToUpper(ArgS(args, i));

    bool remaining_args = args.size() - i - 1 > 0;

    if (remaining_args) {
      if (arg == "COUNT") {
        arg = ArgS(args, ++i);
        if (!absl::SimpleAtoi(arg, &opts.count)) {
          return rb->SendError(kInvalidIntErr);
        }
        if (opts.count <= 0 || opts.count >= (1L << 18)) {
          return rb->SendError("COUNT must be > 0 and less than 2^18");
        }
        continue;
      }
    }
    if (arg == "JUSTID") {
      opts.flags |= kClaimJustID;
    } else {
      return cmd_cntx->SendError("Unknown argument given for XAUTOCLAIM command", kSyntaxErr);
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpAutoClaim(t->GetOpArgs(shard), key, opts);
  };
  OpResult<ClaimInfo> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));

  if (result.status() == OpStatus::KEY_NOTFOUND) {
    rb->SendError(NoGroupOrKey(key, opts.group));
    return;
  }

  if (!result) {
    cmd_cntx->SendError(result.status());
    return;
  }

  const ClaimInfo& cresult = result.value();

  rb->StartArray(3);
  rb->SendBulkString(StreamIdRepr(cresult.end_id));
  StreamReplies{rb}.SendClaimInfo(cresult);
  StreamReplies{rb}.SendIDs(cresult.deleted_ids);
}

#define HFUNC(x) SetHandler(&Cmd##x)

namespace acl {
constexpr uint32_t kXAdd = WRITE | STREAM | FAST;
constexpr uint32_t kXClaim = WRITE | FAST;
constexpr uint32_t kXDel = WRITE | STREAM | FAST;
constexpr uint32_t kXGroup = SLOW;
constexpr uint32_t kXInfo = SLOW;
constexpr uint32_t kXLen = READ | STREAM | FAST;
constexpr uint32_t kXPending = READ | STREAM;
constexpr uint32_t kXRange = READ | STREAM | SLOW;
constexpr uint32_t kXRevRange = READ | STREAM | SLOW;
constexpr uint32_t kXRead = READ | STREAM | SLOW | BLOCKING;
constexpr uint32_t kXReadGroup = WRITE | STREAM | SLOW | BLOCKING;
constexpr uint32_t kXSetId = WRITE | STREAM | SLOW;
constexpr uint32_t kXTrim = WRITE | STREAM | SLOW;
constexpr uint32_t kXGroupHelp = READ | STREAM | SLOW;
constexpr uint32_t kXAck = WRITE | STREAM | FAST;
constexpr uint32_t kXAutoClaim = WRITE | STREAM | FAST;
}  // namespace acl

void StreamFamily::Register(CommandRegistry* registry) {
  using CI = CommandId;
  registry->StartFamily();
  constexpr auto kReadFlags = CO::READONLY | CO::BLOCKING | CO::VARIADIC_KEYS;
  *registry
      << CI{"XADD",    CO::JOURNALED | CO::DENYOOM | CO::FAST | CO::NO_AUTOJOURNAL, -5, 1, 1,
            acl::kXAdd}
             .HFUNC(XAdd)
      << CI{"XCLAIM", CO::JOURNALED | CO::FAST, -6, 1, 1, acl::kXClaim}.HFUNC(XClaim)
      << CI{"XDEL", CO::JOURNALED | CO::FAST, -3, 1, 1, acl::kXDel}.HFUNC(XDel)
      << CI{"XGROUP", CO::JOURNALED | CO::DENYOOM, -3, 2, 2, acl::kXGroup}.HFUNC(XGroup)
      << CI{"XINFO", CO::READONLY, -2, 0, 0, acl::kXInfo}.HFUNC(XInfo)
      << CI{"XLEN", CO::READONLY | CO::FAST, 2, 1, 1, acl::kXLen}.HFUNC(XLen)
      << CI{"XPENDING", CO::READONLY, -3, 1, 1, acl::kXPending}.HFUNC(XPending)
      << CI{"XRANGE", CO::READONLY, -4, 1, 1, acl::kXRange}.HFUNC(XRange)
      << CI{"XREVRANGE", CO::READONLY, -4, 1, 1, acl::kXRevRange}.HFUNC(XRevRange)
      << CI{"XREAD", kReadFlags, -3, 3, 3, acl::kXRead}.HFUNC(XRead)
      << CI{"XREADGROUP",
            CO::VARIADIC_KEYS | CO::BLOCKING | CO::JOURNALED | CO::NO_AUTOJOURNAL,
            -6,
            6,
            6,
            acl::kXReadGroup}
             .HFUNC(XReadGroup)
      << CI{"XSETID", CO::JOURNALED, 3, 1, 1, acl::kXSetId}.HFUNC(XSetId)
      << CI{"XTRIM", CO::JOURNALED | CO::FAST | CO::NO_AUTOJOURNAL, -4, 1, 1, acl::kXTrim}.HFUNC(
             XTrim)
      << CI{"_XGROUP_HELP", CO::NOSCRIPT | CO::HIDDEN, 2, 0, 0, acl::kXGroupHelp}.SetHandler(
             XGroupHelp)
      << CI{"XACK", CO::JOURNALED | CO::FAST, -4, 1, 1, acl::kXAck}.HFUNC(XAck)
      << CI{"XAUTOCLAIM", CO::JOURNALED | CO::FAST, -6, 1, 1, acl::kXAutoClaim}.HFUNC(XAutoClaim);
}

}  // namespace dfly


================================================
FILE: src/server/stream_family.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstddef>

namespace dfly {

class CommandRegistry;
struct CompactValue;

class StreamMemTracker {
 public:
  StreamMemTracker();

  void UpdateStreamSize(CompactValue& pv) const;

 private:
  size_t start_size_{0};
};

class StreamFamily {
 public:
  static void Register(CommandRegistry* registry);
};

}  // namespace dfly


================================================
FILE: src/server/stream_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/stream_family.h"

#include "base/flags.h"
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;

namespace dfly {

const auto kMatchNil = ArgType(RespExpr::NIL);

class StreamFamilyTest : public BaseFamilyTest {
 protected:
};

TEST_F(StreamFamilyTest, Add) {
  auto resp = Run({"xadd", "key", "*", "field", "value"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  string id = string(ToSV(resp.GetBuf()));
  EXPECT_THAT(id, EndsWith("-0"));

  resp = Run({"xrange", "null", "-", "+"});
  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"xrange", "key", "-", "+"});
  EXPECT_THAT(resp, ArrLen(2));
  auto sub_arr = resp.GetVec();
  EXPECT_THAT(sub_arr, ElementsAre(id, ArrLen(2)));

  resp = Run({"xlen", "key"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"xadd", "key", "badid", "f1", "val1"});
  EXPECT_THAT(resp, ErrArg("Invalid stream ID"));

  resp = Run({"xadd", "key", "nomkstream", "*", "field2", "value2"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));

  resp = Run({"xadd", "noexist", "nomkstream", "*", "field", "value"});
  EXPECT_THAT(resp, kMatchNil);
}

TEST_F(StreamFamilyTest, AddExtended) {
  auto resp0 = Run({"xadd", "key", "5", "f1", "v1", "f2", "v2"});
  EXPECT_EQ(resp0, "5-0");
  resp0 = Run({"xrange", "key", "5-0", "5-0"});
  EXPECT_THAT(resp0, ArrLen(2));
  auto sub_arr = resp0.GetVec();
  EXPECT_THAT(sub_arr, ElementsAre("5-0", ArrLen(4)));
  sub_arr = sub_arr[1].GetVec();
  EXPECT_THAT(sub_arr, ElementsAre("f1", "v1", "f2", "v2"));

  auto resp1 = Run({"xadd", "key", "maxlen", "1", "*", "field1", "val1"});
  string id1 = string(ToSV(resp1.GetBuf()));

  auto resp2 = Run({"xadd", "key", "maxlen", "1", "*", "field2", "val2"});
  string id2 = string(ToSV(resp2.GetBuf()));

  EXPECT_THAT(Run({"xlen", "key"}), IntArg(1));
  EXPECT_THAT(Run({"xrange", "key", id1, id1}), ArrLen(0));

  auto resp3 = Run({"xadd", "key", id2, "f1", "val1"});
  EXPECT_THAT(resp3, ErrArg("equal or smaller than"));

  Run({"xadd", "key2", "5-0", "field", "val"});
  Run({"xadd", "key2", "6-0", "field1", "val1"});
  Run({"xadd", "key2", "7-0", "field2", "val2"});
  auto resp = Run({"xadd", "key2", "minid", "6", "*", "field3", "val3"});
  EXPECT_THAT(Run({"xlen", "key2"}), IntArg(3));
  EXPECT_THAT(Run({"xrange", "key2", "5-0", "5-0"}), ArrLen(0));

  for (int i = 0; i < 700; i++) {
    Run({"xadd", "key3", "*", "field", "val"});
  }
  resp = Run({"xadd", "key3", "maxlen", "~", "500", "*", "field", "val"});
  EXPECT_THAT(Run({"xlen", "key3"}), IntArg(501));
  for (int i = 0; i < 700; i++) {
    Run({"xadd", "key4", "*", "field", "val"});
  }
  resp = Run({"xadd", "key4", "maxlen", "~", "500", "limit", "100", "*", "field", "val"});
  EXPECT_THAT(Run({"xlen", "key4"}), IntArg(601));
}

TEST_F(StreamFamilyTest, XrangeRangeAutocomplete) {
  Run({"xadd", "mystream", "1609459200000-0", "0", "0"});
  Run({"xadd", "mystream", "1609459200001-0", "1", "1"});
  Run({"xadd", "mystream", "1609459200001-1", "2", "2"});
  Run({"xadd", "mystream", "1609459200002-0", "3", "3"});
  auto resp = Run({"xrange", "mystream", "1609459200000", "1609459200001"});
  EXPECT_THAT(resp, RespElementsAre(RespElementsAre("1609459200000-0", RespElementsAre("0", "0")),
                                    RespElementsAre("1609459200001-0", RespElementsAre("1", "1")),
                                    RespElementsAre("1609459200001-1", RespElementsAre("2", "2"))));
  resp = Run({"xrange", "mystream", "1609459200000", "(1609459200001"});
  EXPECT_THAT(resp, RespElementsAre(RespElementsAre("1609459200000-0", RespElementsAre("0", "0")),
                                    RespElementsAre("1609459200001-0", RespElementsAre("1", "1")),
                                    RespElementsAre("1609459200001-1", RespElementsAre("2", "2"))));
}

TEST_F(StreamFamilyTest, Range) {
  Run({"xadd", "key", "1-*", "f1", "v1"});
  Run({"xadd", "key", "1-*", "f2", "v2"});
  auto resp = Run({"xrange", "key", "-", "+"});
  EXPECT_THAT(resp, ArrLen(2));
  auto sub_arr = resp.GetVec();
  EXPECT_THAT(sub_arr, ElementsAre(ArrLen(2), ArrLen(2)));
  auto sub0 = sub_arr[0].GetVec();
  auto sub1 = sub_arr[1].GetVec();
  EXPECT_THAT(sub0, ElementsAre("1-0", ArrLen(2)));
  EXPECT_THAT(sub1, ElementsAre("1-1", ArrLen(2)));

  resp = Run({"xrevrange", "key", "+", "-"});
  sub_arr = resp.GetVec();
  sub0 = sub_arr[0].GetVec();
  sub1 = sub_arr[1].GetVec();
  EXPECT_THAT(sub0, ElementsAre("1-1", ArrLen(2)));
  EXPECT_THAT(sub1, ElementsAre("1-0", ArrLen(2)));
}

TEST_F(StreamFamilyTest, GroupCreate) {
  auto resp = Run({"xadd", "key", "1-*", "f1", "v1"});
  EXPECT_EQ(resp, "1-0");
  resp = Run({"xgroup", "create", "key", "grname", "1"});
  EXPECT_EQ(resp, "OK");
  resp = Run({"xgroup", "create", "test", "test", "0"});
  EXPECT_THAT(resp, ErrArg("requires the key to exist"));
  resp = Run({"xgroup", "create", "test", "test", "0", "MKSTREAM"});
  EXPECT_THAT(resp, "OK");
  resp = Run({"xgroup", "create", "test", "test", "0", "MKSTREAM"});
  EXPECT_THAT(resp, ErrArg("BUSYGROUP"));
}

TEST_F(StreamFamilyTest, XRead) {
  Run({"xadd", "foo", "1-*", "k1", "v1"});
  Run({"xadd", "foo", "1-*", "k2", "v2"});
  Run({"xadd", "foo", "1-*", "k3", "v3"});
  Run({"xadd", "bar", "1-*", "k4", "v4"});
  EXPECT_EQ(GetMetrics().shard_stats.tx_optimistic_total, 4u);

  // Receive all records from a single stream, in a single hop
  auto resp = Run({"xread", "streams", "foo", "0"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("foo", ArrLen(3)));
  EXPECT_EQ(GetMetrics().shard_stats.tx_optimistic_total, 5u);

  // Receive all records from both streams.
  resp = Run({"xread", "streams", "foo", "bar", "0", "0"});

  // 2 results
  ASSERT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));
  ASSERT_THAT(resp.GetVec()[0], RespArray(ElementsAre("foo", ArrLen(3))));
  ASSERT_THAT(resp.GetVec()[1], RespArray(ElementsAre("bar", ArrLen(1))));

  // Order of the requested streams is maintained.
  resp = Run({"xread", "streams", "bar", "foo", "0", "0"});
  ASSERT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));
  ASSERT_THAT(resp.GetVec()[0], RespArray(ElementsAre("bar", ArrLen(1))));
  ASSERT_THAT(resp.GetVec()[1], RespArray(ElementsAre("foo", ArrLen(3))));

  // Limit count.
  resp = Run({"xread", "count", "1", "streams", "foo", "bar", "0", "0"});
  ASSERT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));
  ASSERT_THAT(resp.GetVec()[0], RespArray(ElementsAre("foo", ArrLen(1))));
  ASSERT_THAT(resp.GetVec()[1], RespArray(ElementsAre("bar", ArrLen(1))));

  // Read from ID.
  resp = Run({"xread", "count", "10", "streams", "foo", "bar", "1-1", "2-0"});
  // Note when the response has length 1, Run returns the first element.
  EXPECT_THAT(resp.GetVec(), ElementsAre("foo", ArrLen(1)));
  EXPECT_THAT(resp.GetVec()[1].GetVec()[0].GetVec(), ElementsAre("1-2", ArrLen(2)));

  // Stream not found.
  resp = Run({"xread", "streams", "foo", "notfound", "0", "0"});
  // Note when the response has length 1, Run returns the first element.
  EXPECT_THAT(resp.GetVec(), ElementsAre("foo", ArrLen(3)));

  // Not found.
  resp = Run({"xread", "streams", "notfound", "0"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  // XREAD returns a map response on RESP3
  Run({"HELLO", "3"});
  resp = Run({"xread", "streams", "foo", "bar", "0", "0"});
  ASSERT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(3), "bar", ArrLen(1))));

  const auto foo_resp = resp.GetVec()[1];
  ASSERT_THAT(foo_resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2), ArrLen(2))));

  const auto first_kv_entry = foo_resp.GetVec()[0];
  const auto expected = RespArray(ElementsAre("k1", "v1"));
  ASSERT_THAT(first_kv_entry, RespArray(ElementsAre("1-0", expected)));
}

TEST_F(StreamFamilyTest, XReadGroup) {
  Run({"xadd", "foo", "1-*", "k1", "v1"});
  Run({"xadd", "foo", "1-*", "k2", "v2"});
  Run({"xadd", "foo", "1-*", "k3", "v3"});
  Run({"xadd", "bar", "1-*", "k4", "v4"});

  Run({"xadd", "mystream", "1-*", "k1", "v1"});
  Run({"xadd", "mystream", "1-*", "k2", "v2"});
  Run({"xadd", "mystream", "1-*", "k3", "v3"});

  Run({"xgroup", "create", "foo", "group", "0"});
  Run({"xgroup", "create", "bar", "group", "0"});

  // consumer PEL is empty, so resp should have empty list
  auto resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(0))));

  // should return unread entries with key "foo"
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  // only "foo" key entries are read
  EXPECT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(3))));

  Run({"xadd", "foo", "1-*", "k5", "v5"});
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "bar", "foo", ">", ">"});
  EXPECT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));

  EXPECT_THAT(resp.GetVec()[0].GetVec()[1].GetVec()[0], RespArray(ElementsAre("1-0", ArrLen(2))));
  EXPECT_THAT(resp.GetVec()[1].GetVec()[1].GetVec()[0], RespArray(ElementsAre("1-3", ArrLen(2))));

  // now we can specify id for "foo" and it fetches from alice's consumer PEL
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", "0"});
  EXPECT_THAT(resp.GetVec()[1], ArrLen(4));

  // now ">" gives nil
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  // count limits the fetched entries
  resp = Run(
      {"xreadgroup", "group", "group", "alice", "count", "2", "streams", "foo", "bar", "0", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));
  EXPECT_THAT(resp.GetVec()[0].GetVec(), ElementsAre("foo", ArrLen(2)));
  EXPECT_THAT(resp.GetVec()[1].GetVec(), ElementsAre("bar", ArrLen(1)));

  // bob will not get entries of alice
  resp = Run({"xreadgroup", "group", "group", "bob", "streams", "foo", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(0))));

  resp = Run({"xinfo", "groups", "foo"});
  // 2 consumers created
  EXPECT_THAT(resp.GetVec()[3], IntArg(2));
  // check last_delivery_id
  EXPECT_THAT(resp.GetVec()[7], "1-3");

  // Noack
  Run({"xadd", "foo", "1-*", "k6", "v6"});
  resp = Run({"xreadgroup", "group", "group", "bob", "noack", "streams", "foo", ">"});
  // check basic results
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("foo", ArrLen(1)));
  // Entry is not inserted in Bob's consumer PEL.
  resp = Run({"xreadgroup", "group", "group", "bob", "streams", "foo", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(0))));

  // No Group
  resp = Run({"xreadgroup", "group", "nogroup", "alice", "streams", "foo", "0"});
  EXPECT_THAT(
      resp,
      ErrArg("No such key 'foo' or consumer group 'nogroup' in XREADGROUP with GROUP option"));

  // '>' gives the null array result if group doesn't exist
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "mystream", ">"});
  EXPECT_THAT(
      resp,
      ErrArg("No such key 'mystream' or consumer group 'group' in XREADGROUP with GROUP option"));

  Run({"xadd", "foo", "1-*", "k7", "v7"});
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "mystream", "foo", ">", ">"});
  // returns no group error as "group" was not created for mystream.
  EXPECT_THAT(
      resp,
      ErrArg("No such key 'mystream' or consumer group 'group' in XREADGROUP with GROUP option"));

  // returns no group error when key doesn't exists
  // this is how Redis' behave
  resp = Run({"xreadgroup", "group", "group", "consumer", "count", "10", "block", "5000", "streams",
              "nostream", ">"});
  EXPECT_THAT(
      resp,
      ErrArg("No such key 'nostream' or consumer group 'group' in XREADGROUP with GROUP option"));

  // block on empty stream via xgroup create.
  Run({"xgroup", "create", "emptystream", "group", "0", "mkstream"});
  auto before = absl::Now();
  resp = Run({"xreadgroup", "group", "group", "consumer", "count", "10", "block", "1000", "streams",
              "emptystream", ">"});
  EXPECT_GE(absl::Now() - before, absl::Seconds(1));
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));
}

TEST_F(StreamFamilyTest, XReadBlock) {
  Run({"xadd", "foo", "1-*", "k1", "v1"});
  Run({"xadd", "foo", "1-*", "k2", "v2"});
  Run({"xadd", "foo", "1-*", "k3", "v3"});
  Run({"xadd", "bar", "1-*", "k4", "v4"});

  // Receive all records from both streams.
  auto resp = Run({"xread", "block", "100", "streams", "foo", "bar", "0", "0"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0].GetVec(), ElementsAre("foo", ArrLen(3)));
  EXPECT_THAT(resp.GetVec()[1].GetVec(), ElementsAre("bar", ArrLen(1)));

  // Timeout.
  resp = Run({"xread", "block", "1", "streams", "foo", "$"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  // Timeout again, on two steams
  resp = Run({"xread", "block", "1", "streams", "foo", "bar", "$", "$"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  // Run XREAD BLOCK from 2 fibers.
  RespExpr resp0, resp1;
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"xread", "block", "0", "streams", "foo", "$"});
  });
  auto fb1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp1 = Run({"xread", "block", "0", "streams", "foo", "bar", "$", "$"});
  });
  ThisFiber::SleepFor(50us);

  resp = pp_->at(1)->Await([&] { return Run("xadd", {"xadd", "foo", "1-*", "k5", "v5"}); });

  fb0.Join();
  fb1.Join();

  // Both xread calls should have been unblocked.
  //
  // Note when the response has length 1, Run returns the first element.
  EXPECT_THAT(resp0.GetVec(), ElementsAre("foo", ArrLen(1)));
  EXPECT_THAT(resp1.GetVec(), ElementsAre("foo", ArrLen(1)));
}

TEST_F(StreamFamilyTest, XReadGroupBlockwithoutBlock) {
  Run({"xadd", "foo", "1-*", "k1", "v1"});
  Run({"xadd", "foo", "1-*", "k2", "v2"});
  Run({"xadd", "foo", "1-*", "k3", "v3"});
  Run({"xadd", "bar", "1-*", "k4", "v4"});

  Run({"xgroup", "create", "foo", "group", "0"});
  Run({"xgroup", "create", "bar", "group", "0"});

  // Receive all records from both streams.
  auto resp = Run(
      {"xreadgroup", "group", "group", "alice", "block", "100", "streams", "foo", "bar", ">", ">"});
  EXPECT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));
  EXPECT_THAT(resp.GetVec()[0].GetVec(), ElementsAre("foo", ArrLen(3)));
  EXPECT_THAT(resp.GetVec()[1].GetVec(), ElementsAre("bar", ArrLen(1)));
}

TEST_F(StreamFamilyTest, XReadGroupBlock) {
  Run({"xgroup", "create", "foo", "group", "0", "MKSTREAM"});
  Run({"xgroup", "create", "bar", "group", "0", "MKSTREAM"});

  // Timeout
  auto resp = Run(
      {"xreadgroup", "group", "group", "alice", "block", "1", "streams", "foo", "bar", ">", ">"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  // Run XREADGROUP BLOCK from 2 fibers.
  RespExpr resp0, resp1;
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run(
        {"xreadgroup", "group", "group", "alice", "block", "0", "streams", "foo", "bar", ">", ">"});
  });
  auto fb1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp1 = Run(
        {"xreadgroup", "group", "group", "alice", "block", "0", "streams", "foo", "bar", ">", ">"});
  });
  ThisFiber::SleepFor(50us);

  pp_->at(1)->Await([&] { return Run("xadd", {"xadd", "foo", "1-*", "k5", "v5"}); });
  // Only one xreadgroup call should have been unblocked.

  ThisFiber::SleepFor(50us);
  pp_->at(1)->Await([&] { return Run("xadd", {"xadd", "bar", "1-*", "k5", "v5"}); });
  // The second one should be unblocked
  ThisFiber::SleepFor(50us);

  fb0.Join();
  fb1.Join();

  if (resp0.GetVec()[0].GetString() == "foo") {
    EXPECT_THAT(resp0.GetVec(), ElementsAre("foo", ArrLen(1)));
    EXPECT_THAT(resp1.GetVec(), ElementsAre("bar", ArrLen(1)));
  } else {
    EXPECT_THAT(resp1.GetVec(), ElementsAre("foo", ArrLen(1)));
    EXPECT_THAT(resp0.GetVec(), ElementsAre("bar", ArrLen(1)));
  }

  // Call XGROUP DESTROY while blocking
  Run({"xgroup", "create", "to-delete", "to-delete", "0", "MKSTREAM"});
  fb0 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"xreadgroup", "group", "to-delete", "consumer", "block", "0", "streams",
                 "to-delete", ">"});
  });

  Run({"xgroup", "destroy", "to-delete", "to-delete"});
  fb0.Join();
  EXPECT_THAT(resp0, ErrArg("consumer group this client was blocked on no longer exists"));
}

TEST_F(StreamFamilyTest, XReadGroupBlockDelconsumer) {
  Run({"XGROUP", "CREATE", "foo", "group", "0", "MKSTREAM"});

  RespExpr resp0;
  auto fb0 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"XREADGROUP", "GROUP", "group", "alice", "BLOCK", "0", "streams", "foo", ">"});
  });
  ThisFiber::SleepFor(50us);

  // Del consumer while it's blocked
  RespExpr resp_del_consumer = Run({"XGROUP", "DELCONSUMER", "foo", "group", "alice"});

  pp_->at(1)->Await([&] { return Run("xadd", {"XADD", "foo", "1-0", "k1", "v1"}); });
  fb0.Join();

  EXPECT_THAT(resp0.GetVec(), ElementsAre("foo", ArrLen(1)));
  EXPECT_THAT(resp_del_consumer, IntArg(0));
}

TEST_F(StreamFamilyTest, XReadInvalidArgs) {
  // Invalid COUNT value.
  auto resp = Run({"xread", "count", "invalid", "streams", "s1", "s2", "0", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Missing COUNT value.
  resp = Run({"xread", "count"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments for 'xread' command"));

  // Invalid BLOCK value.
  resp = Run({"xread", "block", "invalid", "streams", "s1", "s2", "0", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Missing BLOCK value.
  resp = Run({"xread", "block", "streams", "s1", "s2", "0", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Missing STREAMS.
  resp = Run({"xread", "count", "5"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Unbalanced list of streams.
  resp = Run({"xread", "count", "invalid", "streams", "s1", "s2", "0", "0"});
  EXPECT_THAT(resp, ErrArg("value is not an integer"));

  // Wrong type.
  Run({"set", "foo", "v"});
  resp = Run({"xread", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("key holding the wrong kind of value"));
}

TEST_F(StreamFamilyTest, XReadGroupInvalidArgs) {
  Run({"xgroup", "create", "group", "foo", "0", "mkstream"});
  // Invalid COUNT value.
  auto resp =
      Run({"xreadgroup", "group", "group", "alice", "count", "invalid", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Invalid "stream" instead of GROUP.
  resp = Run({"xreadgroup", "stream", "group", "alice", "count", "1", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("Missing 'GROUP' in 'XREADGROUP' command"));

  // Missing streams.
  resp = Run({"xreadgroup", "group", "group", "alice", "streams"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments for 'xreadgroup' command"));

  // Missing consumer.
  resp = Run({"xreadgroup", "group", "group", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Missing block value.
  resp = Run({"xreadgroup", "group", "group", "alice", "block", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Invalid block value.
  resp = Run({"xreadgroup", "group", "group", "alice", "block", "invalid", "streams", "foo", "0"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));

  // Unbalanced list of streams.
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "s1", "s2", "s3", "0", "0"});
  EXPECT_THAT(resp, ErrArg("Unbalanced 'xreadgroup' list of streams: for each stream key an ID or "
                           "'>' must be specified"));

  resp = Run({"XREAD", "COUNT", "1", "STREAMS", "mystream"});
  ASSERT_THAT(resp, ErrArg("Unbalanced 'xread' list of streams: for each stream key an ID or '$' "
                           "must be specified"));
}

TEST_F(StreamFamilyTest, XReadGroupEmpty) {
  Run({"XADD", "stream", "*", "foo", "bar"});
  Run({"XGROUP", "CREATE", "stream", "group", "0"});
  auto resp = Run({"XREADGROUP", "GROUP", "group", "consumer1", "STREAMS", "stream", "0"});
  EXPECT_THAT(resp, ArrLen(2));
}

TEST_F(StreamFamilyTest, Issue854) {
  auto resp = Run({"xgroup", "help"});
  EXPECT_THAT(resp, ArgType(RespExpr::ARRAY));

  resp = Run({"eval", "redis.call('xgroup', 'help')", "0"});
  EXPECT_THAT(resp, ErrArg("is not allowed"));
}

TEST_F(StreamFamilyTest, XGroupConsumer) {
  Run({"xgroup", "create", "foo", "group", "$", "MKSTREAM"});
  auto resp = Run({"xgroup", "createconsumer", "foo", "group", "bob"});
  EXPECT_THAT(resp, IntArg(1));
  Run({"xgroup", "createconsumer", "foo", "group", "alice"});
  resp = Run({"xinfo", "groups", "foo"});
  EXPECT_THAT(resp.GetVec()[3], IntArg(2));
  Run({"xgroup", "delconsumer", "foo", "group", "alice"});
  resp = Run({"xinfo", "groups", "foo"});
  EXPECT_THAT(resp.GetVec()[3], IntArg(1));

  resp = Run({"xgroup", "createconsumer", "foo", "group", "alice"});
  EXPECT_THAT(resp, IntArg(1));

  // ensure createconsumer doesn't create consumer that already exists
  resp = Run({"xgroup", "createconsumer", "foo", "group", "alice"});
  EXPECT_THAT(resp, IntArg(0));

  // nogrouperror
  resp = Run({"xgroup", "createconsumer", "foo", "not-exists", "alice"});
  EXPECT_THAT(resp, ErrArg("NOGROUP"));
}

TEST_F(StreamFamilyTest, Xclaim) {
  Run({"xadd", "foo", "1-0", "k1", "v1"});
  Run({"xadd", "foo", "1-1", "k2", "v2"});
  Run({"xadd", "foo", "1-2", "k3", "v3"});
  Run({"xadd", "foo", "1-3", "k4", "v4"});

  // create a group for foo stream
  Run({"xgroup", "create", "foo", "group", "0"});
  // alice consume all the stream entries
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});

  // bob claims alice's two pending stream entries
  auto resp = Run({"xclaim", "foo", "group", "bob", "0", "1-2", "1-3"});
  EXPECT_THAT(resp, RespArray(ElementsAre(
                        RespArray(ElementsAre("1-2", RespArray(ElementsAre("k3", "v3")))),
                        RespArray(ElementsAre("1-3", RespArray(ElementsAre("k4", "v4")))))));

  // bob really have these claimed entries
  resp = Run({"xreadgroup", "group", "group", "bob", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-2", RespArray(ElementsAre("k3", "v3")))),
                             RespArray(ElementsAre("1-3", RespArray(ElementsAre("k4", "v4")))))))));

  // alice no longer have those entries
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-0", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k2", "v2")))))))));

  // xclaim ensures that entries before the min-idle-time are not claimed by bob
  resp = Run({"xclaim", "foo", "group", "bob", "3600000", "1-0"});
  EXPECT_THAT(resp, ArrLen(0));
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-0", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k2", "v2")))))))));

  Run({"xadd", "foo", "1-4", "k5", "v5"});
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  // xclaim returns only claimed ids when justid is set
  resp = Run({"xclaim", "foo", "group", "bob", "0", "1-0", "1-4", "justid"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("1-0", "1-4"));

  Run({"xadd", "foo", "1-5", "k6", "v6"});
  // bob should claim the id forcefully even if it is not yet present in group pel
  resp = Run({"xclaim", "foo", "group", "bob", "0", "1-5", "force", "justid"});
  EXPECT_THAT(resp.GetString(), "1-5");
  resp = Run({"xreadgroup", "group", "group", "bob", "streams", "foo", "0"});
  EXPECT_THAT(resp.GetVec()[1].GetVec()[4].GetVec(),
              ElementsAre("1-5", RespArray(ElementsAre("k6", "v6"))));

  TEST_current_time_ms += 2000;
  resp = Run({"xclaim", "foo", "group", "alice", "0", "1-4", "TIME",
              absl::StrCat(TEST_current_time_ms - 500), "justid"});
  EXPECT_THAT(resp.GetString(), "1-4");

  // min idle time is exceeded for this entry
  resp = Run({"xclaim", "foo", "group", "bob", "600", "1-4"});
  ASSERT_THAT(resp, ArrLen(0));

  resp = Run({"xclaim", "foo", "group", "bob", "400", "1-4", "justid"});
  EXPECT_THAT(resp.GetString(), "1-4");

  //  test RETRYCOUNT
  Run({"xadd", "foo", "1-6", "k7", "v7"});
  resp = Run({"xclaim", "foo", "group", "bob", "0", "1-6", "force", "justid", "retrycount", "5"});
  EXPECT_THAT(resp.GetString(), "1-6");
  resp = Run({"xpending", "foo", "group", "1-6", "1-6", "1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("1-6", "bob", ArgType(RespExpr::INT64), IntArg(5)));

  // test LASTID
  Run({"xreadgroup", "group", "group", "bob", "count", "2", "streams", "foo", ">"});
  Run({"xclaim", "foo", "group", "alice", "0", "1-6", "LASTID", "1-4"});
  resp = Run({"xinfo", "groups", "foo"});
  EXPECT_EQ(resp.GetVec()[7], "1-6");

  Run({"xclaim", "foo", "group", "bob", "0", "1-6", "LASTID", "1-9"});
  resp = Run({"xinfo", "groups", "foo"});
  EXPECT_EQ(resp.GetVec()[7], "1-9");
}

TEST_F(StreamFamilyTest, XTrim) {
  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});

  // Trim to maxlen 2, 2 entries should have been deleted with 2 entries remaining.
  auto resp = Run({"xtrim", "foo", "maxlen", "2"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"xlen", "foo"});
  EXPECT_THAT(resp, IntArg(2));

  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});

  // Trim messages whose ID is before 1-4, 2 entries should have been deleted with
  // 2 entries remaining.
  resp = Run({"xtrim", "foo", "minid", "1-4"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"xlen", "foo"});
  EXPECT_THAT(resp, IntArg(2));

  // Trim no changes needed.
  resp = Run({"xtrim", "foo", "maxlen", "5"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"xlen", "foo"});
  EXPECT_THAT(resp, IntArg(2));

  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});

  // Trim exact.
  resp = Run({"xtrim", "foo", "maxlen", "=", "2"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"xlen", "foo"});
  EXPECT_THAT(resp, IntArg(2));

  Run({"xadd", "foo", "1-*", "k", "v"});
  Run({"xadd", "foo", "1-*", "k", "v"});

  // Trim approx.
  resp = Run({"xtrim", "foo", "maxlen", "~", "2"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"xlen", "foo"});
  EXPECT_THAT(resp, IntArg(4));

  // Trim stream not found should return no entries.
  resp = Run({"xtrim", "notfound", "maxlen", "5"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(StreamFamilyTest, XTrimInvalidArgs) {
  // Missing threshold.
  auto resp = Run({"xtrim", "foo"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
  resp = Run({"xtrim", "foo", "maxlen"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
  resp = Run({"xtrim", "foo", "minid"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // Invalid threshold.
  resp = Run({"xtrim", "foo", "maxlen", "nan"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));
  resp = Run({"xtrim", "foo", "maxlen", "-1"});
  EXPECT_THAT(resp, ErrArg("not an integer or out of range"));
  resp = Run({"xtrim", "foo", "minid", "nan"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Limit with non-approx.
  resp = Run({"xtrim", "foo", "maxlen", "2", "limit", "5"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Include both maxlen and minid.
  resp = Run({"xtrim", "foo", "maxlen", "2", "minid", "1-1"});
  EXPECT_THAT(resp, ErrArg("MAXLEN and MINID options at the same time are not compatible"));
  resp = Run({"xtrim", "foo", "minid", "1-1", "maxlen", "2"});
  EXPECT_THAT(resp, ErrArg("MAXLEN and MINID options at the same time are not compatible"));

  // Invalid limit.
  resp = Run({"xtrim", "foo", "maxlen", "~", "2", "limit", "nan"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));
}

TEST_F(StreamFamilyTest, XTrimWrongSyntax) {
  auto resp = Run({"xtrim", "-992", "k1 \"v1\" k2 \"v2 with spaces\" \"k3 with spaces\" \"v3\"",
                   "list1 element1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(StreamFamilyTest, XPending) {
  Run({"xadd", "foo", "1-0", "k1", "v1"});
  Run({"xadd", "foo", "1-1", "k2", "v2"});
  Run({"xadd", "foo", "1-2", "k3", "v3"});

  // create a group for foo stream
  Run({"xgroup", "create", "foo", "group", "0"});
  // alice consume all the stream entries
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  // bob doesn't have pending entries
  Run({"xgroup", "createconsumer", "foo", "group", "bob"});

  // XPending should print 4 entries
  auto resp = Run({"xpending", "foo", "group"});
  EXPECT_THAT(resp, RespArray(ElementsAre(
                        IntArg(3), "1-0", "1-2",
                        RespArray(ElementsAre(RespArray(ElementsAre("alice", IntArg(3))))))));

  resp = Run({"xpending", "foo", "group", "-", "+", "10"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  RespArray(ElementsAre("1-0", "alice", ArgType(RespExpr::INT64), IntArg(1))),
                  RespArray(ElementsAre("1-1", "alice", ArgType(RespExpr::INT64), IntArg(1))),
                  RespArray(ElementsAre("1-2", "alice", ArgType(RespExpr::INT64), IntArg(1))))));

  // only return a single entry
  resp = Run({"xpending", "foo", "group", "-", "+", "1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("1-0", "alice", ArgType(RespExpr::INT64), IntArg(1)));

  // Bob read a new entry
  Run({"xadd", "foo", "1-3", "k4", "v4"});
  Run({"xreadgroup", "group", "group", "bob", "streams", "foo", ">"});
  // Bob now has` an entry in his pending list
  resp = Run({"xpending", "foo", "group", "-", "+", "10", "bob"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("1-3", "bob", ArgType(RespExpr::INT64), IntArg(1)));

  Run({"xadd", "foo", "1-4", "k5", "v5"});
  TEST_current_time_ms = 100;
  Run({"xreadgroup", "group", "group", "bob", "streams", "foo", ">"});
  TEST_current_time_ms += 3000;

  // min-idle-time is exceeding the delivery time of last inserted entry
  resp = Run({"xpending", "foo", "group", "IDLE", "4000", "-", "+", "10"});
  EXPECT_THAT(resp, ArrLen(0));
}

TEST_F(StreamFamilyTest, XPendingMissingGroup) {
  auto resp = Run({"xpending", "?"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
}

TEST_F(StreamFamilyTest, XReadGroupEmptyConsumer) {
  Run({"xadd", "s", "*", "x", "y"});
  Run({"xgroup", "create", "s", "g", "0"});
  auto resp = Run({"xreadgroup", "group", "g", "", "streams", "s", ">"});
  EXPECT_THAT(resp, ErrArg("consumer name can't be empty"));
}

TEST_F(StreamFamilyTest, XPendingInvalidArgs) {
  Run({"xadd", "foo", "1-0", "k1", "v1"});
  Run({"xadd", "foo", "1-1", "k2", "v2"});

  auto resp = Run({"xpending", "unknown", "group"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  // group doesn't exist
  resp = Run({"xpending", "foo", "group"});
  EXPECT_THAT(resp, ErrArg("NOGROUP"));

  Run({"xgroup", "create", "foo", "group", "0"});
  // start end count not provided
  resp = Run({"xpending", "foo", "group", "IDLE", "0"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // count not provided
  resp = Run({"xpending", "foo", "group", "-", "+"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
}

TEST_F(StreamFamilyTest, XPendingEmpty) {
  Run({"XADD", "stream", "*", "foo", "bar"});
  Run({"XADD", "stream", "*", "foo", "bar"});
  Run({"XGROUP", "CREATE", "stream", "group", "0"});
  auto resp = Run({"XPENDING", "stream", "group"});
  EXPECT_THAT(resp, RespArray(ElementsAre(IntArg(0), kMatchNil, kMatchNil, kMatchNil)));
}

TEST_F(StreamFamilyTest, XAck) {
  Run({"xadd", "foo", "1-0", "k0", "v0"});
  Run({"xadd", "foo", "1-1", "k1", "v1"});
  Run({"xadd", "foo", "1-2", "k2", "v2"});
  Run({"xadd", "foo", "1-3", "k3", "v3"});
  Run({"xgroup", "create", "foo", "cgroup", "0"});
  Run({"xreadgroup", "group", "cgroup", "consumer", "count", "4", "streams", "foo", ">"});

  // PEL of cgroup now has 4 messages.
  // Acknowledge a message that exists.
  auto resp = Run({"xack", "foo", "cgroup", "1-0"});
  EXPECT_THAT(resp, IntArg(1));

  // acknowledge a message from non-existing stream.
  resp = Run({"xack", "nosuchstream", "cgroup", "1-0"});
  EXPECT_THAT(resp, IntArg(0));

  // acknowledge a message for a non-existing consumer group.
  resp = Run({"xack", "foo", "nosuchcgroup", "1-0"});
  EXPECT_THAT(resp, IntArg(0));

  // Verifies message id 1-0 gets removed from PEL.
  resp = Run({"xreadgroup", "group", "cgroup", "consumer", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-2", RespArray(ElementsAre("k2", "v2")))),
                             RespArray(ElementsAre("1-3", RespArray(ElementsAre("k3", "v3")))))))));

  // acknowledge a message that doesn't exist
  resp = Run({"xack", "foo", "cgroup", "1-9"});
  EXPECT_THAT(resp, IntArg(0));

  // Verifies no message gets removed from PEL.
  resp = Run({"xreadgroup", "group", "cgroup", "consumer", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-2", RespArray(ElementsAre("k2", "v2")))),
                             RespArray(ElementsAre("1-3", RespArray(ElementsAre("k3", "v3")))))))));

  // acknowledge another message that exists and one non-existing message.
  resp = Run({"xack", "foo", "cgroup", "1-3", "1-9"});
  EXPECT_THAT(resp, IntArg(1));

  // Verifies only "1-3" gets removed from PEL.
  resp = Run({"xreadgroup", "group", "cgroup", "consumer", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-2", RespArray(ElementsAre("k2", "v2")))))))));

  // acknowledge all the existing messages left.
  resp = Run({"xack", "foo", "cgroup", "1-1", "1-2"});
  EXPECT_THAT(resp, IntArg(2));

  // Verifies that PEL is empty.
  resp = Run({"xreadgroup", "group", "cgroup", "consumer", "streams", "foo", "0"});
  EXPECT_THAT(resp, RespArray(ElementsAre("foo", ArrLen(0))));
}

TEST_F(StreamFamilyTest, XInfoGroups) {
  Run({"del", "mystream"});
  Run({"xgroup", "create", "mystream", "mygroup", "$", "MKSTREAM"});

  // non-existent-stream
  auto resp = Run({"xinfo", "groups", "non-existent-stream"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  // group with no consumers
  resp = Run({"xinfo", "groups", "mystream"});
  EXPECT_THAT(resp, ArrLen(12));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("name", "mygroup", "consumers", IntArg(0), "pending", IntArg(0),
                          "last-delivered-id", "0-0", "entries-read", kMatchNil, "lag", IntArg(0)));

  // group with multiple consumers
  Run({"xgroup", "createconsumer", "mystream", "mygroup", "consumer1"});
  Run({"xgroup", "createconsumer", "mystream", "mygroup", "consumer2"});
  resp = Run({"xinfo", "groups", "mystream"});
  EXPECT_THAT(resp, ArrLen(12));
  EXPECT_THAT(resp.GetVec()[3], IntArg(2));

  // group with lag
  Run({"xadd", "mystream", "1-0", "test-field-1", "test-value-1"});
  Run({"xadd", "mystream", "2-0", "test-field-2", "test-value-2"});
  resp = Run({"xinfo", "groups", "mystream"});
  EXPECT_THAT(resp.GetVec()[11], IntArg(2));
  EXPECT_THAT(resp.GetVec()[7], "0-0");

  // group with no lag, before ack
  Run({"xreadgroup", "group", "mygroup", "consumer1", "STREAMS", "mystream", ">"});
  resp = Run({"xinfo", "groups", "mystream"});
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("name", "mygroup", "consumers", IntArg(2), "pending", IntArg(2),
                          "last-delivered-id", "2-0", "entries-read", IntArg(2), "lag", IntArg(0)));

  // after ack
  Run({"xack", "mystream", "mygroup", "1-0"});
  Run({"xack", "mystream", "mygroup", "2-0"});
  resp = Run({"xinfo", "groups", "mystream"});
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("name", "mygroup", "consumers", IntArg(2), "pending", IntArg(0),
                          "last-delivered-id", "2-0", "entries-read", IntArg(2), "lag", IntArg(0)));
}

TEST_F(StreamFamilyTest, XInfoConsumers) {
  Run({"del", "mystream"});
  Run({"xgroup", "create", "mystream", "mygroup", "$", "MKSTREAM"});

  // no consumer
  auto resp = Run({"xinfo", "consumers", "mystream", "mygroup"});
  EXPECT_THAT(resp, ArrLen(0));

  // invalid key
  resp = Run({"xinfo", "consumers", "non-existent-stream", "mygroup"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  // invalid group
  resp = Run({"xinfo", "consumers", "mystream", "non-existent-group"});
  EXPECT_THAT(resp, ErrArg("NOGROUP"));

  Run({"xgroup", "createconsumer", "mystream", "mygroup", "first-consumer"});
  Run({"xgroup", "createconsumer", "mystream", "mygroup", "second-consumer"});
  resp = Run({"xinfo", "consumers", "mystream", "mygroup"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec()[0], ArrLen(8));
  EXPECT_THAT(resp.GetVec()[1], ArrLen(8));
  EXPECT_THAT(resp.GetVec()[0].GetVec()[1], "first-consumer");
  EXPECT_THAT(resp.GetVec()[1].GetVec()[1], "second-consumer");

  Run({"xadd", "mystream", "1-0", "test-field-1", "test-value-1"});
  Run({"xreadgroup", "group", "mygroup", "consumer1", "STREAMS", "mystream", ">"});
  resp = Run({"xinfo", "consumers", "mystream", "mygroup"});
  // pending for first-consumer
  EXPECT_THAT(resp.GetVec()[0].GetVec()[3], IntArg(1));
  // pending for second-consumer
  EXPECT_THAT(resp.GetVec()[1].GetVec()[3], IntArg(0));
}

TEST_F(StreamFamilyTest, XAutoClaim) {
  Run({"xadd", "foo", "1-0", "k1", "v1"});
  Run({"xadd", "foo", "1-1", "k2", "v2"});
  Run({"xadd", "foo", "1-2", "k3", "v3"});
  Run({"xadd", "foo", "1-3", "k4", "v4"});

  // create a group for foo stream
  Run({"xgroup", "create", "foo", "group", "0"});
  // alice consume all the stream entries
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});

  // bob claims alice's two pending stream entries
  // testing the mandatory command options.
  auto resp = Run({"xautoclaim", "foo", "group", "bob", "0", "1-2"});
  EXPECT_THAT(
      resp,
      RespArray(ElementsAre(
          "0-0",
          RespArray(ElementsAre(RespArray(ElementsAre("1-2", RespArray(ElementsAre("k3", "v3")))),
                                RespArray(ElementsAre("1-3", RespArray(ElementsAre("k4", "v4")))))),
          RespArray(ElementsAre()))));

  // bob really has these claimed entries
  resp = Run({"xreadgroup", "group", "group", "bob", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-2", RespArray(ElementsAre("k3", "v3")))),
                             RespArray(ElementsAre("1-3", RespArray(ElementsAre("k4", "v4")))))))));

  // alice no longer have those entries
  resp = Run({"xreadgroup", "group", "group", "alice", "streams", "foo", "0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre(
                  "foo", RespArray(ElementsAre(
                             RespArray(ElementsAre("1-0", RespArray(ElementsAre("k1", "v1")))),
                             RespArray(ElementsAre("1-1", RespArray(ElementsAre("k2", "v2")))))))));

  // xautoclaim ensures that entries before the min-idle-time are not claimed by bob
  resp = Run({"xautoclaim", "foo", "group", "bob", "3600000", "0-0"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("0-0", RespArray(ElementsAre()), RespArray(ElementsAre()))));

  Run({"xadd", "foo", "1-4", "k5", "v5"});
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  // xautoclaim returns only claimed ids when justid is set
  resp = Run({"xautoclaim", "foo", "group", "bob", "0", "0-0", "justid"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre("0-0", RespArray(ElementsAre("1-0", "1-1", "1-2", "1-3", "1-4")),
                                  RespArray(ElementsAre()))));

  Run({"xadd", "foo", "1-5", "k6", "v6"});
  Run({"xadd", "foo", "1-6", "k7", "v7"});
  Run({"xreadgroup", "group", "group", "alice", "streams", "foo", ">"});
  // test count and end_id
  resp = Run({"xautoclaim", "foo", "group", "bob", "0", "1-5", "count", "1", "justid"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre("1-6", RespArray(ElementsAre("1-5")), RespArray(ElementsAre()))));

  resp = Run({"xautoclaim", "foo", "group", "bob", "0", "1-6", "count", "1", "justid"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre("0-0", RespArray(ElementsAre("1-6")), RespArray(ElementsAre()))));

  resp = Run({"xautoclaim", "foo", "group", "bob", "0", "1-10", "count", "1", "justid"});
  EXPECT_THAT(resp,
              RespArray(ElementsAre("0-0", RespArray(ElementsAre()), RespArray(ElementsAre()))));

  // if a message being claimed is deleted, it should be listed separately.
  Run({"xdel", "foo", "1-2", "1-4"});
  resp = Run({"xautoclaim", "foo", "group", "alice", "0", "0-0", "justid"});
  EXPECT_THAT(
      resp, RespArray(ElementsAre("0-0", RespArray(ElementsAre("1-0", "1-1", "1-3", "1-5", "1-6")),
                                  RespArray(ElementsAre("1-2", "1-4")))));
}

TEST_F(StreamFamilyTest, XInfoStream) {
  Run({"del", "mystream"});
  Run({"xgroup", "create", "mystream", "mygroup", "$", "MKSTREAM"});
  Run({"xgroup", "createconsumer", "mystream", "mygroup", "first-consumer"});

  // invalid key
  auto resp = Run({"xinfo", "stream", "non-existent-stream"});
  EXPECT_THAT(resp, ErrArg("no such key"));

  // invalid args
  resp = Run({"xinfo", "stream", "mystream", "extra-arg"});
  EXPECT_THAT(
      resp,
      ErrArg("unknown subcommand or wrong number of arguments for 'STREAM'. Try XINFO HELP."));
  resp = Run({"xinfo", "stream", "mystream", "full", "count"});
  EXPECT_THAT(
      resp,
      ErrArg("unknown subcommand or wrong number of arguments for 'STREAM'. Try XINFO HELP."));
  resp = Run({"xinfo", "stream", "mystream", "full", "count", "a"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // no message in stream
  resp = Run({"xinfo", "stream", "mystream"});
  EXPECT_THAT(resp, ArrLen(20));
  EXPECT_THAT(
      resp.GetVec(),
      ElementsAre("length", IntArg(0), "radix-tree-keys", IntArg(0), "radix-tree-nodes", IntArg(1),
                  "last-generated-id", "0-0", "max-deleted-entry-id", "0-0", "entries-added",
                  IntArg(0), "recorded-first-entry-id", "0-0", "groups", IntArg(1), "first-entry",
                  ArgType(RespExpr::NIL_ARRAY), "last-entry", ArgType(RespExpr::NIL_ARRAY)));

  Run({"xadd", "mystream", "1-1", "message", "one"});
  Run({"xadd", "mystream", "2-1", "message", "two"});
  Run({"xadd", "mystream", "3-1", "message", "three"});
  Run({"xadd", "mystream", "4-1", "message", "four"});
  Run({"xadd", "mystream", "5-1", "message", "five"});
  Run({"xadd", "mystream", "6-1", "message", "six"});
  Run({"xadd", "mystream", "7-1", "message", "seven"});
  Run({"xadd", "mystream", "8-1", "message", "eight"});
  Run({"xadd", "mystream", "9-1", "message", "nine"});
  Run({"xadd", "mystream", "10-1", "message", "ten"});
  Run({"xadd", "mystream", "11-1", "message", "eleven"});
  resp = Run({"xinfo", "stream", "mystream"});
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("length", IntArg(11), "radix-tree-keys", IntArg(1), "radix-tree-nodes",
                          IntArg(2), "last-generated-id", "11-1", "max-deleted-entry-id", "0-0",
                          "entries-added", IntArg(11), "recorded-first-entry-id", "1-1", "groups",
                          IntArg(1), "first-entry", ArrLen(2), "last-entry", ArrLen(2)));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0], "1-1");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[1].GetVec(), ElementsAre("message", "one"));
  EXPECT_THAT(resp.GetVec()[19].GetVec()[0], "11-1");
  EXPECT_THAT(resp.GetVec()[19].GetVec()[1].GetVec(), ElementsAre("message", "eleven"));

  // full - default
  resp = Run({"xinfo", "stream", "mystream", "full"});
  EXPECT_THAT(resp, ArrLen(18));
  EXPECT_THAT(resp.GetVec()[15], ArrLen(10));
  EXPECT_THAT(resp.GetVec()[17], ArrLen(1));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0], ArrLen(14));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("length", IntArg(11), "radix-tree-keys", IntArg(1), "radix-tree-nodes",
                          IntArg(2), "last-generated-id", "11-1", "max-deleted-entry-id", "0-0",
                          "entries-added", IntArg(11), "recorded-first-entry-id", "1-1", "entries",
                          ArrLen(10), "groups", ArrLen(1)));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "mygroup", "last-delivered-id", "0-0", "entries-read", kMatchNil,
                          "lag", IntArg(11), "pel-count", IntArg(0), "pending", ArrLen(0),
                          "consumers", ArrLen(1)));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[13].GetVec()[0].GetVec(),
              ElementsAre("name", "first-consumer", "seen-time", ArgType(RespExpr::INT64),
                          "active-time", IntArg(-1), "pel-count", IntArg(0), "pending", ArrLen(0)));

  // full with count less than number of messages in stream
  resp = Run({"xinfo", "stream", "mystream", "full", "count", "5"});
  EXPECT_THAT(resp.GetVec()[15], ArrLen(5));

  // full with count exceeding number of messages in stream
  resp = Run({"xinfo", "stream", "mystream", "full", "count", "12"});
  EXPECT_THAT(resp.GetVec()[15], ArrLen(11));

  // full - all messages
  resp = Run({"xinfo", "stream", "mystream", "full", "count", "0"});
  EXPECT_THAT(resp.GetVec()[15], ArrLen(11));

  // read message
  Run({"xreadgroup", "group", "mygroup", "first-consumer", "STREAMS", "mystream", ">"});
  resp = Run({"xinfo", "stream", "mystream", "full", "count", "0"});
  EXPECT_THAT(resp.GetVec()[15], ArrLen(11));
  // group
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[5], IntArg(11));   // entries-read
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[7], IntArg(0));    // lag
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[9], IntArg(11));   // pel-count
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[11], ArrLen(11));  // pending list
  // consumer
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[13].GetVec()[0].GetVec()[7],
              IntArg(11));  // pel-count
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec()[13].GetVec()[0].GetVec()[9],
              ArrLen(11));  // pending list

  // delete message
  Run({"xdel", "mystream", "1-1"});
  resp = Run({"xinfo", "stream", "mystream"});
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("length", IntArg(10), "radix-tree-keys", IntArg(1), "radix-tree-nodes",
                          IntArg(2), "last-generated-id", "11-1", "max-deleted-entry-id", "1-1",
                          "entries-added", IntArg(11), "recorded-first-entry-id", "2-1", "groups",
                          IntArg(1), "first-entry", ArrLen(2), "last-entry", ArrLen(2)));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0], "2-1");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[1].GetVec(), ElementsAre("message", "two"));
  EXPECT_THAT(resp.GetVec()[19].GetVec()[0], "11-1");
  EXPECT_THAT(resp.GetVec()[19].GetVec()[1].GetVec(), ElementsAre("message", "eleven"));

  resp = Run({"xinfo", "stream", "mystream", "full", "count", "0"});
  EXPECT_THAT(resp.GetVec()[15], ArrLen(10));
  EXPECT_THAT(resp.GetVec(),
              ElementsAre("length", IntArg(10), "radix-tree-keys", IntArg(1), "radix-tree-nodes",
                          IntArg(2), "last-generated-id", "11-1", "max-deleted-entry-id", "1-1",
                          "entries-added", IntArg(11), "recorded-first-entry-id", "2-1", "entries",
                          ArrLen(10), "groups", ArrLen(1)));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "mygroup", "last-delivered-id", "11-1", "entries-read",
                          IntArg(11), "lag", IntArg(0), "pel-count", IntArg(11), "pending",
                          ArrLen(11), "consumers", ArrLen(1)));
  EXPECT_THAT(
      resp.GetVec()[17].GetVec()[0].GetVec()[13].GetVec()[0].GetVec(),
      ElementsAre("name", "first-consumer", "seen-time", ArgType(RespExpr::INT64), "active-time",
                  ArgType(RespExpr::INT64), "pel-count", IntArg(11), "pending", ArrLen(11)));
}

TEST_F(StreamFamilyTest, AutoClaimPelItemsFromAnotherConsumer) {
  auto resp = Run({"xadd", "mystream", "*", "a", "1"});
  string id1 = resp.GetString();
  resp = Run({"xadd", "mystream", "*", "b", "2"});
  string id2 = resp.GetString();
  resp = Run({"xadd", "mystream", "*", "c", "3"});
  string id3 = resp.GetString();
  resp = Run({"xadd", "mystream", "*", "d", "4"});
  string id4 = resp.GetString();

  Run({"XGROUP", "CREATE", "mystream", "mygroup", "0"});

  // Consumer 1 reads item 1 from the stream without acknowledgements.
  // Consumer 2 then claims pending item 1 from the PEL of consumer 1
  resp = Run(
      {"XREADGROUP", "GROUP", "mygroup", "consumer1", "COUNT", "1", "STREAMS", "mystream", ">"});

  auto match_a1 = RespElementsAre("a", "1");
  ASSERT_THAT(resp, RespElementsAre("mystream", RespElementsAre(RespElementsAre(id1, match_a1))));

  AdvanceTime(200);  // Advance time to greater time than the idle time in the autoclaim (10)
  resp = Run({"XAUTOCLAIM", "mystream", "mygroup", "consumer2", "10", "-", "COUNT", "1"});

  EXPECT_THAT(resp, RespElementsAre("0-0", ArrLen(1), ArrLen(0)));
  EXPECT_THAT(resp.GetVec()[1], RespElementsAre(RespElementsAre(id1, match_a1)));

  Run({"XREADGROUP", "GROUP", "mygroup", "consumer1", "COUNT", "3", "STREAMS", "mystream", ">"});
  AdvanceTime(200);

  // Delete item 2 from the stream.Now consumer 1 has PEL that contains
  // only item 3. Try to use consumer 2 to claim the deleted item 2
  // from the PEL of consumer 1, this should return nil
  resp = Run({"XDEL", "mystream", id2});
  ASSERT_THAT(resp, IntArg(1));

  // id1 and id3 are self - claimed here but not id2('count' was set to 3)
  // we make sure id2 is indeed skipped(the cursor points to id4)
  resp = Run({"XAUTOCLAIM", "mystream", "mygroup", "consumer2", "10", "-", "COUNT", "3"});
  auto match_id1_a1 = RespElementsAre(id1, match_a1);
  auto match_id3_c3 = RespElementsAre(id3, RespElementsAre("c", "3"));
  ASSERT_THAT(resp, RespElementsAre(id4, RespElementsAre(match_id1_a1, match_id3_c3),
                                    RespElementsAre(id2)));
  // Delete item 3 from the stream.Now consumer 1 has PEL that is empty.
  // Try to use consumer 2 to claim the deleted item 3 from the PEL
  // of consumer 1, this should return nil
  AdvanceTime(200);

  ASSERT_THAT(Run({"XDEL", "mystream", id4}), IntArg(1));

  // id1 and id3 are self - claimed here but not id2 and id4('count' is default 100)
  // we also test the JUSTID modifier here.note that, when using JUSTID,
  // deleted entries are returned in reply(consistent with XCLAIM).
  resp = Run({"XAUTOCLAIM", "mystream", "mygroup", "consumer2", "10", "-", "JUSTID"});
  ASSERT_THAT(resp, RespElementsAre("0-0", RespElementsAre(id1, id3), RespElementsAre(id4)));
}

TEST_F(StreamFamilyTest, AutoClaimDelCount) {
  Run({"xadd", "x", "1-0", "f", "v"});
  Run({"xadd", "x", "2-0", "f", "v"});
  Run({"xadd", "x", "3-0", "f", "v"});
  Run({"XGROUP", "CREATE", "x", "grp", "0"});
  auto resp = Run({"XREADGROUP", "GROUP", "grp", "Alice", "STREAMS", "x", ">"});

  auto m1 = RespElementsAre("1-0", _);
  auto m2 = RespElementsAre("2-0", _);
  auto m3 = RespElementsAre("3-0", _);
  EXPECT_THAT(resp, RespElementsAre("x", RespElementsAre(m1, m2, m3)));

  EXPECT_THAT(Run({"XDEL", "x", "1-0"}), IntArg(1));
  EXPECT_THAT(Run({"XDEL", "x", "2-0"}), IntArg(1));

  resp = Run({"XAUTOCLAIM", "x", "grp", "Bob", "0", "0-0", "COUNT", "1"});
  EXPECT_THAT(resp, RespElementsAre("2-0", ArrLen(0), RespElementsAre("1-0")));

  resp = Run({"XAUTOCLAIM", "x", "grp", "Bob", "0", "2-0", "COUNT", "1"});
  EXPECT_THAT(resp, RespElementsAre("3-0", ArrLen(0), RespElementsAre("2-0")));

  resp = Run({"XAUTOCLAIM", "x", "grp", "Bob", "0", "3-0", "COUNT", "1"});
  EXPECT_THAT(resp, RespElementsAre(
                        "0-0", RespElementsAre(RespElementsAre("3-0", RespElementsAre("f", "v"))),
                        ArrLen(0)));
  resp = Run({"xpending", "x", "grp", "-", "+", "10", "Alice"});
  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"XAUTOCLAIM", "x", "grp", "Bob", "0", "3-0", "COUNT", "704505322"});
  EXPECT_THAT(resp, ErrArg("COUNT"));
}

TEST_F(StreamFamilyTest, XAddMaxSeq) {
  Run({"XADD", "x", "1-18446744073709551615", "f1", "v1"});
  auto resp = Run({"XADD", "x", "1-*", "f2", "v2"});
  EXPECT_THAT(resp, ErrArg("The ID specified in XADD is equal or smaller"));
}

TEST_F(StreamFamilyTest, XsetIdSmallerMaxDeleted) {
  Run({"XADD", "x", "1-1", "a", "1"});
  Run({"XADD", "x", "1-2", "b", "2"});
  Run({"XADD", "x", "1-3", "c", "3"});
  Run({"XDEL", "x", "1-2"});
  Run({"XDEL", "x", "1-3"});
  auto resp = Run({"XINFO", "stream", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  auto vec = resp.GetVec();
  string max_del_id;
  for (unsigned i = 0; i < vec.size(); i += 2) {
    if (vec[i] == "max-deleted-entry-id") {
      max_del_id = vec[i + 1].GetString();
      break;
    }
  }
  EXPECT_EQ(max_del_id, "1-3");

  resp = Run({"XSETID", "x", "1-2"});
  ASSERT_THAT(resp, ErrArg("smaller"));
}

TEST_F(StreamFamilyTest, SeenActiveTime) {
  TEST_current_time_ms = 1000;

  Run({"XGROUP", "CREATE", "mystream", "mygroup", "$", "MKSTREAM"});
  Run({"XREADGROUP", "GROUP", "mygroup", "Alice", "COUNT", "1", "STREAMS", "mystream", ">"});
  AdvanceTime(100);
  auto resp = Run({"xinfo", "consumers", "mystream", "mygroup"});
  EXPECT_THAT(resp, RespElementsAre("name", "Alice", "pending", IntArg(0), "idle", IntArg(100),
                                    "inactive", IntArg(-1)));

  Run({"XADD", "mystream", "*", "f", "v"});
  Run({"XREADGROUP", "GROUP", "mygroup", "Alice", "COUNT", "1", "STREAMS", "mystream", ">"});
  AdvanceTime(50);

  resp = Run({"xinfo", "consumers", "mystream", "mygroup"});
  EXPECT_THAT(resp, RespElementsAre("name", "Alice", "pending", IntArg(1), "idle", IntArg(50),
                                    "inactive", IntArg(50)));
  AdvanceTime(100);
  resp = Run({"XREADGROUP", "GROUP", "mygroup", "Alice", "COUNT", "1", "STREAMS", "mystream", ">"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL_ARRAY));

  resp = Run({"xinfo", "consumers", "mystream", "mygroup"});

  // Idle is 0 because XREADGROUP just run, but inactive continues clocking because nothing was
  // read.
  EXPECT_THAT(resp, RespElementsAre("name", "Alice", "pending", IntArg(1), "idle", IntArg(0),
                                    "inactive", IntArg(150)));

  // Serialize/deserialize.
  resp = Run({"XINFO", "STREAM", "mystream", "FULL"});
  auto groups = resp.GetVec()[17];
  auto consumers = groups.GetVec()[0].GetVec()[13].GetVec()[0];
  EXPECT_THAT(consumers, RespElementsAre("name", "Alice", "seen-time", IntArg(1250), "active-time",
                                         IntArg(1100), "pel-count", IntArg(1), "pending", _));

  resp = Run({"DUMP", "mystream"});
  Run({"del", "mystream"});
  resp = Run({"RESTORE", "mystream", "0", resp.GetString()});
  EXPECT_EQ(resp, "OK");
  resp = Run({"XINFO", "STREAM", "mystream", "FULL"});
  groups = resp.GetVec()[17];
  consumers = groups.GetVec()[0].GetVec()[13].GetVec()[0];
  EXPECT_THAT(consumers, RespElementsAre("name", "Alice", "seen-time", IntArg(1250), "active-time",
                                         IntArg(1100), "pel-count", IntArg(1), "pending", _));
}

TEST_F(StreamFamilyTest, XClaimWithNonExistentGroup) {
  Run({"xadd", "mystream", "1-0", "field1", "value1"});
  Run({"xadd", "mystream", "1-1", "field2", "value2"});

  auto resp = Run({"xclaim", "mystream", "nonexistent-group", "consumer1", "0", "1-0"});

  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"xclaim", "mystream", "nonexistent-group", "consumer1", "0", "1-0", "1-1"});
  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"xclaim", "mystream", "nonexistent-group", "consumer1", "0", "1-0", "justid"});
  EXPECT_THAT(resp, ArrLen(0));
}

TEST_F(StreamFamilyTest, XDelNonExistentId) {
  string key = R"(k1 "v1" k2 "v2 with spaces" "k3 with spaces" "v3")";
  Run({"XADD", key, "0", "set1", "member1"});

  // Try to delete a non-existent ID - should not crash (issue #5202)
  auto resp = Run({"XDEL", key, "46-867"});
  EXPECT_THAT(resp, IntArg(0));  // Nothing deleted
}

// Test consumer group lag when tombstone created after last_id
TEST_F(StreamFamilyTest, ConsumerGroupLagWithTombstoneAfterLastId) {
  Run("DEL x");
  Run("XADD x 1-0 data a");
  Run("XADD x 2-0 data b");
  Run("XADD x 3-0 data c");
  Run("XADD x 4-0 data d");
  Run("XADD x 5-0 data e");
  Run("XADD x 6-0 data f");
  Run("XDEL x 3-0");
  Run("XGROUP CREATE x g1 0");

  // Read all messages (5 actual entries since 3-0 was deleted, but entries_added is 6)
  Run("XREADGROUP GROUP g1 c11 COUNT 10 STREAMS x >");
  auto resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "6-0", "entries-read", IntArg(6),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));

  // Add more messages
  Run("XADD x 7-0 data g");
  Run("XADD x 8-0 data h");
  Run("XADD x 9-0 data i");
  Run("XADD x 10-0 data j");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "6-0", "entries-read", IntArg(6),
                          "lag", IntArg(4), "pel-count", _, "pending", _, "consumers", _));

  // Read 3 more messages (COUNT 3 will read 7-0, 8-0, 9-0)
  Run("XREADGROUP GROUP g1 c11 COUNT 3 STREAMS x >");
  Run("XDEL x 9-0");
  // Now there is a tombstone in the stream after the consumer group last_id
  // so the lag can't be calculated
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "9-0", "entries-read", IntArg(9),
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  // Read one more message to catch up
  Run("XREADGROUP GROUP g1 c12 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "10-0", "entries-read", IntArg(10),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));
}

// Test consumer group lag with XTRIM
TEST_F(StreamFamilyTest, ConsumerGroupLagWithXTrim) {
  Run("DEL x");
  Run("XADD x 1-0 data a");
  Run("XADD x 2-0 data b");
  Run("XADD x 3-0 data c");
  Run("XADD x 4-0 data d");
  Run("XADD x 5-0 data e");
  Run("XDEL x 3-0");
  Run("XGROUP CREATE x g1 0");
  Run("XGROUP CREATE x g2 0");

  auto resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "0-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  // Read messages one by one
  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "1-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "2-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "4-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));

  // Add more messages
  Run("XADD x 6-0 data f");
  Run("XADD x 7-0 data g");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(2), "pel-count", _, "pending", _, "consumers", _));

  // XTRIM
  Run("XTRIM x MINID = 7-0");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(2), "pel-count", _, "pending", _, "consumers", _));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[1].GetVec(),
              ElementsAre("name", "g2", "last-delivered-id", "0-0", "entries-read", kMatchNil,
                          "lag", IntArg(1), "pel-count", _, "pending", _, "consumers", _));

  // Read all remaining with g1
  Run("XREADGROUP GROUP g1 c11 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "7-0", "entries-read", IntArg(7),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));
}

// Test consumer group lag with XADD trimming
TEST_F(StreamFamilyTest, ConsumerGroupLagWithXAddTrimming) {
  Run("DEL x");
  Run("XADD x 1-0 data a");
  Run("XADD x 2-0 data b");
  Run("XADD x 3-0 data c");
  Run("XADD x 4-0 data d");
  Run("XADD x 5-0 data e");
  Run("XDEL x 3-0");
  Run("XGROUP CREATE x g1 0");
  Run("XGROUP CREATE x g2 0");

  auto resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "0-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  // Read messages one by one
  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "1-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "2-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "4-0", "entries-read", kMatchNil,
                          "lag", kMatchNil, "pel-count", _, "pending", _, "consumers", _));

  Run("XREADGROUP GROUP g1 c11 COUNT 1 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));

  // Add more messages
  Run("XADD x 6-0 data f");
  Run("XADD x 7-0 data g");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(2), "pel-count", _, "pending", _, "consumers", _));

  // XADD with MINID trimming
  Run("XADD x MINID = 7-0 8-0 data h");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "5-0", "entries-read", IntArg(5),
                          "lag", IntArg(3), "pel-count", _, "pending", _, "consumers", _));
  EXPECT_THAT(resp.GetVec()[17].GetVec()[1].GetVec(),
              ElementsAre("name", "g2", "last-delivered-id", "0-0", "entries-read", kMatchNil,
                          "lag", IntArg(2), "pel-count", _, "pending", _, "consumers", _));

  // Read all remaining with g1
  Run("XREADGROUP GROUP g1 c11 STREAMS x >");
  resp = Run("XINFO STREAM x FULL");
  EXPECT_THAT(resp.GetVec()[17].GetVec()[0].GetVec(),
              ElementsAre("name", "g1", "last-delivered-id", "8-0", "entries-read", IntArg(8),
                          "lag", IntArg(0), "pel-count", _, "pending", _, "consumers", _));
}

TEST_F(StreamFamilyTest, XTrimCrashWithMallocUsedZero) {
  auto resp = Run("xadd mystream 0-0 field1 value1");
  EXPECT_THAT(
      resp, ErrArg("The ID specified in XADD is equal or smaller than the target stream top item"));

  // Without the fix we would have crashed here with check failed MallocUsed() != 0
  Run("XTRIM mystream MAXLEN 0");
}

TEST_F(StreamFamilyTest, XReadGroupMultipleStreams) {
  Run("XGROUP CREATE mystream1 mygroup $ MKSTREAM");
  Run("XGROUP CREATE mystream mygroup $ MKSTREAM");

  Run("XADD mystream 2000-0 field1 value1");
  Run("XADD mystream 2000-1 field1 value1");
  Run("XADD mystream 2000-2 field1 value1");

  Run("XADD mystream1 2000-0 field1 value1");
  Run("XADD mystream1 2000-1 field1 value1");
  Run("XADD mystream1 2000-2 field1 value1");

  auto resp = Run("XREADGROUP GROUP mygroup myconsumer STREAMS mystream mystream1 > 2000-0");

  EXPECT_THAT(resp, RespArray(ElementsAre(ArrLen(2), ArrLen(2))));

  const auto& vec = resp.GetVec();

  auto first_stream = vec[0];
  EXPECT_THAT(first_stream, RespArray(ElementsAre("mystream", ArrLen(3))));
  auto entries = first_stream.GetVec()[1].GetVec();
  EXPECT_THAT(entries[0], RespArray(ElementsAre("2000-0", RespElementsAre("field1", "value1"))));
  EXPECT_THAT(entries[1], RespArray(ElementsAre("2000-1", RespElementsAre("field1", "value1"))));
  EXPECT_THAT(entries[2], RespArray(ElementsAre("2000-2", RespElementsAre("field1", "value1"))));

  auto second_stream = vec[1];
  EXPECT_THAT(second_stream, RespArray(ElementsAre("mystream1", ArrLen(0))));
}

TEST_F(StreamFamilyTest, XGroupSetIdEntriesRead) {
  Run("XGROUP CREATE mystream mygroup $ MKSTREAM");
  Run("XADD mystream 2000-0 key val");
  Run("XGROUP SETID mystream mygroup 2000-0 ENTRIESREAD 100");

  auto resp = Run("XINFO GROUPS mystream");
  EXPECT_THAT(resp.GetVec(), ElementsAre("name", "mygroup", "consumers", IntArg(0), "pending",
                                         IntArg(0), "last-delivered-id", "2000-0", "entries-read",
                                         IntArg(100), "lag", IntArg(-99)));

  Run("XGROUP SETID mystream mygroup 2000-0 ENTRIESREAD -1");
  resp = Run("XINFO GROUPS mystream");
  EXPECT_THAT(resp.GetVec(), ElementsAre("name", "mygroup", "consumers", IntArg(0), "pending",
                                         IntArg(0), "last-delivered-id", "2000-0", "entries-read",
                                         kMatchNil, "lag", IntArg(0)));
}

TEST_F(StreamFamilyTest, XInfoConsumersArityCrash) {
  Run("XGROUP CREATE mystream mygroup $ MKSTREAM");
  auto resp = Run("XINFO CONSUMERS mystream");
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(StreamFamilyTest, GroupCreateInvalidIdMemoryTracking) {
  auto resp = Run({"xgroup", "create", "mystream", "mygroup", "notanumber", "MKSTREAM"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Verify the stream was not created (no orphan stream after the error)
  resp = Run({"exists", "mystream"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(StreamFamilyTest, XAddOnOrphanedStreamMemoryTracking) {
  auto resp = Run({"xgroup", "create", "mystream", "mygroup", "invalid_id", "MKSTREAM"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"xadd", "mystream", "0-0", "field", "value"});
  EXPECT_THAT(resp, ErrArg("equal or smaller"));

  resp = Run({"exists", "mystream"});
  EXPECT_THAT(resp, IntArg(0));
}

TEST_F(StreamFamilyTest, XAutoClaimEmptyConsumer) {
  Run({"xadd", "stream4", "*", "field", "val1"});
  Run({"xgroup", "create", "stream4", "group2", "0"});
  auto resp = Run({"xautoclaim", "stream4", "group2", "", "0", "0-0"});
  EXPECT_THAT(resp, AnyOf(ErrArg(""), ArgType(RespExpr::ARRAY)));
}

}  // namespace dfly


================================================
FILE: src/server/string_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include <absl/container/inlined_vector.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>

#include <algorithm>
#include <array>
#include <chrono>
#include <cstdint>
#include <variant>

#include "base/flags.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "core/overloaded.h"
#include "facade/cmd_arg_parser.h"
#include "facade/op_status.h"
#include "facade/reply_builder.h"
#include "facade/reply_capture.h"
#include "redis/redis_aux.h"
#include "server/acl/acl_commands_def.h"
#include "server/cmd_support.h"
#include "server/command_families.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/execution_state.h"
#include "server/family_utils.h"
#include "server/generic_family.h"
#include "server/journal/journal.h"
#include "server/search/doc_index.h"
#include "server/table.h"
#include "server/tiered_storage.h"
#include "server/transaction.h"
#include "util/fibers/future.h"

ABSL_FLAG(bool, mget_dedup_keys, false, "If true, MGET will deduplicate keys");

namespace dfly {

namespace {

using namespace std;
using namespace facade;
using namespace util;

using CI = CommandId;

enum class ExpT { EX, PX, EXAT, PXAT };

constexpr uint32_t kMaxStrLen = 1 << 28;

// Either immediately available value or tiering future + result
template <typename T> using TResultOrT = variant<T, TieredStorage::TResult<T>>;
using StringResult = TResultOrT<string>;

StringResult ReadString(DbIndex dbid, string_view key, const PrimeValue& pv, EngineShard* es) {
  return pv.IsExternal() ? StringResult{ReadTieredString(dbid, key, pv, es->tiered_storage())}
                         : StringResult{pv.ToString()};
}

// Helper for performing SET operations with various options
class SetCmd {
 public:
  explicit SetCmd(OpArgs op_args, bool explicit_journal)
      : op_args_(op_args), explicit_journal_{explicit_journal} {
  }

  enum SetFlags {
    SET_ALWAYS = 0,
    SET_IF_NOTEXIST = 1 << 0,     /* NX: Set if key not exists. */
    SET_IF_EXISTS = 1 << 1,       /* XX: Set if key exists. */
    SET_KEEP_EXPIRE = 1 << 2,     /* KEEPTTL: Set and keep the ttl */
    SET_GET = 1 << 3,             /* GET: Set if want to get key before set */
    SET_EXPIRE_AFTER_MS = 1 << 4, /* EX,PX,EXAT,PXAT: Expire after ms. */
    SET_STICK = 1 << 5,           /* Set STICK flag */
  };

  struct SetParams {
    uint16_t flags = SET_ALWAYS;
    uint32_t memcache_flags = 0;
    uint64_t expire_after_ms = 0;  // Relative value based on now. 0 means no expiration.
    optional<StringResult>* prev_val = nullptr;  // if set, previous value will be stored if found
    BackPressureFuture* backpressure = nullptr;

    constexpr bool IsConditionalSet() const {
      return flags & SET_IF_NOTEXIST || flags & SET_IF_EXISTS;
    }
  };

  OpStatus Set(const SetParams& params, std::string_view key, std::string_view value);

 private:
  OpStatus SetExisting(const SetParams& params, std::string_view value,
                       DbSlice::ItAndUpdater* it_upd);

  void AddNew(const SetParams& params, const DbSlice::Iterator& it, std::string_view key,
              std::string_view value);

  // Called at the end of AddNew of SetExisting
  void PostEdit(const SetParams& params, std::string_view key, std::string_view value,
                PrimeValue* pv);

  void RecordJournal(const SetParams& params, std::string_view key, std::string_view value);

  OpStatus CachePrevIfNeeded(const SetParams& params, DbSlice::Iterator it);

  const OpArgs op_args_;
  bool explicit_journal_;  // call RecordJournal (auto journaling disabled)
};

size_t SetRangeInternal(std::string* value, size_t start, std::string_view range) {
  value->resize(max(value->size(), start + range.size()));
  memcpy(value->data() + start, range.data(), range.size());
  return value->size();
}

OpResult<TResultOrT<size_t>> OpStrLen(const OpArgs& op_args, string_view key) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STRING);
  if (it_res == OpStatus::KEY_NOTFOUND) {
    return {0u};
  }
  RETURN_ON_BAD_STATUS(it_res);

  // For external entries we have to enqueue reads because modify operations like append could be
  // already pending.
  // TODO(vlad): Optimize to return co.Size() if no modify operations are present
  // TODO(vlad): Omit decoding string to just query it's length
  if (const auto& co = it_res.value()->second; co.IsExternal()) {
    auto cb = [](string_view s) { return s.size(); };

    TieredStorage::TResult<size_t> fut = ReadTiered<size_t>(
        op_args.db_cntx.db_index, key, co, std::move(cb), op_args.shard->tiered_storage());
    return {std::move(fut)};
  } else {
    return {co.Size()};
  }
}

OpResult<TResultOrT<size_t>> OpSetRange(const OpArgs& op_args, string_view key, size_t start,
                                        string_view range) {
  VLOG(2) << "SetRange(" << key << ", " << start << ", " << range << ")";
  auto& db_slice = op_args.GetDbSlice();

  if (range.empty()) {
    return OpStrLen(op_args, key);
  }

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_STRING);
  RETURN_ON_BAD_STATUS(op_res);
  auto& res = *op_res;

  if (res.it->second.IsExternal()) {
    return {ModifyTiered<size_t>(
        op_args.db_cntx.db_index, key, res.it->second,
        [start = start, range = string(range)](std::string* s) {
          return SetRangeInternal(s, start, range);
        },
        op_args.shard->tiered_storage())};
  } else {
    string value;

    if (!res.is_new)
      value = res.it->second.ToString();

    size_t len = SetRangeInternal(&value, start, range);
    res.it->second.SetString(value);
    return {len};
  }
}

OpResult<StringResult> OpGetRange(const OpArgs& op_args, string_view key, int32_t start,
                                  int32_t end) {
  auto read_cb = [start, end](std::string_view slice) mutable -> string {
    int32_t strlen = slice.size();
    if (strlen == 0)
      return "";

    if (start < 0) {
      if (end < start) {
        return "";
      }
      start = strlen + start;
      start = max(start, 0);
    }

    if (end < 0) {
      end = strlen + end;
      end = max(end, 0);
    } else {
      end = min(end, strlen - 1);
    }

    if (start > end) {
      return "";
    }

    return string{slice.substr(start, end - start + 1)};
  };

  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindReadOnly(op_args.db_cntx, key, OBJ_STRING);
  if (it_res == OpStatus::KEY_NOTFOUND) {
    return StringResult(string{});
  }
  RETURN_ON_BAD_STATUS(it_res);

  const PrimeValue& co = it_res.value()->second;
  if (co.IsExternal()) {
    fb2::Future<io::Result<string>> fut = ReadTiered<string>(
        op_args.db_cntx.db_index, key, co,
        [read_cb](std::string_view sv) mutable { return read_cb(sv); },
        op_args.shard->tiered_storage());
    return {std::move(fut)};
  }

  string tmp;
  string_view slice = co.GetSlice(&tmp);
  return {read_cb(slice)};
};

// TODO: Don't copy whole value just to append
size_t ExtendExisting(const DbSlice::Iterator& it, string_view key, string_view val, bool prepend) {
  string tmp;
  string_view slice = it->second.GetSlice(&tmp);

  string new_val = prepend ? absl::StrCat(val, slice) : absl::StrCat(slice, val);
  it->second.SetString(new_val);
  return new_val.size();
}

OpResult<bool> ExtendOrSkip(const OpArgs& op_args, string_view key, string_view val, bool prepend) {
  auto& db_slice = op_args.GetDbSlice();
  auto it_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STRING);
  if (!it_res) {
    return false;
  }

  return ExtendExisting(it_res->it, key, val, prepend);
}

OpResult<double> OpIncrFloat(const OpArgs& op_args, string_view key, double val) {
  auto& db_slice = op_args.GetDbSlice();

  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, OBJ_STRING);
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  char buf[128];

  if (add_res.is_new) {
    char* str = RedisReplyBuilder::FormatDouble(val, buf, sizeof(buf));
    add_res.it->second.SetString(str);

    return val;
  }

  if (add_res.it->second.Size() == 0)
    return OpStatus::INVALID_FLOAT;

  string tmp;
  string_view slice = add_res.it->second.GetSlice(&tmp);

  double base = 0;
  if (!ParseDouble(slice, &base)) {
    return OpStatus::INVALID_FLOAT;
  }

  base += val;

  if (isnan(base) || isinf(base)) {
    return OpStatus::NAN_OR_INF_DURING_INCR;
  }

  char* str = RedisReplyBuilder::FormatDouble(base, buf, sizeof(buf));

  add_res.it->second.SetString(str);

  return base;
}

// if skip_on_missing - returns KEY_NOTFOUND.
OpResult<int64_t> OpIncrBy(const OpArgs& op_args, string_view key, int64_t incr,
                           bool skip_on_missing) {
  auto& db_slice = op_args.GetDbSlice();

  // we avoid using AddOrFind because of skip_on_missing option for memcache.
  auto res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STRING);

  if (!res) {
    if (res.status() == OpStatus::WRONG_TYPE)
      return res.status();

    if (skip_on_missing)
      return OpStatus::KEY_NOTFOUND;

    PrimeValue pv;
    pv.SetInt(incr);

    auto op_result = db_slice.AddNew(op_args.db_cntx, key, std::move(pv), 0);
    RETURN_ON_BAD_STATUS(op_result);

    return incr;
  }

  // Type is already checked by FindMutable (OBJ_STRING)
  auto opt_prev = res->it->second.TryGetInt();
  if (!opt_prev) {
    return OpStatus::INVALID_VALUE;
  }

  long long prev = *opt_prev;
  if ((incr < 0 && prev < 0 && incr < (LLONG_MIN - prev)) ||
      (incr > 0 && prev > 0 && incr > (LLONG_MAX - prev))) {
    return OpStatus::OUT_OF_RANGE;
  }

  int64_t new_val = prev + incr;
  DCHECK(!res->it->second.IsExternal());
  res->it->second.SetInt(new_val);

  return new_val;
}

// Returns true if keys were set, false otherwise.
OpStatus OpMSet(const OpArgs& op_args, const ShardArgs& args) {
  DCHECK(!args.Empty() && args.Size() % 2 == 0);

  SetCmd::SetParams params;
  SetCmd sg(op_args, false);

  OpStatus result = OpStatus::OK;
  size_t stored = 0;
  for (auto it = args.begin(); it != args.end();) {
    string_view key = *(it++);
    string_view value = *(it++);
    if (auto status = sg.Set(params, key, value); status != OpStatus::OK) {
      result = status;
      break;
    }

    stored++;
  }

  // Above loop could have parial success (e.g. OOM), replicate only what changed
  if (auto journal = op_args.shard->journal(); journal) {
    if (stored * 2 == args.Size()) {
      RecordJournal(op_args, "MSET", args, op_args.tx->GetUniqueShardCnt());
      DCHECK_EQ(result, OpStatus::OK);
    } else if (stored > 0) {
      vector<string_view> store_args(args.begin(), args.end());
      store_args.resize(stored * 2);
      RecordJournal(op_args, "MSET", store_args, op_args.tx->GetUniqueShardCnt());
    }
  }
  return result;
}

bool IsValueWithinBounds(const int64_t value, const int64_t bound) {
  if (bound >= 0) {
    return value >= INT64_MIN + bound;
  }

  return value <= INT64_MAX + bound;
}

// emission_interval_ns assumed to be positive // TODO: Change to unsigned??
// limit is assumed to be positive
OpResult<array<int64_t, 5>> OpThrottle(const OpArgs& op_args, const string_view key,
                                       const int64_t limit, const int64_t emission_interval_ns,
                                       const uint64_t quantity) {
  constexpr uint64_t kSecondToMilliSecond = 1000;
  constexpr uint64_t kMilliSecondToNanoSecond = 1000000;
  auto& db_slice = op_args.GetDbSlice();

  // Total size of the bucket
  const int64_t delay_variation_tolerance_ns = emission_interval_ns * limit;  // should be positive

  int64_t remaining = 0;
  int64_t reset_after_ms = -kSecondToMilliSecond;
  int64_t retry_after_ms = -kSecondToMilliSecond;

  // Cost of this request
  const int64_t increment_ns = emission_interval_ns * quantity;  // should be nonnegative

  auto res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_STRING);
  const int64_t now_ns = GetCurrentTimeNs();

  int64_t tat_ns = now_ns;
  if (res) {
    // Type is already checked by FindMutable (OBJ_STRING)
    auto opt_prev = res->it->second.TryGetInt();
    if (!opt_prev) {
      return OpStatus::INVALID_VALUE;
    }
    tat_ns = *opt_prev;
  } else if (res.status() == OpStatus::WRONG_TYPE) {
    return res.status();
  }

  int64_t new_tat_ns = max(tat_ns, now_ns);
  if (new_tat_ns > INT64_MAX - increment_ns) {
    return OpStatus::INVALID_INT;
  }
  new_tat_ns += increment_ns;

  if (new_tat_ns < INT64_MIN + delay_variation_tolerance_ns) {
    return OpStatus::INVALID_INT;
  }

  // The cutoff point before which a request is rejected (throttled) and at or after which a request
  // is accepted.
  const int64_t allow_at_ns = new_tat_ns - delay_variation_tolerance_ns;

  if (!IsValueWithinBounds(now_ns, allow_at_ns)) {
    return OpStatus::INVALID_INT;
  }

  const int64_t diff_ns = now_ns - allow_at_ns;

  const bool limited = diff_ns < 0;
  int64_t ttl_ns;
  if (limited) {
    if (increment_ns <= delay_variation_tolerance_ns) {
      if (diff_ns == INT64_MIN) {
        return OpStatus::INVALID_INT;
      }
      retry_after_ms = (-diff_ns + kMilliSecondToNanoSecond - 1) / kMilliSecondToNanoSecond;
    }

    if (now_ns >= 0 ? tat_ns < INT64_MIN + now_ns : tat_ns > INT64_MAX + now_ns) {
      return OpStatus::INVALID_INT;
    }
    ttl_ns = tat_ns - now_ns;
  } else {
    if (!IsValueWithinBounds(new_tat_ns, now_ns)) {
      return OpStatus::INVALID_INT;
    }
    ttl_ns = new_tat_ns - now_ns;
  }

  if (ttl_ns < delay_variation_tolerance_ns - INT64_MAX) {
    return OpStatus::INVALID_INT;
  }
  const int64_t next_ns = delay_variation_tolerance_ns - ttl_ns;
  if (next_ns > -emission_interval_ns) {
    remaining = next_ns / emission_interval_ns;
  }
  reset_after_ms = (ttl_ns + kMilliSecondToNanoSecond - 1) / kMilliSecondToNanoSecond;

  if (!limited) {
    // Although most computation so far is in nanoseconds, we must store expiry as milliseconds.
    // While this causes loss of precision, the value stored against the throttle key is still in
    // the nanosecond units. When the key is loaded, that value will be read and used as tat_ns. The
    // loss of precision will cause the throttle key to be expired a bit earlier than expected, so
    // to make up, we round up its expiry by at most 1 millisecond. Extending the key life does not
    // break behavior because the tat_ns value will be used to check for throttling.
    const int64_t new_tat_ms =
        (new_tat_ns + kMilliSecondToNanoSecond - 1) / kMilliSecondToNanoSecond;
    if (res) {
      db_slice.AddExpire(op_args.db_cntx.db_index, res->it, new_tat_ms);
      res->it->second.SetInt(new_tat_ns);
    } else {
      PrimeValue pv;
      pv.SetInt(new_tat_ns);

      auto res = db_slice.AddNew(op_args.db_cntx, key, std::move(pv), new_tat_ms);
      if (!res) {
        return res.status();
      }
    }
  }

  return array<int64_t, 5>{limited ? 1 : 0, limit, remaining, retry_after_ms, reset_after_ms};
}

struct GetResp {
  string_view value;
  uint64_t mc_ver = 0;
  uint32_t mc_flag = 0;
  uint32_t ttl_sec = 0;
};

struct MGetResponse {
  explicit MGetResponse(size_t size = 0) : resp_arr(size) {
  }

  std::unique_ptr<char[]> storage;
  absl::InlinedVector<std::optional<GetResp>, 2> resp_arr;
};

template <typename Iter> using SearchKey = std::function<OpResult<Iter>(string_view)>;

// A find operation which can mutate, for commands which can write, eg GAT
using SearchMut = SearchKey<DbSlice::Iterator>;

// Const find operation, for read-only commands, eg MGet
using SearchConst = SearchKey<DbSlice::ConstIterator>;

template <typename Iter>
MGetResponse CollectKeys(BlockingCounter wait_bc, AggregateError* err, MemcacheCmdFlags cmd_flags,
                         const Transaction* t, EngineShard* shard, SearchKey<Iter> find_op) {
  ShardArgs keys = t->GetShardArgs(shard->shard_id());
  DCHECK(!keys.Empty());

  if constexpr (std::is_same_v<Iter, DbSlice::Iterator>) {
    const CommandId* cid = t->GetCId();
    DCHECK(!cid->IsReadOnly()) << "mutable iterator used with read-only command " << cid->name();
  }

  MGetResponse response(keys.Size());
  struct Item {
    Iter it;
    int source_index = -1;  // in case of duplicate keys, points to the first occurrence.
  };

  absl::InlinedVector<Item, 32> items(keys.Size());

  // First, fetch all iterators and count total size ahead
  size_t total_size = 0;
  unsigned index = 0;
  static bool mget_dedup_keys = absl::GetFlag(FLAGS_mget_dedup_keys);

  // We can not make it thread-local because we may preempt during the Find loop due to
  // replication of expiry events.
  absl::flat_hash_map<string_view, unsigned> key_index;
  if (mget_dedup_keys) {
    key_index.reserve(keys.Size());
  }

  for (string_view key : keys) {
    if (mget_dedup_keys) {
      auto [it, inserted] = key_index.try_emplace(key, index);
      if (!inserted) {  // duplicate -> point to the first occurrence.
        items[index++].source_index = it->second;
        continue;
      }
    }

    auto it_res = find_op(key);
    auto& dest = items[index++];
    if (it_res) {
      dest.it = *it_res;
      total_size += (*it_res)->second.Size();
    }
  }

  VLOG_IF(1, total_size > 10000000) << "OpMGet: allocating " << total_size << " bytes";

  // Allocate enough for all values
  response.storage = make_unique<char[]>(total_size);
  char* next = response.storage.get();
  bool fetch_mcflag = cmd_flags.return_flags;
  bool fetch_cas = cmd_flags.return_cas;
  const DbSlice& db_slice = t->GetDbSlice(shard->shard_id());

  for (size_t i = 0; i < items.size(); ++i) {
    auto it = items[i].it;
    if (it.is_done()) {
      if (items[i].source_index >= 0) {
        response.resp_arr[i] = response.resp_arr[items[i].source_index];
      }
      continue;
    }
    auto& resp = response.resp_arr[i].emplace();

    // Copy to buffer or trigger tiered read that will eventually write to
    // buffer
    const PrimeValue& value = it->second;
    if (value.IsExternal()) {
      wait_bc->Add(1);
      auto cb = [next, err, wait_bc](const io::Result<string_view>& v) mutable {
        if (v.has_value())
          memcpy(next, v->data(), v->size());
        else
          *err = v.error();
        wait_bc->Dec();
      };
      ReadTiered(t->GetDbIndex(), it.key(), value, std::move(cb), shard->tiered_storage());
    } else {
      value.GetString(next);
    }

    size_t size = value.Size();
    resp.value = string_view(next, size);
    next += size;

    // Note - correct behavior is to return TTL before it was updated by GAT,
    // but this is complex to implement so we return the updated TTL.
    if (it->first.HasExpire() && cmd_flags.return_ttl) {
      int64_t expire_time_ms = it->first.GetExpireTime();
      int64_t ttl_ms = expire_time_ms - t->GetDbContext().time_now_ms;
      resp.ttl_sec = ttl_ms > 0 ? static_cast<uint32_t>((ttl_ms + 999) / 1000) : 0;
    }
    if (fetch_mcflag) {
      if (value.HasFlag()) {
        resp.mc_flag = db_slice.GetMCFlag(t->GetDbIndex(), it->first);
      }

      if (fetch_cas) {
        resp.mc_ver = it.GetVersion();
      }
    }
  }
  key_index.clear();

  return response;
}

// Extend key with value, either prepend or append. Return size of stored string
// after modification
OpResult<TResultOrT<size_t>> OpExtend(const OpArgs& op_args, std::string_view key,
                                      std::string_view value, bool prepend) {
  auto* shard = op_args.shard;
  auto it_res = op_args.GetDbSlice().AddOrFind(op_args.db_cntx, key, OBJ_STRING);
  RETURN_ON_BAD_STATUS(it_res);

  if (it_res->is_new) {
    it_res->it->second.SetString(value);
    return {it_res->it->second.Size()};
  }

  if (const PrimeValue& pv = it_res->it->second; pv.IsExternal()) {
    auto modf = [value = string{value}, prepend](std::string* v) {
      *v = prepend ? absl::StrCat(value, *v) : absl::StrCat(*v, value);
      return v->size();
    };
    return {ModifyTiered<size_t>(op_args.db_cntx.db_index, key, pv, std::move(modf),
                                 shard->tiered_storage())};
  } else {
    return {ExtendExisting(it_res->it, key, value, prepend)};
  }
}

// Helper for building replies for strings
struct GetReplies {
  GetReplies(SinkReplyBuilder* rb) : rb{static_cast<RedisReplyBuilder*>(rb)} {
    DCHECK(dynamic_cast<RedisReplyBuilder*>(rb));
  }

  template <typename T> void Send(OpResult<T>&& res) const {
    switch (res.status()) {
      case OpStatus::OK:
        return Send(std::move(res.value()));
      case OpStatus::WRONG_TYPE:
        return rb->SendError(kWrongTypeErr);
      case OpStatus::IO_ERROR:
        return rb->SendError(kTieredIoError);
      default:
        rb->SendNull();
    }
  }

  template <typename T> void Send(optional<T>&& res) const {
    if (res.has_value())
      return Send(std::move(*res));
    return rb->SendNull();
  }

  template <typename T> void Send(TResultOrT<T>&& res) const {
    if (holds_alternative<T>(res))
      return Send(get<T>(res));

    io::Result<T> iores = get<1>(std::move(res)).Get();
    if (iores.has_value())
      Send(*iores);
    else
      Send(iores.error().message());
  }

  void Send(size_t val) const {
    rb->SendLong(val);
  }

  void Send(string_view str) const {
    rb->SendBulkString(str);
  }

  RedisReplyBuilder* rb;
};

cmd::CmdR ExtendGeneric(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view value = ArgS(args, 1);
  bool prepend = cmd_cntx->cid()->name().starts_with('P');

  VLOG(2) << "ExtendGeneric(" << key << ", " << value << ")";

  if (cmd_cntx->mc_command() == nullptr) {
    auto cb = [&](Transaction* t, EngineShard* shard) {
      return OpExtend(t->GetOpArgs(shard), key, value, prepend);
    };

    RedisReplyBuilder* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    GetReplies{rb}.Send(co_await cmd::SingleHopT(cb));
  } else {
    // Memcached skips if key is missing
    auto cb = [&](Transaction* t, EngineShard* shard) {
      return ExtendOrSkip(t->GetOpArgs(shard), key, value, prepend);
    };

    OpResult<bool> result = co_await cmd::SingleHopT(cb);
    MCRender render(cmd_cntx->mc_command()->cmd_flags);
    if (result) {
      cmd_cntx->rb()->SendSimpleString(render.RenderStored(result.value()));
    } else {
      cmd_cntx->rb()->SendError(result.status());
    }
  }

  co_return std::nullopt;
}

// Wrapper to call SetCmd::Set in ScheduleSingleHop
OpStatus SetGeneric(const SetCmd::SetParams& sparams, string_view key, string_view value,
                    const CommandContext& ctx) {
  bool explicit_journal = ctx.cid()->opt_mask() & CO::NO_AUTOJOURNAL;
  return ctx.tx()->ScheduleSingleHop([&](Transaction* t, EngineShard* shard) {
    return SetCmd(t->GetOpArgs(shard), explicit_journal).Set(sparams, key, value);
  });
}

cmd::CmdR IncrByGeneric(CommandContext* cmd_cntx, string_view key, int64_t val) {
  bool skip_on_missing = (cmd_cntx->mc_command() != nullptr);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    OpResult<int64_t> res = OpIncrBy(t->GetOpArgs(shard), key, val, skip_on_missing);
    return res;
  };
  auto result = co_await cmd::SingleHopT(cb);

  auto* rb = cmd_cntx->rb();
  switch (result.status()) {
    case OpStatus::OK:
      rb->SendLong(result.value());
      break;
    case OpStatus::INVALID_VALUE:
      rb->SendError(kInvalidIntErr);
      break;
    case OpStatus::OUT_OF_RANGE:
      rb->SendError(kIncrOverflow);
      break;
    case OpStatus::KEY_NOTFOUND:  // Relevant only for MC
      rb->SendSimpleString(MCRender{cmd_cntx->mc_command()->cmd_flags}.RenderNotFound());
      break;
    default:
      rb->SendError(result.status());
      break;
  }
  co_return std::nullopt;
}

struct GetAndTouchParams {
  const Transaction* t;
  EngineShard* shard;
  const DbSlice::ExpireParams& expire_params;
  const string_view key;
};

OpResult<DbSlice::Iterator> FindKeyAndSetExpiry(const GetAndTouchParams& params) {
  const DbContext& ctx = params.t->GetDbContext();
  DbSlice& db_slice = params.t->GetDbSlice(params.shard->shard_id());
  auto find_res = db_slice.FindMutable(ctx, params.key, OBJ_STRING);
  if (!IsValid(find_res->it)) {
    return OpStatus::KEY_NOTFOUND;
  }

  find_res->post_updater.Run();

  auto update = db_slice.UpdateExpire(ctx, find_res->it, find_res->exp_it, params.expire_params);
  if (!update.ok()) {
    return update.status();
  }

  const int64_t value = update.value();
  const bool expired = value == -1;
  if (params.shard->journal()) {
    const OpArgs& op_args = params.t->GetOpArgs(params.shard);
    if (expired) {
      RecordJournal(op_args, "DEL"sv, ArgSlice{(params.key)});
    } else {
      RecordJournal(op_args, "PEXPIREAT"sv, ArgSlice{(params.key), (absl::StrCat(value))});
    }
  }

  if (expired) {
    return OpStatus::KEY_NOTFOUND;
  }
  return find_res->it;
}

MGetResponse OpMGet(BlockingCounter wait_bc, AggregateError* err, MemcacheCmdFlags cmd_flags,
                    const Transaction* t, EngineShard* shard,
                    const DbSlice::ExpireParams* gat_params = nullptr) {
  if (gat_params) {
    SearchMut find_op = [&](string_view key) {
      return FindKeyAndSetExpiry(GetAndTouchParams{
          .t = t,
          .shard = shard,
          .expire_params = *gat_params,
          .key = key,
      });
    };
    return CollectKeys(std::move(wait_bc), err, cmd_flags, t, shard, std::move(find_op));
  } else {
    SearchConst find_op = [&](string_view key) {
      const DbSlice& db_slice = t->GetDbSlice(shard->shard_id());
      return db_slice.FindReadOnly(t->GetDbContext(), key, OBJ_STRING);
    };
    return CollectKeys(std::move(wait_bc), err, cmd_flags, t, shard, std::move(find_op));
  }
}

OpStatus SetCmd::Set(const SetParams& params, string_view key, string_view value) {
  auto& db_slice = op_args_.GetDbSlice();

  DCHECK(db_slice.IsDbValid(op_args_.db_cntx.db_index));
  VLOG(2) << "Set " << key << "(" << db_slice.shard_id() << ") ";

  if (params.IsConditionalSet()) {
    auto find_res = db_slice.FindMutable(op_args_.db_cntx, key);
    if (auto status = CachePrevIfNeeded(params, find_res.it); status != OpStatus::OK)
      return status;

    if (params.flags & SET_IF_EXISTS) {
      if (IsValid(find_res.it)) {
        return SetExisting(params, value, &find_res);
      } else {
        return OpStatus::SKIPPED;
      }
    } else {
      DCHECK(params.flags & SET_IF_NOTEXIST) << params.flags;
      if (IsValid(find_res.it)) {
        return OpStatus::SKIPPED;
      }  // else AddNew() is called below
    }
  }

  // We can use std::nullopt here because SET command can change the key type to string
  auto op_res = db_slice.AddOrFind(op_args_.db_cntx, key, std::nullopt);
  RETURN_ON_BAD_STATUS(op_res);

  if (!op_res->is_new) {
    if (auto status = CachePrevIfNeeded(params, op_res->it); status != OpStatus::OK)
      return status;

    return SetExisting(params, value, &(*op_res));
  } else {
    AddNew(params, op_res->it, key, value);
    return OpStatus::OK;
  }
}

OpStatus SetCmd::SetExisting(const SetParams& params, string_view value,
                             DbSlice::ItAndUpdater* it_upd) {
  DCHECK_EQ(params.flags & SET_IF_NOTEXIST, 0);

  PrimeKey& key = it_upd->it->first;
  PrimeValue& prime_value = it_upd->it->second;
  EngineShard* shard = op_args_.shard;

  auto& db_slice = op_args_.GetDbSlice();
  uint64_t at_ms =
      params.expire_after_ms ? params.expire_after_ms + op_args_.db_cntx.time_now_ms : 0;

  if (!(params.flags & SET_KEEP_EXPIRE)) {
    if (at_ms) {
      db_slice.AddExpire(op_args_.db_cntx.db_index, it_upd->it, at_ms);
    } else {
      db_slice.RemoveExpire(op_args_.db_cntx.db_index, it_upd->it);
    }
  }

  if (params.flags & SET_STICK) {
    key.SetSticky(true);
  }

  bool has_expire = key.HasExpire();

  it_upd->post_updater.ReduceHeapUsage();

  // Update flags
  // TODO: avoid calling SetMCFlag if flags are not changed
  prime_value.SetFlag(params.memcache_flags != 0);
  db_slice.SetMCFlag(op_args_.db_cntx.db_index, key, params.memcache_flags);

  // We need to remove the key from search indices, because we are overwriting it to OBJ_STRING
  RemoveKeyFromIndexesIfNeeded(it_upd->it.key(), op_args_.db_cntx, prime_value, shard);

  // If value is external, mark it as deleted
  if (prime_value.IsExternal()) {
    shard->tiered_storage()->Delete(op_args_.db_cntx.db_index, &prime_value);
  }

  // overwrite existing entry.
  prime_value.SetString(value);

  DCHECK_EQ(has_expire, key.HasExpire());

  PostEdit(params, it_upd->it.key(), value, &prime_value);
  return OpStatus::OK;
}

void SetCmd::AddNew(const SetParams& params, const DbSlice::Iterator& it, std::string_view key,
                    std::string_view value) {
  auto& db_slice = op_args_.GetDbSlice();
  it->second = PrimeValue{value};

  if (params.expire_after_ms) {
    db_slice.AddExpire(op_args_.db_cntx.db_index, it,
                       params.expire_after_ms + op_args_.db_cntx.time_now_ms);
  }

  if (params.memcache_flags) {
    it->second.SetFlag(true);
    db_slice.SetMCFlag(op_args_.db_cntx.db_index, it->first, params.memcache_flags);
  }

  if (params.flags & SET_STICK) {
    it->first.SetSticky(true);
  }

  PostEdit(params, key, value, &it->second);
}

void SetCmd::PostEdit(const SetParams& params, std::string_view key, std::string_view value,
                      PrimeValue* pv) {
  EngineShard* shard = op_args_.shard;

  // Currently we always try to offload, but Stash may ignore it, if disk I/O is overloaded.
  // If we are beyond the offloading threshold, StashPrimeValue may populate a backpressure future
  // via the provided out-parameter.
  if (auto* ts = shard->tiered_storage(); ts) {
    StashPrimeValue(op_args_.db_cntx.db_index, key, pv, ts, params.backpressure);
  }

  if (explicit_journal_ && op_args_.shard->journal()) {
    RecordJournal(params, key, value);
  }
}

void SetCmd::RecordJournal(const SetParams& params, string_view key, string_view value) {
  absl::InlinedVector<string_view, 5> cmds({key, value});  // 5 is theoretical maximum;

  std::string exp_str;
  if (params.flags & SET_EXPIRE_AFTER_MS) {
    exp_str = absl::StrCat(params.expire_after_ms + op_args_.db_cntx.time_now_ms);
    cmds.insert(cmds.end(), {"PXAT", exp_str});
  } else if (params.flags & SET_KEEP_EXPIRE) {
    cmds.push_back("KEEPTTL");
  }

  if (params.flags & SET_STICK) {
    cmds.push_back("STICK");
  }
  if (params.memcache_flags) {
    cmds.push_back("_MCFLAGS");
    cmds.push_back(absl::StrCat(params.memcache_flags));
  }

  // Skip NX/XX because SET operation was executed.
  // Skip GET, because its not important on replica.

  dfly::RecordJournal(op_args_, "SET", ArgSlice{cmds});
}

OpStatus SetCmd::CachePrevIfNeeded(const SetCmd::SetParams& params, DbSlice::Iterator it) {
  if (!params.prev_val || !IsValid(it))
    return OpStatus::OK;
  if (it->second.ObjType() != OBJ_STRING)
    return OpStatus::WRONG_TYPE;

  *params.prev_val =
      ReadString(op_args_.db_cntx.db_index, it.key(), it->second, EngineShard::tlocal());
  return OpStatus::OK;
}

struct NegativeExpire {};  // Returned if relative expiry was in the past
std::variant<SetCmd::SetParams, facade::ErrorReply, NegativeExpire> ParseSetParams(
    CmdArgParser parser, const CommandContext* cmd_cntx) {
  SetCmd::SetParams sparams;

  sparams.memcache_flags = cmd_cntx->mc_command() ? cmd_cntx->mc_command()->flags : 0;

  while (parser.HasNext()) {
    if (auto exp_type = parser.TryMapNext("EX", ExpT::EX, "PX", ExpT::PX, "EXAT", ExpT::EXAT,
                                          "PXAT", ExpT::PXAT);
        exp_type) {
      auto int_arg = parser.Next<int64_t>();
      if (parser.HasError())
        break;

      // We can set expiry only once.
      if (sparams.flags & SetCmd::SET_EXPIRE_AFTER_MS)
        return facade::ErrorReply{kSyntaxErr};

      sparams.flags |= SetCmd::SET_EXPIRE_AFTER_MS;

      // Since PXAT/EXAT can change this, we need to check this ahead
      if (int_arg <= 0)
        return facade::ErrorReply{InvalidExpireTime("set")};

      DbSlice::ExpireParams expiry{
          .value = int_arg,
          .unit = *exp_type == ExpT::PX || *exp_type == ExpT::PXAT ? TimeUnit::MSEC : TimeUnit::SEC,
          .absolute = *exp_type == ExpT::EXAT || *exp_type == ExpT::PXAT,
      };

      int64_t now_ms = GetCurrentTimeMs();
      auto [rel_ms, abs_ms] = expiry.Calculate(now_ms, false);
      if (abs_ms < 0)
        return facade::ErrorReply{InvalidExpireTime("set")};

      // Remove existed key if the key is expired already
      if (rel_ms < 0)
        return NegativeExpire{};

      tie(sparams.expire_after_ms, ignore) = expiry.Calculate(now_ms, true);
    } else if (parser.Check("_MCFLAGS")) {
      sparams.memcache_flags = parser.Next<uint32_t>();
    } else {
      uint16_t flag = parser.MapNext(  //
          "GET", SetCmd::SET_GET, "STICK", SetCmd::SET_STICK, "KEEPTTL", SetCmd::SET_KEEP_EXPIRE,
          "XX", SetCmd::SET_IF_EXISTS, "NX", SetCmd::SET_IF_NOTEXIST);
      sparams.flags |= flag;
    }
  }

  if (auto err = parser.TakeError(); err)
    return err.MakeReply();

  auto has_mask = [&](uint16_t m) { return (sparams.flags & m) == m; };
  if (has_mask(SetCmd::SET_IF_EXISTS | SetCmd::SET_IF_NOTEXIST) ||
      has_mask(SetCmd::SET_KEEP_EXPIRE | SetCmd::SET_EXPIRE_AFTER_MS)) {
    return facade::ErrorReply{kSyntaxErr};
  }

  return sparams;
}

cmd::CmdR CmdSet(CmdArgList args, CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args};

  auto [key, value] = parser.Next<string_view, string_view>();
  auto params_result = ParseSetParams(parser, cmd_cntx);

  if (holds_alternative<facade::ErrorReply>(params_result))
    co_return get<facade::ErrorReply>(params_result);

  if (holds_alternative<NegativeExpire>(params_result)) {
    auto del_cb = [](const Transaction* tx, EngineShard* es) {
      ShardArgs args = tx->GetShardArgs(es->shard_id());
      GenericFamily::OpDel(tx->GetOpArgs(es), args, false);
      return OpStatus::OK;
    };
    co_await cmd::SingleHop(del_cb);

    if (cmd_cntx->mc_command() != nullptr) {
      cmd_cntx->rb()->SendSimpleString(
          MCRender{cmd_cntx->mc_command()->cmd_flags}.RenderStored(true));
    } else {
      cmd_cntx->rb()->SendOk();
    }
    co_return std::nullopt;
  }

  auto& sparams = get<SetCmd::SetParams>(params_result);

  optional<StringResult> prev;
  if (sparams.flags & SetCmd::SET_GET)
    sparams.prev_val = &prev;

  optional<util::fb2::Future<bool>> backpressure;
  sparams.backpressure = &backpressure;

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return SetCmd(t->GetOpArgs(shard), true).Set(sparams, key, value);
  };

  OpStatus result = co_await cmd::SingleHop(cb);
  auto* rb = cmd_cntx->rb();

  switch (result) {
    case OpStatus::WRONG_TYPE:
      rb->SendError(kWrongTypeErr);  // TODO(vlad): use co_return after await?
      co_return std::nullopt;
    case OpStatus::OUT_OF_MEMORY:
      rb->SendError(kOutOfMemory);
      co_return std::nullopt;
    default:
      break;
  };

  // If backpressure was provided, wait with reasonable limit (to avoid client deadlocking).
  if (backpressure) {
    std::move(backpressure)->GetFor(5ms);
  }

  if (sparams.flags & SetCmd::SET_GET) {
    GetReplies{rb}.Send(std::move(prev));
    co_return std::nullopt;
  }

  if (cmd_cntx->mc_command() != nullptr) {
    MCRender render(cmd_cntx->mc_command()->cmd_flags);
    rb->SendSimpleString(render.RenderStored(result == OpStatus::OK));
  } else if (result == OpStatus::OK) {
    rb->SendOk();
  } else {
    static_cast<RedisReplyBuilder*>(rb)->SendNull();
  }

  co_return std::nullopt;
}

/// (P)SETEX key seconds (milliseconds) value
void CmdSetExGeneric(CmdArgList args, CommandContext* cmd_cntx) {
  string_view cmd_name = cmd_cntx->cid()->name();

  CmdArgParser parser{args};
  auto [key, exp_int, value] = parser.Next<string_view, int64_t, string_view>();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  if (exp_int < 1)
    return cmd_cntx->SendError(InvalidExpireTime(cmd_name));

  DbSlice::ExpireParams expiry{
      .value = exp_int,
      .unit = cmd_name.front() == 'P' ? TimeUnit::MSEC : TimeUnit::SEC,
      .absolute = false,
  };

  int64_t now_ms = GetCurrentTimeMs();
  auto [_, abs_ms] = expiry.Calculate(now_ms, false);
  if (abs_ms < 0)
    return cmd_cntx->SendError(InvalidExpireTime("set"));

  SetCmd::SetParams sparams;
  sparams.flags |= SetCmd::SET_EXPIRE_AFTER_MS;
  sparams.expire_after_ms = expiry.Calculate(now_ms, true).first;
  cmd_cntx->SendError(SetGeneric(sparams, key, value, *cmd_cntx));
}

void CmdSetNx(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view value = ArgS(args, 1);

  SetCmd::SetParams sparams;
  sparams.flags |= SetCmd::SET_IF_NOTEXIST;
  if (cmd_cntx->mc_command())
    sparams.memcache_flags = cmd_cntx->mc_command()->flags;

  switch (SetGeneric(sparams, key, value, *cmd_cntx)) {
    case OpStatus::OK:
      return cmd_cntx->SendLong(1);  // Successfully set the value
    case OpStatus::OUT_OF_MEMORY:
      return cmd_cntx->SendError(kOutOfMemory);
    case OpStatus::SKIPPED:
      return cmd_cntx->SendLong(0);  // Existed, zero updates performed
    default:
      LOG(FATAL) << "Invalid result";
  }
}

void CmdGet(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [key = ArgS(args, 0)](Transaction* tx, EngineShard* es) -> OpResult<StringResult> {
    auto it_res = tx->GetDbSlice(es->shard_id()).FindReadOnly(tx->GetDbContext(), key, OBJ_STRING);
    if (!it_res.ok())
      return it_res.status();

    return ReadString(tx->GetDbIndex(), key, (*it_res)->second, es);
  };

  GetReplies{cmd_cntx->rb()}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

void CmdGetDel(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [key = ArgS(args, 0)](Transaction* tx, EngineShard* es) -> OpResult<StringResult> {
    auto& db_slice = tx->GetDbSlice(es->shard_id());
    auto it_res = db_slice.FindMutable(tx->GetDbContext(), key, OBJ_STRING);
    if (!it_res.ok())
      return it_res.status();

    auto value = ReadString(tx->GetDbIndex(), key, it_res->it->second, es);
    db_slice.DelMutable(tx->GetDbContext(), std::move(*it_res));
    return value;
  };

  GetReplies{cmd_cntx->rb()}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

void CmdDigest(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto cb = [&key](Transaction* tx, EngineShard* es) -> OpResult<string> {
    auto it_res = tx->GetDbSlice(es->shard_id()).FindReadOnly(tx->GetDbContext(), key, OBJ_STRING);
    if (!it_res.ok()) {
      return it_res.status();
    }

    // Read string value (handles tiered storage if needed)
    StringResult str_result = ReadString(tx->GetDbIndex(), key, (*it_res)->second, es);

    // Handle both immediate value and tiered storage future
    string value;
    if (holds_alternative<string>(str_result)) {
      value = std::move(get<string>(str_result));
    } else {
      auto& future = get<TieredStorage::TResult<string>>(str_result);
      io::Result<string> io_res = future.Get();
      if (!io_res) {
        return OpStatus::IO_ERROR;
      }
      value = std::move(*io_res);
    }

    // Compute XXH3 hash and return as 16-char hex string
    return XXH3_Digest(value);
  };

  OpResult<string> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (result) {
    rb->SendBulkString(*result);
  } else if (result.status() == OpStatus::KEY_NOTFOUND) {
    rb->SendNull();
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdGetSet(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view value = ArgS(args, 1);

  optional<StringResult> prev;
  SetCmd::SetParams sparams{.prev_val = &prev};

  if (OpStatus status = SetGeneric(sparams, key, value, *cmd_cntx); status != OpStatus::OK)
    return cmd_cntx->SendError(status);

  GetReplies{cmd_cntx->rb()}.Send(std::move(prev));
}

void CmdGetEx(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser{args};
  string_view key = parser.Next();

  DbSlice::ExpireParams exp_params;
  bool defined = false;
  auto* builder = cmd_cntx->rb();
  while (parser.HasNext()) {
    if (auto exp_type = parser.TryMapNext("EX", ExpT::EX, "PX", ExpT::PX, "EXAT", ExpT::EXAT,
                                          "PXAT", ExpT::PXAT);
        exp_type) {
      auto int_arg = parser.Next<int64_t>();
      RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

      if (defined) {
        return cmd_cntx->SendError(kSyntaxErr, kSyntaxErrType);
      }

      if (int_arg <= 0) {
        return cmd_cntx->SendError(InvalidExpireTime("getex"));
      }

      exp_params.absolute = *exp_type == ExpT::EXAT || *exp_type == ExpT::PXAT;
      exp_params.value = int_arg;
      exp_params.unit =
          *exp_type == ExpT::PX || *exp_type == ExpT::PXAT ? TimeUnit::MSEC : TimeUnit::SEC;
      defined = true;
    } else if (parser.Check("PERSIST")) {
      exp_params.persist = true;
    } else {
      return builder->SendError(kSyntaxErr);
    }
  }

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<StringResult> {
    auto op_args = t->GetOpArgs(shard);

    auto it_res = op_args.GetDbSlice().FindMutable(op_args.db_cntx, key, OBJ_STRING);
    if (!it_res)
      return it_res.status();

    StringResult value = ReadString(t->GetDbIndex(), key, it_res->it->second, shard);

    if (exp_params.IsDefined()) {
      it_res->post_updater.Run();  // Run manually before possible delete due to negative expire
      RETURN_ON_BAD_STATUS(op_args.GetDbSlice().UpdateExpire(op_args.db_cntx, it_res->it,
                                                             it_res->exp_it, exp_params));
    }

    // Replicate GETEX as PEXPIREAT or PERSIST
    if (shard->journal()) {
      if (exp_params.persist) {
        RecordJournal(op_args, "PERSIST", {key});
      } else {
        auto [ignore, abs_time] = exp_params.Calculate(op_args.db_cntx.time_now_ms, false);
        auto abs_time_str = absl::StrCat(abs_time);
        RecordJournal(op_args, "PEXPIREAT", {key, abs_time_str});
      }
    }

    return value;
  };

  GetReplies{cmd_cntx->rb()}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

cmd::CmdR CmdIncr(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  return IncrByGeneric(cmd_cntx, key, 1);
}

cmd::CmdR CmdIncrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view sval = ArgS(args, 1);
  int64_t val;

  if (!absl::SimpleAtoi(sval, &val)) {
    cmd_cntx->SendError(kInvalidIntErr);
    return cmd::kAborted;
  }
  return IncrByGeneric(cmd_cntx, key, val);
}

cmd::CmdR CmdIncrByFloat(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view sval = ArgS(args, 1);
  double val;

  if (!absl::SimpleAtod(sval, &val)) {
    co_return facade::ErrorReply{kInvalidFloatErr};
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpIncrFloat(t->GetOpArgs(shard), key, val);
  };

  OpResult<double> result = co_await cmd::SingleHopT(cb);
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (result)
    rb->SendDouble(result.value());
  else
    rb->SendError(result.status());
  co_return std::nullopt;
}

cmd::CmdR CmdDecr(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  return IncrByGeneric(cmd_cntx, key, -1);
}

cmd::CmdR CmdDecrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view sval = ArgS(args, 1);
  int64_t val;

  if (!absl::SimpleAtoi(sval, &val)) {
    cmd_cntx->SendError(kInvalidIntErr);
    return cmd::kAborted;
  }
  if (val == INT64_MIN) {
    cmd_cntx->SendError(kIncrOverflow);
    return cmd::kAborted;
  }

  return IncrByGeneric(cmd_cntx, key, -val);
}

// Reorder per-shard results according to argument order of primary command
void ReorderShardResults(absl::Span<MGetResponse> mget_resp, const Transaction* t,
                         absl::Span<optional<GetResp>> dest) {
  for (ShardId sid = 0; sid < mget_resp.size(); ++sid) {
    if (!t->IsActive(sid))
      continue;

    auto& src = mget_resp[sid];
    ShardArgs shard_args = t->GetShardArgs(sid);
    unsigned src_indx = 0;
    for (auto it = shard_args.begin(); it != shard_args.end(); ++it, ++src_indx) {
      if (!src.resp_arr[src_indx])
        continue;

      DCHECK_LT(it.index(), dest.size());
      auto& item = dest[it.index()];
      item = src.resp_arr[src_indx];
    }
  }
}

cmd::CmdR MGetGeneric(CommandContext* cmd_cntx, CmdArgList args,
                      std::optional<DbSlice::ExpireParams> gat_params) {
  DCHECK_GE(args.size(), 1U);

  MemcacheCmdFlags cmd_flags;

  if (cmd_cntx->mc_command()) {
    cmd_flags = cmd_cntx->mc_command()->cmd_flags;
  }

  fb2::BlockingCounter tiering_bc{0};  // Count of pending tiered reads
  AggregateError tiering_err;          // First tiering error

  unique_ptr<MGetResponse[]> mget_resp(new MGetResponse[shard_set->size()]);

  auto gat_ptr = gat_params ? &*gat_params : nullptr;
  auto cb = [&](Transaction* t, EngineShard* shard) {
    mget_resp[shard->shard_id()] = OpMGet(tiering_bc, &tiering_err, cmd_flags, t, shard, gat_ptr);
    return OpStatus::OK;
  };

  // Waiter objects needs to be used to keep tx alive in its scope for ReorderShardResults
  cmd::SingleHopWaiter waiter{cmd_cntx, cb};
  auto result = co_await waiter;
  CHECK_EQ(OpStatus::OK, result);

  // wait for all tiered reads to finish and check for errors
  tiering_bc->Wait();
  if (auto err = std::move(tiering_err).Destroy(); err) {
    cmd_cntx->rb()->SendError(err.message());
    co_return std::nullopt;
  }

  size_t arg_len = args.size();

  unique_ptr<optional<GetResp>[]> mget_results(new optional<GetResp>[arg_len]);
  ReorderShardResults(absl::MakeSpan(mget_resp.get(), shard_set->size()), cmd_cntx->tx(),
                      absl::MakeSpan(mget_results.get(), arg_len));

  SinkReplyBuilder::ReplyScope scope{cmd_cntx->rb()};
  if (cmd_cntx->mc_command()) {
    auto* mc_builder = static_cast<MCReplyBuilder*>(cmd_cntx->rb());
    facade::MCRender mc_render{cmd_cntx->mc_command()->cmd_flags};
    for (size_t i = 0; i < arg_len; ++i) {
      const auto& entry = mget_results[i];
      if (entry) {
        mc_builder->SendValue(cmd_cntx->mc_command()->cmd_flags, cmd_cntx->at(i), entry->value, 0,
                              entry->mc_flag, entry->ttl_sec);
      } else {
        mc_builder->SendSimpleString(mc_render.RenderMiss());
      }
    }
    mc_builder->SendSimpleString(mc_render.RenderGetEnd());
  } else {
    auto* redis_builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    redis_builder->StartArray(arg_len);
    for (size_t i = 0; i < arg_len; ++i) {
      const auto& entry = mget_results[i];
      if (entry) {
        redis_builder->SendBulkString(entry->value);
      } else {
        redis_builder->SendNull();
      }
    }
  }
  co_return std::nullopt;
}

cmd::CmdR CmdMGet(CmdArgList args, CommandContext* cmd_cntx) {
  return MGetGeneric(cmd_cntx, args, std::nullopt);
}

// Implements the memcache GAT command. The expected input is
// GAT key [keys...]
// The expiry argument is stored in mc_command()->expire_ts
cmd::CmdR CmdGAT(CmdArgList args, CommandContext* cmd_cntx) {
  if (!cmd_cntx->mc_command()) {
    cmd_cntx->SendError("GAT is a memcache-only command");
    return cmd::kAborted;
  }
  int64_t expire_ts = cmd_cntx->mc_command()->expire_ts;
  DbSlice::ExpireParams expire_params{
      .value = expire_ts, .absolute = true, .persist = expire_ts == 0};
  return MGetGeneric(cmd_cntx, args, expire_params);
}

void CmdMSet(CmdArgList args, CommandContext* cmd_cntx) {
  if (VLOG_IS_ON(2)) {
    string str;
    for (size_t i = 1; i < args.size(); ++i) {
      absl::StrAppend(&str, " ", ArgS(args, i));
    }
    LOG(INFO) << "MSET/" << cmd_cntx->tx()->GetUniqueShardCnt() << str;
  }

  AggregateStatus result;
  auto cb = [&](Transaction* t, EngineShard* shard) {
    ShardArgs args = t->GetShardArgs(shard->shard_id());
    if (auto status = OpMSet(t->GetOpArgs(shard), args); status != OpStatus::OK)
      result = status;
    return OpStatus::OK;
  };

  if (auto status = cmd_cntx->tx()->ScheduleSingleHop(std::move(cb)); status != OpStatus::OK)
    result = status;

  if (*result == OpStatus::OK) {
    cmd_cntx->SendOk();
  } else {
    cmd_cntx->SendError(*result);
  }
}

void CmdMSetNx(CmdArgList args, CommandContext* cmd_cntx) {
  atomic_bool exists{false};

  auto cb = [&](Transaction* t, EngineShard* es) {
    auto sid = es->shard_id();
    auto args = t->GetShardArgs(sid);
    auto op_args = t->GetOpArgs(es);
    for (auto arg_it = args.begin(); arg_it != args.end(); ++arg_it) {
      auto it = op_args.GetDbSlice().FindReadOnly(t->GetDbContext(), *arg_it).it;
      ++arg_it;
      if (IsValid(it)) {
        exists.store(true, memory_order_relaxed);
        break;
      }
    }

    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(cb), false);
  const bool to_skip = exists.load(memory_order_relaxed);

  AggregateStatus result;
  auto epilog_cb = [&](Transaction* t, EngineShard* shard) {
    if (to_skip)
      return OpStatus::OK;

    auto args = t->GetShardArgs(shard->shard_id());
    if (auto status = OpMSet(t->GetOpArgs(shard), args); status != OpStatus::OK)
      result = status;
    return OpStatus::OK;
  };
  cmd_cntx->tx()->Execute(std::move(epilog_cb), true);

  cmd_cntx->SendLong(to_skip || (*result != OpStatus::OK) ? 0 : 1);
}

void CmdStrLen(CmdArgList args, CommandContext* cmd_cntx) {
  auto cb = [key = ArgS(args, 0)](Transaction* t, EngineShard* shard) {
    return OpStrLen(t->GetOpArgs(shard), key);
  };
  GetReplies{cmd_cntx->rb()}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

void CmdGetRange(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  auto [key, start, end] = parser.Next<string_view, int32_t, int32_t>();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  auto cb = [&, &key = key, &start = start, &end = end](Transaction* t, EngineShard* shard) {
    return OpGetRange(t->GetOpArgs(shard), key, start, end);
  };

  GetReplies{cmd_cntx->rb()}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

void CmdSetRange(CmdArgList args, CommandContext* cmd_cntx) {
  CmdArgParser parser(args);
  auto [key, start, value] = parser.Next<string_view, int32_t, string_view>();
  auto* builder = cmd_cntx->rb();

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  if (start < 0) {
    return builder->SendError("offset is out of range");
  }

  if (size_t min_size = start + value.size(); min_size > kMaxStrLen) {
    return builder->SendError("string exceeds maximum allowed size");
  }

  auto cb = [&, &key = key, &start = start, &value = value](Transaction* t, EngineShard* shard) {
    return OpSetRange(t->GetOpArgs(shard), key, start, value);
  };
  GetReplies{builder}.Send(cmd_cntx->tx()->ScheduleSingleHopT(cb));
}

/* CL.THROTTLE <key> <max_burst> <count per period> <period> [<quantity>] */
/* Response is array of 5 integers. The meaning of each array item is:
 *  1. Whether the action was limited:
 *   - 0 indicates the action is allowed.
 *   - 1 indicates that the action was limited/blocked.
 *  2. The total limit of the key (max_burst + 1). This is equivalent to the
 * common X-RateLimit-Limit HTTP header.
 *  3. The remaining limit of the key. Equivalent to X-RateLimit-Remaining.
 *  4. The number of seconds until the user should retry, and always -1 if the
 * action was allowed. Equivalent to Retry-After.
 *  5. The number of seconds until the limit will reset to its maximum capacity.
 * Equivalent to X-RateLimit-Reset.
 */
void CmdClThrottle(CmdArgList args, CommandContext* cmd_cntx) {
  constexpr uint64_t kSecondToNanoSecond = 1000000000;
  const string_view key = ArgS(args, 0);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  // Allow max burst in number of tokens
  uint64_t max_burst;
  const string_view max_burst_str = ArgS(args, 1);
  if (!absl::SimpleAtoi(max_burst_str, &max_burst)) {
    return rb->SendError(kInvalidIntErr);
  }

  // Emit count of tokens per period
  uint64_t count;
  const string_view count_str = ArgS(args, 2);
  if (!absl::SimpleAtoi(count_str, &count)) {
    return rb->SendError(kInvalidIntErr);
  }

  // Period of emitting count of tokens
  uint64_t period;
  const string_view period_str = ArgS(args, 3);
  if (!absl::SimpleAtoi(period_str, &period)) {
    return rb->SendError(kInvalidIntErr);
  }

  // Apply quantity of tokens now
  uint64_t quantity = 1;
  if (args.size() > 4) {
    const string_view quantity_str = ArgS(args, 4);

    if (!absl::SimpleAtoi(quantity_str, &quantity)) {
      return rb->SendError(kInvalidIntErr);
    }
  }

  if (max_burst > INT64_MAX - 1) {
    return rb->SendError(kInvalidIntErr);
  }
  const int64_t limit = max_burst + 1;

  if (period > UINT64_MAX / kSecondToNanoSecond || count == 0 ||
      period * kSecondToNanoSecond / count > INT64_MAX) {
    return rb->SendError(kInvalidIntErr);
  }

  const int64_t emission_interval_ns = period * kSecondToNanoSecond / count;

  if (emission_interval_ns == 0) {
    return rb->SendError("zero rates are not supported");
  }

  if (emission_interval_ns > INT64_MAX / limit) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  if (quantity != 0 && static_cast<uint64_t>(emission_interval_ns) > INT64_MAX / quantity) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<array<int64_t, 5>> {
    return OpThrottle(t->GetOpArgs(shard), key, limit, emission_interval_ns, quantity);
  };

  Transaction* trans = cmd_cntx->tx();
  OpResult<array<int64_t, 5>> result = trans->ScheduleSingleHopT(std::move(cb));

  if (result) {
    RedisReplyBuilder* redis_builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    redis_builder->StartArray(result->size());
    auto& array = result.value();

    int64_t retry_after_s = array[3] / 1000;
    if (array[3] > 0) {
      retry_after_s += 1;
    }
    array[3] = retry_after_s;

    int64_t reset_after_s = array[4] / 1000;
    if (array[4] > 0) {
      reset_after_s += 1;
    }
    array[4] = reset_after_s;

    for (const auto& v : array) {
      redis_builder->SendLong(v);
    }
  } else {
    switch (result.status()) {
      case OpStatus::WRONG_TYPE:
        cmd_cntx->SendError(kWrongTypeErr);
        break;
      case OpStatus::INVALID_INT:
      case OpStatus::INVALID_VALUE:
        cmd_cntx->SendError(kInvalidIntErr);
        break;
      case OpStatus::OUT_OF_MEMORY:
        cmd_cntx->SendError(kOutOfMemory);
        break;
      default:
        cmd_cntx->SendError(result.status());
        break;
    }
  }
}

}  // namespace

#define HFUNC(x) SetHandler(&Cmd##x)

void RegisterStringFamily(CommandRegistry* registry) {
  constexpr uint32_t kMSetMask = CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL;

  registry->StartFamily(acl::STRING);
  *registry
      << CI{"SET", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, -3, 1, 1}.SetAsyncHandler(
             CmdSet)
      << CI{"SETEX", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, 4, 1, 1}.HFUNC(SetExGeneric)
      << CI{"PSETEX", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, 4, 1, 1}.HFUNC(SetExGeneric)
      << CI{"SETNX", CO::JOURNALED | CO::DENYOOM | CO::FAST, 3, 1, 1}.HFUNC(SetNx)
      << CI{"APPEND", CO::JOURNALED | CO::DENYOOM | CO::FAST, 3, 1, 1}.SetAsyncHandler(
             ExtendGeneric)
      << CI{"PREPEND", CO::JOURNALED | CO::DENYOOM | CO::FAST, 3, 1, 1}.SetAsyncHandler(
             ExtendGeneric)
      << CI{"INCR", CO::JOURNALED | CO::FAST, 2, 1, 1}.SetAsyncHandler(CmdIncr)
      << CI{"DECR", CO::JOURNALED | CO::FAST, 2, 1, 1}.SetAsyncHandler(CmdDecr)
      << CI{"INCRBY", CO::JOURNALED | CO::FAST, 3, 1, 1}.SetAsyncHandler(CmdIncrBy)
      << CI{"INCRBYFLOAT", CO::JOURNALED | CO::FAST, 3, 1, 1}.SetAsyncHandler(CmdIncrByFloat)
      << CI{"DECRBY", CO::JOURNALED | CO::FAST, 3, 1, 1}.SetAsyncHandler(CmdDecrBy)
      << CI{"GET", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(Get)
      << CI{"GETDEL", CO::JOURNALED | CO::FAST, 2, 1, 1}.HFUNC(GetDel)
      << CI{"DIGEST", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(Digest)
      << CI{"GETEX", CO::JOURNALED | CO::DENYOOM | CO::FAST | CO::NO_AUTOJOURNAL, -2, 1, 1}.HFUNC(
             GetEx)
      << CI{"GETSET", CO::JOURNALED | CO::DENYOOM | CO::FAST, 3, 1, 1}.HFUNC(GetSet)
      << CI{"MGET", CO::READONLY | CO::FAST | CO::IDEMPOTENT, -2, 1, -1}.SetAsyncHandler(CmdMGet)
      << CI{"MSET", kMSetMask, -3, 1, -1}.HFUNC(MSet)
      << CI{"MSETNX", kMSetMask, -3, 1, -1}.HFUNC(MSetNx)
      << CI{"STRLEN", CO::READONLY | CO::FAST, 2, 1, 1}.HFUNC(StrLen)
      << CI{"GETRANGE", CO::READONLY, 4, 1, 1}.HFUNC(GetRange)
      << CI{"SUBSTR", CO::READONLY, 4, 1, 1}.HFUNC(GetRange)  // Alias for GetRange
      << CI{"SETRANGE", CO::JOURNALED | CO::DENYOOM, 4, 1, 1}.HFUNC(SetRange)
      << CI{"CL.THROTTLE", CO::JOURNALED | CO::DENYOOM | CO::FAST, -5, 1, 1, acl::THROTTLE}.HFUNC(
             ClThrottle)
      << CI{"GAT", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL | CO::HIDDEN, -2, 1, -1}
             .SetAsyncHandler(CmdGAT);
}

}  // namespace dfly


================================================
FILE: src/server/string_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/conn_context.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/test_utils.h"
#include "server/transaction.h"

using namespace testing;
using namespace std;
using namespace util;
using absl::StrCat;

namespace dfly {

class StringFamilyTest : public BaseFamilyTest {
 protected:
};

vector<int64_t> ToIntArr(const RespExpr& e) {
  vector<int64_t> res;
  CHECK_EQ(e.type, RespExpr::ARRAY);
  const RespVec* vec = get<RespVec*>(e.u);
  for (auto a : *vec) {
    int64_t val;
    std::string_view s = ToSV(a.GetBuf());
    CHECK(absl::SimpleAtoi(s, &val)) << s;
    res.push_back(val);
  }

  return res;
}

TEST_F(StringFamilyTest, SetGet) {
  EXPECT_EQ(Run({"set", "key", "val"}), "OK");
  EXPECT_EQ(Run({"get", "key"}), "val");
  EXPECT_EQ(Run({"set", "key1", "1"}), "OK");
  EXPECT_EQ(Run({"get", "key1"}), "1");
  EXPECT_EQ(Run({"set", "key", "2"}), "OK");
  EXPECT_EQ(Run({"get", "key"}), "2");
  EXPECT_THAT(Run({"get", "key3"}), ArgType(RespExpr::NIL));

  auto metrics = GetMetrics();
  EXPECT_EQ(7, metrics.coordinator_stats.tx_normal_cnt);
  EXPECT_EQ(3, metrics.events.hits);
  EXPECT_EQ(1, metrics.events.misses);
  EXPECT_EQ(3, metrics.events.mutations);
}

TEST_F(StringFamilyTest, Incr) {
  ASSERT_EQ(Run({"set", "key", "0"}), "OK");
  ASSERT_THAT(Run({"incr", "key"}), IntArg(1));

  ASSERT_EQ(Run({"set", "key1", "123456789"}), "OK");
  ASSERT_THAT(Run({"incrby", "key1", "0"}), IntArg(123456789));

  ASSERT_EQ(Run({"set", "key1", "-123456789"}), "OK");
  ASSERT_THAT(Run({"incrby", "key1", "0"}), IntArg(-123456789));

  ASSERT_EQ(Run({"set", "key1", "   -123  "}), "OK");
  ASSERT_THAT(Run({"incrby", "key1", "1"}), ErrArg("ERR value is not an integer"));

  ASSERT_THAT(Run({"incrby", "ne", "0"}), IntArg(0));
  ASSERT_THAT(Run({"decrby", "a", "-9223372036854775808"}), ErrArg("overflow"));
  auto metrics = GetMetrics();
  EXPECT_EQ(9, metrics.events.mutations);
  EXPECT_EQ(0, metrics.events.misses);
  EXPECT_EQ(0, metrics.events.hits);
}

TEST_F(StringFamilyTest, Append) {
  Run({"setex", "key", "100", "val"});
  EXPECT_THAT(Run({"ttl", "key"}), IntArg(100));

  EXPECT_THAT(Run({"append", "key", "bar"}), IntArg(6));
  EXPECT_THAT(Run({"ttl", "key"}), IntArg(100));
}

TEST_F(StringFamilyTest, Expire) {
  ASSERT_EQ(Run({"set", "key", "val", "PX", "20"}), "OK");

  AdvanceTime(10);
  EXPECT_EQ(Run({"get", "key"}), "val");

  AdvanceTime(10);

  EXPECT_THAT(Run({"get", "key"}), ArgType(RespExpr::NIL));

  ASSERT_THAT(Run({"set", "i", "1", "PX", "10"}), "OK");
  ASSERT_THAT(Run({"incr", "i"}), IntArg(2));

  AdvanceTime(10);
  ASSERT_THAT(Run({"incr", "i"}), IntArg(1));
}

TEST_F(StringFamilyTest, Keepttl) {
  ASSERT_EQ(Run({"set", "key", "val", "EX", "100"}), "OK");
  ASSERT_EQ(Run({"set", "key", "val"}), "OK");
  auto resp = Run({"ttl", "key"});
  auto actual = get<int64_t>(resp.u);
  ASSERT_EQ(actual, -1);

  resp = Run({"set", "key", "val", "EX", "200"});
  ASSERT_EQ(Run({"set", "key", "val", "KEEPTTL"}), "OK");

  resp = Run({"ttl", "key"});
  actual = get<int64_t>(resp.u);

  EXPECT_TRUE(actual > 0 && actual <= 200);
}

TEST_F(StringFamilyTest, SetOptionsSyntaxError) {
  auto TEST_current_time_s = TEST_current_time_ms / 1000;

  EXPECT_THAT(Run({"set", "key", "val", "EX", "1030", "PX", "1030"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "EX", "1030", "EXAT", absl::StrCat(TEST_current_time_s + 1030)}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "EX", "1030", "PXAT", absl::StrCat(TEST_current_time_ms + 1030)}),
      ErrArg("ERR syntax error"));

  EXPECT_THAT(Run({"set", "key", "val", "PX", "1030", "EX", "1030"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "PX", "1030", "EXAT", absl::StrCat(TEST_current_time_s + 1030)}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "PX", "1030", "PXAT", absl::StrCat(TEST_current_time_ms + 1030)}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "EXAT", absl::StrCat(TEST_current_time_s + 1030), "EX", "1030"}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "EXAT", absl::StrCat(TEST_current_time_s + 1030), "PX", "1030"}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(Run({"set", "key", "val", "EXAT", absl::StrCat(TEST_current_time_s + 1030), "PXAT",
                   absl::StrCat(TEST_current_time_ms + 1030)}),
              ErrArg("ERR syntax error"));

  EXPECT_THAT(
      Run({"set", "key", "val", "PXAT", absl::StrCat(TEST_current_time_ms + 1030), "EX", "1030"}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "PXAT", absl::StrCat(TEST_current_time_ms + 1030), "PX", "1030"}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(Run({"set", "key", "val", "PXAT", absl::StrCat(TEST_current_time_ms + 1030), "EXAT",
                   absl::StrCat(TEST_current_time_s + 1030)}),
              ErrArg("ERR syntax error"));

  EXPECT_THAT(Run({"set", "key", "val", "EX", "1030", "KEEPTTL"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(Run({"set", "key", "val", "PX", "1030", "KEEPTTL"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "EXAT", absl::StrCat(TEST_current_time_s + 1030), "KEEPTTL"}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "PXAT", absl::StrCat(TEST_current_time_ms + 1030), "KEEPTTL"}),
      ErrArg("ERR syntax error"));

  EXPECT_THAT(Run({"set", "key", "val", "KEEPTTL", "PX", "1030"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "KEEPTTL", "PXAT", absl::StrCat(TEST_current_time_ms + 1030)}),
      ErrArg("ERR syntax error"));
  EXPECT_THAT(Run({"set", "key", "val", "KEEPTTL", "EX", "1030"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(
      Run({"set", "key", "val", "KEEPTTL", "EXAT", absl::StrCat(TEST_current_time_s + 1030)}),
      ErrArg("ERR syntax error"));

  EXPECT_THAT(Run({"set", "key", "val", "NX", "XX"}), ErrArg("ERR syntax error"));
  EXPECT_THAT(Run({"set", "key", "val", "XX", "NX"}), ErrArg("ERR syntax error"));

  EXPECT_THAT(Run({"set", "key", "val", "PX", "9223372036854775800"}),
              ErrArg("invalid expire time"));
  EXPECT_THAT(Run({"SET", "foo", "bar", "EX", "18446744073709561"}), ErrArg("invalid expire time"));
}

TEST_F(StringFamilyTest, Set) {
  auto resp = Run({"set", "foo", "bar", "XX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"set", "foo", "bar", "NX"});
  ASSERT_THAT(resp, "OK");
  resp = Run({"set", "foo", "bar", "NX"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"set", "foo", "bar", "xx"});
  ASSERT_THAT(resp, "OK");

  resp = Run({"set", "foo", "bar", "ex", "abc"});
  ASSERT_THAT(resp, ErrArg(kInvalidIntErr));

  resp = Run({"set", "foo", "bar", "ex", "-1"});
  ASSERT_THAT(resp, ErrArg("invalid expire time"));

  resp = Run({"set", "foo", "bar", "ex", "1"});
  ASSERT_THAT(resp, "OK");

  ASSERT_THAT(Run({"sadd", "s1", "1"}), IntArg(1));
  ASSERT_THAT(Run({"set", "s1", "2"}), "OK");
}

TEST_F(StringFamilyTest, SetHugeKey) {
  const string key(36000000, 'b');
  auto resp = Run({"set", key, "1"});
  ASSERT_THAT(resp, "OK");
  Run({"del", key});
}

TEST_F(StringFamilyTest, MSetLong) {
  vector<string> command({"mset"});
  for (unsigned i = 0; i < 12000; ++i) {
    command.push_back(StrCat("key", i));
    command.push_back(StrCat("val", i));
  }
  auto resp = Run(absl::MakeSpan(command));
  EXPECT_EQ(resp, "OK");
}

TEST_F(StringFamilyTest, MGetSet) {
  Run({"mset", "z", "0"});         // single key
  auto resp = Run({"mget", "z"});  // single key
  EXPECT_THAT(resp, "0");

  Run({"mset", "x", "0", "b", "0"});

  ASSERT_EQ(2, GetDebugInfo("IO0").shards_count);

  auto mget_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      RespExpr resp = Run({"mget", "b", "x"});
      ASSERT_THAT(resp, ArrLen(2));
      auto ivec = ToIntArr(resp);

      ASSERT_GE(ivec[1], ivec[0]);
    }
  });

  auto set_fb = pp_->at(1)->LaunchFiber([&] {
    for (size_t i = 1; i < 2000; ++i) {
      Run({"set", "x", StrCat(i)});
      Run({"set", "b", StrCat(i)});
    }
  });

  mget_fb.Join();
  set_fb.Join();
}

TEST_F(StringFamilyTest, MGetCachingModeBug2276) {
  absl::FlagSaver fs;
  SetTestFlag("cache_mode", "true");
  ResetService();
  Run({"debug", "populate", "18000", "key", "32", "RAND"});

  // Scan starts traversing the database, because we populated the database with lots of items we
  // assume that scan will return items from the same bucket that reside next to each other.
  auto resp = Run({"scan", "0"});
  ASSERT_THAT(resp, ArrLen(2));
  StringVec vec = StrArray(resp.GetVec()[1]);
  ASSERT_GE(vec.size(), 10);

  auto get_bump_ups = [](const string& str) -> size_t {
    const string matcher = "bump_ups:";
    const auto pos = str.find(matcher) + matcher.size();
    const auto next_new_line =
        str.find("\r\n", pos);  // Find the position of the next "\r\n" after the initial position
    const auto sub = str.substr(pos, next_new_line - pos);
    return atoi(sub.c_str());
  };

  resp = Run({"info", "stats"});
  EXPECT_EQ(get_bump_ups(resp.GetString()), 0);

  auto mget_resp = StrArray(Run(
      {"mget", vec[0], vec[1], vec[2], vec[3], vec[4], vec[5], vec[6], vec[7], vec[8], vec[9]}));

  resp = Run({"info", "stats"});
  size_t bumps1 = get_bump_ups(resp.GetString());

  EXPECT_GE(bumps1, 0);
  EXPECT_LE(bumps1, 10);

  for (int i = 0; i < 10; ++i) {
    auto get_resp = Run({"get", vec[i]});
    EXPECT_EQ(get_resp, mget_resp[i]);
  }

  resp = Run({"info", "stats"});
  size_t bumps2 = get_bump_ups(resp.GetString());
  EXPECT_GT(bumps2, bumps1);
}

TEST_F(StringFamilyTest, MGetCachingModeBug2465) {
  absl::FlagSaver fs;
  SetTestFlag("cache_mode", "true");
  ResetService();
  Run({"debug", "populate", "18000", "key", "32", "RAND"});

  // Scan starts traversing the database, because we populated the database with lots of items we
  // assume that scan will return items from the same bucket that reside next to each other.
  auto resp = Run({"scan", "0"});
  ASSERT_THAT(resp, ArrLen(2));
  StringVec vec = StrArray(resp.GetVec()[1]);
  ASSERT_GE(vec.size(), 10);

  auto get_bump_ups = [](const string& str) -> size_t {
    const string matcher = "bump_ups:";
    const auto pos = str.find(matcher) + matcher.size();
    const auto next_new_line =
        str.find("\r\n", pos);  // Find the position of the next "\r\n" after the initial position
    const auto sub = str.substr(pos, next_new_line - pos);
    return atoi(sub.c_str());
  };

  resp = Run({"info", "stats"});
  EXPECT_EQ(get_bump_ups(resp.GetString()), 0);

  Run({"del", vec[1]});
  Run({"lpush", vec[1], "a"});

  resp = Run({"get", vec[2]});
  string val = resp.GetString();
  auto mget_resp = StrArray(Run({"mget", vec[2], vec[2], vec[2]}));
  EXPECT_THAT(mget_resp, ElementsAre(val, val, val));

  resp = Run({"info", "stats"});
  size_t bumps = get_bump_ups(resp.GetString());
  EXPECT_EQ(bumps, 2);  // one bump for get and one for mget
}

TEST_F(StringFamilyTest, MSetGet) {
  Run({"mset", "x", "0", "y", "0", "a", "0", "b", "0"});
  ASSERT_EQ(2, GetDebugInfo().shards_count);

  Run({"mset", "x", "0", "y", "0"});
  ASSERT_EQ(1, GetDebugInfo().shards_count);

  Run({"mset", "x", "1", "b", "5", "x", "0"});
  ASSERT_EQ(2, GetDebugInfo().shards_count);

  int64_t val = CheckedInt({"get", "x"});
  EXPECT_EQ(0, val);

  val = CheckedInt({"get", "b"});
  EXPECT_EQ(5, val);

  auto mset_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      RespExpr resp = Run({"mset", "x", StrCat(i), "b", StrCat(i)});
      ASSERT_EQ(resp, "OK") << i;
    }
  });

  // A problematic order when mset is not atomic: set x, get x, get b (old), set b
  auto get_fb = pp_->at(2)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      int64_t x = CheckedInt({"get", "x"});
      int64_t z = CheckedInt({"get", "b"});

      ASSERT_LE(x, z) << "Inconsistency at " << i;
    }
  });

  mset_fb.Join();
  get_fb.Join();
}

TEST_F(StringFamilyTest, MSetDel) {
  auto mset_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      Run({"mset", "x", "0", "z", "0"});
    }
  });

  auto del_fb = pp_->at(2)->LaunchFiber([&] {
    for (size_t i = 0; i < 1000; ++i) {
      CheckedInt({"del", "x", "z"});
    }
  });

  mset_fb.Join();
  del_fb.Join();
}

TEST_F(StringFamilyTest, IntKey) {
  Run({"mset", "1", "1", "-1000", "-1000"});
  auto resp = Run({"get", "1"});
  ASSERT_THAT(resp, "1");
}

TEST_F(StringFamilyTest, SingleShard) {
  Run({"mset", "x", "1", "y", "1"});
  ASSERT_EQ(1, GetDebugInfo("IO0").shards_count);

  Run({"mget", "x", "y", "b"});
  ASSERT_EQ(2, GetDebugInfo("IO0").shards_count);

  auto resp = Run({"mget", "x", "y"});
  ASSERT_EQ(1, GetDebugInfo("IO0").shards_count);
  ASSERT_THAT(ToIntArr(resp), ElementsAre(1, 1));

  auto mset_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 0; i < 100; ++i) {
      Run({"mset", "x", "0", "y", "0"});
    }
  });

  // Specially multiple shards to avoid fast-path.
  auto mget_fb = pp_->at(1)->LaunchFiber([&] {
    for (size_t i = 0; i < 100; ++i) {
      Run({"mget", "x", "b", "y"});
    }
  });
  mset_fb.Join();
  mget_fb.Join();
}

TEST_F(StringFamilyTest, MSetIncr) {
  /*  serializable orders
   init: x=z=0

   mset x=z=1
   mset, incr x, incr z = 2, 2
   incr x, mset, incr z = 1, 2
   incr x, incr z, mset = 1, 1
*/

  /* unserializable scenario when mset is not atomic with respect to incr x
      set x, incr x, incr z, set z = 2, 1
    */

  Run({"mset", "a", "0", "b", "0", "c", "0"});
  ASSERT_EQ(2, GetDebugInfo("IO0").shards_count);

  auto mset_fb = pp_->at(0)->LaunchFiber([&] {
    for (size_t i = 1; i < 1000; ++i) {
      string base = StrCat(i * 900);
      auto resp = Run({"mset", "b", base, "a", base, "c", base});
      ASSERT_EQ(resp, "OK");
    }
  });

  auto get_fb = pp_->at(1)->LaunchFiber([&] {
    for (unsigned j = 0; j < 900; ++j) {
      int64_t a = CheckedInt({"incr", "a"});
      int64_t b = CheckedInt({"incr", "b"});
      ASSERT_LE(a, b);

      int64_t c = CheckedInt({"incr", "c"});
      if (a > c) {
        LOG(ERROR) << "Consistency error ";
      }
      ASSERT_LE(a, c);
    }
  });
  mset_fb.Join();
  get_fb.Join();
}

TEST_F(StringFamilyTest, SetEx) {
  ASSERT_EQ(Run({"setex", "key", "1", "val"}), "OK");
  ASSERT_EQ(Run({"setex", "key", "10", "val"}), "OK");
  ASSERT_THAT(Run({"ttl", "key"}), IntArg(10));
  ASSERT_THAT(Run({"setex", "key", "0", "val"}), ErrArg("invalid expire time"));
  ASSERT_EQ(Run({"setex", "key", StrCat(5 * 365 * 24 * 3600), "val"}), "OK");
  ASSERT_THAT(Run({"setex", "key", StrCat(1 << 30), "val"}), "OK");
  ASSERT_THAT(Run({"ttl", "key"}), IntArg(kMaxExpireDeadlineSec));
  ASSERT_THAT(Run({"SETEX", "foo", "18446744073709561", "bar"}), ErrArg("invalid expire time"));
}

TEST_F(StringFamilyTest, Range) {
  Run({"set", "key1", "Hello World"});
  EXPECT_EQ(Run({"getrange", "key1", "5", "3"}), "");

  Run({"SETRANGE", "key1", "6", "Earth"});
  EXPECT_EQ(Run({"get", "key1"}), "Hello Earth");

  Run({"SETRANGE", "key2", "2", "Earth"});
  EXPECT_EQ(Run({"get", "key2"}), string_view("\000\000Earth", 7));

  Run({"SETRANGE", "key3", "0", ""});
  EXPECT_EQ(0, CheckedInt({"exists", "key3"}));

  Run({"SETRANGE", "key3", "0", "abc"});
  EXPECT_EQ(1, CheckedInt({"exists", "key3"}));

  Run({"SET", "key3", "123"});
  EXPECT_EQ(Run({"getrange", "key3", "2", "3"}), "3");
  EXPECT_EQ(Run({"getrange", "key3", "3", "3"}), "");
  EXPECT_EQ(Run({"getrange", "key3", "4", "5"}), "");

  Run({"SET", "num", "1234"});
  EXPECT_EQ(Run({"getrange", "num", "3", "5000"}), "4");
  EXPECT_EQ(Run({"getrange", "num", "-5000", "10000"}), "1234");

  Run({"SET", "key4", "1"});
  EXPECT_EQ(Run({"getrange", "key4", "-1", "-2"}), "");
  EXPECT_EQ(Run({"getrange", "key4", "0", "-2"}), "1");

  EXPECT_EQ(CheckedInt({"SETRANGE", "key5", "1", ""}), 0);
  EXPECT_EQ(Run({"GET", "key5"}).type, facade::RespExpr::NIL);

  EXPECT_EQ(CheckedInt({"SETRANGE", "num", "6", ""}), 4);
  EXPECT_EQ(Run({"GET", "num"}), "1234");

  // we support only 256MB string so this test is failed now
  // EXPECT_THAT(CheckedInt({"SETRANGE", "", "268435456", "0"}), 268435457);
}

TEST_F(StringFamilyTest, IncrByFloat) {
  Run({"SET", "nonum", "  11"});
  auto resp = Run({"INCRBYFLOAT", "nonum", "1.0"});
  EXPECT_THAT(resp, ErrArg("not a valid float"));

  Run({"SET", "inf", "+inf"});
  resp = Run({"INCRBYFLOAT", "inf", "1.0"});
  EXPECT_THAT(resp, ErrArg("increment would produce NaN or Infinity"));

  Run({"SET", "nonum", "11 "});
  resp = Run({"INCRBYFLOAT", "nonum", "1.0"});
  EXPECT_THAT(resp, ErrArg("not a valid float"));

  Run({"SET", "num", "2.566"});
  resp = Run({"INCRBYFLOAT", "num", "1.0"});
  EXPECT_EQ(resp, "3.566");
}

TEST_F(StringFamilyTest, RestoreHighTTL) {
  Run({"SET", "X", "1"});
  auto buffer = Run({"DUMP", "X"}).GetBuf();
  Run({"DEL", "X"});
  EXPECT_EQ(Run({"RESTORE", "X", "5430186761345", ToSV(buffer)}), "OK");
}

TEST_F(StringFamilyTest, SetNx) {
  // Make sure that we "screen out" invalid parameters for this command
  // this is important as it uses similar path as the "normal" set
  auto resp = Run({"setnx", "foo", "bar", "XX"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"setnx", "foo", "bar", "NX"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"setnx", "foo", "bar", "xx"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"setnx", "foo", "bar", "ex", "abc"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"setnx", "foo", "bar", "ex", "-1"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"setnx", "foo", "bar", "ex", "1"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments"));

  // now let see how it goes for the valid parameters
  EXPECT_EQ(1, CheckedInt({"setnx", "foo", "bar"}));
  EXPECT_EQ(Run({"get", "foo"}), "bar");
  // second call to the same key should return 0 as we have it
  EXPECT_EQ(0, CheckedInt({"setnx", "foo", "hello"}));
  EXPECT_EQ(Run({"get", "foo"}), "bar");  // the value was not changed
}

TEST_F(StringFamilyTest, SetPxAtExAt) {
  // Expiration time as set at unix time
  auto TEST_current_time_s = TEST_current_time_ms / 1000;

  auto resp = Run({"set", "foo", "bar", "EXAT", "-1"});
  ASSERT_THAT(resp, ErrArg("invalid expire time"));
  resp = Run({"set", "foo", "bar", "EXAT", absl::StrCat(TEST_current_time_s - 1)});
  ASSERT_THAT(resp, "OK");  // it would return OK but will not set the value - expiration time is 0
                            // (checked with Redis)
  EXPECT_EQ(Run({"get", "foo"}).type, facade::RespExpr::NIL);

  resp = Run({"set", "foo", "bar", "PXAT", "-1"});
  ASSERT_THAT(resp, ErrArg("invalid expire time"));

  resp = Run({"set", "foo", "bar", "PXAT", absl::StrCat(TEST_current_time_ms - 23)});
  ASSERT_THAT(resp, "OK");  // it would return OK but will not set the value (checked with Redis)
  EXPECT_EQ(Run({"get", "foo"}).type, facade::RespExpr::NIL);

  resp = Run({"set", "foo", "bar", "EXAT", absl::StrCat(TEST_current_time_s + 1)});
  ASSERT_THAT(resp, "OK");  // valid expiration time
  EXPECT_EQ(Run({"get", "foo"}), "bar");

  resp = Run({"set", "foo2", "abc", "PXAT", absl::StrCat(TEST_current_time_ms + 300)});
  ASSERT_THAT(resp, "OK");
  EXPECT_EQ(Run({"get", "foo2"}), "abc");
}

TEST_F(StringFamilyTest, SetStick) {
  Run({"set", "foo", "bar", "STICK"});
  EXPECT_THAT(Run({"STICK", "foo"}), IntArg(0));
}

TEST_F(StringFamilyTest, GetDel) {
  auto resp = Run({"set", "foo", "bar"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"getdel", "foo"});
  // foo's value
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));

  resp = Run({"get", "foo"});
  ASSERT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(StringFamilyTest, GetEx) {
  auto resp = Run({"set", "foo", "bar"});
  EXPECT_THAT(resp, "OK");

  resp = Run({"getex", "foo", "EX"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"getex", "foo", "EX", "1", "px", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"getex", "foo", "bar", "EX"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"getex", "foo", "PERSIST", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"getex", "foo", "PXAT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"getex", "foo", "EX", "0"});
  EXPECT_THAT(resp, ErrArg("invalid expire time"));

  resp = Run({"getex", "foo", "PXAT", "-1"});
  EXPECT_THAT(resp, ErrArg("invalid expire time"));

  EXPECT_EQ(Run({"getex", "foo"}), "bar");

  resp = Run({"getex", "foo", "PERSIST"});
  EXPECT_EQ(resp, "bar");
  EXPECT_THAT(Run({"TTL", "foo"}), IntArg(-1));

  resp = Run({"getex", "foo", "pxat", absl::StrCat(TEST_current_time_ms - 1)});
  EXPECT_EQ(resp, "bar");

  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));

  Run({"set", "foo", "bar"});

  resp = Run({"getex", "foo", "PXAT", absl::StrCat(TEST_current_time_ms + 10)});
  EXPECT_EQ(resp, "bar");

  AdvanceTime(9);
  EXPECT_EQ(Run({"getex", "foo"}), "bar");

  AdvanceTime(1);
  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));

  Run({"set", "foo", "bar"});

  resp = Run({"getex", "foo", "exat", absl::StrCat(TEST_current_time_ms / 1000 - 1)});
  EXPECT_EQ(resp, "bar");
  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));

  Run({"set", "foo", "bar"});

  uint64_t next_two_seconds = TEST_current_time_ms + 2000;
  uint64_t next_two_seconds_round_down = static_cast<uint64_t>(next_two_seconds / 1000);
  uint64_t diff = next_two_seconds_round_down * 1000 - TEST_current_time_ms;

  resp = Run({"getex", "foo", "EXAT", absl::StrCat(next_two_seconds_round_down)});
  EXPECT_EQ(resp, "bar");

  AdvanceTime(diff - 1);
  EXPECT_EQ(Run({"getex", "foo"}), "bar");

  AdvanceTime(1);
  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));

  Run({"set", "foo", "bar"});

  resp = Run({"getex", "foo", "PX", "10"});

  AdvanceTime(9);
  EXPECT_EQ(Run({"getex", "foo"}), "bar");

  AdvanceTime(1);
  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));

  Run({"set", "foo", "bar"});

  resp = Run({"getex", "foo", "ex", "1"});

  AdvanceTime(999);
  EXPECT_EQ(Run({"getex", "foo"}), "bar");

  AdvanceTime(1);
  EXPECT_THAT(Run({"getex", "foo"}), ArgType(RespExpr::NIL));
}

TEST_F(StringFamilyTest, ClThrottle) {
  const int64_t limit = 5;
  const char* const key = "foo";
  const char* const max_burst = "4";  // limit - 1
  const char* const count = "1";
  const char* const period = "10";

  // You can never make a request larger than the maximum.
  auto resp = Run({"cl.throttle", key, max_burst, count, period, "6"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(1), IntArg(limit), IntArg(5), IntArg(-1), IntArg(0)));

  // Rate limit normal requests appropriately.
  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(4), IntArg(-1), IntArg(11)));

  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(3), IntArg(-1), IntArg(21)));

  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(2), IntArg(-1), IntArg(31)));

  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(1), IntArg(-1), IntArg(41)));

  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(0), IntArg(-1), IntArg(51)));

  resp = Run({"cl.throttle", key, max_burst, count, period});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(1), IntArg(limit), IntArg(0), IntArg(11), IntArg(51)));

  AdvanceTime(30000);
  resp = Run({"cl.throttle", key, max_burst, count, period, "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(2), IntArg(-1), IntArg(31)));

  AdvanceTime(1000);
  resp = Run({"cl.throttle", key, max_burst, count, period, "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(1), IntArg(-1), IntArg(40)));

  AdvanceTime(9000);
  resp = Run({"cl.throttle", key, max_burst, count, period, "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(1), IntArg(-1), IntArg(41)));

  AdvanceTime(40000);
  resp = Run({"cl.throttle", key, max_burst, count, period, "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(4), IntArg(-1), IntArg(11)));

  AdvanceTime(15000);
  resp = Run({"cl.throttle", key, max_burst, count, period, "1"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(4), IntArg(-1), IntArg(11)));

  // Zero-volume request just peeks at the state.
  resp = Run({"cl.throttle", key, max_burst, count, period, "0"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(4), IntArg(-1), IntArg(11)));

  // High-volume request uses up more of the limit.
  resp = Run({"cl.throttle", key, max_burst, count, period, "2"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(2), IntArg(-1), IntArg(31)));

  // Large requests cannot exceed limits
  resp = Run({"cl.throttle", key, max_burst, count, period, "5"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(1), IntArg(limit), IntArg(2), IntArg(31), IntArg(31)));

  // Zero rates aren't supported
  resp = Run({"cl.throttle", "bar", "10", "1", "0"});
  ASSERT_EQ(RespExpr::ERROR, resp.type);
  EXPECT_THAT(resp, ErrArg("zero rates are not supported"));

  // count == 0
  resp = Run({"cl.throttle", "bar", "10", "0", "1"});
  ASSERT_EQ(RespExpr::ERROR, resp.type);
  EXPECT_THAT(resp, ErrArg(kInvalidIntErr));

  // emission interval = 2000 nanoseconds, cost = 2 units
  resp = Run({"cl.throttle", "bar", max_burst, "500000", "1", "2"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  ASSERT_THAT(resp.GetVec(),
              ElementsAre(IntArg(0), IntArg(limit), IntArg(limit - 2), IntArg(-1), IntArg(1)));
}

TEST_F(StringFamilyTest, SetMGetWithNilResp3) {
  Run({"hello", "3"});

  EXPECT_EQ(Run({"set", "key", "val"}), "OK");
  EXPECT_EQ(Run({"get", "key"}), "val");
  RespExpr resp = Run({"mget", "key", "nonexist"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre("val", ArgType(RespExpr::NIL)));
}

TEST_F(StringFamilyTest, OverrideOther) {
  Run({"lpush", "a", "fooo"});
  Run({"set", "a", string(100, 'b')});
  Metrics metrics = GetMetrics();

  size_t list_usage = metrics.db_stats[0].memory_usage_by_type[OBJ_LIST];
  size_t string_usage = metrics.db_stats[0].memory_usage_by_type[OBJ_STRING];
  EXPECT_EQ(list_usage, 0);
  EXPECT_GT(string_usage, 0);
  EXPECT_LT(string_usage, 100);
}

TEST_F(StringFamilyTest, SetWithGetParam) {
  EXPECT_THAT(Run({"set", "key1", "val1", "get"}), ArgType(RespExpr::NIL));
  EXPECT_EQ(Run({"set", "key1", "val2", "get"}), "val1");

  EXPECT_THAT(Run({"set", "key2", "val2", "nx", "get"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"set", "key2", "not used", "nx", "get"}), "val2");
  EXPECT_EQ(Run({"get", "key2"}), "val2");

  EXPECT_THAT(Run({"set", "key3", "not used", "xx", "get"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"set", "key2", "val3", "xx", "get"}), "val2");
  EXPECT_EQ(Run({"get", "key2"}), "val3");

  EXPECT_THAT(Run({"sadd", "key4", "1"}), IntArg(1));
  EXPECT_THAT(Run({"set", "key4", "2", "get"}), ErrArg("wrong kind of value"));
  EXPECT_THAT(Run({"set", "key4", "2", "xx", "get"}), ErrArg("wrong kind of value"));
}

TEST_F(StringFamilyTest, SetWithHashtagsNoCluster) {
  SetTestFlag("cluster_mode", "");
  SetTestFlag("lock_on_hashtags", "false");
  ResetService();

  auto fb = ExpectUsedKeys({"{key}1"});
  EXPECT_EQ(Run({"set", "{key}1", "val1"}), "OK");
  fb.Join();
  EXPECT_FALSE(IsLocked(0, "{key}1"));

  fb = ExpectUsedKeys({"{key}2"});
  EXPECT_EQ(Run({"set", "{key}2", "val2"}), "OK");
  fb.Join();

  fb = ExpectUsedKeys({"{key}1", "{key}2"});
  EXPECT_THAT(Run({"mget", "{key}1", "{key}2"}), RespArray(ElementsAre("val1", "val2")));
  fb.Join();
  EXPECT_NE(1, GetDebugInfo().shards_count);
}

TEST_F(StringFamilyTest, SetWithHashtagsWithEmulatedCluster) {
  SetTestFlag("cluster_mode", "emulated");
  SetTestFlag("lock_on_hashtags", "false");
  ResetService();

  auto fb = ExpectUsedKeys({"{key}1"});
  EXPECT_EQ(Run({"set", "{key}1", "val1"}), "OK");
  fb.Join();

  fb = ExpectUsedKeys({"{key}2"});
  EXPECT_EQ(Run({"set", "{key}2", "val2"}), "OK");
  fb.Join();

  fb = ExpectUsedKeys({"{key}1", "{key}2"});
  EXPECT_THAT(Run({"mget", "{key}1", "{key}2"}), RespArray(ElementsAre("val1", "val2")));
  fb.Join();
  EXPECT_EQ(1, GetDebugInfo().shards_count);
}

TEST_F(StringFamilyTest, SetWithHashtagsWithHashtagLock) {
  SetTestFlag("cluster_mode", "emulated");
  SetTestFlag("lock_on_hashtags", "true");
  ResetService();

  auto fb = ExpectUsedKeys({"key"});
  EXPECT_EQ(Run({"set", "{key}1", "val1"}), "OK");
  fb.Join();

  fb = ExpectUsedKeys({"key"});
  EXPECT_EQ(Run({"set", "{key}2", "val2"}), "OK");
  fb.Join();

  fb = ExpectUsedKeys({"key"});
  EXPECT_THAT(Run({"mget", "{key}1", "{key}2"}), RespArray(ElementsAre("val1", "val2")));
  fb.Join();
  EXPECT_EQ(1, GetDebugInfo().shards_count);
}

TEST_F(StringFamilyTest, MultiSetWithHashtagsDontLockHashtags) {
  SetTestFlag("cluster_mode", "");
  SetTestFlag("lock_on_hashtags", "false");
  ResetService();

  auto fb = ExpectUsedKeys({"{key}1", "{key}2", "{key}3"});

  EXPECT_EQ(Run({"multi"}), "OK");
  EXPECT_EQ(Run({"set", "{key}1", "val1"}), "QUEUED");
  EXPECT_EQ(Run({"set", "{key}2", "val2"}), "QUEUED");
  EXPECT_EQ(Run({"eval", "return redis.call('set', KEYS[1], 'val3')", "1", "{key}3"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK", "OK")));
  fb.Join();
}

TEST_F(StringFamilyTest, MultiSetWithHashtagsLockHashtags) {
  SetTestFlag("cluster_mode", "emulated");
  SetTestFlag("lock_on_hashtags", "true");
  ResetService();

  auto fb = ExpectUsedKeys({"key"});

  EXPECT_EQ(Run({"multi"}), "OK");
  EXPECT_EQ(Run({"set", "{key}1", "val1"}), "QUEUED");
  EXPECT_EQ(Run({"set", "{key}2", "val2"}), "QUEUED");
  EXPECT_EQ(Run({"eval", "return redis.call('set', KEYS[1], 'val3')", "1", "{key}3"}), "QUEUED");
  EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK", "OK")));
  fb.Join();
}

TEST_F(StringFamilyTest, EmptyKeys) {
  EXPECT_EQ(0, CheckedInt({"strlen", "foo"}));
  EXPECT_EQ(Run({"SUBSTR", "foo", "0", "-1"}), "");
}

TEST_F(StringFamilyTest, Digest) {
  // Basic digest computation returns 16-char hex string
  Run({"set", "key", "value"});
  auto resp = Run({"digest", "key"});
  ASSERT_EQ(resp.type, RespExpr::STRING);
  string digest = resp.GetString();
  EXPECT_EQ("87d57e269b9df0f0", digest);

  // Digest of non-existent key returns nil
  EXPECT_THAT(Run({"digest", "nonexistent"}), ArgType(RespExpr::NIL));

  // Digest consistency - same value always produces same digest
  Run({"set", "key1", "testvalue"});
  Run({"set", "key2", "testvalue"});
  auto digest1 = Run({"digest", "key1"});
  auto digest2 = Run({"digest", "key2"});
  EXPECT_EQ(ToSV(digest1.GetBuf()), ToSV(digest2.GetBuf()));

  // Different values produce different digests
  Run({"set", "key3", "different"});
  auto digest3 = Run({"digest", "key3"});
  EXPECT_NE(ToSV(digest1.GetBuf()), ToSV(digest3.GetBuf()));

  // Works with integer-encoded strings
  Run({"set", "intkey", "123"});
  auto int_digest = Run({"digest", "intkey"});
  ASSERT_EQ(int_digest.type, RespExpr::STRING);
  EXPECT_EQ(16, ToSV(int_digest.GetBuf()).size());

  // Works with empty strings
  Run({"set", "empty", ""});
  auto empty_digest = Run({"digest", "empty"});
  ASSERT_EQ(empty_digest.type, RespExpr::STRING);
  EXPECT_EQ(16, ToSV(empty_digest.GetBuf()).size());

  // Digest of non-string type returns WRONGTYPE error
  Run({"lpush", "list", "item"});
  EXPECT_THAT(Run({"digest", "list"}), ErrArg("WRONGTYPE"));
}

// GAT is a memcache-only command. Sending it via Redis RESP protocol should return an error
// instead of crashing (DCHECK on mc_command()).
TEST_F(StringFamilyTest, GatViaRedisProtocol) {
  Run({"set", "key", "val"});
  auto resp = Run({"GAT", "key"});
  EXPECT_THAT(resp, ErrArg("memcache-only"));
}

TEST_F(StringFamilyTest, MSetNxOddArgs) {
  auto resp = Run({"msetnx", "key", "value", "key2"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"mset", "key", "value", "key2"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));
}

}  // namespace dfly


================================================
FILE: src/server/string_stats.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/string_stats.h"

#include <absl/strings/str_cat.h>

#include "base/logging.h"

namespace {

void MakeHLL(HllBufferPtr* p) {
  p->size = getDenseHllSize();
  p->hll = new uint8_t[p->size];
  CHECK_EQ(0, createDenseHll(*p));
}

}  // namespace

namespace dfly {

using namespace container_utils;

UniqueStrings::UniqueStrings() {
  MakeHLL(&counter_);
}

UniqueStrings::UniqueStrings(UniqueStrings&& other) noexcept
    : total_count{other.total_count}, total_bytes{other.total_bytes}, counter_{other.counter_} {
  other.counter_ = HllBufferPtr{};
}

UniqueStrings& UniqueStrings::operator=(UniqueStrings&& other) noexcept {
  if (this == &other) {
    return *this;
  }

  delete[] counter_.hll;
  counter_ = other.counter_;
  total_count = other.total_count;
  total_bytes = other.total_bytes;
  other.counter_ = HllBufferPtr{};
  return *this;
}

void UniqueStrings::AddHMap(const PrimeValue& pv) {
  // Only adds the keys of a map
  IterateMap(pv, [&](const ContainerEntry& k, const auto&) { return AddString(k); });
}

void UniqueStrings::AddSet(const PrimeValue& pv) {
  IterateSet(pv, [&](const ContainerEntry& e) { return AddString(e); });
}

void UniqueStrings::AddList(const PrimeValue& pv) {
  IterateList(pv, [&](const ContainerEntry& e) { return AddString(e); });
}

void UniqueStrings::AddZSet(const PrimeValue& pv) {
  IterateSortedSet(pv, [&](const ContainerEntry& e, auto) { return AddString(e); });
}

void UniqueStrings::Add(const UniqueStrings& other) {
  total_count += other.total_count;
  total_bytes += other.total_bytes;
  HllBufferPtr inputs[2] = {other.counter_, counter_};
  CHECK_EQ(0, pfmerge(inputs, 2, counter_));
}

std::string UniqueStrings::ToString(std::string_view label) const {
  if (total_count == 0)
    return {};
  std::string result;
  absl::StrAppend(&result, label, ":\n");
  absl::StrAppend(&result, "  total strings: ", total_count, "\n");
  absl::StrAppend(&result, "  unique strings: ", UniqueCount(), "\n");
  absl::StrAppend(&result, "  total bytes: ", total_bytes, "\n");
  absl::StrAppend(&result, "  average length: ", AverageLength(), "\n");
  absl::StrAppend(&result, "  estimated savings: ", ByteSavingsOnDedup(), " bytes\n");
  return result;
}

bool UniqueStrings::AddString(const ContainerEntry& e) {  // NOLINT must always return true
  // Count both strings and ints, because ints might be used as keys and will benefit from
  // deduplication just like strings.
  if (e.IsString()) {
    CHECK_NE(-1, pfadd_dense(counter_, reinterpret_cast<const unsigned char*>(e.data()), e.size()));
    ++total_count;
    total_bytes += e.size();
  } else {
    char buf[absl::numbers_internal::kFastToBufferSize];
    const char* end = absl::numbers_internal::FastIntToBuffer(e.as_long(), buf);
    const auto size = end - buf;
    const int result = pfadd_dense(counter_, reinterpret_cast<const unsigned char*>(buf), size);
    CHECK_NE(-1, result);
    ++total_count;
    total_bytes += size;
  }
  return true;
}

uint64_t UniqueStrings::ByteSavingsOnDedup() const {
  const auto uniques = UniqueCount();
  const auto diff = total_count > uniques ? total_count - uniques : 0;
  return diff * AverageLength();
}

}  // namespace dfly


================================================
FILE: src/server/string_stats.h
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

extern "C" {
#include "redis/hyperloglog.h"
}

#include "server/container_utils.h"

namespace dfly {

struct UniqueStrings {
  uint64_t total_count{0};
  uint64_t total_bytes{0};

  UniqueStrings();
  ~UniqueStrings() {
    delete[] counter_.hll;
  }

  UniqueStrings(const UniqueStrings&) = delete;
  UniqueStrings& operator=(const UniqueStrings&) = delete;

  // To store in flat hash map
  UniqueStrings(UniqueStrings&&) noexcept;
  UniqueStrings& operator=(UniqueStrings&&) noexcept;

  void AddHMap(const PrimeValue& pv);
  void AddSet(const PrimeValue& pv);
  void AddList(const PrimeValue& pv);
  void AddZSet(const PrimeValue& pv);

  void Add(const UniqueStrings& other);

  std::string ToString(std::string_view label) const;

 private:
  HllBufferPtr counter_;
  bool AddString(const container_utils::ContainerEntry& e);

  uint64_t ByteSavingsOnDedup() const;

  uint64_t UniqueCount() const {
    return pfcountSingle(counter_);
  }

  double AverageLength() const {
    return total_count ? static_cast<double>(total_bytes) / total_count : 0;
  }
};

}  // namespace dfly


================================================
FILE: src/server/string_stats_test.cc
================================================
// Copyright 2026, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/string_stats.h"

#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>

#include "base/gtest.h"
#include "base/logging.h"
#include "server/test_utils.h"

using namespace testing;

namespace {

std::string GetValue(std::string_view row) {
  static constexpr std::string_view bytes = " bytes";
  auto value = absl::StripAsciiWhitespace(row.substr(row.find(':') + 1));
  if (value.ends_with(bytes))
    value.remove_suffix(bytes.length());
  return {value.begin(), value.end()};
}

}  // namespace

namespace dfly {

class StringStatsTest : public BaseFamilyTest {
 protected:
  struct ParsedBucket {
    uint64_t total_strings = 0;
    uint64_t unique_strings = 0;
    uint64_t total_bytes = 0;
    double average_length = 0;
    uint64_t estimated_savings = 0;
  };

  static std::optional<ParsedBucket> ParseStats(std::string_view output) {
    std::vector<std::string_view> rows = absl::StrSplit(output, "\n", absl::SkipWhitespace());
    for (auto& row : rows)
      row = absl::StripAsciiWhitespace(row);

    auto it = rows.begin();
    while (it != rows.end() && !it->starts_with("Strings"))
      ++it;

    if (it == rows.end())
      return std::nullopt;

    ParsedBucket bucket;
    EXPECT_NE(it, rows.end());
    EXPECT_TRUE(absl::SimpleAtoi(GetValue(*++it), &bucket.total_strings));
    EXPECT_NE(it, rows.end());
    EXPECT_TRUE(absl::SimpleAtoi(GetValue(*++it), &bucket.unique_strings));
    EXPECT_NE(it, rows.end());
    EXPECT_TRUE(absl::SimpleAtoi(GetValue(*++it), &bucket.total_bytes));
    EXPECT_NE(it, rows.end());
    EXPECT_TRUE(absl::SimpleAtod(GetValue(*++it), &bucket.average_length));
    EXPECT_NE(it, rows.end());
    EXPECT_TRUE(absl::SimpleAtoi(GetValue(*++it), &bucket.estimated_savings));
    return bucket;
  }
};

TEST_F(StringStatsTest, HashWithDuplicateFields) {
  for (int i = 0; i < 100; ++i) {
    Run({"HSET", absl::StrCat("user:", i), "name", absl::StrCat("name_", i), "email",
         absl::StrCat("email_", i), "age", absl::StrCat(20 + i)});
  }

  const auto resp = Run({"DEBUG", "UNIQ-STRS"});

  EXPECT_THAT(resp.GetString(), HasSubstr("hash"));

  const auto bucket = ParseStats(resp.GetString());
  EXPECT_TRUE(bucket.has_value());

  EXPECT_EQ(bucket->total_strings, 300);
  EXPECT_LE(bucket->unique_strings, 5);
  EXPECT_GE(bucket->unique_strings, 2);
  EXPECT_GT(bucket->estimated_savings, 0);
}

TEST_F(StringStatsTest, SetWithUniqueMembers) {
  for (int i = 0; i < 10; ++i) {
    Run({"SADD", absl::StrCat("set:", i), absl::StrCat("unique_member_", i, "_a"),
         absl::StrCat("unique_member_", i, "_b"), absl::StrCat("unique_member_", i, "_c")});
  }

  const auto resp = Run({"DEBUG", "UNIQ-STRS"});

  const auto bucket = ParseStats(resp.GetString());
  EXPECT_TRUE(bucket.has_value());

  EXPECT_EQ(bucket->total_strings, 30);
  EXPECT_NEAR(bucket->unique_strings, 30, 3);
  EXPECT_LE(bucket->estimated_savings, bucket->total_bytes * 0.15);
}

TEST_F(StringStatsTest, SetWithDuplicateMembers) {
  for (int i = 0; i < 50; ++i) {
    Run({"SADD", absl::StrCat("set:", i), "alpha", "beta", "gamma"});
  }

  const auto resp = Run({"DEBUG", "UNIQ-STRS"});

  const auto bucket = ParseStats(resp.GetString());
  EXPECT_TRUE(bucket.has_value());

  EXPECT_EQ(bucket->total_strings, 150);
  EXPECT_LE(bucket->unique_strings, 5);
  EXPECT_GE(bucket->unique_strings, 2);
  EXPECT_GT(bucket->estimated_savings, 0);
}

TEST_F(StringStatsTest, MultipleTypes) {
  for (int i = 0; i < 10; ++i) {
    Run({"HSET", absl::StrCat("h:", i), "field", "value"});
    Run({"SADD", absl::StrCat("s:", i), "member"});
  }

  const auto resp = Run({"DEBUG", "UNIQ-STRS"});
  const std::string output = resp.GetString();

  EXPECT_THAT(output, HasSubstr("hash"));
  EXPECT_THAT(output, HasSubstr("set"));
}

TEST_F(StringStatsTest, EmptyDatabase) {
  const auto resp = Run({"DEBUG", "UNIQ-STRS"});
  const std::string output = resp.GetString();

  EXPECT_THAT(output, HasSubstr("___begin unique string stats___"));
  EXPECT_THAT(output, HasSubstr("___end unique string stats___"));

  auto bucket = ParseStats(output);
  EXPECT_FALSE(bucket.has_value());
}

TEST_F(StringStatsTest, NumberKeys) {
  for (int i = 0; i < 100; ++i) {
    Run({"LPUSH", absl::StrCat("h:", i), "007", "value"});
  }

  const auto resp = Run({"DEBUG", "UNIQ-STRS"});
  const std::string output = resp.GetString();

  EXPECT_THAT(output, HasSubstr("list"));
  const auto bucket = ParseStats(output);
  EXPECT_TRUE(bucket.has_value());

  EXPECT_EQ(bucket->total_strings, 200);
  EXPECT_EQ(bucket->unique_strings, 2);
}

}  // namespace dfly


================================================
FILE: src/server/synchronization.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/synchronization.h"

#include "base/logging.h"
#include "server/engine_shard_set.h"
#include "server/server_state.h"

namespace dfly {

ThreadLocalMutex::ThreadLocalMutex() {
  shard_ = EngineShard::tlocal();
}

ThreadLocalMutex::~ThreadLocalMutex() {
  DCHECK_EQ(EngineShard::tlocal(), shard_);
}

void ThreadLocalMutex::lock() {
  if (ServerState::tlocal()->serialization_max_chunk_size != 0) {
    DCHECK_EQ(EngineShard::tlocal(), shard_);
    util::fb2::NoOpLock noop_lk_;
    if (locked_fiber_ != nullptr) {
      DCHECK(util::fb2::detail::FiberActive() != locked_fiber_);
    }
    cond_var_.wait(noop_lk_, [this]() { return !flag_; });
    flag_ = true;
    DCHECK_EQ(locked_fiber_, nullptr);
    locked_fiber_ = util::fb2::detail::FiberActive();
  }
}

void ThreadLocalMutex::unlock() {
  if (ServerState::tlocal()->serialization_max_chunk_size != 0) {
    DCHECK_EQ(EngineShard::tlocal(), shard_);
    flag_ = false;
    cond_var_.notify_one();
    locked_fiber_ = nullptr;
  }
}

void LocalLatch::unlock() {
  DCHECK_GT(mutating_, 0u);
  --mutating_;
  if (mutating_ == 0) {
    cond_var_.notify_all();
  }
}

void LocalLatch::Wait() {
  util::fb2::NoOpLock noop_lk_;
  cond_var_.wait(noop_lk_, [this]() { return mutating_ == 0; });
}

}  // namespace dfly


================================================
FILE: src/server/synchronization.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include "util/fibers/fibers.h"
#include "util/fibers/synchronization.h"

namespace dfly {

class EngineShard;

// Helper class used to guarantee atomicity between serialization of buckets
class ABSL_LOCKABLE ThreadLocalMutex {
 public:
  ThreadLocalMutex();
  ~ThreadLocalMutex();

  void lock() ABSL_EXCLUSIVE_LOCK_FUNCTION();
  void unlock() ABSL_UNLOCK_FUNCTION();
  bool is_locked() const {
    return flag_;
  }

 private:
  EngineShard* shard_;
  util::fb2::CondVarAny cond_var_;
  bool flag_ = false;
  util::fb2::detail::FiberInterface* locked_fiber_{nullptr};
};

// Replacement of std::SharedLock that allows -Wthread-safety
template <typename Mutex> class ABSL_SCOPED_LOCKABLE SharedLock {
 public:
  explicit SharedLock(Mutex& m) ABSL_EXCLUSIVE_LOCK_FUNCTION(m) : m_(m) {
    m_.lock_shared();
    is_locked_ = true;
  }

  ~SharedLock() ABSL_UNLOCK_FUNCTION() {
    if (is_locked_) {
      m_.unlock_shared();
    }
  }

  void unlock() ABSL_UNLOCK_FUNCTION() {
    m_.unlock_shared();
    is_locked_ = false;
  }

 private:
  Mutex& m_;
  bool is_locked_;
};

// A single threaded latch that passes a waiter fiber if its count is 0.
// Fibers that increase/decrease the count do not wait on the latch.
class LocalLatch {
 public:
  void lock() {
    ++mutating_;
  }

  void unlock();

  void Wait();

  bool IsBlocked() const {
    return mutating_ > 0;
  }

 private:
  util::fb2::CondVarAny cond_var_;
  size_t mutating_ = 0;
};

}  // namespace dfly


================================================
FILE: src/server/table.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/table.h"

#include "base/flags.h"
#include "base/logging.h"
#include "core/top_keys.h"
#include "server/cluster_support.h"
#include "server/server_state.h"

using namespace std;
namespace dfly {
#define ADD(x) (x) += o.x

// It should be const, but we override this variable in our tests so that they run faster.
unsigned kInitSegmentLog = 3;

void DbTableStats::AddTypeMemoryUsage(unsigned type, int64_t delta) {
  if (type >= memory_usage_by_type.size()) {
    LOG(DFATAL) << "Encountered unknown type when aggregating per-type memory: " << type;
    return;
  }

  DCHECK_GE(obj_memory_usage, memory_usage_by_type[type]);

  if (delta < 0 && memory_usage_by_type[type] < size_t(-delta)) {
    LOG_EVERY_T(ERROR, 1) << "Encountered underflow memory usage when aggregating per-type memory: "
                          << obj_memory_usage << " + " << delta << ", type: " << type;

    // Truncate delta to avoid underflow, but keep the memory usage consistent with the sum of
    // per-type usage.
    delta = -static_cast<int64_t>(memory_usage_by_type[type]);
  }

  obj_memory_usage += delta;
  memory_usage_by_type[type] += delta;
}

DbTableStats& DbTableStats::operator+=(const DbTableStats& o) {
  constexpr size_t kDbSz = sizeof(DbTableStats) - sizeof(memory_usage_by_type);
  static_assert(kDbSz == 72);

  ADD(inline_keys);
  ADD(expire_count);
  ADD(obj_memory_usage);
  ADD(tiered_entries);
  ADD(tiered_used_bytes);
  ADD(events.hits);
  ADD(events.misses);
  ADD(events.expired_keys);
  ADD(events.evicted_keys);

  for (size_t i = 0; i < o.memory_usage_by_type.size(); ++i) {
    memory_usage_by_type[i] += o.memory_usage_by_type[i];
  }

  return *this;
}

SlotStats& SlotStats::operator+=(const SlotStats& o) {
  static_assert(sizeof(SlotStats) == 32);

  ADD(key_count);
  ADD(total_reads);
  ADD(total_writes);
  ADD(memory_bytes);
  return *this;
}

std::optional<const IntentLock> LockTable::Find(LockTag tag) const {
  LockFp fp = tag.Fingerprint();
  if (auto it = locks_.find(fp); it != locks_.end())
    return it->second;
  return std::nullopt;
}

std::optional<const IntentLock> LockTable::Find(uint64_t fp) const {
  if (auto it = locks_.find(fp); it != locks_.end())
    return it->second;
  return std::nullopt;
}

void LockTable::Release(uint64_t fp, IntentLock::Mode mode) {
  auto it = locks_.find(fp);
  DCHECK(it != locks_.end()) << fp;

  it->second.Release(mode);
  if (it->second.IsFree())
    locks_.erase(it);
}

[[maybe_unused]] constexpr size_t kSzTable = sizeof(DbTable);

DbTable::SampleTopKeys::~SampleTopKeys() {
  delete top_keys;
}

DbTable::SampleUniqueKeys::~SampleUniqueKeys() {
  delete[] dense_hll;
}

DbTable::DbTable(PMR_NS::memory_resource* mr, DbIndex db_index)
    : prime(kInitSegmentLog, detail::PrimeTablePolicy{}, mr),
      mcflag(0, detail::ExpireTablePolicy{}, mr),
      index(db_index) {
  if (IsClusterEnabled()) {
    slots_stats.reset(new SlotStats[kMaxSlotNum + 1]);
  }
  thread_index = ServerState::tlocal()->thread_index();
}

DbTable::~DbTable() {
  DCHECK_EQ(thread_index, ServerState::tlocal()->thread_index());
  delete sample_top_keys;
  delete sample_unique_keys;
}

void DbTable::Clear() {
  prime.size();
  prime.Clear();
  mcflag.Clear();
  stats = DbTableStats{};
}

PrimeIterator DbTable::Launder(PrimeIterator it, string_view key) {
  if (!it.IsOccupied() || it->first != key) {
    it = prime.Find(key);
  }
  return it;
}

}  // namespace dfly


================================================
FILE: src/server/table.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <boost/smart_ptr/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>

#include "core/expire_period.h"
#include "core/intent_lock.h"
#include "server/detail/table.h"
#include "server/tx_base.h"

extern "C" {
#include "redis/redis_aux.h"
}
namespace base {
class Histogram;
}

namespace dfly {

using PrimeKey = detail::PrimeKey;
using PrimeValue = detail::PrimeValue;

using PrimeTable = DashTable<PrimeKey, PrimeValue, detail::PrimeTablePolicy>;
using ExpireTable = DashTable<PrimeKey, ExpirePeriod, detail::ExpireTablePolicy>;

/// Iterators are invalidated when new keys are added to the table or some entries are deleted.
/// Iterators are still valid if a different entry in the table was mutated.
using PrimeIterator = PrimeTable::iterator;
using PrimeConstIterator = PrimeTable::const_iterator;
using ExpireIterator = ExpireTable::iterator;
using ExpireConstIterator = ExpireTable::const_iterator;

class TopKeys;

inline bool IsValid(PrimeIterator it) {
  return !it.is_done();
}

inline bool IsValid(ExpireIterator it) {
  return !it.is_done();
}

inline bool IsValid(PrimeConstIterator it) {
  return !it.is_done();
}

inline bool IsValid(ExpireConstIterator it) {
  return !it.is_done();
}

struct SlotStats {
  uint64_t key_count = 0;
  uint64_t total_reads = 0;
  uint64_t total_writes = 0;
  uint64_t memory_bytes = 0;
  SlotStats& operator+=(const SlotStats& o);
};

struct DbTableStats {
  // Number of inline keys.
  uint64_t inline_keys = 0;

  // number of keys with ttls set.
  uint64_t expire_count = 0;

  // Object memory usage besides hash-table capacity.
  // Applies for any non-inline objects.
  size_t obj_memory_usage = 0;

  size_t tiered_entries = 0;
  size_t tiered_used_bytes = 0;

  struct {
    // Per-database hits/misses on keys
    size_t hits = 0;
    size_t misses = 0;

    // Per-database expired/evicted keys
    size_t expired_keys = 0;
    size_t evicted_keys = 0;
  } events;

  std::array<size_t, OBJ_TYPE_MAX> memory_usage_by_type = {};

  // Mostly used internally, exposed for tiered storage.
  void AddTypeMemoryUsage(unsigned type, int64_t delta);

  DbTableStats& operator+=(const DbTableStats& o);
};

// Table for recording locks. Keys used with the lock table should be normalized with LockTag.
class LockTable {
 public:
  size_t Size() const {
    return locks_.size();
  }
  std::optional<const IntentLock> Find(LockTag tag) const;
  std::optional<const IntentLock> Find(LockFp fp) const;

  bool Acquire(LockFp fp, IntentLock::Mode mode) {
    return locks_[fp].Acquire(mode);
  }

  void Release(LockFp fp, IntentLock::Mode mode);

  auto begin() const {
    return locks_.cbegin();
  }

  auto end() const {
    return locks_.cend();
  }

 private:
  // We use fingerprinting before accessing locks - no need to mix more.
  struct Hasher {
    size_t operator()(LockFp val) const {
      return val;
    }
  };
  absl::flat_hash_map<LockFp, IntentLock, Hasher> locks_;
};

// A single Db table that represents a table that can be chosen with "SELECT" command.
struct DbTable : boost::intrusive_ref_counter<DbTable, boost::thread_unsafe_counter> {
  PrimeTable prime;
  // ExpireTable expire;  // TTL is now embedded in CompactKey via SDS_TTL_TAG.
  DashTable<PrimeKey, uint32_t, detail::ExpireTablePolicy> mcflag;

  // Contains transaction locks
  LockTable trans_locks;

  // Stores a list of dependant dirty flags for each watched key.
  absl::flat_hash_map<std::string, std::vector<std::atomic_bool*>> watched_keys;

  // Keyspace notifications: list of expired keys since last batch of messages was published.
  mutable std::vector<std::string> expired_keys_events_;

  mutable DbTableStats stats;
  std::unique_ptr<SlotStats[]> slots_stats;
  PrimeTable::Cursor expire_cursor;

  struct SampleTopKeys {
    TopKeys* top_keys = nullptr;
    uint64_t total_samples = 0;

    SampleTopKeys() = default;
    ~SampleTopKeys();
    void operator=(const SampleTopKeys& other) = delete;
    SampleTopKeys(const SampleTopKeys& other) = delete;
  };
  SampleTopKeys* sample_top_keys = nullptr;

  struct SampleUniqueKeys {
    uint8_t* dense_hll = nullptr;
    uint64_t total_samples = 0;

    SampleUniqueKeys() = default;
    ~SampleUniqueKeys();

    void operator=(const SampleUniqueKeys& other) = delete;
    SampleUniqueKeys(const SampleUniqueKeys& other) = delete;
  };
  SampleUniqueKeys* sample_unique_keys = nullptr;
  base::Histogram* sample_values_hist = nullptr;

  DbIndex index;
  uint32_t thread_index;

  explicit DbTable(PMR_NS::memory_resource* mr, DbIndex index);
  ~DbTable();

  void Clear();
  PrimeIterator Launder(PrimeIterator it, std::string_view key);

  size_t table_memory() const {
    return prime.mem_usage();
  }
};

// We use reference counting semantics of DbTable when doing snapshotting.
// There we need to preserve the copy of the table in case someone flushes it during
// the snapshot process. We copy the pointers in StartSnapshotInShard function.
using DbTableArray = std::vector<boost::intrusive_ptr<DbTable>>;

// ChangeReq - describes the change to the table.
struct ChangeReq {
  // If iterator is set then it's an update to the existing bucket.
  // Otherwise (string_view is set) then it's a new key that is going to be added to the table.
  std::variant<PrimeTable::bucket_iterator, std::string_view> change;

  explicit ChangeReq(PrimeTable::bucket_iterator it) : change(it) {
  }
  explicit ChangeReq(std::string_view key) : change(key) {
  }

  const PrimeTable::bucket_iterator* update() const {
    return std::get_if<PrimeTable::bucket_iterator>(&change);
  }
};

}  // namespace dfly


================================================
FILE: src/server/test_utils.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/test_utils.h"

#include "server/acl/acl_commands_def.h"
#include "server/acl/acl_family.h"
#include "util/fibers/fibers.h"

extern "C" {
#include "redis/zmalloc.h"
}

#include <absl/flags/reflection.h>
#include <absl/strings/match.h>
#include <absl/strings/str_split.h>
#include <mimalloc.h>

#include "base/flags.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "facade/dragonfly_connection.h"
#include "facade/reply_builder.h"
#include "io/file_util.h"
#include "server/acl/acl_log.h"
#include "util/fibers/pool.h"

using namespace std;

ABSL_DECLARE_FLAG(string, dbfilename);
ABSL_DECLARE_FLAG(double, rss_oom_deny_ratio);
ABSL_DECLARE_FLAG(uint32_t, num_shards);
ABSL_FLAG(bool, force_epoll, false, "If true, uses epoll api instead iouring to run tests");
ABSL_DECLARE_FLAG(uint32_t, acllog_max_len);
ABSL_DECLARE_FLAG(bool, enable_heartbeat_rss_eviction);

namespace dfly {

namespace {

// Default stack size for fibers. We decrease it by 16 bytes because some allocators
// need additional 8-16 bytes for their internal structures, thus over reserving additional
// memory pages if using round sizes.
#ifdef NDEBUG
constexpr size_t kFiberDefaultStackSize = 32_KB - 16;
#elif defined SANITIZERS
// Increase stack size for sanitizers builds.
constexpr size_t kFiberDefaultStackSize = 64_KB - 16;
#else
// Increase stack size for debug builds.
constexpr size_t kFiberDefaultStackSize = 50_KB - 16;
#endif

}  // namespace

std::ostream& operator<<(std::ostream& os, const DbStats& stats) {
  os << "keycount: " << stats.key_count << ", tiered_size: " << stats.tiered_used_bytes
     << ", tiered_entries: " << stats.tiered_entries << "\n";

  return os;
}

extern unsigned kInitSegmentLog;

using MP = MemcacheParser;
using namespace util;
using namespace testing;

static vector<string> SplitLines(const std::string& src) {
  vector<string> res = absl::StrSplit(src, "\r\n");
  if (res.back().empty())
    res.pop_back();
  for (auto& v : res) {
    absl::StripAsciiWhitespace(&v);
  }
  return res;
}

TestConnection::TestConnection(facade::ServiceInterface* si, Protocol protocol)
    : facade::Connection(protocol, nullptr, nullptr, si) {
  cc_.reset(new dfly::ConnectionContext(this, {}));
  static_cast<dfly::ConnectionContext*>(cc_.get())->skip_acl_validation = true;
  SetSocket(ProactorBase::me()->CreateSocket());
  OnConnectionStart();
}

void TestConnection::SendPubMessageAsync(PubMessage pmsg) {
  messages.push_back(std::move(pmsg));
}

void TestConnection::SendInvalidationMessageAsync(InvalidationMessage msg) {
  invalidate_messages.push_back(std::move(msg));
}

std::string TestConnection::RemoteEndpointStr() const {
  return "";
}

void TransactionSuspension::Start() {
  static CommandId cid{"TEST", CO::JOURNALED | CO::GLOBAL_TRANS, -1, 0, 0, acl::NONE};

  transaction_ = new dfly::Transaction{&cid};

  auto st = transaction_->InitByArgs(&namespaces->GetDefaultNamespace(), 0, {});
  CHECK_EQ(st, OpStatus::OK);

  transaction_->Execute([](Transaction* t, EngineShard* shard) { return OpStatus::OK; }, false);
}

void TransactionSuspension::Terminate() {
  transaction_->Conclude();
  transaction_ = nullptr;
}

class BaseFamilyTest::TestConnWrapper {
 public:
  TestConnWrapper(facade::ServiceInterface* si, Protocol proto);
  ~TestConnWrapper();

  CmdArgVec Args(ArgSlice list);

  RespVec ParseResponse(bool fully_consumed);

  // returns: type(pmessage), pattern, channel, message.
  const facade::Connection::PubMessage& GetPubMessage(size_t index) const;

  const facade::Connection::InvalidationMessage& GetInvalidationMessage(size_t index) const;

  ConnectionContext* cmd_cntx() {
    auto cntx = static_cast<ConnectionContext*>(dummy_conn_->cntx());
    cntx->ns = &namespaces->GetDefaultNamespace();
    return cntx;
  }

  StringVec SplitLines() const {
    return dfly::SplitLines(sink_.str());
  }

  void ClearSink() {
    sink_.Clear();
    expr_builder_.Clear();
  }

  TestConnection* conn() {
    return dummy_conn_.get();
  }

  SinkReplyBuilder* builder() {
    return builder_.get();
  }

 private:
  ::io::StringSink sink_;  // holds the response blob

  std::unique_ptr<TestConnection> dummy_conn_;

  std::vector<std::unique_ptr<std::string>> tmp_str_vec_;

  RespExprBuilder expr_builder_;
  std::unique_ptr<SinkReplyBuilder> builder_;
};

BaseFamilyTest::TestConnWrapper::TestConnWrapper(facade::ServiceInterface* si, Protocol proto)
    : dummy_conn_(new TestConnection(si, proto)) {
  switch (proto) {
    case Protocol::REDIS:
      builder_.reset(new RedisReplyBuilder{&sink_});
      break;
    case Protocol::MEMCACHE:
      builder_.reset(new MCReplyBuilder{&sink_});
      break;
  }
}

BaseFamilyTest::TestConnWrapper::~TestConnWrapper() {
}

BaseFamilyTest::BaseFamilyTest() {
}

BaseFamilyTest::~BaseFamilyTest() {
  for (auto* v : resp_vec_)
    delete v;
}

void BaseFamilyTest::SetUpTestSuite() {
  kInitSegmentLog = 1;

  absl::SetFlag(&FLAGS_rss_oom_deny_ratio, -1);
  absl::SetFlag(&FLAGS_dbfilename, "");
  // We don't want rss eviction
  absl::SetFlag(&FLAGS_enable_heartbeat_rss_eviction, false);

  static bool init = true;
  if (exchange(init, false)) {
    fb2::SetDefaultStackResource(&fb2::std_malloc_resource, kFiberDefaultStackSize);
  }

  init_zmalloc_threadlocal(mi_heap_get_backing());

  // TODO: go over all env variables starting with FLAGS_ and make sure they are in the below list.
  static constexpr const char* kEnvFlags[] = {
      "cluster_mode",
      "lock_on_hashtags",
      "force_epoll",
  };
  for (string_view flag : kEnvFlags) {
    const char* value = getenv(absl::StrCat("FLAGS_", flag).data());
    if (value != nullptr) {
      SetTestFlag(flag, value);
    }
  }
}

void BaseFamilyTest::SetUp() {
  max_memory_limit = INT_MAX;
  ResetService();
}

void BaseFamilyTest::TearDown() {
  CHECK_EQ(NumLocked(), 0U);

  {
    std::unique_lock conn_lck{mu_};
    connections_.clear();
  }

  ShutdownService();

  const TestInfo* const test_info = UnitTest::GetInstance()->current_test_info();
  LOG(INFO) << "Finishing " << test_info->name();
}

void BaseFamilyTest::ResetService() {
  if (service_ != nullptr) {
    TEST_InvalidateLockTagOptions();

    ShutdownService();
  }

#ifdef __linux__
  if (absl::GetFlag(FLAGS_force_epoll)) {
    pp_.reset(fb2::Pool::Epoll(num_threads_));
  } else {
    pp_.reset(fb2::Pool::IOUring(16, num_threads_));
  }
#else
  pp_.reset(fb2::Pool::Epoll(num_threads_));
#endif

  // Using a different default than production could expose bugs
  if (absl::GetFlag(FLAGS_num_shards) == 0) {
    absl::SetFlag(&FLAGS_num_shards, num_threads_ - 1);
  }
  pp_->Run();
  service_ = std::make_unique<Service>(pp_.get());

  // Must be reset before starting the service. Engine shard heartbeat task updates this
  // value, and if reset after some invocations of heartbeat have run, the accumulated data is
  // lost and can cause test failure.
  used_mem_current = 0;
  service_->Init(nullptr, {});

  TEST_current_time_ms = absl::GetCurrentTimeNanos() / 1000000;
  auto default_ns = &namespaces->GetDefaultNamespace();
  auto cb = [&](EngineShard* s) {
    default_ns->GetDbSlice(s->shard_id()).UpdateExpireBase(TEST_current_time_ms - 1000, 0);
  };
  shard_set->RunBriefInParallel(cb);

  const TestInfo* const test_info = UnitTest::GetInstance()->current_test_info();
  LOG(INFO) << "Starting " << test_info->name();

  watchdog_fiber_ = pp_->GetNextProactor()->LaunchFiber([this] {
    ThisFiber::SetName("Watchdog");

    if (!watchdog_done_.WaitFor(20s)) {
      LOG(ERROR) << "Deadlock detected!!!!";
      absl::SetFlag(&FLAGS_alsologtostderr, true);
      fb2::Mutex m;
      shard_set->pool()->AwaitFiberOnAll([&m, this](unsigned index, ProactorBase* base) {
        ThisFiber::SetName("Watchdog");
        std::unique_lock lk(m);
        LOG(ERROR) << "Proactor " << index << ":\n";
        fb2::detail::FiberInterface::PrintAllFiberStackTraces();
        EngineShard* es = EngineShard::tlocal();

        if (es != nullptr) {
          TxQueue* txq = es->txq();
          if (!txq->Empty()) {
            LOG(ERROR) << "TxQueue for shard " << es->shard_id();

            auto head = txq->Head();
            auto it = head;
            do {
              Transaction* trans = std::get<Transaction*>(es->txq()->At(it));
              LOG(ERROR) << "Transaction " << trans->DebugId(es->shard_id());
              it = txq->Next(it);
            } while (it != head);
          }

          LOG(ERROR) << "TxLocks for shard " << es->shard_id();
          for (const auto& k_v : namespaces->GetDefaultNamespace()
                                     .GetDbSlice(es->shard_id())
                                     .GetDBTable(0)
                                     ->trans_locks) {
            LOG(ERROR) << "Key " << k_v.first << " " << k_v.second;
          }

          LOG(ERROR) << "Transaction for shard " << es->shard_id();
          std::unique_lock conn_lck{mu_};
          for (auto& conn : connections_) {
            auto* context = conn.second->cmd_cntx();
            if (context->transaction && context->transaction->IsScheduled() &&
                context->transaction->IsActive(es->shard_id())) {
              LOG(ERROR) << context->transaction->DebugId(es->shard_id());
            }
          }
        }
      });
    }
  });
}

void BaseFamilyTest::ShutdownService() {
  if (service_ == nullptr) {
    return;
  }

  // Don't save files during shutdown
  CleanupSnapshots();
  absl::SetFlag(&FLAGS_dbfilename, "");

  service_->Shutdown();
  service_.reset();

  // Stop the watchdog before shutting down the service, because shutdown tears down namespaces
  // which the watchdog's diagnostic code may access. Must run before we delete shard_set as
  // the watchdog accesses it.
  watchdog_done_.Notify();
  watchdog_fiber_.Join();

  delete shard_set;
  shard_set = nullptr;

  pp_->Stop();
}

void BaseFamilyTest::InitWithDbFilename() {
  ShutdownService();

  absl::SetFlag(&FLAGS_dbfilename, "rdbtestdump");
  CleanupSnapshots();
  ResetService();
}

void BaseFamilyTest::CleanupSnapshots() {
  string dbfilename = absl::GetFlag(FLAGS_dbfilename);
  if (dbfilename.empty())
    return;

  auto rdb_files = io::StatFiles(absl::StrCat(dbfilename, "*"));
  CHECK(rdb_files);
  for (const auto& fl : *rdb_files) {
    unlink(fl.name.c_str());
  }
}

unsigned BaseFamilyTest::NumLocked() {
  atomic_uint count = 0;
  auto default_ns = &namespaces->GetDefaultNamespace();
  shard_set->RunBriefInParallel([&](EngineShard* shard) {
    for (const auto& db : default_ns->GetDbSlice(shard->shard_id()).databases()) {
      if (db == nullptr) {
        continue;
      }
      count += db->trans_locks.Size();
    }
  });
  return count;
}

void BaseFamilyTest::ClearMetrics() {
  shard_set->pool()->AwaitBrief([](unsigned, auto*) {
    ServerState::tlocal()->stats = ServerState::Stats(shard_set->size());
  });
}

string BaseFamilyTest::FormatMetrics(const Metrics& metrics) const {
  return service_->server_family().FormatInfoMetrics(metrics, "ALL", true);
}

void BaseFamilyTest::WaitUntilLocked(DbIndex db_index, string_view key, double timeout) {
  auto step = 50us;
  auto timeout_micro = chrono::duration_cast<chrono::microseconds>(1000ms * timeout);
  int64_t steps = timeout_micro.count() / step.count();
  do {
    ThisFiber::SleepFor(step);
  } while (!IsLocked(db_index, key) && --steps > 0);
  CHECK(IsLocked(db_index, key));
}

bool BaseFamilyTest::WaitUntilCondition(std::function<bool()> condition_cb,
                                        std::chrono::milliseconds timeout_ms) {
  auto step = 50us;
  auto timeout_micro = chrono::duration_cast<chrono::microseconds>(timeout_ms);
  int64_t steps = timeout_micro.count() / step.count();
  do {
    ThisFiber::SleepFor(step);
  } while (!condition_cb() && --steps > 0);
  return condition_cb();
}

RespExpr BaseFamilyTest::Run(ArgSlice list) {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] {
      ThisFiber::SetName("Test::Run");
      return this->Run(list);
    });
  }

  return Run(GetId(), list);
}

RespExpr BaseFamilyTest::Run(std::string_view command) {
  std::vector<std::string_view> command_list = absl::StrSplit(command, ' ');
  return Run(command_list);
}

RespExpr BaseFamilyTest::RunPrivileged(std::initializer_list<const std::string_view> list) {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->RunPrivileged(list); });
  }
  string id = GetId();
  TestConnWrapper* conn_wrapper = AddFindConn(Protocol::REDIS, id);
  // Before running the command set the connection as admin connection
  conn_wrapper->conn()->SetPrivileged(true);
  auto res = Run(id, ArgSlice{list.begin(), list.size()});
  // After running the command set the connection as non admin connection
  // because the connction is returned to the poll. This way the next call to Run from the same
  // thread will not have the connection set as admin.
  conn_wrapper->conn()->SetPrivileged(false);
  return res;
}

RespExpr BaseFamilyTest::Run(absl::Span<const std::string> span) {
  vector<string_view> sv_vec(span.size());
  for (unsigned i = 0; i < span.size(); ++i) {
    sv_vec[i] = span[i];
  }
  return Run(sv_vec);
}

RespExpr BaseFamilyTest::Run(std::string_view id, ArgSlice slice) {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->Run(id, slice); });
  }

  TestConnWrapper* conn_wrapper = AddFindConn(Protocol::REDIS, id);

  CmdArgVec args = conn_wrapper->Args(slice);

  ConnectionContext* context = conn_wrapper->cmd_cntx();
  context->ns = &namespaces->GetDefaultNamespace();

  DCHECK(context->transaction == nullptr) << id;
  CommandContext cmd_cntx;
  cmd_cntx.Init(conn_wrapper->builder(), context);
  cmd_cntx.Assign(args.begin(), args.end(), args.size());
  service_->DispatchCommand(ParsedArgs{cmd_cntx}, &cmd_cntx, AsyncPreference::ONLY_SYNC);

  DCHECK(context->transaction == nullptr);

  auto cmd = absl::AsciiStrToUpper(slice.front());
  if (cmd == "EVAL" || cmd == "EVALSHA" || cmd == "EVAL_RO" || cmd == "EVALSHA_RO" ||
      cmd == "EXEC") {
    shard_set->AwaitRunningOnShardQueue([](auto*) {});  // Wait for async UnlockMulti.
  }

  unique_lock lk(mu_);
  last_cmd_dbg_info_ = context->last_command_debug;

  RespVec vec = conn_wrapper->ParseResponse(single_response_);
  if (vec.size() == 1)
    return vec.front();
  RespVec* new_vec = new RespVec(vec);
  resp_vec_.push_back(new_vec);
  RespExpr e;
  e.type = RespExpr::ARRAY;
  e.u = new_vec;

  return e;
}

void BaseFamilyTest::RunMany(const std::vector<std::vector<std::string>>& cmds) {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->RunMany(cmds); });
  }
  TestConnWrapper* conn_wrapper = AddFindConn(Protocol::REDIS, GetId());
  auto* context = conn_wrapper->cmd_cntx();
  context->ns = &namespaces->GetDefaultNamespace();
  vector<cmn::BackedArguments> backed_args_vec(cmds.size());
  for (size_t i = 0; i < cmds.size(); ++i) {
    backed_args_vec[i] = cmn::BackedArguments(cmds[i].begin(), cmds[i].end(), cmds[i].size());
  }
  auto next_fn = [it = backed_args_vec.begin()]() mutable {
    ParsedArgs args(*it);
    ++it;
    return args;
  };
  service_->DispatchManyCommands(next_fn, cmds.size(), conn_wrapper->builder(), context);
  DCHECK(context->transaction == nullptr);
}

auto BaseFamilyTest::RunMC(MP::CmdType cmd_type, string_view key, MCArgs args) -> MCResponse {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->RunMC(cmd_type, key, args); });
  }

  TestConnWrapper* conn = AddFindConn(Protocol::MEMCACHE, GetId());

  CommandContext cmd_cntx{conn->builder(), conn->cmd_cntx()};
  cmd_cntx.ConfigureMCExtension(true);
  auto& cmd = *cmd_cntx.mc_command();
  cmd.type = cmd_type;

  string_view kv[2] = {key, args.value};
  unsigned num_args = MP::IsStoreCmd(cmd_type) ? 2 : 1;
  cmd_cntx.Assign(kv, kv + num_args, num_args);
  cmd.flags = args.val_flags;
  cmd.expire_ts = args.ttl.count();
  cmd.delta = args.delta;
  if (cmd.type >= MP::GET && cmd.type <= MP::GATS) {
    cmd.cmd_flags.return_value = true;
    cmd.cmd_flags.return_flags = true;
    cmd.cmd_flags.return_cas = (cmd.type == MP::GETS || cmd.type == MP::GATS);
  }
  auto* context = conn->cmd_cntx();

  DCHECK(context->transaction == nullptr);

  service_->DispatchMC(&cmd_cntx, AsyncPreference::ONLY_SYNC);

  DCHECK(context->transaction == nullptr);

  return conn->SplitLines();
}

auto BaseFamilyTest::RunMC(MP::CmdType cmd_type, std::string_view key) -> MCResponse {
  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->RunMC(cmd_type, key, MCArgs{}); });
  }

  return RunMC(cmd_type, key, MCArgs{});
}

auto BaseFamilyTest::GetMC(MP::CmdType cmd_type, std::initializer_list<std::string_view> list)
    -> MCResponse {
  CHECK_GT(list.size(), 0u);
  CHECK(base::_in(cmd_type, {MP::GET, MP::GAT, MP::GETS, MP::GATS}));

  if (!ProactorBase::IsProactorThread()) {
    return pp_->at(0)->Await([&] { return this->GetMC(cmd_type, list); });
  }

  TestConnWrapper* conn = AddFindConn(Protocol::MEMCACHE, GetId());

  CommandContext cmd_cntx{conn->builder(), conn->cmd_cntx()};
  cmd_cntx.ConfigureMCExtension(true);
  auto& cmd = *cmd_cntx.mc_command();
  cmd.type = cmd_type;
  auto src = list.begin();
  if (cmd.type == MP::GAT || cmd.type == MP::GATS) {
    CHECK(absl::SimpleAtoi(*src++, &cmd.expire_ts));
  }

  cmd_cntx.Assign(src, list.end(), list.end() - src);
  service_->DispatchMC(&cmd_cntx, AsyncPreference::ONLY_SYNC);

  return conn->SplitLines();
}

int64_t BaseFamilyTest::CheckedInt(ArgSlice list) {
  RespExpr resp = Run(list);
  if (resp.type == RespExpr::INT64) {
    return get<int64_t>(resp.u);
  }
  if (resp.type == RespExpr::NIL) {
    return INT64_MIN;
  }

  CHECK_EQ(RespExpr::STRING, int(resp.type)) << list;
  string_view sv = ToSV(resp.GetBuf());
  int64_t res;
  CHECK(absl::SimpleAtoi(sv, &res)) << "|" << sv << "|";
  return res;
}

string BaseFamilyTest::CheckedString(ArgSlice list) {
  RespExpr resp = Run(list);
  CHECK_EQ(RespExpr::STRING, int(resp.type)) << list;
  return string{ToSV(resp.GetBuf())};
}

CmdArgVec BaseFamilyTest::TestConnWrapper::Args(ArgSlice list) {
  CHECK_NE(0u, list.size());

  CmdArgVec res;
  string* str = new string;

  // I compact all the arguments together on purpose.
  // This way I check that arguments handling works well without c-string endings.
  for (auto v : list) {
    str->append(v);
  }
  tmp_str_vec_.emplace_back(str);
  size_t offset = 0;
  for (auto v : list) {
    if (v.empty()) {
      res.push_back(MutableSlice{});
    } else {
      res.emplace_back(str->data() + offset, v.size());
      offset += v.size();
    }
  }

  return res;
}

RespVec BaseFamilyTest::TestConnWrapper::ParseResponse(bool fully_consumed) {
  tmp_str_vec_.emplace_back(new string{sink_.str()});
  auto& s = *tmp_str_vec_.back();

  RESPParser parser;
  auto obj = parser.Feed(s.data(), s.size());

  CHECK(obj.has_value()) << "Failed to parse response: \"" << s << "\" (" << s.size() << " chars)";

  if (fully_consumed) {
    size_t buf_pos = parser.BufferPos();
    // After parsing, if successful, buf_pos can be 0 when the internal buffer is cleared
    buf_pos = obj && !buf_pos ? s.size() : buf_pos;
    DCHECK_EQ(buf_pos, s.size()) << s;
  }

  // Build expressions from the parsed object. We must consume the RESPObj before
  // freeing it, since BuildExpr copies string data into owned_strings_.
  auto& parsed = *obj;

  // The old RedisParser unwraps top-level arrays: elements go directly into res.
  // We match that behavior here for compatibility with existing tests.
  RespVec res;
  auto type = parsed.GetType();
  if (type == RESPObj::Type::ARRAY || type == RESPObj::Type::MAP || type == RESPObj::Type::SET) {
    auto arr = parsed.As<RESPArray>();
    if (arr.has_value() && arr->Size() != SIZE_MAX) {
      for (size_t i = 0; i < arr->Size(); ++i) {
        res.push_back(expr_builder_.BuildExpr((*arr)[i]));
      }
    } else {
      // Null aggregate (e.g. *-1\r\n) — produce a NIL_ARRAY entry.
      res.push_back(expr_builder_.BuildExpr(parsed));
    }
  } else {
    res.push_back(expr_builder_.BuildExpr(parsed));
  }

  // parsed (RESPObj) goes out of scope here, freeing zmalloc-allocated hiredis
  // reply data on this thread. All needed string data has been copied into
  // expr_builder_.owned_strings_.

  return res;
}

const facade::Connection::PubMessage& BaseFamilyTest::TestConnWrapper::GetPubMessage(
    size_t index) const {
  CHECK_LT(index, dummy_conn_->messages.size());
  return dummy_conn_->messages[index];
}

const facade::Connection::InvalidationMessage&
BaseFamilyTest::TestConnWrapper::GetInvalidationMessage(size_t index) const {
  CHECK_LT(index, dummy_conn_->invalidate_messages.size());
  return dummy_conn_->invalidate_messages[index];
}

bool BaseFamilyTest::IsLocked(DbIndex db_index, std::string_view key) const {
  return service_->IsLocked(&namespaces->GetDefaultNamespace(), db_index, key);
}

string BaseFamilyTest::GetId() const {
  int32 id = ProactorBase::me()->GetPoolIndex();
  CHECK_GE(id, 0);
  return absl::StrCat("IO", id);
}

size_t BaseFamilyTest::SubscriberMessagesLen(string_view conn_id) const {
  auto it = connections_.find(conn_id);
  if (it == connections_.end())
    return 0;

  return it->second->conn()->messages.size();
}

size_t BaseFamilyTest::InvalidationMessagesLen(string_view conn_id) const {
  auto it = connections_.find(conn_id);
  if (it == connections_.end())
    return 0;

  return it->second->conn()->invalidate_messages.size();
}

const facade::Connection::PubMessage& BaseFamilyTest::GetPublishedMessage(string_view conn_id,
                                                                          size_t index) const {
  auto it = connections_.find(conn_id);
  CHECK(it != connections_.end());

  return it->second->GetPubMessage(index);
}

const facade::Connection::InvalidationMessage& BaseFamilyTest::GetInvalidationMessage(
    string_view conn_id, size_t index) const {
  auto it = connections_.find(conn_id);
  CHECK(it != connections_.end());
  return it->second->GetInvalidationMessage(index);
}

ConnectionContext::DebugInfo BaseFamilyTest::GetDebugInfo(const std::string& id) const {
  auto it = connections_.find(id);
  CHECK(it != connections_.end());

  return it->second->cmd_cntx()->last_command_debug;
}

auto BaseFamilyTest::AddFindConn(Protocol proto, std::string_view id) -> TestConnWrapper* {
  DCHECK(ProactorBase::IsProactorThread());

  unique_lock lk(mu_);

  auto [it, inserted] = connections_.emplace(id, nullptr);

  if (inserted) {
    it->second = make_unique<TestConnWrapper>(service_.get(), proto);
  } else {
    it->second->ClearSink();
  }
  return it->second.get();
}

vector<string> BaseFamilyTest::StrArray(const RespExpr& expr) {
  CHECK(expr.type == RespExpr::ARRAY || expr.type == RespExpr::NIL_ARRAY);
  if (expr.type == RespExpr::NIL_ARRAY)
    return vector<string>{};

  const RespVec* src = get<RespVec*>(expr.u);
  vector<string> res(src->size());
  for (size_t i = 0; i < src->size(); ++i) {
    res[i] = ToSV(src->at(i).GetBuf());
  }

  return res;
}

vector<LockFp> BaseFamilyTest::GetLastFps() {
  fb2::Mutex mu;
  vector<LockFp> result;

  auto add_keys = [&](ProactorBase* proactor) {
    EngineShard* shard = EngineShard::tlocal();
    if (shard == nullptr) {
      return;
    }

    lock_guard lk(mu);
    for (auto fp :
         namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).TEST_GetLastLockedFps()) {
      result.push_back(fp);
    }
  };
  shard_set->pool()->AwaitFiberOnAll(add_keys);

  return result;
}

void BaseFamilyTest::ExpectConditionWithinTimeout(const std::function<bool()>& condition,
                                                  absl::Duration timeout) {
  absl::Time deadline = absl::Now() + timeout;

  while (deadline > absl::Now()) {
    if (condition()) {
      break;
    }
    ThisFiber::SleepFor(5ms);
  }

  EXPECT_LE(absl::Now(), deadline)
      << "Timeout of " << timeout << " reached when expecting condition";
}

fb2::Fiber BaseFamilyTest::ExpectConditionWithSuspension(const std::function<bool()>& condition) {
  TransactionSuspension tx;
  pp_->at(0)->Await([&] { tx.Start(); });

  auto fb =
      pp_->at(0)->LaunchFiber(fb2::Launch::dispatch, [condition, tx = std::move(tx)]() mutable {
        ExpectConditionWithinTimeout(condition);
        tx.Terminate();
      });
  return fb;
}

util::fb2::Fiber BaseFamilyTest::ExpectUsedKeys(const std::vector<std::string_view>& keys) {
  vector<LockFp> key_fps;
  for (const auto& k : keys) {
    key_fps.push_back(LockTag(k).Fingerprint());
  }
  sort(key_fps.begin(), key_fps.end());
  auto cb = [=] {
    auto last_fps = GetLastFps();
    sort(last_fps.begin(), last_fps.end());
    return last_fps == key_fps;
  };

  return ExpectConditionWithSuspension(std::move(cb));
}

void BaseFamilyTest::SetTestFlag(string_view flag_name, string_view new_value) {
  auto* flag = absl::FindCommandLineFlag(flag_name);
  CHECK_NE(flag, nullptr);
  VLOG(1) << "Changing flag " << flag_name << " from " << flag->CurrentValue() << " to "
          << new_value;
  string error;
  CHECK(flag->ParseFrom(new_value, &error)) << "Error: " << error;
}

std::map<int, int> BaseFamilyTest::GetShardKeyCount() {
  map<int, int> m;

  auto res = Run({"debug", "shards"});
  for (string_view line : absl::StrSplit(res.GetString(), '\n')) {
    vector<string> parts = absl::StrSplit(line, ": ");
    if (parts.size() != 2) {
      continue;
    }

    string_view k = parts[0];
    if (!absl::StartsWith(k, "shard") || !absl::EndsWith(k, "_key_count")) {
      continue;
    }

    CHECK(absl::ConsumePrefix(&k, "shard")) << k;
    CHECK(absl::ConsumeSuffix(&k, "_key_count")) << k;
    int sid;
    CHECK(absl::SimpleAtoi(k, &sid));
    int count;
    CHECK(absl::SimpleAtoi(parts[1], &count));
    m[sid] = count;
  }
  return m;
}

const acl::AclFamily* BaseFamilyTest::TestInitAclFam() {
  absl::SetFlag(&FLAGS_acllog_max_len, 0);
  return service_->TestInit();
}

}  // namespace dfly


================================================
FILE: src/server/test_utils.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <gmock/gmock.h>

#include <chrono>

#include "facade/dragonfly_connection.h"
#include "facade/memcache_parser.h"
#include "facade/resp_expr_test_utils.h"
#include "facade/resp_parser.h"
#include "io/io.h"
#include "server/conn_context.h"
#include "server/main_service.h"
#include "server/namespaces.h"
#include "server/transaction.h"
#include "util/proactor_pool.h"

namespace dfly {
using namespace facade;
using util::fb2::Fiber;
using util::fb2::Launch;

// Test hook defined in common.cc.
void TEST_InvalidateLockTagOptions();

class TestConnection : public facade::Connection {
 public:
  explicit TestConnection(facade::ServiceInterface* si, Protocol protocol);
  std::string RemoteEndpointStr() const override;

  void SendPubMessageAsync(PubMessage pmsg) final;

  void SendInvalidationMessageAsync(InvalidationMessage msg) final;

  bool IsPrivileged() const override {
    return is_privileged_;
  }
  void SetPrivileged(bool is_privileged) {
    is_privileged_ = is_privileged;
  }

  std::vector<PubMessage> messages;

  std::vector<InvalidationMessage> invalidate_messages;

 private:
  bool is_privileged_ = false;
};

// The TransactionSuspension class is designed to facilitate the temporary suspension of commands
// executions. When the 'start' method is invoked, it enforces the suspension of other
// transactions by acquiring a global shard lock. Conversely, invoking the 'terminate' method
// releases the global shard lock, enabling all transactions in the queue to resume execution.
class TransactionSuspension {
 public:
  void Start();
  void Terminate();

 private:
  boost::intrusive_ptr<dfly::Transaction> transaction_;
};

class BaseFamilyTest : public ::testing::Test {
 protected:
  BaseFamilyTest();
  ~BaseFamilyTest();

  static void SetUpTestSuite();

  void SetUp() override;
  void TearDown() override;

  class TestConnWrapper;

  RespExpr Run(std::initializer_list<const std::string_view> list) {
    return Run(ArgSlice{list.begin(), list.size()});
  }

  // Runs the command in a mocked privileged connection
  // Use for running commands which are allowed only when using admin connection.
  RespExpr RunPrivileged(std::initializer_list<const std::string_view> list);

  RespExpr Run(ArgSlice list);
  RespExpr Run(absl::Span<const std::string> list);

  RespExpr Run(std::string_view id, ArgSlice list);

  RespExpr Run(std::string_view command);

  void RunMany(const std::vector<std::vector<std::string>>& cmds);

  using MCResponse = std::vector<std::string>;

  struct MCArgs {
    std::string_view value;
    uint32_t val_flags;
    std::chrono::seconds ttl;
    uint64_t delta;

    explicit MCArgs(std::string_view v = {}, uint32_t f = 0) : value(v), val_flags(f) {
      ttl = std::chrono::seconds{0};
      delta = 0;
    }

    explicit MCArgs(uint64_t d) : MCArgs() {
      delta = d;
    }
  };

  MCResponse RunMC(MemcacheParser::CmdType cmd_type, std::string_view key, MCArgs args);
  MCResponse RunMC(MemcacheParser::CmdType cmd_type, std::string_view key = std::string_view{});
  MCResponse GetMC(MemcacheParser::CmdType cmd_type, std::initializer_list<std::string_view> list);

  int64_t CheckedInt(std::initializer_list<std::string_view> list) {
    return CheckedInt(ArgSlice{list.begin(), list.size()});
  }
  int64_t CheckedInt(ArgSlice list);
  std::string CheckedString(ArgSlice list);

  void ResetService();

  void ShutdownService();

  void InitWithDbFilename();
  void CleanupSnapshots();

  bool IsLocked(DbIndex db_index, std::string_view key) const;
  ConnectionContext::DebugInfo GetDebugInfo(const std::string& id) const;

  ConnectionContext::DebugInfo GetDebugInfo() const {
    return GetDebugInfo("IO0");
  }

  TestConnWrapper* AddFindConn(Protocol proto, std::string_view id);
  static std::vector<std::string> StrArray(const RespExpr& expr);

  Metrics GetMetrics() const {
    return service_->server_family().GetMetrics(&namespaces->GetDefaultNamespace());
  }

  void ClearMetrics();
  std::string FormatMetrics(const Metrics& metrics) const;

  void AdvanceTime(int64_t ms) {
    TEST_current_time_ms += ms;
  }

  // Wait for a locked key to unlock. Aborts after timeout seconds passed.
  void WaitUntilLocked(DbIndex db_index, std::string_view key, double timeout = 3);

  // Wait until condition_cb returns true or timeout reached. Returns condition_cb value
  bool WaitUntilCondition(std::function<bool()> condition_cb,
                          std::chrono::milliseconds timeout_ms = std::chrono::milliseconds(100));

  std::string GetId() const;
  size_t SubscriberMessagesLen(std::string_view conn_id) const;

  size_t InvalidationMessagesLen(std::string_view conn_id) const;

  const facade::Connection::PubMessage& GetPublishedMessage(std::string_view conn_id,
                                                            size_t index) const;

  const facade::Connection::InvalidationMessage& GetInvalidationMessage(std::string_view conn_id,
                                                                        size_t index) const;

  static std::vector<LockFp> GetLastFps();
  static void ExpectConditionWithinTimeout(const std::function<bool()>& condition,
                                           absl::Duration timeout = absl::Seconds(10));
  util::fb2::Fiber ExpectConditionWithSuspension(const std::function<bool()>& condition);
  util::fb2::Fiber ExpectUsedKeys(const std::vector<std::string_view>& keys);

  static unsigned NumLocked();

  static void SetTestFlag(std::string_view flag_name, std::string_view new_value);

  const acl::AclFamily* TestInitAclFam();

  std::map<int, int> GetShardKeyCount();

  std::unique_ptr<util::ProactorPool> pp_;
  std::unique_ptr<Service> service_;
  unsigned num_threads_ = 3;

  absl::flat_hash_map<std::string, std::unique_ptr<TestConnWrapper>> connections_;
  util::fb2::Mutex mu_;
  ConnectionContext::DebugInfo last_cmd_dbg_info_;

  std::vector<RespVec*> resp_vec_;
  bool single_response_ = true;
  util::fb2::Fiber watchdog_fiber_;
  util::fb2::Done watchdog_done_;
};

std::ostream& operator<<(std::ostream& os, const DbStats& stats);

}  // namespace dfly


================================================
FILE: src/server/tiered_storage.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiered_storage.h"

#include <mimalloc.h>

#include <cstddef>
#include <functional>
#include <memory>
#include <optional>
#include <variant>

#include "absl/cleanup/cleanup.h"
#include "absl/flags/internal/flag.h"
#include "absl/functional/bind_front.h"
#include "absl/functional/overload.h"
#include "base/flag_utils.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/detail/listpack_wrap.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/snapshot.h"
#include "server/table.h"
#include "server/tiering/common.h"
#include "server/tiering/op_manager.h"
#include "server/tiering/serialized_map.h"
#include "server/tiering/small_bins.h"

extern "C" {
#include "redis/listpack.h"
}

using namespace facade;

using AtLeast64 = base::ConstrainedNumericFlagValue<size_t, 64>;  // ABSL_FLAG breaks with commas
ABSL_FLAG(AtLeast64, tiered_min_value_size, 64,
          "Minimum size of values eligible for offloading. Must be at least 64");

ABSL_FLAG(bool, tiered_experimental_cooling, true,
          "If true, uses intermediate cooling layer "
          "when offloading values to storage");

ABSL_FLAG(unsigned, tiered_storage_write_depth, 200,
          "Maximum number of concurrent stash requests issued by background offload");

ABSL_FLAG(float, tiered_offload_threshold, 0.5,
          "Ratio of free memory (free/max memory) below which offloading starts");

ABSL_FLAG(float, tiered_upload_threshold, 0.1,
          "Ratio of free memory (free/max memory) below which uploading stops");

ABSL_FLAG(bool, tiered_experimental_hash_support, false, "Experimental hash datatype offloading");

namespace dfly {

using namespace std;
using namespace util;
using tiering::FragmentRef;
using tiering::KeyRef;
using tiering::TieredCoolRecord;

namespace {

bool OccupiesWholePages(size_t size) {
  return size >= TieredStorage::kMinOccupancySize;
}

// Stashed bins no longer have bin ids, so this sentinel is used to differentiate from regular reads
constexpr auto kFragmentedBin = tiering::SmallBins::kInvalidBin - 1;

// Called after setting new value in place of previous segment
void RecordDeleted(const PrimeValue& pv, size_t tiered_len, DbTableStats* stats) {
  stats->AddTypeMemoryUsage(pv.ObjType(), pv.MallocUsed());
  stats->tiered_entries--;
  stats->tiered_used_bytes -= tiered_len;
}

tiering::DiskSegment FromCoolItem(const PrimeValue::CoolItem& item) {
  return {item.record->page_index * tiering::kPageSize + item.page_offset, item.serialized_size};
}

string SerializeToString(const TieredStorage::StashDescriptor& blobs) {
  size_t est_size = blobs.EstimatedSerializedSize();
  string s(est_size, 0);
  size_t written = blobs.Serialize({reinterpret_cast<uint8_t*>(s.data()), s.size()});
  s.resize(written);
  return s;
}

}  // anonymous namespace

size_t TieredStorage::StashDescriptor::EstimatedSerializedSize() const {
  return visit(
      absl::Overload{[](const array<string_view, 2>& a) { return a[0].size() + a[1].size(); },
                     [](uint8_t* ptr) {
                       detail::ListpackWrap lw{ptr};
                       return tiering::SerializedMap::EstimateSize(lw.UsedBytes(), lw.size());
                     }},
      blob);
};

size_t TieredStorage::StashDescriptor::Serialize(io::MutableBytes buffer) const {
  DCHECK_LE(EstimatedSerializedSize(), buffer.size());

  switch (rep) {
    case CompactObj::ExternalRep::STRING: {
      auto strs = std::get<std::array<std::string_view, 2>>(blob);
      memcpy(buffer.data(), strs[0].data(), strs[0].size());
      if (!strs[1].empty())
        memcpy(buffer.data() + strs[0].size(), strs[1].data(), strs[1].size());
      return strs[0].size() + strs[1].size();
    }
    case CompactObj::ExternalRep::SERIALIZED_MAP: {
      detail::ListpackWrap lw{static_cast<uint8_t*>(std::get<uint8_t*>(blob))};
      return tiering::SerializedMap::Serialize(
          lw, {reinterpret_cast<char*>(buffer.data()), buffer.length()});
    }
  };
  return 0;
}

class TieredStorage::ShardOpManager : public tiering::OpManager {
  friend class TieredStorage;

 public:
  ShardOpManager(TieredStorage* ts, DbSlice* db_slice, size_t max_size)
      : tiering::OpManager{max_size}, ts_{ts}, db_slice_{*db_slice} {
  }

  // Clear Stash pending flag for entry
  void ClearStashPending(OpManager::KeyRef key) {
    UnblockBackpressure(key, false);
    if (auto pv = Find(key.first, key.second); pv) {
      pv->SetStashPending(false);
      stats_.total_cancels++;
    }
  }

  // Clear stash pending flag for all contained entries of bin
  void ClearStashPending(tiering::SmallBins::BinId id) {
    for (const auto& key : ts_->bins_->ReportStashAborted(id))
      ClearStashPending(key);
  }

  DbTableStats* GetDbTableStats(DbIndex dbid) {
    return db_slice_.MutableStats(dbid);
  }

  void DeleteOffloaded(DbIndex dbid, const tiering::DiskSegment& segment);

 private:
  PrimeValue* Find(DbIndex dbid, string_view key) {
    // TODO: Get DbContext for transaction for correct dbid and time
    // Bypass all update and stat mechanisms
    auto it = db_slice_.GetDBTable(dbid)->prime.Find(key);
    return IsValid(it) ? &it->second : nullptr;
  }

  // Load all values from bin by their hashes
  void Defragment(tiering::DiskSegment segment, string_view value);

  void NotifyStashed(const OwnedEntryId& id,
                     const io::Result<tiering::DiskSegment>& segment) override {
    if (!segment) {
      VLOG(1) << "Stash failed " << segment.error().message();
      visit([this](auto id) { ClearStashPending(id); }, id);
    } else {
      visit([this, segment](auto id) { SetExternal(id, *segment); }, id);
    }
  }

  bool NotifyFetched(const OwnedEntryId& id, tiering::DiskSegment segment,
                     tiering::Decoder* decoder) override;

  bool NotifyDelete(tiering::DiskSegment segment) override;

  // If we are low on memory, remove entries from the ColdQueue,
  // and promote their PrimeValues to be fully external.
  void RetireColdEntries(size_t additional_memory);

  // Set value to be an in-memory type again. Update memory stats.
  void Upload(DbIndex dbid, string_view value, PrimeValue* pv) {
    DCHECK(!value.empty());

    switch (pv->GetExternalRep()) {
      case CompactObj::ExternalRep::STRING:
        pv->Materialize(value, true);
        break;
      case CompactObj::ExternalRep::SERIALIZED_MAP:
        tiering::SerializedMapDecoder decoder{};
        decoder.Initialize(value);
        decoder.Upload(pv);
        break;
    };

    RecordDeleted(*pv, value.size(), GetDbTableStats(dbid));
  }

  // Find entry by key in db_slice and store external segment in place of original value.
  // Update memory stats
  void SetExternal(OpManager::KeyRef key, tiering::DiskSegment segment) {
    UnblockBackpressure(key, true);
    if (auto* pv = Find(key.first, key.second); pv) {
      auto* stats = GetDbTableStats(key.first);

      pv->SetStashPending(false);
      stats->tiered_entries++;
      stats->tiered_used_bytes += segment.length;
      stats_.total_stashes++;

      StashDescriptor blobs{FragmentRef{*pv}.GetSerializationDescr()};
      if (ts_->config_.experimental_cooling) {
        RetireColdEntries(pv->MallocUsed());
        ts_->CoolDown(key.first, key.second, segment, blobs.rep, pv);
      } else {
        stats->AddTypeMemoryUsage(pv->ObjType(), -pv->MallocUsed());
        pv->SetExternal(segment.offset, segment.length, blobs.rep);
      }
    } else {
      LOG(DFATAL) << "Should not reach here";
    }
  }

  // Find bin by id and call SetExternal for all contained entries
  void SetExternal(tiering::SmallBins::BinId id, tiering::DiskSegment segment) {
    for (const auto& [sub_dbid, sub_key, sub_segment] : ts_->bins_->ReportStashed(id, segment))
      SetExternal({sub_dbid, sub_key}, sub_segment);
  }

  // If any backpressure (throttling) is active, notify that the operation finished
  void UnblockBackpressure(OpManager::KeyRef id, bool result) {
    if (auto node = ts_->stash_backpressure_.extract(id); !node.empty())
      node.mapped().Resolve(result);
  }

  struct {
    uint64_t total_stashes = 0, total_cancels = 0, total_fetches = 0;
    uint64_t total_defrags = 0;
    uint64_t total_uploads = 0;
  } stats_;

  TieredStorage* ts_;
  DbSlice& db_slice_;
};

void TieredStorage::ShardOpManager::Defragment(tiering::DiskSegment segment, string_view page) {
  // Note: Bin could've already been deleted, in that case DeleteBin returns an empty list
  for (auto [dbid, hash, item_segment] : ts_->bins_->DeleteBin(segment, page)) {
    // Search for key with the same hash and value pointing to the same segment.
    // If it still exists, it must correspond to the value stored in this bin
    auto predicate = [item_segment = item_segment](const PrimeKey& key, const PrimeValue& probe) {
      return probe.IsExternal() && tiering::DiskSegment{probe.GetExternalSlice()} == item_segment;
    };
    auto it = db_slice_.GetDBTable(dbid)->prime.FindFirst(hash, predicate);
    if (!IsValid(it))
      continue;

    // TODO: Handle upload and cooling via type dependent decoders

    stats_.total_defrags++;
    PrimeValue& pv = it->second;
    if (pv.IsCool()) {
      PrimeValue::CoolItem item = pv.GetCool();
      tiering::DiskSegment segment = FromCoolItem(item);

      // We remove it from both cool storage and the offline storage.
      pv = ts_->DeleteCool(item.record);
      auto* stats = GetDbTableStats(dbid);
      stats->tiered_entries--;
      stats->tiered_used_bytes -= segment.length;
    } else {
      // Cut out relevant part of value and restore it to memory
      string_view value = page.substr(item_segment.offset - segment.offset, item_segment.length);
      Upload(dbid, value, &pv);
    }
  }
}

bool TieredStorage::ShardOpManager::NotifyFetched(const OwnedEntryId& id,
                                                  tiering::DiskSegment segment,
                                                  tiering::Decoder* decoder) {
  ++stats_.total_fetches;

  if (id == OwnedEntryId{kFragmentedBin}) {  // Generally we read whole bins only for defrag
    auto* bdecoder = static_cast<tiering::BareDecoder*>(decoder);
    Defragment(segment, bdecoder->slice);
    return true;  // delete
  }

  tiering::Decoder::UploadMetrics metrics = decoder->GetMetrics();

  // 1. When modified is true we MUST upload the value back to memory.
  // 2. On the other hand, if read is caused by snapshotting we do not want to fetch it.
  //    Currently, our heuristic is not very smart, because we stop uploading any reads during
  //    the snapshotting.
  // TODO: to revisit this when we rewrite it with more efficient snapshotting algorithm.
  bool should_upload = metrics.modified;
  should_upload |= (ts_->UploadBudget() > int64_t(metrics.estimated_mem_usage)) &&
                   !SliceSnapshot::IsSnaphotInProgress();

  if (!should_upload)
    return false;

  const auto& key = get<tiering::DbKeyId>(id);
  auto* pv = Find(key.first, key.second);
  if (pv && pv->IsExternal() && segment == pv->GetExternalSlice()) {
    if (metrics.modified || pv->WasTouched()) {
      ++stats_.total_uploads;
      decoder->Upload(pv);
      RecordDeleted(*pv, segment.length, GetDbTableStats(key.first));
      return true;
    }
    pv->SetTouched(true);
    return false;
  }

  LOG(DFATAL) << "Internal error, should not reach this";
  return false;
}

bool TieredStorage::ShardOpManager::NotifyDelete(tiering::DiskSegment segment) {
  DVLOG(2) << "NotifyDelete [" << segment.offset << "," << segment.length << "]";

  if (OccupiesWholePages(segment.length))
    return true;

  auto bin = ts_->bins_->Delete(segment);
  if (bin.empty) {
    return true;
  }

  if (bin.fragmented) {
    // Trigger read to signal need for defragmentation. NotifyFetched will handle it.
    DVLOG(2) << "Enqueueing bin defragmentation for: " << bin.segment.offset;
    Enqueue(kFragmentedBin, bin.segment, tiering::BareDecoder{}, [](auto res) {});
  }

  return false;
}

void TieredStorage::ShardOpManager::RetireColdEntries(size_t additional_memory) {
  int64_t budget = ts_->UploadBudget() - additional_memory;
  if (budget > 0)
    return;

  size_t gained = ts_->ReclaimMemory(-budget);
  VLOG(1) << "Upload budget: " << budget << ", gained " << gained;

  // Update memory_budget directly since we know that gained bytes were released.
  // We will overwrite the budget correctly in the next Hearbeat.
  db_slice_.UpdateMemoryParams(gained + db_slice_.memory_budget(), db_slice_.bytes_per_object());
}

void TieredStorage::ShardOpManager::DeleteOffloaded(DbIndex dbid,
                                                    const tiering::DiskSegment& segment) {
  auto* stats = GetDbTableStats(dbid);
  OpManager::DeleteOffloaded(segment);
  stats->tiered_used_bytes -= segment.length;
  stats->tiered_entries--;
}

TieredStorage::TieredStorage(size_t max_size, DbSlice* db_slice)
    : op_manager_{make_unique<ShardOpManager>(this, db_slice, max_size)},
      bins_{make_unique<tiering::SmallBins>()} {
  UpdateFromFlags();
}

TieredStorage::~TieredStorage() {
}

error_code TieredStorage::Open(string_view base_path) {
  // dts - dragonfly tiered storage.
  string path = absl::StrCat(
      base_path, "-", absl::Dec(ProactorBase::me()->GetPoolIndex(), absl::kZeroPad4), ".dts");
  return op_manager_->Open(path);
}

void TieredStorage::Close() {
  for (auto& [_, f] : stash_backpressure_)
    f.Resolve(false);
  op_manager_->Close();
}

void TieredStorage::ReadInternal(DbIndex dbid, std::string_view key,
                                 const tiering::DiskSegment& segment,
                                 const tiering::Decoder& decoder,
                                 std::function<void(io::Result<tiering::Decoder*>)> cb) {
  // TODO: improve performance by avoiding one more function wrap
  op_manager_->Enqueue(KeyRef(dbid, key), segment, decoder, std::move(cb));
}

void TieredStorage::Stash(DbIndex dbid, string_view key, const StashDescriptor& blobs,
                          BackPressureFuture* backpressure) {
  CHECK(!bins_->IsPending(dbid, key));  // Because has stash pending is false (ShouldStash checks)

  size_t est_size = blobs.EstimatedSerializedSize();
  DCHECK_GT(est_size, 0u);

  tiering::OpManager::PendingId id;
  error_code ec;

  if (OccupiesWholePages(est_size)) {  // large enough for own page
    id = KeyRef(dbid, key);
    auto serialize = absl::bind_front(&StashDescriptor::Serialize, &blobs);
    ec = op_manager_->PrepareAndStash(id, est_size, serialize);
  } else if (auto bin = bins_->Stash(dbid, key, SerializeToString(blobs)); bin) {
    id = bin->id;
    auto serialize = absl::bind_front(&tiering::SmallBins::SerializeBin, bins_.get(), &*bin);
    ec = op_manager_->PrepareAndStash(id, 4_KB, serialize);
  } else {
    return;  // added to bin, no operations pending
  }

  // Set stash pending to false on single value or whole bin
  if (ec) {
    // file_too_large if we reached the limits of the storage,
    // operation_would_block if we need to wait for a file to grow.
    bool to_log = ec != errc::file_too_large && ec != errc::operation_would_block &&
                  ec != errc::operation_in_progress;
    LOG_IF(ERROR, to_log) << "Stash failed: " << ec.message();
    visit([this](auto id) { op_manager_->ClearStashPending(id); }, id);
    return;
  }

  // If we are in the active offloading phase, throttle stashes by providing backpressure future
  if (backpressure && ShouldOffload()) {
    stats_.total_clients_throttled++;
    *backpressure = stash_backpressure_[{dbid, string{key}}];
  }
}

void TieredStorage::Delete(DbIndex dbid, FragmentRef fragment_ref) {
  DCHECK(!fragment_ref.HasStashPending());
  ++stats_.total_deletes;

  tiering::DiskSegment segment = fragment_ref.GetExternalSlice();
  if (auto* cool = fragment_ref.GetCoolRecord(); cool) {
    auto hot = DeleteCool(cool);
    DCHECK_EQ(hot.ObjType(), OBJ_STRING);
  }
  fragment_ref.ClearOffloaded();
  op_manager_->DeleteOffloaded(dbid, segment);
}

void TieredStorage::CancelStash(DbIndex dbid, std::string_view key,
                                tiering::FragmentRef fragment_ref) {
  DCHECK(fragment_ref.HasStashPending());

  // If any previous write was happening, it has been cancelled
  if (auto node = stash_backpressure_.extract(make_pair(dbid, key)); !node.empty())
    std::move(node.mapped()).Resolve(false);

  // TODO: Don't recompute size estimate, try-delete bin first
  StashDescriptor blobs{fragment_ref.GetSerializationDescr()};
  size_t size = blobs.EstimatedSerializedSize();
  if (OccupiesWholePages(size)) {
    op_manager_->CancelPending(KeyRef(dbid, key));
  } else if (auto bin = bins_->Delete(dbid, key); bin) {
    op_manager_->CancelPending(*bin);
  }
  fragment_ref.ClearStashPending();
}

TieredStats TieredStorage::GetStats() const {
  TieredStats stats{};

  {  // ShardOpManager stats
    auto shard_stats = op_manager_->stats_;
    stats.total_fetches = shard_stats.total_fetches;
    stats.total_stashes = shard_stats.total_stashes;
    stats.total_cancels = shard_stats.total_cancels;
    stats.total_defrags = shard_stats.total_defrags;
    stats.total_uploads = shard_stats.total_uploads;
  }

  {  // OpManager stats
    tiering::OpManager::Stats op_stats = op_manager_->GetStats();
    stats.pending_read_cnt = op_stats.pending_read_cnt;
    stats.pending_stash_cnt = op_stats.pending_stash_cnt;
    stats.allocated_bytes = op_stats.disk_stats.allocated_bytes;
    stats.capacity_bytes = op_stats.disk_stats.capacity_bytes;
    stats.total_heap_buf_allocs = op_stats.disk_stats.heap_buf_alloc_count;
    stats.total_registered_buf_allocs = op_stats.disk_stats.registered_buf_alloc_count;
  }

  {  // SmallBins stats
    tiering::SmallBins::Stats bins_stats = bins_->GetStats();
    stats.small_bins_cnt = bins_stats.stashed_bins_cnt;
    stats.small_bins_entries_cnt = bins_stats.stashed_entries_cnt;
    stats.small_bins_filling_bytes = bins_stats.current_bin_bytes;
    stats.small_bins_filling_entries_cnt = bins_stats.current_entries_cnt;
  }

  {  // Own stats
    stats.total_stash_overflows = stats_.stash_overflow_cnt;
    stats.cold_storage_bytes = stats_.cool_memory_used;
    stats.total_offloading_steps = stats_.offloading_steps;
    stats.total_offloading_stashes = stats_.offloading_stashes;
    stats.clients_throttled = stash_backpressure_.size();
    stats.total_clients_throttled = stats_.total_clients_throttled;
  }
  return stats;
}

float TieredStorage::WriteDepthUsage() const {
  return 1.0f * op_manager_->GetStats().pending_stash_cnt / config_.write_depth_limit;
}

void TieredStorage::UpdateFromFlags() {
  config_ = {
      .min_value_size = absl::GetFlag(FLAGS_tiered_min_value_size),
      .experimental_cooling = absl::GetFlag(FLAGS_tiered_experimental_cooling),
      .write_depth_limit = absl::GetFlag(FLAGS_tiered_storage_write_depth),
      .offload_threshold = absl::GetFlag(FLAGS_tiered_offload_threshold),
      .upload_threshold = absl::GetFlag(FLAGS_tiered_upload_threshold),
      .experimental_hash_offload = absl::GetFlag(FLAGS_tiered_experimental_hash_support),
  };
}

std::vector<std::string> TieredStorage::GetMutableFlagNames() {
  return base::GetFlagNames(FLAGS_tiered_min_value_size, FLAGS_tiered_experimental_cooling,
                            FLAGS_tiered_storage_write_depth, FLAGS_tiered_offload_threshold,
                            FLAGS_tiered_upload_threshold, FLAGS_tiered_experimental_hash_support);
}

bool TieredStorage::ShouldOffload() const {
  size_t free_memory = op_manager_->db_slice_.memory_budget();
  size_t per_shard = max_memory_limit.load(memory_order_relaxed) / shard_set->size();
  // Cool values are already offloadeded, so don't count them as used memory
  return (free_memory + CoolMemoryUsage()) < config_.offload_threshold * per_shard;
}

int64_t TieredStorage::UploadBudget() const {
  size_t free_memory = op_manager_->db_slice_.memory_budget();
  size_t per_shard = max_memory_limit.load(memory_order_relaxed) / shard_set->size();
  return int64_t(free_memory) - int64_t(config_.upload_threshold * per_shard);
}

void TieredStorage::RunOffloading(DbIndex dbid) {
  using namespace tiering::literals;
  if (SliceSnapshot::IsSnaphotInProgress())
    return;

  const auto start_cycles = base::CycleClock::Now();

  // Don't run offloading if there's only very little space left
  auto disk_stats = op_manager_->GetStats().disk_stats;
  if (disk_stats.allocated_bytes + 1_MB > disk_stats.max_file_size)
    return;

  string tmp;
  auto cb = [this, dbid, &tmp](PrimeIterator it) mutable {
    stats_.offloading_steps++;
    auto blobs = ShouldStash(it->second);
    if (blobs) {
      if (it->second.WasTouched()) {
        it->second.SetTouched(false);
      } else {
        stats_.offloading_stashes++;
        it->second.SetStashPending(true);
        Stash(dbid, it->first.GetSlice(&tmp), *blobs, nullptr);
      }
    }
  };

  PrimeTable& table = op_manager_->db_slice_.GetDBTable(dbid)->prime;

  // Loop over entry with time and max stash budget.
  uint64_t cycles = 0;
  do {
    offloading_cursor_ = table.TraverseBySegmentOrder(offloading_cursor_, cb);

    if (op_manager_->GetStats().pending_stash_cnt >= config_.write_depth_limit)
      break;

    // TODO: yield as background fiber to perform more work on idle
    cycles = base::CycleClock::Now() - start_cycles;
    if (base::CycleClock::ToUsec(cycles) >= 100)
      break;
  } while (offloading_cursor_);
}

size_t TieredStorage::ReclaimMemory(size_t goal) {
  size_t gained = 0;
  do {
    size_t memory_before = stats_.cool_memory_used;
    TieredCoolRecord* record = PopCool();
    if (record == nullptr)  // nothing to pull anymore
      break;

    gained += memory_before - stats_.cool_memory_used;

    // Find the entry that points to the cool item and externalize it.
    auto predicate = [record](const PrimeKey& key, const PrimeValue& probe) {
      return probe.IsExternal() && probe.IsCool() && probe.GetCool().record == record;
    };

    PrimeIterator it = op_manager_->db_slice_.GetDBTable(record->db_index)
                           ->prime.FindFirst(record->key_hash, predicate);
    CHECK(IsValid(it));
    PrimeValue& pv = it->second;

    // Now the item is only in storage.
    tiering::DiskSegment segment = FromCoolItem(pv.GetCool());
    pv.Freeze(segment.offset, segment.length);

    auto* stats = op_manager_->GetDbTableStats(record->db_index);
    stats->AddTypeMemoryUsage(record->value.ObjType(), -record->value.MallocUsed());
    CompactObj::DeleteMR<TieredCoolRecord>(record);
  } while (gained < goal);

  return gained;
}

auto TieredStorage::ShouldStash(const tiering::FragmentRef& fragment_ref) const
    -> std::optional<StashDescriptor> {
  // Check value state
  if (fragment_ref.IsOffloaded() || fragment_ref.HasStashPending())
    return nullopt;

  // For now, hash offloading is conditional
  if (fragment_ref.ObjType() == OBJ_HASH && !config_.experimental_hash_offload)
    return nullopt;

  // Estimate value size
  StashDescriptor blobs{fragment_ref.GetSerializationDescr()};
  size_t estimated_size = blobs.EstimatedSerializedSize();
  if (estimated_size < config_.min_value_size)
    return nullopt;

  // Limit write depth. TODO: Provide backpressure?
  if (op_manager_->GetStats().pending_stash_cnt >= config_.write_depth_limit) {
    ++stats_.stash_overflow_cnt;
    return {};
  }

  const auto& disk_stats = op_manager_->GetStats().disk_stats;
  if (disk_stats.allocated_bytes + tiering::kPageSize + estimated_size < disk_stats.max_file_size) {
    return blobs;
  }
  return nullopt;
}

void TieredStorage::CoolDown(DbIndex db_ind, std::string_view str,
                             const tiering::DiskSegment& segment, CompactObj::ExternalRep rep,
                             PrimeValue* pv) {
  TieredCoolRecord* record = CompactObj::AllocateMR<TieredCoolRecord>();
  cool_queue_.push_front(*record);
  stats_.cool_memory_used += (sizeof(TieredCoolRecord) + pv->MallocUsed());

  record->key_hash = CompactObj::HashCode(str);
  record->db_index = db_ind;
  record->page_index = segment.offset / tiering::kPageSize;
  record->value = std::move(*pv);

  pv->SetCool(segment.offset, segment.length, rep, record);
}

PrimeValue TieredStorage::Warmup(DbIndex dbid, PrimeValue::CoolItem item) {
  tiering::DiskSegment segment = FromCoolItem(item);

  // We remove it from both cool storage and the offline storage.
  PrimeValue hot = DeleteCool(item.record);
  op_manager_->DeleteOffloaded(dbid, segment);
  return hot;
}

PrimeValue TieredStorage::DeleteCool(TieredCoolRecord* record) {
  auto it = CoolQueue::s_iterator_to(*record);
  cool_queue_.erase(it);

  PrimeValue hot{std::move(record->value)};
  stats_.cool_memory_used -= (sizeof(TieredCoolRecord) + hot.MallocUsed());
  CompactObj::DeleteMR<TieredCoolRecord>(record);
  return hot;
}

TieredCoolRecord* TieredStorage::PopCool() {
  if (cool_queue_.empty())
    return nullptr;

  TieredCoolRecord& res = cool_queue_.back();
  cool_queue_.pop_back();
  stats_.cool_memory_used -= (sizeof(TieredCoolRecord) + res.value.MallocUsed());
  return &res;
}

void StashPrimeValue(DbIndex dbid, std::string_view key, PrimeValue* pv, TieredStorage* ts,
                     BackPressureFuture* backpressure) {
  if (auto blobs = ts->ShouldStash(*pv); blobs) {
    pv->SetStashPending(true);
    ts->Stash(dbid, key, *blobs, backpressure);
  }
}

void ReadTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                function<void(io::Result<string_view>)> readf, TieredStorage* ts) {
  auto cb = [readf = std::move(readf)](io::Result<tiering::StringDecoder*> res) mutable {
    readf(res.transform([](tiering::StringDecoder* d) { return d->GetView(); }));
  };
  ts->Read(dbid, key, value.GetExternalSlice(), tiering::StringDecoder{value}, std::move(cb));
}

template <typename T>
TieredStorage::TResult<T> ModifyTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                                       std::function<T(std::string*)> modf, TieredStorage* ts) {
  DCHECK(value.IsExternal());
  DCHECK_EQ(value.ObjType(), OBJ_STRING);

  util::fb2::Future<io::Result<T>> future;

  auto cb = [future, modf = std::move(modf)](io::Result<tiering::StringDecoder*> res) mutable {
    future.Resolve(res.transform([&modf](auto* d) { return modf(d->Write()); }));
  };
  ts->Read(dbid, key, value.GetExternalSlice(), tiering::StringDecoder{value}, std::move(cb));

  return future;
}

// Instantiate for size_t only - used in string_family's OpExtend.
template TieredStorage::TResult<size_t> ModifyTiered(DbIndex dbid, std::string_view key,
                                                     const PrimeValue& value,
                                                     std::function<size_t(std::string*)> modf,
                                                     TieredStorage* ts);

}  // namespace dfly


================================================
FILE: src/server/tiered_storage.h
================================================
// Copyright 2023, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <absl/container/flat_hash_map.h>

#include <boost/intrusive/list.hpp>
#include <memory>
#include <utility>
#include <vector>

#include "core/tiering_types.h"
#include "io/io.h"  // for io::Result (TODO: replace with nonstd/expected)
#include "server/stats.h"
#include "server/table.h"
#include "server/tiering/common.h"
#include "server/tiering/entry_map.h"
#include "util/fibers/future.h"

namespace dfly {

class DbSlice;

namespace tiering {
class SmallBins;
struct Decoder;
};  // namespace tiering

struct TieredStorageBase {
  // Min sizes of values taking up full page on their own
  const static size_t kMinOccupancySize = tiering::kPageSize / 2;
  struct StashDescriptor : public tiering::FragmentRef::SerializationDescr {
    StashDescriptor() = default;

    StashDescriptor(const tiering::FragmentRef::SerializationDescr& params)  // NOLINT
        : tiering::FragmentRef::SerializationDescr(params) {
    }

    size_t EstimatedSerializedSize() const;
    size_t Serialize(io::MutableBytes buffer) const;
  };

  template <typename T> using TResult = util::fb2::Future<io::Result<T>>;
};

struct TieredDelayedEntry {
  DbIndex dbid;
  PrimeKey key;
  util::fb2::Future<io::Result<std::string>> value;
  time_t expire;
  uint32_t mc_flags;
};

using BackPressureFuture = std::optional<util::fb2::Future<bool>>;

#ifdef WITH_TIERING

// Manages offloaded values
class TieredStorage : public TieredStorageBase {
  class ShardOpManager;

 public:
  explicit TieredStorage(size_t max_file_size, DbSlice* db_slice);
  ~TieredStorage();  // drop forward declared unique_ptrs

  TieredStorage(TieredStorage&& other) = delete;
  TieredStorage(const TieredStorage& other) = delete;

  std::error_code Open(std::string_view path);
  void Close();

  // Enqueue read external value with generic decoder.
  template <typename D, typename F>
  void Read(DbIndex dbid, std::string_view key, const tiering::DiskSegment& segment,
            const D& decoder, F&& f) {
    // TODO(vlad): untangle endless callback wrapping!
    // Templates don't consider implicit conversions, so explicitly convert to std::function
    auto wrapped_cb = [f = std::forward<F>(f)](io::Result<tiering::Decoder*> res) mutable {
      f(res.transform([](auto* d) { return static_cast<D*>(d); }));
    };
    ReadInternal(dbid, key, segment, decoder, wrapped_cb);
  }

  // Returns StashDescriptor if a value should be stashed.
  std::optional<StashDescriptor> ShouldStash(const tiering::FragmentRef& fragment_ref) const;

  // Stash value, returns optional future for backpressure is not null.
  // if `provide_bp` is set and conditions are met.
  void Stash(DbIndex dbid, std::string_view key, const StashDescriptor& blobs,
             BackPressureFuture* backpressure);

  // Delete value, must be offloaded (external type)
  void Delete(DbIndex dbid, tiering::FragmentRef fragment_ref);

  // Cancel pending stash for the fragment, must have HasStashPending() true.
  void CancelStash(DbIndex dbid, std::string_view key, tiering::FragmentRef fragment_ref);

  // Run offloading loop until i/o device is loaded or all entries were traversed
  void RunOffloading(DbIndex dbid);

  // Prune cool entries to reach the set memory goal with freed memory
  size_t ReclaimMemory(size_t goal);

  // Returns the primary value, and deletes the cool item as well as its offloaded storage.
  PrimeValue Warmup(DbIndex dbid, PrimeValue::CoolItem item);

  TieredStats GetStats() const;

  void UpdateFromFlags();  // Update internal values based on current flag values
  static std::vector<std::string> GetMutableFlagNames();  // Triggers UpdateFromFlags

  bool ShouldOffload() const;     // True if below tiered_offload_threshold
  float WriteDepthUsage() const;  // Ratio (0-1) of used storage_write_depth for stashes

  // How much we are above tiered_upload_threshold. Can be negative!
  int64_t UploadBudget() const;
  size_t CoolMemoryUsage() const {
    return stats_.cool_memory_used;
  }

 private:
  void ReadInternal(DbIndex dbid, std::string_view key, const tiering::DiskSegment& segment,
                    const tiering::Decoder& decoder,
                    std::function<void(io::Result<tiering::Decoder*>)> cb);

  // Moves pv contents to the cool storage and updates pv to point to it.
  void CoolDown(DbIndex db_ind, std::string_view str, const tiering::DiskSegment& segment,
                CompactObj::ExternalRep rep, PrimeValue* pv);

  PrimeValue DeleteCool(tiering::TieredCoolRecord* record);
  tiering::TieredCoolRecord* PopCool();

  PrimeTable::Cursor offloading_cursor_;  // where RunOffloading left off

  // Stash operations waiting for completion to throttle
  tiering::EntryMap<::util::fb2::Future<bool>> stash_backpressure_;

  std::unique_ptr<ShardOpManager> op_manager_;
  std::unique_ptr<tiering::SmallBins> bins_;

  using CoolQueue = ::boost::intrusive::list<tiering::TieredCoolRecord>;
  CoolQueue cool_queue_;

  struct {
    size_t min_value_size;
    bool experimental_cooling;
    unsigned write_depth_limit;
    float offload_threshold;
    float upload_threshold;
    bool experimental_hash_offload;
  } config_;

  mutable struct {
    uint64_t stash_overflow_cnt = 0;
    uint64_t total_deletes = 0;
    uint64_t offloading_steps = 0;
    uint64_t offloading_stashes = 0;
    uint64_t total_clients_throttled = 0;
    size_t cool_memory_used = 0;
  } stats_;
};

// Read offloaded value. It must be of external string type
void ReadTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                std::function<void(io::Result<std::string_view>)> readf, TieredStorage* ts);

// Read offloaded value and apply transformation cb on the read result. Returns future of the
// transformed result.
template <typename T>
TieredStorage::TResult<T> ReadTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                                     std::function<T(std::string_view)> cb, TieredStorage* ts) {
  TieredStorage::TResult<T> fut;
  auto read_cb = [fut, cb = std::move(cb)](io::Result<std::string_view> res) mutable {
    fut.Resolve(res.transform([&](std::string_view sv) { return cb(sv); }));
  };
  ReadTiered(dbid, key, value, std::move(read_cb), ts);
  return fut;
}

inline TieredStorage::TResult<std::string> ReadTieredString(DbIndex dbid, std::string_view key,
                                                            const PrimeValue& value,
                                                            TieredStorage* ts) {
  return ReadTiered<std::string>(
      dbid, key, value, [](std::string_view val) { return std::string(val); }, ts);
}

// Reads offloaded value, and applies modifications on it and return generic result from callback.
// Unlike with immutable Reads - the modified value will be uploaded back to memory.
// This is handled by OpManager when modf completes.
template <typename T>
TieredStorage::TResult<T> ModifyTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                                       std::function<T(std::string*)> modf, TieredStorage* ts);

// Stash value if it meets criteria. If the value was stashed and `backpressure` is not nullptr,
// assign/set the backpressure future to `*backpressure`.
void StashPrimeValue(DbIndex dbid, std::string_view key, PrimeValue* pv, TieredStorage* ts,
                     BackPressureFuture* backpressure);
#else

class TieredStorage : public TieredStorageBase {
  class ShardOpManager;

 public:
  explicit TieredStorage(size_t max_size, DbSlice* db_slice) {
  }

  TieredStorage(TieredStorage&& other) = delete;
  TieredStorage(const TieredStorage& other) = delete;

  std::error_code Open(std::string_view path) {
    return {};
  }

  void Close() {
  }

  // Read offloaded value. It must be of external type
  void Read(DbIndex dbid, std::string_view key, const PrimeValue& value,
            std::function<void(io::Result<std::string_view>)> readf) {
  }

  template <typename D, typename F>
  void Read(DbIndex dbid, std::string_view key, const tiering::DiskSegment& value, const D& decoder,
            F&& f) {
  }

  template <typename T>
  TResult<T> Modify(DbIndex dbid, std::string_view key, const PrimeValue& value,
                    std::function<T(std::string*)> modf) {
    return {};
  }

  std::optional<StashDescriptor> ShouldStash(const tiering::FragmentRef& fragment) const {
    return {};
  }

  void Stash(DbIndex dbid, std::string_view key, const StashDescriptor& blobs,
             BackPressureFuture* backpressure) {
  }

  void Delete(DbIndex dbid, PrimeValue* value) {
  }

  size_t ReclaimMemory(size_t goal) {
    return 0;
  }

  float WriteDepthUsage() const {
    return 0;
  }

  size_t CoolMemoryUsage() const {
    return 0;
  }

  void CancelStash(DbIndex dbid, std::string_view key, tiering::FragmentRef fragment_ref) {
  }

  TieredStats GetStats() const {
    return {};
  }

  void RunOffloading(DbIndex dbid) {
  }

  void UpdateFromFlags() {
  }

  static std::vector<std::string> GetMutableFlagNames() {
    return {};
  }

  bool ShouldOffload() const {
    return false;
  }

  int64_t UploadBudget() const {
    return 0;
  }

  PrimeValue Warmup(DbIndex dbid, PrimeValue::CoolItem item) {
    return PrimeValue{};
  }
};

template <typename T>
TieredStorage::TResult<T> ReadTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                                     std::function<T(std::string_view)> cb, TieredStorage* ts) {
  return {};
}

inline void ReadTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                       std::function<void(io::Result<std::string_view>)> readf, TieredStorage* ts) {
}

inline TieredStorage::TResult<std::string> ReadTieredString(DbIndex dbid, std::string_view key,
                                                            const PrimeValue& value,
                                                            TieredStorage* ts) {
  return {};
}

template <typename T>
TieredStorage::TResult<T> ModifyTiered(DbIndex dbid, std::string_view key, const PrimeValue& value,
                                       std::function<T(std::string*)> modf, TieredStorage* ts) {
  return {};
}

inline void StashPrimeValue(DbIndex dbid, std::string_view key, PrimeValue* pv, TieredStorage* ts,
                            BackPressureFuture* backpressure) {
}

#endif  // WITH_TIERING

}  // namespace dfly


================================================
FILE: src/server/tiered_storage_test.cc
================================================
// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiered_storage.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "absl/flags/internal/flag.h"
#include "absl/flags/reflection.h"
#include "base/flags.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/engine_shard_set.h"
#include "server/test_utils.h"
#include "util/fibers/fibers.h"

using namespace std;
using namespace testing;
using namespace util;

ABSL_DECLARE_FLAG(bool, force_epoll);
ABSL_DECLARE_FLAG(string, tiered_prefix);
ABSL_DECLARE_FLAG(float, tiered_offload_threshold);
ABSL_DECLARE_FLAG(float, tiered_upload_threshold);
ABSL_DECLARE_FLAG(unsigned, tiered_storage_write_depth);
ABSL_DECLARE_FLAG(bool, tiered_experimental_cooling);
ABSL_DECLARE_FLAG(uint64_t, registered_buffer_size);
ABSL_DECLARE_FLAG(bool, tiered_experimental_hash_support);

namespace dfly {

using absl::GetFlag;
using absl::SetFlag;

string BuildString(size_t len, char c = 'A') {
  return string(len, c);
}

class TieredStorageTest : public BaseFamilyTest {
 protected:
  TieredStorageTest() {
    num_threads_ = 1;
  }

  void SetUp() override {
    if (GetFlag(FLAGS_force_epoll)) {
      LOG(WARNING) << "Can't run tiered tests on EPOLL";
      exit(0);
    }

    // Disable registered buffers in half of the runs to use only small heap allocated buffers
    // to possibly catch out of bounds reads/writes with sanitizers
    if (absl::InsecureBitGen{}() % 2) {
      SetFlag(&FLAGS_registered_buffer_size, 0);
    }

    SetFlag(&FLAGS_tiered_storage_write_depth, 15000);
    if (GetFlag(FLAGS_tiered_prefix).empty()) {
      SetFlag(&FLAGS_tiered_prefix, "/tmp/tiered_storage_test");
    }

    BaseFamilyTest::SetUp();
  }

  void UpdateFromFlags() {
    pp_->at(0)->AwaitBrief([] { EngineShard::tlocal()->tiered_storage()->UpdateFromFlags(); });
  }
};

// Test that should run with both modes of "cooling"
class LatentCoolingTSTest : public TieredStorageTest, public testing::WithParamInterface<bool> {
  void SetUp() override {
    fs.emplace();
    SetFlag(&FLAGS_tiered_experimental_cooling, GetParam());
    TieredStorageTest::SetUp();
  }

  optional<absl::FlagSaver> fs;
};

INSTANTIATE_TEST_SUITE_P(TS, LatentCoolingTSTest, testing::Values(true, false));

// Disabled cooling and all values are offloaded
class PureDiskTSTest : public TieredStorageTest {
  void SetUp() override {
    fs.emplace();
    SetFlag(&FLAGS_tiered_offload_threshold, 1.0);
    SetFlag(&FLAGS_tiered_experimental_cooling, false);
    TieredStorageTest::SetUp();
  }

  optional<absl::FlagSaver> fs;
};

// Perform simple series of SET, GETSET and GET
TEST_P(LatentCoolingTSTest, SimpleGetSet) {
  absl::FlagSaver saver;
  SetFlag(&FLAGS_tiered_offload_threshold, 0.0f);  // disable offloading
  UpdateFromFlags();

  const int kMin = 256;
  const int kMax = tiering::kPageSize + 10;

  // Perform SETs
  for (size_t i = kMin; i < kMax; i++) {
    Run({"SET", absl::StrCat("k", i), BuildString(i)});
  }

  // Make sure all entries were stashed, except the one not filling a small page
  size_t stashes = 0;
  ExpectConditionWithinTimeout([this, &stashes] {
    stashes = GetMetrics().tiered_stats.total_stashes;
    return stashes >= kMax - kMin - 1;
  });

  // All entries were accounted for except that one (see comment above)
  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.db_stats[0].tiered_entries, kMax - kMin - 1);
  EXPECT_LE(metrics.db_stats[0].tiered_used_bytes, (kMax - 1 + kMin) * (kMax - kMin) / 2 - 2047);

  // Perform GETSETs
  for (size_t i = kMin; i < kMax; i++) {
    auto resp = Run({"GETSET", absl::StrCat("k", i), string(i, 'B')});
    ASSERT_EQ(resp, BuildString(i)) << i;
  }

  // Perform GETs
  for (size_t i = kMin; i < kMax; i++) {
    auto resp = Run({"GET", absl::StrCat("k", i)});
    ASSERT_EQ(resp, string(i, 'B')) << i;
    Run({"GET", absl::StrCat("k", i)});  // To enforce uploads.
  }

  metrics = GetMetrics();
  EXPECT_EQ(metrics.db_stats[0].tiered_entries, 0);
  EXPECT_EQ(metrics.db_stats[0].tiered_used_bytes, 0);
}

TEST_F(TieredStorageTest, IntStrings) {
  absl::FlagSaver saver;
  SetFlag(&FLAGS_tiered_upload_threshold,
          0.0f);  // do not stop uploads based on free-memory threshold (this test does not itself
                  // trigger uploads)
  UpdateFromFlags();

  // STRING object can be encoded as LONG LONG internally
  string short_int_string = BuildString(18, '1');
  Run({"SET", "k1", short_int_string});

  // STRING object is not offloaded due to its small size
  string long_int_string = BuildString(32, '1');
  Run({"SET", "k2", long_int_string});

  // Long STRING object that is offloaded
  string tiered_int_string = BuildString(4096, '1');
  Run({"SET", "k3", tiered_int_string});

  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes == 1; });
}

// Use MGET to load multiple offloaded values
TEST_P(LatentCoolingTSTest, MGET) {
  vector<string> command = {"MGET"}, values = {};
  for (char key = 'A'; key <= 'Z'; key++) {
    command.emplace_back(1, key);
    values.emplace_back(3000, key);
    Run({"SET", command.back(), values.back()});
  }

  ExpectConditionWithinTimeout(
      [this, &values] { return GetMetrics().tiered_stats.total_stashes >= values.size(); });

  auto resp = Run(absl::MakeSpan(command));
  auto elements = resp.GetVec();
  for (size_t i = 0; i < elements.size(); i++)
    EXPECT_EQ(elements[i], values[i]);
}

// Issue many APPEND commands to an offloaded value that are executed at once (with CLIENT PAUSE).
// They should all finish within the same io completion loop.
TEST_F(TieredStorageTest, AppendStorm) {
  const size_t kAppends = 20;

  absl::FlagSaver saver;
  absl::SetFlag(&FLAGS_tiered_offload_threshold, 1.0);
  absl::SetFlag(&FLAGS_tiered_upload_threshold, 0.0);
  absl::SetFlag(&FLAGS_tiered_experimental_cooling, false);
  UpdateFromFlags();

  // Offload single value
  string base_value(4096, 'a');
  Run({"SET", "key", base_value});
  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes == 1; });

  // Accumulate APPENDs
  Run({"CLIENT", "pause", "1000"});
  vector<Fiber> fibs;
  for (size_t i = 0; i < kAppends; i++) {
    fibs.emplace_back(pp_->at(0)->LaunchFiber([this, i] {
      Run(absl::StrCat(i), {"APPEND", "key", string(96, 'b')});
    }));
  }

  // Throw in a SETRANGE
  fibs.emplace_back(pp_->at(0)->LaunchFiber([this] {
    Run("range", {"SETRANGE", "key", "0", string(96, 'x')});
  }));

  // Throw in a GETRANGE to a range that keeps constant
  string get_range;
  fibs.emplace_back(pp_->at(0)->LaunchFiber([this, &get_range] {
    get_range = Run("get", {"GETRANGE", "key", "96", "191"}).GetString();
  }));

  // Unlock and wait
  Run({"CLIENT", "unpause"});
  for (auto& f : fibs)
    f.JoinIfNeeded();

  // Check partial result is right
  EXPECT_EQ(get_range, string(96, 'a'));

  // Get value and verify it
  auto value = Run({"GET", "key"});
  EXPECT_EQ(value, string(96, 'x') + string(4000, 'a') + string(kAppends * 96, 'b'));

  // Check value was read no more than once for APPENDs and once for GET
  auto metrics = GetMetrics();
  EXPECT_LE(metrics.tiered_stats.total_fetches, 2u);
  EXPECT_LE(metrics.tiered_stats.total_uploads, 2u);
}

// SETRANGE and GETRANGE
TEST_P(LatentCoolingTSTest, Ranges) {
  Run({"SET", "key", string(3000, 'a')});
  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes >= 1; });

  Run({"SETRANGE", "key", "1000", string(1000, 'b')});
  auto resp = Run({"GET", "key"});
  EXPECT_EQ(resp, string(1000, 'a') + string(1000, 'b') + string(1000, 'a'));

  Run({"DEL", "key"});
  Run({"SET", "key", string(1500, 'c') + string(1500, 'd')});
  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes >= 2; });

  resp = Run({"GETRANGE", "key", "1000", "1999"});
  EXPECT_EQ(resp, string(500, 'c') + string(500, 'd'));
}

// Stash values from different databases and read them back
TEST_P(LatentCoolingTSTest, MultiDb) {
  for (size_t i = 0; i < 10; i++) {
    Run({"SELECT", absl::StrCat(i)});
    Run({"SET", absl::StrCat("k", i), BuildString(3000, char('A' + i))});
  }

  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes >= 10; });

  for (size_t i = 0; i < 10; i++) {
    Run({"SELECT", absl::StrCat(i)});
    EXPECT_EQ(GetMetrics().db_stats[i].tiered_entries, 1);
    string key = absl::StrCat("k", i);
    EXPECT_EQ(Run({"GET", key}), BuildString(3000, char('A' + i)));
    Run({"GET", key});
    EXPECT_EQ(GetMetrics().db_stats[i].tiered_entries, 0);
  }
}

// Trigger defragmentation
TEST_F(TieredStorageTest, Defrag) {
  for (char k = 'a'; k < 'a' + 8; k++) {
    Run({"SET", string(1, k), string(600, k)});
  }

  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.total_stashes >= 1; });

  // 7 out 8 are in one bin, the last one made if flush and is now filling
  auto metrics = GetMetrics();
  ASSERT_EQ(metrics.tiered_stats.small_bins_cnt, 1u);
  ASSERT_EQ(metrics.tiered_stats.small_bins_entries_cnt, 7u);

  // Distorted due to encoded values.
  ASSERT_EQ(metrics.tiered_stats.small_bins_filling_bytes, 537);

  // Reading 3 values still leaves the bin more than half occupied
  for (unsigned j = 0; j < 2; ++j) {
    Run({"GET", string(1, 'a')});
    Run({"GET", string(1, 'b')});
    Run({"GET", string(1, 'c')});
  }
  metrics = GetMetrics();
  EXPECT_EQ(metrics.tiered_stats.small_bins_cnt, 1u);
  EXPECT_EQ(metrics.tiered_stats.small_bins_entries_cnt, 4u);

  // This tirggers defragmentation, as only 3 < 7/2 remain left
  Run({"GET", string(1, 'd')});

  // Wait that any reads caused by defrags has been finished.
  ExpectConditionWithinTimeout([this] { return GetMetrics().tiered_stats.pending_read_cnt == 0; });
  metrics = GetMetrics();
  EXPECT_EQ(metrics.tiered_stats.total_defrags, 3u);
  EXPECT_EQ(metrics.tiered_stats.small_bins_cnt, 0u);
  EXPECT_EQ(metrics.tiered_stats.allocated_bytes, 0u);
}

TEST_F(PureDiskTSTest, BackgroundOffloading) {
  absl::FlagSaver saver;
  SetFlag(&FLAGS_tiered_upload_threshold, 0.0f);  // upload all values
  UpdateFromFlags();

  const int kNum = 500;

  max_memory_limit = kNum * 4096;

  // Stash all values
  string value = BuildString(3000);
  for (size_t i = 0; i < kNum; i++) {
    Run({"SETEX", absl::StrCat("k", i), "100", value});
  }

  ExpectConditionWithinTimeout([&] { return GetMetrics().db_stats[0].tiered_entries == kNum; });
  ASSERT_EQ(GetMetrics().tiered_stats.total_stashes, kNum);
  ASSERT_EQ(GetMetrics().db_stats[0].tiered_entries, kNum);

  // Trigger re-fetch and test TTL is preserved.
  for (size_t i = 0; i < kNum; i++) {
    string key = absl::StrCat("k", i);
    auto resp = Run({"TTL", key});
    EXPECT_THAT(resp, IntArg(100));

    resp = Run({"GET", key});
    EXPECT_EQ(resp, value);
    resp = Run({"TTL", key});
    EXPECT_THAT(resp, IntArg(100));
    Run({"GET", key});  // enforce uploads
  }

  // Wait for offload to do it all again
  ExpectConditionWithinTimeout([&] { return GetMetrics().db_stats[0].tiered_entries == kNum; });
  auto resp = Run({"INFO", "ALL"});
  VLOG(1) << "INFO " << resp.GetString();
  auto metrics = GetMetrics();

  // Not all values were necessary uploaded during GET calls, but all that were uploaded
  // should be re-stashed again.
  EXPECT_EQ(metrics.tiered_stats.total_stashes, kNum + metrics.tiered_stats.total_uploads)
      << resp.GetString();
  EXPECT_EQ(metrics.tiered_stats.allocated_bytes, kNum * 4096);
}

// Verify correctness of our offloading startegy, offloading values only after second access.
TEST_F(PureDiskTSTest, OffloadingStrategy) {
  // Create value and wait to be offlaoded
  string value = BuildString(3000);
  Run({"set", "key", value});
  ExpectConditionWithinTimeout([&] { return GetMetrics().db_stats[0].tiered_entries == 1; });

  // Check base values
  auto metrics = GetMetrics();
  EXPECT_EQ(metrics.tiered_stats.total_fetches, 0);
  EXPECT_EQ(metrics.tiered_stats.total_uploads, 0);
  EXPECT_EQ(metrics.tiered_stats.total_stashes, 1);

  // Repeat a few times
  for (size_t i = 1; i <= 3; i++) {
    // Value is not uploaded after first read
    Run({"get", "key"});
    metrics = GetMetrics();
    EXPECT_EQ(metrics.tiered_stats.total_fetches, 2 * i - 1);
    EXPECT_EQ(metrics.tiered_stats.total_uploads, i - 1);

    // But on second read upload should happend at the end of chain due to two touches
    Run({"get", "key"});
    ExpectConditionWithinTimeout([&] { return GetMetrics().tiered_stats.total_uploads == i; });
    metrics = GetMetrics();
    EXPECT_EQ(metrics.tiered_stats.total_fetches, 2 * i);

    // Wait for offloading again
    ExpectConditionWithinTimeout([&] { return GetMetrics().db_stats[0].tiered_entries == 1; });
    metrics = GetMetrics();
    EXPECT_EQ(metrics.tiered_stats.total_offloading_stashes, i);
    EXPECT_EQ(metrics.tiered_stats.total_stashes, i + 1);
  }
}

// Test FLUSHALL while reading entries
TEST_F(PureDiskTSTest, FlushAll) {
  const int kNum = 500;
  for (size_t i = 0; i < kNum; i++) {
    Run({"SET", absl::StrCat("k", i), BuildString(3000)});
  }
  ExpectConditionWithinTimeout([&] { return GetMetrics().db_stats[0].tiered_entries == kNum; });

  // Start reading random entries
  atomic_bool done = false;
  auto reader = pp_->at(0)->LaunchFiber([&] {
    while (!done) {
      Run("reader", {"GET", absl::StrCat("k", rand() % kNum)});
      util::ThisFiber::Yield();
    }
  });

  Metrics metrics;
  ExpectConditionWithinTimeout([&] {
    metrics = GetMetrics();

    // Note that metrics.events.hits is not consistent with total_fetches
    // and it can happen that hits is greater than total_fetches due to in-progress reads.
    return metrics.tiered_stats.total_fetches > 2;
  });
  LOG(INFO) << FormatMetrics(metrics);

  Run({"FLUSHALL"});

  done = true;
  util::ThisFiber::SleepFor(100ms);
  reader.Join();

  metrics = GetMetrics();
  LOG(INFO) << FormatMetrics(metrics);

  EXPECT_EQ(metrics.db_stats.front().tiered_entries, 0u);
}

// Check FLUSHALL clears filling bytes of small bins
TEST_F(TieredStorageTest, FlushPending) {
  absl::FlagSaver saver;
  SetFlag(&FLAGS_tiered_offload_threshold, 1.0f);  // offload all values

  const int kNum = 10;
  for (size_t i = 0; i < kNum; i++) {
    Run({"SET", absl::StrCat("k", i), BuildString(256)});
  }
  ExpectConditionWithinTimeout(
      [&] { return GetMetrics().tiered_stats.small_bins_filling_bytes > 0; });
  Run({"FLUSHALL"});
  EXPECT_EQ(GetMetrics().tiered_stats.small_bins_filling_bytes, 0u);
}

// Test that clients are throttled if many stashes are issued.
// Stashes are released with CLIENT UNPAUSE to occur at the same time
TEST_F(PureDiskTSTest, ThrottleClients) {
  absl::FlagSaver saver;
  absl::SetFlag(&FLAGS_tiered_upload_threshold, 0.0);
  UpdateFromFlags();

  // issue client pause to accumualte SETs
  Run({"CLIENT", "PAUSE", "1000"});

  string value(4096, 'a');
  vector<Fiber> fibs;
  for (size_t i = 0; i < 100; i++) {
    fibs.emplace_back(pp_->at(0)->LaunchFiber([this, i, &value] {
      string key = absl::StrCat("k", i);
      Run(key, {"SET", key, value});
    }));
  }
  ThisFiber::Yield();

  // Unpause
  Run({"CLIENT", "UNPAUSE"});

  // Check if at least some of the clients were caugth throttling
  // but we provided backpressure for all of them
  auto metrics = GetMetrics();
  EXPECT_GT(metrics.tiered_stats.clients_throttled, fibs.size() / 10);
  EXPECT_EQ(metrics.tiered_stats.total_clients_throttled, fibs.size());

  for (auto& fib : fibs)
    fib.JoinIfNeeded();

  // Because of the 5ms max wait time for backpressure, we can't rely on the stashes to have
  // finished even after all the fibers joined, so expect the condition with a timeout
  ExpectConditionWithinTimeout(
      [&] { return GetMetrics().tiered_stats.total_stashes == fibs.size(); });
}

TEST_F(TieredStorageTest, Expiry) {
  string val = BuildString(100);
  Run({"psetex", "key1", "1", val});
  AdvanceTime(10);
  Run({"psetex", "key1", "1", val});
  auto resp = Run({"get", "key1"});
  EXPECT_EQ(resp, val);
}

TEST_F(PureDiskTSTest, SetExistingExpire) {
  const int kNum = 20;
  for (size_t i = 0; i < kNum; i++) {
    Run({"SETEX", absl::StrCat("k", i), "100", BuildString(256)});
  }
  ExpectConditionWithinTimeout([&] { return GetMetrics().tiered_stats.total_stashes > 1; });

  for (size_t i = 0; i < kNum; i++) {
    Run({"SETEX", absl::StrCat("k", i), "100", BuildString(256)});
  }

  for (size_t i = 0; i < kNum; i++) {
    auto resp = Run({"TTL", absl::StrCat("k", i)});
    EXPECT_THAT(resp, IntArg(100));
  }
}

TEST_F(PureDiskTSTest, Dump) {
  const int kNum = 10;
  for (size_t i = 0; i < kNum; i++) {
    Run({"SET", absl::StrCat("k", i), BuildString(3000)});  // big enough to trigger offloading.
  }

  ExpectConditionWithinTimeout([&] { return GetMetrics().tiered_stats.total_stashes == kNum; });

  auto resp = Run({"DUMP", "k0"});
  EXPECT_THAT(Run({"del", "k0"}), IntArg(1));
  resp = Run({"restore", "k0", "0", facade::ToSV(resp.GetBuf())});
  EXPECT_EQ(resp, "OK");
}

TEST_P(LatentCoolingTSTest, SimpleHash) {
  absl::FlagSaver saver;
  absl::SetFlag(&FLAGS_tiered_experimental_hash_support, true);
  // For now, never upload as its not implemented yet
  absl::SetFlag(&FLAGS_tiered_upload_threshold, 0.0);
  UpdateFromFlags();

  static constexpr size_t kNUM = 100;

  auto build_command = [](string_view key) {
    vector<string> cmd = {"HSET", string{key}};
    for (char c = 'a'; c <= 'z'; c++) {
      cmd.push_back(string{1, c});
      cmd.push_back(string{31, 'x'} + c);
    }
    return cmd;
  };

  // Create some hashes
  for (size_t i = 0; i < kNUM; i++) {
    Run(build_command(absl::StrCat("k", i)));
  }

  // Wait for all to be stashed or in end up in bins
  ExpectConditionWithinTimeout([this] {
    auto metrics = GetMetrics();
    return metrics.tiered_stats.total_stashes +
               metrics.tiered_stats.small_bins_filling_entries_cnt ==
           kNUM;
  });

  // Verify correctness
  for (size_t i = 0; i < kNUM; i++) {
    string key = absl::StrCat("k", i);
    EXPECT_THAT(Run({"HLEN", key}), IntArg(26));

    auto resp = Run({"HGET", key, string{1, 'f'}});
    auto v = string{31, 'x'} + 'f';
    EXPECT_EQ(resp, v);
  }
}

}  // namespace dfly


================================================
FILE: src/server/tiering/CMakeLists.txt
================================================
# Minimum set needed for successful compilation
if(NOT WITH_TIERING)
    add_library(dfly_tiering decoders.cc serialized_map.cc)
    target_link_libraries(dfly_tiering dfly_transaction dfly_facade redis_lib base io)
    return()
endif()

add_library(dfly_tiering
    decoders.cc disk_storage.cc external_alloc.cc
    op_manager.cc serialized_map.cc small_bins.cc)
target_link_libraries(dfly_tiering
    dfly_transaction dfly_facade redis_lib base io)


helio_cxx_test(disk_storage_test dfly_test_lib LABELS DFLY)
helio_cxx_test(external_alloc_test dfly_test_lib LABELS DFLY)
helio_cxx_test(op_manager_test dfly_test_lib LABELS DFLY)
helio_cxx_test(serialized_map_test dfly_test_lib LABELS DFLY)
helio_cxx_test(small_bins_test dfly_test_lib LABELS DFLY)

add_dependencies(check_dfly disk_storage_test external_alloc_test op_manager_test serialized_map_test small_bins_test)


================================================
FILE: src/server/tiering/common.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <optional>
#include <variant>

namespace dfly::tiering {

inline namespace literals {

constexpr inline unsigned long long operator""_MB(unsigned long long x) {
  return x << 20U;
}

constexpr inline unsigned long long operator""_KB(unsigned long long x) {
  return x << 10U;
}

}  // namespace literals

constexpr size_t kPageSize = 4_KB;

// Location on the offloaded blob, measured in bytes
struct DiskSegment {
  DiskSegment() = default;
  DiskSegment(size_t offset, size_t length) : offset{offset}, length{length} {
  }
  DiskSegment(std::pair<size_t, size_t> p) : offset{p.first}, length(p.second) {
  }

  bool operator==(const DiskSegment& other) const {
    return offset == other.offset && length == other.length;
  }

  DiskSegment ContainingPages() const {
    return {offset / kPageSize * kPageSize, (length + kPageSize - 1) / kPageSize * kPageSize};
  }

  size_t offset = 0, length = 0;

  friend std::ostream& operator<<(std::ostream& os, const DiskSegment& ds) {
    return os << "[" << ds.offset << ", " << ds.length << "]";
  }
};

using KeyRef = std::pair<uint16_t /* DbIndex */, std::string_view>;

// Two separate keyspaces are provided - one for strings, one for numeric identifiers.
// Ids can be used to track auxiliary values that don't map to real keys (like a page index).
// Specifically, we track page indexes when serializing small-bin pages with multiple items.
using PendingId = std::variant<uintptr_t, KeyRef>;

};  // namespace dfly::tiering


================================================
FILE: src/server/tiering/decoders.cc
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/decoders.h"

#include "base/logging.h"
#include "core/detail/listpack_wrap.h"
#include "server/tiering/serialized_map.h"

extern "C" {
#include "redis/redis_aux.h"  // for OBJ_HASH
}

namespace dfly::tiering {

std::unique_ptr<Decoder> BareDecoder::Clone() const {
  return std::make_unique<BareDecoder>();
}

void BareDecoder::Initialize(std::string_view slice) {
  this->slice = slice;
}

void BareDecoder::Upload(CompactObj* obj) {
  ABSL_UNREACHABLE();
}

Decoder::UploadMetrics BareDecoder::GetMetrics() const {
  ABSL_UNREACHABLE();
  return UploadMetrics{};
}

StringDecoder::StringDecoder(const CompactObj& obj) : StringDecoder{obj.GetStrEncoding()} {
}

StringDecoder::StringDecoder(CompactObj::StrEncoding encoding) : encoding_{encoding} {
}

std::unique_ptr<Decoder> StringDecoder::Clone() const {
  return std::unique_ptr<StringDecoder>{new StringDecoder(encoding_)};
}

void StringDecoder::Initialize(std::string_view slice) {
  slice_ = slice;
  value_ = encoding_.Decode(slice);
}

void StringDecoder::Upload(CompactObj* obj) {
  if (modified_)
    obj->Materialize(value_.view(), false);
  else
    obj->Materialize(slice_, true);
}

Decoder::UploadMetrics StringDecoder::GetMetrics() const {
  return UploadMetrics{
      .modified = modified_,
      .estimated_mem_usage = value_.view().size(),
  };
}

std::string* StringDecoder::Write() {
  modified_ = true;
  return value_.GetMutable();
}

std::unique_ptr<Decoder> SerializedMapDecoder::Clone() const {
  return std::make_unique<SerializedMapDecoder>();
}

void SerializedMapDecoder::Initialize(std::string_view slice) {
  map_ = std::make_unique<SerializedMap>(slice);
}

Decoder::UploadMetrics SerializedMapDecoder::GetMetrics() const {
  return UploadMetrics{.modified = false,
                       .estimated_mem_usage = map_->DataBytes() + map_->size() * 2 * 8};
}

void SerializedMapDecoder::Upload(CompactObj* obj) {
  auto lw = detail::ListpackWrap::WithCapacity(GetMetrics().estimated_mem_usage);
  for (const auto& [key, value] : *map_)
    lw.Insert(key, value, true);
  obj->InitRobj(OBJ_HASH, kEncodingListPack, lw.GetPointer());
}

SerializedMap* SerializedMapDecoder::Get() const {
  return map_.get();
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/decoders.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>
#include <optional>
#include <string>
#include <string_view>

#include "core/compact_object.h"

namespace dfly::tiering {

struct SerializedMap;

// Decodes serialized value and provides it to callbacks.
// Acts as generic interface to callback driver (OpManager)
struct Decoder {
  struct UploadMetrics {
    bool modified;               // whether the value as modified
    size_t estimated_mem_usage;  // Estimated memory usage if uploaded
  };

  virtual ~Decoder() = default;

  // Poor man's type-erasure copy
  virtual std::unique_ptr<Decoder> Clone() const = 0;

  // Initialize decoder from slice
  virtual void Initialize(std::string_view slice) = 0;

  // Compute upload metrics to determine if its worth
  virtual UploadMetrics GetMetrics() const = 0;

  // Store value in compact object
  virtual void Upload(CompactObj* obj) = 0;
};

// Basic "bare" decoder that just stores the provided slice
struct BareDecoder : public Decoder {
  std::unique_ptr<Decoder> Clone() const override;
  void Initialize(std::string_view slice) override;
  UploadMetrics GetMetrics() const override;
  void Upload(CompactObj* obj) override;

  std::string_view slice;
};

// Decodes string value with objects StrEncoding
struct StringDecoder : public Decoder {
  explicit StringDecoder(const CompactObj& obj);

  std::unique_ptr<Decoder> Clone() const override;
  void Initialize(std::string_view slice) override;
  UploadMetrics GetMetrics() const override;
  void Upload(CompactObj* obj) override;

  std::string_view GetView() const {
    return value_.view();
  }

  std::string* Write();

 private:
  explicit StringDecoder(CompactObj::StrEncoding encoding);

  bool modified_ = false;
  std::string_view slice_;
  CompactObj::StrEncoding encoding_;
  dfly::StringOrView value_;
};

// Decodes SerializedMaps
struct SerializedMapDecoder : public Decoder {
  std::unique_ptr<Decoder> Clone() const override;
  void Initialize(std::string_view slice) override;
  UploadMetrics GetMetrics() const override;
  void Upload(CompactObj* obj) override;

  SerializedMap* Get() const;

 private:
  std::unique_ptr<SerializedMap> map_;
};

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/disk_storage.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/disk_storage.h"

#include <system_error>

#include "base/flags.h"
#include "base/logging.h"
#include "io/io_buf.h"
#include "server/error.h"
#include "server/tiering/common.h"
#include "server/tiering/external_alloc.h"
#include "util/fibers/uring_file.h"
#include "util/fibers/uring_proactor.h"

using namespace ::dfly::tiering::literals;

ABSL_FLAG(bool, backing_file_direct, true, "If true uses O_DIRECT to open backing files");

ABSL_FLAG(uint64_t, registered_buffer_size, 512_KB,
          "Size of registered buffer for IoUring fixed read/writes");

namespace dfly::tiering {

using namespace std;
using namespace ::util::fb2;

namespace {

constexpr unsigned kHeapSliceId = UINT_MAX;

RegisteredSlice AllocateTmpBuf(size_t size) {
  size = (size + kPageSize - 1) / kPageSize * kPageSize;
  VLOG(2) << "Fallback to temporary allocation: " << size;

  uint8_t* buf = new (align_val_t(kPageSize)) uint8_t[size];
  return RegisteredSlice{{buf, size}, kHeapSliceId};
}

void DestroyTmpBuf(RegisteredSlice buf) {
  DCHECK_EQ(buf.buf_idx, kHeapSliceId);
  ::operator delete[](buf.bytes.data(), align_val_t(kPageSize));
}

void ReturnBuf(RegisteredSlice buf) {
  DCHECK_EQ(ProactorBase::me()->GetKind(), ProactorBase::IOURING);
  auto* up = static_cast<UringProactor*>(ProactorBase::me());

  if (buf.buf_idx != kHeapSliceId)
    up->ReturnRegisteredSlice(buf);
  else
    DestroyTmpBuf(buf);
}

constexpr off_t kInitialSize = 1UL << 28;  // 256MB

template <typename... Ts> error_code DoFiberCall(void (SubmitEntry::*c)(Ts...), Ts... args) {
  auto* proactor = static_cast<UringProactor*>(ProactorBase::me());
  FiberCall fc(proactor);
  (fc.operator->()->*c)(std::forward<Ts>(args)...);
  FiberCall::IoResult io_res = fc.Get();
  return io_res < 0 ? error_code{-io_res, system_category()} : error_code{};
}

}  // anonymous namespace

DiskStorage::DiskStorage(size_t max_size) : max_size_(max_size) {
}

DiskStorage::~DiskStorage() {
}

error_code DiskStorage::Open(string_view path) {
  DCHECK_EQ(ProactorBase::me()->GetKind(), ProactorBase::IOURING);
  CHECK(!backing_file_);

  int kFlags = O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC;
  if (absl::GetFlag(FLAGS_backing_file_direct))
    kFlags |= O_DIRECT;

  backing_file_path_ = path;
  auto res = OpenLinux(path, kFlags, 0666);
  if (!res)
    return res.error();
  backing_file_ = std::move(res.value());

  int fd = backing_file_->fd();

  auto ec = DoFiberCall(&SubmitEntry::PrepFallocate, fd, 0, 0L, kInitialSize);
  VLOG_IF(1, ec) << "Fallocate not supported";

  RETURN_ON_ERR(DoFiberCall(&SubmitEntry::PrepFadvise, fd, 0L, 0L, POSIX_FADV_RANDOM));

  alloc_.AddStorage(0, kInitialSize);

  // TODO(vlad): Even though this is called only once for regular use,
  // the testing code runs this initializer every time, never unregistering previous buffers
  auto* up = static_cast<UringProactor*>(ProactorBase::me());
  auto registered_buffer_size = absl::GetFlag(FLAGS_registered_buffer_size);
  if (registered_buffer_size > 0) {
    if (int io_res = up->RegisterBuffers(registered_buffer_size); io_res < 0)
      return error_code{-io_res, system_category()};
  }
  return {};
}

void DiskStorage::Close() {
  using namespace chrono_literals;

  // TODO: to fix this polling.
  while (pending_ops_ > 0 || grow_.pending)
    util::ThisFiber::SleepFor(10ms);

  auto ec = backing_file_->Close();
  LOG_IF(ERROR, ec) << "Failed to close backing file: " << ec;
  backing_file_.reset();

  int errc = unlink(backing_file_path_.c_str());
  LOG_IF(ERROR, errc != 0) << "Failed to unlink backing file: "
                           << std::error_code{errc, std::system_category()};
}

void DiskStorage::Read(DiskSegment segment, ReadCb cb) {
  DCHECK_GT(segment.length, 0u);
  DCHECK_EQ(segment.offset % kPageSize, 0u);

  size_t len = segment.length;
  RegisteredSlice buf = PrepareBuf(len);
  auto io_cb = [this, cb = std::move(cb), buf, len](int io_res) {
    if (io_res < 0) {
      cb(nonstd::make_unexpected(error_code{-io_res, system_category()}));
    } else {
      cb(string_view{reinterpret_cast<char*>(buf.bytes.data()), len});
    }
    ReturnBuf(buf);
    pending_ops_--;
  };

  pending_ops_++;
  if (buf.buf_idx != kHeapSliceId)
    backing_file_->ReadFixedAsync(buf.bytes, segment.offset, buf.buf_idx, std::move(io_cb));
  else
    backing_file_->ReadAsync(buf.bytes, segment.offset, std::move(io_cb));
}

void DiskStorage::MarkAsFree(DiskSegment segment) {
  DCHECK_GT(segment.length, 0u);
  DCHECK_EQ(segment.offset % kPageSize, 0u);

  alloc_.Free(segment.offset, segment.length);
}

io::Result<std::pair<size_t, RegisteredSlice>> DiskStorage::PrepareStash(size_t length) {
  using namespace nonstd;

  int64_t offset = alloc_.Malloc(length);
  if (offset >= 0)
    return std::make_pair(offset, PrepareBuf(length));

  // If we don't have "enough space", request grow and return to avoid blocking.
  // Note that `alloc_.Malloc` may fail even if we have enough space due to fragmentation,
  // as internally it uses different 256MB segments for different block sizes.
  if (offset < 0) {
    auto ec = RequestGrow(-offset);
    return make_unexpected(ec ? ec : make_error_code(errc::operation_would_block));
  }

  offset = alloc_.Malloc(length);
  if (offset < 0)  // we can't fit it even after resizing
    return make_unexpected(make_error_code(errc::file_too_large));

  return std::make_pair(offset, PrepareBuf(length));
}

void DiskStorage::Stash(DiskSegment segment, RegisteredSlice buf, StashCb cb) {
  auto io_cb = [this, cb = std::move(cb), buf, segment](int io_res) {
    if (io_res < 0) {
      MarkAsFree(segment);
      cb(error_code{-io_res, std::system_category()});
    } else {
      cb({});
    }
    ReturnBuf(buf);
    pending_ops_--;
  };

  pending_ops_++;
  size_t offset = segment.offset;
  if (buf.buf_idx != kHeapSliceId)
    backing_file_->WriteFixedAsync(buf.bytes, offset, buf.buf_idx, std::move(io_cb));
  else
    backing_file_->WriteAsync(buf.bytes, offset, std::move(io_cb));

  // Grow in advance if needed and possible
  size_t capacity = alloc_.capacity();
  size_t available = capacity - alloc_.allocated_bytes();
  if ((available < 256_MB) && (available < capacity * 0.15) && !grow_.pending) {
    auto ec = RequestGrow(256_MB);
    LOG_IF(ERROR, ec && ec != errc::file_too_large) << "Could not call grow :" << ec.message();
  }
}

DiskStorage::Stats DiskStorage::GetStats() const {
  return {
      alloc_.allocated_bytes(),       alloc_.capacity(), heap_buf_alloc_cnt_, reg_buf_alloc_cnt_,
      static_cast<size_t>(max_size_), pending_ops_};
}

error_code DiskStorage::RequestGrow(off_t grow_size) {
  VLOG(1) << "Requesting grow by " << grow_size << " current capacity: " << alloc_.capacity();

  DCHECK_EQ(grow_size % ExternalAllocator::kExtAlignment, 0u);
  if (alloc_.capacity() + grow_size >= static_cast<size_t>(max_size_))
    return make_error_code(errc::file_too_large);

  // Don't try again immediately, most likely it won't succeed ever.
  const uint64_t kCooldownTime = 100'000'000;  // 100ms
  if (grow_.last_err && (ProactorBase::GetMonotonicTimeNs() - grow_.timestamp_ns) < kCooldownTime)
    return make_error_code(errc::operation_canceled);

  if (std::exchange(grow_.pending, true)) {
    LOG_EVERY_T(WARNING, 1) << "Blocked on concurrent grow";
    return make_error_code(errc::operation_in_progress);
  }

  off_t end = alloc_.capacity();
  backing_file_->FallocateAsync(0, end, grow_size, [end, grow_size, this](int res) {
    auto ec = (res < 0) ? std::error_code{-res, std::system_category()} : std::error_code{};
    grow_.pending = false;
    grow_.last_err = ec;
    grow_.timestamp_ns = ProactorBase::GetMonotonicTimeNs();
    if (!ec)
      alloc_.AddStorage(end, grow_size);
  });

  return {};
}

RegisteredSlice DiskStorage::PrepareBuf(size_t size) {
  DCHECK_EQ(ProactorBase::me()->GetKind(), ProactorBase::IOURING);
  auto* up = static_cast<UringProactor*>(ProactorBase::me());

  if (auto borrowed = up->RequestRegisteredSlice(size); borrowed) {
    ++reg_buf_alloc_cnt_;
    return *borrowed;
  }
  ++heap_buf_alloc_cnt_;
  return AllocateTmpBuf(size);
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/disk_storage.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <system_error>

#include "io/io.h"
#include "server/tiering/common.h"
#include "server/tiering/external_alloc.h"
#include "util/fibers/uring_types.h"

namespace util::fb2 {
class LinuxFile;
}  // namespace util::fb2

namespace dfly::tiering {

// Disk storage controlled by asynchronous operations.
// Provides Random Access Read/Stash asynchronous interface around low level linux file.
// Handles ranges management and file growth via underlying ExternalAllocator.
class DiskStorage {
 public:
  struct Stats {
    size_t allocated_bytes = 0;
    size_t capacity_bytes = 0;
    uint64_t heap_buf_alloc_count = 0;
    uint64_t registered_buf_alloc_count = 0;
    size_t max_file_size = 0;
    size_t pending_ops = 0;
  };

  using ReadCb = std::function<void(io::Result<std::string_view>)>;
  using StashCb = std::function<void(std::error_code)>;

  explicit DiskStorage(size_t max_size);
  ~DiskStorage();

  std::error_code Open(std::string_view path);
  void Close();

  // Request read for segment, cb will be called on completion with read value
  void Read(DiskSegment segment, ReadCb cb);

  // Mark segment as free, performed immediately
  void MarkAsFree(DiskSegment segment);

  // Allocate segment of at least given length and prepare buffer. Might block to grow backing file.
  // Return error if not enough space is available or growing failed.
  // Every successful preparation must end in a Stash(), otherwise resources are leaked.
  io::Result<std::pair<size_t /* offset */, util::fb2::RegisteredSlice>> PrepareStash(
      size_t length);

  // Write prepared buffer to given segment and resolve completion callback when write is done.
  void Stash(DiskSegment segment, util::fb2::RegisteredSlice buf, StashCb cb);

  Stats GetStats() const;

 private:
  // Try asynchronously growing backing file by requested size
  std::error_code RequestGrow(off_t grow_size);

  // Returns a buffer with size greater or equal to len.
  util::fb2::RegisteredSlice PrepareBuf(size_t len);

  off_t max_size_;
  size_t pending_ops_ = 0;  // number of ongoing ops for safe shutdown

  // how many times we allocate registered/heap buffers.
  uint64_t heap_buf_alloc_cnt_ = 0, reg_buf_alloc_cnt_ = 0;

  struct {
    bool pending = false;  // currently in progress
    std::error_code last_err;
    uint64_t timestamp_ns;  // last grow finished
  } grow_;                  // status of last RequestGrow() operation

  std::string backing_file_path_;
  std::unique_ptr<util::fb2::LinuxFile> backing_file_;
  ExternalAllocator alloc_;
};

};  // namespace dfly::tiering


================================================
FILE: src/server/tiering/disk_storage_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/disk_storage.h"

#include <memory>

#include "base/gtest.h"
#include "base/logging.h"
#include "server/tiering/common.h"
#include "server/tiering/test_common.h"
#include "util/fibers/fibers.h"
#include "util/fibers/pool.h"

namespace dfly::tiering {

using namespace std;
using namespace std::string_literals;

struct DiskStorageTest : public PoolTestBase {
  ~DiskStorageTest() {
    EXPECT_EQ(pending_ops_, 0);
  }

  error_code Open(string filename = "disk_storage_test_backing") {
    filename_ = filename;
    storage_ = make_unique<DiskStorage>(256_MB);
    return storage_->Open(filename_);
  }

  void Close() {
    storage_->Close();
    storage_.reset();

    // Disk storage deletes its files on exit
    EXPECT_FALSE(std::filesystem::exists(filename_));
  }

  void Stash(size_t index, string value) {
    pending_ops_++;

    auto prepared = storage_->PrepareStash(value.length());
    EXPECT_TRUE(prepared.has_value());
    auto [offset, buf] = *prepared;
    memcpy(buf.bytes.data(), value.data(), value.size());

    DiskSegment segment{offset, value.size()};
    storage_->Stash({offset, value.size()}, buf, [this, index, segment](std::error_code ec) {
      segments_[index] = segment;
      pending_ops_--;
    });
  }

  void Read(size_t index) {
    pending_ops_++;
    storage_->Read(*segments_[index], [this, index](io::Result<string_view> value) {
      last_reads_[index] =
          value.has_value() ? io::Result<string>(*value) : nonstd::make_unexpected(value.error());
      pending_ops_--;
    });
  }

  void Delete(size_t index) {
    storage_->MarkAsFree(*segments_[index]);
    segments_.erase(index);
    last_reads_.erase(index);
  }

  void Wait() const {
    while (pending_ops_ > 0) {
      ::util::ThisFiber::SleepFor(1ms);
    }
  }

  DiskStorage::Stats GetStats() const {
    return storage_->GetStats();
  }

 protected:
  int pending_ops_ = 0;

  std::string filename_;
  std::unordered_map<size_t, io::Result<std::string>> last_reads_;
  std::unordered_map<size_t, io::Result<DiskSegment>> segments_;
  std::unique_ptr<DiskStorage> storage_;
};

TEST_F(DiskStorageTest, Basic) {
  pp_->at(0)->Await([this] {
    // Write 100 values
    Open();
    for (size_t i = 0; i < 100; i++)
      Stash(i, absl::StrCat("value", i));
    Wait();
    EXPECT_EQ(segments_.size(), 100);

    EXPECT_EQ(GetStats().allocated_bytes, 100 * kPageSize);

    // Read all 100 values
    for (size_t i = 0; i < 100; i++)
      Read(i);
    Wait();

    // Expect them to be equal to written
    for (size_t i = 0; i < 100; i++)
      EXPECT_EQ(*last_reads_[i], absl::StrCat("value", i));

    // Delete all values
    for (size_t i = 0; i < 100; i++)
      Delete(i);
    EXPECT_EQ(GetStats().allocated_bytes, 0);

    Close();
  });
}

TEST_F(DiskStorageTest, ReUse) {
  pp_->at(0)->Await([this] {
    Open();

    Stash(0, "value1");
    Wait();
    EXPECT_EQ(segments_[0]->offset, 0u);

    Delete(0);

    Stash(1, "value2");
    Wait();
    EXPECT_EQ(segments_[1]->offset, 0u);

    Close();
  });
}

TEST_F(DiskStorageTest, FlakyDevice) {
  if (!filesystem::exists("/mnt/tiering_flaky"))
    GTEST_SKIP() << "Flaky device not created, use tools/faulty_io.sh";

  pp_->at(0)->Await([this] {
    auto ec = Open("/mnt/tiering_flaky/backing");
    EXPECT_FALSE(ec) << ec.message();

    // Create stash sequence lasting two seconds
    const int kEntries = 200;
    for (int i = 0; i < kEntries; i++) {
      util::ThisFiber::SleepFor(10ms);
      Stash(i, "value");
    }
    Wait();

    // Make sure we saw at least some errors
    int errors = 0;
    for (int i = 0; i < kEntries; i++)
      errors += (!segments_[i].has_value());
    EXPECT_GT(errors, 0);
    EXPECT_LT(errors, kEntries);

    Close();
  });
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/entry_map.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <string>

#include "server/common_types.h"

namespace dfly::tiering {

namespace detail {
struct Hasher {
  using is_transparent = void;
  template <typename S> size_t operator()(const std::pair<DbIndex, S>& p) const {
    return absl::HashOf(p);
  }
};

struct Eq {
  using is_transparent = void;
  template <typename S1, typename S2>
  bool operator()(const std::pair<DbIndex, S1>& l, const std::pair<DbIndex, S2>& r) const {
    const auto& [i1, s1] = l;
    const auto& [i2, s2] = r;
    return i1 == i2 && s1 == s2;
  }
};
}  // namespace detail

using DbKeyId = std::pair<DbIndex, std::string>;

// Map of key (db index, string key) -> T with heterogeneous lookup
template <typename T> using EntryMap = absl::flat_hash_map<DbKeyId, T, detail::Hasher, detail::Eq>;

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/external_alloc.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "src/server/tiering/external_alloc.h"

#include <mimalloc.h>

#include <bitset>
#include <cstring>

#include "base/logging.h"

namespace dfly::tiering {
using namespace std;
using detail::PageClass;

using BinIdx = uint8_t;

namespace {

constexpr inline size_t divup(size_t num, size_t div) {
  return (num + div - 1) / div;
}

constexpr inline size_t alignup(size_t num, size_t align) {
  size_t amask = align - 1;
  return (num + amask) & (~amask);
}

constexpr inline size_t wsize_from_size(size_t size) {
  return divup(size, sizeof(uintptr_t));
}

constexpr size_t kMinBlockSize = ExternalAllocator::kMinBlockSize;

constexpr size_t kSmallPageShift = 20;
constexpr size_t kMediumPageShift = 24;
constexpr size_t kSmallPageSize = 1UL << kSmallPageShift;    // 1MB
constexpr size_t kMediumPageSize = 1UL << kMediumPageShift;  // 16MB

// we preserve 16:1 ratio, i.e. each page can host at least 16 blocks within its class.
constexpr size_t kSmallObjMaxSize = kSmallPageSize / 16;
constexpr size_t kMediumObjMaxSize = kMediumPageSize / 16;

constexpr size_t kSegmentAlignment = 256_MB;
constexpr size_t kSegmentSize = 256_MB;

constexpr unsigned kNumBins = detail::kNumFreePages;
constexpr unsigned kLargeSizeBin = kNumBins - 1;
constexpr unsigned kMaxPagesInSegment = kSegmentSize / kSmallPageSize;
constexpr unsigned kSegDescrAlignment = 16_KB;

constexpr size_t kBinWordLens[kNumBins] = {
    512,   512 * 2, 512 * 3, 2048,  2560,  3072,  3584,   4096,   5120,      6144,
    7168,  8192,    10240,   12288, 14336, 16384, 20480,  24576,  28672,     32768,
    40960, 49152,   57344,   65536, 81920, 98304, 114688, 131072, UINT64_MAX};

static_assert(kBinWordLens[kLargeSizeBin - 1] * 8 == kMediumObjMaxSize);
static_assert(kBinWordLens[kLargeSizeBin] == UINT64_MAX);

constexpr inline BinIdx ToBinIdx(size_t size) {
  // first 4 bins are multiplies of kMinBlockSize.
  if (size < ExternalAllocator::kMinBlockSize * 4) {
    return size <= ExternalAllocator::kMinBlockSize ? 0
                                                    : (size - 1) / ExternalAllocator::kMinBlockSize;
  }

  if (size > kMediumObjMaxSize) {
    return kLargeSizeBin;
  }

  size_t wsize = wsize_from_size(size);

  // to correct rounding up of size to words that the last word will be within the range.
  --wsize;

  // find the highest bit
  uint8_t b = 63 - __builtin_clzl(wsize);
  return (b << 2) + ((wsize >> (b - 2)) & 3) - 40;
}

static_assert(ToBinIdx(kMinBlockSize) == 0);
static_assert(ToBinIdx(kMinBlockSize * 2) == 1);
static_assert(ToBinIdx(kMinBlockSize * 3) == 2);
static_assert(ToBinIdx(kMinBlockSize * 4) == 3);
static_assert(ToBinIdx(kMinBlockSize * 5) == 4);
static_assert(ToBinIdx(kMinBlockSize * 6) == 5);
static_assert(ToBinIdx(kMinBlockSize * 6 + 1) == 6);
static_assert(ToBinIdx(kMinBlockSize * 7) == 6);

size_t ToBlockSize(BinIdx idx) {
  return kBinWordLens[idx] * 8;
}

// num pages in a segment of that class.
unsigned NumPagesInSegment(PageClass pc) {
  switch (pc) {
    case PageClass::SMALL_P:
      return kSegmentSize >> kSmallPageShift;
    case PageClass::MEDIUM_P:
      return kSegmentSize >> kMediumPageShift;
      break;
    case PageClass::LARGE_P:
      return 1;
      break;
  }
  // unreachable.
  return 0;
}

template <size_t N> size_t FindFirst(const std::bitset<N>& bs) {
#ifdef _LIBCPP_VERSION
  for (size_t i = 0; i < bs.size(); ++i) {
    if (bs.test(i))
      return i;
  }
#else
  return bs._Find_first();
#endif
}

};  // namespace

/*
   block 8Kb or more, page - 2MB (256 blocks) or bigger.


   Block sizes grow exponentially - by factor ~1.25. See MI_PAGE_QUEUES_EMPTY definition
   for sizes example.
*/
namespace detail {

// Page can be exactly in either these 3 states:
// 1. unitialized - with no blocks being allocated - segment_inuse will be 0 in that case, 1
// otherwise.
// 2. Partly utilized by 1 or more blocks, with available > 0 in that case. It must be present in
// free_pages_ list then.
// 3. Fully utilized, with available==0, in that case it's not part of free_pages_ list.
struct Page {
  std::bitset<256> free_blocks;  // bitmask of free blocks (32 bytes).
  uint8_t id;                    // index inside the Segment.pages array.

  // need some mapping function to map from block_size to real_block_size given Page class.
  BinIdx bin_idx;
  uint8_t segment_inuse : 1;  // true if segment allocated this page.
  uint8_t reserved[3];

  // can be computed via free_blocks.count().
  uint16_t available;  // in number of blocks.
  Page* next_free;     // next page in the free_pages_ list

  // We can not use c'tor because we use the trick in segment where we allocate more pages
  // than SegmentDescr declares.
  void Reset(uint8_t new_id) {
    static_assert(sizeof(Page) == 48);

    memset(&id, 0, sizeof(Page) - offsetof(Page, id));
    id = new_id;
  }

  void Init(PageClass pc, BinIdx bin_id);
};

constexpr size_t kSegDescrDataSize = sizeof(Page) * kMaxPagesInSegment + 128;
static_assert(kSegDescrDataSize < kSegDescrAlignment);

void Page::Init(PageClass pc, BinIdx bin_id) {
  DCHECK_EQ(available, 0);
  DCHECK(segment_inuse);

  bin_idx = bin_id;
  if (pc == PageClass::LARGE_P) {
    available = 1;
  } else {
    size_t page_size = (pc == PageClass::SMALL_P) ? kSmallPageSize : kMediumPageSize;
    available = page_size / ToBlockSize(bin_id);
  }

  free_blocks.reset();
  for (unsigned i = 0; i < available; ++i) {
    free_blocks.set(i, true);
  }
}

PageClass ClassFromSize(size_t size) {
  if (size <= kSmallObjMaxSize)
    return PageClass::SMALL_P;
  if (size <= kMediumObjMaxSize)
    return PageClass::MEDIUM_P;

  return PageClass::LARGE_P;
}

}  // namespace detail

//
/**
 * SegmentDescr denotes a 256MB segment on external storage -
 * holds upto 256 pages (in case of small pages).
 * Each segment has pages of the same type, but each page can host blocks of
 * different sizes upto maximal block size for that page class.
 * SegmentDescr points to the range within external storage space.
 * By using the page.id together with segment->page_shift and segment->offset
 * one can know where the page is located in the storage.
 * Opposite direction: by giving an offset to the file, segment_id = offset / 256MB.
 * Moreover (offset % 256MB) >> segment.page_shift gives us the page id and subsequently
 * page_start.  segment.pages[page_id].block_size gives us the block size and that in turn gives us
 * block id within the page. We can also know block_size if the originally allocated
   size is provided by using round_up function that was used to allocate the block.
 * SegmentDescr be aligned by kSegDescrAlignment boundaries - ToSegDescr relies on that.
 */
class ExternalAllocator::SegmentDescr {
  SegmentDescr(const SegmentDescr&) = delete;
  void operator=(const SegmentDescr&) = delete;
  friend class ExternalAllocator;

 public:
  explicit SegmentDescr(PageClass pc, size_t offs, uint16_t capacity);

  Page* FindPageSegment() {
    return page_info_.FindPageSegment();
  }

  Page* GetPage(unsigned i) {
    return page_info_.pages + i;
  }

  size_t BlockOffset(const Page* page, unsigned blockpos) {
    return offset_ + page->id * (1 << page_info_.page_shift) +
           ToBlockSize(page->bin_idx) * blockpos;
  }

  bool HasFreePages() const {
    return page_info_.capacity > page_info_.used;
  }

  unsigned capacity() const {
    return page_info_.capacity;
  }

  unsigned used() const {
    return page_info_.used;
  }

  unsigned page_shift() const {
    return page_info_.page_shift;
  }

  PageClass page_class() const {
    return page_class_;
  }

  SegmentDescr *next, *prev;

  // Links seg before this.
  void LinkBefore(SegmentDescr* seg) {
    seg->next = this;
    seg->prev = prev;
    this->prev->next = seg;
    this->prev = seg;
  }

  // detaches this from the circular list.
  // returns next if the list is has more than 1 element
  // returns null otherwise.
  SegmentDescr* Detach() {
    if (next == this)
      return nullptr;

    next->prev = prev;
    prev->next = next;

    SegmentDescr* res = next;
    next = prev = this;
    return res;
  }

 private:
  uint64_t offset_;  // size_ - relevant for large segments.
  PageClass page_class_;

  struct PageInfo {
    uint16_t capacity, used;  // in number of pages.
    uint8_t page_shift;
    Page pages[0];  // must be the last field. Can be 1-256 pages.

    PageInfo(uint16_t c) : capacity(c), used(0), page_shift(0) {
    }

    auto FindPageSegment() -> Page* {
      for (uint32_t i = 0; i < capacity; ++i) {
        if (!pages[i].segment_inuse) {
          pages[i].segment_inuse = 1;
          ++used;
          return pages + i;
        }
      }

      LOG(DFATAL) << "Should not reach here";

      return nullptr;
    }
  };

  PageInfo page_info_;
};

ExternalAllocator::SegmentDescr::SegmentDescr(PageClass pc, size_t offs, uint16_t page_capacity)
    : offset_(offs), page_class_(pc), page_info_(page_capacity) {
  constexpr size_t kDescrSize = sizeof(SegmentDescr);
  (void)kDescrSize;

  next = prev = this;
  DCHECK(pc != PageClass::LARGE_P);

  if (pc == PageClass::MEDIUM_P)
    page_info_.page_shift = kMediumPageShift;
  else
    page_info_.page_shift = kSmallPageShift;

  for (unsigned i = 0; i < page_capacity; ++i) {
    page_info_.pages[i].Reset(i);
  }
}

static detail::Page empty_page;

ExternalAllocator::ExternalAllocator() {
  std::fill(sq_, sq_ + ABSL_ARRAYSIZE(sq_), nullptr);
  std::fill(free_pages_, free_pages_ + detail::kNumFreePages, &empty_page);
}

ExternalAllocator::~ExternalAllocator() {
  for (auto* seg : segments_) {
    mi_free(seg);
  }
}

int64_t ExternalAllocator::Malloc(size_t sz) {
  uint8_t bin_idx = ToBinIdx(sz);
  Page* page = free_pages_[bin_idx];
  if (page->available == 0) {  // empty page.
    PageClass pc = detail::ClassFromSize(sz);
    if (pc == PageClass::LARGE_P) {
      return LargeMalloc(sz);
    }

    page = FindPage(pc);
    if (!page)
      return -int64_t(kSegmentSize);

    DVLOG(2) << "Allocated page: for bin " << bin_idx << " class " << static_cast<int>(pc);
    free_pages_[bin_idx] = page;
    page->Init(pc, bin_idx);
  }

  DCHECK(page->available);
  size_t pos = FindFirst(page->free_blocks);
  page->free_blocks.flip(pos);

  if (--page->available == 0)  // Remove empty page from freelist
    free_pages_[bin_idx] = page->next_free ? page->next_free : &empty_page;

  allocated_bytes_ += ToBlockSize(page->bin_idx);
  SegmentDescr* seg = ToSegDescr(page);
  return seg->BlockOffset(page, pos);
}

void ExternalAllocator::Free(size_t offset, size_t sz) {
  if (sz > kMediumObjMaxSize) {
    size_t align_sz = alignup(sz, 4_KB);
    extent_tree_.Add(offset, align_sz);
    return;
  }

  size_t idx = offset / 256_MB;
  size_t delta = offset % 256_MB;
  CHECK_LT(idx, segments_.size());
  CHECK(segments_[idx]);

  SegmentDescr* seg = segments_[idx];
  unsigned page_id = delta >> seg->page_shift();
  CHECK_LT(page_id, seg->capacity());

  Page* page = seg->GetPage(page_id);
  unsigned page_size = (1 << seg->page_shift());
  unsigned block_offs = delta % page_size;
  unsigned block_size = ToBlockSize(page->bin_idx);
  unsigned block_id = block_offs / block_size;
  unsigned blocks_num = page_size / block_size;

  CHECK_LE(sz, block_size);
  DCHECK_LT(block_id, blocks_num);
  DCHECK(!page->free_blocks[block_id]) << offset;

  page->free_blocks.set(block_id);
  ++page->available;

  DCHECK_EQ(page->available, page->free_blocks.count());
  // If page becomes fully free, return it to segment list, otherwise if it just became non-empty,
  // then return it to free pages list
  if (page->available == blocks_num) {
    FreePage(page, seg, block_size);
  } else if (page->available == 1) {
    DCHECK_NE(page, free_pages_[page->bin_idx]);
    page->next_free = free_pages_[page->bin_idx];
    free_pages_[page->bin_idx] = page;
  }
  allocated_bytes_ -= block_size;
}

void ExternalAllocator::AddStorage(size_t start, size_t size) {
  VLOG(1) << "AddStorage " << start << "/" << size;

  extent_tree_.Add(start, size);
  capacity_ += size;
}

size_t ExternalAllocator::GoodSize(size_t sz) {
  uint8_t bin_idx = ToBinIdx(sz);
  if (bin_idx < kLargeSizeBin)
    return ToBlockSize(bin_idx);

  return alignup(sz, 4_KB);
}

/**
 *
  _____      _            _          __                  _   _
 |  __ \    (_)          | |        / _|                | | (_)
 | |__) | __ ___   ____ _| |_ ___  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
 |  ___/ '__| \ \ / / _` | __/ _ \ |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
 | |   | |  | |\ V / (_| | ||  __/ | | | |_| | | | | (__| |_| | (_) | | | \__ \
 |_|   |_|  |_| \_/ \__,_|\__\___| |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/

 src: https://patorjk.com/software/taag/#f=Big
 */

// private functions
auto ExternalAllocator::FindPage(PageClass pc) -> Page* {
  DCHECK_NE(pc, PageClass::LARGE_P);

  SegmentDescr* seg = sq_[pc];
  while (seg) {
    if (seg->HasFreePages()) {
      return seg->FindPageSegment();
    }

    // remove head.
    SegmentDescr* next = seg->Detach();
    sq_[pc] = next;
    seg = next;
  }

  // no pages in the existing segments. Lets search in the extent tree.
  auto op_range = extent_tree_.GetRange(kSegmentSize, kSegmentAlignment);
  if (op_range) {
    DCHECK_EQ(0u, op_range->first % kSegmentAlignment);

    unsigned num_pages = NumPagesInSegment(pc);
    size_t seg_idx = op_range->first / kSegmentAlignment;

    if (segments_.size() > seg_idx) {
      DCHECK(segments_[seg_idx] == nullptr);
    } else {
      segments_.resize(seg_idx + 1);
    }

    void* ptr =
        mi_malloc_aligned(sizeof(SegmentDescr) + num_pages * sizeof(Page), kSegDescrAlignment);
    SegmentDescr* seg = new (ptr) SegmentDescr(pc, op_range->first, num_pages);
    segments_[seg_idx] = seg;

    DCHECK(sq_[pc] == NULL);
    DCHECK(seg->next == seg->prev && seg == seg->next);

    sq_[pc] = seg;
    return seg->FindPageSegment();
  }

  return nullptr;
}

int64_t ExternalAllocator::LargeMalloc(size_t size) {
  size_t align_sz = alignup(size, 4_KB);
  auto op_range = extent_tree_.GetRange(align_sz, 4_KB);
  if (!op_range) {
    align_sz = max(align_sz, kSegmentSize);
    return -int64_t(align_sz);
  }

  return op_range->first;
}

void ExternalAllocator::FreePage(Page* page, SegmentDescr* owner, size_t block_size) {
  // page is fully free. Return it to the segment even if it's
  // referenced via free_pages_. The allows more elasticity by potentially reassigning
  // it to other bin sizes.
  BinIdx bidx = ToBinIdx(block_size);

  // Remove fast allocation reference.
  if (free_pages_[bidx] == page) {
    free_pages_[bidx] = page->next_free ? page->next_free : &empty_page;
  } else {
    for (auto* cur = free_pages_[bidx]; cur != nullptr; cur = cur->next_free) {
      if (cur->next_free == page) {
        cur->next_free = page->next_free;
        break;
      }
    }
  }

  page->segment_inuse = 0;
  page->available = 0;
  page->next_free = nullptr;

  if (!owner->HasFreePages()) {
    // Segment was fully booked but now it has a free page.
    // Add it to the tail of segment queue.
    DCHECK(owner->next == owner->prev);

    auto& sq = sq_[owner->page_class()];
    if (sq == nullptr) {
      sq = owner;
    } else {
      sq->LinkBefore(owner);
    }
  }
  --owner->page_info_.used;
}

inline auto ExternalAllocator::ToSegDescr(Page* page) -> SegmentDescr* {
  uintptr_t ptr = (uintptr_t)page;

  // find SegDescr boundary.
  uintptr_t seg_ptr = ptr & ~uintptr_t(kSegDescrAlignment - 1);
  SegmentDescr* res = reinterpret_cast<SegmentDescr*>(seg_ptr);

  DCHECK(res->GetPage(page->id) == page);

  return res;
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/external_alloc.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

#include "core/extent_tree.h"
#include "server/tiering/common.h"

namespace dfly::tiering {

/**
 *
 * An external allocator inspired by mimalloc. Its goal is to maintain a state machine for
 * bookkeeping the allocations of different sizes that are backed up by a separate
 * storage. It could be a disk, SSD or another memory allocator. This class serves
 * as a state machine that either returns an offset to the backing storage or the indication
 * of the resource that is missing. The advantage of such design is that we can use it in
 * asynchronous callbacks without blocking on any IO requests.
 * The allocator uses dynamic memory internally. Should be used in a single thread.
 *
 */

namespace detail {
struct Page;

constexpr unsigned kNumFreePages = 29;

/**
 * pages classes can be SMALL, MEDIUM or LARGE. SMALL (2MB) for block sizes upto 128KB.
 * MEDIUM (16MB) for block sizes 128KB-1MB. Anything else is LARGE.
 *
 */
enum PageClass : uint16_t {
  SMALL_P = 0,
  MEDIUM_P = 1,
  LARGE_P = 2,
};

PageClass ClassFromSize(size_t size);

}  // namespace detail

class ExternalAllocator {
  ExternalAllocator(const ExternalAllocator&) = delete;
  void operator=(const ExternalAllocator&) = delete;

 public:
  static constexpr size_t kExtAlignment = 256_MB;     // 256 MB
  static constexpr size_t kMinBlockSize = kPageSize;  // 4KB

  ExternalAllocator();
  ~ExternalAllocator();

  // If a negative result - backing storage is required of size=-result. See AddStorage
  // on how to add more storage.
  // For results >= 0 Returns offset to the backing storage where we may write the data of
  // size sz.
  int64_t Malloc(size_t sz);

  void Free(size_t offset, size_t sz);

  /// Adds backing storage to the allocator. The range should not overlap with already
  /// added storage ranges.
  void AddStorage(size_t start, size_t size);

  // Similar to mi_good_size, returns the size of the underlying block as if
  // were returned by Malloc. Guaranteed that the result not less than sz.
  // No allocation is done.
  static size_t GoodSize(size_t sz);

  size_t capacity() const {
    return capacity_;
  }

  size_t allocated_bytes() const {
    return allocated_bytes_;
  }

 private:
  class SegmentDescr;
  using Page = detail::Page;

  // Returns a page if there is a segment of that class.
  // Returns NULL if no page is found.
  Page* FindPage(detail::PageClass sc);

  int64_t LargeMalloc(size_t size);
  SegmentDescr* GetNewSegment(detail::PageClass sc);
  void FreePage(Page* page, SegmentDescr* owner, size_t block_size);

  static SegmentDescr* ToSegDescr(Page*);

  SegmentDescr* sq_[2];                      // map: PageClass -> free Segment.
  Page* free_pages_[detail::kNumFreePages];  // intrusive linked lists of pages with free blocks

  // A segment for each 256MB range. To get a segment id from the offset, shift right by 28.
  std::vector<SegmentDescr*> segments_;

  ExtentTree extent_tree_;

  size_t capacity_ = 0;  // in bytes.
  size_t allocated_bytes_ = 0;
};

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/external_alloc_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/external_alloc.h"

#include "base/gtest.h"
#include "base/logging.h"

namespace dfly::tiering {

using namespace std;

class ExternalAllocatorTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
  }

  static void TearDownTestSuite() {
  }

  ExternalAllocator ext_alloc_;
};

constexpr int64_t kSegSize = 256_MB;

std::map<int64_t, size_t> AllocateFully(ExternalAllocator* alloc) {
  std::map<int64_t, size_t> ranges;

  int64_t res = 0;
  while (res >= 0) {
    for (unsigned j = 1; j < 5; ++j) {
      size_t sz = 8000 * j;
      res = alloc->Malloc(sz);
      if (res < 0)
        break;
      auto [it, added] = ranges.emplace(res, sz);
      VLOG(1) << "res: " << res << " size: " << sz << " added: " << added;
      CHECK(added);
    }
  }

  return ranges;
}

constexpr size_t kMinBlockSize = ExternalAllocator::kMinBlockSize;

TEST_F(ExternalAllocatorTest, Basic) {
  int64_t res = ext_alloc_.Malloc(128);
  EXPECT_EQ(-kSegSize, res);

  ext_alloc_.AddStorage(0, kSegSize);
  EXPECT_EQ(0, ext_alloc_.Malloc(kMinBlockSize - 96));         //  page0: 1
  EXPECT_EQ(kMinBlockSize, ext_alloc_.Malloc(kMinBlockSize));  //  page0: 2

  constexpr auto kAnotherLen = kMinBlockSize * 2 - 10;
  size_t offset2 = ext_alloc_.Malloc(kAnotherLen);  // page1: 1
  EXPECT_EQ(offset2, 1_MB);                         // another page.

  ext_alloc_.Free(offset2, kAnotherLen);         // should return the page to the segment.
  EXPECT_EQ(offset2, ext_alloc_.Malloc(16_KB));  // another page.  page1: 1

  ext_alloc_.Free(0, kMinBlockSize - 96);         // page0: 1
  ext_alloc_.Free(kMinBlockSize, kMinBlockSize);  // page0: 0

  EXPECT_EQ(0, ext_alloc_.Malloc(kMinBlockSize * 2));  // page0
}

TEST_F(ExternalAllocatorTest, Invariants) {
  ext_alloc_.AddStorage(0, kSegSize);

  auto ranges = AllocateFully(&ext_alloc_);
  EXPECT_GT(ext_alloc_.allocated_bytes(), ext_alloc_.capacity() * 0.75);

  off_t last = 0;
  for (const auto& k_v : ranges) {
    ASSERT_GE(k_v.first, last);
    last = k_v.first + k_v.second;
  }

  for (const auto& k_v : ranges) {
    ext_alloc_.Free(k_v.first, k_v.second);
  }
  EXPECT_EQ(0, ext_alloc_.allocated_bytes());

  for (const auto& k_v : ranges) {
    int64_t res = ext_alloc_.Malloc(k_v.second);
    ASSERT_GE(res, 0);
  }
}

TEST_F(ExternalAllocatorTest, Classes) {
  using detail::ClassFromSize;

  ext_alloc_.AddStorage(0, kSegSize);
  constexpr size_t kMaxSmallPage = 64_KB;
  ASSERT_EQ(detail::SMALL_P, ClassFromSize(kMaxSmallPage));
  ASSERT_EQ(detail::MEDIUM_P, ClassFromSize(kMaxSmallPage + 1));
  ASSERT_EQ(detail::LARGE_P, ClassFromSize(1_MB + 1));

  off_t offs1 = ext_alloc_.Malloc(kMaxSmallPage);
  EXPECT_EQ(offs1, 0);

  off_t offs2 = ext_alloc_.Malloc(kMaxSmallPage + 1);
  EXPECT_EQ(offs2, -kSegSize);

  ext_alloc_.AddStorage(kSegSize, kSegSize);
  offs2 = ext_alloc_.Malloc(kMaxSmallPage * 2 + 1);
  ASSERT_GT(offs2, 0);
  offs2 = ext_alloc_.Malloc(1_MB);
  ASSERT_GT(offs2, 0);

  off_t offs3 = ext_alloc_.Malloc(1_MB + 1);
  ASSERT_LT(offs3, 0);
  ext_alloc_.AddStorage(kSegSize * 2, kSegSize);
  offs3 = ext_alloc_.Malloc(1_MB + 1);
  ASSERT_GT(offs3, 0);

  EXPECT_EQ(1_MB + 4_KB, ExternalAllocator::GoodSize(1_MB + 1));
}

// Fill up the allocator until it has to grow, remove 90% and make sure it has free space even with
// extreme fragmentation
TEST_F(ExternalAllocatorTest, EmptyFull) {
  const int kAllocSize = kMinBlockSize;
  ext_alloc_.AddStorage(0, 2 * kSegSize);

  // Fill up the allocator
  vector<int64_t> offsets;
  int64_t offset;
  do {
    offset = ext_alloc_.Malloc(kAllocSize);
    if (offset >= 0)
      offsets.push_back(offset);
  } while (offset >= 0);

  // Keep only 10%, free 90%
  for (size_t i = 0; i < offsets.size(); i++) {
    if (i % 10 == 0)
      continue;
    ext_alloc_.Free(offsets[i], kAllocSize);
  }

  // Expect to succeed adding 10% without growing
  for (size_t i = 0; i < offsets.size() / 10; i++)
    EXPECT_GT(ext_alloc_.Malloc(kAllocSize), 0u);
}

TEST_F(ExternalAllocatorTest, AllocLarge) {
  ext_alloc_.AddStorage(0, kSegSize);

  off_t offs = ext_alloc_.Malloc(2_MB - 1);
  EXPECT_EQ(offs, 0);
  ext_alloc_.Free(offs, 2_MB - 1);
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/op_manager.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/op_manager.h"

#include <variant>

#include "base/logging.h"
#include "core/overloaded.h"
#include "io/io.h"
#include "server/tiering/common.h"
#include "server/tiering/disk_storage.h"
#include "util/fibers/fibers.h"
namespace dfly::tiering {

using namespace std;

OpManager::OwnedEntryId OpManager::ToOwned(PendingId id) {
  return std::visit(Overloaded{[](uintptr_t i) -> OpManager::OwnedEntryId { return i; },
                               [](std::pair<DbIndex, std::string_view> p) -> OwnedEntryId {
                                 return std::make_pair(p.first, std::string{p.second});
                               }},
                    id);
}

string OpManager::ToString(const OwnedEntryId& id) {
  if (const auto* i = std::get_if<uintptr_t>(&id); i) {
    return absl::StrCat(*i);
  }
  const auto& key = std::get<DbKeyId>(id);
  return absl::StrCat("(", key.first, ":", key.second, ")");
}

OpManager::OpManager(size_t max_size) : storage_{max_size} {
}

OpManager::~OpManager() {
  DCHECK(pending_stash_ver_.empty());
  DCHECK(pending_reads_.empty());
}

std::error_code OpManager::Open(std::string_view file) {
  return storage_.Open(file);
}

void OpManager::Close() {
  storage_.Close();
  DCHECK(pending_stash_ver_.empty());
  DCHECK(pending_reads_.empty());
}

void OpManager::Enqueue(PendingId id, DiskSegment segment, const Decoder& decoder,
                        ReadCallback cb) {
  // Fill pages for prepared read as it has no penalty and potentially covers more small segments
  PrepareRead(segment.ContainingPages())
      .ForSegment(segment, id, decoder)
      .read_cbs.emplace_back(std::move(cb));
}

void OpManager::CancelPending(PendingId id) {
  // If the item isn't offloaded, it has io pending, so cancel it
  DCHECK(pending_stash_ver_.count(ToOwned(id)));
  pending_stash_ver_.erase(ToOwned(id));
}

void OpManager::DeleteOffloaded(DiskSegment segment) {
  EntryOps* pending_read = nullptr;

  auto base_it = pending_reads_.find(segment.ContainingPages().offset);
  if (base_it != pending_reads_.end())
    pending_read = base_it->second.Find(segment);

  if (pending_read) {
    // Mark that the read operation must finalize with deletion.
    pending_read->deleting = true;
  } else if (NotifyDelete(segment) && base_it == pending_reads_.end()) {
    storage_.MarkAsFree(segment.ContainingPages());
  }
}

void OpManager::Stash(PendingId id_ref, tiering::DiskSegment segment,
                      util::fb2::RegisteredSlice buf) {
  auto id = ToOwned(id_ref);
  unsigned version = ++pending_stash_counter_;
  pending_stash_ver_[id] = version;

  auto io_cb = [this, version, id = std::move(id), segment](std::error_code ec) {
    ProcessStashed(id, version,
                   ec ? nonstd::make_unexpected(ec) : io::Result<DiskSegment>(segment));
  };

  // May block due to blocking call to Grow.
  storage_.Stash(segment, buf, std::move(io_cb));
}

std::error_code OpManager::PrepareAndStash(PendingId id, size_t length,
                                           const std::function<size_t(io::MutableBytes)>& writer) {
  auto buf = PrepareStash(length);
  if (!buf.has_value())
    return buf.error();

  size_t written = writer(buf->second.bytes);
  Stash(id, {buf->first, written}, buf->second);
  return {};
}

OpManager::ReadOp& OpManager::PrepareRead(DiskSegment aligned_segment) {
  DCHECK_EQ(aligned_segment.offset % kPageSize, 0u);
  DCHECK_EQ(aligned_segment.length % kPageSize, 0u);

  auto [it, inserted] = pending_reads_.try_emplace(aligned_segment.offset, aligned_segment);
  if (inserted) {
    auto io_cb = [this, aligned_segment](io::Result<std::string_view> result) {
      ProcessRead(aligned_segment.offset, result);
    };
    storage_.Read(aligned_segment, io_cb);
  }
  return it->second;
}

void OpManager::ProcessStashed(const OwnedEntryId& id, unsigned version,
                               const io::Result<DiskSegment>& segment) {
  if (auto it = pending_stash_ver_.find(id);
      it != pending_stash_ver_.end() && it->second == version) {
    pending_stash_ver_.erase(it);
    NotifyStashed(id, segment);
  } else if (segment) {
    // Throw away the value because it's no longer up-to-date even if no error occured
    VLOG(1) << "Releasing segment " << *segment << ", id: " << ToString(id);
    storage_.MarkAsFree(*segment);
  } else {
    LOG(ERROR) << "Stash failed with error " << segment.error();
  }
}

void OpManager::ProcessRead(size_t offset, io::Result<std::string_view> page) {
  util::FiberAtomicGuard guard;  // atomically update items, no in-between states should be possible
  ReadOp* info = &pending_reads_.at(offset);

  // Reorder base read (offset 0) to be last, so reads for defragmentation are handled last.
  // If we already have a page read for defragmentation pending and some other read for the
  // sub-segment is enqueued, we first must handle the sub-segment read, only then the full page
  // read
  for (size_t i = 0; i + 1 < info->entry_ops.size(); i++) {
    if (info->entry_ops[i].segment.offset % kPageSize == 0) {
      std::swap(info->entry_ops[i], info->entry_ops.back());
      break;
    }
  }

  bool deleting_full = false;

  // Notify functions in the loop may append items to info->entry_ops during the traversal
  for (size_t i = 0; i < info->entry_ops.size(); i++) {
    auto& ko = info->entry_ops[i];
    if (page) {
      size_t offset = ko.segment.offset - info->segment.offset;
      ko.decoder->Initialize(page->substr(offset, ko.segment.length));
      for (auto& cb : ko.read_cbs)
        cb(&*ko.decoder);
    } else {
      for (auto& cb : ko.read_cbs)
        cb(page.get_unexpected());
    }

    bool delete_from_storage = ko.deleting;

    // If the item is not being deleted, report is as fetched to be cached potentially.
    // In case it's cached, we might need to delete it.
    if (page.has_value() && !delete_from_storage)
      delete_from_storage |= NotifyFetched(ko.id, ko.segment, &*ko.decoder);

    // If the item is being deleted, check if the full page needs to be deleted.
    if (delete_from_storage)
      deleting_full |= NotifyDelete(ko.segment);
  }

  if (deleting_full) {
    storage_.MarkAsFree(info->segment);
  }

  pending_reads_.erase(offset);
}

OpManager::EntryOps::EntryOps(OwnedEntryId id, DiskSegment segment, const Decoder& decoder)
    : id{std::move(id)}, segment{segment}, decoder{decoder.Clone()} {
}

OpManager::EntryOps& OpManager::ReadOp::ForSegment(DiskSegment key_segment, PendingId id,
                                                   const Decoder& decoder) {
  DCHECK_GE(key_segment.offset, segment.offset);
  DCHECK_LE(key_segment.length, segment.length);

  for (auto& ops : entry_ops) {
    if (ops.segment.offset == key_segment.offset) {
      DCHECK(typeid(*ops.decoder) == typeid(decoder));
      return ops;
    }
  }
  return entry_ops.emplace_back(ToOwned(id), key_segment, decoder);
}

OpManager::EntryOps* OpManager::ReadOp::Find(DiskSegment key_segment) {
  for (auto& ops : entry_ops) {
    if (ops.segment.offset == key_segment.offset)
      return &ops;
  }
  return nullptr;
}

OpManager::Stats OpManager::GetStats() const {
  return {.disk_stats = storage_.GetStats(),
          .pending_read_cnt = pending_reads_.size(),
          .pending_stash_cnt = pending_stash_ver_.size()};
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/op_manager.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>
#include <absl/container/inlined_vector.h>

#include <variant>

#include "base/function2.hpp"
#include "server/tiering/common.h"
#include "server/tiering/decoders.h"
#include "server/tiering/disk_storage.h"
#include "server/tiering/entry_map.h"
#include "util/fibers/future.h"

namespace dfly::tiering {

// Manages READ/DELETE/STASH operations on top of a DiskStorage.
// Implicitly combines reads with different offsets on the same 4kb page,
// safely schedules deletes after reads and allows cancelling pending stashes
class OpManager {
 public:
  struct Stats {
    DiskStorage::Stats disk_stats;

    size_t pending_read_cnt = 0;
    size_t pending_stash_cnt = 0;
  };

  using KeyRef = ::dfly::tiering::KeyRef;

  using PendingId = ::dfly::tiering::PendingId;

  explicit OpManager(size_t max_size);
  virtual ~OpManager();

  // Open file with underlying disk storage, must be called before use
  std::error_code Open(std::string_view file);

  void Close();

  using ReadCallback =
      fu2::function_base<true /*owns*/, false /*moveable*/, fu2::capacity_fixed<40, 8>,
                         false /* non-throwing*/, false /* strong exceptions guarantees*/,
                         void(io::Result<Decoder*>)>;

  // Enqueue callback to be executed once value is read. Trigger read if none is pending yet for
  // this segment. Multiple entries can be obtained from a single segment, but every distinct id
  // will have it's own independent callback loop that can safely modify the underlying value
  void Enqueue(PendingId id, DiskSegment segment, const Decoder& decoder, ReadCallback cb);

  // Cancel entry with pending io
  void CancelPending(PendingId id);

  // Delete offloaded entry located at the segment.
  void DeleteOffloaded(DiskSegment segment);

  auto PrepareStash(size_t length) {
    return storage_.PrepareStash(length);
  }

  // Stash value to be offloaded. It is opaque to OpManager.
  void Stash(PendingId id, tiering::DiskSegment segment, util::fb2::RegisteredSlice buf);

  // PrepareStash + Stash via function
  std::error_code PrepareAndStash(
      PendingId id, size_t length,
      const std::function<size_t /*written*/ (io::MutableBytes)>& writer);

  Stats GetStats() const;

 protected:
  using OwnedEntryId = std::variant<uintptr_t, DbKeyId>;

  // Notify that a stash succeeded and the entry was stored at the provided segment or failed with
  // given error
  virtual void NotifyStashed(const OwnedEntryId& id, const io::Result<DiskSegment>& segment) = 0;

  // Notify that an entry was successfully fetched. Includes whether entry was modified.
  // Returns true if value needs to be deleted from the storage.
  virtual bool NotifyFetched(const OwnedEntryId& id, DiskSegment segment, Decoder*) = 0;

  // Notify delete. Return true if the filled segment needs to be marked as free.
  virtual bool NotifyDelete(DiskSegment segment) = 0;

  // Describes pending read futures for a single entry
  struct EntryOps {
    EntryOps(OwnedEntryId id, DiskSegment segment, const Decoder& decoder);

    // unique identifier for the entry being read. Used to notify higher layers.
    OwnedEntryId id;

    // For multi-bin reads is a precise segment of the entry within a page.
    DiskSegment segment;

    // We may have multiple callbacks for the same entry.
    absl::InlinedVector<ReadCallback, 1> read_cbs;
    std::unique_ptr<Decoder> decoder;
    bool deleting = false;
  };

  // Describes an ongoing read operation for a fixed segment
  struct ReadOp {
    explicit ReadOp(DiskSegment segment) : segment(segment) {
    }

    // Get ops for id or create new
    EntryOps& ForSegment(DiskSegment segment, PendingId id, const Decoder& decoder);

    // Find if there are operations for the given segment, return nullptr otherwise
    EntryOps* Find(DiskSegment segment);

    DiskSegment segment;  // spanning segment of whole read

    // enqueued operations for different keys for this segment.
    // Has size() > 1 only for small-bin pages with multiple items, otherwise size() == 1.
    absl::InlinedVector<EntryOps, 1> entry_ops;
  };

  // Prepare read operation for aligned segment or return pending if it exists.
  // Refernce is valid until any other read operations occur.
  ReadOp& PrepareRead(DiskSegment aligned_segment);

  // Called once read finished
  void ProcessRead(size_t offset, io::Result<std::string_view> value);

  // Called once Stash finished
  void ProcessStashed(const OwnedEntryId& id, unsigned version,
                      const io::Result<DiskSegment>& segment);

 private:
  static OwnedEntryId ToOwned(PendingId id);
  static std::string ToString(const OwnedEntryId& id);

  DiskStorage storage_;

  // Pending read operations are keyed by the offset of their aligned segment.
  // This prevents an ABA problem in scenarios like: read (pending) → delete → stash → read.
  // After the stash, the second read targets a different segment offset, so it won't
  // interfere with the first read's pending operation, even for the same PendingId.
  absl::flat_hash_map<size_t /* offset */, ReadOp> pending_reads_;

  size_t pending_stash_counter_ = 0;

  // todo: allow heterogeneous lookups with non owned id
  absl::flat_hash_map<OwnedEntryId, unsigned /* version */> pending_stash_ver_;
};

};  // namespace dfly::tiering


================================================
FILE: src/server/tiering/op_manager_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/op_manager.h"

#include <gtest/gtest.h>

#include <memory>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_cat.h"
#include "server/tiering/common.h"
#include "server/tiering/test_common.h"
#include "util/fibers/fibers.h"
#include "util/fibers/future.h"

namespace dfly::tiering {

using namespace std;
using namespace std::string_literals;

struct TestDecoder : tiering::BareDecoder {
  std::unique_ptr<tiering::Decoder> Clone() const override {
    return std::make_unique<TestDecoder>();
  }

  void Initialize(std::string_view slice) override {
    tiering::BareDecoder::Initialize(slice);
    value = slice;
  }

  string value;
};

ostream& operator<<(ostream& os, const OpManager::Stats& stats) {
  return os << "pending_read_cnt: " << stats.pending_read_cnt
            << ", pending_stash_cnt: " << stats.pending_stash_cnt
            << ", alloc_bytes: " << stats.disk_stats.allocated_bytes
            << ", capacity_bytes: " << stats.disk_stats.capacity_bytes
            << ", heap_buf_allocs: " << stats.disk_stats.heap_buf_alloc_count
            << ", registered_buf_allocs: " << stats.disk_stats.registered_buf_alloc_count
            << ", max_file_size: " << stats.disk_stats.max_file_size
            << ", pending_ops: " << stats.disk_stats.pending_ops;
}

struct OpManagerTest : PoolTestBase, OpManager {
  OpManagerTest() : OpManager(256_MB) {
  }

  void Open() {
    EXPECT_FALSE(OpManager::Open("op_manager_test_backing"));
  }

  void Close() {
    OpManager::Close();
  }

  util::fb2::Future<std::string> Read(PendingId id, DiskSegment segment) {
    util::fb2::Future<std::string> future;
    Enqueue(id, segment, TestDecoder{}, [future](io::Result<tiering::Decoder*> res) mutable {
      auto* decoder = static_cast<TestDecoder*>(*res);
      future.Resolve(decoder->value);
    });
    return future;
  }

  void NotifyStashed(const OwnedEntryId& id, const io::Result<DiskSegment>& segment) override {
    VLOG(1) << std::get<0>(id) << " stashed";
    ASSERT_TRUE(segment);
    auto [it, inserted] = stashed_.emplace(id, *segment);
    ASSERT_TRUE(inserted);
  }

  bool NotifyFetched(const OwnedEntryId& id, DiskSegment segment, Decoder* decoder) override {
    auto* tdecoder = static_cast<TestDecoder*>(decoder);
    fetched_[id] = std::move(tdecoder->value);
    return false;
  }

  bool NotifyDelete(DiskSegment segment) override {
    return true;
  }

  std::error_code Stash(PendingId id, std::string_view value) {
    return PrepareAndStash(id, value.size(), [=](io::MutableBytes bytes) {
      memcpy(bytes.data(), value.data(), value.size());
      return value.size();
    });
  }

  void WaitForPendingStashes() {
    // Wait for both: pending_stash_cnt tracks entries awaiting version-matching IO completion,
    // but cancelled stash IOs (version-mismatched, superseded by newer stashes for the same id)
    // may still be in flight. Their callbacks free the allocated segments via MarkAsFree,
    // so we must also wait for pending_ops to drain to ensure allocated_bytes is accurate.
    while (GetStats().pending_stash_cnt > 0 || GetStats().disk_stats.pending_ops > 0)
      util::ThisFiber::SleepFor(1ms);
  }

  absl::flat_hash_map<OwnedEntryId, std::string> fetched_;
  absl::flat_hash_map<OwnedEntryId, DiskSegment> stashed_;
};

TEST_F(OpManagerTest, SimpleStashesWithReads) {
  pp_->at(0)->Await([this] {
    Open();

    for (unsigned i = 0; i < 100; i++) {
      EXPECT_FALSE(Stash(i, absl::StrCat("VALUE", i, "cancelled")));
      EXPECT_FALSE(Stash(i, absl::StrCat("VALUE", i, "cancelled")));
      EXPECT_FALSE(Stash(i, absl::StrCat("VALUE", i, "real")));
    }

    EXPECT_EQ(GetStats().pending_stash_cnt, 100);
    WaitForPendingStashes();

    EXPECT_EQ(stashed_.size(), 100u);
    EXPECT_EQ(GetStats().disk_stats.allocated_bytes, 100 * kPageSize) << GetStats();

    for (unsigned i = 0; i < 100; i++) {
      EXPECT_GE(stashed_[i].offset, i > 0);
      EXPECT_EQ(stashed_[i].length, 10 + (i > 9));
      EXPECT_EQ(Read(i, stashed_[i]).Get(), absl::StrCat("VALUE", i, "real"));
      EXPECT_EQ(fetched_.extract(i).mapped(), absl::StrCat("VALUE", i, "real"));
    }

    Close();
  });
}

TEST_F(OpManagerTest, DeleteAfterReads) {
  pp_->at(0)->Await([this] {
    Open();

    EXPECT_FALSE(Stash(0u, absl::StrCat("DATA")));
    WaitForPendingStashes();

    std::vector<util::fb2::Future<std::string>> reads;
    for (unsigned i = 0; i < 100; i++)
      reads.emplace_back(Read(0u, stashed_[0u]));
    DeleteOffloaded(stashed_[0u]);

    for (auto& fut : reads)
      EXPECT_EQ(fut.Get(), "DATA");

    Close();
  });
}

TEST_F(OpManagerTest, ReadSamePageDifferentOffsets) {
  pp_->at(0)->Await([this] {
    Open();

    // Build single numbers blob
    std::string numbers = "H";  // single padding byte to recognize it as small keys
    std::vector<DiskSegment> number_segments;
    for (size_t i = 0; i < 100; i++) {
      std::string number = std::to_string(i);
      number_segments.emplace_back(numbers.size(), number.size());
      numbers += number;
    }

    EXPECT_FALSE(Stash(0u, numbers));
    WaitForPendingStashes();

    EXPECT_EQ(stashed_[0u].offset, 0u);

    // Issue lots of concurrent reads
    std::vector<util::fb2::Future<std::string>> futures;
    for (size_t i = 0; i < 100; i++)
      futures.emplace_back(Read(std::make_pair(0, absl::StrCat("k", i)), number_segments[i]));

    for (size_t i = 0; i < 100; i++)
      EXPECT_EQ(futures[i].Get(), std::to_string(i));

    Close();
  });
}

// Test ABA scenario: stash an entry, issue an async read, delete it and re-stash a new value
// under the same id - all without yielding so the read I/O stays in flight. When I/O completes,
// version tracking in pending_stash_ver_ must ensure only the new stash triggers NotifyStashed
// while the old one is silently discarded (its segment freed).
//
// NOTE: We cannot guarantee that the first read completes after the second stash because we have
// no control over io_uring completion ordering. In practice, the read submitted first likely
// completes before or around the same time as the stash. To fully test the interleaving where
// the new entry's read is issued while the original read is still in flight, we would need a
// mock DiskStorage that allows explicit control over when I/O completions are delivered.
// TODO: Add a DiskStorage mock to enable deterministic I/O completion ordering in tests.
TEST_F(OpManagerTest, StashDeleteRestashWhileReading) {
  pp_->at(0)->Await([this] {
    Open();

    // Stash initial value under id 0
    EXPECT_FALSE(Stash(0u, "ORIGINAL"));
    WaitForPendingStashes();

    DiskSegment original_segment = stashed_.at(0u);

    // Issue an async read - don't wait on it yet so it stays in flight.
    auto read_fut = Read(0u, original_segment);

    // Without yielding: delete the entry, clear tracking, re-stash under the same id.
    // At this point the read for ORIGINAL is still pending in io_uring, and we're issuing
    // a new stash for id 0 with a bumped version.
    DeleteOffloaded(original_segment);
    stashed_.clear();
    EXPECT_FALSE(Stash(0u, "REPLACEMENT"));

    // Both the read and the new stash are now in flight. Let them complete.
    WaitForPendingStashes();
    EXPECT_EQ(read_fut.Get(), "ORIGINAL");

    // Verify only the replacement was notified (single entry in stashed_).
    ASSERT_EQ(stashed_.size(), 1u);
    ASSERT_EQ(1, stashed_.count(0u));
    DiskSegment new_segment = stashed_.at(0u);

    // Read the replacement and verify correctness
    EXPECT_EQ(Read(0u, new_segment).Get(), "REPLACEMENT");

    Close();
  });
}

TEST_F(OpManagerTest, Modify) {
  pp_->at(0)->Await([this] {
    Open();

    std::ignore = Stash(0u, "D");
    WaitForPendingStashes();

    // Atomically issue sequence of modify-read operations
    std::vector<util::fb2::Future<std::string>> futures;
    for (size_t i = 0; i < 10; i++) {
      Enqueue(0u, stashed_[0u], TestDecoder{}, [i](io::Result<tiering::Decoder*> res) {
        auto* decoder = static_cast<TestDecoder*>(*res);
        absl::StrAppend(&decoder->value, i);
      });
      futures.emplace_back(Read(0u, stashed_[0u]));
    }

    // Expect futures to resolve with correct values
    std::string expected = "D";
    for (size_t i = 0; i < futures.size(); i++) {
      absl::StrAppend(&expected, i);
      EXPECT_EQ(futures[i].Get(), expected);
    }

    Close();
  });
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/serialized_map.cc
================================================
#include "server/tiering/serialized_map.h"

#include <absl/base/internal/endian.h>

#include "base/logging.h"
#include "core/detail/listpack_wrap.h"

namespace dfly::tiering {

constexpr size_t kLenBytes = 4;

SerializedMap::Iterator& SerializedMap::Iterator::operator++() {
  slice_.remove_prefix(2 * kLenBytes + key_.size() + value_.size());
  Read();
  return *this;
}

SerializedMap::Iterator::Iterator(std::string_view buffer) : slice_{buffer} {
  Read();
}

void SerializedMap::Iterator::Read() {
  if (slice_.empty())
    return;

  uint32_t key_len = absl::little_endian::Load32(slice_.data());
  uint32_t value_len = absl::little_endian::Load32(slice_.data() + 4);
  key_ = {slice_.data() + 8, key_len};
  value_ = {slice_.data() + 8 + key_len, value_len};
}

SerializedMap::SerializedMap(std::string_view slice) {
  size_ = absl::little_endian::Load32(slice.data());
  DCHECK_GT(size_, 0u);
  slice_ = slice;
}

SerializedMap::Iterator SerializedMap::Find(std::string_view key) const {
  return std::find_if(begin(), end(), [key](auto p) { return p.first == key; });
}

SerializedMap::Iterator SerializedMap::begin() const {
  return Iterator{slice_.substr(kLenBytes)};
}

SerializedMap::Iterator SerializedMap::end() const {
  return Iterator{slice_.substr(slice_.size(), 0)};
}

size_t SerializedMap::size() const {
  return size_;
}

size_t SerializedMap::DataBytes() const {
  return slice_.size() - 4 - size() * 2 * 4;
}

size_t SerializedMap::EstimateSize(size_t data_bytes, size_t entries) {
  return kLenBytes /* entry number */ + data_bytes + entries * 2 * kLenBytes /* string lengths */;
}

size_t SerializedMap::Serialize(const detail::ListpackWrap& lw, absl::Span<char> buffer) {
  DCHECK_GE(buffer.size(), EstimateSize(lw.UsedBytes(), lw.size()));

  char* ptr = buffer.data();
  absl::little_endian::Store32(ptr, lw.size());
  ptr += kLenBytes;

  for (const auto& [key, value] : lw) {
    absl::little_endian::Store32(ptr, key.length());
    ptr += kLenBytes;
    absl::little_endian::Store32(ptr, value.length());
    ptr += kLenBytes;
    memcpy(ptr, key.data(), key.length());
    ptr += key.length();
    memcpy(ptr, value.data(), value.length());
    ptr += value.length();
  }

  return ptr - buffer.data();
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/serialized_map.h
================================================
#pragma once

#include <absl/types/span.h>

#include <string_view>

namespace dfly::detail {
struct ListpackWrap;
}

namespace dfly::tiering {

// Map built over single continuous byte slice to allow easy read operations.
struct SerializedMap {
  struct Iterator {
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = std::pair<std::string_view, std::string_view>;
    using reference = value_type;
    using pointer = value_type*;

    Iterator& operator++();

    bool operator==(const Iterator& other) const {
      return slice_.data() == other.slice_.data() && slice_.size() == other.slice_.size();
    }

    bool operator!=(const Iterator& other) const {
      return !operator==(other);
    }

    std::pair<std::string_view, std::string_view> operator*() const {
      return {key_, value_};
    }

   private:
    friend struct SerializedMap;

    explicit Iterator(std::string_view buffer);
    void Read();

    std::string_view slice_;  // the part left
    std::string_view key_, value_;
  };

  explicit SerializedMap(std::string_view slice);

  Iterator Find(std::string_view key) const;  // Linear search
  Iterator begin() const;
  Iterator end() const;
  size_t size() const;

  // Number of bytes of pure keys or values
  size_t DataBytes() const;

  // Estimate upper bound for serialization size
  static size_t EstimateSize(size_t data_bytes, size_t entries);

  // Write a slice that can be used to a SerializedMap on top of it.
  // Returns number of bytes written
  static size_t Serialize(const ::dfly::detail::ListpackWrap& lw, absl::Span<char> buffer);

 private:
  size_t size_;
  std::string_view slice_;
};

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/serialized_map_test.cc
================================================
#include "server/tiering/serialized_map.h"

#include <mimalloc.h>

#include <map>

#include "base/logging.h"
#include "core/detail/listpack_wrap.h"
#include "gmock/gmock.h"

extern "C" {
#include "redis/zmalloc.h"
}

namespace dfly::tiering {

using namespace std;

struct SerializedMapTest : public ::testing::Test {
  static void SetUpTestSuite() {
    init_zmalloc_threadlocal(mi_heap_get_backing());  // to use ListpackWrap
  }
};

TEST_F(SerializedMapTest, TestBasic) {
  const vector<std::pair<string, string>> kBase = {{"first key", "first value"},
                                                   {"second key", "second value"},
                                                   {"third key", "third value"},
                                                   {"fourth key", "fourth value"},
                                                   {"fifth key", "fifth value"}};
  auto lw = detail::ListpackWrap::WithCapacity(100);
  for (const auto& [k, v] : kBase)
    lw.Insert(k, v, false);
  lw.GetPointer();  // to mark as non dirty // TODO: remove

  // Serialize kBase to buffer
  std::string buffer;
  buffer.resize(SerializedMap::EstimateSize(lw.UsedBytes(), lw.size()));
  size_t written = SerializedMap::Serialize(lw, absl::MakeSpan(buffer));
  EXPECT_GT(written, 0u);
  buffer.resize(written);

  // Build map over buffer and check size
  SerializedMap map{buffer};
  EXPECT_EQ(map.size(), kBase.size());

  // Check entries
  size_t idx = 0;
  for (auto it = map.begin(); it != map.end(); ++it, ++idx) {
    EXPECT_EQ((*it).first, kBase[idx].first);
    EXPECT_EQ((*it).second, kBase[idx].second);
  }
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/small_bins.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/small_bins.h"

#include <algorithm>
#include <optional>
#include <utility>

#include "absl/base/internal/endian.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "server/tiering/common.h"
#include "server/tiering/disk_storage.h"

namespace dfly::tiering {
using namespace std;

namespace {

// See FlushBin() for format details
size_t StashedValueSize(string_view value) {
  return 2 /* dbid */ + 8 /* hash */ + 2 /* strlen*/ + value.size();
}

}  // namespace

std::optional<SmallBins::FilledBin> SmallBins::Stash(DbIndex dbid, std::string_view key,
                                                     std::string_view value) {
  DCHECK_LT(value.size(), 2_KB);

  size_t value_bytes = StashedValueSize(value);

  std::optional<FilledBin> filled_bin;
  if (2 /* num entries */ + current_bin_.bytes_ + value_bytes >= kPageSize) {
    filled_bin = exchange(current_bin_, FilledBin{++last_bin_id_});
  }

  current_bin_.bytes_ += value_bytes;
  auto [it, inserted] = current_bin_.entries_.emplace(std::make_pair(dbid, key), string(value));
  CHECK(inserted);

  return filled_bin;
}

size_t SmallBins::SerializeBin(FilledBin* bin, io::MutableBytes dest) {
  DCHECK_GT(bin->entries_.size(), 0u);
  DCHECK_GE(dest.size(), 4_KB);

  auto& pending_set = pending_bins_[bin->id];
  uint8_t* data = dest.data();

  // Store number of entries, 2 bytes
  absl::little_endian::Store16(data, bin->entries_.size());
  data += sizeof(uint16_t);

  // Store all dbids and hashes, n * 10 bytes
  for (const auto& [key, _] : bin->entries_) {
    absl::little_endian::Store16(data, key.first);
    data += sizeof(DbIndex);

    absl::little_endian::Store64(data, CompactObj::HashCode(key.second));
    data += sizeof(uint64_t);
  }

  // Store all values with sizes, n * (2 + x) bytes
  for (const auto& [key, value] : bin->entries_) {
    absl::little_endian::Store16(data, value.size());
    data += sizeof(uint16_t);

    pending_set[key] = {size_t(data - dest.data()), value.size()};
    memcpy(data, value.data(), value.size());
    data += value.size();
  }

  // Steal backing array from bin if relevant
  if (current_bin_.entries_.empty()) {
    // erase doesn't shrink backing, so we can reuse the allocated capacity
    bin->entries_.erase(bin->entries_.begin(), bin->entries_.end());
    current_bin_.entries_ = std::move(bin->entries_);
  }

  return bin->bytes_ + 2;
}

SmallBins::KeySegmentList SmallBins::ReportStashed(BinId id, DiskSegment segment) {
  DVLOG(1) << "ReportStashed " << id;

  DCHECK(pending_bins_.contains(id));
  auto seg_map_node = pending_bins_.extract(id);
  const auto& seg_map = seg_map_node.mapped();
  DCHECK_GT(seg_map.size(), 0u) << id;

  uint16_t bytes = 0;
  SmallBins::KeySegmentList list;
  for (auto& [key, sub_segment] : seg_map) {
    bytes += sub_segment.length;

    DiskSegment real_segment{segment.offset + sub_segment.offset, sub_segment.length};
    list.emplace_back(key.first, key.second, real_segment);
  }

  stats_.stashed_entries_cnt += list.size();
  stashed_bins_[segment.offset] = {uint8_t(list.size()), bytes};
  return list;
}

std::vector<std::pair<DbIndex, std::string>> SmallBins::ReportStashAborted(BinId id) {
  std::vector<std::pair<DbIndex, std::string>> out;

  auto node = pending_bins_.extract(id);
  auto& entries = node.mapped();
  while (!entries.empty())
    out.emplace_back(std::move(entries.extract(entries.begin()).key()));

  return out;
}

std::optional<SmallBins::BinId> SmallBins::Delete(DbIndex dbid, std::string_view key) {
  auto& entries = current_bin_.entries_;
  if (auto it = entries.find(make_pair(dbid, key)); it != entries.end()) {
    size_t stashed_size = StashedValueSize(it->second);
    DCHECK_GE(current_bin_.bytes_, stashed_size);

    current_bin_.bytes_ -= stashed_size;
    entries.erase(it);
    return std::nullopt;
  }

  for (auto& [id, keys] : pending_bins_) {
    if (keys.erase(make_pair(dbid, key)))
      return keys.empty() ? std::make_optional(id) : std::nullopt;
  }
  return std::nullopt;
}

SmallBins::BinInfo SmallBins::Delete(DiskSegment segment) {
  auto full_segment = segment.ContainingPages();
  if (auto it = stashed_bins_.find(full_segment.offset); it != stashed_bins_.end()) {
    stats_.stashed_entries_cnt--;
    auto& bin = it->second;

    DCHECK_LE(segment.length, bin.bytes);
    bin.bytes -= segment.length;

    if (--bin.entries == 0) {
      DCHECK_EQ(bin.bytes, 0u);
      stashed_bins_.erase(it);
      return {full_segment, false /* fragmented */, true /* empty */};
    }

    if (bin.bytes < kPageSize / 2) {
      return {full_segment, true /* fragmented */, false /* empty */};
    }
  }

  return {segment};
}

SmallBins::Stats SmallBins::GetStats() const {
  return Stats{.stashed_bins_cnt = stashed_bins_.size(),
               .stashed_entries_cnt = stats_.stashed_entries_cnt,
               .current_bin_bytes = current_bin_.bytes_,
               .current_entries_cnt = current_bin_.entries_.size()};
}

SmallBins::KeyHashDbList SmallBins::DeleteBin(DiskSegment segment, std::string_view value) {
  DCHECK_EQ(value.size(), kPageSize);

  auto bin = stashed_bins_.extract(segment.offset);
  if (bin.empty())
    return {};

  stats_.stashed_entries_cnt -= bin.mapped().entries;

  const char* data = value.data();

  uint16_t entries = absl::little_endian::Load16(data);
  data += sizeof(uint16_t);

  KeyHashDbList out(entries);

  // Recover dbids and hashes
  for (size_t i = 0; i < entries; i++) {
    DbIndex dbid = absl::little_endian::Load16(data);
    data += sizeof(DbIndex);

    uint64_t hash = absl::little_endian::Load64(data);
    data += sizeof(hash);

    out[i] = {dbid, hash, {0, 0}};
  }

  // Recover segments
  for (size_t i = 0; i < entries; i++) {
    uint16_t length = absl::little_endian::Load16(data);
    data += sizeof(uint16_t);

    std::get<DiskSegment>(out[i]) = {segment.offset + (data - value.data()), length};
    data += length;
  }

  return out;
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/small_bins.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/container/flat_hash_map.h>

#include <optional>
#include <string>
#include <vector>

#include "server/tiering/disk_storage.h"
#include "server/tiering/entry_map.h"

namespace dfly::tiering {

using DbIndex = uint16_t;

// Small bins accumulate small values into larger bins that fill up 4kb pages.
// SIMPLEST VERSION for now.
class SmallBins {
 public:
  struct Stats {
    size_t stashed_bins_cnt = 0;
    size_t stashed_entries_cnt = 0;
    size_t current_bin_bytes = 0;
    size_t current_entries_cnt = 0;
  };

  using BinId = unsigned;
  static const BinId kInvalidBin = std::numeric_limits<BinId>::max();

  struct BinInfo {
    DiskSegment segment;
    bool fragmented = false, empty = false;
  };

  // Packaged bin ready to be serialized with SerializeBin()
  struct FilledBin {
    friend class SmallBins;
    BinId id;

   private:
    explicit FilledBin(BinId id) : id{id} {
    }

    unsigned bytes_ = 0;
    tiering::EntryMap<std::string> entries_;
  };

  // List of locations of values for corresponding keys of previously filled bin
  using KeySegmentList = std::vector<std::tuple<DbIndex, std::string /* key*/, DiskSegment>>;

  // List of item key db indices and hashes
  using KeyHashDbList = std::vector<std::tuple<DbIndex, uint64_t /* hash */, DiskSegment>>;

  // Returns true if the entry is pending inside SmallBins.
  bool IsPending(DbIndex dbid, std::string_view key) const {
    return current_bin_.entries_.count(std::make_pair(dbid, key)) > 0;
  }

  // Enqueue key/value pair for stash. Returns page to be stashed if it filled up.
  std::optional<FilledBin> Stash(DbIndex dbid, std::string_view key, std::string_view value);

  // Report that a stash succeeeded. Returns list of stored keys with calculated value locations.
  KeySegmentList ReportStashed(BinId id, DiskSegment segment);

  // Report that a stash was aborted. Returns list of keys that the entry contained.
  std::vector<std::pair<DbIndex, std::string>> ReportStashAborted(BinId id);

  // Delete a key with pending io. Returns entry id if needs to be deleted.
  std::optional<BinId> Delete(DbIndex dbid, std::string_view key);

  // Delete a stored segment. Returns information about the current bin, which might indicate
  // the need for external actions like deleting empty segments or triggering defragmentation
  BinInfo Delete(DiskSegment segment);

  // Delete stashed bin. Returns list of recovered item key hashes and db indices.
  // Mainly used for defragmentation
  KeyHashDbList DeleteBin(DiskSegment segment, std::string_view value);

  // Serialize filled bin to destination buffer (4kb)
  size_t SerializeBin(FilledBin* bin, io::MutableBytes dest);

  Stats GetStats() const;

 private:
  struct StashInfo {
    uint8_t entries = 0;
    uint16_t bytes = 0;
  };
  static_assert(sizeof(StashInfo) == sizeof(unsigned));

  BinId last_bin_id_ = 0;
  FilledBin current_bin_{last_bin_id_};

  // Pending stashes, their keys and value sizes
  absl::flat_hash_map<unsigned /* id */, tiering::EntryMap<DiskSegment>> pending_bins_;

  // Map of bins that were stashed and should be deleted when number of entries reaches 0
  absl::flat_hash_map<size_t /*offset*/, StashInfo> stashed_bins_;

  struct {
    size_t stashed_entries_cnt = 0;
  } stats_;
};

};  // namespace dfly::tiering


================================================
FILE: src/server/tiering/small_bins_test.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tiering/small_bins.h"

#include <absl/strings/str_cat.h>

#include <algorithm>

#include "base/gtest.h"
#include "base/logging.h"
#include "server/tiering/disk_storage.h"

namespace dfly::tiering {

using namespace std;
using namespace std::string_literals;

string SmallString(size_t len) {
  return string(len, 'a');
}

struct SmallBinsTest : public ::testing::Test {
  std::pair<tiering::SmallBins::BinId, std::string> Serialize(SmallBins::FilledBin& bin) {
    std::string out(4_KB, 'c');
    size_t written = bins_.SerializeBin(&bin, {reinterpret_cast<uint8_t*>(out.data()), out.size()});
    out.resize(written);
    return {bin.id, out};
  }

 protected:
  SmallBins bins_;
};

TEST_F(SmallBinsTest, SimpleStashRead) {
  // Fill single bin
  std::optional<SmallBins::FilledBin> bin;
  for (unsigned i = 0; !bin; i++)
    bin = bins_.Stash(0, absl::StrCat("k", i), absl::StrCat("v", i));
  auto [id, data] = Serialize(*bin);

  // Verify cut locations point to correct values
  auto segments = bins_.ReportStashed(id, DiskSegment{0, 4_KB});
  for (auto [dbid, key, location] : segments) {
    auto value = "v"s + key.substr(1);
    EXPECT_EQ(value, data.substr(location.offset, location.length));
  }
}

TEST_F(SmallBinsTest, SimpleDeleteAbort) {
  SmallBins bins;

  // Fill single bin
  std::optional<SmallBins::FilledBin> bin;
  unsigned i = 0;
  for (; !bin; i++)
    bin = bins_.Stash(0, absl::StrCat("k", i), absl::StrCat("v", i));
  auto [id, data] = Serialize(*bin);

  // Delete all even values
  for (unsigned j = 0; j <= i; j += 2)
    bins_.Delete(0, absl::StrCat("k", j));

  auto remaining = bins_.ReportStashAborted(id);
  sort(remaining.begin(), remaining.end());

  // Expect all odd keys still to exist
  EXPECT_EQ(remaining.size(), i / 2);
  for (unsigned j = 1; j < i; j += 2) {
    std::pair<DbIndex, std::string> needle{0, absl::StrCat("k", j)};
    EXPECT_TRUE(binary_search(remaining.begin(), remaining.end(), needle)) << j;
  }
}

TEST_F(SmallBinsTest, PartialStashDelete) {
  // Fill single bin
  std::optional<SmallBins::FilledBin> bin;
  unsigned i = 0;
  for (; !bin; i++)
    bin = bins_.Stash(0, absl::StrCat("k", i), absl::StrCat("v", i));
  auto [id, data] = Serialize(*bin);

  // Delete all even values
  for (unsigned j = 0; j <= i; j += 2)
    bins_.Delete(0, absl::StrCat("k", j));

  auto segments = bins_.ReportStashed(id, DiskSegment{0, 4_KB});

  // Expect all odd keys still to exist
  EXPECT_EQ(segments.size(), i / 2);
  for (auto& [dbid, key, segment] : segments) {
    EXPECT_EQ(key, "k"s + data.substr(segment.offset, segment.length).substr(1));
  }

  // Delete all stashed values
  while (!segments.empty()) {
    auto segment = std::get<2>(segments.back());
    segments.pop_back();
    auto bin = bins_.Delete(segment);

    EXPECT_EQ(bin.segment.offset, 0u);
    EXPECT_EQ(bin.segment.length, 4_KB);

    if (segments.empty()) {
      EXPECT_TRUE(bin.empty);
    } else {
      EXPECT_TRUE(bin.fragmented);  // half of the values were deleted
    }
  }
}

TEST_F(SmallBinsTest, UpdateStatsAfterDelete) {
  // caused https://github.com/dragonflydb/dragonfly/issues/3240
  for (unsigned i = 0; i < 10; i++) {
    auto spilled_bin = bins_.Stash(0, absl::StrCat("k", i), SmallString(128));
    ASSERT_FALSE(spilled_bin);
  }

  EXPECT_GT(bins_.GetStats().current_bin_bytes, 128 * 10);
  for (unsigned i = 0; i < 10; i++) {
    auto res = bins_.Delete(0, absl::StrCat("k", i));
    ASSERT_FALSE(res);
  }
  EXPECT_EQ(0u, bins_.GetStats().current_bin_bytes);
}

}  // namespace dfly::tiering


================================================
FILE: src/server/tiering/test_common.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <memory>

#include "base/gtest.h"
#include "base/logging.h"
#include "util/fibers/fibers.h"
#include "util/fibers/pool.h"

namespace dfly::tiering {

class PoolTestBase : public testing::Test {
 protected:
  void SetUp() override {
    pp_.reset(util::fb2::Pool::IOUring(16, 2));
    pp_->Run();
  }

  void TearDown() override {
    pp_->Stop();
    pp_.reset();
  }

  std::unique_ptr<util::ProactorPool> pp_;
};

}  // namespace dfly::tiering


================================================
FILE: src/server/transaction.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/transaction.h"

#include <absl/strings/match.h>

#include <new>

#include "base/flags.h"
#include "base/logging.h"
#include "facade/facade_stats.h"
#include "facade/op_status.h"
#include "redis/redis_aux.h"
#include "server/blocking_controller.h"
#include "server/command_registry.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/namespaces.h"
#include "server/server_state.h"

ABSL_FLAG(uint32_t, tx_queue_warning_len, 96,
          "Length threshold for warning about long transaction queue");

namespace dfly {

using namespace std;
using namespace util;
using absl::StrCat;

thread_local Transaction::TLTmpSpace Transaction::tmp_space;

namespace {

// Global txid sequence
atomic_uint64_t op_seq{1};

constexpr size_t kTransSize [[maybe_unused]] = sizeof(Transaction);

void AnalyzeTxQueue(const EngineShard* shard, const TxQueue* txq) {
  unsigned q_limit = absl::GetFlag(FLAGS_tx_queue_warning_len);
  if (txq->size() > q_limit) {
    static thread_local time_t last_log_time = 0;
    // TODO: glog provides LOG_EVERY_T, which uses precise clock.
    // We should introduce inside helio LOG_PERIOD_ATLEAST macro that takes seconds and
    // uses low precision clock.
    time_t now = time(nullptr);
    if (now >= last_log_time + 10) {
      last_log_time = now;
      EngineShard::TxQueueInfo info = shard->AnalyzeTxQueue();
      string msg = StrCat("TxQueue is too long. ", info.Format());
      absl::StrAppend(&msg, "poll_executions:", shard->stats().poll_execution_total);

      const Transaction* cont_tx = shard->GetContTx();
      if (cont_tx) {
        absl::StrAppend(&msg, " continuation_tx: ", cont_tx->DebugId(shard->shard_id()), " ",
                        cont_tx->DEBUG_IsArmedInShard(shard->shard_id()) ? " armed" : "");
      }

      LOG(WARNING) << msg;
    }
  }
}

void RecordTxScheduleStats(const Transaction* tx) {
  auto* ss = ServerState::tlocal();
  ++(tx->IsGlobal() ? ss->stats.tx_global_cnt : ss->stats.tx_normal_cnt);
  ++ss->stats.tx_width_freq_arr[tx->GetUniqueShardCnt() - 1];
}

std::ostream& operator<<(std::ostream& os, Transaction::time_point tp) {
  using namespace chrono;
  if (tp == Transaction::time_point::max())
    return os << "inf";
  size_t ms = duration_cast<milliseconds>(tp - Transaction::time_point::clock::now()).count();
  return os << ms << "ms";
}

uint16_t trans_id(const Transaction* ptr) {
  return (intptr_t(ptr) >> 8) & 0xFFFF;
}

struct ScheduleContext {
  Transaction* trans;
  bool optimistic_execution = false;

  std::atomic<ScheduleContext*> next{nullptr};

  std::atomic_uint32_t fail_cnt{0};

  ScheduleContext(Transaction* t, bool optimistic) : trans(t), optimistic_execution(optimistic) {
  }
};

constexpr size_t kAvoidFalseSharingSize = 64;
struct ScheduleQ {
  alignas(kAvoidFalseSharingSize) base::MPSCIntrusiveQueue<ScheduleContext> queue;
  alignas(kAvoidFalseSharingSize) atomic_bool armed{false};
};

void MPSC_intrusive_store_next(ScheduleContext* dest, ScheduleContext* next_node) {
  dest->next.store(next_node, std::memory_order_relaxed);
}

ScheduleContext* MPSC_intrusive_load_next(const ScheduleContext& src) {
  return src.next.load(std::memory_order_acquire);
}

// of shard_num arity.
ScheduleQ* schedule_queues = nullptr;

}  // namespace

bool Transaction::BatonBarrier::IsClaimed() const {
  return claimed_.load(memory_order_relaxed);
}

bool Transaction::BatonBarrier::TryClaim() {
  return !claimed_.exchange(true, memory_order_relaxed);  // false means first means success
}

void Transaction::BatonBarrier::Close() {
  DCHECK(claimed_.load(memory_order_relaxed));
  closed_.store(true, memory_order_relaxed);
  ec_.notify();  // release
}

cv_status Transaction::BatonBarrier::Wait(time_point tp) {
  auto cb = [this] { return closed_.load(memory_order_acquire); };

  if (tp != time_point::max()) {
    // Wait until timepoint and return immediately if we finished without a timeout
    if (ec_.await_until(cb, tp) == cv_status::no_timeout)
      return cv_status::no_timeout;

    // We timed out and claimed the barrier, so no one will be able to claim it anymore
    if (TryClaim()) {
      closed_.store(true, memory_order_relaxed);  // Purely formal
      return cv_status::timeout;
    }

    // fallthrough: otherwise a modification is in progress, wait for it below
  }

  ec_.await(cb);
  return cv_status::no_timeout;
}

Transaction::Guard::Guard(Transaction* tx) : tx(tx) {
  DCHECK(tx->cid_->opt_mask() & CO::GLOBAL_TRANS);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(false);
    return OpStatus::OK;
  };
  tx->Execute(cb, false);
}

Transaction::Guard::~Guard() {
  auto cb = [&](Transaction* t, EngineShard* shard) {
    namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(true);
    return OpStatus::OK;
  };
  tx->Execute(cb, true);
  tx->Refurbish();
}

void Transaction::Init(unsigned num_shards) {
  DCHECK(schedule_queues == nullptr);
  schedule_queues = new ScheduleQ[num_shards];
}

void Transaction::Shutdown() {
  DCHECK(schedule_queues);
  delete[] schedule_queues;
  schedule_queues = nullptr;
}

Transaction::Transaction(const CommandId* cid) : cid_{cid} {
  InitTxTime();
  string_view cmd_name(cid_->name());
  if (cmd_name == "EXEC" || cmd_name == "EVAL" || cmd_name == "EVAL_RO" || cmd_name == "EVALSHA" ||
      cmd_name == "EVALSHA_RO") {
    multi_.reset(new MultiData);
    multi_->mode = NOT_DETERMINED;
    multi_->role = DEFAULT;
  }
}

Transaction::Transaction(const Transaction* parent, ShardId shard_id, std::optional<SlotId> slot_id)
    : multi_{make_unique<MultiData>()},
      txid_{parent->txid()},
      unique_shard_cnt_{1},
      unique_shard_id_{shard_id} {
  if (parent->multi_) {
    multi_->mode = parent->multi_->mode;
  } else {
    // Use squashing mechanism for inline execution of single-shard EVAL
    multi_->mode = LOCK_AHEAD;
  }

  multi_->role = SQUASHED_STUB;

  MultiUpdateWithParent(parent);
  if (slot_id.has_value()) {
    unique_slot_checker_.Add(*slot_id);
  }
}

Transaction::~Transaction() {
  DVLOG(3) << "Transaction " << StrCat(Name(), "@", txid_, "/", unique_shard_cnt_, ")")
           << " destroyed";
}

void Transaction::InitBase(Namespace* ns, DbIndex dbid, CmdArgList args) {
  global_ = false;
  db_index_ = dbid;
  full_args_ = args;
  local_result_ = OpStatus::OK;
  stats_.coordinator_index = ProactorBase::me() ? ProactorBase::me()->GetPoolIndex() : kInvalidSid;

  // Namespace is read by poll execution, so it can't be changed on the fly
  if (IsScheduled()) {
    DCHECK_EQ(namespace_, ns);
  } else {
    DCHECK(namespace_ == nullptr || namespace_ == ns);
    namespace_ = ns;
  }
}

void Transaction::InitGlobal() {
  DCHECK(!multi_ || (multi_->mode == GLOBAL || multi_->mode == NON_ATOMIC));

  global_ = true;
  EnableAllShards();
}

void Transaction::BuildShardIndex(const KeyIndex& key_index, std::vector<PerShardCache>* out) {
  // Because of the way we iterate in InitShardData
  DCHECK(!key_index.bonus || key_index.step == 1);

  auto& shard_index = *out;
  for (unsigned i : key_index.Range()) {
    string_view key = ArgS(full_args_, i);
    unique_slot_checker_.Add(key);
    ShardId sid = Shard(key, shard_data_.size());

    unsigned step = key_index.bonus ? 1 : key_index.step;
    shard_index[sid].key_step = step;
    auto& slices = shard_index[sid].slices;
    if (!slices.empty() && slices.back().second == i) {
      slices.back().second = i + step;
    } else {
      slices.emplace_back(i, i + step);
    }
  }
}

void Transaction::InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args) {
  args_slices_.reserve(num_args);
  DCHECK(kv_fp_.empty());
  kv_fp_.reserve(num_args);

  // Store the concatenated per-shard arguments from the shard index inside kv_args_
  // and make each shard data point to its own sub-span inside kv_args_.
  for (size_t i = 0; i < shard_data_.size(); ++i) {
    auto& sd = shard_data_[i];
    const auto& src = shard_index[i];

    sd.slice_count = src.slices.size();
    sd.slice_start = args_slices_.size();
    sd.fp_start = kv_fp_.size();
    sd.fp_count = 0;

    // Multi transactions can re-initialize on different shards, so clear ACTIVE flag.
    DCHECK_EQ(sd.local_mask & ACTIVE, 0);

    if (sd.slice_count == 0)
      continue;

    sd.local_mask |= ACTIVE;

    unique_shard_cnt_++;
    unique_shard_id_ = i;

    for (const auto& [start, end] : src.slices) {
      args_slices_.emplace_back(start, end);
      for (string_view key : KeyIndex(start, end, src.key_step).Range(full_args_)) {
        kv_fp_.push_back(LockTag(key).Fingerprint());
        sd.fp_count++;
      }
    }
  }
}

void Transaction::PrepareMultiFps(CmdArgList keys) {
  DCHECK_EQ(multi_->mode, LOCK_AHEAD);
  DCHECK_GT(keys.size(), 0u);

  auto& tag_fps = multi_->tag_fps;

  tag_fps.reserve(keys.size());
  for (string_view str : keys) {
    ShardId sid = Shard(str, shard_set->size());
    tag_fps.emplace(sid, LockTag(str).Fingerprint());
  }
}

void Transaction::StoreKeysInArgs(const KeyIndex& key_index) {
  DCHECK(kv_fp_.empty());
  DCHECK(args_slices_.empty());

  // even for a single key we may have multiple arguments per key (MSET).
  if (key_index.bonus)
    args_slices_.emplace_back(*key_index.bonus, *key_index.bonus + 1);
  args_slices_.emplace_back(key_index.start, key_index.end);

  for (string_view key : key_index.Range(full_args_))
    kv_fp_.push_back(LockTag(key).Fingerprint());
}

void Transaction::InitByKeys(const KeyIndex& key_index) {
  // Skip initialization for key-dependent transactions without keys
  if ((key_index.end - key_index.start) + int(bool(key_index.bonus)) == 0)
    return;

  DCHECK_LT(key_index.start, full_args_.size());

  // Stub transactions always operate only on single shard.
  bool is_stub = multi_ && multi_->role == SQUASHED_STUB;

  unique_slot_checker_.Reset();
  if ((key_index.NumArgs() == 1 && !IsAtomicMulti()) || is_stub) {
    DCHECK(!IsActiveMulti() || multi_->mode == NON_ATOMIC);

    // We don't have to split the arguments by shards, so we can copy them directly.
    StoreKeysInArgs(key_index);

    unique_shard_cnt_ = 1;
    string_view akey = full_args_[*key_index];

    if (is_stub)  // stub transactions don't migrate
      DCHECK_EQ(unique_shard_id_, Shard(akey, shard_set->size()));
    else {
      unique_slot_checker_.Add(akey);
      unique_shard_id_ = Shard(akey, shard_set->size());
    }

    // Multi transactions that execute commands on their own (not stubs) can't shrink the backing
    // array, as it still might be read by leftover callbacks.
    shard_data_.resize(IsActiveMulti() ? shard_set->size() : 1);
    shard_data_[SidToId(unique_shard_id_)].local_mask |= ACTIVE;

    return;
  }

  shard_data_.resize(shard_set->size());  // shard_data isn't sparse, so we must allocate for all :(
  DCHECK_EQ(full_args_.size() % key_index.step, 0u) << full_args_;

  // Safe, because flow below is not preemptive.
  auto& shard_index = tmp_space.GetShardIndex(shard_data_.size());

  // Distribute all the arguments by shards.
  BuildShardIndex(key_index, &shard_index);

  // Initialize shard data based on distributed arguments.
  InitShardData(shard_index, key_index.NumArgs());

  DCHECK(!multi_ || multi_->mode != LOCK_AHEAD || !multi_->tag_fps.empty());

  DVLOG(1) << "InitByArgs " << DebugId() << facade::ToSV(full_args_.front());

  // Compress shard data, if we occupy only one shard.
  if (unique_shard_cnt_ == 1) {
    PerShardData* sd;
    if (IsActiveMulti()) {
      sd = &shard_data_[SidToId(unique_shard_id_)];
      DCHECK(sd->local_mask & ACTIVE);
    } else {
      shard_data_.resize(1);
      sd = &shard_data_.front();
      sd->local_mask |= ACTIVE;
    }
    sd->slice_count = -1;
    sd->slice_start = -1;
  }

  // Validation.
  for (const auto& sd : shard_data_) {
    // sd.local_mask may be non-zero for multi transactions with instant locking.
    // Specifically EVALs may maintain state between calls.
    DCHECK(!sd.is_armed.load(memory_order_relaxed));
    if (!multi_) {
      DCHECK_EQ(TxQueue::kEnd, sd.pq_pos);
    }
  }
}

OpStatus Transaction::InitByArgs(Namespace* ns, DbIndex index, CmdArgList args) {
  InitBase(ns, index, args);

  if ((cid_->opt_mask() & CO::GLOBAL_TRANS) > 0) {
    InitGlobal();
    return OpStatus::OK;
  }

  if ((cid_->opt_mask() & CO::NO_KEY_TRANSACTIONAL) > 0) {
    if (((cid_->opt_mask() & CO::NO_KEY_TX_SPAN_ALL) > 0)) {
      EnableAllShards();
    } else {
      EnableShard(0);
    }

    return OpStatus::OK;
  }

  DCHECK_EQ(unique_shard_cnt_, 0u);
  DCHECK(args_slices_.empty());
  DCHECK(kv_fp_.empty());

  OpResult<KeyIndex> key_index = DetermineKeys(cid_, args);
  if (!key_index)
    return key_index.status();

  InitByKeys(*key_index);
  return OpStatus::OK;
}

void Transaction::PrepareSquashedMultiHop(const CommandId* cid,
                                          absl::FunctionRef<bool(ShardId)> enabled) {
  CHECK(multi_->mode == GLOBAL || multi_->mode == LOCK_AHEAD);

  MultiSwitchCmd(cid);

  InitBase(namespace_, db_index_, {});

  // Because squashing already determines active shards by partitioning commands,
  // we don't have to work with keys manually and can just mark active shards.
  // The partitioned commands know it's keys and assume they have correct access.
  DCHECK_EQ(shard_data_.size(), shard_set->size());
  for (unsigned i = 0; i < shard_data_.size(); i++) {
    if (enabled(i)) {
      shard_data_[i].local_mask |= ACTIVE;
      unique_shard_cnt_++;
      unique_shard_id_ = i;
    } else {
      shard_data_[i].local_mask &= ~ACTIVE;
    }
    shard_data_[i].slice_start = 0;
    shard_data_[i].slice_count = 0;
  }

  MultiBecomeSquasher();
}

void Transaction::StartMultiGlobal(Namespace* ns, DbIndex dbid) {
  CHECK(multi_);
  CHECK(shard_data_.empty());  // Make sure default InitByArgs didn't run.

  multi_->mode = GLOBAL;
  InitBase(ns, dbid, {});
  InitGlobal();
  multi_->lock_mode = IntentLock::EXCLUSIVE;

  ScheduleInternal();
}

void Transaction::StartMultiLockedAhead(Namespace* ns, DbIndex dbid, CmdArgList keys,
                                        bool skip_scheduling) {
  DVLOG(1) << "StartMultiLockedAhead on " << keys.size() << " keys";

  DCHECK(multi_);
  DCHECK(shard_data_.empty());  // Make sure default InitByArgs didn't run.

  multi_->mode = LOCK_AHEAD;
  multi_->lock_mode = LockMode();

  PrepareMultiFps(keys);

  InitBase(ns, dbid, keys);
  InitByKeys(KeyIndex(0, keys.size()));

  if (!skip_scheduling)
    ScheduleInternal();

  full_args_ = {};  // InitBase set it to temporary keys, now we reset it.
}

void Transaction::StartMultiNonAtomic() {
  DCHECK(multi_);
  multi_->mode = NON_ATOMIC;
}

void Transaction::InitTxTime() {
  time_now_ms_ = GetCurrentTimeMs();
}

void Transaction::MultiSwitchCmd(const CommandId* cid) {
  DCHECK(multi_);
  DCHECK(!cb_ptr_);

  multi_->cmd_seq_num++;

  if (multi_->role != SQUASHED_STUB)  // stub transactions don't migrate between threads
    unique_shard_id_ = 0;
  unique_shard_cnt_ = 0;

  args_slices_.clear();
  kv_fp_.clear();

  cid_ = cid;
  re_enabled_auto_journal_ = false;
  cb_ptr_.reset();

  for (auto& sd : shard_data_) {
    sd.slice_count = sd.slice_start = 0;
    sd.fp_start = sd.fp_count = 0;  // Reset fingerprints span as kv_fp_ was cleared above.

    if (multi_->mode == NON_ATOMIC) {
      sd.local_mask = 0;  // Non atomic transactions schedule each time, so remove all flags
      DCHECK_EQ(sd.pq_pos, TxQueue::kEnd);
    } else {
      DCHECK(IsAtomicMulti());   // Every command determines it's own active shards
      sd.local_mask &= ~ACTIVE;  // so remove ACTIVE flags, but keep KEYLOCK_ACQUIRED
    }
    DCHECK(!sd.is_armed.load(memory_order_relaxed));
  }

  if (multi_->mode == NON_ATOMIC) {
    coordinator_state_ = 0;
    txid_ = 0;
  } else if (multi_->role == SQUASHED_STUB) {
    DCHECK_EQ(coordinator_state_, 0u);
  }

  // Each hop needs to be prepared, reset role
  if (multi_->role == SQUASHER)
    multi_->role = DEFAULT;
}

void Transaction::MultiUpdateWithParent(const Transaction* parent) {
  // Disabled because of single shard lua optimization
  // DCHECK(multi_);
  // DCHECK(parent->multi_);  // it might not be a squasher yet, but certainly is multi
  DCHECK_EQ(multi_->role, SQUASHED_STUB);
  DCHECK(parent->time_now_ms_);

  txid_ = parent->txid_;
  time_now_ms_ = parent->time_now_ms_;
  unique_slot_checker_ = parent->unique_slot_checker_;
  namespace_ = parent->namespace_;
}

void Transaction::MultiBecomeSquasher() {
  DCHECK(multi_->mode == GLOBAL || multi_->mode == LOCK_AHEAD);
  DCHECK_GT(GetUniqueShardCnt(), 0u);                    // initialized and determined active shards
  DCHECK(cid_->IsMultiTransactional()) << cid_->name();  // proper base command set
  multi_->role = SQUASHER;
}

string Transaction::DebugId(std::optional<ShardId> sid) const {
  DCHECK_GT(use_count_.load(memory_order_relaxed), 0u);
  string res = StrCat(Name(), "@", txid_, "/", unique_shard_cnt_);
  if (multi_) {
    absl::StrAppend(&res, ":", multi_->cmd_seq_num);
  }
  absl::StrAppend(&res, " {id=", trans_id(this));
  absl::StrAppend(&res, " {cb_ptr=", bool(cb_ptr_));
  if (sid) {
    absl::StrAppend(&res, ",mask[", *sid, "]=", int(shard_data_[SidToId(*sid)].local_mask),
                    ",is_armed=", DEBUG_IsArmedInShard(*sid),
                    ",txqpos[]=", shard_data_[SidToId(*sid)].pq_pos);
  }
  absl::StrAppend(&res, "}");
  return res;
}

void Transaction::PrepareSingleSquash(Namespace* ns, ShardId sid, DbIndex db, CmdArgList keys,
                                      MultiMode mode) {
  if (mode == LOCK_AHEAD) {
    StartMultiLockedAhead(ns, db, keys, true);  // delay locking until first hop
  } else {
    DCHECK_EQ(mode, GLOBAL);
    StartMultiGlobal(ns, db);
  }
  EnableShard(sid);
  MultiBecomeSquasher();

  // As we never change commands, conclude immediately
  coordinator_state_ |= COORD_CONCLUDING;
}

// Runs in the dbslice thread. Returns true if the transaction concluded.
bool Transaction::RunInShard(EngineShard* shard, bool allow_q_removal) {
  DCHECK_GT(txid_, 0u);
  CHECK(cb_ptr_) << DebugId();

  unsigned idx = SidToId(shard->shard_id());
  auto& sd = shard_data_[idx];

  sd.stats.total_runs++;

  DCHECK_GT(run_barrier_.DEBUG_Count(), 0u);
  VLOG(2) << "RunInShard: " << DebugId() << " sid:" << shard->shard_id() << " " << sd.local_mask;

  // was_suspended is true meaning that this transaction was suspended and then
  // it was woken up by another transaction in either this thread or a key in another thread.
  // if awaked_prerun is true - it means it was woken up by a transaction in this thread,
  bool was_suspended = sd.local_mask & WAS_SUSPENDED;
  bool awaked_prerun = sd.local_mask & AWAKED_Q;
  DCHECK(was_suspended || !awaked_prerun);

  IntentLock::Mode mode = LockMode();

  DCHECK(IsGlobal() || (sd.local_mask & KEYLOCK_ACQUIRED) || (multi_ && multi_->mode == GLOBAL));

  /*************************************************************************/

  RunCallback(shard);

  /*************************************************************************/
  // at least the coordinator thread owns the reference.
  DCHECK_GE(GetUseCount(), 1u);

  bool is_concluding = coordinator_state_ & COORD_CONCLUDING;

  // If we're allowed, we remove ourselves upon first invocation from the queue,
  // and successive hops are run by continuation_trans_ in engine shard.
  // Otherwise we can remove ourselves only when we're concluding (so no more hops follow).
  if (sd.pq_pos != TxQueue::kEnd && (is_concluding || allow_q_removal)) {
    VLOG(2) << "Remove from txq " << this->DebugId();
    shard->txq()->Remove(sd.pq_pos);
    sd.pq_pos = TxQueue::kEnd;
  }

  // For multi we unlock transaction (i.e. its keys) in UnlockMulti() call.
  // If it's a final hop we should release the locks.
  if (is_concluding) {
    bool became_suspended = !was_suspended && (sd.local_mask & WAS_SUSPENDED);
    KeyLockArgs largs;

    if (IsGlobal()) {
      DCHECK(!awaked_prerun && !became_suspended);  // Global transactions can not be blocking.
      VLOG(2) << "Releasing shard lock";
      shard->shard_lock()->Release(LockMode());
    } else {  // not global.
      largs = GetLockArgs(idx);
      DCHECK(sd.local_mask & KEYLOCK_ACQUIRED);

      // If a transaction has been suspended, we keep the lock so that future transaction
      // touching those keys will be ordered via TxQueue. It's necessary because we preserve
      // the atomicity of awaked transactions by halting the TxQueue.
      if (!became_suspended) {
        GetDbSlice(shard->shard_id()).Release(mode, largs);
        sd.local_mask &= ~KEYLOCK_ACQUIRED;
      }
      sd.local_mask &= ~OUT_OF_ORDER;
    }

    // This is the last hop, so clear cont_trans if its held by the current tx
    // The position is important because we check below if `shard->GetContTx() == nullptr`
    // so we must clear it before we notify awaked transactions.
    shard->RemoveContTx(this);

    // It has 2 responsibilities.
    // 1: to go over potential wakened keys, verify them and activate watch queues.
    // 2: if this transaction was notified and finished running - to remove it from the head
    //    of the queue and notify the next one.

    if (auto* bcontroller = namespace_->GetBlockingController(shard->shard_id()); bcontroller) {
      if (awaked_prerun || was_suspended) {
        bcontroller->RemovedWatched(GetShardArgs(idx), this);
      }

      // Wake only if no tx queue head is currently running
      // Note: RemoveContTx might have no effect above if this tx had no continuations
      if (shard->GetContTx() == nullptr) {
        bcontroller->NotifyPending();
      }
    }
  }

  FinishHop();  // From this point on we can not access 'this'.
  return is_concluding;
}

void Transaction::RunCallback(EngineShard* shard) {
  DCHECK_EQ(shard, EngineShard::tlocal());

  RunnableResult result;
  try {
    result = (*cb_ptr_)(this, shard);

    if (unique_shard_cnt_ == 1) {
      cb_ptr_.reset();  // We can do it because only a single thread runs the callback.
      local_result_ = result;
    } else {
      if (result == OpStatus::OUT_OF_MEMORY) {
        absl::base_internal::SpinLockHolder lk{&local_result_mu_};
        CHECK(local_result_ == OpStatus::OK || local_result_ == OpStatus::OUT_OF_MEMORY);
        local_result_ = result;
      } else {
        CHECK_EQ(OpStatus::OK, result);
      }
    }
  } catch (std::bad_alloc&) {
    LOG_FIRST_N(ERROR, 16) << " out of memory";  // TODO: to log at most once per sec.
    absl::base_internal::SpinLockHolder lk{&local_result_mu_};
    local_result_ = OpStatus::OUT_OF_MEMORY;
  } catch (std::exception& e) {
    LOG(FATAL) << "Unexpected exception " << e.what();
  }

  auto& db_slice = GetDbSlice(shard->shard_id());
  db_slice.OnCbFinishBlocking();

  // Handle result flags to alter behaviour.
  if (result.flags & RunnableResult::AVOID_CONCLUDING) {
    // Multi shard callbacks should either all or none choose to conclude. They can't communicate,
    // so they must know their decision ahead, consequently there is no point in using this flag.
    CHECK_EQ(unique_shard_cnt_, 1u);
    DCHECK((coordinator_state_ & COORD_CONCLUDING) || multi_->concluding);
    coordinator_state_ &= ~COORD_CONCLUDING;
  }

  // Log to journal only once the command finished running
  if ((coordinator_state_ & COORD_CONCLUDING) || (multi_ && multi_->concluding)) {
    LogAutoJournalOnShard(shard, result);
    MaybeInvokeTrackingCb();
  }
}

// TODO: For multi-transactions we should be able to deduce mode() at run-time based
// on the context. For regular multi-transactions we can actually inspect all commands.
// For eval-like transactions - we can decide based on the command flavor (EVAL/EVALRO) or
// auto-tune based on the static analysis (by identifying commands with hardcoded command names).
void Transaction::ScheduleInternal() {
  DCHECK_EQ(txid_, 0u);
  DCHECK_EQ(coordinator_state_ & COORD_SCHED, 0);
  DCHECK_GT(unique_shard_cnt_, 0u);
  DCHECK(!IsAtomicMulti() || cid_->IsMultiTransactional());

  // Try running immediately (during scheduling) if we're concluding and either:
  // - have a single shard, and thus never have to cancel scheduling due to reordering
  // - run as an idempotent command, meaning we can safely repeat the operation if scheduling fails
  bool optimistic_exec = !IsGlobal() && (coordinator_state_ & COORD_CONCLUDING) &&
                         (unique_shard_cnt_ == 1 || (cid_->opt_mask() & CO::IDEMPOTENT));

  DVLOG(1) << "ScheduleInternal " << cid_->name() << " on " << unique_shard_cnt_ << " shards "
           << " optimistic_execution: " << optimistic_exec;

  auto is_active = [this](uint32_t i) { return IsActive(i); };

  // Loop until successfully scheduled in all shards.
  while (true) {
    stats_.schedule_attempts++;

    // This is a contention point for all threads - avoid using it unless necessary.
    // Single shard operations can assign txid later if the immediate run failed.
    if (unique_shard_cnt_ > 1)
      txid_ = op_seq.fetch_add(1, memory_order_relaxed);

    run_barrier_.Start(unique_shard_cnt_);

    if (CanRunInlined()) {
      // We increase the barrier above for this branch as well, in order to calm the DCHECKs
      // in the lower-level code. It's not really needed otherwise because we run inline.

      // single shard schedule operation can't fail
      CHECK(ScheduleInShard(EngineShard::tlocal(), optimistic_exec));
      run_barrier_.Dec();
      break;
    }

    ScheduleContext schedule_ctx{this, optimistic_exec};

    if (unique_shard_cnt_ == 1) {
      // Single shard optimization. Note: we could apply the same optimization
      // to multi-shard transactions as well by creating a vector of ScheduleContext.
      schedule_queues[unique_shard_id_].queue.Push(&schedule_ctx);
      bool current_val = false;
      if (schedule_queues[unique_shard_id_].armed.compare_exchange_strong(current_val, true,
                                                                          memory_order_acq_rel)) {
        shard_set->Add(unique_shard_id_, &Transaction::ScheduleBatchInShard);
      }
    } else {
      auto cb = [&schedule_ctx] {
        if (!schedule_ctx.trans->ScheduleInShard(EngineShard::tlocal(),
                                                 schedule_ctx.optimistic_execution)) {
          schedule_ctx.fail_cnt.fetch_add(1, memory_order_relaxed);
        }
        schedule_ctx.trans->FinishHop();
      };

      IterateActiveShards([cb](const auto& sd, ShardId i) { shard_set->Add(i, cb); });

      // Add this debugging function to print more information when we experience deadlock
      // during tests.
      ThisFiber::PrintLocalsCallback locals([&] {
        return absl::StrCat("unique_shard_cnt_: ", unique_shard_cnt_,
                            " run_barrier_cnt: ", run_barrier_.DEBUG_Count(), "\n");
      });
    }
    run_barrier_.Wait();

    if (schedule_ctx.fail_cnt.load(memory_order_relaxed) == 0) {
      break;
    }

    VLOG(2) << "Cancelling " << DebugId();
    ServerState::tlocal()->stats.tx_schedule_cancel_cnt += 1;

    atomic_bool should_poll_execution{false};
    auto cancel = [&](EngineShard* shard) {
      bool res = CancelShardCb(shard);
      if (res) {
        should_poll_execution.store(true, memory_order_relaxed);
      }
    };
    shard_set->RunBriefInParallel(std::move(cancel), is_active);

    // We must follow up with PollExecution because in rare cases with multi-trans
    // that follows this one, we may find the next transaction in the queue that is never
    // trigerred. Which leads to deadlock. I could solve this by adding PollExecution to
    // CancelShardCb above but then we would need to use the shard_set queue since PollExecution
    // is blocking. I wanted to avoid the additional latency for the general case of running
    // CancelShardCb because of the very rate case below. Therefore, I decided to just fetch the
    // indication that we need to follow up with PollExecution and then send it to shard_set queue.
    // We do not need to wait for this callback to finish - just make sure it will eventually run.
    // See https://github.com/dragonflydb/dragonfly/issues/150 for more info.
    if (should_poll_execution.load(memory_order_relaxed)) {
      IterateActiveShards([](const auto& sd, auto i) {
        shard_set->Add(i, [] { EngineShard::tlocal()->PollExecution("cancel_cleanup", nullptr); });
      });
    }
    InitTxTime();  // update time for next scheduling attempt
  }

  coordinator_state_ |= COORD_SCHED;
  RecordTxScheduleStats(this);
}

void Transaction::UnlockMulti(bool block) {
  DCHECK(multi_);
  DCHECK_GE(GetUseCount(), 1u);  // Greater-equal because there may be callbacks in progress.

  // Return if we either didn't schedule at all (and thus run) or already did conclude
  if ((coordinator_state_ & COORD_SCHED) == 0 || (coordinator_state_ & COORD_CONCLUDING) > 0)
    return;
  coordinator_state_ |= COORD_CONCLUDING;

  // Distribute keys by shards
  DCHECK_EQ(shard_data_.size(), shard_set->size());  // Atomic doesn't use single shard optimization
  vector<vector<LockFp>> sharded_keys(shard_set->size());
  for (const auto& [sid, fp] : multi_->tag_fps)
    sharded_keys[sid].emplace_back(fp);

  // Whether transaction was active on the shard and needs to unlock
  auto is_active = [&](ShardId sid) {
    return !sharded_keys[sid].empty() || multi_->mode == GLOBAL;
  };

  // Count number of active shards ahead and set run/use counts
  size_t occupied_shards = 0;
  for (size_t sid = 0; sid < shard_set->size(); sid++) {
    if (!is_active(sid))
      continue;
    occupied_shards++;
  }
  run_barrier_.Start(occupied_shards);
  use_count_.fetch_add(occupied_shards, std::memory_order_relaxed);

  // Dispatch callbacks to unlock on shards
  for (ShardId sid = 0; sid < shard_data_.size(); sid++) {
    if (!is_active(sid))
      continue;

    shard_set->Add(sid, [this, fps = std::move(sharded_keys[sid])] {
      this->UnlockMultiShardCb(fps, EngineShard::tlocal());
      run_barrier_.Dec();
      intrusive_ptr_release(this);
    });
  }

  if (block) {
    run_barrier_.Wait();
    Refurbish();
  }
}

OpStatus Transaction::ScheduleSingleHop(RunnableType cb) {
  Execute(cb, true);
  return local_result_;
}

void Transaction::SingleHopAsync(RunnableType cb) {
  CHECK(!multi_);
  CHECK_EQ(coordinator_state_, 0u);

  coordinator_state_ |= COORD_CONCLUDING;
  cb_ptr_ = cb;

  if (unique_shard_cnt_ == 1) {
    CHECK_EQ(shard_data_.size(), 1u);

    // Arm immediately
    shard_data_.front().is_armed.store(true, memory_order_relaxed);

    // Keep alive till end and set barrier
    run_barrier_.Add(1);
    use_count_.fetch_add(1, memory_order_relaxed);

    auto shard_cb = [this] {
      bool success = ScheduleInShard(EngineShard::tlocal(), true);
      CHECK(success);  // single shard scheduling can't fail

      if (shard_data_.front().local_mask & OPTIMISTIC_EXECUTION) {  // executed during schedule
        run_barrier_.Dec();
        intrusive_ptr_release(this);
      } else {
        // do we really need to submit a shard callback?
        // an armed transaction will be driven by the next previous txq entry

        // possible deadlock beacuse of api
        // but really we just need to re-schedule the callback
        // shard_set->Add(unique_shard_id_, [this] {
        //  EngineShard::tlocal()->PollExecution("exec_cb", this);
        //  intrusive_ptr_release(this);
        //});
        EngineShard::tlocal()->PollExecution("exec_cb", this);
        intrusive_ptr_release(this);
      }
    };

    // Dispatch to shard
    if (CanRunInlined())
      shard_cb();
    else
      shard_set->Add(unique_shard_id_, shard_cb);
  } else {
    ScheduleInternal();
    DispatchHop();  // won't wait on run_barrier_
  }
}

// Runs in coordinator thread.
void Transaction::Execute(RunnableType cb, bool conclude) {
  if (multi_ && multi_->role == SQUASHED_STUB) {
    local_result_ = RunSquashedMultiCb(cb);
    return;
  }

  local_result_ = OpStatus::OK;
  cb_ptr_ = cb;

  if (IsAtomicMulti()) {
    multi_->concluding = conclude;
  } else {
    coordinator_state_ = conclude ? (coordinator_state_ | COORD_CONCLUDING)
                                  : (coordinator_state_ & ~COORD_CONCLUDING);
  }

  if ((coordinator_state_ & COORD_SCHED) == 0) {
    ScheduleInternal();
  }

  DispatchHop();
  run_barrier_.Wait();
  cb_ptr_.reset();

  if (coordinator_state_ & COORD_CONCLUDING)
    coordinator_state_ &= ~COORD_SCHED;
}

// Runs in coordinator thread.
void Transaction::DispatchHop() {
  DVLOG(1) << "DispatchHop " << DebugId();
  DCHECK_GT(unique_shard_cnt_, 0u);
  DCHECK_GT(use_count_.load(memory_order_relaxed), 0u);
  DCHECK(!IsAtomicMulti() || multi_->lock_mode.has_value());
  DCHECK_LE(shard_data_.size(), 1024u);

  // Hops can start executing immediately after being armed, so we
  // initialize the run barrier before arming, as well as copy indices
  // of active shards to avoid reading concurrently accessed shard data.
  std::bitset<1024> poll_flags(0);
  unsigned run_cnt = 0;
  IterateActiveShards([&poll_flags, &run_cnt](auto& sd, auto i) {
    if ((sd.local_mask & OPTIMISTIC_EXECUTION) == 0) {
      run_cnt++;
      poll_flags.set(i, true);
    }
    sd.local_mask &= ~OPTIMISTIC_EXECUTION;  // we'll run it next time if it avoided concluding
  });

  DCHECK_EQ(run_cnt, poll_flags.count());
  if (run_cnt == 0)  // all callbacks were run immediately
    return;

  run_barrier_.Start(run_cnt);

  // Set armed flags on all active shards.
  std::atomic_thread_fence(memory_order_release);  // once fence to avoid flushing writes in loop
  IterateActiveShards([&poll_flags](auto& sd, auto i) {
    if (poll_flags.test(i))
      sd.is_armed.store(true, memory_order_relaxed);
  });

  if (CanRunInlined()) {
    DCHECK_EQ(run_cnt, 1u);
    DVLOG(1) << "Short-circuit ExecuteAsync " << DebugId();
    EngineShard::tlocal()->PollExecution("exec_cb", this);
    return;
  }

  use_count_.fetch_add(run_cnt, memory_order_relaxed);  // for each pointer from poll_cb

  auto poll_cb = [this] {
    CHECK(namespace_ != nullptr);
    EngineShard::tlocal()->PollExecution("exec_cb", this);
    DVLOG(3) << "ptr_release " << DebugId();
    intrusive_ptr_release(this);  // against use_count_.fetch_add above.
  };
  IterateShards([&poll_cb, &poll_flags](PerShardData& sd, auto i) {
    if (poll_flags.test(i))
      shard_set->Add(i, poll_cb);
  });
}

void Transaction::FinishHop() {
  boost::intrusive_ptr<Transaction> guard(this);  // Keep alive until Dec() fully finishes
  run_barrier_.Dec();
}

void Transaction::Conclude() {
  if (!IsScheduled())
    return;
  auto cb = [](Transaction* t, EngineShard* shard) { return OpStatus::OK; };
  Execute(std::move(cb), true);
}

void Transaction::Refurbish() {
  txid_ = 0;
  coordinator_state_ = 0;
  cb_ptr_.reset();
}

const absl::flat_hash_set<std::pair<ShardId, LockFp>>& Transaction::GetMultiFps() const {
  DCHECK(multi_);
  return multi_->tag_fps;
}

#if 0
string Transaction::DEBUG_PrintFailState(ShardId sid) const {
  auto res = StrCat(
      "usc: ", unique_shard_cnt_, ", name:", GetCId()->name(),
      ", usecnt:", use_count_.load(memory_order_relaxed), ", runcnt: ", run_barrier_.DEBUG_Count(),
      ", coordstate: ", coordinator_state_, ", coord native thread: ", stats_.coordinator_index,
      ", schedule attempts: ", stats_.schedule_attempts, ", report from sid: ", sid, "\n");
  std::atomic_thread_fence(memory_order_acquire);
  for (unsigned i = 0; i < shard_data_.size(); ++i) {
    const auto& sd = shard_data_[i];
    absl::StrAppend(&res, "- shard: ", i, " local_mask:", sd.local_mask,
                    " total_runs: ", sd.stats.total_runs, "\n");
  }
  return res;
}
#endif

void Transaction::EnableShard(ShardId sid) {
  unique_shard_cnt_ = 1;
  unique_shard_id_ = sid;
  shard_data_.resize(IsActiveMulti() ? shard_set->size() : 1);
  shard_data_.front().local_mask |= ACTIVE;
}

void Transaction::EnableAllShards() {
  unique_shard_cnt_ = shard_set->size();
  unique_shard_id_ = unique_shard_cnt_ == 1 ? 0 : kInvalidSid;
  shard_data_.resize(shard_set->size());
  for (auto& sd : shard_data_)
    sd.local_mask |= ACTIVE;
}

// runs in coordinator thread.
// Marks the transaction as expired and removes it from the waiting queue.
void Transaction::ExpireBlocking(WaitKeys wkeys) {
  DCHECK(!IsGlobal());
  DVLOG(1) << "ExpireBlocking " << DebugId();
  run_barrier_.Start(unique_shard_cnt_);

  auto expire_cb = [this, &wkeys] {
    EngineShard* es = EngineShard::tlocal();
    if (wkeys) {
      IndexSlice is(0, 1);
      ShardArgs sa(absl::MakeSpan(&wkeys.value(), 1), absl::MakeSpan(&is, 1));
      ExpireShardCb(sa, es);
    } else {
      ExpireShardCb(GetShardArgs(es->shard_id()), es);
    }
  };
  IterateActiveShards([&expire_cb](PerShardData& sd, auto i) { shard_set->Add(i, expire_cb); });

  run_barrier_.Wait();
  DVLOG(1) << "ExpireBlocking finished " << DebugId();
}

string_view Transaction::Name() const {
  return cid_ ? cid_->name() : "null-command";
}

ShardId Transaction::GetUniqueShard() const {
  DCHECK_EQ(GetUniqueShardCnt(), 1U);
  return unique_shard_id_;
}

optional<SlotId> Transaction::GetUniqueSlotId() const {
  return unique_slot_checker_.GetUniqueSlotId();
}

KeyLockArgs Transaction::GetLockArgs(ShardId sid) const {
  KeyLockArgs res;
  res.db_index = db_index_;

  if (unique_shard_cnt_ == 1) {
    res.fps = {kv_fp_.data(), kv_fp_.size()};
  } else {
    const auto& sd = shard_data_[sid];
    DCHECK_LE(sd.fp_start + sd.fp_count, kv_fp_.size());
    res.fps = {kv_fp_.data() + sd.fp_start, sd.fp_count};
  }
  return res;
}

uint16_t Transaction::DisarmInShard(ShardId sid) {
  auto& sd = shard_data_[SidToId(sid)];
  // NOTE: Maybe compare_exchange is worth it to avoid redundant writes
  return sd.is_armed.exchange(false, memory_order_acquire) ? sd.local_mask : 0;
}

pair<uint16_t, bool> Transaction::DisarmInShardWhen(ShardId sid, uint16_t relevant_flags) {
  auto& sd = shard_data_[SidToId(sid)];
  if (sd.is_armed.load(memory_order_acquire)) {
    bool relevant = sd.local_mask & relevant_flags;
    if (relevant)
      CHECK(sd.is_armed.exchange(false, memory_order_release));
    return {sd.local_mask, relevant};
  }
  return {0, false};
}

bool Transaction::IsActive(ShardId sid) const {
  // If we have only one shard, we often don't store infromation about all shards, so determine it
  // solely by id
  if (unique_shard_cnt_ == 1) {
    // However the active flag is still supposed to be set for our unique shard
    DCHECK((shard_data_[SidToId(unique_shard_id_)].local_mask & ACTIVE));
    return sid == unique_shard_id_;
  }

  return shard_data_[SidToId(sid)].local_mask & ACTIVE;
}

IntentLock::Mode Transaction::LockMode() const {
  return cid_->IsReadOnly() ? IntentLock::SHARED : IntentLock::EXCLUSIVE;
}

OpArgs Transaction::GetOpArgs(EngineShard* shard) const {
  DCHECK(IsActive(shard->shard_id()));
  DCHECK((multi_ && multi_->role == SQUASHED_STUB) || (run_barrier_.DEBUG_Count() > 0));
  return OpArgs{shard, this, GetDbContext()};
}

// This function should not block since it's run via RunBriefInParallel.
bool Transaction::ScheduleInShard(EngineShard* shard, bool execute_optimistic) {
  ShardId sid = SidToId(shard->shard_id());
  auto& sd = shard_data_[sid];

  DCHECK(sd.local_mask & ACTIVE);
  DCHECK_EQ(sd.local_mask & KEYLOCK_ACQUIRED, 0);
  sd.local_mask &= ~(OUT_OF_ORDER | OPTIMISTIC_EXECUTION);

  TxQueue* txq = shard->txq();
  KeyLockArgs lock_args;
  IntentLock::Mode mode = LockMode();
  bool lock_granted = false;

  // If a more recent transaction already commited, we abort
  if (txid_ > 0 && shard->committed_txid() >= txid_)
    return false;

  auto release_fp_locks = [&]() {
    GetDbSlice(shard->shard_id()).Release(mode, lock_args);
    sd.local_mask &= ~KEYLOCK_ACQUIRED;
  };

  // Acquire intent locks. Intent locks are always acquired, even if already locked by others.
  if (!IsGlobal()) {
    lock_args = GetLockArgs(shard->shard_id());
    const bool shard_unlocked = shard->shard_lock()->Check(mode);

    // We need to acquire the fp locks because the executing callback
    // within RunCallback below might preempt.
    const bool keys_unlocked = GetDbSlice(shard->shard_id()).Acquire(mode, lock_args);
    lock_granted = shard_unlocked && keys_unlocked;

    sd.local_mask |= KEYLOCK_ACQUIRED;
    if (lock_granted) {
      sd.local_mask |= OUT_OF_ORDER;
    }

    DVLOG(3) << "Lock granted " << lock_granted << " for trans " << DebugId();

    // Check if we can run immediately
    if (lock_granted && execute_optimistic) {
      sd.local_mask |= OPTIMISTIC_EXECUTION;
      shard->stats().tx_optimistic_total++;

      RunCallback(shard);

      // Check state again, it could've been updated if the callback returned AVOID_CONCLUDING flag.
      // Only possible for single shard.
      if (coordinator_state_ & COORD_CONCLUDING) {
        release_fp_locks();
        return true;
      }
    }
  }

  // Single shard operations might have delayed acquiring txid unless neccessary.
  if (txid_ == 0) {
    DCHECK_EQ(unique_shard_cnt_, 1u);
    txid_ = op_seq.fetch_add(1, memory_order_relaxed);
    DCHECK_GT(txid_, shard->committed_txid());
  }

  // If the new transaction requires reordering of the pending queue (i.e. it comes before tail)
  // and some other transaction already locked its keys we can not reorder 'trans' because
  // the transaction could have deduced that it can run OOO and eagerly execute. Hence, we
  // fail this scheduling attempt for trans.
  if (!txq->Empty() && txid_ < txq->TailScore() && !lock_granted) {
    if (sd.local_mask & KEYLOCK_ACQUIRED) {
      release_fp_locks();
    }
    return false;
  }

  if (IsGlobal()) {
    shard->shard_lock()->Acquire(mode);
    VLOG(1) << "Global shard lock acquired";
  }

  TxQueue::Iterator it = txq->Insert(this);
  DCHECK_EQ(TxQueue::kEnd, sd.pq_pos);
  sd.pq_pos = it;

  AnalyzeTxQueue(shard, txq);
  DVLOG(1) << "Insert into tx-queue, sid(" << sid << ") " << DebugId() << ", qlen " << txq->size();

  return true;
}

void Transaction::ScheduleBatchInShard() {
  EngineShard* shard = EngineShard::tlocal();
  auto& stats = shard->stats();
  stats.tx_batch_schedule_calls_total++;

  ShardId sid = shard->shard_id();
  auto& sq = schedule_queues[sid];

  for (unsigned j = 0;; ++j) {
    // We pull the items from the queue in a loop until we reach the stop condition.
    // TODO: we may have fairness problem here, where transactions being added up all the time
    // and we never break from the loop. It is possible to break early but it's not trivial
    // because we must ensure that there is another ScheduleBatchInShard callback in the queue.
    // Can be checked with testing sq.armed is true when j == 1.
    while (true) {
      ScheduleContext* item = sq.queue.Pop();
      if (!item)
        break;

      if (!item->trans->ScheduleInShard(shard, item->optimistic_execution)) {
        item->fail_cnt.fetch_add(1, memory_order_relaxed);
      }
      item->trans->FinishHop();
      stats.tx_batch_scheduled_items_total++;
    };

    // j==1 means we already signalled that we're done with the current batch.
    if (j == 1)
      break;

    // We signal that we're done with the current batch but then we check if there are more
    // transactions to fetch in the next iteration.
    // We do this to avoid the situation where we have a data race, where
    // a transaction is added to the queue, we've checked that sq.armed is true and skipped
    // adding the callback that fetches the transaction.
    sq.armed.exchange(false, memory_order_acq_rel);
  }
}

bool Transaction::CancelShardCb(EngineShard* shard) {
  ShardId idx = SidToId(shard->shard_id());
  auto& sd = shard_data_[idx];

  TxQueue::Iterator q_pos = exchange(sd.pq_pos, TxQueue::kEnd);
  if (q_pos == TxQueue::kEnd) {
    DCHECK_EQ(sd.local_mask & KEYLOCK_ACQUIRED, 0);
    return false;
  }

  TxQueue* txq = shard->txq();
  bool was_head = txq->Head() == q_pos;

  Transaction* trans = absl::get<Transaction*>(txq->At(q_pos));
  DCHECK(trans == this) << txq->size() << ' ' << sd.pq_pos << ' ' << trans->DebugId();
  txq->Remove(q_pos);

  if (IsGlobal()) {
    shard->shard_lock()->Release(LockMode());
  } else {
    if ((cid_->opt_mask() & CO::NO_KEY_TRANSACTIONAL) == 0) {
      auto lock_args = GetLockArgs(shard->shard_id());
      DCHECK(sd.local_mask & KEYLOCK_ACQUIRED);
      DCHECK(!lock_args.fps.empty());
      GetDbSlice(shard->shard_id()).Release(LockMode(), lock_args);
    }

    sd.local_mask &= ~KEYLOCK_ACQUIRED;
  }

  // Check if we need to poll the next head
  return was_head && !txq->Empty();
}

// runs in engine-shard thread.
ShardArgs Transaction::GetShardArgs(ShardId sid) const {
  DCHECK(!multi_ || multi_->role != SQUASHER);

  // We can read unique_shard_cnt_  only because ShardArgsInShard is called after IsArmedInShard
  // barrier.
  if (unique_shard_cnt_ == 1) {
    return ShardArgs{full_args_, absl::MakeSpan(args_slices_)};
  }

  const auto& sd = shard_data_[sid];
  return ShardArgs{full_args_,
                   absl::MakeSpan(args_slices_.data() + sd.slice_start, sd.slice_count)};
}

OpStatus Transaction::WaitOnWatch(const time_point& tp, WaitKeys wkeys, KeyReadyChecker krc,
                                  bool* block_flag, bool* pause_flag) {
  if (blocking_barrier_.IsClaimed()) {  // Might have been cancelled ahead by a dropping connection
    Conclude();
    return OpStatus::CANCELLED;
  }

  DCHECK(!IsAtomicMulti());  // blocking inside MULTI is not allowed

  // Register keys on active shards blocking controllers and mark shard state as suspended.
  auto cb = [&](Transaction* t, EngineShard* shard) {
    if (wkeys) {  // single string_view.
      IndexSlice is(0, 1);
      ShardArgs sa(absl::MakeSpan(&wkeys.value(), 1), absl::MakeSpan(&is, 1));
      t->WatchInShard(&t->GetNamespace(), sa, shard, krc);
    } else {
      t->WatchInShard(&t->GetNamespace(), t->GetShardArgs(shard->shard_id()), shard, krc);
    }
    return OpStatus::OK;
  };
  Execute(std::move(cb), true);

  // Don't reset the scheduled flag because we didn't release the locks
  coordinator_state_ |= COORD_SCHED;

  auto* stats = ServerState::tl_connection_stats();
  ++stats->num_blocked_clients;
  DVLOG(1) << "WaitOnWatch wait for " << tp << " " << DebugId();

  // Wait for the blocking barrier to be closed.
  // Note: It might return immediately if another thread already notified us.
  *block_flag = true;
  cv_status status = blocking_barrier_.Wait(tp);
  *block_flag = false;

  DVLOG(1) << "WaitOnWatch done " << int(status) << " " << DebugId();
  --stats->num_blocked_clients;

  *pause_flag = true;
  ServerState::tlocal()->AwaitPauseState(true);  // blocking are always write commands
  *pause_flag = false;

  OpStatus result = OpStatus::OK;
  if (status == cv_status::timeout) {
    result = OpStatus::TIMED_OUT;
  } else if (coordinator_state_ & COORD_CANCELLED) {
    DCHECK_GT(block_cancel_result_, OpStatus::OK);
    result = block_cancel_result_;
  }

  // If we don't follow up with an "action" hop, we must clean up manually on all shards.
  if (result != OpStatus::OK)
    ExpireBlocking(wkeys);

  return result;
}

void Transaction::WatchInShard(Namespace* ns, ShardArgs keys, EngineShard* shard,
                               KeyReadyChecker krc) {
  auto& sd = shard_data_[SidToId(shard->shard_id())];

  CHECK_EQ(0, sd.local_mask & WAS_SUSPENDED);
  sd.local_mask |= WAS_SUSPENDED;
  sd.local_mask &= ~OUT_OF_ORDER;

  ns->GetOrAddBlockingController(shard)->AddWatched(keys, std::move(krc), this);
  DVLOG(2) << "WatchInShard " << DebugId();
}

void Transaction::ExpireShardCb(ShardArgs keys, EngineShard* shard) {
  // Blocking transactions don't release keys when suspending, release them now.
  auto lock_args = GetLockArgs(shard->shard_id());
  GetDbSlice(shard->shard_id()).Release(LockMode(), lock_args);

  auto& sd = shard_data_[SidToId(shard->shard_id())];
  sd.local_mask &= ~KEYLOCK_ACQUIRED;

  namespace_->GetBlockingController(shard->shard_id())->RemovedWatched(keys, this);
  DCHECK(!namespace_->GetBlockingController(shard->shard_id())
              ->awakened_transactions()
              .contains(this));

  // Unblock the caller with FinishHop.
  FinishHop();

  // And then poll execution to continue processing the queued transactions.
  shard->PollExecution("unwatchcb", nullptr);
}

DbSlice& Transaction::GetDbSlice(ShardId shard_id) const {
  CHECK(namespace_ != nullptr);
  return namespace_->GetDbSlice(shard_id);
}

OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) {
  DCHECK(multi_ && multi_->role == SQUASHED_STUB);
  DCHECK_EQ(unique_shard_cnt_, 1u);

  auto* shard = EngineShard::tlocal();
  auto& db_slice = GetDbSlice(shard->shard_id());

  auto result = cb(this, shard);
  db_slice.OnCbFinishBlocking();

  LogAutoJournalOnShard(shard, result);
  MaybeInvokeTrackingCb();

  DCHECK_EQ(result.flags, 0);  // if it's sophisticated, we shouldn't squash it
  return result;
}

void Transaction::UnlockMultiShardCb(absl::Span<const LockFp> fps, EngineShard* shard) {
  DCHECK(multi_ && multi_->lock_mode);

  if (multi_->mode == GLOBAL) {
    shard->shard_lock()->Release(IntentLock::EXCLUSIVE);
  } else {
    GetDbSlice(shard->shard_id()).Release(*multi_->lock_mode, KeyLockArgs{db_index_, fps});
  }

  ShardId sid = shard->shard_id();
  auto& sd = shard_data_[SidToId(sid)];

  // It does not have to be that all shards in multi transaction execute this tx.
  // Hence it could stay in the tx queue. We perform the necessary cleanup and remove it from
  // there. The transaction is not guaranteed to be at front.
  if (sd.pq_pos != TxQueue::kEnd) {
    DVLOG(1) << "unlockmulti: TxRemove " << DebugId();

    TxQueue* txq = shard->txq();
    DCHECK(!txq->Empty());
    DCHECK_EQ(absl::get<Transaction*>(txq->At(sd.pq_pos)), this);

    txq->Remove(sd.pq_pos);
    sd.pq_pos = TxQueue::kEnd;
  }

  shard->FinalizeMulti(this);
}

bool Transaction::IsGlobal() const {
  // Please note that a transaction can be non-global even if multi_->mode == GLOBAL.
  // It happens when a transaction is squashed and switches to execute differrent commands.
  return global_;
}

// Runs only in the shard thread.
// Returns true if the transacton has changed its state from suspended to awakened,
// false, otherwise.
bool Transaction::NotifySuspended(ShardId sid, string_view key) {
  // Wake a transaction only once on the first notify.
  // We don't care about preserving the strict order with multiple operations running on blocking
  // keys in parallel, because the internal order is not observable from outside either way.
  if (!blocking_barrier_.TryClaim())
    return false;

  auto& sd = shard_data_[SidToId(sid)];

  DVLOG(1) << "NotifySuspended " << DebugId() << ", local_mask:" << sd.local_mask;

  // We're the first and only to wake this transaction, expect the shard to be suspended.
  CHECK(sd.local_mask & WAS_SUSPENDED);

  // We wake at most once.
  CHECK_EQ(sd.local_mask & AWAKED_Q, 0);

  // Find index of awakened key
  ShardArgs args = GetShardArgs(sid);
  auto it = find_if(args.cbegin(), args.cend(), [key](string_view arg) { return arg == key; });
  CHECK(it != args.cend());

  // Change state to awaked and store index of awakened key
  sd.local_mask |= AWAKED_Q;
  sd.wake_key_pos = it.index();

  blocking_barrier_.Close();
  return true;
}

optional<string_view> Transaction::GetWakeKey(ShardId sid) const {
  auto& sd = shard_data_[SidToId(sid)];
  if ((sd.local_mask & AWAKED_Q) == 0)
    return nullopt;

  CHECK_LT(sd.wake_key_pos, full_args_.size());
  return ArgS(full_args_, sd.wake_key_pos);
}

void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult result) {
  // TODO: For now, we ignore non shard coordination.
  if (shard == nullptr)
    return;

  // Ignore technical squasher hops.
  if (multi_ && multi_->role == SQUASHER)
    return;

  // Only write commands and/or no-key-transactional commands are logged
  if (!cid_->IsJournaled() && (cid_->opt_mask() & CO::NO_KEY_TRANSACTIONAL) == 0)
    return;

  if (!shard->journal())
    return;

  if (result.status != OpStatus::OK) {
    return;  // Do not log to journal if command execution failed.
  }

  // If autojournaling was disabled and not re-enabled the callback is writing to journal.
  if ((cid_->opt_mask() & CO::NO_AUTOJOURNAL) && !re_enabled_auto_journal_) {
    return;
  }

  journal::Entry::Payload entry_payload;
  string_view cmd{cid_->name()};
  if (unique_shard_cnt_ == 1 || args_slices_.empty()) {
    entry_payload = journal::Entry::Payload(cmd, full_args_);
  } else {
    ShardArgs shard_args = GetShardArgs(shard->shard_id());
    entry_payload = journal::Entry::Payload(cmd, shard_args);
  }
  // Record to journal autojournal commands, here we allow await which anables writing to sync
  // the journal change.
  LogJournalOnShard(std::move(entry_payload));
}

void Transaction::LogJournalOnShard(journal::Entry::Payload&& payload) const {
  journal::RecordEntry(txid_, journal::Op::COMMAND, db_index_,
                       unique_slot_checker_.GetUniqueSlotId(), std::move(payload));
}

void Transaction::ReviveAutoJournal() {
  DCHECK(cid_->opt_mask() & CO::NO_AUTOJOURNAL);
  DCHECK_EQ(run_barrier_.DEBUG_Count(), 0u);  // Can't be changed while dispatching
  re_enabled_auto_journal_ = true;
}

void Transaction::CancelBlocking(const std::function<OpStatus(ArgSlice)>& status_cb) {
  // We're on the owning thread of this transaction, so we can safely access it's data below.
  // First, check if it makes sense to proceed.
  if (blocking_barrier_.IsClaimed() || cid_ == nullptr || (cid_->opt_mask() & CO::BLOCKING) == 0)
    return;

  OpStatus status = OpStatus::CANCELLED;
  if (status_cb) {
    vector<string_view> all_keys;
    IterateActiveShards([this, &all_keys](PerShardData&, auto i) {
      auto shard_keys = GetShardArgs(i);
      all_keys.insert(all_keys.end(), shard_keys.begin(), shard_keys.end());
    });
    status = status_cb(absl::MakeSpan(all_keys));
  }

  if (status == OpStatus::OK)
    return;

  // Check if someone else is about to wake us up
  if (!blocking_barrier_.TryClaim())
    return;

  coordinator_state_ |= COORD_CANCELLED;
  // don't use local_result_ because it can be overwirtten if we cancel ahead
  block_cancel_result_ = status;
  blocking_barrier_.Close();
}

bool Transaction::CanRunInlined() const {
  auto* ss = ServerState::tlocal();
  auto* es = EngineShard::tlocal();
  if (unique_shard_cnt_ == 1 && unique_shard_id_ == ss->thread_index() &&
      ss->AllowInlineScheduling() && !GetDbSlice(es->shard_id()).HasRegisteredCallbacks()) {
    ss->stats.tx_inline_runs++;
    return true;
  }
  return false;
}

OpResult<KeyIndex> DetermineKeys(const CommandId* cid, CmdArgList args) {
  if (cid->opt_mask() & (CO::GLOBAL_TRANS | CO::NO_KEY_TRANSACTIONAL))
    return KeyIndex{};

  int num_custom_keys = -1;

  unsigned start = 0, end = 0, step = 0;
  std::optional<unsigned> bonus = std::nullopt;

  if (cid->opt_mask() & CO::VARIADIC_KEYS) {  // number of keys is not trivially deducable
    // ZUNION/INTER <num_keys> <key1> [<key2> ...]
    // EVAL <script> <num_keys>
    // XREAD ... STREAMS ...
    if (args.size() < 2)
      return OpStatus::SYNTAX_ERR;

    string_view name{cid->name()};

    // Determine based on STREAMS argument position
    if (name == "XREAD" || name == "XREADGROUP") {
      for (size_t i = 0; i < args.size(); ++i) {
        string_view arg = ArgS(args, i);
        if (absl::EqualsIgnoreCase(arg, "STREAMS")) {
          size_t left = args.size() - i - 1;
          return KeyIndex(i + 1, i + 1 + (left / 2));
        }
      }
      return OpStatus::SYNTAX_ERR;
    }

    if (absl::EndsWith(name, "STORE") || name == "CMS.MERGE")
      bonus = 0;  // Z<xxx>STORE and CMS.MERGE <dest> commands

    unsigned num_keys_index;
    if (absl::StartsWith(name, "EVAL") || name == "BLMPOP" || name == "BZMPOP")
      num_keys_index = 1;
    else
      num_keys_index = bonus ? *bonus + 1 : 0;

    string_view num = ArgS(args, num_keys_index);
    if (!absl::SimpleAtoi(num, &num_custom_keys) || num_custom_keys < 0)
      return OpStatus::INVALID_INT;

    if (num_custom_keys == 0 &&
        (absl::StartsWith(name, "ZDIFF") || absl::StartsWith(name, "ZUNION") ||
         absl::StartsWith(name, "ZINTER") || absl::EndsWith(name, "MPOP"))) {
      return OpStatus::AT_LEAST_ONE_KEY;
    }

    if (args.size() < size_t(num_custom_keys) + num_keys_index + 1)
      return OpStatus::SYNTAX_ERR;
  }

  if (cid->first_key_pos() > 0) {
    start = cid->first_key_pos() - 1;
    int8_t last = cid->last_key_pos();

    if (num_custom_keys >= 0) {
      end = start + num_custom_keys;
    } else {
      end = last > 0 ? last : (int(args.size()) + last + 1);
    }
    if (cid->interleaved_step()) {
      step = cid->interleaved_step();
    } else {
      step = 1;
    }

    if (cid->opt_mask() & CO::STORE_LAST_KEY) {
      string_view name{cid->name()};

      if ((name == "GEORADIUSBYMEMBER" && args.size() >= 5) ||
          (name == "GEORADIUS" && args.size() >= 6)) {
        // key member radius .. STORE destkey
        string_view opt = ArgS(args, args.size() - 2);
        if (absl::EqualsIgnoreCase(opt, "STORE") || absl::EqualsIgnoreCase(opt, "STOREDIST")) {
          bonus = args.size() - 1;
        }
      }

      if (name == "SORT") {
        if (args.size() >= 3) {
          // SORT key ... STORE destkey
          string_view opt = ArgS(args, args.size() - 2);
          if (absl::EqualsIgnoreCase(opt, "STORE")) {
            bonus = args.size() - 1;
          }
        }
      }
    }

    return KeyIndex{start, end, step, bonus};
  }

  LOG(FATAL) << "TBD: Not supported " << cid->name();
  return {};
}

std::vector<Transaction::PerShardCache>& Transaction::TLTmpSpace::GetShardIndex(unsigned size) {
  shard_cache.resize(size);
  for (auto& v : shard_cache)
    v.Clear();
  return shard_cache;
}

}  // namespace dfly


================================================
FILE: src/server/transaction.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/base/internal/spinlock.h>
#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/container/inlined_vector.h>
#include <absl/functional/function_ref.h>

#include <atomic>
// #include <boost/smart_ptr/intrusive_ptr.hpp>
#include <string_view>
#include <variant>
#include <vector>

#include "core/intent_lock.h"
#include "core/tx_queue.h"
#include "facade/op_status.h"
#include "server/cluster_support.h"
#include "server/common.h"
#include "server/journal/types.h"
#include "server/tx_base.h"
#include "util/fibers/synchronization.h"

namespace dfly {

class BlockingController;

using facade::OpResult;
using facade::OpStatus;

// Central building block of the transactional framework.
//
// Use it to run callbacks on the shard threads - such dispatches are called hops.
//
// Callbacks are not allowed to keep any possibly dangling pointers to data within the shards - it
// must be copied explicitly. The callbacks running on different threads should also never pass any
// messages or wait for each other, as it would block the execution of other transactions.
//
// The shards to run on are determined by the keys of the underlying command.
// Global transactions run on all shards.
//
// 1. Multi transactions
//
// Multi transactions are handled by a single transaction, which exposes the same interface for
// commands as regular transactions, but internally avoids rescheduling. There are multiple modes in
// which a mutli-transaction can run, those are documented in the MultiMode enum.
//
// The flow of EXEC and EVAL is as follows:
//
// ```
// trans->StartMulti_MultiMode_()
// for ([cmd, args]) {
//   trans->MultiSwitchCmd(cmd)  // 1. Set new command
//   trans->InitByArgs(args)     // 2. Re-initialize with arguments
//   cmd->Invoke(trans)          // 3. Run
// }
// trans->UnlockMulti()
// ```
//
// 2. Multi squashing
//
// An important optimization for multi transactions is executing multiple single shard commands in
// parallel. Because multiple commands are "squashed" into a single hop, its called multi squashing.
// To mock the interface for commands, special "stub" transactions are created for each shard that
// directly execute hop callbacks without any scheduling. Transaction roles are represented by the
// MultiRole enum. See MultiCommandSquasher for the detailed squashing approach.
//
// The flow is as follows:
//
// ```
// for (cmd in single_shard_sequence)
//   sharded[shard].push_back(cmd)
//
// tx->PrepareSquashedMultiHop()
// tx->ScheduleSingleHop({
//   Transaction stub_tx {tx}
//   for (cmd)
//     // use stub_tx as regular multi tx, see 1. above
// })
//
// ```
class Transaction {
  friend class BlockingController;

  Transaction(const Transaction&);
  void operator=(const Transaction&) = delete;

  ~Transaction();  // Transactions are reference counted with intrusive_ptr.

  friend void intrusive_ptr_add_ref(Transaction* trans) noexcept {
    trans->use_count_.fetch_add(1, std::memory_order_relaxed);
  }

  friend void intrusive_ptr_release(Transaction* trans) noexcept {
    if (1 == trans->use_count_.fetch_sub(1, std::memory_order_release)) {
      std::atomic_thread_fence(std::memory_order_acquire);
      delete trans;
    }
  }

 public:
  // Result returned by callbacks. Most should use the implicit conversion from OpStatus.
  struct RunnableResult {
    enum Flag : uint16_t {
      // Can be issued by a **single** shard callback to avoid concluding, i.e. perform one more hop
      // even if not requested ahead. Used for blocking command fallback.
      AVOID_CONCLUDING = 1,
    };

    RunnableResult(OpStatus status = OpStatus::OK, uint16_t flags = 0)
        : status(status), flags(flags) {
    }

    operator OpStatus() const {
      return status;
    }

    OpStatus status;
    uint16_t flags;
  };

  static_assert(sizeof(RunnableResult) == 4);

  using time_point = ::std::chrono::steady_clock::time_point;
  // Runnable that is run on shards during hop executions (often named callback).
  // Callacks should return `OpStatus` which is implicitly converitble to `RunnableResult`!
  using RunnableType = absl::FunctionRef<RunnableResult(Transaction* t, EngineShard*)>;

  static constexpr std::nullopt_t kShardArgs{std::nullopt};
  // Provides an override to watch a specific key or kShardArgs to watch all keys in the shard.
  using WaitKeys = std::optional<std::string_view>;

  // Modes in which a multi transaction can run.
  enum MultiMode : uint8_t {
    // Invalid state.
    NOT_DETERMINED = 0,
    // Global transaction.
    GLOBAL = 1,
    // Keys are locked ahead during Schedule.
    LOCK_AHEAD = 2,
    // Each command is executed separately. Equivalent to a pipeline.
    NON_ATOMIC = 3,
  };

  // Squashed parallel execution requires a separate transaction for each shard. Those "stubs"
  // perform no scheduling or real hops, but instead execute the handlers directly inline.
  enum MultiRole {
    DEFAULT = 0,        // Regular multi transaction
    SQUASHER = 1,       // Owner of stub transactions
    SQUASHED_STUB = 2,  // Stub transaction
  };

  // State on specific shard.
  enum LocalMask : uint16_t {
    ACTIVE = 1,  // Whether its active on this shard (to schedule or execute hops)
    OPTIMISTIC_EXECUTION = 1 << 1,  // Whether the shard executed optimistically (during schedule)
    // Whether it can run out of order. Undefined if KEYLOCK_ACQUIRED isn't set
    OUT_OF_ORDER = 1 << 2,
    // Whether its key locks are acquired, never set for global commands.
    KEYLOCK_ACQUIRED = 1 << 3,

    // Whether it was suspended (by WatchInShard()). This flag is sticky and stays forever once set.
    WAS_SUSPENDED = 1 << 4,
    AWAKED_Q = 1 << 5,  // Whether it was awakened (by NotifySuspended())
  };

  struct Guard {
    explicit Guard(Transaction* tx);
    ~Guard();

   private:
    Transaction* tx;
  };

  static void Init(unsigned num_shards);
  static void Shutdown();

  explicit Transaction(const CommandId* cid);

  // Initialize transaction for squashing placed on a specific shard with a given parent tx
  explicit Transaction(const Transaction* parent, ShardId shard_id, std::optional<SlotId> slot_id);

  // Initialize from command (args) on specific db.
  OpStatus InitByArgs(Namespace* ns, DbIndex index, CmdArgList args);

  // Get command arguments for specific shard. Called from shard thread.
  ShardArgs GetShardArgs(ShardId sid) const;

  // Execute transaction hop. If conclude is true, it is removed from the pending queue.
  void Execute(RunnableType cb, bool conclude);

  // Execute single hop and conclude.
  // Callback should return OK for multi key invocations, otherwise return value is ill-defined.
  OpStatus ScheduleSingleHop(RunnableType cb);

  // Experimental command. Dispatch single hop and return,
  // use Blocker() primitive to wait for it to finish
  void SingleHopAsync(RunnableType cb);

  // Execute single hop with return value and conclude.
  // Can be used only for single key invocations, because it writes a into shared variable.
  template <typename F> auto ScheduleSingleHopT(F&& f) -> decltype(f(this, nullptr));

  // Conclude transaction. Ignored if not scheduled
  void Conclude();

  // Called by engine shard to execute a transaction hop.
  // Returns true if the transaction concludes.
  bool RunInShard(EngineShard* shard, bool allow_q_removal);

  // Registers transaction into watched queue and blocks until a) either notification is received.
  // or b) tp is reached. If tp is time_point::max() then waits indefinitely.
  // Expects that the transaction had been scheduled before, and uses Execute(.., true) to register.
  // Returns false if timeout occurred, true if was notified by one of the keys.
  facade::OpStatus WaitOnWatch(const time_point& tp, WaitKeys keys, KeyReadyChecker krc,
                               bool* block_flag, bool* pause_flag);

  // Returns true if transaction is awaked, false if it's timed-out and can be removed from the
  // blocking queue.
  bool NotifySuspended(ShardId sid, std::string_view key);

  // Cancel all blocking watches. Set COORD_CANCELLED.
  // Must be called from coordinator thread.
  void CancelBlocking(const std::function<OpStatus(ArgSlice)>&);

  // Prepare a squashed hop on given shards.
  // Only compatible with multi modes that acquire all locks ahead - global and lock_ahead.
  void PrepareSquashedMultiHop(const CommandId* cid, absl::FunctionRef<bool(ShardId)> enabled);

  // Prepare transaction to do a single ScheduleSingleHop() for squashing
  void PrepareSingleSquash(Namespace* ns, ShardId sid, DbIndex db, CmdArgList keys, MultiMode mode);

  // Start multi in GLOBAL mode.
  void StartMultiGlobal(Namespace* ns, DbIndex dbid);

  // Start multi in LOCK_AHEAD mode with given keys.
  void StartMultiLockedAhead(Namespace* ns, DbIndex dbid, CmdArgList keys,
                             bool skip_scheduling = false);

  // Start multi in NON_ATOMIC mode.
  void StartMultiNonAtomic();

  // Unlock key locks of a multi transaction.
  // If block is set, wait for unlock to finish.
  void UnlockMulti(bool block = false);

  // Set new command for multi transaction.
  void MultiSwitchCmd(const CommandId* cid);

  // Copy txid, time and unique slot from parent
  void MultiUpdateWithParent(const Transaction* parent);

  // Set squasher role
  void MultiBecomeSquasher();

  // Returns locking arguments needed for DbSlice to Acquire/Release transactional locks.
  // Runs in the shard thread.
  KeyLockArgs GetLockArgs(ShardId sid) const;

  // If the transaction is armed, disarm it and return the local mask (ACTIVE is always set).
  // Otherwise 0 is returned. Sync point (acquire).
  uint16_t DisarmInShard(ShardId sid);

  // Same as DisarmInShard, but the transaction is only disarmed if any of the req_flags is present.
  // If the transaction is armed, returns the local mask and a flag whether it was disarmed.
  std::pair<uint16_t, bool /* disarmed */> DisarmInShardWhen(ShardId sid, uint16_t req_flags);

  // Returns if the transaction spans this shard. Safe only when the transaction is armed.
  bool IsActive(ShardId sid) const;

  // If blocking tx was woken up on this shard, get wake key.
  std::optional<std::string_view> GetWakeKey(ShardId sid) const;

  // Get OpArgs for specific shard
  OpArgs GetOpArgs(EngineShard* shard) const;

  TxId txid() const {
    return txid_;
  }

  IntentLock::Mode LockMode() const;  // Based on command mask

  std::string_view Name() const;  // Based on command name

  uint32_t GetUniqueShardCnt() const {
    return unique_shard_cnt_;
  }

  // This method is meaningless if GetUniqueShardCnt() != 1.
  ShardId GetUniqueShard() const;

  std::optional<SlotId> GetUniqueSlotId() const;

  bool IsMulti() const {
    return bool(multi_);
  }

  bool IsScheduled() const {
    return coordinator_state_ & COORD_SCHED;
  }

  MultiMode GetMultiMode() const {
    return multi_->mode;
  }

  util::fb2::EmbeddedBlockingCounter* Blocker() {
    return &run_barrier_;
  }

  // Temporary
  OpStatus* LocalResultPtr() {
    return &local_result_;
  }

  // Whether the transaction is multi and runs in an atomic mode.
  // This, instead of just IsMulti(), should be used to check for the possibility of
  // different optimizations, because they can safely be applied to non-atomic multi
  // transactions as well.
  bool IsAtomicMulti() const {
    return multi_ && (multi_->mode == LOCK_AHEAD || multi_->mode == GLOBAL);
  }

  bool IsGlobal() const;

  DbContext GetDbContext() const {
    return DbContext{namespace_, db_index_, time_now_ms_};
  }

  Namespace& GetNamespace() const {
    return *namespace_;
  }

  DbSlice& GetDbSlice(ShardId sid) const;

  DbIndex GetDbIndex() const {
    return db_index_;
  }

  const CommandId* GetCId() const {
    return cid_;
  }

  // Return debug information about a transaction, include shard local info if passed
  std::string DebugId(std::optional<ShardId> sid = std::nullopt) const;

  // Write a journal entry to a shard journal with the given payload.
  void LogJournalOnShard(journal::Entry::Payload&& payload) const;

  // Re-enable auto journal for commands marked as NO_AUTOJOURNAL. Call during setup.
  void ReviveAutoJournal();

  // Clear all state to make transaction re-usable
  void Refurbish();

  // Get keys multi transaction was initialized with, normalized and unique
  const absl::flat_hash_set<std::pair<ShardId, LockFp>>& GetMultiFps() const;

  bool IsSquashedStub() const {
    return multi_ && multi_->role == SQUASHED_STUB;
  }

  uint32_t DEBUG_GetTxqPosInShard(ShardId sid) const {
    return shard_data_[SidToId(sid)].pq_pos;
  }

  bool DEBUG_IsArmedInShard(ShardId sid) const {
    return shard_data_[SidToId(sid)].is_armed.load(std::memory_order_relaxed);
  }

  uint16_t DEBUG_GetLocalMask(ShardId sid) const {
    return shard_data_[SidToId(sid)].local_mask;
  }

  void SetTrackingCallback(std::function<void(Transaction* trans)> f) {
    tracking_cb_ = std::move(f);
  }

  void MaybeInvokeTrackingCb() {
    if (tracking_cb_) {
      tracking_cb_(this);
    }
  }

  // Remove once BZPOP is stabilized
  std::string DEBUGV18_BlockInfo() {
    return "claimed=" + std::to_string(blocking_barrier_.IsClaimed()) +
           " coord_state=" + std::to_string(int(coordinator_state_)) +
           " local_res=" + std::to_string(int(local_result_));
  }

 private:
  struct alignas(64) PerShardData {
    PerShardData() {
    }
    PerShardData(PerShardData&& other) noexcept {
    }

    // State of shard - bitmask with LocalState flags
    uint16_t local_mask = 0;

    // Set when the shard is prepared for another hop. Sync point. Cleared when execution starts.
    std::atomic_bool is_armed = false;

    uint32_t slice_start = 0;  // Subspan in kv_args_ with local arguments.
    uint32_t slice_count = 0;

    // span into kv_fp_
    uint32_t fp_start = 0;
    uint32_t fp_count = 0;

    // Position in the tx queue. OOO or cancelled schedules remove themselves by this index.
    TxQueue::Iterator pq_pos = TxQueue::kEnd;

    // Index of key relative to args in shard that the shard was woken up after blocking wait.
    uint32_t wake_key_pos = UINT32_MAX;

    // Irrational stats purely for debugging purposes.
    struct Stats {
      unsigned total_runs = 0;  // total number of runs
    } stats;

    // Prevent "false sharing" between cache lines: occupy a full cache line (64 bytes)
    char pad[64 - 7 * sizeof(uint32_t) - sizeof(Stats)];
  };

  static_assert(sizeof(PerShardData) == 64);  // cacheline

  // State of a multi transaction.
  struct MultiData {
    MultiRole role;
    MultiMode mode;
    std::optional<IntentLock::Mode> lock_mode;

    // Unique normalized fingerprints used for scheduling the multi transaction.
    absl::flat_hash_set<std::pair<ShardId, LockFp>> tag_fps;

    // Set if the multi command is concluding to avoid ambiguity with COORD_CONCLUDING
    bool concluding = false;

    unsigned cmd_seq_num = 0;  // used for debugging purposes.
  };

  enum CoordinatorState : uint8_t {
    COORD_SCHED = 1,
    COORD_CONCLUDING = 1 << 1,  // Whether its the last hop of a transaction
    COORD_CANCELLED = 1 << 2,
  };

  // Auxiliary structure used during initialization
  struct PerShardCache {
    std::vector<IndexSlice> slices;
    unsigned key_step = 1;

    void Clear() {
      slices.clear();
    }
  };

  // "Single claim - single modification" barrier. Multiple threads might try to claim it, only one
  // will succeed and will be allowed to modify the guarded object until it closes the barrier.
  // A closed barrier can't be claimed again or re-used in any way.
  class BatonBarrier {
   public:
    bool IsClaimed() const;  // Return if barrier is claimed, only for peeking
    bool TryClaim();         // Return if the barrier was claimed successfully
    void Close();            // Close barrier after it was claimed

    // Wait for barrier until time_point, or indefinitely if time_point::max() was passed.
    // After Wait returns, the barrier is guaranteed to be closed, including expiration.
    std::cv_status Wait(time_point);

   private:
    std::atomic_bool claimed_{false};
    std::atomic_bool closed_{false};
    util::fb2::EventCount ec_{};
  };

  // Init basic fields and reset re-usable.
  void InitBase(Namespace* ns, DbIndex dbid, CmdArgList args);

  // Init as a global transaction.
  void InitGlobal();

  // Init with a set of keys.
  void InitByKeys(const KeyIndex& keys);

  void EnableShard(ShardId sid);
  void EnableAllShards();

  // Build shard index by distributing the arguments by shards based on the key index.
  void BuildShardIndex(const KeyIndex& keys, std::vector<PerShardCache>* out);

  // Init shard data from shard index.
  void InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args);

  // Store all key index keys in args_. Used only for single shard initialization.
  void StoreKeysInArgs(const KeyIndex& key_index);

  // Multi transactions unlock asynchronously, so they need to keep fingerprints of keys.
  void PrepareMultiFps(CmdArgList keys);

  void ScheduleInternal();

  // Schedule on shards transaction queue. Returns true if scheduled successfully,
  // false if inconsistent order was detected and the schedule needs to be cancelled.
  // if execute_optimistic is true - means we can try executing during the scheduling,
  // subject to uncontended keys.
  bool ScheduleInShard(EngineShard* shard, bool execute_optimistic);

  // Optimized extension of ScheduleInShard. Pulls several transactions queued for scheduling.
  static void ScheduleBatchInShard();

  // Set ARMED flags, start run barrier and submit poll tasks. Doesn't wait for the run barrier
  void DispatchHop();

  // Finish hop, decrement run barrier
  void FinishHop();

  // Run actual callback on shard, store result if single shard or OOM was catched
  void RunCallback(EngineShard* shard);

  // Adds itself to watched queue in the shard. Must run in that shard thread.
  void WatchInShard(Namespace* ns, ShardArgs keys, EngineShard* shard, KeyReadyChecker krc);

  // Expire blocking transaction, unlock keys and unregister it from the blocking controller
  void ExpireBlocking(WaitKeys keys);

  void ExpireShardCb(ShardArgs keys, EngineShard* shard);

  // Returns true if we need to follow up with PollExecution on this shard.
  bool CancelShardCb(EngineShard* shard);

  // Run callback inline as part of multi stub.
  OpStatus RunSquashedMultiCb(RunnableType cb);

  // Set time_now_ms_
  void InitTxTime();

  void UnlockMultiShardCb(absl::Span<const LockFp> fps, EngineShard* shard);

  // Log command in shard's journal, if this is a write command with auto-journaling enabled.
  // Should be called immediately after the last hop.
  void LogAutoJournalOnShard(EngineShard* shard, RunnableResult shard_result);

  // Whether the callback can be run directly on this thread without dispatching on the shard queue
  bool CanRunInlined() const;

  uint32_t GetUseCount() const {
    return use_count_.load(std::memory_order_relaxed);
  }

  bool IsActiveMulti() const {
    return multi_ && multi_->role != SQUASHED_STUB;
  }

  unsigned SidToId(ShardId sid) const {
    return sid < shard_data_.size() ? sid : 0;
  }

  // Iterate over all available shards, run functor accepting (PerShardData&, ShardId)
  template <typename F> void IterateShards(F&& f) {
    if (unique_shard_cnt_ == 1) {
      f(shard_data_[SidToId(unique_shard_id_)], unique_shard_id_);
    } else {
      for (ShardId i = 0; i < shard_data_.size(); ++i) {
        f(shard_data_[i], i);
      }
    }
  }

  // Iterate over ACTIVE shards, run functor accepting (PerShardData&, ShardId)
  template <typename F> void IterateActiveShards(F&& f) {
    IterateShards([&f](auto& sd, auto i) {
      if (sd.local_mask & ACTIVE)
        f(sd, i);
    });
  }

  // Used for waiting for all hop callbacks to run.
  util::fb2::EmbeddedBlockingCounter run_barrier_{0};

  // Stores per-shard data: state flags and keys. Index only with SidToId(shard index)!
  // Theoretically, same size as number of shards, but contains only a single element for
  // single shard non-multi transactions (optimization).
  // TODO: explore dense packing
  absl::InlinedVector<PerShardData, 4> shard_data_;

  // Stores slices of key/values partitioned by shards.
  // Slices reference full_args_.
  // We need values as well since we reorder keys, and we need to know what value corresponds
  // to what key.
  absl::InlinedVector<IndexSlice, 4> args_slices_;

  // Fingerprints of keys, precomputed once during the transaction initialization.
  absl::InlinedVector<LockFp, 4> kv_fp_;

  // Stores the full undivided command.
  CmdArgList full_args_;

  // Set if a NO_AUTOJOURNAL command asked to enable auto journal again
  bool re_enabled_auto_journal_ = false;

  std::optional<RunnableType> cb_ptr_;  // Run on shard threads
  const CommandId* cid_ = nullptr;      // Underlying command
  std::unique_ptr<MultiData> multi_;    // Initialized when the transaction is multi/exec.

  TxId txid_{0};
  bool global_{false};
  Namespace* namespace_{nullptr};
  DbIndex db_index_{0};
  uint64_t time_now_ms_{0};

  std::atomic_uint32_t use_count_{0};  // transaction exists only as an intrusive_ptr

  uint32_t unique_shard_cnt_{0};          // Number of unique shards active
  ShardId unique_shard_id_{kInvalidSid};  // Set if unique_shard_cnt_ = 1
  UniqueSlotChecker unique_slot_checker_;

  // Barrier for waking blocking transactions that ensures exclusivity of waking operation.
  BatonBarrier blocking_barrier_{};

  // Stores status if COORD_CANCELLED was set. Apart from cancelled, it can be moved for cluster
  // changes
  OpStatus block_cancel_result_ = OpStatus::OK;

  // Transaction coordinator state, written and read by coordinator thread.
  uint8_t coordinator_state_ = 0;

  // Result of callbacks. Usually written by single shard only, lock below for multishard oom error
  OpStatus local_result_ = OpStatus::OK;
  absl::base_internal::SpinLock local_result_mu_;

  // Stats purely for debugging purposes
  struct Stats {
    size_t schedule_attempts = 0;
    ShardId coordinator_index = 0;
  } stats_;

  std::function<void(Transaction* trans)> tracking_cb_;

 private:
  struct TLTmpSpace {
    std::vector<PerShardCache>& GetShardIndex(unsigned size);

   private:
    std::vector<PerShardCache> shard_cache;
  };

  static thread_local TLTmpSpace tmp_space;
};

template <typename F> auto Transaction::ScheduleSingleHopT(F&& f) -> decltype(f(this, nullptr)) {
  decltype(f(this, nullptr)) res;

  ScheduleSingleHop([&res, f = std::forward<F>(f)](Transaction* t, EngineShard* shard) {
    res = f(t, shard);
    return res.status();
  });
  return res;
}

OpResult<KeyIndex> DetermineKeys(const CommandId* cid, CmdArgList args);

}  // namespace dfly


================================================
FILE: src/server/tx_base.cc
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/tx_base.h"

#include <xxhash.h>

#include "base/logging.h"
#include "facade/facade_types.h"
#include "server/cluster/cluster_defs.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/namespaces.h"
#include "server/transaction.h"

namespace dfly {

using namespace std;
using Payload = journal::Entry::Payload;

unsigned KeyIndex::operator*() const {
  if (bonus)
    return *bonus;
  return start;
}

KeyIndex& KeyIndex::operator++() {
  if (bonus)
    bonus.reset();
  else
    start = std::min(end, start + step);
  return *this;
}

bool KeyIndex::operator!=(const KeyIndex& ki) const {
  return std::tie(start, end, step, bonus) != std::tie(ki.start, ki.end, ki.step, ki.bonus);
}

DbSlice& DbContext::GetDbSlice(ShardId shard_id) const {
  return ns->GetDbSlice(shard_id);
}

DbSlice& OpArgs::GetDbSlice() const {
  return db_cntx.GetDbSlice(shard->shard_id());
}

size_t ShardArgs::Size() const {
  size_t sz = 0;
  for (const auto& s : slice_.second)
    sz += (s.second - s.first);
  return sz;
}

void RecordJournal(const OpArgs& op_args, string_view cmd, const ShardArgs& args, uint32_t unused) {
  DCHECK(op_args.tx);
  VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
  op_args.tx->LogJournalOnShard(Payload(cmd, args));
}

void RecordJournal(const OpArgs& op_args, std::string_view cmd, facade::ArgSlice args,
                   uint32_t unused) {
  DCHECK(op_args.tx);
  VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
  op_args.tx->LogJournalOnShard(Payload(cmd, args));
}

void RecordDelete(DbIndex dbid, string_view key) {
  journal::RecordEntry(0, journal::Op::COMMAND, dbid, KeySlot(key), Payload("DEL", ArgSlice{key}));
}

LockTag::LockTag(std::string_view key) {
  if (LockTagOptions::instance().enabled)
    str_ = LockTagOptions::instance().Tag(key);
  else
    str_ = key;
}

LockFp LockTag::Fingerprint() const {
  return XXH64(str_.data(), str_.size(), 0x1C69B3F74AC4AE35UL);
}

}  // namespace dfly


================================================
FILE: src/server/tx_base.h
================================================
// Copyright 2024, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <absl/types/span.h>

#include <cstdint>
#include <optional>

#include "base/iterator.h"
#include "common/arg_range.h"
#include "server/common_types.h"

namespace dfly {

using cmn::ArgSlice;

struct KeyLockArgs {
  DbIndex db_index = 0;
  absl::Span<const LockFp> fps;
};

// Describes key indices.
struct KeyIndex {
  KeyIndex(unsigned start = 0, unsigned end = 0, unsigned step = 1,
           std::optional<unsigned> bonus = std::nullopt)
      : start(start), end(end), step(step), bonus(bonus) {
  }

  using iterator_category = std::forward_iterator_tag;
  using value_type = unsigned;
  using difference_type = std::ptrdiff_t;
  using pointer = value_type;
  using reference = value_type;

  unsigned operator*() const;
  KeyIndex& operator++();
  bool operator!=(const KeyIndex& ki) const;

  unsigned NumArgs() const {
    return (end - start) + unsigned(bonus.has_value());
  }

  auto Range() const {
    return base::it::Range(*this, KeyIndex{end, end, step, std::nullopt});
  }

  auto Range(const cmn::ArgSlice& args) const {
    return base::it::Transform([args](unsigned idx) { return args[idx]; }, Range());
  }

 public:
  unsigned start, end, step;      // [start, end) with step
  std::optional<unsigned> bonus;  // destination key, for example for commands that end with STORE
};

struct DbContext {
  Namespace* ns = nullptr;
  DbIndex db_index = 0;
  uint64_t time_now_ms = 0;

  // Convenience method.
  DbSlice& GetDbSlice(ShardId shard_id) const;
};

struct OpArgs {
  EngineShard* shard = nullptr;
  const Transaction* tx = nullptr;
  DbContext db_cntx;

  OpArgs() = default;

  OpArgs(EngineShard* s, const Transaction* tx, const DbContext& cntx)
      : shard(s), tx(tx), db_cntx(cntx) {
  }

  // Convenience method.
  DbSlice& GetDbSlice() const;
};

// A strong type for a lock tag. Helps to disambiguate between keys and the parts of the
// keys that are used for locking.
class LockTag {
  std::string_view str_;

 public:
  using is_stackonly = void;  // marks that this object does not use heap.

  LockTag() = default;
  explicit LockTag(std::string_view key);

  explicit operator std::string_view() const {
    return str_;
  }

  LockFp Fingerprint() const;

  // To make it hashable.
  template <typename H> friend H AbslHashValue(H h, const LockTag& tag) {
    return H::combine(std::move(h), tag.str_);
  }

  bool operator==(const LockTag& o) const {
    return str_ == o.str_;
  }
};

// Checks whether the touched key is valid for a blocking transaction watching it.
using KeyReadyChecker =
    std::function<bool(EngineShard*, const DbContext& context, Transaction* tx, std::string_view)>;

// References arguments in another array.
using IndexSlice = std::pair<uint32_t, uint32_t>;  // [begin, end)

// ShardArgs - hold a span to full arguments and a span of sub-ranges
// referencing those arguments.
class ShardArgs {
  using ArgsIndexPair = std::pair<cmn::ArgSlice, absl::Span<const IndexSlice>>;
  ArgsIndexPair slice_;

 public:
  class Iterator {
    cmn::ArgSlice arglist_;
    absl::Span<const IndexSlice>::const_iterator index_it_;
    uint32_t delta_ = 0;

   public:
    using iterator_category = std::input_iterator_tag;
    using value_type = std::string_view;
    using difference_type = ptrdiff_t;
    using pointer = value_type*;
    using reference = value_type&;

    // First version, corresponds to spans over arguments.
    Iterator(cmn::ArgSlice list, absl::Span<const IndexSlice>::const_iterator it)
        : arglist_(list), index_it_(it) {
    }

    bool operator==(const Iterator& o) const {
      return index_it_ == o.index_it_ && delta_ == o.delta_ && arglist_.data() == o.arglist_.data();
    }

    bool operator!=(const Iterator& o) const {
      return !(*this == o);
    }

    std::string_view operator*() const {
      return arglist_[index()];
    }

    Iterator& operator++() {
      ++delta_;
      if (index() >= index_it_->second) {
        ++index_it_;
        ++delta_ = 0;
      }
      return *this;
    }

    Iterator operator++(int) {
      Iterator copy = *this;
      operator++();
      return copy;
    }

    size_t index() const {
      return index_it_->first + delta_;
    }
  };

  using const_iterator = Iterator;

  ShardArgs(cmn::ArgSlice fa, absl::Span<const IndexSlice> s) : slice_(ArgsIndexPair(fa, s)) {
  }

  ShardArgs() : slice_(ArgsIndexPair{}) {
  }

  size_t Size() const;

  Iterator cbegin() const {
    return Iterator{slice_.first, slice_.second.begin()};
  }

  Iterator cend() const {
    return Iterator{slice_.first, slice_.second.end()};
  }

  Iterator begin() const {
    return cbegin();
  }

  Iterator end() const {
    return cend();
  }

  bool Empty() const {
    return slice_.second.empty();
  }

  std::string_view Front() const {
    return *cbegin();
  }
};

// Record non auto journal command with own txid and dbid.
void RecordJournal(const OpArgs& op_args, std::string_view cmd, const ShardArgs& args,
                   uint32_t unused = 1);
void RecordJournal(const OpArgs& op_args, std::string_view cmd, ArgSlice args, uint32_t unused = 1);

void RecordDelete(DbIndex dbid, std::string_view key);

// Record expiry in journal with independent transaction.
// Must be called from shard thread owning key.
// Might block the calling fiber unless journal::SetFlushMode(false) is called.
inline void RecordExpiryBlocking(DbIndex dbid, std::string_view key) {
  RecordDelete(dbid, key);
}

}  // namespace dfly


================================================
FILE: src/server/version.cc.in
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "version.h"

namespace dfly {

// Do not edit - autogenerated file. Please see version.cc.in for details.

const char kGitTag[] = "@GIT_VER@";
const char kGitSha[] = "@GIT_SHA1@";
const char kGitClean[] = "@GIT_CLEAN_DIRTY@";
const char kBuildTime[] = "@PRJ_BUILD_TIME@";

const char* GetVersion() { return "df-@GIT_VER@"; }

}  // namespace dfly


================================================
FILE: src/server/version.h
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

namespace dfly {

extern const char kGitTag[];
extern const char kGitSha[];
extern const char kGitClean[];
extern const char kBuildTime[];

const char* GetVersion();

// An enum for internal versioning of dragonfly specific behavior.
// Please document for each new entry what the behavior changes are
// and to which released versions this corresponds.
enum class DflyVersion {
  // 1.4  <= ver <= 1.10
  // - Supports receiving ACKs from replicas
  // - Sends version back on REPLCONF capa dragonfly
  VER1,

  // 1.11 <= ver
  // Supports limited partial sync
  VER2,

  // 1.15 < ver
  // ACL with user replication
  VER3,

  // - Periodic lag checks from master to replica
  VER4,

  // - Support partial sync from different master
  VER5,

  // 1.37 <= ver
  // - Per-shard search index definitions (search-index AUX on every flow)
  // - HNSW index serialization opcodes (RDB_OPCODE_VECTOR_INDEX, RDB_OPCODE_SHARD_DOC_INDEX)
  // - hnsw-index-metadata AUX field
  VER6,

  // Always points to the latest version
  CURRENT_VER = VER6,
};

}  // namespace dfly


================================================
FILE: src/server/version_monitor.cc
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/version_monitor.h"

#include <absl/strings/numbers.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_split.h>
#include <openssl/err.h>

#include <boost/beast/http/string_body.hpp>
#include <regex>

#include "base/logging.h"
#include "server/version.h"

namespace dfly {

using namespace std;
using namespace util;
using http::TlsClient;

namespace {

std::optional<std::string> GetVersionString(const std::string& version_str) {
  // The server sends a message such as {"latest": "0.12.0"}
  const auto reg_match_expr = R"(\{\"latest"\:[ \t]*\"([0-9]+\.[0-9]+\.[0-9]+)\"\})";
  VLOG(1) << "checking version '" << version_str << "'";
  auto const regex = std::regex(reg_match_expr);
  std::smatch match;
  if (std::regex_match(version_str, match, regex) && match.size() > 1) {
    // the second entry is the match to the group that holds the version string
    return match[1].str();
  } else {
    LOG_FIRST_N(WARNING, 1) << "Remote version - invalid version number: '" << version_str << "'";
    return std::nullopt;
  }
}

std::optional<std::string> GetRemoteVersion(ProactorBase* proactor, SSL_CTX* ssl_context,
                                            const std::string host, std::string_view service,
                                            const std::string& resource,
                                            const std::string& ver_header) {
  namespace bh = boost::beast::http;
  using ResponseType = bh::response<bh::string_body>;

  bh::request<bh::string_body> req{bh::verb::get, resource, 11 /*http 1.1*/};
  req.set(bh::field::host, host);
  req.set(bh::field::user_agent, ver_header);
  ResponseType res;
  TlsClient http_client{proactor};
  http_client.set_connect_timeout_ms(2000);

  auto ec = http_client.Connect(host, service, ssl_context);

  if (ec) {
    LOG_FIRST_N(WARNING, 1) << "Remote version - connection error [" << host << ":" << service
                            << "] : " << ec.message();
    return nullopt;
  }

  ec = http_client.Send(req, &res);
  if (!ec) {
    VLOG(1) << "successfully got response from HTTP GET for host " << host << ":" << service << "/"
            << resource << " response code is " << res.result();

    if (res.result() == bh::status::ok) {
      return GetVersionString(res.body());
    }
  } else {
    static bool is_logged{false};
    if (!is_logged) {
      is_logged = true;

#if (OPENSSL_VERSION_NUMBER >= 0x30000000L)
      const char* func_err = "ssl_internal_error";
#else
      const char* func_err = ERR_func_error_string(ec.value());
#endif

      // Unfortunately AsioStreamAdapter looses the original error category
      // because std::error_code can not be converted into boost::system::error_code.
      // It's fixed in later versions of Boost, but for now we assume it's from TLS.
      LOG(WARNING) << "Remote version - HTTP GET error [" << host << ":" << service << resource
                   << "], error: " << ec.value();
      LOG(WARNING) << "ssl error: " << func_err << "/" << ERR_reason_error_string(ec.value());
    }
  }

  return nullopt;
}

}  // namespace

bool VersionMonitor::IsVersionOutdated(const std::string_view remote,
                                       const std::string_view current) const {
  const absl::InlinedVector<absl::string_view, 3> remote_xyz = absl::StrSplit(remote, ".");
  const absl::InlinedVector<absl::string_view, 3> current_xyz = absl::StrSplit(current, ".");
  if (remote_xyz.size() != current_xyz.size()) {
    LOG(WARNING) << "Can't compare Dragonfly version " << current << " to latest version "
                 << remote;
    return false;
  }
  const auto print_to_log = [](const std::string_view version, const absl::string_view part) {
    LOG(WARNING) << "Can't parse " << version << " part of version " << part << " as a number";
  };
  for (size_t i = 0; i < remote_xyz.size(); ++i) {
    size_t remote_x = 0;
    if (!absl::SimpleAtoi(remote_xyz[i], &remote_x)) {
      print_to_log(remote, remote_xyz[i]);
      return false;
    }
    size_t current_x = 0;
    if (!absl::SimpleAtoi(current_xyz[i], &current_x)) {
      print_to_log(current, current_xyz[i]);
      return false;
    }
    if (remote_x > current_x) {
      return true;
    }

    if (remote_x < current_x) {
      return false;
    }
  }

  return false;
}

void VersionMonitor::Run(ProactorPool* proactor_pool) {
  // Avoid running dev environments.
  if (getenv("DFLY_DEV_ENV")) {
    LOG(WARNING) << "Running in dev environment (DFLY_DEV_ENV is set) - version monitoring is "
                    "disabled";
    return;
  }

  SslPtr ssl_ctx(TlsClient::CreateSslContext());
  if (!ssl_ctx) {
    VLOG(1) << "Remote version - failed to create SSL context - cannot run version monitoring";
    return;
  }

  version_fiber_ = proactor_pool->GetNextProactor()->LaunchFiber(
      [ssl_ctx = std::move(ssl_ctx), this]() mutable { RunTask(std::move(ssl_ctx)); });
}

void VersionMonitor::Shutdown() {
  monitor_ver_done_.Notify();
  if (version_fiber_.IsJoinable()) {
    version_fiber_.Join();
  }
}

void VersionMonitor::RunTask(SslPtr ssl_ctx) {
  const auto loop_sleep_time = std::chrono::hours(24);  // every 24 hours

  const std::string host_name = "version.dragonflydb.io";
  const std::string_view port = "443";
  const std::string resource = "/v1";
  string_view current_version(kGitTag);

  current_version.remove_prefix(1);
  const std::string version_header = absl::StrCat("DragonflyDB/", current_version);

  ProactorBase* my_pb = ProactorBase::me();
  while (true) {
    const std::optional<std::string> remote_version =
        GetRemoteVersion(my_pb, ssl_ctx.get(), host_name, port, resource, version_header);
    if (remote_version) {
      const std::string_view rv = remote_version.value();
      if (IsVersionOutdated(rv, current_version)) {
        LOG_FIRST_N(INFO, 1) << "Your current version '" << current_version
                             << "' is not the latest version. A newer version '" << rv
                             << "' is now available. Please consider an update.";
      }
    }
    if (monitor_ver_done_.WaitFor(loop_sleep_time)) {
      VLOG(1) << "finish running version monitor task";
      return;
    }
  }
}

}  // namespace dfly


================================================
FILE: src/server/version_monitor.h
================================================
// Copyright 2023, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once

#include "util/fibers/fibers.h"
#include "util/fibers/pool.h"
#include "util/http/http_client.h"

namespace dfly {

class VersionMonitor {
 public:
  void Run(util::ProactorPool* proactor_pool);

  void Shutdown();

 private:
  struct SslDeleter {
    void operator()(SSL_CTX* ssl) {
      if (ssl) {
        util::http::TlsClient::FreeContext(ssl);
      }
    }
  };

  using SslPtr = std::unique_ptr<SSL_CTX, SslDeleter>;
  void RunTask(SslPtr);

  bool IsVersionOutdated(std::string_view remote, std::string_view current) const;

  util::fb2::Fiber version_fiber_;
  util::fb2::Done monitor_ver_done_;
};

}  // namespace dfly


================================================
FILE: src/server/zset_family.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/zset_family.h"

#include <absl/strings/ascii.h>

extern "C" {
#include "redis/listpack.h"
#include "redis/redis_aux.h"
#include "redis/util.h"
#include "redis/zmalloc.h"
}

#include "base/logging.h"
#include "base/stl_util.h"
#include "core/sorted_map.h"
#include "facade/cmd_arg_parser.h"
#include "facade/error.h"
#include "server/acl/acl_commands_def.h"
#include "server/blocking_controller.h"
#include "server/cluster/cluster_defs.h"
#include "server/command_registry.h"
#include "server/conn_context.h"
#include "server/container_utils.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/error.h"
#include "server/family_utils.h"
#include "server/namespaces.h"
#include "server/transaction.h"

namespace dfly {

using namespace std;
using namespace facade;
using absl::SimpleAtoi;
namespace {

using CI = CommandId;

const char kNxXxErr[] = "XX and NX options at the same time are not compatible";
const char kLexRangeErr[] = "min or max not valid string range item";
const char kFloatRangeErr[] = "min or max is not a float";
const char kScoreNaN[] = "resulting score is not a number (NaN)";

using MScoreResponse = std::vector<std::optional<double>>;
using ScoredMember = ZSetFamily::ScoredMember;
using ScoredArray = ZSetFamily::ScoredArray;
using ScoredMemberView = ZSetFamily::ScoredMemberView;
using ScoredMemberSpan = ZSetFamily::ScoredMemberSpan;

struct ValidateZMPopResult {
  uint32_t num_keys;
  bool is_max;
  int pop_count;
  float timeout;
};

inline zrangespec GetZrangeSpec(bool reverse, const ZSetFamily::ScoreInterval& si) {
  auto interval = si;
  if (reverse)
    swap(interval.first, interval.second);

  zrangespec range;
  range.min = interval.first.val;
  range.max = interval.second.val;
  range.minex = interval.first.is_open;
  range.maxex = interval.second.is_open;

  return range;
}

sds GetLexStr(const ZSetFamily::LexBound& bound) {
  if (bound.type == ZSetFamily::LexBound::MINUS_INF)
    return cminstring;

  if (bound.type == ZSetFamily::LexBound::PLUS_INF)
    return cmaxstring;

  return sdsnewlen(bound.val.data(), bound.val.size());
};

zlexrangespec GetLexRange(bool reverse, const ZSetFamily::LexInterval& li) {
  auto interval = li;
  if (reverse)
    swap(interval.first, interval.second);

  zlexrangespec range;
  range.minex = 0;
  range.maxex = 0;

  range.min = GetLexStr(interval.first);
  range.max = GetLexStr(interval.second);
  range.minex = (interval.first.type == ZSetFamily::LexBound::OPEN);
  range.maxex = (interval.second.type == ZSetFamily::LexBound::OPEN);

  return range;
}

bool IsListPack(const PrimeValue& pv) {
  return pv.Encoding() == OBJ_ENCODING_LISTPACK;
}

/* Delete the element 'ele' from the sorted set, returning 1 if the element
 * existed and was deleted, 0 otherwise (the element was not there).
 * taken from t_zset.c
 */

int ZsetDel(PrimeValue* pv, std::string_view ele) {
  if (IsListPack(*pv)) {
    uint8_t* lp = (uint8_t*)pv->RObjPtr();
    unsigned char* eptr = detail::ZzlFind(lp, ele, nullptr);
    if (eptr) {
      lp = lpDeleteRangeWithEntry(lp, &eptr, 2);
      pv->SetRObjPtr(lp);
      return 1;
    }
  } else if (pv->Encoding() == OBJ_ENCODING_SKIPLIST) {
    detail::SortedMap* zs = (detail::SortedMap*)pv->RObjPtr();
    if (zs->Delete(ele))
      return 1;
  }
  return 0; /* No such element found. */
}

// taken from t_zset.c
std::optional<double> GetZsetScore(const PrimeValue& pv, std::string_view member) {
  if (IsListPack(pv)) {
    double score;
    if (detail::ZzlFind((uint8_t*)pv.RObjPtr(), member, &score) == NULL)
      return std::nullopt;
    return score;
  }

  if (pv.Encoding() == OBJ_ENCODING_SKIPLIST) {
    detail::SortedMap* zs = (detail::SortedMap*)pv.RObjPtr();
    return zs->GetScore(member);
  }

  LOG(FATAL) << "Unknown sorted set encoding";
  return 0;
}

int ZsetAdd(PrimeValue* pv, double score, std::string_view ele, int in_flags, int* out_flags,
            double* newscore) {
  *out_flags = 0; /* We'll return our response flags. */
  double curscore;

  /* NaN as input is an error regardless of all the other parameters. */
  if (isnan(score)) {
    *out_flags = ZADD_OUT_NAN;
    return 0;
  }

  /* Update the sorted set according to its encoding. */
  if (pv->Encoding() == OBJ_ENCODING_LISTPACK) {
    /* Turn options into simple to check vars. */
    bool incr = (in_flags & ZADD_IN_INCR) != 0;
    bool nx = (in_flags & ZADD_IN_NX) != 0;
    bool xx = (in_flags & ZADD_IN_XX) != 0;
    bool gt = (in_flags & ZADD_IN_GT) != 0;
    bool lt = (in_flags & ZADD_IN_LT) != 0;

    uint8_t* lp = (uint8_t*)pv->RObjPtr();
    uint8_t* eptr = detail::ZzlFind(lp, ele, &curscore);
    if (eptr != NULL) {
      /* NX? Return, same element already exists. */
      if (nx) {
        *out_flags |= ZADD_OUT_NOP;
        return 1;
      }

      /* Prepare the score for the increment if needed. */
      if (incr) {
        score += curscore;
        if (isnan(score)) {
          *out_flags |= ZADD_OUT_NAN;
          return 0;
        }
      }

      /* GT/LT? Only update if score is greater/less than current. */
      if ((lt && score >= curscore) || (gt && score <= curscore)) {
        *out_flags |= ZADD_OUT_NOP;
        return 1;
      }

      if (newscore)
        *newscore = score;

      /* Remove and re-insert when score changed. */
      if (score != curscore) {
        lp = lpDeleteRangeWithEntry(lp, &eptr, 2);
        lp = detail::ZzlInsert(lp, ele, score);
        pv->SetRObjPtr(lp);
        *out_flags |= ZADD_OUT_UPDATED;
      }

      return 1;
    } else if (!xx) {
      unsigned zl_len = lpLength(lp) / 2;

      /* check if the element is too large or the list
       * becomes too long *before* executing zzlInsert. */
      if (zl_len >= ZSET_MAX_LISTPACK_ENTRIES || ele.size() > ZSET_MAX_LISTPACK_VALUE) {
        auto* ptr = detail::SortedMap::FromListPack(pv->memory_resource(), lp);
        pv->InitRobj(OBJ_ZSET, OBJ_ENCODING_SKIPLIST, ptr);
      } else {
        lp = detail::ZzlInsert(lp, ele, score);
        pv->SetRObjPtr(lp);
        if (newscore)
          *newscore = score;
        *out_flags |= ZADD_OUT_ADDED;
        return 1;
      }
    } else {
      *out_flags |= ZADD_OUT_NOP;
      return 1;
    }
  }

  CHECK_EQ(pv->Encoding(), OBJ_ENCODING_SKIPLIST);
  detail::SortedMap* ss = (detail::SortedMap*)pv->RObjPtr();
  return ss->AddElem(score, ele, in_flags, out_flags, newscore);
}

void OutputScoredArrayResult(const OpResult<ScoredArray>& result, SinkReplyBuilder* builder) {
  if (result.status() == OpStatus::WRONG_TYPE) {
    return builder->SendError(kWrongTypeErr);
  }

  LOG_IF(WARNING, !result && result.status() != OpStatus::KEY_NOTFOUND)
      << "Unexpected status " << result.status();
  auto* rb = static_cast<RedisReplyBuilder*>(builder);
  rb->SendScoredArray(result.value(), true /* with scores */);
}

OpResult<DbSlice::ItAndUpdater> PrepareZEntry(const ZSetFamily::ZParams& zparams,
                                              const OpArgs& op_args, string_view key,
                                              size_t member_len) {
  auto& db_slice = op_args.GetDbSlice();
  if (zparams.flags & ZADD_IN_XX) {
    return db_slice.FindMutable(op_args.db_cntx, key, OBJ_ZSET);
  }

  // Here we use nullopt for type because we can override the type if it exists.
  // If override is not set, we will return an error if the type is not OBJ_ZSET.
  auto op_res = db_slice.AddOrFind(op_args.db_cntx, key, std::nullopt);
  RETURN_ON_BAD_STATUS(op_res);
  auto& add_res = *op_res;

  auto& it = add_res.it;
  PrimeValue& pv = it->second;
  if (add_res.is_new || zparams.override) {
    // If we're overwriting an existing key (not a new one), we need to remove it from
    // search indexes first. This prevents crashes when the key is indexed (e.g., HASH or JSON).
    if (!add_res.is_new && zparams.override) {
      RemoveKeyFromIndexesIfNeeded(key, op_args.db_cntx, pv, op_args.shard);
    }

    if (member_len > server.max_map_field_len) {
      pv.InitRobj(OBJ_ZSET, OBJ_ENCODING_SKIPLIST, CompactObj::AllocateMR<detail::SortedMap>());
    } else {
      unsigned char* lp = lpNew(0);
      pv.InitRobj(OBJ_ZSET, OBJ_ENCODING_LISTPACK, lp);
    }
  } else {
    if (it->second.ObjType() != OBJ_ZSET)
      return OpStatus::WRONG_TYPE;
  }

  if (!add_res.is_new && zparams.override)
    db_slice.RemoveExpire(op_args.db_cntx.db_index, it);

  auto* blocking_controller = op_args.db_cntx.ns->GetBlockingController(op_args.shard->shard_id());
  if (add_res.is_new && blocking_controller) {
    blocking_controller->Awaken(op_args.db_cntx.db_index, key);
  }

  return DbSlice::ItAndUpdater{add_res.it, add_res.exp_it, std::move(add_res.post_updater)};
}

enum class Action : uint8_t { RANGE = 0, REMOVE = 1, POP = 2 };

class IntervalVisitor {
 public:
  IntervalVisitor(Action action, const ZSetFamily::RangeParams& params, PrimeValue* pv)
      : action_(action), params_(params), pv_(pv) {
  }

  void operator()(const ZSetFamily::IndexInterval& ii);

  void operator()(const ZSetFamily::ScoreInterval& si);

  void operator()(const ZSetFamily::LexInterval& li);

  void operator()(ZSetFamily::TopNScored sc);

  ScoredArray PopResult() {
    return std::move(result_);
  }

  unsigned removed() const {
    return removed_;
  }

 private:
  void ExtractListPack(const zrangespec& range);
  void ExtractSkipList(const zrangespec& range);

  void ExtractListPack(const zlexrangespec& range);
  void ExtractSkipList(const zlexrangespec& range);

  void PopListPack(ZSetFamily::TopNScored sc);
  void PopSkipList(ZSetFamily::TopNScored sc);

  void ActionRange(unsigned start, unsigned end);  // rank
  void ActionRange(const zrangespec& range);       // score
  void ActionRange(const zlexrangespec& range);    // lex

  void ActionRem(unsigned start, unsigned end);  // rank
  void ActionRem(const zrangespec& range);       // score
  void ActionRem(const zlexrangespec& range);    // lex

  void ActionPop(ZSetFamily::TopNScored sc);

  void Next(uint8_t* zl, uint8_t** eptr, uint8_t** sptr) const {
    if (params_.reverse) {
      detail::ZzlPrev(zl, eptr, sptr);
    } else {
      detail::ZzlNext(zl, eptr, sptr);
    }
  }

  bool IsUnder(double score, const zrangespec& spec) const {
    return params_.reverse ? detail::ZslValueGteMin(score, &spec)
                           : detail::ZslValueLteMax(score, &spec);
  }

  void AddResult(const uint8_t* vstr, unsigned vlen, long long vlon, double score);

  Action action_;
  ZSetFamily::RangeParams params_;
  PrimeValue* pv_;

  ScoredArray result_;
  unsigned removed_ = 0;
};

void IntervalVisitor::operator()(const ZSetFamily::IndexInterval& ii) {
  unsigned long llen = pv_->Size();
  int64_t start = ii.first;
  int64_t end = ii.second;

  if (start < 0)
    start = llen + start;
  if (end < 0)
    end = llen + end;
  if (start < 0)
    start = 0;

  if (start > end || unsigned(start) >= llen) {
    return;
  }

  if (unsigned(end) >= llen)
    end = llen - 1;

  switch (action_) {
    case Action::RANGE:
      ActionRange(start, end);
      break;
    case Action::REMOVE:
      ActionRem(start, end);
      break;
    default:
      break;
  }
}

void IntervalVisitor::operator()(const ZSetFamily::ScoreInterval& si) {
  zrangespec range = GetZrangeSpec(params_.reverse, si);

  switch (action_) {
    case Action::RANGE:
      ActionRange(range);
      break;
    case Action::REMOVE:
      ActionRem(range);
      break;
    default:
      break;
  }
}

void IntervalVisitor::operator()(const ZSetFamily::LexInterval& li) {
  zlexrangespec range = GetLexRange(params_.reverse, li);

  switch (action_) {
    case Action::RANGE:
      ActionRange(range);
      break;
    case Action::REMOVE:
      ActionRem(range);
      break;
    default:
      break;
  }
  detail::ZslFreeLexRange(&range);
}

void IntervalVisitor::operator()(ZSetFamily::TopNScored sc) {
  switch (action_) {
    case Action::POP:
      ActionPop(sc);
      break;
    default:
      break;
  }
}

void IntervalVisitor::ActionRange(unsigned start, unsigned end) {
  if (params_.limit == 0)
    return;

  // Calculate new start and end given offset and limit.
  start += params_.offset;
  end = min<size_t>(size_t(start) + params_.limit - 1, end);
  if (start > end) {
    return;
  }

  container_utils::IterateSortedSet(
      *pv_,
      [this](container_utils::ContainerEntry ce, double score) {
        result_.emplace_back(ce.ToString(), score);
        return true;
      },
      start, end, params_.reverse, params_.with_scores);
}

void IntervalVisitor::ActionRange(const zrangespec& range) {
  if (IsListPack(*pv_)) {
    ExtractListPack(range);
  } else {
    CHECK_EQ(pv_->Encoding(), OBJ_ENCODING_SKIPLIST);
    ExtractSkipList(range);
  }
}

void IntervalVisitor::ActionRange(const zlexrangespec& range) {
  if (IsListPack(*pv_)) {
    ExtractListPack(range);
  } else {
    CHECK_EQ(pv_->Encoding(), OBJ_ENCODING_SKIPLIST);
    ExtractSkipList(range);
  }
}

void IntervalVisitor::ActionRem(unsigned start, unsigned end) {
  if (IsListPack(*pv_)) {
    uint8_t* zl = (uint8_t*)pv_->RObjPtr();

    removed_ = (end - start) + 1;
    zl = lpDeleteRange(zl, 2 * start, 2 * removed_);
    pv_->SetRObjPtr(zl);
  } else {
    CHECK_EQ(OBJ_ENCODING_SKIPLIST, pv_->Encoding());
    detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();
    removed_ = zs->DeleteRangeByRank(start, end);
  }
}

void IntervalVisitor::ActionRem(const zrangespec& range) {
  if (IsListPack(*pv_)) {
    uint8_t* zl = (uint8_t*)pv_->RObjPtr();
    unsigned long deleted = 0;
    zl = detail::ZzlDeleteRangeByScore(zl, &range, &deleted);
    pv_->SetRObjPtr(zl);
    removed_ = deleted;
  } else {
    CHECK_EQ(OBJ_ENCODING_SKIPLIST, pv_->Encoding());
    detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();
    removed_ = zs->DeleteRangeByScore(range);
  }
}

void IntervalVisitor::ActionRem(const zlexrangespec& range) {
  if (IsListPack(*pv_)) {
    uint8_t* zl = (uint8_t*)pv_->RObjPtr();
    unsigned long deleted = 0;
    zl = detail::ZzlDeleteRangeByLex(zl, &range, &deleted);
    pv_->SetRObjPtr(zl);
    removed_ = deleted;
  } else {
    CHECK_EQ(OBJ_ENCODING_SKIPLIST, pv_->Encoding());
    detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();
    removed_ = zs->DeleteRangeByLex(range);
  }
}

void IntervalVisitor::ActionPop(ZSetFamily::TopNScored sc) {
  if (sc > 0) {
    if (IsListPack(*pv_)) {
      PopListPack(sc);
    } else {
      CHECK_EQ(pv_->Encoding(), OBJ_ENCODING_SKIPLIST);
      PopSkipList(sc);
    }
  }
}

void IntervalVisitor::ExtractListPack(const zrangespec& range) {
  uint8_t* zl = (uint8_t*)pv_->RObjPtr();
  uint8_t *eptr, *sptr;
  uint8_t* vstr;
  unsigned int vlen = 0;
  long long vlong = 0;
  unsigned offset = params_.offset;
  unsigned limit = params_.limit;

  /* If reversed, get the last node in range as starting point. */
  if (params_.reverse) {
    eptr = detail::ZzlLastInRange(zl, &range);
  } else {
    eptr = detail::ZzlFirstInRange(zl, &range);
  }

  /* Get score pointer for the first element. */
  if (eptr)
    sptr = lpNext(zl, eptr);

  /* If there is an offset, just traverse the number of elements without
   * checking the score because that is done in the next loop. */
  while (eptr && offset--) {
    Next(zl, &eptr, &sptr);
  }

  while (eptr && limit--) {
    double score = detail::ZzlGetScore(sptr);

    /* Abort when the node is no longer in range. */
    if (!IsUnder(score, range))
      break;

    /* We know the element exists, so lpGetValue should always
     * succeed */
    vstr = lpGetValue(eptr, &vlen, &vlong);

    AddResult(vstr, vlen, vlong, score);

    /* Move to next node */
    Next(zl, &eptr, &sptr);
  }
}

void IntervalVisitor::ExtractSkipList(const zrangespec& range) {
  detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();

  unsigned offset = params_.offset;
  unsigned limit = params_.limit;

  result_ = zs->GetRange(range, offset, limit, params_.reverse);
}

void IntervalVisitor::ExtractListPack(const zlexrangespec& range) {
  uint8_t* zl = (uint8_t*)pv_->RObjPtr();
  uint8_t *eptr, *sptr = nullptr;
  uint8_t* vstr = nullptr;
  unsigned int vlen = 0;
  long long vlong = 0;
  unsigned offset = params_.offset;
  unsigned limit = params_.limit;

  /* If reversed, get the last node in range as starting point. */
  if (params_.reverse) {
    eptr = detail::ZzlLastInLexRange(zl, &range);
  } else {
    eptr = detail::ZzlFirstInLexRange(zl, &range);
  }

  /* Get score pointer for the first element. */
  if (eptr)
    sptr = lpNext(zl, eptr);

  /* If there is an offset, just traverse the number of elements without
   * checking the score because that is done in the next loop. */
  while (eptr && offset--) {
    Next(zl, &eptr, &sptr);
  }

  while (eptr && limit--) {
    double score = 0;
    if (params_.with_scores) /* don't bother to extract the score if it's gonna be ignored. */
      score = detail::ZzlGetScore(sptr);

    /* Abort when the node is no longer in range. */
    if (params_.reverse) {
      if (!detail::ZzlLexValueGteMin(eptr, &range))
        break;
    } else {
      if (!detail::ZzlLexValueLteMax(eptr, &range))
        break;
    }

    vstr = lpGetValue(eptr, &vlen, &vlong);
    AddResult(vstr, vlen, vlong, score);

    /* Move to next node */
    Next(zl, &eptr, &sptr);
  }
}

void IntervalVisitor::ExtractSkipList(const zlexrangespec& range) {
  detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();
  unsigned offset = params_.offset;
  unsigned limit = params_.limit;
  result_ = zs->GetLexRange(range, offset, limit, params_.reverse);
}

void IntervalVisitor::PopListPack(ZSetFamily::TopNScored sc) {
  uint8_t* zl = (uint8_t*)pv_->RObjPtr();
  uint8_t *eptr, *sptr;
  uint8_t* vstr;
  unsigned int vlen = 0;
  long long vlong = 0;

  if (params_.reverse) {
    eptr = lpSeek(zl, -2);
  } else {
    eptr = lpSeek(zl, 0);
  }

  /* Get score pointer for the first element. */
  if (eptr)
    sptr = lpNext(zl, eptr);

  /* First we get the entries */
  unsigned int num = sc;
  while (eptr && num--) {
    double score = detail::ZzlGetScore(sptr);
    vstr = lpGetValue(eptr, &vlen, &vlong);
    AddResult(vstr, vlen, vlong, score);

    /* Move to next node */
    Next(zl, &eptr, &sptr);
  }

  int start = 0;
  if (params_.reverse) {
    /* If the number of elements to delete is greater than the listpack length,
     * we set the start to 0 because lpseek fails to search beyond length in reverse */
    start = (2 * sc > lpLength(zl)) ? 0 : -2 * sc;
  }

  /* We can finally delete the elements */
  pv_->SetRObjPtr(lpDeleteRange(zl, start, 2 * sc));
}

void IntervalVisitor::PopSkipList(ZSetFamily::TopNScored sc) {
  detail::SortedMap* zs = (detail::SortedMap*)pv_->RObjPtr();

  /* We start from the header, or the tail if reversed. */
  result_ = zs->PopTopScores(sc, params_.reverse);
}

void IntervalVisitor::AddResult(const uint8_t* vstr, unsigned vlen, long long vlong, double score) {
  if (vstr == NULL) {
    result_.emplace_back(absl::StrCat(vlong), score);
  } else {
    result_.emplace_back(string{reinterpret_cast<const char*>(vstr), vlen}, score);
  }
}

bool ParseBound(string_view src, ZSetFamily::Bound* bound) {
  if (src.empty())
    return false;

  if (src[0] == '(') {
    bound->is_open = true;
    src.remove_prefix(1);
  }

  return ParseDouble(src, &bound->val);
}

bool ParseLexBound(string_view src, ZSetFamily::LexBound* bound) {
  if (src.empty())
    return false;

  if (src == "+") {
    bound->type = ZSetFamily::LexBound::PLUS_INF;
  } else if (src == "-") {
    bound->type = ZSetFamily::LexBound::MINUS_INF;
  } else if (src[0] == '(') {
    bound->type = ZSetFamily::LexBound::OPEN;
    src.remove_prefix(1);
    bound->val = src;
  } else if (src[0] == '[') {
    bound->type = ZSetFamily::LexBound::CLOSED;
    src.remove_prefix(1);
    bound->val = src;
  } else {
    return false;
  }

  return true;
}

enum class AggType : uint8_t { SUM, MIN, MAX, NOOP };
using ScoredMap = absl::flat_hash_map<std::string, double>;

ScoredMap FromObject(const PrimeValue& co, double weight) {
  ZSetFamily::RangeParams params;
  params.with_scores = true;
  // RANGE is a read-only operation, but requires const_cast
  IntervalVisitor vis(Action::RANGE, params, &const_cast<PrimeValue&>(co));
  vis(ZSetFamily::IndexInterval(0, -1));

  ScoredArray arr = vis.PopResult();
  ScoredMap res;
  res.reserve(arr.size());

  for (auto& elem : arr) {
    elem.second *= weight;
    if (isnan(elem.second))
      elem.second = 0;
    res.emplace(std::move(elem));
  }

  return res;
}

ScoredMap ScoreMapFromSet(const PrimeValue& pv, double weight) {
  ScoredMap result;
  container_utils::IterateSet(pv, [&result, weight](container_utils::ContainerEntry ce) {
    result.emplace(ce.ToString(), weight);
    return true;
  });
  return result;
}

double Aggregate(double v1, double v2, AggType atype) {
  switch (atype) {
    case AggType::SUM:
      v1 += v2;
      return isnan(v1) ? 0 : v1;
    case AggType::MAX:
      return max(v1, v2);
    case AggType::MIN:
      return min(v1, v2);
    case AggType::NOOP:
      return 0;
  }
  return 0;
}

// the result is in the destination.
void UnionScoredMap(ScoredMap* dest, ScoredMap* src, AggType agg_type) {
  ScoredMap* target = dest;
  ScoredMap* iter = src;

  if (iter->size() > target->size())
    swap(target, iter);

  for (const auto& elem : *iter) {
    auto [it, inserted] = target->emplace(elem);
    if (!inserted) {
      it->second = Aggregate(it->second, elem.second, agg_type);
    }
  }

  if (target != dest)
    dest->swap(*src);
}

void InterScoredMap(ScoredMap* dest, ScoredMap* src, AggType agg_type) {
  ScoredMap* target = dest;
  ScoredMap* iter = src;

  if (iter->size() > target->size())
    swap(target, iter);

  auto it = iter->begin();
  while (it != iter->end()) {
    auto inter_it = target->find(it->first);
    if (inter_it == target->end()) {
      auto copy_it = it++;
      iter->erase(copy_it);
    } else {
      it->second = Aggregate(it->second, inter_it->second, agg_type);
      ++it;
    }
  }

  if (iter != dest)
    dest->swap(*src);
}

using KeyIterWeightVec = vector<pair<DbSlice::ConstIterator, double>>;

ScoredMap UnionShardKeysWithScore(const KeyIterWeightVec& key_iter_weight_vec, AggType agg_type) {
  ScoredMap result;
  for (const auto& [it, weight] : key_iter_weight_vec) {
    if (it.is_done()) {
      continue;
    }

    ScoredMap sm;
    if (it->second.ObjType() == OBJ_ZSET)
      sm = FromObject(it->second, weight);
    else {
      DCHECK_EQ(it->second.ObjType(), OBJ_SET);
      sm = ScoreMapFromSet(it->second, weight);
    }
    if (result.empty()) {
      result.swap(sm);
    } else {
      UnionScoredMap(&result, &sm, agg_type);
    }
  }
  return result;
}

double GetKeyWeight(const vector<double>& weights, unsigned windex) {
  if (weights.empty()) {
    return 1;
  }

  DCHECK_LT(windex, weights.size());
  return weights[windex];
}

OpResult<KeyIterWeightVec> PrepareWeightedSets(const Transaction& trans, bool store,
                                               string_view dest, const vector<double>& weights,
                                               EngineShard* shard) {
  ShardArgs keys = trans.GetShardArgs(shard->shard_id());
  DCHECK(!keys.Empty());

  unsigned cmdargs_keys_offset = 1;  // after {numkeys} for ZUNION/ZINTER
  unsigned removed_keys = 0;

  ShardArgs::Iterator start = keys.begin(), end = keys.end();

  if (store) {
    // first global index is 2 after {destkey, numkeys}.
    ++cmdargs_keys_offset;
    if (*start == dest) {
      ++start;
      ++removed_keys;
    }

    // In case ONLY the destination key is hosted in this shard no work on this shard should be
    // done in this step
    if (start == end) {
      return OpStatus::OK;
    }
  }

  auto& db_slice = trans.GetDbSlice(shard->shard_id());
  KeyIterWeightVec key_weight_vec(keys.Size() - removed_keys);
  unsigned index = 0;
  DCHECK_GE(start.index(), cmdargs_keys_offset);

  for (; start != end; ++start) {
    auto it_res = db_slice.FindReadOnly(trans.GetDbContext(), *start);

    if (!IsValid(it_res.it)) {
      ++index;
      continue;
    }

    auto obj_type = it_res.it->second.ObjType();
    if (obj_type != OBJ_ZSET && obj_type != OBJ_SET)
      return OpStatus::WRONG_TYPE;

    key_weight_vec[index] = {it_res.it, GetKeyWeight(weights, start.index() - cmdargs_keys_offset)};
    ++index;
  }

  return key_weight_vec;
}

OpResult<ScoredMap> OpUnion(EngineShard* shard, Transaction* t, string_view dest, AggType agg_type,
                            const vector<double>& weights, bool store) {
  OpResult<KeyIterWeightVec> key_vec_res = PrepareWeightedSets(*t, store, dest, weights, shard);
  if (!key_vec_res)
    return key_vec_res.status();

  // Only dest is hosted on this shard.
  if (key_vec_res->empty())
    return OpStatus::OK;

  return UnionShardKeysWithScore(*key_vec_res, agg_type);
}

OpResult<ScoredMap> OpInter(EngineShard* shard, Transaction* t, string_view dest, AggType agg_type,
                            const vector<double>& weights, bool store) {
  OpResult<KeyIterWeightVec> key_vec_res = PrepareWeightedSets(*t, store, dest, weights, shard);
  if (!key_vec_res)
    return key_vec_res.status();

  // Only dest is hosted on this shard.
  if (key_vec_res->empty())
    return OpStatus::SKIPPED;

  ScoredMap result;
  for (const auto& [it, weight] : *key_vec_res) {
    if (it.is_done()) {
      return ScoredMap{};
    }

    ScoredMap sm;
    if (it->second.ObjType() == OBJ_ZSET)
      sm = FromObject(it->second, weight);
    else {
      DCHECK_EQ(it->second.ObjType(), OBJ_SET);
      sm = ScoreMapFromSet(it->second, weight);
    }
    if (result.empty())
      result.swap(sm);
    else
      InterScoredMap(&result, &sm, agg_type);

    if (result.empty())
      return result;
  }

  return result;
}

size_t EstimateListpackMinBytes(ScoredMemberSpan members) {
  size_t bytes = members.size() * 2;  // at least 2 bytes per score;
  for (const auto& member : members) {
    bytes += (member.second.size() + 1);  // string + at least 1 byte for string header.
  }
  return bytes;
}

struct SetOpArgs {
  AggType agg_type = AggType::SUM;
  unsigned num_keys;
  vector<double> weights;
  bool with_scores = false;
};

OpResult<ScoredMap> IntersectResults(vector<OpResult<ScoredMap>>& results, AggType agg_type) {
  ScoredMap result;
  for (auto& op_res : results) {
    if (op_res.status() == OpStatus::SKIPPED)
      continue;

    if (!op_res) {
      return op_res.status();
    }

    if (op_res->empty()) {
      return ScoredMap{};
    }

    if (result.empty()) {
      result.swap(op_res.value());
    } else {
      InterScoredMap(&result, &op_res.value(), agg_type);
    }

    if (result.empty())
      break;
  }
  return result;
}

OpResult<void> FillAggType(string_view agg, SetOpArgs* op_args) {
  if (agg == "SUM") {
    op_args->agg_type = AggType::SUM;
  } else if (agg == "MIN") {
    op_args->agg_type = AggType::MIN;
  } else if (agg == "MAX") {
    op_args->agg_type = AggType::MAX;
  } else {
    return OpStatus::SYNTAX_ERR;
  }
  return OpStatus::OK;
}

// Parse functions return the number of arguments read from CmdArgList
OpResult<unsigned> ParseAggregate(CmdArgList args, bool store, SetOpArgs* op_args) {
  if (args.size() <= 1) {
    return OpStatus::SYNTAX_ERR;
  }

  string agg_type = absl::AsciiStrToUpper(ArgS(args, 1));
  auto filled = FillAggType(agg_type, op_args);
  if (!filled) {
    return filled.status();
  }
  return 1;
}

OpResult<unsigned> ParseWeights(CmdArgList args, SetOpArgs* op_args) {
  if (args.size() <= op_args->num_keys) {
    return OpStatus::SYNTAX_ERR;
  }

  op_args->weights.resize(op_args->num_keys, 1);
  for (unsigned i = 0; i < op_args->num_keys; ++i) {
    string_view weight = ArgS(args, i + 1);
    if (!absl::SimpleAtod(weight, &op_args->weights[i])) {
      return OpStatus::INVALID_FLOAT;
    }
  }

  return op_args->num_keys;
}

OpResult<void> ParseKeyCount(string_view arg_num_keys, SetOpArgs* op_args) {
  // we parsed the structure before, when transaction has been initialized.
  if (!absl::SimpleAtoi(arg_num_keys, &op_args->num_keys)) {
    return OpStatus::SYNTAX_ERR;
  }
  return OpStatus::OK;
}

OpResult<unsigned> ParseWithScores(CmdArgList args, SetOpArgs* op_args) {
  op_args->with_scores = true;
  return 0;
}

OpResult<SetOpArgs> ParseSetOpArgs(CmdArgList args, bool store) {
  string_view num_keys_str = store ? ArgS(args, 1) : ArgS(args, 0);
  SetOpArgs op_args;

  auto parsed = ParseKeyCount(num_keys_str, &op_args);
  if (!parsed) {
    return parsed.status();
  }

  unsigned opt_args_start = op_args.num_keys + (store ? 2 : 1);
  DCHECK_LE(opt_args_start, args.size());  // Checked inside DetermineKeys

  for (size_t i = opt_args_start; i < args.size(); ++i) {
    string arg = absl::AsciiStrToUpper(ArgS(args, i));
    if (arg == "WEIGHTS") {
      auto parsed_cnt = ParseWeights(args.subspan(i), &op_args);
      if (!parsed_cnt) {
        return parsed_cnt.status();
      }
      i += *parsed_cnt;
    } else if (arg == "AGGREGATE") {
      auto parsed_cnt = ParseAggregate(args.subspan(i), store, &op_args);
      if (!parsed_cnt) {
        return parsed_cnt.status();
      }
      i += *parsed_cnt;
    } else if (arg == "WITHSCORES") {
      // Commands with store capability does not offer WITHSCORES option
      if (store) {
        return OpStatus::SYNTAX_ERR;
      }
      auto parsed_cnt = ParseWithScores(args.subspan(i), &op_args);
      if (!parsed_cnt) {
        return parsed_cnt.status();
      }
      i += *parsed_cnt;
    } else {
      return OpStatus::SYNTAX_ERR;
    }
  }
  return op_args;
}

ScoredArray OpBZPop(Transaction* t, EngineShard* shard, std::string_view key, bool is_max) {
  auto& db_slice = t->GetDbSlice(shard->shard_id());
  auto it_res = db_slice.FindMutable(t->GetDbContext(), key, OBJ_ZSET);
  CHECK(it_res) << t->DebugId() << " " << key;  // must exist and must be ok.
  auto it = it_res->it;

  ZSetFamily::RangeParams range_params;
  range_params.reverse = is_max;
  range_params.with_scores = true;
  ZSetFamily::ZRangeSpec range_spec;
  range_spec.params = range_params;
  range_spec.interval = ZSetFamily::TopNScored(1);

  DVLOG(2) << "popping from " << key << " " << t->DebugId();

  PrimeValue& pv = it->second;
  CHECK_GT(pv.Size(), 0u) << key << " " << pv.Encoding();

  IntervalVisitor iv{Action::POP, range_spec.params, &pv};
  std::visit(iv, range_spec.interval);

  it_res->post_updater.Run();

  auto res = iv.PopResult();

  // We don't store empty keys
  CHECK(!res.empty()) << key << " failed to pop from type " << pv.Encoding() << " now size is "
                      << pv.Size();

  auto zlen = pv.Size();
  if (zlen == 0) {
    DVLOG(1) << "deleting key " << key << " " << t->DebugId();
    db_slice.Del(t->GetDbContext(), it_res->it);
  }

  OpArgs op_args = t->GetOpArgs(shard);
  if (op_args.shard->journal()) {
    string command = is_max ? "ZPOPMAX" : "ZPOPMIN";
    RecordJournal(op_args, command, ArgSlice{key}, 1);
  }

  return res;
}

void BZPopMinMax(CmdArgList args, bool is_max, CommandContext* cmd_cntx) {
  DCHECK_GE(args.size(), 2u);

  float timeout;
  auto timeout_str = ArgS(args, args.size() - 1);
  if (!absl::SimpleAtof(timeout_str, &timeout)) {
    return cmd_cntx->SendError("timeout is not a float or out of range");
  }
  if (timeout < 0) {
    return cmd_cntx->SendError("timeout is negative");
  }
  VLOG(1) << "BZPop timeout(" << timeout << ")";

  optional<std::string> callback_ran_key;
  OpResult<ScoredArray> popped_array;
  auto cb = [is_max, &popped_array, &callback_ran_key](Transaction* t, EngineShard* shard,
                                                       std::string_view key) {
    callback_ran_key = key;
    popped_array = OpBZPop(t, shard, key, is_max);
  };

  auto* cntx = cmd_cntx->server_conn_cntx();
  OpResult<string> popped_key = container_utils::RunCbOnFirstNonEmptyBlocking(
      cmd_cntx->tx(), OBJ_ZSET, std::move(cb), unsigned(timeout * 1000), &cntx->blocked,
      &cntx->paused);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (popped_key) {
    if (!callback_ran_key) {
      return rb->SendNullArray();
    }

    CHECK_EQ(popped_array->size(), 1u) << popped_key << " ran " << *callback_ran_key;
    rb->StartArray(3);
    rb->SendBulkString(*popped_key);
    rb->SendBulkString(popped_array->front().first);
    return rb->SendDouble(popped_array->front().second);
  }

  DVLOG(1) << "result for " << cmd_cntx->tx()->DebugId() << " is " << popped_key.status();
  switch (popped_key.status()) {
    case OpStatus::WRONG_TYPE:
      return cmd_cntx->SendError(kWrongTypeErr);
    case OpStatus::CANCELLED:
    case OpStatus::TIMED_OUT:
      return rb->SendNullArray();
    case OpStatus::KEY_MOVED: {
      auto error = cluster::SlotOwnershipError(*cmd_cntx->tx()->GetUniqueSlotId());
      CHECK(!error.status.has_value() || error.status.value() != facade::OpStatus::OK);
      return cmd_cntx->SendError(error);
    }
    default:
      LOG(ERROR) << "Unexpected error " << popped_key.status();
  }
  return rb->SendNullArray();
}

OpResult<vector<ScoredMap>> OpFetch(EngineShard* shard, Transaction* t, bool skip_dest_key) {
  ShardArgs keys = t->GetShardArgs(shard->shard_id());
  DCHECK(!keys.Empty());

  ShardArgs::Iterator start = keys.begin(), end = keys.end();

  if (skip_dest_key) {
    // If destkey is only found on this shard we can return
    if (++start == end)
      return OpStatus::OK;
  }

  vector<ScoredMap> results;
  results.reserve(keys.Size() - (skip_dest_key ? 1 : 0));

  auto& db_slice = t->GetDbSlice(shard->shard_id());
  for (; start != end; ++start) {
    auto it = db_slice.FindReadOnly(t->GetDbContext(), *start, OBJ_ZSET);

    if (!it) {
      // Key has wrong type so return so we can report error back
      if (it.status() == OpStatus::WRONG_TYPE) {
        return OpStatus::WRONG_TYPE;
      }
      // Key is not found so treat it as empty set
      results.push_back({});
      continue;
    }

    ScoredMap sm = FromObject((*it)->second, 1);
    results.push_back(std::move(sm));
  }

  return results;
}

auto OpPopCount(const ZSetFamily::ZRangeSpec& range_spec, const OpArgs& op_args, string_view key)
    -> OpResult<ScoredArray> {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  PrimeValue& pv = res_it->it->second;

  IntervalVisitor iv{Action::POP, range_spec.params, &pv};
  std::visit(iv, range_spec.interval);

  res_it->post_updater.Run();

  auto zlen = pv.Size();
  if (zlen == 0) {
    op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it);
  }

  // Checking if command conatins flag with no autojournal
  // and we are assuming auto journaling is not re-enabled.
  if ((op_args.tx->GetCId()->opt_mask() & CO::NO_AUTOJOURNAL) && op_args.shard->journal()) {
    auto reverse = range_spec.params.reverse;
    // Checking if interval is actually TopNScored or something else before proceeding.
    DCHECK(std::holds_alternative<ZSetFamily::TopNScored>(range_spec.interval));
    auto count = std::get<ZSetFamily::TopNScored>(range_spec.interval);
    string command = (reverse ? "ZPOPMAX" : "ZPOPMIN");
    RecordJournal(op_args, command, ArgSlice{key, absl::StrCat(count)}, 1);
  }

  return iv.PopResult();
}

auto OpRange(const ZSetFamily::ZRangeSpec& range_spec, const OpArgs& op_args, string_view key)
    -> OpResult<ScoredArray> {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  // Action::RANGE is read-only, but requires mutable pointer, thus const_cast
  PrimeValue& pv = const_cast<PrimeValue&>(res_it.value()->second);
  IntervalVisitor iv{Action::RANGE, range_spec.params, &pv};

  std::visit(iv, range_spec.interval);

  return iv.PopResult();
}

OpResult<unsigned> OpRemRange(const OpArgs& op_args, string_view key,
                              const ZSetFamily::ZRangeSpec& range_spec) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  PrimeValue& pv = res_it->it->second;
  IntervalVisitor iv{Action::REMOVE, range_spec.params, &pv};
  std::visit(iv, range_spec.interval);

  res_it->post_updater.Run();

  auto zlen = pv.Size();
  if (zlen == 0) {
    op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it);
  }

  return iv.removed();
}

struct RankResult {
  unsigned rank;
  double score = 0;
};

OpResult<RankResult> OpRank(const OpArgs& op_args, string_view key, string_view member,
                            bool reverse, bool with_score) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  auto& pv = res_it.value()->second;
  if (IsListPack(pv)) {
    unsigned char* zl = (uint8_t*)pv.RObjPtr();
    unsigned char *eptr, *sptr;

    eptr = lpSeek(zl, 0);
    DCHECK(eptr != NULL);
    sptr = lpNext(zl, eptr);
    DCHECK(sptr != NULL);

    unsigned rank = 1;
    if (member.empty())
      member = ""sv;

    while (eptr != NULL) {
      if (lpCompare(eptr, (const uint8_t*)member.data(), member.size()))
        break;
      rank++;
      detail::ZzlNext(zl, &eptr, &sptr);
    }

    if (eptr == NULL)
      return OpStatus::KEY_NOTFOUND;

    RankResult res{};
    res.rank = reverse ? lpLength(zl) / 2 - rank : rank - 1;
    if (with_score) {
      res.score = detail::ZzlGetScore(sptr);
    }
    return res;
  }
  DCHECK_EQ(pv.Encoding(), OBJ_ENCODING_SKIPLIST);
  detail::SortedMap* ss = (detail::SortedMap*)pv.RObjPtr();

  RankResult res{};

  if (with_score) {
    auto rankAndScore = ss->GetRankAndScore(member, reverse);
    if (!rankAndScore) {
      return OpStatus::KEY_NOTFOUND;
    }
    res.rank = rankAndScore->first;
    res.score = rankAndScore->second;
  } else {
    std::optional<unsigned> rank = ss->GetRank(member, reverse);
    if (!rank) {
      return OpStatus::KEY_NOTFOUND;
    }
    res.rank = *rank;
  }

  return res;
}

OpResult<unsigned> OpCount(const OpArgs& op_args, std::string_view key,
                           const ZSetFamily::ScoreInterval& interval) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  auto& pv = res_it.value()->second;
  zrangespec range = GetZrangeSpec(false, interval);
  unsigned count = 0;

  if (range.min > range.max) {
    return 0;
  }

  if (IsListPack(pv)) {
    uint8_t* zl = (uint8_t*)pv.RObjPtr();
    uint8_t *eptr, *sptr;
    double score;

    /* Use the first element in range as the starting point */
    eptr = detail::ZzlFirstInRange(zl, &range);

    /* No "first" element */
    if (eptr == NULL) {
      return 0;
    }

    /* First element is in range */
    sptr = lpNext(zl, eptr);
    score = detail::ZzlGetScore(sptr);

    DCHECK(detail::ZslValueLteMax(score, &range));

    /* Iterate over elements in range */
    while (eptr) {
      score = detail::ZzlGetScore(sptr);

      /* Abort when the node is no longer in range. */
      if (!detail::ZslValueLteMax(score, &range)) {
        break;
      } else {
        count++;
        detail::ZzlNext(zl, &eptr, &sptr);
      }
    }
  } else {
    CHECK_EQ(unsigned(OBJ_ENCODING_SKIPLIST), pv.Encoding());
    detail::SortedMap* zs = (detail::SortedMap*)pv.RObjPtr();
    count = zs->Count(range);
  }

  return count;
}

OpResult<unsigned> OpLexCount(const OpArgs& op_args, string_view key,
                              const ZSetFamily::LexInterval& interval) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  zlexrangespec range = GetLexRange(false, interval);
  unsigned count = 0;

  auto& pv = res_it.value()->second;
  if (IsListPack(pv)) {
    uint8_t* zl = (uint8_t*)pv.RObjPtr();
    uint8_t *eptr, *sptr;

    /* Use the first element in range as the starting point */
    eptr = detail::ZzlFirstInLexRange(zl, &range);

    if (eptr) {
      /* First element is in range */
      sptr = lpNext(zl, eptr);
      DCHECK(detail::ZzlLexValueLteMax(eptr, &range));

      /* Iterate over elements in range */
      while (eptr) {
        /* Abort when the node is no longer in range. */
        if (!detail::ZzlLexValueLteMax(eptr, &range)) {
          break;
        } else {
          count++;
          detail::ZzlNext(zl, &eptr, &sptr);
        }
      }
    }
  } else {
    DCHECK_EQ(OBJ_ENCODING_SKIPLIST, pv.Encoding());
    detail::SortedMap* zs = (detail::SortedMap*)pv.RObjPtr();
    count = zs->LexCount(range);
  }

  detail::ZslFreeLexRange(&range);
  return count;
}

OpResult<unsigned> OpRem(const OpArgs& op_args, string_view key, const facade::ArgRange& members) {
  auto& db_slice = op_args.GetDbSlice();
  auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  auto& pv = res_it->it->second;
  unsigned deleted = 0;
  for (string_view member : members)
    deleted += ZsetDel(&pv, member);

  auto zlen = pv.Size();
  res_it->post_updater.Run();

  if (zlen == 0) {
    op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it);
  }

  return deleted;
}

OpResult<MScoreResponse> OpMScore(const OpArgs& op_args, string_view key,
                                  const facade::ArgRange& members) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);

  if (res_it.status() == OpStatus::KEY_NOTFOUND) {
    // If the key doesn't exist return an array of NIL values
    MScoreResponse result(members.Size(), std::nullopt);
    return result;
  }

  if (!res_it)
    return res_it.status();

  MScoreResponse scores(members.Size());

  auto& pv = res_it.value()->second;
  size_t i = 0;
  for (string_view member : members.Range())
    scores[i++] = GetZsetScore(pv, member);

  return scores;
}

OpResult<StringVec> OpScan(const OpArgs& op_args, std::string_view key, uint64_t* cursor,
                           const ScanOpts& scan_op) {
  auto find_res = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);

  if (!find_res) {
    *cursor = 0;
    return find_res.status();
  }

  const PrimeValue& pv = (*find_res)->second;
  StringVec res;
  char buf[128];

  if (IsListPack(pv)) {
    ZSetFamily::RangeParams params;
    params.with_scores = true;
    IntervalVisitor iv{Action::RANGE, params, const_cast<PrimeValue*>(&pv)};

    iv(ZSetFamily::IndexInterval{0, kuint32max});
    ScoredArray arr = iv.PopResult();

    for (size_t i = 0; i < arr.size(); ++i) {
      if (!scan_op.Matches(arr[i].first)) {
        continue;
      }
      res.emplace_back(std::move(arr[i].first));
      char* str = RedisReplyBuilder::FormatDouble(arr[i].second, buf, sizeof(buf));
      res.emplace_back(str);
    }
    *cursor = 0;
  } else {
    CHECK_EQ(unsigned(OBJ_ENCODING_SKIPLIST), pv.Encoding());
    uint32_t count = scan_op.limit;
    detail::SortedMap* sm = (detail::SortedMap*)pv.RObjPtr();
    long maxiterations = count * 10;
    uint64_t cur = *cursor;

    auto cb = [&](string_view str, double score) {
      if (scan_op.Matches(str)) {
        res.emplace_back(str);
        char* str = RedisReplyBuilder::FormatDouble(score, buf, sizeof(buf));
        res.emplace_back(str);
      }
    };
    do {
      cur = sm->Scan(cur, cb);
    } while (cur && maxiterations-- && res.size() < count);
    *cursor = cur;
  }

  return res;
}

OpResult<ScoredArray> OpRandMember(int count, const ZSetFamily::RangeParams& params,
                                   const OpArgs& op_args, string_view key) {
  auto it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!it)
    return it.status();

  // Action::RANGE is a read-only operation, but requires const_cast
  PrimeValue& pv = const_cast<PrimeValue&>(it.value()->second);

  const std::size_t size = pv.Size();
  const std::size_t picks_count =
      count >= 0 ? std::min(static_cast<std::size_t>(count), size) : std::abs(count);

  ScoredArray result{picks_count};
  std::unique_ptr<PicksGenerator> generator =
      count >= 0 ? static_cast<std::unique_ptr<PicksGenerator>>(
                       std::make_unique<UniquePicksGenerator>(picks_count, size))
                 : std::make_unique<NonUniquePicksGenerator>(size);

  if (picks_count * static_cast<std::uint64_t>(std::log2(size)) < size) {
    for (std::size_t i = 0; i < picks_count; i++) {
      const std::size_t picked_index = generator->Generate();

      IntervalVisitor iv{Action::RANGE, params, &pv};
      iv(ZSetFamily::IndexInterval{picked_index, picked_index});

      result[i] = iv.PopResult().front();
    }
  } else {
    IntervalVisitor iv{Action::RANGE, params, &pv};
    iv(ZSetFamily::IndexInterval{0, -1});

    ScoredArray all_elements = iv.PopResult();

    for (std::size_t i = 0; i < picks_count; i++) {
      result[i] = all_elements[generator->Generate()];
    }
  }

  return result;
}

// Boolean operation: union or intersection, optionally storing output to destination key
void ZBooleanOperation(CmdArgList args, string_view cmd, bool is_union, bool store,
                       CommandContext* cmd_cntx) {
  auto shard_func = is_union ? OpUnion : OpInter;
  auto merge_func = is_union ? UnionScoredMap : InterScoredMap;

  string_view dest_key = ArgS(args, 0);
  OpResult<SetOpArgs> op_args = ParseSetOpArgs(args, store);
  if (!op_args) {
    switch (op_args.status()) {
      case OpStatus::INVALID_FLOAT:
        return cmd_cntx->SendError("weight value is not a float", kSyntaxErrType);
      default:
        return cmd_cntx->SendError(op_args.status());
    }
  }
  if (op_args->num_keys == 0) {
    return cmd_cntx->SendError(absl::StrCat("at least 1 input key is needed for ", cmd));
  }
  Transaction* tx = cmd_cntx->tx();
  vector<OpResult<ScoredMap>> maps(shard_set->size(), OpStatus::SKIPPED);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    maps[shard->shard_id()] =
        shard_func(shard, t, dest_key, op_args->agg_type, op_args->weights, store);
    return OpStatus::OK;
  };
  tx->Execute(cb, !store /* if we don't store, conclude */);

  // Merge results from all shards
  ScoredMap result;
  for (auto& op_res : maps) {
    if (op_res.status() == OpStatus::SKIPPED)
      continue;
    if (!op_res) {
      if (store) {
        tx->Conclude();
      }
      return cmd_cntx->SendError(op_res.status());
    }

    if (result.empty())
      result = std::move(op_res.value());
    else
      merge_func(&result, &op_res.value(), op_args->agg_type);

    if (result.empty() && !is_union)  // intersection only shrinks
      break;
  }

  // Copy to vector for sorting
  vector<ScoredMemberView> smvec(result.size());
  size_t i = 0;
  for (const auto& [str, score] : result)
    smvec[i++] = {score, str};

  SinkReplyBuilder* builder = cmd_cntx->rb();
  if (store) {
    // TODO: Use variant collection to avoid smvec copy for store operation
    auto store_cb = [&, dest_shard = Shard(dest_key, maps.size())](Transaction* t,
                                                                   EngineShard* shard) {
      if (shard->shard_id() == dest_shard)
        ZSetFamily::OpAdd(t->GetOpArgs(shard),
                          ZSetFamily::ZParams{.override = true, .journal_update = true}, dest_key,
                          smvec);
      return OpStatus::OK;
    };
    tx->Execute(store_cb, true);
    builder->SendLong(smvec.size());
  } else {
    std::sort(std::begin(smvec), std::end(smvec));

    // We can't use SendScoredArray because it expects strings, not string_views
    // TOOD: Not longer relevant with new io, use scoping
    auto* rb = static_cast<RedisReplyBuilder*>(builder);
    rb->StartArray(smvec.size() * (op_args->with_scores ? 2 : 1));
    for (const auto& elem : smvec) {
      rb->SendBulkString(elem.second);
      if (op_args->with_scores) {
        rb->SendDouble(elem.first);
      }
    }
  }
}

enum class FilterShards : uint8_t { NO = 0, YES = 1 };

OpResult<ScoredArray> ZPopMinMaxInternal(std::string_view key, FilterShards should_filter_shards,
                                         uint32 count, bool reverse, Transaction* tx) {
  ZSetFamily::RangeParams range_params;
  range_params.reverse = reverse;
  range_params.with_scores = true;
  ZSetFamily::ZRangeSpec range_spec;
  range_spec.params = range_params;

  range_spec.interval = count;

  OpResult<ScoredArray> result;

  std::optional<ShardId> key_shard;
  if (should_filter_shards == FilterShards::YES) {
    key_shard = Shard(key, shard_set->size());
  }
  auto cb = [&](Transaction* t, EngineShard* shard) {
    if (!key_shard.has_value() || *key_shard == shard->shard_id()) {
      result = OpPopCount(range_spec, t->GetOpArgs(shard), key);
    }
    return OpStatus::OK;
  };

  tx->Execute(std::move(cb), true);

  return result;
}

void ZPopMinMaxFromArgs(CmdArgList args, bool reverse, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  uint32 count = 1;
  if (args.size() > 1) {
    string_view count_str = ArgS(args, 1);
    if (!SimpleAtoi(count_str, &count)) {
      return cmd_cntx->SendError(kUintErr);
    }
  }

  OutputScoredArrayResult(ZPopMinMaxInternal(key, FilterShards::NO, count, reverse, cmd_cntx->tx()),
                          cmd_cntx->rb());
}

void ZRangeInternal(CmdArgList args, ZSetFamily::RangeParams range_params,
                    CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::ZRangeSpec range_spec;
  range_spec.params = range_params;
  using RP = ZSetFamily::RangeParams;

  switch (range_params.interval_type) {
    case RP::IntervalType::SCORE: {
      ZSetFamily::ScoreInterval si;
      if (!ParseBound(min_s, &si.first) || !ParseBound(max_s, &si.second)) {
        return cmd_cntx->SendError(kFloatRangeErr);
      }
      range_spec.interval = si;
      break;
    }
    case RP::IntervalType::LEX: {
      ZSetFamily::LexInterval li;
      if (!ParseLexBound(min_s, &li.first) || !ParseLexBound(max_s, &li.second)) {
        return cmd_cntx->SendError(kLexRangeErr);
      }
      range_spec.interval = li;
      break;
    }
    case RP::IntervalType::RANK: {
      ZSetFamily::IndexInterval ii;
      if (!SimpleAtoi(min_s, &ii.first) || !SimpleAtoi(max_s, &ii.second)) {
        cmd_cntx->SendError(kInvalidIntErr);
        return;
      }
      range_spec.interval = ii;
      break;
    }
  }

  OpResult<ScoredArray> range_result;
  ShardId src_shard = Shard(key, shard_set->size());
  auto range_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() != src_shard) {
      // Only run ZRANGE on the source shard.
      return OpStatus::OK;
    }
    range_result = OpRange(range_spec, t->GetOpArgs(shard), key);
    return OpStatus::OK;
  };

  auto* tx = cmd_cntx->tx();
  // Don't conclude the transaction if we're storing the result.
  tx->Execute(std::move(range_cb), !range_params.store_key);

  if (range_result.status() == OpStatus::WRONG_TYPE) {
    if (range_params.store_key) {
      tx->Conclude();
    }
    return cmd_cntx->SendError(kWrongTypeErr);
  }
  LOG_IF(WARNING, !range_result && range_result.status() != OpStatus::KEY_NOTFOUND)
      << "Unexpected status " << range_result.status();

  if (!range_params.store_key) {
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    rb->SendScoredArray(range_result.value(), range_params.with_scores);
    return;
  }

  OpResult<ZSetFamily::AddResult> add_result;
  ShardId dest_shard = Shard(*range_params.store_key, shard_set->size());
  auto add_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() != dest_shard) {
      // Only write the result on the target shard.
      return OpStatus::OK;
    }

    std::vector<ScoredMemberView> mvec(range_result->size());
    size_t i = 0;
    for (const auto& [str, score] : *range_result) {
      mvec[i++] = {score, str};
    }

    add_result = ZSetFamily::OpAdd(t->GetOpArgs(shard),
                                   ZSetFamily::ZParams{.override = true, .journal_update = true},
                                   *range_params.store_key, mvec);

    return OpStatus::OK;
  };
  tx->Execute(std::move(add_cb), true);

  if (add_result.status() == OpStatus::OUT_OF_MEMORY) {
    return cmd_cntx->SendError(add_result.status());
  }
  LOG_IF(WARNING, !add_result) << "Unexpected status " << add_result.status();

  return cmd_cntx->SendLong(range_result->size());
}

void ZRangeGeneric(CmdArgList args, ZSetFamily::RangeParams range_params,
                   CommandContext* cmd_cntx) {
  facade::CmdArgParser parser{args.subspan(3)};
  using RP = ZSetFamily::RangeParams;

  while (true) {
    RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

    if (!parser.HasNext())
      break;

    if (parser.Check("BYSCORE")) {
      if (exchange(range_params.interval_type, RP::SCORE) == RP::LEX)
        return cmd_cntx->SendError("BYSCORE and BYLEX options are not compatible");
      continue;
    }

    if (parser.Check("BYLEX")) {
      if (exchange(range_params.interval_type, RP::LEX) == RP::SCORE)
        return cmd_cntx->SendError("BYSCORE and BYLEX options are not compatible");
      continue;
    }
    if (parser.Check("REV")) {
      range_params.reverse = true;
      continue;
    }
    if (parser.Check("WITHSCORES")) {
      range_params.with_scores = true;
      continue;
    }

    if (parser.Check("LIMIT")) {
      auto [offset, limit] = parser.Next<int32_t, int32_t>();

      range_params.limit = limit < 0 ? UINT32_MAX : static_cast<uint32_t>(limit);
      range_params.offset = offset < 0 ? UINT32_MAX : static_cast<uint32_t>(offset);
      continue;
    }

    return cmd_cntx->SendError(absl::StrCat("unsupported option ", parser.Peek()));
  }

  if (range_params.offset == UINT32_MAX) {
    auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
    return rb->SendEmptyArray();
  }

  ZRangeInternal(args.subspan(0, 3), range_params, cmd_cntx);
}

void ZRankGeneric(CmdArgList args, bool reverse, CommandContext* cmd_cntx) {
  // send this error exact as redis does, it checks number of arguments first
  if (args.size() > 3) {
    return cmd_cntx->SendError(WrongNumArgsError(reverse ? "ZREVRANK" : "ZRANK"));
  }

  facade::CmdArgParser parser(args);

  string_view key = parser.Next();
  string_view member = parser.Next();
  bool with_score = false;

  if (parser.HasNext()) {
    parser.ExpectTag("WITHSCORE");
    with_score = true;
  }

  if (!parser.Finalize()) {
    return cmd_cntx->SendError(parser.TakeError().MakeReply());
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRank(t->GetOpArgs(shard), key, member, reverse, with_score);
  };

  OpResult<RankResult> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  if (result) {
    if (with_score) {
      rb->StartArray(2);
      rb->SendLong(result->rank);
      rb->SendDouble(result->score);
    } else {
      rb->SendLong(result->rank);
    }
  } else if (result.status() == OpStatus::KEY_NOTFOUND) {
    rb->SendNull();
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void ZRemRangeGeneric(string_view key, const ZSetFamily::ZRangeSpec& range_spec,
                      CommandContext* cmd_cntx) {
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRemRange(t->GetOpArgs(shard), key, range_spec);
  };

  OpResult<unsigned> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    cmd_cntx->SendError(kWrongTypeErr);
  } else {
    cmd_cntx->SendLong(*result);
  }
}

// Returns the key of the first non empty set found in the list of shard arguments.
// Returns nullopt if none.
std::optional<std::string_view> GetFirstNonEmptyKeyFound(EngineShard* shard, Transaction* t) {
  ShardArgs keys = t->GetShardArgs(shard->shard_id());
  DCHECK(!keys.Empty());

  auto& db_slice = t->GetDbSlice(shard->shard_id());

  for (string_view key : keys) {
    auto it = db_slice.FindReadOnly(t->GetDbContext(), key, OBJ_ZSET);
    if (!it) {
      continue;
    }
    return std::optional<std::string_view>(key);
  }

  return std::nullopt;
}

// Validates the ZMPop and BZMPop command arguments and extracts the values to the output params.
// If the arguments are invalid sends the appropiate error to builder and returns false.
bool ValidateZMPopCommand(CmdArgList args, bool is_blocking, CommandContext* cmd_cntx,
                          ValidateZMPopResult* result) {
  CmdArgParser parser{args};

  if (is_blocking) {
    if (!absl::SimpleAtof(parser.Next(), &result->timeout)) {
      cmd_cntx->SendError("timeout is not a float or out of range");
      return false;
    }
    if (result->timeout < 0) {
      cmd_cntx->SendError("timeout is negative");
      return false;
    }
  }

  if (!SimpleAtoi(parser.Next(), &(result->num_keys))) {
    cmd_cntx->SendError(kUintErr);
    return false;
  }

  if (result->num_keys <= 0 || !parser.HasAtLeast(result->num_keys + 1)) {
    // We should have at least num_keys keys + a MIN/MAX arg.
    cmd_cntx->SendError(kSyntaxErr);
    return false;
  }
  // Skip over the keys themselves.
  parser.Skip(result->num_keys);

  // We know we have at least one more arg (we checked above).
  if (parser.Check("MAX")) {
    result->is_max = true;
  } else if (parser.Check("MIN")) {
    result->is_max = false;
  } else {
    cmd_cntx->SendError(kSyntaxErr);
    return false;
  }

  result->pop_count = 1;
  // Check if we have additional COUNT argument.
  if (parser.HasNext()) {
    if (!parser.Check("COUNT", &result->pop_count)) {
      cmd_cntx->SendError(kSyntaxErr);
      return false;
    }
  }

  if (!parser.Finalize()) {
    cmd_cntx->SendError(parser.TakeError().MakeReply());
    return false;
  }

  return true;
}

}  // namespace

void ZSetFamily::ZAddGeneric(string_view key, const ZParams& zparams, ScoredMemberSpan memb_sp,
                             CommandContext* cmd_cntx) {
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return ZSetFamily::OpAdd(t->GetOpArgs(shard), zparams, key, memb_sp);
  };

  OpResult<AddResult> add_result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (base::_in(add_result.status(), {OpStatus::WRONG_TYPE, OpStatus::OUT_OF_MEMORY})) {
    return cmd_cntx->SendError(add_result.status());
  }

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  // KEY_NOTFOUND may happen in case of XX flag.
  if (add_result.status() == OpStatus::KEY_NOTFOUND) {
    if (zparams.flags & ZADD_IN_INCR)
      rb->SendNull();
    else
      rb->SendLong(0);
  } else if (add_result.status() == OpStatus::SKIPPED) {
    rb->SendNull();
  } else if (add_result->is_nan) {
    cmd_cntx->SendError(kScoreNaN);
  } else {
    if (zparams.flags & ZADD_IN_INCR) {
      rb->SendDouble(add_result->new_score);
    } else {
      rb->SendLong(add_result->num_updated);
    }
  }
}

OpResult<MScoreResponse> ZSetFamily::ZGetMembers(CmdArgList args, Transaction* tx,
                                                 SinkReplyBuilder* builder) {
  string_view key = ArgS(args, 0);
  auto members = args.subspan(1);
  auto cb = [key, members](Transaction* t, EngineShard* shard) {
    return OpMScore(t->GetOpArgs(shard), key, members);
  };

  return tx->ScheduleSingleHopT(std::move(cb));
}

auto ZSetFamily::OpRanges(const std::vector<ZSetFamily::ZRangeSpec>& range_specs,
                          const OpArgs& op_args, string_view key) -> OpResult<vector<ScoredArray>> {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  // Action::RANGE is read-only, but requires mutable pointer, thus const_cast
  PrimeValue& pv = const_cast<PrimeValue&>(res_it.value()->second);
  vector<ScoredArray> result_arrays;
  for (auto& range_spec : range_specs) {
    IntervalVisitor iv{Action::RANGE, range_spec.params, &pv};
    std::visit(iv, range_spec.interval);
    result_arrays.push_back(iv.PopResult());
  }

  return result_arrays;
}

OpResult<ZSetFamily::AddResult> ZSetFamily::OpAdd(const OpArgs& op_args,
                                                  const ZSetFamily::ZParams& zparams,
                                                  string_view key, ScoredMemberSpan members) {
  DCHECK(!members.empty() || zparams.override);
  auto& db_slice = op_args.GetDbSlice();

  if (zparams.override && members.empty()) {
    auto res_it = db_slice.FindMutable(op_args.db_cntx, key, OBJ_ZSET);
    if (res_it && IsValid(res_it->it)) {
      db_slice.DelMutable(op_args.db_cntx, std::move(*res_it));
      if (zparams.journal_update && op_args.shard->journal()) {
        RecordJournal(op_args, "DEL"sv, ArgSlice{key});
      }
    }
    return OpStatus::OK;
  }

  // When we have too many members to add, make sure field_len is large enough to use
  // skiplist encoding.
  size_t field_len =
      members.size() > ZSET_MAX_LISTPACK_ENTRIES ? UINT32_MAX : members.front().second.size();
  auto res_it = PrepareZEntry(zparams, op_args, key, field_len);

  if (!res_it)
    return res_it.status();

  unsigned added = 0;
  unsigned updated = 0;

  double new_score = 0;
  int retflags = 0;

  OpStatus op_status = OpStatus::OK;
  AddResult aresult;
  auto& pv = res_it->it->second;
  bool is_list_pack = IsListPack(pv);

  // opportunistically reserve space if multiple entries are about to be added.
  if ((zparams.flags & ZADD_IN_XX) == 0 && members.size() > 2) {
    if (is_list_pack) {
      uint8_t* zl = (uint8_t*)pv.RObjPtr();
      size_t malloc_reserved = zmalloc_size(zl);
      size_t min_sz = EstimateListpackMinBytes(members);
      if (min_sz > malloc_reserved) {
        zl = (uint8_t*)zrealloc(zl, min_sz);
        pv.SetRObjPtr(zl);
      }
    } else {
      detail::SortedMap* sm = (detail::SortedMap*)pv.RObjPtr();
      sm->Reserve(members.size());
    }
  }

  for (size_t j = 0; j < members.size(); j++) {
    const auto& m = members[j];
    int retval = ZsetAdd(&pv, m.first, m.second, zparams.flags, &retflags, &new_score);

    if (zparams.flags & ZADD_IN_INCR) {
      if (retval == 0) {
        CHECK_EQ(1u, members.size());

        aresult.is_nan = true;
        break;
      }

      if (retflags & ZADD_OUT_NOP) {
        op_status = OpStatus::SKIPPED;
      }
    }

    if (retflags & ZADD_OUT_ADDED)
      added++;
    if (retflags & ZADD_OUT_UPDATED)
      updated++;
  }

  if (zparams.flags & ZADD_IN_INCR) {
    aresult.new_score = new_score;
  } else {
    aresult.num_updated = zparams.ch ? added + updated : added;
  }

  if (op_status != OpStatus::OK)
    return op_status;

  // TODO: consider optimization to record real command if the replica is in stable_sync state
  // and there is no slot migration process going on.
  if (zparams.journal_update && op_args.shard->journal()) {
    if (zparams.override) {
      RecordJournal(op_args, "DEL"sv, ArgSlice{key});
    }

    vector<string> scores;
    vector<string_view> mapped;
    scores.reserve(members.size());
    mapped.reserve(members.size() * 2 + 1);
    mapped.push_back(key);
    for (const auto& [score, member] : members) {
      scores.push_back(absl::StrCat(score));
      mapped.push_back(scores.back());
      mapped.push_back(member);
    }
    RecordJournal(op_args, "ZADD"sv, mapped);
  }
  return aresult;
}

OpResult<void> ZSetFamily::OpKeyExisted(const OpArgs& op_args, string_view key) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  return res_it.status();
}

OpResult<double> ZSetFamily::OpScore(const OpArgs& op_args, string_view key, string_view member) {
  auto res_it = op_args.GetDbSlice().FindReadOnly(op_args.db_cntx, key, OBJ_ZSET);
  if (!res_it)
    return res_it.status();

  const PrimeValue& pv = res_it.value()->second;
  auto res = GetZsetScore(pv, member);
  if (!res) {
    return OpStatus::MEMBER_NOTFOUND;
  }
  return *res;
}

namespace {

void CmdBZPopMin(CmdArgList args, CommandContext* cmd_cntx) {
  BZPopMinMax(args, false, cmd_cntx);
}

void CmdBZPopMax(CmdArgList args, CommandContext* cmd_cntx) {
  BZPopMinMax(args, true, cmd_cntx);
}

void CmdZAdd(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  ZSetFamily::ZParams zparams;
  size_t i = 1;
  for (; i < args.size() - 1; ++i) {
    string cur_arg = absl::AsciiStrToUpper(ArgS(args, i));

    if (cur_arg == "XX") {
      zparams.flags |= ZADD_IN_XX;  // update only
    } else if (cur_arg == "NX") {
      zparams.flags |= ZADD_IN_NX;  // add new only.
    } else if (cur_arg == "GT") {
      zparams.flags |= ZADD_IN_GT;
    } else if (cur_arg == "LT") {
      zparams.flags |= ZADD_IN_LT;
    } else if (cur_arg == "CH") {
      zparams.ch = true;
    } else if (cur_arg == "INCR") {
      zparams.flags |= ZADD_IN_INCR;
    } else {
      break;
    }
  }

  auto* builder = cmd_cntx->rb();
  if ((args.size() - i) % 2 != 0) {
    builder->SendError(kSyntaxErr);
    return;
  }

  if ((zparams.flags & ZADD_IN_INCR) && (i + 2 < args.size())) {
    builder->SendError("INCR option supports a single increment-element pair");
    return;
  }

  unsigned insert_mask = zparams.flags & (ZADD_IN_NX | ZADD_IN_XX);
  if (insert_mask == (ZADD_IN_NX | ZADD_IN_XX)) {
    builder->SendError(kNxXxErr);
    return;
  }

  constexpr auto kRangeOpt = ZADD_IN_GT | ZADD_IN_LT;
  if (((zparams.flags & ZADD_IN_NX) && (zparams.flags & kRangeOpt)) ||
      ((zparams.flags & kRangeOpt) == kRangeOpt)) {
    builder->SendError("GT, LT, and/or NX options at the same time are not compatible");
    return;
  }

  absl::flat_hash_set<string_view> members_set;
  absl::InlinedVector<ScoredMemberView, 4> members;

  unsigned num_members = (args.size() - i) / 2;

  // We sort the fields if the expected encoding could be listpack.
  bool to_sort_fields = false;

  if (num_members > 2) {
    members.reserve(num_members);

    members_set.reserve(num_members);
    to_sort_fields = true;
  }

  for (; i < args.size(); i += 2) {
    string_view cur_arg = ArgS(args, i);
    double val = 0;

    // Parse the score. Treats Nan as invalid double.
    if (!ParseDouble(cur_arg, &val)) {
      VLOG(1) << "Bad score:" << cur_arg << "|";
      return builder->SendError(kInvalidFloatErr);
    }

    string_view member = ArgS(args, i + 1);
    if (to_sort_fields) {
      auto [_, inserted] = members_set.insert(member);
      to_sort_fields &= inserted;
    }
    members.emplace_back(val, member);
  }
  DCHECK(cmd_cntx->tx());

  if (to_sort_fields) {
    if (num_members == 2) {  // fix unique_members for this special case.
      if (members[0].second == members[1].second) {
        to_sort_fields = false;
      }
    }
    if (to_sort_fields) {
      std::sort(members.begin(), members.end());
    }
  }

  absl::Span memb_sp{members.data(), members.size()};
  ZSetFamily::ZAddGeneric(key, zparams, memb_sp, cmd_cntx);
}

void CmdZCard(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult<uint32_t> {
    auto find_res = t->GetDbSlice(shard->shard_id()).FindReadOnly(t->GetDbContext(), key, OBJ_ZSET);
    if (!find_res) {
      return find_res.status();
    }

    return find_res.value()->second.Size();
  };

  OpResult<uint32_t> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    cmd_cntx->SendError(kWrongTypeErr);
    return;
  }

  cmd_cntx->SendLong(result.value());
}

void CmdZCount(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::ScoreInterval si;
  if (!ParseBound(min_s, &si.first) || !ParseBound(max_s, &si.second)) {
    return cmd_cntx->SendError(kFloatRangeErr);
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpCount(t->GetOpArgs(shard), key, si);
  };

  OpResult<unsigned> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    cmd_cntx->SendError(kWrongTypeErr);
  } else {
    cmd_cntx->SendLong(*result);
  }
}

/* Calculate difference between key set and all other sets. */
vector<ScoredMemberView> ZDiffOp(ShardId key_sid, vector<OpResult<vector<ScoredMap>>> maps,
                                 ScoredMap* result) {
  auto& key_shard_map = maps[key_sid].value();

  // Key set will be first element of shard ScoredMap vector. Scored map for shard containing key
  // should have least one - key set. If it is empty we don't need anything and return
  // immediately.
  if (key_shard_map[0].empty()) {
    return {};
  }

  // Store key set values in result and remove it from vector for further calculations.
  *result = std::move(key_shard_map[0]);
  key_shard_map.erase(key_shard_map.begin());

  auto filter = [&result](const auto& key) mutable {
    auto it = result->find(key);
    if (it != result->end()) {
      result->erase(it);
    }
  };

  // Total O(L)
  // Iterate over the results of each shard
  for (auto& vsm : maps) {
    // Iterate over each fetched set
    for (auto& sm : vsm.value()) {
      // Iterate over each key in the fetched set and filter
      for (auto& [key, value] : sm) {
        filter(key);
      }
    }
  }

  vector<ScoredMemberView> smvec;
  for (const auto& elem : *result) {
    smvec.emplace_back(elem.second, elem.first);
  }

  // Total O(KlogK)
  std::sort(std::begin(smvec), std::end(smvec));

  return smvec;
}

void CmdZDiff(CmdArgList args, CommandContext* cmd_cntx) {
  vector<OpResult<vector<ScoredMap>>> maps(shard_set->size(), OpStatus::OK);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    maps[shard->shard_id()] = OpFetch(shard, t, false /* no destination key */);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  // Check shard results for WRONG_TYPE returned
  for (auto& sm_map : maps) {
    if (sm_map.status() == OpStatus::WRONG_TYPE) {
      cmd_cntx->SendError(sm_map.status());
      return;
    }
  }

  const string_view key = ArgS(args, 1);
  const ShardId sid = Shard(key, shard_set->size());

  // We need to have result stored and not be destructed before function ends because
  // we are passing string_view of result members to other functions
  ScoredMap result;
  // Calculate diff between sets.
  vector<ScoredMemberView> smvec = ZDiffOp(sid, std::move(maps), &result);

  // Empty result set so return
  if (smvec.empty()) {
    rb->SendEmptyArray();
    return;
  }

  const bool with_scores = absl::EqualsIgnoreCase(ArgS(args, args.size() - 1), "WITHSCORES");
  bool is_resp3 = rb->IsResp3();
  rb->StartArray(smvec.size() * ((with_scores && !is_resp3) ? 2 : 1));
  for (const auto& [score, key] : smvec) {
    if (is_resp3)
      rb->StartArray(with_scores ? 2 : 1);
    rb->SendBulkString(key);
    if (with_scores) {
      rb->SendDouble(score);
    }
  }
}

void CmdZDiffStore(CmdArgList args, CommandContext* cmd_cntx) {
  vector<OpResult<vector<ScoredMap>>> maps(shard_set->size(), OpStatus::OK);
  const string_view dest_key = ArgS(args, 0);
  const ShardId dest_shard = Shard(dest_key, shard_set->size());

  auto cb = [&](Transaction* t, EngineShard* shard) {
    // We skip destkey if shard id matches
    const bool skip_dest_key = shard->shard_id() == dest_shard;
    maps[shard->shard_id()] = OpFetch(shard, t, skip_dest_key);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(cb), false);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  // Check shard results for WRONG_TYPE returned
  for (auto& sm_map : maps) {
    if (sm_map.status() == OpStatus::WRONG_TYPE) {
      cmd_cntx->tx()->Conclude();
      return cmd_cntx->SendError(sm_map.status());
    }
  }

  const string_view key = ArgS(args, 2);
  const ShardId sid = Shard(key, shard_set->size());

  // We need to have result stored and not be destructed before function ends because
  // we are passing string_view of result members to other functions
  ScoredMap result;
  // Calculate diff between sets. We stil need to write  destination key even it is empty set
  vector<ScoredMemberView> smvec = ZDiffOp(sid, std::move(maps), &result);

  auto store_cb = [&](Transaction* t, EngineShard* shard) {
    if (shard->shard_id() == dest_shard)
      ZSetFamily::OpAdd(t->GetOpArgs(shard),
                        ZSetFamily::ZParams{.override = true, .journal_update = true}, dest_key,
                        smvec);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(store_cb, true);
  rb->SendLong(smvec.size());
}

void CmdZIncrBy(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view score_arg = ArgS(args, 1);

  ScoredMemberView scored_member;
  scored_member.second = ArgS(args, 2);

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (!absl::SimpleAtod(score_arg, &scored_member.first)) {
    VLOG(1) << "Bad score:" << score_arg << "|";
    return rb->SendError(kInvalidFloatErr);
  }

  if (isnan(scored_member.first)) {
    return rb->SendError(kScoreNaN);
  }

  ZSetFamily::ZParams zparams;
  zparams.flags = ZADD_IN_INCR;

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return ZSetFamily::OpAdd(t->GetOpArgs(shard), zparams, key,
                             ScoredMemberSpan{&scored_member, 1});
  };

  OpResult add_result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (add_result.status() == OpStatus::WRONG_TYPE) {
    return rb->SendError(kWrongTypeErr);
  }

  if (add_result.status() == OpStatus::SKIPPED) {
    return rb->SendNull();
  }

  if (add_result->is_nan) {
    return rb->SendError(kScoreNaN);
  }

  rb->SendDouble(add_result->new_score);
}

void CmdZInter(CmdArgList args, CommandContext* cmd_cntx) {
  ZBooleanOperation(args, "zinter", false, false, cmd_cntx);
}

void CmdZInterStore(CmdArgList args, CommandContext* cmd_cntx) {
  ZBooleanOperation(args, "zinterstore", false, true, cmd_cntx);
}

void CmdZInterCard(CmdArgList args, CommandContext* cmd_cntx) {
  unsigned num_keys;
  auto* builder = cmd_cntx->rb();

  if (!absl::SimpleAtoi(ArgS(args, 0), &num_keys)) {
    return cmd_cntx->SendError(OpStatus::SYNTAX_ERR);
  }

  uint64_t limit = 0;
  if (args.size() == (1 + num_keys + 2) && ArgS(args, 1 + num_keys) == "LIMIT") {
    if (!absl::SimpleAtoi(ArgS(args, 1 + num_keys + 1), &limit)) {
      return builder->SendError("limit value is not a positive integer", kSyntaxErrType);
    }
  } else if (args.size() != 1 + num_keys) {
    return builder->SendError(kSyntaxErr);
  }

  vector<OpResult<ScoredMap>> maps(shard_set->size(), OpStatus::SKIPPED);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    maps[shard->shard_id()] = OpInter(shard, t, "", AggType::NOOP, {}, false);
    return OpStatus::OK;
  };

  cmd_cntx->tx()->ScheduleSingleHop(std::move(cb));

  OpResult<ScoredMap> result = IntersectResults(maps, AggType::NOOP);
  if (!result)
    return cmd_cntx->SendError(result.status());

  if (0 < limit && limit < result.value().size()) {
    return builder->SendLong(limit);
  }
  builder->SendLong(result.value().size());
}

// Generic function for ZMPop and BZMPop commands
void ZMPopGeneric(CmdArgList args, CommandContext* cmd_cntx, bool is_blocking) {
  ValidateZMPopResult zmpop_args;
  if (!ValidateZMPopCommand(args, is_blocking, cmd_cntx, &zmpop_args)) {
    return;
  }
  auto* response_builder = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  // From the list of input keys, keep the first (in the order of keys in the command) key found
  // in the current shard.
  std::vector<std::optional<std::string_view>> first_found_key_per_shard_vec(shard_set->size(),
                                                                             std::nullopt);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    std::optional<std::string_view> result = GetFirstNonEmptyKeyFound(shard, t);
    if (result.has_value()) {
      first_found_key_per_shard_vec[shard->shard_id()] = result;
    }
    return OpStatus::OK;
  };

  cmd_cntx->tx()->Execute(std::move(cb), false /* possibly another hop */);

  // Keep all the keys found (first only for each shard) in a set for fast lookups.
  absl::flat_hash_set<std::string_view> first_found_keys_for_shard;
  // We can have at most one result from each shard.
  first_found_keys_for_shard.reserve(std::min(shard_set->size(), zmpop_args.num_keys));
  for (const auto& key : first_found_key_per_shard_vec) {
    if (!key.has_value()) {
      continue;
    }
    first_found_keys_for_shard.insert(*key);
  }

  // Now that we have the first non empty key from each shard, find the first overall first key
  // and pop elements from it.
  std::optional<std::string_view> key_to_pop = std::nullopt;
  // BZMPOP have 1 extra argument as compared to ZMPOP hence adding 1 is is_blocking is true
  ArgRange arg_keys(args.subspan(1 + is_blocking, zmpop_args.num_keys));
  // Find the first arg_key which exists in any shard and is not empty.
  for (std::string_view key : arg_keys) {
    if (first_found_keys_for_shard.contains(key)) {
      key_to_pop = key;
      break;
    }
  }

  if (!key_to_pop.has_value() && (!is_blocking || cmd_cntx->tx()->IsMulti())) {
    cmd_cntx->tx()->Conclude();
    response_builder->SendNull();
    return;
  }
  // if we don't have any key to pop and it's blocking then we will block it using `WaitOnWatch`
  if (is_blocking && !key_to_pop.has_value()) {
    auto trans = cmd_cntx->tx();
    auto* cntx = cmd_cntx->server_conn_cntx();
    auto* ns = &trans->GetNamespace();

    auto limit_tp = Transaction::time_point::max();
    auto limit_ms = (unsigned)(zmpop_args.timeout * 1000);
    if (limit_ms > 0) {
      using namespace std::chrono;
      limit_tp = steady_clock::now() + milliseconds(limit_ms);
    }
    const auto key_checker = [ns](EngineShard* owner, const DbContext& context, Transaction*,
                                  std::string_view key) -> bool {
      return ns->GetDbSlice(owner->shard_id()).FindReadOnly(context, key, OBJ_ZSET).ok();
    };

    DCHECK(trans->IsScheduled());  // Checking if the transaction is scheduled before calling
                                   // `WaitOnWatch`
    auto status = trans->WaitOnWatch(limit_tp, Transaction::kShardArgs, key_checker, &cntx->blocked,
                                     &cntx->paused);

    if (status != OpStatus::OK) {
      response_builder->SendNull();
      return;
    }

    auto cb = [&key_to_pop](Transaction* t, EngineShard* shard) {
      if (auto wake_key = t->GetWakeKey(shard->shard_id()); wake_key) {
        key_to_pop = *wake_key;
      }
      return OpStatus::OK;
    };
    trans->Execute(std::move(cb), false);
  }

  DCHECK(key_to_pop.has_value());

  // Pop elements from relevant set.
  OpResult<ScoredArray> pop_result = ZPopMinMaxInternal(
      *key_to_pop, FilterShards::YES, zmpop_args.pop_count, zmpop_args.is_max, cmd_cntx->tx());

  if (pop_result.status() == OpStatus::WRONG_TYPE) {
    return response_builder->SendError(kWrongTypeErr);
  }

  LOG_IF(WARNING, !pop_result) << "Unexpected status " << pop_result.status();
  response_builder->SendLabeledScoredArray(*key_to_pop, pop_result.value());
}

void CmdZMPop(CmdArgList args, CommandContext* cmd_cntx) {
  ZMPopGeneric(args, cmd_cntx, false);
}

void CmdBZMPop(CmdArgList args, CommandContext* cmd_cntx) {
  ZMPopGeneric(args, cmd_cntx, true);
}

void CmdZPopMax(CmdArgList args, CommandContext* cmd_cntx) {
  ZPopMinMaxFromArgs(args, true, cmd_cntx);
}

void CmdZPopMin(CmdArgList args, CommandContext* cmd_cntx) {
  ZPopMinMaxFromArgs(args, false, cmd_cntx);
}

void CmdZLexCount(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);

  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::LexInterval li;
  if (!ParseLexBound(min_s, &li.first) || !ParseLexBound(max_s, &li.second)) {
    return cmd_cntx->SendError(kLexRangeErr);
  }

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpLexCount(t->GetOpArgs(shard), key, li);
  };

  OpResult<unsigned> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    cmd_cntx->SendError(kWrongTypeErr);
  } else {
    cmd_cntx->SendLong(*result);
  }
}

using RangeParams = ZSetFamily::RangeParams;

void CmdZRange(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{}, cmd_cntx);
}

void CmdZRank(CmdArgList args, CommandContext* cmd_cntx) {
  ZRankGeneric(args, false, cmd_cntx);
}

void CmdZRevRange(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{.reverse = true}, cmd_cntx);
}

void CmdZRangeByScore(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{.interval_type = RangeParams::SCORE}, cmd_cntx);
}

void CmdZRangeStore(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args.subspan(1), RangeParams{.with_scores = true, .store_key = ArgS(args, 0)},
                cmd_cntx);
}

void CmdZRevRangeByScore(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{.reverse = true, .interval_type = RangeParams::SCORE}, cmd_cntx);
}

void CmdZRevRank(CmdArgList args, CommandContext* cmd_cntx) {
  ZRankGeneric(args, true, cmd_cntx);
}

void CmdZRangeByLex(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{.interval_type = RangeParams::LEX}, cmd_cntx);
}

void CmdZRevRangeByLex(CmdArgList args, CommandContext* cmd_cntx) {
  ZRangeGeneric(args, RangeParams{.reverse = true, .interval_type = RangeParams::LEX}, cmd_cntx);
}

void CmdZRemRangeByRank(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::IndexInterval ii;
  if (!SimpleAtoi(min_s, &ii.first) || !SimpleAtoi(max_s, &ii.second)) {
    return cmd_cntx->SendError(kInvalidIntErr);
  }

  ZSetFamily::ZRangeSpec range_spec;
  range_spec.interval = ii;
  ZRemRangeGeneric(key, range_spec, cmd_cntx);
}

void CmdZRemRangeByScore(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::ScoreInterval si;
  if (!ParseBound(min_s, &si.first) || !ParseBound(max_s, &si.second)) {
    return cmd_cntx->SendError(kFloatRangeErr);
  }

  ZSetFamily::ZRangeSpec range_spec;

  range_spec.interval = si;

  ZRemRangeGeneric(key, range_spec, cmd_cntx);
}

void CmdZRemRangeByLex(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view min_s = ArgS(args, 1);
  string_view max_s = ArgS(args, 2);

  ZSetFamily::LexInterval li;
  if (!ParseLexBound(min_s, &li.first) || !ParseLexBound(max_s, &li.second)) {
    return cmd_cntx->SendError(kLexRangeErr);
  }

  ZSetFamily::ZRangeSpec range_spec;

  range_spec.interval = li;

  ZRemRangeGeneric(key, range_spec, cmd_cntx);
}

void CmdZRem(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  auto members = args.subspan(1);
  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRem(t->GetOpArgs(shard), key, members);
  };

  OpResult<unsigned> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    cmd_cntx->SendError(kWrongTypeErr);
  } else {
    cmd_cntx->SendLong(*result);
  }
}

void CmdZRandMember(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (args.size() > 3)
    return rb->SendError(WrongNumArgsError("ZRANDMEMBER"));

  CmdArgParser parser{args};
  string_view key = parser.Next();

  bool is_count = parser.HasNext();
  int count = is_count ? parser.Next<int>() : 1;

  ZSetFamily::RangeParams params;
  params.with_scores = static_cast<bool>(parser.Check("WITHSCORES"));

  if (parser.HasNext())
    return rb->SendError(absl::StrCat("Unsupported option:", string_view(parser.Next())));

  RETURN_ON_PARSE_ERROR(parser, cmd_cntx);

  const auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpRandMember(count, params, t->GetOpArgs(shard), key);
  };

  OpResult<ScoredArray> result = cmd_cntx->tx()->ScheduleSingleHopT(cb);
  if (result) {
    rb->SendScoredArray(result.value(), params.with_scores);
  } else if (result.status() == OpStatus::KEY_NOTFOUND) {
    if (is_count) {
      rb->SendScoredArray(ScoredArray(), params.with_scores);
    } else {
      rb->SendNull();
    }
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdZScore(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view member = ArgS(args, 1);

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return ZSetFamily::OpScore(t->GetOpArgs(shard), key, member);
  };

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());
  OpResult<double> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() == OpStatus::WRONG_TYPE) {
    rb->SendError(kWrongTypeErr);
  } else if (!result) {
    rb->SendNull();
  } else {
    rb->SendDouble(*result);
  }
}

void CmdZMScore(CmdArgList args, CommandContext* cmd_cntx) {
  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  OpResult<MScoreResponse> result = ZSetFamily::ZGetMembers(args, cmd_cntx->tx(), rb);

  if (result.status() == OpStatus::WRONG_TYPE) {
    return rb->SendError(kWrongTypeErr);
  }
  rb->StartArray(result->size());  // Array return type.
  const MScoreResponse& array = result.value();
  for (const auto& p : array) {
    if (p) {
      rb->SendDouble(*p);
    } else {
      rb->SendNull();
    }
  }
}

void CmdZScan(CmdArgList args, CommandContext* cmd_cntx) {
  string_view key = ArgS(args, 0);
  string_view token = ArgS(args, 1);

  uint64_t cursor = 0;

  auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx->rb());

  if (!absl::SimpleAtoi(token, &cursor)) {
    return cmd_cntx->SendError("invalid cursor");
  }

  OpResult<ScanOpts> ops = ScanOpts::TryFrom(args.subspan(2));
  if (!ops) {
    DVLOG(1) << "Scan invalid args - return " << ops << " to the user";
    return cmd_cntx->SendError(ops.status());
  }
  const ScanOpts& scan_op = ops.value();

  auto cb = [&](Transaction* t, EngineShard* shard) {
    return OpScan(t->GetOpArgs(shard), key, &cursor, scan_op);
  };

  OpResult<StringVec> result = cmd_cntx->tx()->ScheduleSingleHopT(std::move(cb));
  if (result.status() != OpStatus::WRONG_TYPE) {
    rb->StartArray(2);
    rb->SendBulkString(absl::StrCat(cursor));
    rb->StartArray(result->size());  // Within scan the returned page is of type array.
    for (const auto& k : *result) {
      rb->SendBulkString(k);
    }
  } else {
    cmd_cntx->SendError(result.status());
  }
}

void CmdZUnion(CmdArgList args, CommandContext* cmd_cntx) {
  ZBooleanOperation(args, "zunion", true, false, cmd_cntx);
}

void CmdZUnionStore(CmdArgList args, CommandContext* cmd_cntx) {
  ZBooleanOperation(args, "zunionstore", true, true, cmd_cntx);
}

}  // namespace

#define HFUNC(x) SetHandler(&Cmd##x)

LoadBlobResult ZSetFamily::LoadZiplistBlob(std::string_view blob, PrimeValue* pv) {
  unsigned char* lp = lpNew(blob.size());
  if (!ZiplistPairsConvertAndValidateIntegrity((const uint8_t*)blob.data(), blob.size(), &lp)) {
    LOG(ERROR) << "Zset ziplist integrity check failed.";
    zfree(lp);
    return LoadBlobResult::kCorrupted;
  }

  if (lpLength(lp) == 0) {
    lpFree(lp);
    return LoadBlobResult::kEmpty;
  }

  unsigned encoding = OBJ_ENCODING_LISTPACK;
  void* inner;
  if (lpBytes(lp) >= server.max_listpack_map_bytes) {
    inner = detail::SortedMap::FromListPack(CompactObj::memory_resource(), lp);
    lpFree(lp);
    encoding = OBJ_ENCODING_SKIPLIST;
  } else {
    lp = lpShrinkToFit(lp);
    inner = lp;
  }

  pv->InitRobj(OBJ_ZSET, encoding, inner);
  return LoadBlobResult::kSuccess;
}

LoadBlobResult ZSetFamily::LoadListpackBlob(std::string_view blob, PrimeValue* pv) {
  if (!lpValidateIntegrity((uint8_t*)blob.data(), blob.size(), 0, nullptr, nullptr)) {
    LOG(ERROR) << "Zset listpack integrity check failed.";
    return LoadBlobResult::kCorrupted;
  }

  unsigned char* src_lp = (unsigned char*)blob.data();
  unsigned long long bytes = lpBytes(src_lp);
  unsigned char* lp = (uint8_t*)zmalloc(bytes);
  std::memcpy(lp, src_lp, bytes);
  pv->InitRobj(OBJ_ZSET, OBJ_ENCODING_LISTPACK, lp);
  return LoadBlobResult::kSuccess;
}

void ZSetFamily::Register(CommandRegistry* registry) {
  constexpr uint32_t kStoreMask =
      CO::JOURNALED | CO::VARIADIC_KEYS | CO::DENYOOM | CO::NO_AUTOJOURNAL;
  registry->StartFamily(acl::SORTEDSET);
  // TODO: to add support for SCRIPT for BZPOPMIN, BZPOPMAX similarly to BLPOP.
  // We break up chain into multiple calls to reduce stack usage in this function.
  *registry << CI{"ZADD", CO::FAST | CO::JOURNALED | CO::DENYOOM, -4, 1, 1}.HFUNC(ZAdd)
            << CI{"BZPOPMIN", CO::JOURNALED | CO::NOSCRIPT | CO::BLOCKING | CO::NO_AUTOJOURNAL, -3,
                  1, -2}
                   .HFUNC(BZPopMin)
            << CI{"BZPOPMAX", CO::JOURNALED | CO::NOSCRIPT | CO::BLOCKING | CO::NO_AUTOJOURNAL, -3,
                  1, -2}
                   .HFUNC(BZPopMax)
            << CI{"ZCARD", CO::FAST | CO::READONLY, 2, 1, 1}.HFUNC(ZCard)
            << CI{"ZCOUNT", CO::FAST | CO::READONLY, 4, 1, 1}.HFUNC(ZCount)
            << CI{"ZDIFF", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2}.HFUNC(ZDiff);

  *registry << CI{"ZDIFFSTORE", kStoreMask, -4, 3, 3}.HFUNC(ZDiffStore)
            << CI{"ZINCRBY", CO::FAST | CO::JOURNALED, 4, 1, 1}.HFUNC(ZIncrBy)
            << CI{"ZINTERSTORE", kStoreMask, -4, 3, 3}.HFUNC(ZInterStore)
            << CI{"ZINTER", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2}.HFUNC(ZInter)
            << CI{"ZINTERCARD", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2}.HFUNC(ZInterCard)
            << CI{"ZLEXCOUNT", CO::READONLY, 4, 1, 1}.HFUNC(ZLexCount)
            << CI{"ZMPOP", CO::JOURNALED | CO::VARIADIC_KEYS | CO::NO_AUTOJOURNAL, -4, 2, 2}.HFUNC(
                   ZMPop)
            << CI{"BZMPOP", CO::JOURNALED | CO::VARIADIC_KEYS | CO::BLOCKING | CO::NO_AUTOJOURNAL,
                  -5, 3, 3}
                   .HFUNC(BZMPop);

  *registry << CI{"ZPOPMAX", CO::FAST | CO::JOURNALED, -2, 1, 1}.HFUNC(ZPopMax)
            << CI{"ZPOPMIN", CO::FAST | CO::JOURNALED, -2, 1, 1}.HFUNC(ZPopMin)
            << CI{"ZREM", CO::FAST | CO::JOURNALED, -3, 1, 1}.HFUNC(ZRem)
            << CI{"ZRANGE", CO::READONLY, -4, 1, 1}.HFUNC(ZRange)
            << CI{"ZRANDMEMBER", CO::READONLY, -2, 1, 1}.HFUNC(ZRandMember)
            << CI{"ZRANK", CO::READONLY | CO::FAST, -3, 1, 1}.HFUNC(ZRank)
            << CI{"ZRANGEBYLEX", CO::READONLY, -4, 1, 1}.HFUNC(ZRangeByLex)
            << CI{"ZRANGEBYSCORE", CO::READONLY, -4, 1, 1}.HFUNC(ZRangeByScore)
            << CI{"ZRANGESTORE", CO::JOURNALED | CO::DENYOOM | CO::NO_AUTOJOURNAL, -5, 1, 2}.HFUNC(
                   ZRangeStore);

  *registry << CI{"ZSCORE", CO::READONLY | CO::FAST, 3, 1, 1}.HFUNC(ZScore)
            << CI{"ZMSCORE", CO::READONLY | CO::FAST, -3, 1, 1}.HFUNC(ZMScore)
            << CI{"ZREMRANGEBYRANK", CO::JOURNALED, 4, 1, 1}.HFUNC(ZRemRangeByRank)
            << CI{"ZREMRANGEBYSCORE", CO::JOURNALED, 4, 1, 1}.HFUNC(ZRemRangeByScore)
            << CI{"ZREMRANGEBYLEX", CO::JOURNALED, 4, 1, 1}.HFUNC(ZRemRangeByLex)
            << CI{"ZREVRANGE", CO::READONLY, -4, 1, 1}.HFUNC(ZRevRange)
            << CI{"ZREVRANGEBYLEX", CO::READONLY, -4, 1, 1}.HFUNC(ZRevRangeByLex)
            << CI{"ZREVRANGEBYSCORE", CO::READONLY, -4, 1, 1}.HFUNC(ZRevRangeByScore)
            << CI{"ZREVRANK", CO::READONLY | CO::FAST, -3, 1, 1}.HFUNC(ZRevRank)
            << CI{"ZSCAN", CO::READONLY, -3, 1, 1}.HFUNC(ZScan)
            << CI{"ZUNION", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2}.HFUNC(ZUnion)
            << CI{"ZUNIONSTORE", kStoreMask, -4, 3, 3}.HFUNC(ZUnionStore);
}

}  // namespace dfly


================================================
FILE: src/server/zset_family.h
================================================
// Copyright 2025, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <string_view>
#include <variant>

#include "facade/op_status.h"
#include "server/common.h"
#include "server/table.h"

namespace facade {
class SinkReplyBuilder;
}  // namespace facade

namespace dfly {

struct OpArgs;

class ZSetFamily {
 public:
  static void Register(CommandRegistry* registry);

  static LoadBlobResult LoadZiplistBlob(std::string_view blob, PrimeValue* pv);
  static LoadBlobResult LoadListpackBlob(std::string_view blob, PrimeValue* pv);

  using IndexInterval = std::pair<int64_t, int64_t>;
  using MScoreResponse = std::vector<std::optional<double>>;

  struct Bound {
    double val;
    bool is_open = false;
    Bound() = default;
    Bound(double v, bool open) : val(v), is_open(open) {
    }
  };

  using ScoreInterval = std::pair<Bound, Bound>;

  struct LexBound {
    std::string_view val;
    enum Type : uint8_t { PLUS_INF, MINUS_INF, OPEN, CLOSED } type = CLOSED;
    LexBound() = default;
    LexBound(std::string_view v, Type t) : val(v), type(t) {
    }
  };

  using LexInterval = std::pair<LexBound, LexBound>;

  using TopNScored = uint32_t;

  struct RangeParams {
    uint32_t offset = 0;
    uint32_t limit = UINT32_MAX;
    bool with_scores = false;
    bool reverse = false;
    enum IntervalType : uint8_t { LEX, RANK, SCORE } interval_type = RANK;
    std::optional<std::string_view> store_key = std::nullopt;
  };

  struct ZRangeSpec {
    std::variant<IndexInterval, ScoreInterval, LexInterval, TopNScored> interval;
    RangeParams params;
    ZRangeSpec() = default;
    ZRangeSpec(const ScoreInterval& si, const RangeParams& rp) : interval(si), params(rp){};
  };

  struct ZParams {
    unsigned flags = 0;  // mask of ZADD_IN_ macros.
    bool ch = false;     // Corresponds to CH option.
    bool override = false;
    bool journal_update = false;
  };

  using ScoredMember = std::pair<std::string, double>;
  using ScoredArray = std::vector<ScoredMember>;
  using ScoredMemberView = std::pair<double, std::string_view>;
  using ScoredMemberSpan = absl::Span<const ScoredMemberView>;

  using SinkReplyBuilder = facade::SinkReplyBuilder;
  template <typename T> using OpResult = facade::OpResult<T>;

  // Used by GeoFamily also
  static void ZAddGeneric(std::string_view key, const ZParams& zparams, ScoredMemberSpan memb_sp,
                          CommandContext* cmd_cntx);

  static OpResult<MScoreResponse> ZGetMembers(CmdArgList args, Transaction* tx,
                                              SinkReplyBuilder* builder);

  static OpResult<std::vector<ScoredArray>> OpRanges(const std::vector<ZRangeSpec>& range_specs,
                                                     const OpArgs& op_args, std::string_view key);

  struct AddResult {
    double new_score = 0;
    unsigned num_updated = 0;

    bool is_nan = false;
  };

  static OpResult<AddResult> OpAdd(const OpArgs& op_args, const ZParams& zparams,
                                   std::string_view key, ScoredMemberSpan members);

  static OpResult<void> OpKeyExisted(const OpArgs& op_args, std::string_view key);

  static OpResult<double> OpScore(const OpArgs& op_args, std::string_view key,
                                  std::string_view member);
};

}  // namespace dfly


================================================
FILE: src/server/zset_family_test.cc
================================================
// Copyright 2022, DragonflyDB authors.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/zset_family.h"

#include <vector>

#include "base/gtest.h"
#include "base/logging.h"
#include "facade/facade_test.h"
#include "server/test_utils.h"

using namespace testing;
using namespace std;
using namespace util;

namespace dfly {

class ZSetFamilyTest : public BaseFamilyTest {
 protected:
};

using ScoredElement = std::pair<std::string, std::string>;

template <typename Array> auto ParseToScoredArray(Array arr) {
  std::vector<ScoredElement> scored_elements;
  for (std::size_t i = 1; i < arr.size(); i += 2) {
    scored_elements.emplace_back(arr[i - 1].GetString(), arr[i].GetString());
  }
  return scored_elements;
}

MATCHER_P(ConsistsOfMatcher, elements, "") {
  auto vec = arg.GetVec();
  for (const auto& x : vec) {
    if (elements.find(x.GetString()) == elements.end()) {
      return false;
    }
  }
  return true;
}

MATCHER_P(ConsistsOfScoredElementsMatcher, elements, "") {
  auto vec = arg.GetVec();
  if (vec.size() % 2) {
    return false;
  }

  auto scored_vec = ParseToScoredArray(vec);
  for (const auto& scored_element : scored_vec) {
    if (elements.find(scored_element) == elements.end()) {
      return false;
    }
  }
  return true;
}

MATCHER_P(IsScoredSubsetOfMatcher, elements_list, "") {
  auto vec = arg.GetVec();
  if (vec.size() % 2) {
    return false;
  }

  auto scored_vec = ParseToScoredArray(vec);
  std::vector<ScoredElement> elements{elements_list};

  std::sort(scored_vec.begin(), scored_vec.end());
  std::sort(elements.begin(), elements.end());

  return std::includes(elements.begin(), elements.end(), scored_vec.begin(), scored_vec.end());
}

MATCHER_P(UnorderedScoredElementsAreMatcher, elements_list, "") {
  auto vec = arg.GetVec();
  if (vec.size() % 2) {
    return false;
  }

  auto scored_vec = ParseToScoredArray(vec);
  return std::is_permutation(scored_vec.begin(), scored_vec.end(), elements_list.begin(),
                             elements_list.end());
}

MATCHER_P2(ContainsLabeledScoredArrayMatcher, label, elements, "") {
  auto label_vec = arg.GetVec();
  if (label_vec.size() != 2) {
    *result_listener << "Labeled Scored Array does no contain two elements.";
    return false;
  }

  if (!ExplainMatchResult(Eq(label), label_vec[0].GetString(), result_listener)) {
    return false;
  }

  auto value_pairs_vec = label_vec[1].GetVec();
  std::set<std::pair<std::string, std::string>> actual_elements;
  for (const auto& scored_element : value_pairs_vec) {
    actual_elements.insert(std::make_pair(scored_element.GetVec()[0].GetString(),
                                          scored_element.GetVec()[1].GetString()));
  }
  if (actual_elements != elements) {
    *result_listener << "Scored elements do not match: ";
    ExplainMatchResult(ElementsAreArray(elements), actual_elements, result_listener);
    return false;
  }

  return true;
}

auto ConsistsOf(std::initializer_list<std::string> elements) {
  return ConsistsOfMatcher(std::unordered_set<std::string>{elements});
}

auto ConsistsOfScoredElements(std::initializer_list<std::pair<std::string, std::string>> elements) {
  return ConsistsOfScoredElementsMatcher(std::set<std::pair<std::string, std::string>>{elements});
}

auto IsScoredSubsetOf(std::initializer_list<std::pair<std::string, std::string>> elements) {
  return IsScoredSubsetOfMatcher(elements);
}

auto UnorderedScoredElementsAre(
    std::initializer_list<std::pair<std::string, std::string>> elements) {
  return UnorderedScoredElementsAreMatcher(elements);
}

auto ContainsLabeledScoredArray(
    std::string_view label, std::initializer_list<std::pair<std::string, std::string>> elements) {
  return ContainsLabeledScoredArrayMatcher(label,
                                           std::set<std::pair<std::string, std::string>>{elements});
}

TEST_F(ZSetFamilyTest, Add) {
  auto resp = Run({"zadd", "x", "1.1", "a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zscore", "x", "a"});
  EXPECT_THAT(resp, "1.1");

  resp = Run({"zadd", "x", "2", "a"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"zscore", "x", "a"});
  EXPECT_THAT(resp, "2");

  resp = Run({"zadd", "x", "ch", "3", "a"});
  EXPECT_THAT(resp, IntArg(1));
  resp = Run({"zscore", "x", "a"});
  EXPECT_EQ(resp, "3");

  resp = Run({"zcard", "x"});
  EXPECT_THAT(resp, IntArg(1));

  EXPECT_THAT(Run({"zadd", "x", "", "a"}), ErrArg("not a valid float"));

  EXPECT_THAT(Run({"zadd", "ztmp", "xx", "10", "member"}), IntArg(0));

  const char kHighPrecision[] = "0.79028573343077946";

  Run({"zadd", "zs", kHighPrecision, "a"});
  EXPECT_EQ(Run({"zscore", "zs", "a"}), "0.7902857334307795");
  EXPECT_EQ(0.79028573343077946, 0.7902857334307795);

  resp = Run({"zadd", "x", "1.1", ""});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zscore", "x", ""});
  EXPECT_EQ(resp, "1.1");
}

TEST_F(ZSetFamilyTest, AddNonUniqeMembers) {
  auto resp = Run({"zadd", "x", "2", "a", "1", "a"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zscore", "x", "a"});
  EXPECT_EQ(resp, "1");

  resp = Run({"zadd", "y", "3", "a", "1", "a", "2", "b"});
  EXPECT_THAT(resp, IntArg(2));
  EXPECT_EQ("1", Run({"zscore", "y", "a"}));
}

TEST_F(ZSetFamilyTest, ZRem) {
  auto resp = Run({"zadd", "x", "1.1", "b", "2.1", "a"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zrem", "x", "b", "c"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zcard", "x"});
  EXPECT_THAT(resp, IntArg(1));
  EXPECT_THAT(Run({"zrange", "x", "0", "3", "byscore"}), "a");
  EXPECT_THAT(Run({"zrange", "x", "(-inf", "(+inf", "byscore"}), "a");
}

TEST_F(ZSetFamilyTest, ZRandMember) {
  auto resp = Run({"ZAdd", "x", "1", "a", "2", "b", "3", "c"});
  EXPECT_THAT(resp, IntArg(3));

  // Test if count > 0
  resp = Run({"ZRandMember", "x"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"ZRandMember", "x", "1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"ZRandMember", "x", "2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), IsSubsetOf({"a", "b", "c"}));

  resp = Run({"ZRandMember", "x", "3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("a", "b", "c"));

  // Test if count < 0
  resp = Run({"ZRandMember", "x", "-1"});
  ASSERT_THAT(resp, ArgType(RespExpr::STRING));
  EXPECT_THAT(resp, AnyOf("a", "b", "c"));

  resp = Run({"ZRandMember", "x", "-2"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  resp = Run({"ZRandMember", "x", "-3"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  // Test if count < 0, but the absolute value is larger than the size of the sorted set
  resp = Run({"ZRandMember", "x", "-15"});
  ASSERT_THAT(resp, ArrLen(15));
  EXPECT_THAT(resp, ConsistsOf({"a", "b", "c"}));

  // Test if count is 0
  ASSERT_THAT(Run({"ZRandMember", "x", "0"}), ArrLen(0));

  // Test if count is larger than the size of the sorted set
  resp = Run({"ZRandMember", "x", "15"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), UnorderedElementsAre("a", "b", "c"));

  // Test if sorted set is empty
  EXPECT_THAT(Run({"ZAdd", "empty::zset", "1", "one"}), IntArg(1));
  EXPECT_THAT(Run({"ZRem", "empty::zset", "one"}), IntArg(1));
  ASSERT_THAT(Run({"ZRandMember", "empty::zset", "0"}), ArrLen(0));
  ASSERT_THAT(Run({"ZRandMember", "empty::zset", "3"}), ArrLen(0));
  ASSERT_THAT(Run({"ZRandMember", "empty::zset", "-4"}), ArrLen(0));

  // Test if key does not exist
  ASSERT_THAT(Run({"ZRandMember", "y"}), ArgType(RespExpr::NIL));
  ASSERT_THAT(Run({"ZRandMember", "y", "0"}), ArrLen(0));

  // Test WITHSCORES
  resp = Run({"ZRandMember", "x", "1", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp, IsScoredSubsetOf({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "2", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp, IsScoredSubsetOf({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "3", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(6));
  EXPECT_THAT(resp, UnorderedScoredElementsAre({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "15", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(6));
  EXPECT_THAT(resp, UnorderedScoredElementsAre({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "-1", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp, ConsistsOfScoredElements({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "-2", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp, ConsistsOfScoredElements({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "-3", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(6));
  EXPECT_THAT(resp, ConsistsOfScoredElements({{"a", "1"}, {"b", "2"}, {"c", "3"}}));

  resp = Run({"ZRandMember", "x", "-15", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(30));
  EXPECT_THAT(resp, ConsistsOfScoredElements({{"a", "1"}, {"b", "2"}, {"c", "3"}}));
}

TEST_F(ZSetFamilyTest, ZMScore) {
  Run({"zadd", "zms", "3.14", "a"});
  Run({"zadd", "zms", "42", "another"});

  auto resp = Run({"zmscore", "zms", "another", "a", "nofield"});
  ASSERT_EQ(RespExpr::ARRAY, resp.type);
  EXPECT_THAT(resp.GetVec(), ElementsAre("42", "3.14", ArgType(RespExpr::NIL)));
}

// Test for ZMSCORE with member on a non-existent keys
TEST_F(ZSetFamilyTest, ZMScoreNonExistentKeys) {
  // Case 1: Single member with non-existent key (ZMSCORE abc x)
  auto resp = Run({"zmscore", "abc", "x"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // Case 2: Multiple members with non-existent key (ZMSCORE abc x y z)
  resp = Run({"zmscore", "abc", "x", "y", "z"});
  EXPECT_THAT(resp.GetVec(),
              ElementsAre(ArgType(RespExpr::NIL), ArgType(RespExpr::NIL), ArgType(RespExpr::NIL)));
}

TEST_F(ZSetFamilyTest, ByScore) {
  Run({"zadd", "x", "1.1", "a", "2.1", "b"});
  EXPECT_THAT(Run({"zrangebyscore", "x", "0", "(1.1"}), ArrLen(0));
  EXPECT_THAT(Run({"zrangebyscore", "x", "-inf", "1.1", "limit", "0", "10"}), "a");

  auto resp = Run({"zrangebyscore", "x", "-inf", "1.1", "limit", "0", "10", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("a", "1.1"));

  resp = Run({"zrangebyscore", "x", "-inf", "1.1", "WITHSCORES", "limit", "0", "10"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("a", "1.1"));

  resp = Run({"zrangebyscore", "x", "-inf", "+inf", "LIMIT", "0", "-1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("a", "b"));

  resp = Run({"zrevrangebyscore", "x", "+inf", "-inf", "limit", "0", "5"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre("b", "a"));

  EXPECT_EQ(2, CheckedInt({"zcount", "x", "1.1", "2.1"}));
  EXPECT_EQ(1, CheckedInt({"zcount", "x", "(1.1", "2.1"}));
  EXPECT_EQ(0, CheckedInt({"zcount", "y", "(1.1", "2.1"}));
}

TEST_F(ZSetFamilyTest, ZRank) {
  Run({"zadd", "x", "1.1", "a", "2.1", "b"});
  EXPECT_EQ(0, CheckedInt({"zrank", "x", "a"}));
  EXPECT_EQ(1, CheckedInt({"zrank", "x", "b"}));
  EXPECT_EQ(1, CheckedInt({"zrevrank", "x", "a"}));
  EXPECT_EQ(0, CheckedInt({"zrevrank", "x", "b"}));
  EXPECT_THAT(Run({"zrevrank", "x", "c"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"zrank", "y", "c"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"zrevrank", "x", "c", "WITHSCORE"}), ArgType(RespExpr::NIL));
  EXPECT_THAT(Run({"zrank", "y", "c", "WITHSCORE"}), ArgType(RespExpr::NIL));

  auto resp = Run({"zrank", "x", "a", "WITHSCORE"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(0), "1.1"));

  resp = Run({"zrank", "x", "b", "WITHSCORE"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(1), "2.1"));

  resp = Run({"zrevrank", "x", "a", "WITHSCORE"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(1), "1.1"));

  resp = Run({"zrevrank", "x", "b", "WITHSCORE"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(IntArg(0), "2.1"));

  resp = Run({"zrank", "x", "a", "WITHSCORES"});
  ASSERT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zrank", "x", "a", "WITHSCORES", "42"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments for 'zrank' command"));

  resp = Run({"zrevrank", "x", "a", "WITHSCORES", "42"});
  ASSERT_THAT(resp, ErrArg("wrong number of arguments for 'zrevrank' command"));
}

TEST_F(ZSetFamilyTest, LargeSet) {
  for (int i = 0; i < 129; ++i) {
    auto resp = Run({"zadd", "key", absl::StrCat(i), absl::StrCat("element:", i)});
    EXPECT_THAT(resp, IntArg(1)) << i;
  }
  Run({"zadd", "key", "129", ""});

  EXPECT_THAT(Run({"zrangebyscore", "key", "(-inf", "(0.0"}), ArrLen(0));
  EXPECT_THAT(Run({"zrangebyscore", "key", "(5", "0.0"}), ArrLen(0));
  EXPECT_THAT(Run({"zrangebylex", "key", "-", "(element:0"}), ArrLen(0));
  EXPECT_EQ(2, CheckedInt({"zremrangebyscore", "key", "127", "(129"}));
}

TEST_F(ZSetFamilyTest, ZRemRangeRank) {
  Run({"zadd", "x", "1.1", "a", "2.1", "b"});
  EXPECT_THAT(Run({"ZREMRANGEBYRANK", "y", "0", "1"}), IntArg(0));
  EXPECT_THAT(Run({"ZREMRANGEBYRANK", "x", "0", "0"}), IntArg(1));
  EXPECT_EQ(Run({"zrange", "x", "0", "5"}), "b");
  EXPECT_THAT(Run({"ZREMRANGEBYRANK", "x", "0", "1"}), IntArg(1));
  EXPECT_EQ(Run({"type", "x"}), "none");
}

TEST_F(ZSetFamilyTest, ZRemRangeScore) {
  Run({"zadd", "x", "1.1", "a", "2.1", "b"});
  EXPECT_THAT(Run({"ZREMRANGEBYSCORE", "y", "0", "1"}), IntArg(0));
  EXPECT_THAT(Run({"ZREMRANGEBYSCORE", "x", "-inf", "1.1"}), IntArg(1));
  EXPECT_EQ(Run({"zrange", "x", "0", "5"}), "b");
  EXPECT_THAT(Run({"ZREMRANGEBYSCORE", "x", "(2.0", "+inf"}), IntArg(1));
  EXPECT_EQ(Run({"type", "x"}), "none");
  EXPECT_THAT(Run({"zremrangebyscore", "x", "1", "NaN"}), ErrArg("min or max is not a float"));
}

TEST_F(ZSetFamilyTest, IncrBy) {
  auto resp = Run({"zadd", "key", "xx", "incr", "2.1", "member"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  resp = Run({"zadd", "key", "nx", "incr", "2.1", "member"});
  EXPECT_THAT(resp, "2.1");

  resp = Run({"zadd", "key", "nx", "incr", "4.9", "member"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));
}

TEST_F(ZSetFamilyTest, ByLex) {
  Run({
      "zadd", "key",      "0", "alpha", "0", "bar",   "0", "cool", "0", "down",
      "0",    "elephant", "0", "foo",   "0", "great", "0", "hill", "0", "omega",
  });

  auto resp = Run({"zrangebylex", "key", "-", "[cool"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("alpha", "bar", "cool"));

  EXPECT_EQ(3, CheckedInt({"ZLEXCOUNT", "key", "(foo", "+"}));
  EXPECT_EQ(0, CheckedInt({"ZLEXCOUNT", "key", "(foo", "[fop"}));
  EXPECT_EQ(3, CheckedInt({"ZREMRANGEBYLEX", "key", "(foo", "+"}));

  resp = Run({"zrangebylex", "key", "[a", "+"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre("alpha", "bar", "cool", "down", "elephant", "foo"));

  resp = Run({"zrangebylex", "key", "-", "+", "LIMIT", "2", "3"});
  ASSERT_THAT(resp.GetVec(), ElementsAre("cool", "down", "elephant"));

  resp = Run({"zrangebylex", "key", "-", "+", "LIMIT", "5", "1"});
  ASSERT_THAT(resp, "foo");
}

TEST_F(ZSetFamilyTest, ZRevRangeByLex) {
  Run({
      "zadd", "key",      "0", "alpha", "0", "bar",   "0", "cool", "0", "down",
      "0",    "elephant", "0", "foo",   "0", "great", "0", "hill", "0", "omega",
  });

  auto resp = Run({"zrevrangebylex", "key", "[cool", "-"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("cool", "bar", "alpha"));

  EXPECT_EQ(3, CheckedInt({"ZLEXCOUNT", "key", "(foo", "+"}));
  EXPECT_EQ(3, CheckedInt({"ZREMRANGEBYLEX", "key", "(foo", "+"}));

  resp = Run({"zrevrangebylex", "key", "+", "[a"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre("foo", "elephant", "down", "cool", "bar", "alpha"));

  Run({"zadd", "myzset", "0", "a", "0", "b", "0", "c", "0", "d", "0", "e", "0", "f", "0", "g"});
  resp = Run({"zrevrangebylex", "myzset", "(c", "-"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "a"));
}

TEST_F(ZSetFamilyTest, ZRange) {
  Run({"zadd", "key", "0", "a", "1", "d", "1", "b", "2", "c", "4", "e"});

  auto resp = Run({"zrange", "key", "0", "2"});
  ASSERT_THAT(resp, ArrLen(3));
  ASSERT_THAT(resp.GetVec(), ElementsAre("a", "b", "d"));

  resp = Run({"zrange", "key", "1", "3", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(6));
  ASSERT_THAT(resp.GetVec(), ElementsAre("b", "1", "d", "1", "c", "2"));

  resp = Run({"zrange", "key", "1", "3", "WITHSCORES", "REV"});
  ASSERT_THAT(resp, ArrLen(6));
  ASSERT_THAT(resp.GetVec(), ElementsAre("c", "2", "d", "1", "b", "1"));

  resp = Run({"zrange", "key", "(1", "4", "BYSCORE", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(4));
  ASSERT_THAT(resp.GetVec(), ElementsAre("c", "2", "e", "4"));

  resp = Run({"zrange", "key", "-", "d", "BYLEX", "BYSCORE"});
  EXPECT_THAT(resp, ErrArg("BYSCORE and BYLEX options are not compatible"));

  resp = Run({"zrange", "key", "0", "-1", "LIMIT", "3", "-1"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec(), ElementsAre("c", "e"));

  Run({"zremrangebyscore", "key", "0", "4"});

  Run({
      "zadd", "key",      "0", "alpha", "0", "bar",   "0", "cool", "0", "down",
      "0",    "elephant", "0", "foo",   "0", "great", "0", "hill", "0", "omega",
  });
  resp = Run({"zrange", "key", "-", "[cool", "BYLEX"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("alpha", "bar", "cool"));

  resp = Run({"zrange", "key", "[cool", "-", "REV", "BYLEX"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("cool", "bar", "alpha"));

  resp = Run({"zrange", "key", "+", "[cool", "REV", "BYLEX", "LIMIT", "2", "2"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("great", "foo"));

  resp = Run({"zrange", "key", "+", "[cool", "BYLEX", "LIMIT", "2", "2", "REV"});
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  EXPECT_THAT(resp.GetVec(), ElementsAre("great", "foo"));

  resp = Run({"zrange", "key", "5", "2147483648"});
  ASSERT_THAT(resp, RespElementsAre("foo", "great", "hill", "omega"));
}

TEST_F(ZSetFamilyTest, ZRevRange) {
  Run({"zadd", "key", "-inf", "a", "1", "b", "2", "c"});
  auto resp = Run({"zrevrangebyscore", "key", "2", "-inf"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "b", "a"));

  resp = Run({"zrevrangebyscore", "key", "2", "-inf", "withscores"});
  ASSERT_THAT(resp, ArrLen(6));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "2", "b", "1", "a", "-inf"));

  resp = Run({"zrevrange", "key", "0", "2"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "b", "a"));

  resp = Run({"zrevrange", "key", "1", "2", "withscores"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "1", "a", "-inf"));

  // Make sure that when using with upper case it works as well (see
  // https://github.com/dragonflydb/dragonfly/issues/326)
  resp = Run({"zrevrangebyscore", "key", "2", "-INF"});
  ASSERT_THAT(resp, ArrLen(3));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "b", "a"));

  resp = Run({"zrevrangebyscore", "key", "2", "-INF", "withscores"});
  ASSERT_THAT(resp, ArrLen(6));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "2", "b", "1", "a", "-inf"));
}

TEST_F(ZSetFamilyTest, ZScan) {
  auto resp = Run("zscan non-existing-key 100 count 5");
  ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
  ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::ARRAY)));
  EXPECT_EQ(ToSV(resp.GetVec()[0].GetBuf()), "0");
  EXPECT_EQ(StrArray(resp.GetVec()[1]).size(), 0);

  string prefix(128, 'a');
  for (unsigned i = 0; i < 100; ++i) {
    Run({"zadd", "key", "1", absl::StrCat(prefix, i)});
  }

  EXPECT_EQ(100, CheckedInt({"zcard", "key"}));
  int64_t cursor = 0;
  size_t scan_len = 0;
  do {
    auto resp = Run({"zscan", "key", absl::StrCat(cursor)});
    ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
    ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::ARRAY)));
    string_view token = ToSV(resp.GetVec()[0].GetBuf());
    ASSERT_TRUE(absl::SimpleAtoi(token, &cursor));
    auto sub_arr = resp.GetVec()[1].GetVec();
    scan_len += sub_arr.size();
  } while (cursor != 0);

  EXPECT_EQ(100 * 2, scan_len);

  // Check scan with count and match params
  scan_len = 0;
  do {
    auto resp = Run({"zscan", "key", absl::StrCat(cursor), "count", "5", "match", "*0"});
    ASSERT_THAT(resp, ArgType(RespExpr::ARRAY));
    ASSERT_THAT(resp.GetVec(), ElementsAre(ArgType(RespExpr::STRING), ArgType(RespExpr::ARRAY)));
    string_view token = ToSV(resp.GetVec()[0].GetBuf());
    ASSERT_TRUE(absl::SimpleAtoi(token, &cursor));
    auto sub_arr = resp.GetVec()[1].GetVec();
    scan_len += sub_arr.size();
  } while (cursor != 0);
  EXPECT_EQ(10 * 2, scan_len);  // expected members a0,a10,a20..,a90
}

TEST_F(ZSetFamilyTest, ZUnionError) {
  RespExpr resp;

  resp = Run({"zunion", "0"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zunion", "0", "myset"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "k"});
  EXPECT_THAT(resp, ErrArg("weight value is not a float"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "2", "aggregate", "something"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "2", "aggregate", "something"});
  EXPECT_THAT(resp, ErrArg("weight value is not a float"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "aggregate", "sum", "somescore"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "withscores", "someargs"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunion", "1"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zunion", "2", "z1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunion", "2", "z1", "z2", "z3"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunion", "2", "z1", "z2", "weights", "1", "2", "3"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(ZSetFamilyTest, ZUnion) {
  RespExpr resp;

  EXPECT_EQ(2, CheckedInt({"zadd", "z1", "1", "a", "3", "b"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "3", "c", "2", "b"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z3", "1", "c", "1", "d"}));

  resp = Run({"zunion", "3", "z1", "z2", "z3"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "d", "c", "b"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "2"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "d", "b", "c"));

  // Cover union of sets and zsets
  EXPECT_EQ(2, CheckedInt({"sadd", "s2", "b", "c"}));
  resp = Run({"zunion", "2", "z1", "s2", "weights", "1", "2", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "c", "2", "b", "5"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "2", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "d", "2", "b", "5", "c", "5"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "2", "aggregate", "min",
              "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "b", "2", "c", "2", "d", "2"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "withscores", "weights", "1", "1", "2", "aggregate",
              "min"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "b", "2", "c", "2", "d", "2"));

  resp = Run({"zunion", "3", "none1", "none2", "z3", "withscores", "weights", "1", "1", "2"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "2", "d", "2"));

  resp = Run({"zunion", "3", "z1", "z2", "z3", "weights", "1", "1", "2", "aggregate", "max",
              "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "d", "2", "b", "3", "c", "3"));

  resp = Run({"zunion", "1", "z1", "weights", "2", "aggregate", "max", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "2", "b", "6"));

  for (unsigned i = 0; i < 256; ++i) {
    Run({"zadd", "large1", "1000", absl::StrCat("aaaaaaaaaa", i)});
    Run({"zadd", "large2", "1000", absl::StrCat("bbbbbbbbbb", i)});
    Run({"zadd", "large2", "1000", absl::StrCat("aaaaaaaaaa", i)});
  }
  resp = Run({"zunion", "2", "large2", "large1"});
  EXPECT_THAT(resp, ArrLen(512));
}

TEST_F(ZSetFamilyTest, ZUnionStore) {
  RespExpr resp;

  resp = Run({"zunionstore", "key", "0"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zunionstore", "key", "0", "aggregate"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  resp = Run({"zunionstore", "key", "0", "aggregate", "sum"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));
  resp = Run({"zunionstore", "key", "-1", "aggregate", "sum"});
  EXPECT_THAT(resp, ErrArg("out of range"));
  resp = Run({"zunionstore", "key", "2", "foo", "bar", "weights", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  EXPECT_EQ(2, CheckedInt({"zadd", "z1", "1", "a", "2", "b"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "3", "c", "2", "b"}));

  resp = Run({"zunionstore", "key", "2", "z1", "z2"});
  EXPECT_THAT(resp, IntArg(3));
  resp = Run({"zrange", "key", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "c", "3", "b", "4"));

  resp = Run({"zunionstore", "z1", "1", "z1"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zunionstore", "z1", "2", "z1", "z2"});
  EXPECT_THAT(resp, IntArg(3));
  resp = Run({"zrange", "z1", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "c", "3", "b", "4"));

  Run({"set", "foo", "bar"});
  resp = Run({"zunionstore", "foo", "1", "z2"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"zrange", "foo", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "2", "c", "3"));
}

// Check that ZUNIONSTORE overwrites a value including resetting its expiration
TEST_F(ZSetFamilyTest, ZUnionStoreExpiration) {
  EXPECT_THAT(Run({"zadd", "z1", "1", "a", "2", "b"}), IntArg(2));
  EXPECT_THAT(Run({"zadd", "z2", "3", "c", "2", "b"}), IntArg(2));

  Run({"set", "target", "some-value"});
  EXPECT_THAT(Run({"expire", "target", "1010"}), IntArg(1));
  EXPECT_THAT(Run({"ttl", "target"}), IntArg(1010));

  EXPECT_THAT(Run({"zunionstore", "target", "2", "z1", "z2"}), IntArg(3));
  EXPECT_THAT(Run({"ttl", "target"}), IntArg(-1));
}

TEST_F(ZSetFamilyTest, ZUnionStoreOpts) {
  EXPECT_EQ(2, CheckedInt({"zadd", "z1", "1", "a", "2", "b"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "3", "c", "2", "b"}));
  RespExpr resp;

  EXPECT_EQ(3, CheckedInt({"zunionstore", "a", "2", "z1", "z2", "weights", "1", "3"}));
  resp = Run({"zrange", "a", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "b", "8", "c", "9"));

  resp = Run({"zunionstore", "a", "2", "z1", "z2", "weights", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  resp = Run({"zunionstore", "z1", "1", "z1", "weights", "2"});
  EXPECT_THAT(resp, IntArg(2));
  resp = Run({"zrange", "z1", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "2", "b", "4"));

  resp = Run({"zunionstore", "max", "2", "z1", "z2", "weights", "1", "0", "aggregate", "max"});
  ASSERT_THAT(resp, IntArg(3));
  resp = Run({"zrange", "max", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "0", "a", "2", "b", "4"));

  // Check that infinity is handled correctly.
  Run({"ZADD", "src1", "inf", "x"});
  Run({"ZADD", "src2", "inf", "x"});
  Run({"ZUNIONSTORE", "dest", "2", "src1", "src2", "WEIGHTS", "1.0", "0.0"});
  resp = Run({"ZSCORE", "dest", "x"});
  EXPECT_THAT(resp, DoubleArg(numeric_limits<double>::infinity()));

  Run({"ZADD", "foo", "inf", "e1"});
  Run({"ZADD", "bar", "-inf", "e1", "0.0", "e2"});
  Run({"ZUNIONSTORE", "dest", "3", "foo", "bar", "foo"});
  resp = Run({"ZSCORE", "dest", "e1"});
  EXPECT_THAT(resp, DoubleArg(0));
}

TEST_F(ZSetFamilyTest, ZInterStore) {
  EXPECT_EQ(2, CheckedInt({"zadd", "z1", "1", "a", "2", "b"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "3", "c", "2", "b"}));
  RespExpr resp;

  EXPECT_EQ(1, CheckedInt({"zinterstore", "a", "2", "z1", "z2"}));
  resp = Run({"zrange", "a", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "4"));

  // support for sets
  EXPECT_EQ(2, CheckedInt({"sadd", "s2", "b", "c"}));
  EXPECT_EQ(1, CheckedInt({"zinterstore", "b", "2", "z1", "s2"}));
  resp = Run({"zrange", "b", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "3"));

  Run({"ZADD", "foo", "10", "a"});
  EXPECT_EQ(1, CheckedInt({"ZINTERSTORE", "bar", "1", "foo", "weights", "2"}));
  resp = Run({"zrange", "bar", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "20"));
}

TEST_F(ZSetFamilyTest, ZInter) {
  EXPECT_EQ(2, CheckedInt({"zadd", "z1", "1", "one", "2", "two"}));
  EXPECT_EQ(3, CheckedInt({"zadd", "z2", "1", "one", "2", "two", "3", "three"}));
  RespExpr resp;

  resp = Run({"zinter", "2", "z1", "z2"});
  EXPECT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "two"));

  EXPECT_EQ(3, CheckedInt({"zadd", "z3", "1", "one", "2", "two", "3", "three"}));
  EXPECT_EQ(3, CheckedInt({"zadd", "z4", "4", "four", "5", "five", "6", "six"}));
  EXPECT_EQ(1, CheckedInt({"zadd", "z5", "6", "six"}));

  resp = Run({"zinter", "3", "z3", "z4", "z5"});
  EXPECT_THAT(resp, ArrLen(0));

  // zinter output sorts keys with equal scores lexicographically
  Run({"del", "z1", "z2", "z3", "z4", "z5"});
  Run({"zadd", "z1", "1", "e", "1", "a", "1", "b", "1", "x"});
  Run({"zadd", "z2", "1", "e", "1", "a", "1", "b", "1", "y"});
  Run({"zadd", "z3", "1", "e", "1", "a", "1", "b", "1", "z"});
  Run({"zadd", "z4", "1", "e", "1", "a", "1", "b", "1", "o"});
  EXPECT_THAT(Run({"zinter", "4", "z1", "z2", "z3", "z4"}).GetVec(), ElementsAre("a", "b", "e"));
}

TEST_F(ZSetFamilyTest, ZInterCard) {
  EXPECT_EQ(3, CheckedInt({"zadd", "z1", "1", "a", "2", "b", "3", "c"}));
  EXPECT_EQ(3, CheckedInt({"zadd", "z2", "2", "b", "3", "c", "4", "d"}));

  EXPECT_EQ(2, CheckedInt({"zintercard", "2", "z1", "z2"}));
  EXPECT_EQ(1, CheckedInt({"zintercard", "2", "z1", "z2", "LIMIT", "1"}));

  RespExpr resp;

  resp = Run({"zintercard", "2", "z1", "z2", "LIM"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
  resp = Run({"zintercard", "2", "z1", "z2", "LIMIT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
  resp = Run({"zintercard", "2", "z1", "z2", "LIMIT", "a"});
  EXPECT_THAT(resp, ErrArg("limit value is not a positive integer"));

  resp = Run({"zintercard", "0", "z1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input"));

  // support for sets
  EXPECT_EQ(3, CheckedInt({"sadd", "s2", "b", "c", "d"}));
  EXPECT_EQ(2, CheckedInt({"zintercard", "2", "z1", "s2"}));
}

TEST_F(ZSetFamilyTest, ZAddBug148) {
  auto resp = Run({"zadd", "key", "1", "9fe9f1eb"});
  EXPECT_THAT(resp, IntArg(1));
}

TEST_F(ZSetFamilyTest, ZMPopInvalidSyntax) {
  // Not enough arguments.
  auto resp = Run({"zmpop", "1", "a"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // Zero keys.
  resp = Run({"zmpop", "0", "MIN", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  // Number of keys not uint.
  resp = Run({"zmpop", "aa", "a", "MIN"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Missing MIN/MAX.
  resp = Run({"zmpop", "1", "a", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Wrong number of keys.
  resp = Run({"zmpop", "1", "a", "b", "MAX"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Count with no number.
  resp = Run({"zmpop", "1", "a", "MAX", "COUNT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Count number is not uint.
  resp = Run({"zmpop", "1", "a", "MIN", "COUNT", "boo"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Too many arguments.
  resp = Run({"zmpop", "1", "c", "MAX", "COUNT", "2", "foo"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
}

TEST_F(ZSetFamilyTest, ZMPop) {
  // All sets are empty.
  auto resp = Run({"zmpop", "1", "e", "MIN"});
  EXPECT_THAT(resp, ArgType(RespExpr::NIL));

  // Min operation.
  resp = Run({"zadd", "a", "1", "a1", "2", "a2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zmpop", "1", "a", "MIN"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("a", {{"a1", "1"}}));

  resp = Run({"ZRANGE", "a", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("a2", "2")));

  // Max operation.
  resp = Run({"zadd", "b", "1", "b1", "2", "b2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zmpop", "1", "b", "MAX"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("b", {{"b2", "2"}}));

  resp = Run({"ZRANGE", "b", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("b1", "1")));

  // Count > 1.
  resp = Run({"zadd", "c", "1", "c1", "2", "c2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zmpop", "1", "c", "MAX", "COUNT", "2"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("c", {{"c1", "1"}, {"c2", "2"}}));

  resp = Run({"zcard", "c"});
  EXPECT_THAT(resp, IntArg(0));

  // Count > #elements in set.
  resp = Run({"zadd", "d", "1", "d1", "2", "d2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"zmpop", "1", "d", "MAX", "COUNT", "3"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("d", {{"d1", "1"}, {"d2", "2"}}));

  resp = Run({"zcard", "d"});
  EXPECT_THAT(resp, IntArg(0));

  // First non empty set is not the first set.
  resp = Run({"zadd", "x", "1", "x1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zadd", "y", "1", "y1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zmpop", "3", "empty", "x", "y", "MAX"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("x", {{"x1", "1"}}));

  resp = Run({"zcard", "x"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ZRANGE", "y", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("y1", "1")));
}

TEST_F(ZSetFamilyTest, BZMPopInvalidSyntax) {
  // Not enough arguments.
  auto resp = Run({"bzmpop", "1", "1", "a"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  // Zero keys.
  resp = Run({"bzmpop", "1", "0", "MIN", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  // Number of keys not uint.
  resp = Run({"bzmpop", "1", "aa", "a", "MIN"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Missing MIN/MAX.
  resp = Run({"bzmpop", "1", "1", "a", "COUNT", "1"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Wrong number of keys.
  resp = Run({"bzmpop", "1", "1", "a", "b", "MAX"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Count with no number.
  resp = Run({"bzmpop", "1", "1", "a", "MAX", "COUNT"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Count number is not uint.
  resp = Run({"bzmpop", "1", "1", "a", "MIN", "COUNT", "boo"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  // Too many arguments.
  resp = Run({"bzmpop", "1", "1", "c", "MAX", "COUNT", "2", "foo"});
  EXPECT_THAT(resp, ErrArg("syntax error"));

  // Negative time argument.
  resp = Run({"bzmpop", "-1", "1", "a", "MIN"});
  EXPECT_THAT(resp, ErrArg("timeout is negative"));
}

TEST_F(ZSetFamilyTest, BZMPop) {
  // Min operation.
  auto resp = Run({"zadd", "a", "1", "a1", "2", "a2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"bzmpop", "1", "1", "a", "MIN"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("a", {{"a1", "1"}}));

  resp = Run({"ZRANGE", "a", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("a2", "2")));

  // Max operation.
  resp = Run({"zadd", "b", "1", "b1", "2", "b2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"bzmpop", "1", "1", "b", "MAX"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("b", {{"b2", "2"}}));

  resp = Run({"ZRANGE", "b", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("b1", "1")));

  // Count > 1.
  resp = Run({"zadd", "c", "1", "c1", "2", "c2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"bzmpop", "1", "1", "c", "MAX", "COUNT", "2"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("c", {{"c1", "1"}, {"c2", "2"}}));

  resp = Run({"zcard", "c"});
  EXPECT_THAT(resp, IntArg(0));

  // Count > #elements in set.
  resp = Run({"zadd", "d", "1", "d1", "2", "d2"});
  EXPECT_THAT(resp, IntArg(2));

  resp = Run({"bzmpop", "1", "1", "d", "MAX", "COUNT", "3"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("d", {{"d1", "1"}, {"d2", "2"}}));

  resp = Run({"zcard", "d"});
  EXPECT_THAT(resp, IntArg(0));

  // First non empty set is not the first set.
  resp = Run({"zadd", "x", "1", "x1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"zadd", "y", "1", "y1"});
  EXPECT_THAT(resp, IntArg(1));

  resp = Run({"bzmpop", "1", "3", "empty", "x", "y", "MAX"});
  EXPECT_THAT(resp, ContainsLabeledScoredArray("x", {{"x1", "1"}}));

  resp = Run({"zcard", "x"});
  EXPECT_THAT(resp, IntArg(0));

  resp = Run({"ZRANGE", "y", "0", "-1", "WITHSCORES"});
  EXPECT_THAT(resp, RespArray(ElementsAre("y1", "1")));
}

TEST_F(ZSetFamilyTest, BMPOPBlockingTimeout) {
  RespExpr resp0;

  auto start = absl::Now();
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"BZMPOP", "1", "1", "zset1", "MIN"});
    LOG(INFO) << "BZMPOP";
  });
  fb0.Join();
  auto dur = absl::Now() - start;

  // Check that the timeout duration is not too crazy.
  EXPECT_LT(AbsDuration(dur - absl::Milliseconds(1000)), absl::Milliseconds(300));
  EXPECT_THAT(resp0, ArgType(RespExpr::NIL));
}

TEST_F(ZSetFamilyTest, ZPopMin) {
  auto resp = Run({"zadd", "key", "1", "a", "2", "b", "3", "c", "4", "d", "5", "e", "6", "f"});
  EXPECT_THAT(resp, IntArg(6));

  resp = Run({"zpopmin", "key"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1"));

  resp = Run({"zpopmin", "key", "0"});
  ASSERT_THAT(resp, ArrLen(0));

  resp = Run({"zpopmin", "key", "2"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "2", "c", "3"));

  resp = Run({"zpopmin", "key", "-1"});
  ASSERT_THAT(resp, ErrArg("value is out of range, must be positive"));

  resp = Run({"zpopmin", "key", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("d", "4"));

  resp = Run({"zpopmin", "key", "3"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), ElementsAre("e", "5", "f", "6"));

  resp = Run({"zpopmin", "key", "1"});
  ASSERT_THAT(resp, ArrLen(0));
}

TEST_F(ZSetFamilyTest, ZPopMax) {
  auto resp = Run({"zadd", "key", "1", "a", "2", "b", "3", "c", "4", "d", "5", "e", "6", "f"});
  EXPECT_THAT(resp, IntArg(6));

  resp = Run({"zpopmax", "key"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("f", "6"));

  resp = Run({"zpopmax", "key", "2"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), ElementsAre("e", "5", "d", "4"));

  resp = Run({"zpopmax", "key", "-1"});
  ASSERT_THAT(resp, ErrArg("value is out of range, must be positive"));

  resp = Run({"zpopmax", "key", "1"});
  ASSERT_THAT(resp, ArrLen(2));
  EXPECT_THAT(resp.GetVec(), ElementsAre("c", "3"));

  resp = Run({"zpopmax", "key", "3"});
  ASSERT_THAT(resp, ArrLen(4));
  EXPECT_THAT(resp.GetVec(), ElementsAre("b", "2", "a", "1"));

  resp = Run({"zpopmax", "key", "1"});
  ASSERT_THAT(resp, ArrLen(0));
}

TEST_F(ZSetFamilyTest, ZAddPopCrash) {
  for (int i = 0; i < 129; ++i) {
    auto resp = Run({"zadd", "key", absl::StrCat(i), absl::StrCat("element:", i)});
    EXPECT_THAT(resp, IntArg(1)) << i;
  }

  auto resp = Run({"zpopmin", "key"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("element:0", "0"));
}

TEST_F(ZSetFamilyTest, Resp3) {
  Run({"hello", "3"});
  Run({"zadd", "x", "1", "a", "2", "b"});
  auto resp = Run({"zrange", "x", "0", "-1", "WITHSCORES"});
  ASSERT_THAT(resp, ArrLen(2));
  ASSERT_THAT(resp.GetVec()[0].GetVec(), ElementsAre("a", DoubleArg(1)));
  ASSERT_THAT(resp.GetVec()[1].GetVec(), ElementsAre("b", DoubleArg(2)));
}

TEST_F(ZSetFamilyTest, BlockingIsReleased) {
  // Inputs for ZSET store commands.
  Run({"ZADD", "A", "1", "x", "2", "b"});
  Run({"ZADD", "B", "1", "x", "3", "b"});
  Run({"ZADD", "C", "1", "x", "10", "a"});
  Run({"ZADD", "D", "1", "x", "5", "c"});
  Run({"ZADD", "E", "2", "x", "1", "c"});
  Run({"ZADD", "F", "1", "c"});

  vector<string> blocking_keys{"zset1", "zset2", "zset3"};
  for (const auto& key : blocking_keys) {
    vector<vector<string>> unblocking_commands;
    // All commands output the same set {2,x}.
    unblocking_commands.push_back({"ZADD", key, "2", "x", "10", "y"});
    unblocking_commands.push_back({"ZINCRBY", key, "2", "x"});
    unblocking_commands.push_back({"ZINTERSTORE", key, "2", "A", "B"});
    unblocking_commands.push_back({"ZUNIONSTORE", key, "2", "C", "D"});
    unblocking_commands.push_back({"ZDIFFSTORE", key, "2", "E", "F"});

    for (auto& cmd : unblocking_commands) {
      RespExpr resp0;
      auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
        resp0 = Run({"BZPOPMIN", "zset1", "zset2", "zset3", "0"});
        LOG(INFO) << "BZPOPMIN";
      });

      pp_->at(1)->Await([&] { return Run({cmd.data(), cmd.size()}); });
      fb0.Join();

      ASSERT_THAT(resp0, ArrLen(3)) << cmd[0];
      EXPECT_THAT(resp0.GetVec(), ElementsAre(key, "x", "2")) << cmd[0];

      Run({"DEL", key});
    }

    // Tests for BZMPOP command
    for (auto& cmd : unblocking_commands) {
      RespExpr resp0;
      auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
        resp0 = Run({"BZMPOP", "0", "3", "zset1", "zset2", "zset3", "MIN"});
        LOG(INFO) << "BZMPOP";
      });

      pp_->at(1)->Await([&] { return Run({cmd.data(), cmd.size()}); });
      fb0.Join();

      ASSERT_THAT(resp0, ArrLen(2)) << cmd[0];
      EXPECT_THAT(resp0, ContainsLabeledScoredArray(key, {{"x", "2"}})) << cmd[0];

      Run({"DEL", key});
    }
  }
}

TEST_F(ZSetFamilyTest, BlockingWithIncorrectType) {
  RespExpr resp0;
  RespExpr resp1;
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"BLPOP", "list1", "0"});
  });
  auto fb1 = pp_->at(1)->LaunchFiber(Launch::dispatch, [&] {
    resp1 = Run({"BZPOPMIN", "list1", "0"});
  });

  ThisFiber::SleepFor(50us);
  pp_->at(2)->Await([&] { return Run({"ZADD", "list1", "1", "a"}); });
  pp_->at(2)->Await([&] { return Run({"LPUSH", "list1", "0"}); });
  fb0.Join();
  fb1.Join();

  EXPECT_THAT(resp1.GetVec(), ElementsAre("list1", "a", "1"));
  EXPECT_THAT(resp0.GetVec(), ElementsAre("list1", "0"));
}

TEST_F(ZSetFamilyTest, BlockingTimeout) {
  RespExpr resp0;

  auto start = absl::Now();
  auto fb0 = pp_->at(0)->LaunchFiber(Launch::dispatch, [&] {
    resp0 = Run({"BZPOPMIN", "zset1", "1"});
    LOG(INFO) << "BZPOPMIN";
  });
  fb0.Join();
  auto dur = absl::Now() - start;

  // Check that the timeout duration is not too crazy.
  EXPECT_LT(AbsDuration(dur - absl::Milliseconds(1000)), absl::Milliseconds(300));
  EXPECT_THAT(resp0, ArgType(RespExpr::NIL_ARRAY));
}

TEST_F(ZSetFamilyTest, ZDiffError) {
  RespExpr resp;

  resp = Run({"zdiff", "-1", "z1"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  resp = Run({"zdiff", "0"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zdiff", "0", "z1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  resp = Run({"zdiff", "0", "z1", "z2"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  EXPECT_EQ(1, CheckedInt({"sadd", "s1", "one"}));

  resp = Run({"zdiff", "2", "z1", "s1"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));

  resp = Run({"zdiff", "2", "s1", "z2"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));
}

TEST_F(ZSetFamilyTest, ZDiff) {
  RespExpr resp;

  EXPECT_EQ(4, CheckedInt({"zadd", "z1", "1", "one", "2", "two", "3", "three", "4", "four"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "1", "one", "5", "five"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z3", "2", "two", "3", "three"}));
  EXPECT_EQ(1, CheckedInt({"zadd", "z4", "4", "four"}));

  resp = Run({"zdiff", "1", "z1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "two", "three", "four"));

  resp = Run({"zdiff", "2", "z1", "z1"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"zdiff", "2", "z1", "doesnt_exist"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "two", "three", "four"));

  resp = Run({"zdiff", "2", "z1", "z2"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("two", "three", "four"));

  resp = Run({"zdiff", "2", "z1", "z3"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "four"));

  resp = Run({"zdiff", "4", "z1", "z2", "z3", "z4"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"zdiff", "2", "doesnt_exist", "key1"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  // WITHSCORES
  resp = Run({"zdiff", "1", "z1", "WITHSCORES"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "1", "two", "2", "three", "3", "four", "4"));

  resp = Run({"zdiff", "2", "z1", "z2", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("two", "2", "three", "3", "four", "4"));
}

TEST_F(ZSetFamilyTest, ZDiff_Resp3) {
  Run({"hello", "3"});
  EXPECT_EQ(4, CheckedInt({"zadd", "z1", "1", "one", "2", "two", "3", "three", "4", "four"}));

  auto resp = Run({"zdiff", "1", "z1", "withscores"});
  ASSERT_THAT(resp, ArrLen(4));
  ASSERT_THAT(resp.GetVec()[0].GetVec(), ElementsAre("one", DoubleArg(1)));
  ASSERT_THAT(resp.GetVec()[1].GetVec(), ElementsAre("two", DoubleArg(2)));
  ASSERT_THAT(resp.GetVec()[2].GetVec(), ElementsAre("three", DoubleArg(3)));
  ASSERT_THAT(resp.GetVec()[3].GetVec(), ElementsAre("four", DoubleArg(4)));
}

TEST_F(ZSetFamilyTest, ZDiffStoreError) {
  RespExpr resp;

  resp = Run({"zdiffstore", "key"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zdiffstore", "key", "0"});
  EXPECT_THAT(resp, ErrArg("wrong number of arguments"));

  resp = Run({"zdiffstore", "key", "-1", "z1"});
  EXPECT_THAT(resp, ErrArg("value is not an integer or out of range"));

  resp = Run({"zdiffstore", "key", "0", "z1"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  resp = Run({"zdiffstore", "key", "0", "z1", "z2"});
  EXPECT_THAT(resp, ErrArg("at least 1 input key is needed"));

  EXPECT_EQ(1, CheckedInt({"sadd", "s1", "one"}));

  resp = Run({"zdiffstore", "key", "2", "z1", "s1"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));

  resp = Run({"zdiffstore", "key", "2", "s1", "z2"});
  EXPECT_THAT(resp, ErrArg("WRONGTYPE Operation against a key holding the wrong kind of value"));
}

TEST_F(ZSetFamilyTest, ZDiffStore) {
  RespExpr resp;

  EXPECT_EQ(4, CheckedInt({"zadd", "z1", "1", "one", "2", "two", "3", "three", "4", "four"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z2", "1", "one", "5", "five"}));
  EXPECT_EQ(2, CheckedInt({"zadd", "z3", "2", "two", "3", "three"}));
  EXPECT_EQ(1, CheckedInt({"zadd", "z4", "4", "four"}));

  resp = Run({"zdiffstore", "key", "1", "z1"});
  EXPECT_THAT(resp, IntArg(4));
  resp = Run({"zrange", "key", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "1", "two", "2", "three", "3", "four", "4"));

  resp = Run({"zdiffstore", "key", "2", "z1", "z1"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"zrange", "key", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"zdiffstore", "key", "4", "z1", "z2", "z3", "z4"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"zrange", "key", "0", "-1"});
  EXPECT_THAT(resp.GetVec().empty(), true);

  resp = Run({"zdiffstore", "key", "2", "z1", "doesnt_exist"});
  EXPECT_THAT(resp, IntArg(4));
  resp = Run({"zrange", "key", "0", "-1"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("one", "two", "three", "four"));

  resp = Run({"zdiffstore", "key", "2", "doesnt_exits", "z1"});
  EXPECT_THAT(resp, IntArg(0));
  resp = Run({"zrange", "key", "0", "-1"});
  EXPECT_THAT(resp.GetVec().empty(), true);
}

TEST_F(ZSetFamilyTest, Count) {
  for (int i = 0; i < 129; ++i) {
    auto resp = Run({"zadd", "key", absl::StrCat(i), absl::StrCat("element:", i)});
    EXPECT_THAT(resp, IntArg(1)) << i;
  }

  EXPECT_THAT(CheckedInt({"zcount", "key", "-inf", "+inf"}), 129);
  EXPECT_THAT(CheckedInt({"zlexcount", "key", "-", "+"}), 129);

  // Listpack object
  Run({"ZADD", "short", "0", "A"});
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "short", "-", "-"}), 0);
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "short", "+", "+"}), 0);
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "short", "+", "-"}), 0);

  // Sortedset object
  Run({"ZADD", "long", "0", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"});
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "long", "-", "-"}), 0);
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "long", "+", "+"}), 0);
  EXPECT_THAT(CheckedInt({"ZLEXCOUNT", "long", "+", "-"}), 0);
}

TEST_F(ZSetFamilyTest, RangeLimit) {
  auto resp = Run({"ZRANGEBYSCORE", "", "0.0", "0.0", "limit", "0"});
  EXPECT_THAT(resp, ErrArg("syntax error"));
  resp = Run({"ZRANGEBYSCORE", "", "0.0", "0.0", "limit", "0", "0"});
  EXPECT_THAT(resp, ArrLen(0));

  resp = Run({"ZRANGEBYSCORE", "", "0.0", "0.0", "foo"});
  EXPECT_THAT(resp, ErrArg("unsupported option"));

  resp = Run({"ZRANGEBYLEX", "foo", "-", "+", "LIMIT", "-1", "3"});
  EXPECT_THAT(resp, ArrLen(0));
}

TEST_F(ZSetFamilyTest, RangeStore) {
  EXPECT_EQ(3, CheckedInt({"ZADD", "src", "1", "a", "2", "b", "3", "c"}));
  EXPECT_EQ(3, CheckedInt({"ZRANGESTORE", "dest", "src", "0", "-1"}));

  RespExpr resp = Run({"ZRANGE", "dest", "0", "-1", "withscores"});
  EXPECT_THAT(resp.GetVec(), ElementsAre("a", "1", "b", "2", "c", "3"));

  // Override dest.

  EXPECT_EQ(0, CheckedInt({"ZRANGESTORE", "dest", "not-found", "0", "-1"}));

  resp = Run({"ZRANGE", "dest", "0", "-1"});
  EXPECT_THAT(resp, ArrLen(0));
}

TEST_F(ZSetFamilyTest, ZRangeZeroElements) {
  Run({"zadd", "myzset", "1", "one"});
  auto resp = Run({"ZRANGE", "myzset", "0", "-1", "LIMIT", "2", "10"});
  ASSERT_THAT(resp, ArrLen(0));
}

TEST_F(ZSetFamilyTest, ZCountMinGreaterThanMaxCrash) {
  // Add 1000 members to the sorted set
  for (int i = 1; i <= 1000; ++i) {
    Run({"zadd", "huge_key", absl::StrCat(i), absl::StrCat("member", i)});
  }

  // Expect ZCOUNT to return 0 when min > max
  auto resp = Run({"zcount", "huge_key", "945", "261"});
  EXPECT_THAT(resp, IntArg(0));
}

}  // namespace dfly


================================================
FILE: tests/README.md
================================================
# System tests


## Pytest

The tests assume you have the "dragonfly" binary in `<root>/build-dbg` directory.
You can override the location of the binary using `DRAGONFLY_PATH` environment var.

### Important fixtures

- `df_server` is the default instance that is available for testing. Use the `dfly_args` decorator to change its default arguments.
- `client` and `async_client` are clients to the default instance. The default instance is re-used accross tests with the same arguments, but each new client flushes the instance.
- `pool` and `async_pool` are client pools that are connected to the default instance

### Custom arguments

- use `--gdb` to start all instances inside gdb.
- use `--df arg=val` to pass custom arguments to all dragonfly instances. Can be used multiple times.
- use `--log-seeder file` to store all single-db commands from the lastest tests seeder inside file.
- use `--existing-port` to use an existing instance for tests instead of starting one
- use `--rand-seed` to set the global random seed. Makes the seeder predictable.
- use `--repeat <N>` to run a test multiple times.

for example,

```sh
pytest dragonfly/connection_test.py -s  --df logtostdout --df vmodule=dragonfly_connection=2 -k test_subscribe
```
### Before you start
Please make sure that you have python 3 installed on you local host.
If have more both python 2 and python 3 installed on you host, you can run the tests with the following command:
```
python3 -m pytest -xv dragonfly
```
It is advisable to use you python virtual environment: [python virtual environment](https://docs.python.org/3/library/venv.html).
To activate it, run:
```
source <virtual env name>/bin/activate
```
Then install all the required dependencies for the tests:
```
pip3 install -r dragonfly/requirements.txt
```

### Running the tests
to run pytest, run:
`pytest -xv dragonfly`

to run selectively, use:
`pytest -xv dragonfly -k <substring>`
For more pytest flags [check here](https://fig.io/manual/pytest).

## Writing tests
The [Getting Started](https://docs.pytest.org/en/7.1.x/getting-started.html) guide is a great resource to become familiar with writing pytest test cases.

Pytest will recursively search the `tests/dragonfly` directory for files matching the patterns `test_*.py` or `*_test.py` for functions matching these [rules](https://docs.pytest.org/en/7.1.x/explanation/goodpractices.html#conventions-for-python-test-discovery):
- Functions or methods outside of a class prefixed by `test`
- Functions or methods prefixed by `test` inside a class prefixed by `Test` (without an `__init__` method)

**Note**: When making a new directory in `tests/dragonfly` be sure to create an `__init__.py` file to avoid [name conflicts](https://docs.pytest.org/en/7.1.x/explanation/goodpractices.html#tests-outside-application-code)

### Passing CLI commands to Dragonfly
To pass custom flags to the Dragonfly executable two class decorators have been created. `@dfly_args` allows you to pass a list of parameters to the Dragonfly executable, similarly `@dfly_multi_test_args` allows you to specify multiple parameter configurations to test with a given test class.

In the case of `@dfly_multi_test_args` each parameter configuration will create one Dragonfly instance which each test will receive a client to as described in the [above section](#interacting-with-dragonfly)

Parameters can use environmental variables with a formatted string where `"{<VAR>}"` will be replaced with the value of the `<VAR>` environment variable. Due to [current pytest limtations](https://github.com/pytest-dev/pytest/issues/349) fixtures cannot be passed to either of these decorators, this is currently the provided way to pass the temporary directory path in a CLI parameter.

### Test Examples
- **[snapshot_test](./dragonfly/snapshot_test.py)**: Example test using `@dfly_args`, environment variables and pre-test setup
- **[generic_test](./dragonfly/generic_test.py)**: Example test using `@dfly_multi_test_args`
- **[connection_test](./dragonfly/connection_test.py)**: Example for testing running with multiple asynchronous connections.

### Writing your own fixtures
The fixture functions located in [conftest.py](./dragonfly/conftest.py).
You can write your own fixture inside this file, as seem fit. Just make sure, before adding new fixture that there maybe one already written.
Try to make the fixture running at the smallest scope possible to ensure that the test can be independent of each other (this will ensure no side effect - match our policy of "share nothing").

### Managing test environment
Do forget to add any new dependency that you may created to [dragonfly/requirement.txt](./dragonfly/requirements.txt) file.
You can do so by running
```
pip3 freeze > requirements.txt
```
from [dragonfly](./dragonfly/) directory.

# Integration tests
Integration tests are located in the `integration` folder.

To simplify running integration test each package should have its own Dockerfile. The Dockerfile should contain everything needed in order to test the package against Dragonfly. Docker can assume Dragonfly is running on localhost:6379.
To run the test:
```
docker build -t [test-name] -f [test-dockerfile-name] .
docker run --network=host [test-name]
```

## Node-Redis
Integration test for node-redis client.
Build:
```
docker build -t node-redis-test -f ./node-redis.Dockerfile .
```
Run:
```
docker run --network=host node-redis-test
```

to run only selected tests use:

```
docker run --network=host node-redis-test npm run test -w ./packages/client -- --redis-version=2.8 -g <regex>
```

In general, you can add this way any option from [mocha framework](https://mochajs.org/#command-line-usage).

## ioredis
NOTE: we are depending on some changes to ioredis test, in order to pass more tests, as we are currently failing
because in monitor command we always returning the command name in upper case, and the tests expected it to
be in lower case.

Integration tests for ioredis client.
[ioredis](https://github.com/luin/ioredis) is a robust, performance-focused and full-featured Redis client for Node.js.
It contains a very extensive test coverage for Redis. Currently not all features are supported by Dragonfly.
As such please use the scripts for running the test successfully -
 **[run_ioredis_on_docker.sh](./integration/run_ioredis_on_docker.sh)**: to run the supported tests on a docker image
 Please note that you can run this script in two forms:

 If the image is already build:
 ```
 ./integration/run_ioredis_on_docker.sh
 ```

A more safe way is to build the image (or ensure that it is up to date), and then execute the tests:
```
 ./integration/run_ioredis_on_docker.sh --build
 ```
 The the "--build" first build the image and then execute the tests.
 Please do not try to run out of docker image as this brings the correct version and patch some tests.
Please note that the script only run tests that are currently supported
You can just build the image with

Build:
```
docker build -t ioredis-test -f ./ioredis.Dockerfile .
```

For more details on the entrypoint setup, compare the `ioredis.Dockerfile`
with the npm test script located on the `package.json` of the ioredis project.

## Jedis
Integration test for the Jedis client.
Build:
```
docker build -t jedis-test -f ./jedis.Dockerfile .
```
Run:
```
docker run --network=host jedis-test
```


================================================
FILE: tests/dragonfly/__init__.py
================================================
import pytest


def dfly_args(*args):
    """Used to define a singular set of arguments for dragonfly test"""
    return pytest.mark.parametrize("df_factory", args, indirect=True)


def dfly_multi_test_args(*args):
    """Used to define multiple sets of arguments to test multiple dragonfly configurations"""
    return pytest.mark.parametrize("df_factory", args, indirect=True)


class PortPicker:
    """A simple port manager to allocate available ports for tests"""

    def __init__(self):
        self.next_port = 5555

    def get_available_port(self):
        while not self.is_port_available(self.next_port):
            self.next_port += 1
        self.next_port += 1
        return self.next_port - 1

    def is_port_available(self, port):
        import socket

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(("localhost", port)) != 0


================================================
FILE: tests/dragonfly/acl_family_test.py
================================================
import tempfile

import async_timeout

from . import dfly_args
from .utility import *


@pytest.mark.asyncio
async def test_acl_setuser(async_client):
    await async_client.execute_command("ACL SETUSER kostas")
    result = await async_client.execute_command("ACL LIST")
    assert 2 == len(result)
    assert "user kostas off resetchannels -@all $all" in result

    await async_client.execute_command("ACL SETUSER kostas ON")
    result = await async_client.execute_command("ACL LIST")
    assert "user kostas on resetchannels -@all $all" in result

    await async_client.execute_command("ACL SETUSER kostas +@list +@string +@admin")
    result = await async_client.execute_command("ACL LIST")
    # TODO consider printing to lowercase
    assert "user kostas on resetchannels -@all +@list +@string +@admin $all" in result

    await async_client.execute_command("ACL SETUSER kostas -@list -@admin")
    result = await async_client.execute_command("ACL LIST")
    assert "user kostas on resetchannels -@all +@string -@list -@admin $all" in result

    # mix and match
    await async_client.execute_command("ACL SETUSER kostas +@list -@string")
    result = await async_client.execute_command("ACL LIST")
    assert "user kostas on resetchannels -@all -@admin +@list -@string $all" in result

    # mix and match interleaved
    await async_client.execute_command("ACL SETUSER kostas +@set -@set +@set")
    result = await async_client.execute_command("ACL LIST")
    assert "user kostas on resetchannels -@all -@admin +@list -@string +@set $all" in result

    await async_client.execute_command("ACL SETUSER kostas +@all")
    result = await async_client.execute_command("ACL LIST")
    assert "user kostas on resetchannels -@admin +@list -@string +@set +@all $all" in result

    # commands
    await async_client.execute_command("ACL SETUSER kostas +set +get +hset")
    result = await async_client.execute_command("ACL LIST")
    assert (
        "user kostas on resetchannels -@admin +@list -@string +@set +@all +set +get +hset $all"
        in result
    )

    await async_client.execute_command("ACL SETUSER kostas -set -get +hset")
    result = await async_client.execute_command("ACL LIST")
    assert (
        "user kostas on resetchannels -@admin +@list -@string +@set +@all -set -get +hset $all"
        in result
    )

    # interleaved
    await async_client.execute_command("ACL SETUSER kostas -hset +get -get -@all")
    result = await async_client.execute_command("ACL LIST")
    assert (
        "user kostas on resetchannels -@admin +@list -@string +@set -set -hset -get -@all $all"
        in result
    )

    # interleaved with categories
    await async_client.execute_command("ACL SETUSER kostas +@string +get -get +set")
    result = await async_client.execute_command("ACL LIST")
    assert (
        "user kostas on resetchannels -@admin +@list +@set -hset -@all +@string -get +set $all"
        in result
    )


@pytest.mark.asyncio
async def test_acl_categories(async_client):
    await async_client.execute_command(
        "ACL SETUSER vlad ON >mypass -@all +@string +@list +@connection ~*"
    )

    result = await async_client.execute_command("AUTH vlad mypass")
    assert result == "OK"

    result = await async_client.execute_command("SET foo bar")
    assert result == "OK"

    result = await async_client.execute_command("LPUSH mykey space_monkey")
    assert result == 1

    # This should fail, vlad does not have @admin
    with pytest.raises(redis.exceptions.ResponseError):
        result = await async_client.execute_command("ACL SETUSER vlad ON >mypass")

    # This should fail, vlad does not have @sortedset
    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("ZADD myset 1 two")

    result = await async_client.execute_command("AUTH default nopass")
    assert result == "OK"

    # Make vlad an admin
    await async_client.execute_command("ACL SETUSER vlad -@string")
    assert result == "OK"

    result = await async_client.execute_command("AUTH vlad mypass")
    assert result == "OK"

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("GET foo")

    result = await async_client.execute_command("AUTH default nopass")
    assert result == "OK"

    # Vlad goes rogue starts giving admin stats to random users
    await async_client.execute_command("ACL SETUSER adi >adi +@admin")
    assert result == "OK"

    # Vlad can now execute everything
    await async_client.execute_command("ACL SETUSER vlad +@all")
    assert result == "OK"

    await async_client.execute_command("ZADD myset 1 two")
    assert result == "OK"


@pytest.mark.asyncio
async def test_acl_commands(async_client):
    await async_client.execute_command("ACL SETUSER random ON >mypass -@all +set +get ~*")

    result = await async_client.execute_command("AUTH random mypass")
    assert result == "OK"

    result = await async_client.execute_command("SET foo bar")
    assert result == "OK"

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("ZADD myset 1 two")


@pytest.mark.asyncio
async def test_acl_cat_commands_multi_exec_squash(df_factory):
    df = df_factory.create(multi_exec_squash=True, port=1111)

    df.start()

    # Testing acl categories
    client = aioredis.Redis(port=df.port, decode_responses=True)
    res = await client.execute_command("ACL SETUSER kk ON >kk +@transaction +@string ~*")
    assert res == "OK"

    res = await client.execute_command("AUTH kk kk")
    assert res == "OK"

    await client.execute_command("MULTI")
    assert res == "OK"
    for x in range(33):
        await client.execute_command(f"SET x{x} {x}")
    await client.execute_command("EXEC")

    await client.aclose()
    client = aioredis.Redis(port=df.port, decode_responses=True)

    # NOPERM while executing multi
    await client.execute_command("ACL SETUSER kk -@string")
    assert res == "OK"
    await client.execute_command("AUTH kk kk")
    assert res == "OK"
    await client.execute_command("MULTI")
    assert res == "OK"

    with pytest.raises(redis.exceptions.NoPermissionError):
        await client.execute_command(f"SET x bar")
    await client.aclose()

    # NOPERM between multi and exec
    admin_client = aioredis.Redis(port=df.port, decode_responses=True)
    res = await client.execute_command("ACL SETUSER kk +@string")
    assert res == "OK"

    client = aioredis.Redis(port=df.port, decode_responses=True)
    res = await client.execute_command("AUTH kk kk")
    assert res == "OK"
    # CLIENT has permissions, starts MULTI and issues a bunch of SET commands
    await client.execute_command("MULTI")
    assert res == "OK"
    for x in range(33):
        await client.execute_command(f"SET x{x} {x}")

    # revokes permissions after MULTI; ACL checks were done when the commands were queued,
    # so already-queued SET commands still execute successfully on EXEC
    res = await admin_client.execute_command("ACL SETUSER kk -@string")
    assert res == "OK"

    res = await client.execute_command("EXEC")
    for res in res:
        assert res == "OK"

    await admin_client.aclose()
    await client.aclose()

    # Testing acl commands
    client = aioredis.Redis(port=df.port, decode_responses=True)
    res = await client.execute_command("ACL SETUSER myuser ON >kk +@transaction +set ~*")
    assert res == "OK"

    await client.execute_command("AUTH myuser kk")
    assert "OK" == await client.execute_command("MULTI")
    await client.execute_command(f"SET x bar")
    await client.execute_command("EXEC")

    # NOPERM between multi and exec
    admin_client = aioredis.Redis(port=df.port, decode_responses=True)
    res = await admin_client.execute_command("ACL SETUSER myuser -set")
    assert res == "OK"

    # NOPERM while executing multi
    await client.execute_command("MULTI")

    # retry for a few seconds while the ACL SETUSER propagates, some SET commands might get through
    start = time.time()
    denied = False
    while not denied and time.time() - start < 10:
        try:
            await client.execute_command(f"SET x bar")
            await asyncio.sleep(0.1)
        except redis.exceptions.NoPermissionError:
            denied = True
        except Exception as e:
            assert False, f"failed with unexpected error: {e}"
    assert denied, "all SET commands succeeded unexpectedly defying ACL"


@pytest.mark.asyncio
async def test_acl_deluser(df_server):
    client = df_server.client()

    assert await client.execute_command("ACL SETUSER george ON >pass +@transaction +set ~*") == "OK"
    assert await client.execute_command("AUTH george pass") == "OK"

    assert await client.execute_command("MULTI") == "OK"
    assert await client.execute_command("SET the_answer 42") == "QUEUED"

    admin_client = df_server.client()
    assert await admin_client.execute_command("ACL DELUSER george") == 1

    # the connection was destroyed so EXEC will be executed in the new connection without MULTI
    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("EXEC")

    assert await client.execute_command("ACL WHOAMI") == "User is default"


script = """
for i = 1, 10000 do
  redis.call('SET', 'key', i)
  redis.call('SET', 'key1', i)
  redis.call('SET', 'key2', i)
  redis.call('SET', 'key3', i)
end
"""


@pytest.mark.asyncio
@pytest.mark.skip("Flaky on CI, needs investigation")
async def test_acl_del_user_while_running_lua_script(df_server):
    client = aioredis.Redis(port=df_server.port)
    await client.execute_command("ACL SETUSER kostas ON >kk +@string +@scripting ~*")
    await client.execute_command("AUTH kostas kk")
    admin_client = aioredis.Redis(port=df_server.port, decode_responses=True)

    eval_task = asyncio.create_task(client.eval(script, 4, "key", "key1", "key2", "key3"))

    # Let the script start
    await asyncio.sleep(0.1)

    # Delete the user while the script is running
    await admin_client.execute_command("ACL DELUSER kostas")

    # We expect the connection to be closed, so eval task should raise ConnectionError
    with pytest.raises(redis.exceptions.ConnectionError):
        await eval_task

    # The script should have run to completion on the server side.
    for i in range(1, 4):
        res = await admin_client.get(f"key{i}")
        assert res == "10000"


@pytest.mark.asyncio
@pytest.mark.skip("Check TODO in the body below")
async def test_acl_with_long_running_script(df_server):
    client = aioredis.Redis(port=df_server.port)
    await client.execute_command("ACL SETUSER roman ON >yoman +@string +@scripting ~*")
    await client.execute_command("AUTH roman yoman")
    admin_client = aioredis.Redis(port=df_server.port, decode_responses=True)

    eval_task = asyncio.create_task(client.eval(script, 4, "key", "key1", "key2", "key3"))

    # Let the script start
    await asyncio.sleep(0.1)

    # Change permissions while the script is running
    await admin_client.execute_command("ACL SETUSER roman -@string -@scripting")

    # The script should continue and finish successfully
    # TODO(fix): acl context should be immutable while the script is running. This requires
    # a "dummy" context so we can allow acl commands to run in parallel but we don't use stubs
    # anymore. Figure out a good solution for this.
    await eval_task

    for i in range(1, 4):
        res = await admin_client.get(f"key{i}")
        assert res == "10000"


def create_temp_file(content, tmp_dir):
    file = tempfile.NamedTemporaryFile(mode="w", dir=tmp_dir, delete=False)
    acl = os.path.join(tmp_dir, file.name)
    file.write(content)
    file.flush()
    return acl


@pytest.mark.asyncio
@dfly_args({"port": 1111})
async def test_bad_acl_file(df_factory, tmp_dir):
    acl = create_temp_file("ACL SETUSER kostas ON >mypass +@WRONG", tmp_dir)

    df = df_factory.create(aclfile=acl)

    df.start()

    client = aioredis.Redis(port=df.port)

    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("ACL LOAD")


@pytest.mark.asyncio
@dfly_args({"port": 1111})
async def test_good_acl_file(df_factory, tmp_dir):
    # The hash below is password temp
    acl = create_temp_file(
        "USER MrFoo ON #a6864eb339b0e1f6e00d75293a8840abf069a2c0fe82e6e53af6ac099793c1d5 >mypass &bar &r*nd",
        tmp_dir,
    )
    df = df_factory.create(aclfile=acl)

    df.start()
    client = df.client()

    await client.execute_command("ACL LOAD")
    result = await client.execute_command("ACL LIST")
    assert 2 == len(result)
    assert (
        "user MrFoo on #ea71c25a7a60224 #a6864eb339b0e1f resetchannels &bar &r*nd -@all $all"
        in result
        or "user MrFoo on #a6864eb339b0e1f #ea71c25a7a60224 resetchannels &bar &r*nd -@all $all"
        in result
    )
    assert "user default on nopass ~* &* +@all $all" in result
    await client.execute_command("ACL SETUSER MrFoo +@all $0")
    # Check multiple passwords work
    assert "OK" == await client.execute_command("AUTH mypass")
    assert "OK" == await client.execute_command("AUTH temp")
    assert "OK" == await client.execute_command("AUTH default")
    await client.execute_command("ACL DELUSER MrFoo")

    await client.execute_command("ACL SETUSER roy ON >mypass +@string +hset $1")
    await client.execute_command("ACL SETUSER shahar >mypass +@set $2")
    await client.execute_command("ACL SETUSER vlad ~foo ~bar* +@string $3")

    result = await client.execute_command("ACL LIST")
    assert 4 == len(result)
    assert "user roy on #ea71c25a7a60224 resetchannels -@all +@string +hset $1" in result
    assert "user shahar off #ea71c25a7a60224 resetchannels -@all +@set $2" in result
    assert "user vlad off ~foo ~bar* resetchannels -@all +@string $3" in result
    assert "user default on nopass ~* &* +@all $all" in result

    result = await client.execute_command("ACL DELUSER shahar")
    assert result == 1

    result = await client.execute_command("ACL SAVE")

    result = await client.execute_command("ACL LOAD")

    result = await client.execute_command("ACL LIST")
    assert 3 == len(result)
    assert "user roy on #ea71c25a7a60224 resetchannels -@all +@string +hset $1" in result
    assert "user vlad off ~foo ~bar* resetchannels -@all +@string $3" in result
    assert "user default on nopass ~* &* +@all $all" in result


@pytest.mark.asyncio
async def test_acl_log(async_client):
    res = await async_client.execute_command("ACL LOG")
    assert [] == res

    await async_client.execute_command("ACL SETUSER elon >mars ON +@string +@dangerous ~*")

    with pytest.raises(redis.exceptions.AuthenticationError):
        await async_client.execute_command("AUTH elon wrong")

    res = await async_client.execute_command("ACL LOG")
    assert 1 == len(res)
    assert res[0]["reason"] == "AUTH"
    assert res[0]["object"] == "AUTH"
    assert res[0]["username"] == "elon"

    await async_client.execute_command("ACL LOG RESET")
    res = await async_client.execute_command("ACL LOG")
    assert 0 == len(res)

    res = await async_client.execute_command("AUTH elon mars")
    res = await async_client.execute_command("SET mykey 22")

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("hset mk kk 22")

    res = await async_client.execute_command("ACL LOG")
    assert 1 == len(res)
    assert res[0]["reason"] == "COMMAND"
    assert res[0]["object"] == "HSET"
    assert res[0]["username"] == "elon"

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("LPUSH mylist 2")

    res = await async_client.execute_command("ACL LOG")
    assert 2 == len(res)

    res = await async_client.execute_command("ACL LOG RESET")
    await async_client.execute_command("ACL SETUSER elon resetkeys ~foo")

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("SET bar val")

    res = await async_client.execute_command("ACL LOG")
    assert 1 == len(res)
    assert res[0]["reason"] == "KEY"
    assert res[0]["object"] == "SET"
    assert res[0]["username"] == "elon"


@pytest.mark.asyncio
@dfly_args({"port": 1111, "admin_port": 1112, "requirepass": "mypass"})
async def test_require_pass(df_factory):
    df = df_factory.create()
    df.start()

    client = aioredis.Redis(port=df.port)

    with pytest.raises(redis.exceptions.AuthenticationError):
        await client.execute_command("AUTH default wrongpass")

    client = aioredis.Redis(password="mypass", port=df.port, decode_responses=True)

    res = await client.execute_command("AUTH default mypass")
    assert res == "OK"

    res = await client.execute_command("CONFIG SET requirepass newpass")
    assert res == "OK"

    res = await client.execute_command("AUTH default newpass")
    assert res == "OK"

    client = aioredis.Redis(password="newpass", port=df.admin_port, decode_responses=True)

    await client.execute_command("SET foo 44")
    res = await client.execute_command("GET foo")
    assert res == "44"


@pytest.mark.asyncio
@dfly_args({"port": 1111, "requirepass": "temp"})
async def test_require_pass_with_acl_file_order(df_factory, tmp_dir):
    acl = create_temp_file(
        "USER default ON >jordan ~* +@all",
        tmp_dir,
    )

    df = df_factory.create(aclfile=acl)
    df.start()

    client = aioredis.Redis(username="default", password="jordan", port=df.port)

    assert await client.set("foo", "bar")


@pytest.mark.asyncio
async def test_set_acl_file(async_client: aioredis.Redis, tmp_dir):
    # Note the extra space below, it's intented to also check that we properly parse extra spaces
    acl_file_content = "USER    roy ON #ea71c25a7a602246b4c39824b855678894a96f43bb9b71319c39700a1e045222 +@string +@fast +hset\nUSER john on nopass +@string"

    acl = create_temp_file(acl_file_content, tmp_dir)

    await async_client.execute_command(f"CONFIG SET aclfile {acl}")

    await async_client.execute_command("ACL LOAD")

    result = await async_client.execute_command("ACL LIST")
    assert 3 == len(result)

    result = await async_client.execute_command("AUTH roy mypass")
    assert result == "OK"

    result = await async_client.execute_command("AUTH john nopass")
    assert result == "OK"


@pytest.mark.asyncio
@dfly_args({"proactor_threads": 1})
async def test_set_len_acl_log(async_client):
    res = await async_client.execute_command("ACL LOG")
    assert [] == res

    await async_client.execute_command("ACL SETUSER elon >mars ON +@string +@dangerous")

    for x in range(7):
        with pytest.raises(redis.exceptions.AuthenticationError):
            await async_client.execute_command("AUTH elon wrong")

    res = await async_client.execute_command("ACL LOG")
    assert 7 == len(res)

    await async_client.execute_command(f"CONFIG SET acllog_max_len 3")

    res = await async_client.execute_command("ACL LOG")
    assert 3 == len(res)

    await async_client.execute_command(f"CONFIG SET acllog_max_len 10")

    for x in range(7):
        with pytest.raises(redis.exceptions.AuthenticationError):
            await async_client.execute_command("AUTH elon wrong")

    res = await async_client.execute_command("ACL LOG")
    assert 10 == len(res)


@pytest.mark.asyncio
async def test_acl_keys(async_client):
    await async_client.execute_command("ACL SETUSER mrkeys ON >mrkeys allkeys +@admin")
    await async_client.execute_command("AUTH mrkeys mrkeys")

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("SET foo bar")

    await async_client.execute_command(
        "ACL SETUSER mrkeys ON >mrkeys resetkeys +@string ~foo ~bar* ~dr*gon"
    )

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("SET random rand")

    assert "OK" == await async_client.execute_command("SET foo val")
    assert "OK" == await async_client.execute_command("SET bar val")
    assert "OK" == await async_client.execute_command("SET barsomething val")
    assert "OK" == await async_client.execute_command("SET dragon val")

    await async_client.execute_command("ACL SETUSER mrkeys ON >mrkeys allkeys +@sortedset")
    assert "OK" == await async_client.execute_command("SET random rand")

    await async_client.execute_command(
        "ACL SETUSER mrkeys ON >mrkeys resetkeys resetkeys %R~foo %W~bar"
    )

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("SET foo val")
    assert "val" == await async_client.execute_command("GET foo")

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("GET bar")
    assert "OK" == await async_client.execute_command("SET bar val")

    await async_client.execute_command("ACL SETUSER mrkeys resetkeys ~bar* +@sortedset")
    assert 1 == await async_client.execute_command("ZADD barz1 1 val1")
    assert 1 == await async_client.execute_command("ZADD barz2 1 val2")
    # reject because bonus key does not match
    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("ZUNIONSTORE destkey 2 barz1 barz2")


@pytest.mark.asyncio
async def test_namespaces(df_server):
    admin = df_server.client()
    assert await admin.execute_command("SET foo admin") == "OK"
    assert await admin.execute_command("GET foo") == "admin"

    # Create ns space named 'ns1'
    await admin.execute_command("ACL SETUSER adi NAMESPACE:ns1 ON >adi_pass +@all ~*")

    adi = df_server.client()
    assert await adi.execute_command("AUTH adi adi_pass") == "OK"
    assert await adi.execute_command("SET foo bar") == "OK"
    assert await adi.execute_command("GET foo") == "bar"
    assert await admin.execute_command("GET foo") == "admin"

    # Adi and Shahar are on the same team
    await admin.execute_command("ACL SETUSER shahar NAMESPACE:ns1 ON >shahar_pass +@all ~*")

    shahar = df_server.client()
    assert await shahar.execute_command("AUTH shahar shahar_pass") == "OK"
    assert await shahar.execute_command("GET foo") == "bar"
    assert await shahar.execute_command("SET foo bar2") == "OK"
    assert await adi.execute_command("GET foo") == "bar2"

    # Roman is a CTO, he has his own private space
    await admin.execute_command("ACL SETUSER roman NAMESPACE:ns2 ON >roman_pass +@all ~*")

    roman = df_server.client()
    assert await roman.execute_command("AUTH roman roman_pass") == "OK"
    assert await roman.execute_command("GET foo") == None


@pytest.mark.asyncio
async def test_default_user_bug(df_server):
    client = df_server.client()

    await client.execute_command("ACL SETUSER default -@all")
    await client.aclose()

    client = df_server.client()

    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("SET foo bar")


@pytest.mark.asyncio
async def test_auth_resp3_bug(df_factory):
    df = df_factory.create()
    df.start()

    client = aioredis.Redis(port=df.port, protocol=3, decode_responses=True)

    await client.execute_command("ACL SETUSER kostas +@all ON >tmp")
    res = await client.execute_command("HELLO 3 AUTH kostas tmp")
    assert res["server"] == "redis"
    assert res["version"] == "7.4.0"
    assert res["proto"] == 3
    assert res["mode"] == "standalone"
    assert res["role"] == "master"
    assert res["id"] == 1


@pytest.mark.asyncio
async def test_acl_pub_sub_auth(df_factory):
    df = df_factory.create()
    df.start()
    client = df.client()
    await client.execute_command("ACL SETUSER kostas on >tmp +subscribe +psubscribe &f*o &bar")
    assert await client.execute_command("AUTH kostas tmp") == "OK"

    res = await client.execute_command("SUBSCRIBE bar")
    assert res == ["subscribe", "bar", 1]

    res = await client.execute_command("SUBSCRIBE foo")
    assert res == ["subscribe", "foo", 2]

    with pytest.raises(redis.exceptions.NoPermissionError):
        res = await client.execute_command("SUBSCRIBE my_channel")

    # PSUBSCRIBE only matches pure literals, no asterisks
    with pytest.raises(redis.exceptions.NoPermissionError):
        res = await client.execute_command("PSUBSCRIBE foo")

    # my_channel is not in our list so the command should fail
    with pytest.raises(redis.exceptions.NoPermissionError):
        res = await client.execute_command("PSUBSCRIBE bar my_channel")

    res = await client.execute_command("PSUBSCRIBE bar")
    assert res == ["psubscribe", "bar", 3]


@pytest.mark.asyncio
async def test_acl_revoke_pub_sub_while_subscribed(df_factory):
    df = df_factory.create()
    df.start()
    publisher = df.client()

    async def publish_worker(client):
        logging.debug("Starting publish_worker")
        for i in range(0, 10):
            logging.debug(f"publisher iteration: {i}")
            await client.publish("channel", f"message{i}")

    async def subscribe_worker(channel: aioredis.client.PubSub):
        logging.debug("Starting subscribe_worker")
        total_msgs = 0
        async with async_timeout.timeout(10):
            while total_msgs != 10:
                try:
                    res = await channel.get_message(ignore_subscribe_messages=True, timeout=5)
                    if res is None:
                        await asyncio.sleep(0.01)
                        continue
                    assert res["data"] == f"message{total_msgs}"
                    logging.debug(f"subscriber iteration: {total_msgs}")
                    total_msgs = total_msgs + 1
                except asyncio.TimeoutError:
                    pass

    await publisher.execute_command("ACL SETUSER kostas >tmp ON +@slow +SUBSCRIBE allchannels")

    subscriber = aioredis.Redis(
        username="kostas", password="tmp", port=df.port, decode_responses=True
    )
    subscriber_obj = subscriber.pubsub()
    await subscriber_obj.subscribe("channel")

    # There's a rare timing issue if we don't wait here, but given the weak guarantees of Pub/Sub,
    # that's probably OK.
    await asyncio.sleep(1)

    subscribe_task = asyncio.create_task(subscribe_worker(subscriber_obj))
    await publish_worker(publisher)
    await subscribe_task

    subscribe_task = asyncio.create_task(subscribe_worker(subscriber_obj))
    # Already subscribed, we should still be able to receive messages on channel
    # We should not be able to unsubscribe
    await publisher.execute_command("ACL SETUSER kostas -SUBSCRIBE -UNSUBSCRIBE")
    await publish_worker(publisher)
    await subscribe_task
    # unsubscribe is not marked async and it's such a mess that it throws the error
    # once we try to resubscribe. Instead I use the raw execute command to check that
    # permission changes work
    with pytest.raises(redis.exceptions.NoPermissionError):
        await subscriber.execute_command("UNSUBSCRIBE channel")

    await publisher.execute_command("ACL SETUSER kostas +SUBSCRIBE +UNSUBSCRIBE")

    subscribe_task = asyncio.create_task(subscribe_worker(subscriber_obj))
    await publisher.execute_command("ACL SETUSER kostas resetchannels")
    await publish_worker(publisher)
    with pytest.raises((redis.exceptions.ConnectionError, redis.exceptions.NoPermissionError)):
        await subscribe_task


@pytest.mark.asyncio
async def test_acl_select(async_client):
    await async_client.execute_command("ACL SETUSER kostas on >tmp +@all $1 ~*")
    assert await async_client.execute_command("AUTH kostas tmp") == "OK"

    res = await async_client.execute_command("SET foo bar")
    assert res == "OK"

    with pytest.raises(redis.exceptions.NoPermissionError):
        await async_client.execute_command("SELECT 0")

    with pytest.raises(redis.exceptions.NoPermissionError):
        await async_client.execute_command("MOVE foo 2")

    res = await async_client.client_list()
    assert res[0]["db"] == "1"


================================================
FILE: tests/dragonfly/bull_sidekiq_test.py
================================================
import json
import logging
import time
import uuid

from redis import asyncio as aioredis

# from bullmq import Queue
# from . import dfly_args


# BULLMQ_QUEUE_NAME = "{test_queue}"

# @pytest.fixture
# async def bullmq_queue(df_server):
#     queue = Queue(BULLMQ_QUEUE_NAME, {"connection": {"host": "localhost", "port": df_server.port}})
#     yield queue
#     await queue.close()


# @dfly_args({"lock_on_hashtags": True})
# async def test_bullmq_push_jobs(async_client: aioredis.Redis, bullmq_queue: Queue):
#     """Push 200 jobs and verify they are stored in Dragonfly."""
#     for i in range(200):
#         await bullmq_queue.add(
#             "process_job",
#             {"job_id": f"job{i}", "payload": f"data for job {i}"},
#         )

#     # BullMQ stores waiting jobs in a list key: bull:<queue_name>:wait
#     wait_key = f"bull:{BULLMQ_QUEUE_NAME}:wait"
#     queue_len = await async_client.llen(wait_key)
#     assert queue_len == 200

#     # Verify a job can be read back
#     raw = await async_client.lindex(wait_key, 0)
#     assert raw is not None
#     mem_usage = await async_client.memory_usage(wait_key)
#     logging.info(f"Queue '{wait_key}' MEMORY USAGE: {mem_usage:,} bytes ({queue_len} jobs)")


def _make_sidekiq_job(i: int) -> str:
    """Generate a job payload matching the Sidekiq wire format.

    Verified against sidekiq/lib/sidekiq/client.rb (atomic_push) and
    sidekiq/lib/sidekiq/job_util.rb (normalize_item).
    """
    jid = uuid.uuid4().hex[:24]  # SecureRandom.hex(12)
    now = time.time()  # Time.now.to_f
    return json.dumps(
        {
            "class": "ProcessJobWorker",
            "args": [
                f"job{i}",
                {"user_id": 100000 + i, "action": "process", "priority": "normal"},
            ],
            "retry": True,
            "queue": "default",
            "jid": jid,
            "created_at": now,
            "enqueued_at": now,
        }
    )


async def test_sidekiq_push_jobs(async_client: aioredis.Redis):
    """Push 2000 Sidekiq jobs and verify they are stored correctly."""
    queue_key = "queue:default"
    num_jobs = 2000

    pipe = async_client.pipeline()
    for i in range(num_jobs):
        pipe.lpush(queue_key, _make_sidekiq_job(i))
    await pipe.execute()

    queue_len = await async_client.llen(queue_key)
    assert queue_len == num_jobs

    # Verify readability
    first = await async_client.lindex(queue_key, 0)
    last = await async_client.lindex(queue_key, -1)
    assert first is not None and last is not None
    parsed = json.loads(first)
    assert parsed["class"] == "ProcessJobWorker"

    mem_usage = await async_client.memory_usage(queue_key)
    logging.info(
        f"Queue '{queue_key}' MEMORY USAGE: {mem_usage:,} bytes ({queue_len} Sidekiq jobs)"
    )


================================================
FILE: tests/dragonfly/celery_test.py
================================================
import logging
import threading
from redis import asyncio as aioredis

import pytest
from celery import Celery
from celery.contrib.testing.worker import (
    setup_app_for_worker,
    TestWorkController,
    _set_task_join_will_block,
)


def _process_job(job_id):
    return f"Worker successfully processed job {job_id}"


@pytest.fixture
def celery_app(df_server):
    broker_url = f"redis://localhost:{df_server.port}/0"
    app = Celery("dragonfly_test", broker=broker_url, backend=broker_url)
    app.conf.task_default_queue = "my_queue"

    app.task(name="process_job")(_process_job)
    yield app

    # Prevent AsyncResult.__del__ on leftover task objects from pinging
    # the Redis backend after the server has already been shut down.
    if hasattr(app, "backend"):
        app.backend.remove_pending_result = lambda *args, **kwargs: None

    app.close()


@pytest.fixture
def celery_worker(celery_app):
    """Teardown order: celery_worker -> celery_app -> df_server,
    so the worker stops while Dragonfly is still running."""
    setup_app_for_worker(celery_app, loglevel="INFO", logfile=None)
    worker = TestWorkController(
        app=celery_app,
        concurrency=1,
        pool="solo",
        loglevel="INFO",
        without_heartbeat=True,
        without_mingle=True,
        without_gossip=True,
    )
    t = threading.Thread(target=worker.start, daemon=True)
    t.start()
    worker.ensure_started()
    # Explicitly allow tests to call .get() on tasks. By default, Celery's eager
    # test worker will block and raise an error if you try to get results from
    # within what it perceives to be a worker context to prevent deadlocks.
    _set_task_join_will_block(False)
    yield worker

    # Must explicitly stop the daemon to prevent it from entering a tight
    # reconnection spin loop when the test abruptly destroys the Redis socket.
    worker.stop()
    # Use a timeout because the worker thread may be blocked on socket.recv()
    # in the kombu event loop and never notice the stop flag.
    # The thread is a daemon, so it will be cleaned up on process exit.
    t.join(timeout=10)


async def test_celery_push_jobs(async_client: aioredis.Redis, celery_app):
    process_job = celery_app.tasks["process_job"]

    results = []
    for i in range(0, 200):
        results.append(process_job.delay(f"job{i}"))

    queue_len = await async_client.llen("my_queue")
    assert queue_len == 200
    mem_usage = await async_client.memory_usage("my_queue")
    logging.info(f"Queue 'my_queue' MEMORY USAGE: {mem_usage:,} bytes ({queue_len} jobs)")


def test_celery_inspect(celery_app, celery_worker):
    process_job = celery_app.tasks["process_job"]
    inspector = celery_app.control.inspect()

    # Worker should be alive
    ping = inspector.ping()
    logging.info(f"Ping response: {ping}")
    assert len(ping) == 1

    # Our task should be registered
    registered = inspector.registered()
    worker_name = list(registered.keys())[0]
    task_names = registered[worker_name]
    assert "process_job" in task_names

    # Check active queues
    queues = inspector.active_queues()
    assert queues is not None
    queue_names = [q["name"] for q in queues[worker_name]]
    assert "my_queue" in queue_names

    # Check stats
    stats = inspector.stats()
    logging.info(f"Stats response: {stats}")
    assert worker_name in stats


================================================
FILE: tests/dragonfly/cluster_mgr_test.py
================================================
import subprocess
import pytest
import redis
from redis import asyncio as aioredis
from .utility import *
from . import dfly_args

BASE_PORT = 30001


async def insert_cluster_data(cluster_client: redis.RedisCluster):
    for i in range(1_000):
        await cluster_client.set(i, i)


async def check_cluster_data(cluster_client: redis.RedisCluster):
    for i in range(1_000):
        assert await cluster_client.get(i) == str(i)


def run_cluster_mgr(args):
    print(f"Running cluster_mgr.py {args}")
    result = subprocess.run(["../tools/cluster_mgr.py", *args])
    logging.debug(result)
    return result.returncode == 0


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cluster_mgr(df_factory):
    NODES = 3
    masters = [df_factory.create(port=BASE_PORT + i) for i in range(NODES)]
    replicas = [df_factory.create(port=BASE_PORT + 100 + i) for i in range(NODES)]
    df_factory.start_all([*masters, *replicas])

    # Initialize a cluster (all slots belong to node 0)
    assert run_cluster_mgr(["--action=config_single_remote", f"--target_port={BASE_PORT}"])
    for i in range(1, NODES):
        assert run_cluster_mgr(
            ["--action=attach", f"--target_port={BASE_PORT}", f"--attach_port={BASE_PORT+i}"]
        )

    # Feed the cluster with data and test that it works correctly
    client = aioredis.RedisCluster(decode_responses=True, host="127.0.0.1", port=masters[0].port)
    await insert_cluster_data(client)
    await check_cluster_data(client)

    # Migrate ~half of the slots to node 1
    assert run_cluster_mgr(
        [
            f"--action=migrate",
            f"--target_port={BASE_PORT + 1}",
            f"--slot_start=8000",
            f"--slot_end=16383",
        ]
    )
    await check_cluster_data(client)

    # Can only detach node 2 (with no assigned slots)
    assert not run_cluster_mgr(["--action=detach", f"--target_port={BASE_PORT}"])
    assert not run_cluster_mgr(["--action=detach", f"--target_port={BASE_PORT + 1}"])
    assert run_cluster_mgr(["--action=detach", f"--target_port={BASE_PORT + 2}"])
    await check_cluster_data(client)

    # Can't attach non-replica as replica
    assert not run_cluster_mgr(
        [
            f"--action=attach",
            f"--target_port={BASE_PORT}",
            f"--attach_port={BASE_PORT+2}",
            f"--attach_as_replica=True",
        ]
    )

    # Reattach node 2 and migrate some slots to it
    assert run_cluster_mgr(
        ["--action=attach", f"--target_port={BASE_PORT}", f"--attach_port={BASE_PORT+2}"]
    )
    await check_cluster_data(client)
    # Slots 7000-8000 belong to node0, while 8001-9000 belong to node1. cluster_mgr doesn't support
    # such a migration in a single command.
    assert not run_cluster_mgr(
        [
            f"--action=migrate",
            f"--target_port={BASE_PORT + 1}",
            f"--slot_start=7000",
            f"--slot_end=9000",
        ]
    )
    assert run_cluster_mgr(
        ["--action=migrate", f"--target_port={BASE_PORT + 2}", "--slot_start=0", "--slot_end=2000"]
    )
    await check_cluster_data(client)
    assert run_cluster_mgr(
        [
            f"--action=migrate",
            f"--target_port={BASE_PORT + 2}",
            f"--slot_start=8000",
            f"--slot_end=10000",
        ]
    )
    await check_cluster_data(client)

    # Can't attach replica before running REPLICAOF
    assert not run_cluster_mgr(
        [
            f"--action=attach",
            f"--attach_as_replica=True",
            f"--target_port={BASE_PORT}",
            f"--attach_port={replicas[0].port}",
        ]
    )

    # Add replicas
    replica_clients = [replica.client() for replica in replicas]
    for i in range(NODES):
        await replica_clients[i].execute_command(f"replicaof 127.0.0.1 {masters[i].port}")
        assert run_cluster_mgr(
            [
                f"--action=attach",
                f"--attach_as_replica=True",
                f"--target_port={masters[i].port}",
                f"--attach_port={replicas[i].port}",
            ]
        )

    # Can't take over when target is a master
    assert not run_cluster_mgr(["--action=takeover", f"--target_port={masters[i].port}"])

    # Take over replica 0
    assert run_cluster_mgr(["--action=takeover", f"--target_port={replicas[0].port}"])
    await replica_clients[0].execute_command("replicaof no one")
    await check_cluster_data(client)

    # Revert take over
    c_master0 = masters[0].client()
    await c_master0.execute_command(f"replicaof 127.0.0.1 {replicas[0].port}")
    assert run_cluster_mgr(
        [
            f"--action=attach",
            f"--attach_as_replica=True",
            f"--target_port={replicas[0].port}",
            f"--attach_port={masters[0].port}",
        ]
    )
    assert run_cluster_mgr(["--action=takeover", f"--target_port={masters[0].port}"])
    await c_master0.execute_command(f"replicaof no one")
    await replica_clients[0].execute_command(f"replicaof 127.0.0.1 {masters[0].port}")
    assert run_cluster_mgr(
        [
            f"--action=attach",
            f"--attach_as_replica=True",
            f"--target_port={masters[0].port}",
            f"--attach_port={replicas[0].port}",
        ]
    )
    await check_cluster_data(client)

    # Print the config - we don't really verify the output, but at least make sure there's no error
    assert run_cluster_mgr(["--action=print_config", f"--target_port={replicas[0].port}"])

    # Test detach replicas work
    for i in range(NODES):
        assert run_cluster_mgr(["--action=detach", f"--target_port={replicas[i].port}"])
    await check_cluster_data(client)
    await client.aclose()


================================================
FILE: tests/dragonfly/cluster_test.py
================================================
import pytest
import copy
import re
import json
import redis
from binascii import crc_hqx
from redis import asyncio as aioredis
import asyncio
from dataclasses import dataclass

from .instance import DflyInstanceFactory, DflyInstance
from .utility import *
from .replication_test import check_all_replicas_finished
from redis.cluster import RedisCluster
from redis.cluster import ClusterNode
from redis.exceptions import MovedError
from .proxy import Proxy
from .seeder import Seeder, SeederBase, DebugPopulateSeeder

from . import dfly_args

BASE_PORT = 30001


def monotonically_increasing_port_number():
    port = BASE_PORT
    while True:
        yield port
        port = port + 1


# Create a generator object
next_port = monotonically_increasing_port_number()


async def get_memory(client, field):
    info = await client.info("memory")
    return info[field]


class RedisClusterNode:
    def __init__(self, port):
        self.port = port
        self.proc = None

    def start(self):
        self.proc = subprocess.Popen(
            [
                "redis-server-6.2.11",
                f"--port {self.port}",
                "--save ''",
                "--cluster-enabled yes",
                f"--cluster-config-file nodes_{self.port}.conf",
                "--cluster-node-timeout 5000",
                "--appendonly no",
                "--protected-mode no",
                "--repl-diskless-sync yes",
                "--repl-diskless-sync-delay 0",
            ]
        )
        logging.debug(self.proc.args)

    def stop(self):
        self.proc.terminate()
        try:
            self.proc.wait(timeout=10)
        except Exception as e:
            pass


@pytest.fixture(scope="function")
def redis_cluster(port_picker):
    # create redis client with 3 node with default slot configuration
    # node1 slots 0-5460
    # node2 slots 5461-10922
    # node3 slots 10923-16383
    ports = [port_picker.get_available_port() for i in range(3)]
    nodes = [RedisClusterNode(port) for port in ports]
    try:
        for node in nodes:
            node.start()
            time.sleep(1)
    except FileNotFoundError as e:
        skip_if_not_in_github()
        raise

    create_command = f'echo "yes" |redis-cli --cluster create {" ".join([f"127.0.0.1:{port}" for port in ports])}'
    subprocess.run(create_command, shell=True)
    time.sleep(4)
    yield nodes
    for node in nodes:
        node.stop()


@dataclass
class MigrationInfo:
    ip: str
    port: int
    slots: list
    node_id: str


@dataclass
class NodeInfo:
    id: str
    instance: DflyInstance
    client: aioredis.Redis
    admin_client: aioredis.Redis
    slots: list
    migrations: list
    replicas: list
    health: str


async def create_node_info(instance) -> NodeInfo:
    client = instance.client()
    node_id = await get_node_id(client)
    ninfo = NodeInfo(
        id=node_id,
        instance=instance,
        client=client,
        admin_client=instance.admin_client(),
        slots=[],
        migrations=[],
        replicas=[],
        health="online",
    )
    return ninfo


def generate_config(nodes):
    return [
        {
            "slot_ranges": [{"start": s, "end": e} for (s, e) in node.slots],
            "master": {
                "id": node.id,
                "ip": "127.0.0.1",
                "port": node.instance.port,
                "health": node.health,
            },
            "replicas": [
                {
                    "id": replica.id,
                    "ip": "127.0.0.1",
                    "port": replica.instance.port,
                    "health": node.health,
                }
                for replica in node.replicas
            ],
            "migrations": [
                {
                    "slot_ranges": [{"start": s, "end": e} for (s, e) in m.slots],
                    "node_id": m.node_id,
                    "ip": m.ip,
                    "port": m.port,
                }
                for m in node.migrations
            ],
        }
        for node in nodes
    ]


async def push_config(config, admin_connections):
    logging.debug("Pushing config %s", config)
    res = await asyncio.gather(
        *(c_admin.execute_command("DFLYCLUSTER", "CONFIG", config) for c_admin in admin_connections)
    )
    assert all([r == "OK" for r in res])


async def wait_for_status(admin_client, node_id, status, timeout=10):
    get_status = lambda: admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", node_id
    )

    if not isinstance(status, list):
        status = [status]

    async for states, breaker in tick_timer(get_status, timeout=timeout):
        with breaker:
            assert len(states) != 0 and all(state[2] in status for state in states), states


async def wait_for_ft_index_creation(client, idx_name, timeout=5):
    get_status = lambda: client.execute_command("FT.INFO", idx_name)

    async for states, breaker in tick_timer(get_status, timeout=timeout):
        with breaker:
            assert len(states) != 0, states


async def wait_for_error(admin_client, node_id, error, timeout=10):
    get_status = lambda: admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", node_id
    )

    async for states, breaker in tick_timer(get_status, timeout=timeout):
        with breaker:
            assert len(states) != 0 and all(error == state[4] for state in states), states


async def wait_for_migration_start(admin_client, node_id):
    while (
        len(await admin_client.execute_command("DFLYCLUSTER", "SLOT-MIGRATION-STATUS", node_id))
        == 0
    ):
        await asyncio.sleep(0.1)


async def check_for_no_state_status(admin_clients):
    for client in admin_clients:
        state = await client.execute_command("DFLYCLUSTER", "SLOT-MIGRATION-STATUS")
        if len(state) != 0:
            logging.debug(f"SLOT-MIGRATION-STATUS is {state}, instead of NO_STATE")
            assert False


def key_slot(key_str) -> int:
    key = str.encode(key_str)
    return crc_hqx(key, 0) % 16384


async def get_node_id(connection):
    id = await connection.execute_command("CLUSTER MYID")
    assert isinstance(id, str)
    return id


def stop_and_get_restore_log(instance):
    instance.stop()
    lines = instance.find_in_logs("RestoreStreamer LSN")
    assert len(lines) == 1
    line = lines[0]
    logging.debug(f"Streamer log line: {line}")
    return line


@dfly_args({})
class TestNotEmulated:
    async def test_cluster_commands_fails_when_not_emulate(self, async_client: aioredis.Redis):
        with pytest.raises(aioredis.ResponseError) as respErr:
            await async_client.execute_command("CLUSTER HELP")
        assert "cluster_mode" in str(respErr.value)

        with pytest.raises(aioredis.ResponseError) as respErr:
            await async_client.execute_command("CLUSTER SLOTS")
        assert "emulated" in str(respErr.value)


@dfly_args({"cluster_mode": "emulated"})
class TestEmulated:
    def test_cluster_slots_command(self, df_server, cluster_client: redis.RedisCluster):
        expected = {(0, 16383): {"primary": ("127.0.0.1", df_server.port), "replicas": []}}
        res = cluster_client.execute_command("CLUSTER SLOTS")
        assert expected == res

    def test_cluster_help_command(self, cluster_client: redis.RedisCluster):
        # `target_nodes` is necessary because CLUSTER HELP is not mapped on redis-py
        res = cluster_client.execute_command(
            "CLUSTER", "HELP", target_nodes=redis.RedisCluster.RANDOM
        )
        assert "HELP" in res
        assert "SLOTS" in res

    def test_cluster_pipeline(self, cluster_client: redis.RedisCluster):
        pipeline = cluster_client.pipeline()
        pipeline.set("foo", "bar")
        pipeline.get("foo")
        val = pipeline.execute()
        assert val == [True, "bar"]


# Unfortunately we can't test --announce_port here because that causes the Python Cluster client to
# throw if it can't access the port in `CLUSTER SLOTS` :|
@dfly_args({"cluster_mode": "emulated", "cluster_announce_ip": "127.0.0.2"})
class TestEmulatedWithAnnounceIp:
    def test_cluster_slots_command(self, df_server, cluster_client: redis.RedisCluster):
        expected = {(0, 16383): {"primary": ("127.0.0.2", df_server.port), "replicas": []}}
        res = cluster_client.execute_command("CLUSTER SLOTS")
        assert expected == res


@dataclass
class ReplicaInfo:
    id: string
    port: int


def verify_slots_result(port: int, answer: list, replicas) -> bool:
    def is_local_host(ip: str) -> bool:
        return ip == "127.0.0.1" or ip == "localhost"

    assert answer[0] == 0  # start shard
    assert answer[1] == 16383  # last shard

    info = answer[2]
    assert len(info) == 3
    ip_addr = info[0]
    assert is_local_host(ip_addr)
    assert info[1] == port

    # Replicas
    assert len(answer) == 3 + len(replicas)
    for i in range(3, len(replicas)):
        replica = replicas[i - 3]
        rep_info = answer[i]
        assert len(rep_info) == 3
        ip_addr = rep_info[0]
        assert is_local_host(ip_addr)
        assert rep_info[1] == replica.port
        assert rep_info[2] == replica.id

    return True


# --managed_service_info means that Dragonfly is running in a managed service, so some details
# are hidden from users, see https://github.com/dragonflydb/dragonfly/issues/4173
@dfly_args({"proactor_threads": 4, "cluster_mode": "emulated", "managed_service_info": "true"})
async def test_emulated_cluster_with_replicas(df_factory):
    master = df_factory.create(port=next(next_port), admin_port=next(next_port))
    replicas = [df_factory.create(port=next(next_port), logtostdout=True) for i in range(1, 3)]

    df_factory.start_all([master, *replicas])

    c_master = master.client()
    c_master_admin = master.admin_client()
    master_id = await c_master.execute_command("CLUSTER MYID")

    c_replicas = [replica.client() for replica in replicas]
    replica_ids = [(await c_replica.execute_command("CLUSTER MYID")) for c_replica in c_replicas]

    for replica, c_replica in zip(replicas, c_replicas):
        res = await c_replica.execute_command("CLUSTER SLOTS")
        assert len(res) == 1
        assert verify_slots_result(port=replica.port, answer=res[0], replicas=[])

    res = await c_master.execute_command("CLUSTER SLOTS")
    assert verify_slots_result(port=master.port, answer=res[0], replicas=[])

    # Connect replicas to master
    for replica, c_replica in zip(replicas, c_replicas):
        rc = await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        assert rc == "OK"

    await asyncio.sleep(0.5)

    for replica, c_replica in zip(replicas, c_replicas):
        res = await c_replica.execute_command("CLUSTER SLOTS")
        assert verify_slots_result(
            port=master.port, answer=res[0], replicas=[ReplicaInfo(replica.port, id)]
        )

    res = await c_master.execute_command("CLUSTER SLOTS")
    assert verify_slots_result(
        port=master.port,
        answer=res[0],
        replicas=[],
    )

    res = await c_master_admin.execute_command("CLUSTER SLOTS")
    assert verify_slots_result(
        port=master.port,
        answer=res[0],
        replicas=[ReplicaInfo(id, replica.port) for id, replica in zip(replica_ids, replicas)],
    )

    assert await c_master.execute_command("CLUSTER NODES") == {
        f"127.0.0.1:{master.port}": {
            "connected": True,
            "epoch": "0",
            "flags": "myself,master",
            "hostname": "",
            "last_ping_sent": "0",
            "last_pong_rcvd": "0",
            "master_id": "-",
            "migrations": [],
            "node_id": master_id,
            "slots": [["0", "16383"]],
        },
    }

    assert await c_master_admin.execute_command("CLUSTER NODES") == {
        f"127.0.0.1:{master.port}": {
            "connected": True,
            "epoch": "0",
            "flags": "myself,master",
            "hostname": "",
            "last_ping_sent": "0",
            "last_pong_rcvd": "0",
            "master_id": "-",
            "migrations": [],
            "node_id": master_id,
            "slots": [["0", "16383"]],
        },
        f"127.0.0.1:{replicas[0].port}": {
            "connected": True,
            "epoch": "0",
            "flags": "slave",
            "hostname": "",
            "last_ping_sent": "0",
            "last_pong_rcvd": "0",
            "master_id": master_id,
            "migrations": [],
            "node_id": replica_ids[0],
            "slots": [],
        },
        f"127.0.0.1:{replicas[1].port}": {
            "connected": True,
            "epoch": "0",
            "flags": "slave",
            "hostname": "",
            "last_ping_sent": "0",
            "last_pong_rcvd": "0",
            "master_id": master_id,
            "migrations": [],
            "node_id": replica_ids[1],
            "slots": [],
        },
    }


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_managed_service_info(df_factory):
    master = df_factory.create(port=next(next_port), admin_port=next(next_port))
    replica = df_factory.create(port=next(next_port), admin_port=next(next_port))

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_master_admin = master.admin_client()
    master_id = await c_master.execute_command("CLUSTER MYID")

    c_replica = replica.client()
    c_replica_admin = replica.admin_client()
    replica_id = await c_replica.execute_command("CLUSTER MYID")

    # Connect replicas to master
    rc = await c_replica_admin.execute_command(f"REPLICAOF localhost {master.port}")
    assert rc == "OK"
    await wait_available_async(c_replica)

    nodes = [await create_node_info(master)]
    nodes[0].slots = [(0, 16383)]
    nodes[0].replicas = [await create_node_info(replica)]
    await push_config(json.dumps(generate_config(nodes)), [master.client(), replica.client()])

    expected_hidden_cluster_slots = [
        [
            0,
            16383,
            [
                "127.0.0.1",
                master.port,
                master_id,
            ],
        ],
    ]
    expected_full_cluster_slots = copy.deepcopy(expected_hidden_cluster_slots)
    expected_full_cluster_slots[0].append(
        [
            "127.0.0.1",
            replica.port,
            replica_id,
        ]
    )
    assert await c_master.execute_command("CLUSTER SLOTS") == expected_full_cluster_slots
    assert await c_master_admin.execute_command("CLUSTER SLOTS") == expected_full_cluster_slots

    expected_hidden_cluster_nodes = {
        f"127.0.0.1:{master.port}": {
            "connected": True,
            "epoch": "0",
            "flags": "myself,master",
            "hostname": "",
            "last_ping_sent": "0",
            "last_pong_rcvd": "0",
            "master_id": "-",
            "migrations": [],
            "node_id": master_id,
            "slots": [["0", "16383"]],
        },
    }
    expected_full_cluster_nodes = copy.deepcopy(expected_hidden_cluster_nodes)
    expected_full_cluster_nodes[f"127.0.0.1:{replica.port}"] = {
        "connected": True,
        "epoch": "0",
        "flags": "slave",
        "hostname": "",
        "last_ping_sent": "0",
        "last_pong_rcvd": "0",
        "master_id": master_id,
        "migrations": [],
        "node_id": replica_id,
        "slots": [],
    }
    assert await c_master.execute_command("CLUSTER NODES") == expected_full_cluster_nodes
    assert await c_master_admin.execute_command("CLUSTER NODES") == expected_full_cluster_nodes

    expected_hidden_cluster_shards = [
        [
            "slots",
            [0, 16383],
            "nodes",
            [
                [
                    "id",
                    master_id,
                    "endpoint",
                    "127.0.0.1",
                    "ip",
                    "127.0.0.1",
                    "port",
                    master.port,
                    "role",
                    "master",
                    "replication-offset",
                    0,
                    "health",
                    "online",
                ],
            ],
        ],
    ]
    expected_full_cluster_shards = copy.deepcopy(expected_hidden_cluster_shards)
    expected_full_cluster_shards[0][3].append(
        [
            "id",
            replica_id,
            "endpoint",
            "127.0.0.1",
            "ip",
            "127.0.0.1",
            "port",
            replica.port,
            "role",
            "replica",
            "replication-offset",
            0,
            "health",
            "online",
        ]
    )
    assert await c_master.execute_command("CLUSTER SHARDS") == expected_full_cluster_shards
    assert await c_master_admin.execute_command("CLUSTER SHARDS") == expected_full_cluster_shards

    # this flag doesn't affect cluster anymore so the results will be the same
    await c_master.execute_command("config set managed_service_info true")

    assert await c_master.execute_command("CLUSTER SLOTS") == expected_full_cluster_slots
    assert await c_master_admin.execute_command("CLUSTER SLOTS") == expected_full_cluster_slots

    assert await c_master.execute_command("CLUSTER NODES") == expected_full_cluster_nodes
    assert await c_master_admin.execute_command("CLUSTER NODES") == expected_full_cluster_nodes

    assert await c_master.execute_command("CLUSTER SHARDS") == expected_full_cluster_shards
    assert await c_master_admin.execute_command("CLUSTER SHARDS") == expected_full_cluster_shards


@dfly_args({"cluster_mode": "emulated"})
async def test_cluster_info(async_client):
    res = await async_client.execute_command("CLUSTER INFO")
    assert len(res) == 16
    assert res == {
        "cluster_current_epoch": "1",
        "cluster_known_nodes": "1",
        "cluster_my_epoch": "1",
        "cluster_size": "1",
        "cluster_slots_assigned": "16384",
        "cluster_slots_fail": "0",
        "cluster_slots_ok": "16384",
        "cluster_slots_pfail": "0",
        "cluster_state": "ok",
        "cluster_stats_messages_meet_received": "0",
        "cluster_stats_messages_ping_received": "1",
        "cluster_stats_messages_ping_sent": "1",
        "cluster_stats_messages_pong_received": "1",
        "cluster_stats_messages_pong_sent": "1",
        "cluster_stats_messages_received": "1",
        "cluster_stats_messages_sent": "1",
    }


@dfly_args({"cluster_mode": "emulated", "cluster_announce_ip": "127.0.0.2"})
@pytest.mark.asyncio
async def test_cluster_nodes(df_server, async_client):
    res = await async_client.execute_command("CLUSTER NODES")
    assert len(res) == 1
    info = res[f"127.0.0.2:{df_server.port}"]
    assert res is not None
    assert info["connected"] == True
    assert info["epoch"] == "0"
    assert info["flags"] == "myself,master"
    assert info["last_ping_sent"] == "0"
    assert info["slots"] == [["0", "16383"]]
    assert info["master_id"] == "-"


"""
Test that slot ownership changes correctly with config changes.

Add a key to node0, then move the slot ownership to node1 and see that they both behave as
intended.
Also add keys to each of them that are *not* moved, and see that they are unaffected by the move.
"""


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "cluster_node_id": "inigo montoya"})
async def test_cluster_node_id(df_factory: DflyInstanceFactory):
    node = df_factory.create(port=next(next_port))
    df_factory.start_all([node])

    conn = node.client()
    assert "inigo montoya" == await get_node_id(conn)


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_slot_ownership_changes(df_factory: DflyInstanceFactory):
    # Start and configure cluster with 2 nodes
    nodes = [df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)]

    df_factory.start_all(nodes)

    c_nodes = [node.client() for node in nodes]
    c_nodes_admin = [node.admin_client() for node in nodes]

    node_ids = await asyncio.gather(*(get_node_id(c) for c in c_nodes))

    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": LAST_SLOT_CUTOFF
            }}
          ],
          "master": {{
            "id": "{node_ids[0]}",
            "ip": "localhost",
            "port": {nodes[0].port}
          }},
          "replicas": []
        }},
        {{
          "slot_ranges": [
            {{
              "start": NEXT_SLOT_CUTOFF,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{node_ids[1]}",
            "ip": "localhost",
            "port": {nodes[1].port}
          }},
          "replicas": []
        }}
      ]
    """

    await push_config(
        config.replace("LAST_SLOT_CUTOFF", "5259").replace("NEXT_SLOT_CUTOFF", "5260"),
        c_nodes_admin,
    )

    # Slot for "KEY1" is 5259

    # Insert a key that should stay in node0
    assert await c_nodes[0].set("KEY0", "value")

    # And to node1 (so it happens that 'KEY0' belongs to 0 and 'KEY2' to 1)
    assert await c_nodes[1].set("KEY2", "value")

    # Insert a key that we will move ownership of to node1 (but without migration yet)
    assert await c_nodes[0].set("KEY1", "value")
    assert await c_nodes[0].execute_command("DBSIZE") == 2

    # Make sure that node0 owns "KEY0"
    assert (await c_nodes[0].get("KEY0")) == "value"

    # Make sure that "KEY1" is not owned by node1
    with pytest.raises((MovedError, aioredis.ResponseError)) as e:
        await c_nodes[1].set("KEY1", "value")

    assert e.value.args[0].endswith(f"5259 localhost:{nodes[0].port}")

    # And that node1 only has 1 key ("KEY2")
    assert await c_nodes[1].execute_command("DBSIZE") == 1

    print("Moving ownership over 5259 ('KEY1') to other node")

    await push_config(
        config.replace("LAST_SLOT_CUTOFF", "5258").replace("NEXT_SLOT_CUTOFF", "5259"),
        c_nodes_admin,
    )

    # node0 should have removed "KEY1" as it no longer owns it
    # deleting non owned keys is background operation therefore we add timeout to this check
    @assert_eventually(times=2)
    async def check_dbsize(node_index, expected_size):
        assert await c_nodes[node_index].execute_command("DBSIZE") == expected_size

    await check_dbsize(node_index=0, expected_size=1)
    # node0 should still own "KEY0" though
    assert (await c_nodes[0].get("KEY0")) == "value"
    # node1 should still have "KEY2"
    assert await c_nodes[1].execute_command("DBSIZE") == 1

    # Now node0 should reply with MOVED for "KEY1"
    with pytest.raises((MovedError, aioredis.ResponseError)) as e:
        await c_nodes[0].set("KEY1", "value")

    assert e.value.args[0].endswith(f"5259 localhost:{nodes[1].port}")

    # And node1 should own it and allow using it
    assert await c_nodes[1].set("KEY1", "value")
    assert await c_nodes[1].execute_command("DBSIZE") == 2

    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{node_ids[0]}",
            "ip": "localhost",
            "port": {nodes[0].port}
          }},
          "replicas": []
        }}
      ]
    """
    await push_config(config, c_nodes_admin)

    assert await c_nodes[0].execute_command("DBSIZE") == 1
    assert (await c_nodes[0].get("KEY0")) == "value"
    await check_dbsize(node_index=1, expected_size=0)


# Tests that master commands to the replica are applied regardless of slot ownership
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_replica_sets_non_owned_keys(df_factory: DflyInstanceFactory):
    # Start and configure cluster with 1 master and 1 replica, both own all slots
    master = df_factory.create(admin_port=next(next_port))
    replica = df_factory.create(admin_port=next(next_port))
    df_factory.start_all([master, replica])

    async with master.client() as c_master, master.admin_client() as c_master_admin, replica.client() as c_replica, replica.admin_client() as c_replica_admin:
        master_id = await get_node_id(c_master)
        replica_id = await get_node_id(c_replica)

        config = f"""
        [
          {{
            "slot_ranges": [
              {{
                "start": 0,
                "end": 16383
              }}
            ],
            "master": {{
              "id": "{master_id}",
              "ip": "localhost",
              "port": {master.port}
            }},
            "replicas": [
              {{
                "id": "{replica_id}",
                "ip": "localhost",
                "port": {replica.port}
              }}
            ]
          }}
        ]
      """
        await push_config(config, [c_master_admin, c_replica_admin])

        # Setup replication and make sure that it works properly.
        await c_master.set("key", "value")
        await c_replica.execute_command("REPLICAOF", "localhost", master.port)
        await check_all_replicas_finished([c_replica], c_master)
        assert (await c_replica.get("key")) == "value"
        assert await c_replica.execute_command("dbsize") == 1

        # Tell the replica that it and the master no longer own any data, but don't tell that to the
        # master. This will allow us to set keys on the master and make sure that they are set in the
        # replica.

        replica_config = f"""
        [
          {{
            "slot_ranges": [],
            "master": {{
              "id": "{master_id}",
              "ip": "localhost",
              "port": {master.port}
            }},
            "replicas": [
              {{
                "id": "{replica_id}",
                "ip": "localhost",
                "port": {replica.port}
              }}
            ]
          }},
          {{
            "slot_ranges": [
              {{
                "start": 0,
                "end": 16383
              }}
            ],
            "master": {{
              "id": "non-existing-master",
              "ip": "localhost",
              "port": 1111
            }},
            "replicas": []
          }}
        ]
      """

        await push_config(replica_config, [c_replica_admin])

        # The replica should *not* have deleted the key.
        assert await c_replica.execute_command("dbsize") == 1

        # Set another key on the master, which it owns but the replica does not own.
        await c_master.set("key2", "value")
        await check_all_replicas_finished([c_replica], c_master)

        # See that the key exists in both replica and master
        assert await c_master.execute_command("dbsize") == 2
        assert await c_replica.execute_command("dbsize") == 2

        # The replica should still reply with MOVED, despite having that key.
        with pytest.raises((MovedError, aioredis.ResponseError)) as e:
            await c_replica.get("key2")
            assert False, "Should not be able to get key on non-owner cluster node"

        assert re.search(r"\d+ localhost:1111", e.value.args[0])

        await push_config(replica_config, [c_master_admin])
        await check_all_replicas_finished([c_replica], c_master)
        assert await c_master.execute_command("dbsize") == 0
        assert await c_replica.execute_command("dbsize") == 0


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_flush_slots_after_config_change(df_factory: DflyInstanceFactory):
    # Start and configure cluster with 1 master and 1 replica, both own all slots
    master = df_factory.create(port=next(next_port), admin_port=next(next_port))
    replica = df_factory.create(port=next(next_port), admin_port=next(next_port))
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_master_admin = master.admin_client()
    master_id = await get_node_id(c_master)

    c_replica = replica.client()
    c_replica_admin = replica.admin_client()
    replica_id = await get_node_id(c_replica)

    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{master_id}",
            "ip": "localhost",
            "port": {master.port}
          }},
          "replicas": [
            {{
              "id": "{replica_id}",
              "ip": "localhost",
              "port": {replica.port}
            }}
          ]
        }}
      ]
    """
    await push_config(config, [c_master_admin, c_replica_admin])

    await c_master.execute_command("debug", "populate", "100000")
    assert await c_master.execute_command("dbsize") == 100_000

    # Setup replication and make sure that it works properly.
    await c_replica.execute_command("REPLICAOF", "localhost", master.port)
    await check_all_replicas_finished([c_replica], c_master)
    assert await c_replica.execute_command("dbsize") == 100_000

    resp = await c_master_admin.execute_command("dflycluster", "getslotinfo", "slots", "0")
    assert resp[0][0] == 0
    slot_0_size = resp[0][2]
    print(f"Slot 0 size = {slot_0_size}")
    assert slot_0_size > 0

    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 1,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{master_id}",
            "ip": "localhost",
            "port": {master.port}
          }},
          "replicas": [
            {{
              "id": "{replica_id}",
              "ip": "localhost",
              "port": {replica.port}
            }}
          ]
        }},
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": 0
            }}
          ],
          "master": {{
            "id": "other-master",
            "ip": "localhost",
            "port": 9000
          }},
          "replicas": [
            {{
              "id": "other-replica",
              "ip": "localhost",
              "port": 9001
            }}
          ]
        }}
      ]
    """
    await push_config(config, [c_master_admin, c_replica_admin])

    await check_all_replicas_finished([c_replica], c_master)

    assert await c_master.execute_command("dbsize") == (100_000 - slot_0_size)
    assert await c_replica.execute_command("dbsize") == (100_000 - slot_0_size)


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "admin_port": next(next_port)})
async def test_cluster_blocking_command(df_server):
    c_master = df_server.client()
    c_master_admin = df_server.admin_client()

    config = [
        {
            "slot_ranges": [{"start": 0, "end": 8000}],
            "master": {"id": await get_node_id(c_master), "ip": "10.0.0.1", "port": 7000},
            "replicas": [],
        },
        {
            "slot_ranges": [{"start": 8001, "end": 16383}],
            "master": {"id": "other", "ip": "10.0.0.2", "port": 7000},
            "replicas": [],
        },
    ]

    assert (
        await c_master_admin.execute_command("DFLYCLUSTER", "CONFIG", json.dumps(config))
    ) == "OK"

    assert (await c_master.execute_command("CLUSTER", "KEYSLOT", "keep-local")) == 3479
    assert (await c_master.execute_command("CLUSTER", "KEYSLOT", "remove-key-4")) == 6103

    v1 = asyncio.create_task(c_master.blpop("keep-local", 2))
    v2 = asyncio.create_task(c_master.blpop("remove-key-4", 2))

    await asyncio.sleep(0.1)

    config[0]["slot_ranges"][0]["end"] = 5000
    config[1]["slot_ranges"][0]["start"] = 5001
    assert (
        await c_master_admin.execute_command("DFLYCLUSTER", "CONFIG", json.dumps(config))
    ) == "OK"

    await c_master.lpush("keep-local", "WORKS")

    assert (await v1) == ("keep-local", "WORKS")
    with pytest.raises(MovedError) as e_info:
        await v2


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_blocking_commands_cancel(df_factory, df_seeder_factory):
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    set_task = asyncio.create_task(nodes[0].client.execute_command("BZPOPMIN set1 0"))
    list_task = asyncio.create_task(nodes[0].client.execute_command("BLPOP list1 0"))

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.port, [(0, 16383)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    with pytest.raises(MovedError) as set_e_info:
        await set_task
    assert f"3037 127.0.0.1:{instances[1].port}" == str(set_e_info.value)

    with pytest.raises(MovedError) as list_e_info:
        await list_task
    assert f"7141 127.0.0.1:{instances[1].port}" == str(list_e_info.value)


@pytest.mark.parametrize("set_cluster_node_id", [True, False])
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_native_client(
    df_factory: DflyInstanceFactory,
    df_seeder_factory: DflySeederFactory,
    set_cluster_node_id: bool,
):
    # Start and configure cluster with 3 masters and 3 replicas
    masters = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            cluster_node_id=f"master{i}" if set_cluster_node_id else "",
        )
        for i in range(3)
    ]
    df_factory.start_all(masters)
    c_masters_admin = [master.admin_client() for master in masters]
    master_ids = await asyncio.gather(*(get_node_id(c) for c in c_masters_admin))

    replicas = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            cluster_node_id=f"replica{i}" if set_cluster_node_id else "",
            replicaof=f"localhost:{masters[i].port}",
        )
        for i in range(3)
    ]
    df_factory.start_all(replicas)
    c_replicas = [replica.client() for replica in replicas]
    await asyncio.gather(*(wait_available_async(c) for c in c_replicas))
    c_replicas_admin = [replica.admin_client() for replica in replicas]
    replica_ids = await asyncio.gather(*(get_node_id(c) for c in c_replicas))

    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": 5000
            }}
          ],
          "master": {{
            "id": "{master_ids[0]}",
            "ip": "localhost",
            "port": {masters[0].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[0]}",
                "ip": "localhost",
                "port": {replicas[0].port}
              }}
          ]
        }},
        {{
          "slot_ranges": [
            {{
              "start": 5001,
              "end": 10000
            }}
          ],
          "master": {{
            "id": "{master_ids[1]}",
            "ip": "localhost",
            "port": {masters[1].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[1]}",
                "ip": "localhost",
                "port": {replicas[1].port}
              }}
          ]
        }},
        {{
          "slot_ranges": [
            {{
              "start": 10001,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{master_ids[2]}",
            "ip": "localhost",
            "port": {masters[2].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[2]}",
                "ip": "localhost",
                "port": {replicas[2].port}
              }}
          ]
        }}
      ]
    """
    await push_config(config, c_masters_admin + c_replicas_admin)

    seeder = df_seeder_factory.create(port=masters[0].port, cluster_mode=True)
    await seeder.run(target_deviation=0.1)

    client = masters[0].cluster_client()

    assert await client.set("key0", "value") == True
    assert await client.get("key0") == "value"

    async def test_random_keys():
        for i in range(100):
            key = "key" + str(random.randint(0, 100_000))
            assert await client.set(key, "value") == True
            assert await client.get(key) == "value"

    await test_random_keys()

    for i in range(3):
        await check_all_replicas_finished([c_replicas[i]], c_masters_admin[i])

    await asyncio.gather(*(wait_available_async(c) for c in c_replicas))

    # Make sure that getting a value from a replica works as well.
    # We use connections directly to NOT follow 'MOVED' error, as that will redirect to the master.
    for c in c_replicas:
        try:
            assert await c.get("key0")
        except MovedError as e:
            pass

    # Push new config
    config = f"""
      [
        {{
          "slot_ranges": [
            {{
              "start": 0,
              "end": 4000
            }}
          ],
          "master": {{
            "id": "{master_ids[0]}",
            "ip": "localhost",
            "port": {masters[0].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[0]}",
                "ip": "localhost",
                "port": {replicas[0].port}
              }}
          ]
        }},
        {{
          "slot_ranges": [
            {{
              "start": 4001,
              "end": 14000
            }}
          ],
          "master": {{
            "id": "{master_ids[1]}",
            "ip": "localhost",
            "port": {masters[1].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[1]}",
                "ip": "localhost",
                "port": {replicas[1].port}
              }}
          ]
        }},
        {{
          "slot_ranges": [
            {{
              "start": 14001,
              "end": 16383
            }}
          ],
          "master": {{
            "id": "{master_ids[2]}",
            "ip": "localhost",
            "port": {masters[2].port}
          }},
          "replicas": [
              {{
                "id": "{replica_ids[2]}",
                "ip": "localhost",
                "port": {replicas[2].port}
              }}
          ]
        }}
      ]
    """
    await push_config(config, c_masters_admin + c_replicas_admin)

    await test_random_keys()


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_config_consistency(df_factory: DflyInstanceFactory):
    # Check slot migration from one node to another
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 5259)]
    nodes[1].slots = [(5260, 16383)]

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await check_for_no_state_status([node.admin_client for node in nodes])

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(5200, 5259)], nodes[1].id)
    )

    # Push config to source node. Migration will not start until target node gets the config as well.
    logging.debug("Push migration config to source node")
    await push_config(json.dumps(generate_config(nodes)), [nodes[0].admin_client])

    # some delay to check that migration isn't started until we send config to target node
    await asyncio.sleep(0.2)

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "CONNECTING")
    await check_for_no_state_status([nodes[1].admin_client])

    logging.debug("Push migration config to target node")
    await push_config(json.dumps(generate_config(nodes)), [nodes[1].admin_client])

    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    nodes[0].migrations = []
    nodes[0].slots = [(0, 5199)]
    nodes[1].slots = [(5200, 16383)]

    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await check_for_no_state_status([node.admin_client for node in nodes])


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_flushall_during_migration(
    df_factory: DflyInstanceFactory, df_seeder_factory
):
    # Check data migration from one node to another
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="cluster_family=2,outgoing_slot_migration=2,incoming_slot_migration=2,streamer=2,server_family=1",
        )
        for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    seeder = df_seeder_factory.create(keys=10_000, port=nodes[0].instance.port, cluster_mode=True)
    await seeder.run(target_deviation=0.1)

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16383)], nodes[1].id)
    )

    logging.debug("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await nodes[0].client.execute_command("flushall")

    status1 = await nodes[1].admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", nodes[0].id
    )
    assert (
        len(status1) == 0 or "FINISHED" not in status1[0]
    ), "Weak test case - finished migration too early"

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    logging.debug("Finalizing migration")
    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    logging.debug("Migration finalized")

    assert await nodes[0].client.dbsize() == 0

    # Push config that causes mass async slot deletion on nodes[1]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # Issue flushall right after pushing new config so it runs at the same time as disowned slots are flushed
    await nodes[1].client.execute_command("flushall")


@pytest.mark.parametrize("interrupt", [False, True])
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_data_migration(df_factory: DflyInstanceFactory, interrupt: bool):
    # Check data migration from one node to another
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2,streamer=2",
        )
        for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 9000)]
    nodes[1].slots = [(9001, 16383)]

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    for i in range(20):
        key = "KEY" + str(i)
        assert await nodes[key_slot(key) // 9001].client.set(key, "value")

    assert await nodes[0].client.execute_command("DBSIZE") == 10

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(3000, 9000)], nodes[1].id)
    )

    logging.debug("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    if interrupt:  # Test nodes properly shut down with pending migration
        await asyncio.sleep(random.random())

        # random instance
        stop = random.getrandbits(1)
        keep = 1 - stop

        nodes[stop].instance.stop()

        slots = await nodes[keep].admin_client.execute_command("CLUSTER SLOTS")
        slots.sort(key=lambda cfg: cfg[0])
        assert 0 in slots[0] and 9000 in slots[0]
        assert 9001 in slots[1] and 16383 in slots[1]

        return

    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED")

    for i in range(20, 22):
        key = "KEY" + str(i)
        assert await nodes[0 if (key_slot(key) // 3000) == 0 else 1].client.set(key, "value")

    status = await nodes[0].admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", nodes[1].id
    )
    status[0].pop()
    assert status[0] == ["out", nodes[1].id, "FINISHED", 7]

    status = await nodes[1].admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", nodes[0].id
    )
    status[0].pop()
    assert status[0] == ["in", nodes[0].id, "FINISHED", 7]

    nodes[0].migrations = []
    nodes[0].slots = [(0, 2999)]
    nodes[1].slots = [(3000, 16383)]
    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    for i in range(22):
        key = "KEY" + str(i)
        assert await nodes[0 if (key_slot(key) // 3000) == 0 else 1].client.set(key, "value")

    assert await nodes[1].client.execute_command("DBSIZE") == 19

    await check_for_no_state_status([node.admin_client for node in nodes])


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes", "cache_mode": "true"})
async def test_migration_with_key_ttl(df_factory):
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await nodes[0].client.execute_command("set k_with_ttl v1 EX 2")
    await nodes[0].client.execute_command("set k_without_ttl v2")
    await nodes[0].client.execute_command("set k_sticky v3")
    assert await nodes[0].client.execute_command("stick k_sticky") == 1

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", instances[1].port, [(0, 16383)], nodes[1].id)
    )
    logging.debug("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("finalize migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    assert await nodes[1].client.execute_command("get k_with_ttl") == "v1"
    assert await nodes[1].client.execute_command("get k_without_ttl") == "v2"
    assert await nodes[1].client.execute_command("get k_sticky") == "v3"
    assert await nodes[1].client.execute_command("ttl k_with_ttl") > 0
    assert await nodes[1].client.execute_command("ttl k_without_ttl") == -1
    assert await nodes[1].client.execute_command("stick k_sticky") == 0  # Sticky bit already set

    await asyncio.sleep(2)  # Force expiration

    assert await nodes[1].client.execute_command("get k_with_ttl") == None
    assert await nodes[1].client.execute_command("get k_without_ttl") == "v2"
    assert await nodes[1].client.execute_command("ttl k_with_ttl") == -2
    assert await nodes[1].client.execute_command("ttl k_without_ttl") == -1
    assert await nodes[1].client.execute_command("stick k_sticky") == 0


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "migration_finalization_timeout_ms": 5})
async def test_network_disconnect_during_migration(df_factory):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="cluster_family=9,outgoing_slot_migration=9,incoming_slot_migration=9",
        )
        for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await DebugPopulateSeeder(key_target=100000).run(nodes[0].client)
    start_capture = await DebugPopulateSeeder.capture(nodes[0].client)

    proxy = Proxy("127.0.0.1", next(next_port), "127.0.0.1", nodes[1].instance.admin_port)
    await proxy.start()
    task = asyncio.create_task(proxy.serve())

    nodes[0].migrations.append(MigrationInfo("127.0.0.1", proxy.port, [(0, 16383)], nodes[1].id))
    try:
        logging.debug("Start migration")
        await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

        for _ in range(10):
            await asyncio.sleep(random.randint(0, 50) / 100)
            info = await nodes[0].admin_client.info("CLUSTER")
            logging.debug("drop connection: %s", info)
            proxy.drop_connection()
            logging.debug(
                await nodes[0].admin_client.execute_command("DFLYCLUSTER", "SLOT-MIGRATION-STATUS")
            )

        await wait_for_status(nodes[0].admin_client, nodes[1].id, "SYNC", 20)
    finally:
        await proxy.close(task)

    await proxy.start()
    task = asyncio.create_task(proxy.serve())
    try:
        await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", 300)
        nodes[0].migrations = []
        nodes[0].slots = []
        nodes[1].slots = [(0, 16383)]
        logging.debug("remove finished migrations")
        await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

        assert (await DebugPopulateSeeder.capture(nodes[1].client)) == start_capture
    finally:
        await proxy.close(task)


@pytest.mark.parametrize(
    "node_count, segments, keys, huge_values, cache_mode",
    [
        pytest.param(3, 16, 20_000, 10, "false"),
        pytest.param(3, 16, 20_000, 10, "true"),
        # 1mb effectively disables breakdown of huge values.
        # TODO: add a test that mixes huge and small values, see
        # https://github.com/dragonflydb/dragonfly/pull/4144/files/11e5e387d31bcf1bc53dfbb28cf3bcaf094d77fa#r1850130930
        pytest.param(3, 16, 20_000, 1_000_000, "true"),
        pytest.param(3, 16, 20_000, 1_000_000, "false"),
        pytest.param(
            5, 20, 30_000, 1_000_000, "false", marks=[pytest.mark.large, pytest.mark.opt_only]
        ),
    ],
)
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_fuzzymigration(
    df_factory: DflyInstanceFactory,
    df_seeder_factory,
    node_count: int,
    segments: int,
    keys: int,
    huge_values: int,
    cache_mode: string,
):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2,streamer=2",
            serialization_max_chunk_size=huge_values,
            replication_stream_output_limit=10,
            cache_mode=cache_mode,
        )
        for i in range(node_count)
    ]
    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]

    # Generate equally sized ranges and distribute by nodes
    step = 16400 // segments
    for slot_range in [(s, min(s + step - 1, 16383)) for s in range(0, 16383, step)]:
        nodes[random.randint(0, node_count - 1)].slots.append(slot_range)

    # Push config to all nodes
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # Fill instances with some data
    seeder = df_seeder_factory.create(
        keys=keys, port=nodes[0].instance.port, cluster_mode=True, mirror_to_fake_redis=True
    )
    seed_task = asyncio.create_task(seeder.run())

    # Counter that pushes values to a list
    async def list_counter(key, client: aioredis.RedisCluster):
        try:
            for i in itertools.count(start=1):
                await client.lpush(key, i)
        except asyncio.exceptions.CancelledError:
            return

    # Start ten counters
    counter_keys = [f"_counter{i}" for i in range(10)]
    counter_connections = [nodes[0].instance.cluster_client() for _ in range(10)]
    counters = [
        asyncio.create_task(list_counter(key, conn))
        for key, conn in zip(counter_keys, counter_connections)
    ]

    # Generate migration plan
    for node_idx, node in enumerate(nodes):
        random.shuffle(node.slots)

        # Decide on number of outgoing slot ranges
        outgoing = [[] for _ in range(node_count)]
        num_outgoing = random.randint(0, len(node.slots))

        # Distribute first 0..num_outgoing
        for slot_range in node.slots[:num_outgoing]:
            dest_idx = random.randint(0, node_count - 1)
            while dest_idx == node_idx:
                dest_idx = random.randint(0, node_count - 1)
            outgoing[dest_idx].append(slot_range)

        for dest_idx, dest_slots in enumerate(outgoing):
            if len(dest_slots) == 0:
                continue

            print(node_idx, "migrates to", dest_idx, "slots", dest_slots)
            node.migrations.append(
                MigrationInfo(
                    ip="127.0.0.1",
                    port=nodes[dest_idx].instance.admin_port,
                    slots=dest_slots,
                    node_id=nodes[dest_idx].id,
                )
            )

    logging.debug("start migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("finish migrations")

    async def all_finished():
        res = True
        for node in nodes:
            states = await node.admin_client.execute_command("DFLYCLUSTER", "SLOT-MIGRATION-STATUS")
            logging.debug(states)
            for state in states:
                direction, node_id, st, _, _ = state
                if direction == "out":
                    if st == "FINISHED":
                        m_id = [id for id, x in enumerate(node.migrations) if x.node_id == node_id][
                            0
                        ]
                        node.slots = [s for s in node.slots if s not in node.migrations[m_id].slots]
                        target_node = [n for n in nodes if n.id == node_id][0]
                        target_node.slots.extend(node.migrations[m_id].slots)
                        print(
                            "FINISH migration",
                            node.id,
                            ":",
                            node.migrations[m_id].node_id,
                            " slots:",
                            node.migrations[m_id].slots,
                        )
                        node.migrations.pop(m_id)
                        await push_config(
                            json.dumps(generate_config(nodes)),
                            [node.admin_client for node in nodes],
                        )
                    else:
                        res = False
        return res

    @assert_eventually(times=600)
    async def test_all_finished():
        assert await all_finished()

    await test_all_finished()

    for counter in counters:
        counter.cancel()
        await counter

    # Check counter consistency
    cluster_client = nodes[0].instance.cluster_client()
    for key in counter_keys:
        counter_list = await cluster_client.lrange(key, 0, -1)
        for i, j in zip(counter_list, counter_list[1:]):
            assert int(i) == int(j) + 1, f"Found inconsistent list in {key}: {counter_list}"

    # Compare to fake redis, capture ignores counter keys
    seeder.stop()
    await seed_task
    fake_capture = await seeder.capture_fake_redis()

    assert await seeder.compare(fake_capture, nodes[0].instance.port)

    await asyncio.gather(*[c.aclose() for c in counter_connections])


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_config_reapply(df_factory: DflyInstanceFactory):
    """Check data migration from one node to another."""
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(instance) for instance in instances]
    nodes[0].slots = [(0, 8000)]
    nodes[1].slots = [(8001, 16383)]

    logging.debug("Pushing data to slot 6XXX")
    SIZE = 10_000
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    for i in range(SIZE):
        assert await nodes[0].admin_client.set(f"{{key50}}:{i}", i)  # key50 belongs to slot 6686
    assert [SIZE, 0] == [await node.admin_client.dbsize() for node in nodes]

    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(6000, 8000)], nodes[1].id)
    ]
    logging.debug("Migrating slots 6000-8000")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    assert [SIZE, SIZE] == [await node.client.dbsize() for node in nodes]

    logging.debug("Reapply config with migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await asyncio.sleep(0.1)
    assert [SIZE, SIZE] == [await node.client.dbsize() for node in nodes]

    logging.debug("Finalizing migration")
    nodes[0].migrations = []
    nodes[0].slots = [(0, 6000)]
    nodes[1].slots = [(6001, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    logging.debug("Migration finalized")

    await asyncio.sleep(1)
    assert [0, SIZE] == [await node.client.dbsize() for node in nodes]

    for i in range(SIZE):
        assert str(i) == await nodes[1].client.get(f"{{key50}}:{i}")


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_cluster_replication_migration(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    """
    Test replication with migration. Create the following setup:

    master_1 -> replica_1, master_2 -> replica_2

    with each master owning half the slots. Let them then fully exchange their slots
    and make sure the captures on the replicas are equal.
    """
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(4)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(n) for n in instances]
    m1_node, r1_node, m2_node, r2_node = nodes
    master_nodes = [m1_node, m2_node]

    # divide node slots by half
    m1_node.slots = [(0, 8000)]
    m1_node.replicas = [r1_node]
    m2_node.slots = [(8001, 16383)]
    m2_node.replicas = [r2_node]

    logging.debug("Push initial config")
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    logging.debug("create data")
    seeder = df_seeder_factory.create(
        keys=2000, port=m1_node.instance.port, cluster_mode=True, mirror_to_fake_redis=True
    )
    seed = asyncio.create_task(seeder.run())

    logging.debug("start replication")
    await r1_node.admin_client.execute_command(f"replicaof localhost {m1_node.instance.port}")
    await r2_node.admin_client.execute_command(f"replicaof localhost {m2_node.instance.port}")

    await wait_available_async(r1_node.admin_client)
    await wait_available_async(r2_node.admin_client)

    logging.debug("start migration")
    m1_node.migrations = [
        MigrationInfo("127.0.0.1", m2_node.instance.admin_port, [(0, 8000)], m2_node.id)
    ]
    m2_node.migrations = [
        MigrationInfo("127.0.0.1", m1_node.instance.admin_port, [(8001, 16383)], m1_node.id)
    ]
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    await wait_for_status(m1_node.admin_client, m2_node.id, "FINISHED")
    await wait_for_status(m2_node.admin_client, m1_node.id, "FINISHED")

    logging.debug("finish migration")
    m1_node.migrations = []
    m1_node.slots = [(8001, 16383)]
    m2_node.migrations = []
    m2_node.slots = [(0, 8000)]

    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    # wait for replicas to catch up
    await asyncio.sleep(2)

    # ensure captures got exchanged
    seeder.stop()
    await seed
    fake_capture = await seeder.capture_fake_redis()
    assert await seeder.compare(fake_capture, r1_node.instance.port)


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "pause_wait_timeout": 10})
async def test_start_replication_during_migration(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    """
    Test replication with migration. Create the following setup:

    master_1 do migration to master_2 and we start replication for master_1 during this migration

    in the end master_1 and replica_1 should have the same data
    """
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(3)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(n) for n in instances]
    m1_node, r1_node, m2_node = nodes
    master_nodes = [m1_node, m2_node]

    m1_node.slots = [(0, 16383)]
    m1_node.replicas = [r1_node]
    m2_node.slots = []

    logging.debug("Push initial config")
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    logging.debug("create data")
    seeder = df_seeder_factory.create(
        keys=10000, port=nodes[0].instance.port, cluster_mode=True, mirror_to_fake_redis=True
    )
    seed = asyncio.create_task(seeder.run())

    logging.debug("start migration")
    m1_node.migrations = [
        MigrationInfo("127.0.0.1", m2_node.instance.admin_port, [(2001, 16383)], m2_node.id)
    ]
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    logging.debug("start replication")
    await r1_node.admin_client.execute_command(f"replicaof localhost {m1_node.instance.port}")

    await wait_available_async(r1_node.admin_client)

    await wait_for_status(m1_node.admin_client, m2_node.id, "FINISHED")

    logging.debug("finish migration")
    m1_node.migrations = []
    m1_node.slots = [(0, 2000)]
    m2_node.migrations = []
    m2_node.slots = [(2001, 16383)]

    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    await check_all_replicas_finished([r1_node.client], m1_node.client)

    seeder.stop()
    await seed
    fake_capture = await seeder.capture_fake_redis()
    assert await seeder.compare(fake_capture, r1_node.instance.port)


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_keys_expiration_during_migration(df_factory: DflyInstanceFactory):
    # Check data migration from one node to another with expiration
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Start seeder")
    await nodes[0].client.execute_command("debug", "populate", "100", "foo", "100", "RAND")

    capture_before = await DebugPopulateSeeder.capture(nodes[0].client)

    seeder = ExpirySeeder(timeout=4)
    seeder_task = asyncio.create_task(seeder.run(nodes[0].client))
    await seeder.wait_until_n_inserts(500)

    logging.debug("Start migration")
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16383)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED")

    logging.debug("Stop seeders")
    seeder.stop()
    await seeder_task

    logging.debug("finish migration")
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # wait to expire all keys
    await asyncio.sleep(5)

    assert await DebugPopulateSeeder.capture(nodes[1].client) == capture_before

    stats = await nodes[1].client.info("STATS")
    assert stats["expired_keys"] > 0


@pytest.mark.parametrize("migration_first", [False, True])
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_snapshoting_during_migration(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory, migration_first: bool
):
    """
    Test saving snapshot during migration. Create the following setups:

    1) Start saving and then run migration simultaneously
    2) Run migration and start saving simultaneously

    The result should be the same: snapshot contains all the data that existed before migration
    """
    dbfilename = f"snap_{tmp_file_name()}"
    instances = [
        df_factory.create(
            dbfilename=dbfilename if i == 0 else "",
            port=next(next_port),
            admin_port=next(next_port),
        )
        for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(n) for n in instances]

    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    logging.debug("Push initial config")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("create data")
    seeder = df_seeder_factory.create(
        keys=10000, port=nodes[0].instance.port, cluster_mode=True, mirror_to_fake_redis=True
    )
    seed = asyncio.create_task(seeder.run())

    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16383)], nodes[1].id)
    ]

    async def start_migration():
        logging.debug("start migration")
        await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    async def start_save():
        logging.debug("BGSAVE")
        await nodes[0].client.execute_command(f"BGSAVE")

    if migration_first:
        await start_migration()
        await asyncio.sleep(random.randint(0, 10) / 100)
        await start_save()
    else:
        await start_save()
        await asyncio.sleep(random.randint(0, 10) / 100)
        await start_migration()

    logging.debug("wait for snapshot")
    while await is_saving(nodes[0].client):
        await asyncio.sleep(0.1)

    logging.debug("wait migration finish")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    logging.debug("finish migration")
    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].migrations = []
    nodes[1].slots = [(0, 16383)]

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    seeder.stop()
    await seed
    fake_capture = await seeder.capture_fake_redis()
    assert await seeder.compare(fake_capture, nodes[1].instance.port)

    await nodes[1].client.execute_command(
        "DFLY",
        "LOAD",
        f"{dbfilename}-summary.dfs",
    )

    # TODO: We can't compare the post-loaded data as is, because it might have changed by now.
    # We can try to use FakeRedis with the DebugPopulateSeeder comparison here.


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
@pytest.mark.asyncio
async def test_cluster_migration_cancel(df_factory: DflyInstanceFactory):
    """Check data migration from one node to another."""
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(instance) for instance in instances]
    nodes[0].slots = [(0, 8000)]
    nodes[1].slots = [(8001, 16383)]

    logging.debug("Pushing data to slot 6XXX")
    SIZE = 10_000
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    for i in range(SIZE):
        assert await nodes[0].client.set(f"{{key50}}:{i}", i)  # key50 belongs to slot 6686
    assert [SIZE, 0] == [await node.client.dbsize() for node in nodes]

    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(6000, 8000)], nodes[1].id)
    ]
    logging.debug("Migrating slots 6000-8000")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Cancelling migration")
    nodes[0].migrations = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    assert SIZE == await nodes[0].client.dbsize()

    @assert_eventually
    async def node1size0():
        if await nodes[1].client.dbsize() != 0:
            logging.debug(await nodes[1].client.execute_command("keys *"))
            assert False

    await node1size0()

    logging.debug("Reissuing migration")
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(6001, 8000)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")
    assert [SIZE, SIZE] == [await node.client.dbsize() for node in nodes]

    logging.debug("Finalizing migration")
    nodes[0].migrations = []
    nodes[0].slots = [(0, 6000)]
    nodes[1].slots = [(6001, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
    logging.debug("Migration finalized")

    while 0 != await nodes[0].client.dbsize():
        logging.debug(f"wait until source dbsize is empty")
        await asyncio.sleep(0.1)

    for i in range(SIZE):
        assert str(i) == await nodes[1].client.get(f"{{key50}}:{i}")


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
@pytest.mark.asyncio
@pytest.mark.opt_only
@pytest.mark.exclude_epoll
async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory):
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(instance) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Generating huge containers")
    seeder = DebugPopulateSeeder(
        key_target=100,
        data_size=10_000_000,
        collection_size=10_000,
        variance=1,
        samples=1,
        types=["LIST", "HASH", "SET", "ZSET", "STREAM", "STRING"],
    )
    await seeder.run(nodes[0].client)
    source_data = await DebugPopulateSeeder.capture(nodes[0].client)

    mem_before = await get_memory(nodes[0].client, "used_memory_rss")

    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id)
    ]
    logging.debug("Migrating slots")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Waiting for migration to finish")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=300)

    target_data = await DebugPopulateSeeder.capture(nodes[1].client)
    assert source_data == target_data

    # Get peak memory, because migration removes the data
    mem_after = await get_memory(nodes[0].client, "used_memory_peak_rss")
    logging.debug(f"Memory before {mem_before} after {mem_after}")
    assert mem_after < mem_before * 1.1

    line = stop_and_get_restore_log(nodes[0].instance)

    # 'with X commands' - how many breakdowns we used for the keys
    assert extract_int_after_prefix("with ", line) > 500_000

    assert extract_int_after_prefix("Keys skipped ", line) == 0
    assert extract_int_after_prefix("buckets skipped ", line) == 0
    assert extract_int_after_prefix("keys written ", line) > 90

    # We don't send updates during the migration
    assert extract_int_after_prefix("buckets on_db_update ", line) == 0


@dfly_args(
    {"proactor_threads": 2, "cluster_mode": "yes", "migration_buckets_serialization_threshold": 1}
)
@pytest.mark.large
@pytest.mark.parametrize("chunk_size", [1_000_000, 30])
@pytest.mark.asyncio
@pytest.mark.exclude_epoll
async def test_cluster_migration_while_seeding(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory, chunk_size
):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            serialization_max_chunk_size=chunk_size,
        )
        for _ in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(instance) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []
    client0 = nodes[0].client

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Seeding cluster")
    seeder = df_seeder_factory.create(
        keys=20_000, port=instances[0].port, cluster_mode=True, mirror_to_fake_redis=True
    )
    await seeder.run(target_deviation=0.1)

    seed = asyncio.create_task(seeder.run())
    await asyncio.sleep(1)

    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id)
    ]
    logging.debug("Migrating slots")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Waiting for migration to finish")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=300)
    logging.debug("Migration finished")

    logging.debug("Finalizing migration")
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await asyncio.sleep(1)  # Let seeder feed dest before migration finishes

    seeder.stop()
    await seed
    logging.debug("Seeding finished")

    assert (
        await get_memory(client0, "used_memory_peak_rss")
        < await get_memory(client0, "used_memory_rss") * 1.2
    )

    capture = await seeder.capture_fake_redis()
    assert await seeder.compare(capture, instances[1].port)

    line = stop_and_get_restore_log(nodes[0].instance)
    assert extract_int_after_prefix("Keys skipped ", line) == 0
    assert extract_int_after_prefix("buckets skipped ", line) > 0
    assert extract_int_after_prefix("keys written ", line) >= 15_000
    # buckets on_db_update can be 0 once in a while because we can not predict keys distribution during migration
    assert extract_int_after_prefix("buckets on_db_update ", line) > 0


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
@pytest.mark.asyncio
async def test_cluster_migrations_sequence(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for _ in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(instance) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Seeding cluster")
    seeder = df_seeder_factory.create(
        keys=10_000, port=instances[0].port, cluster_mode=True, mirror_to_fake_redis=True
    )
    await seeder.run(target_deviation=0.1)

    seed = asyncio.create_task(seeder.run())
    await asyncio.sleep(1)

    slot_step = 500
    nodes[0].migrations = [
        MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, slot_step - 1)], nodes[1].id)
    ]
    logging.debug("Migrating slots")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    for i in range(slot_step, 16301, slot_step):
        logging.debug("Waiting for migration to finish")
        await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=10)

        nodes[0].slots = [(i, 16383)]
        nodes[1].slots = [(0, i - 1)]
        end_slot = min(i + slot_step - 1, 16383)
        nodes[0].migrations = [
            MigrationInfo("127.0.0.1", instances[1].admin_port, [(i, end_slot)], nodes[1].id)
        ]

        await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Waiting for migration to finish")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=10)
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Finalizing migration")
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    nodes[0].migrations = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("stop seeding")
    seeder.stop()
    await seed

    capture = await seeder.capture_fake_redis()
    assert await seeder.compare(capture, instances[1].port)


def parse_lag(replication_info: str):
    lags = re.findall("lag=([0-9]+)\r\n", replication_info)
    assert len(lags) == 1
    return int(lags[0])


async def await_no_lag(client: aioredis.Redis, timeout=10):
    start = time.time()
    while (time.time() - start) < timeout:
        lag = parse_lag(await client.execute_command("info replication"))
        print("current lag =", lag)
        if lag == 0:
            return
        await asyncio.sleep(0.05)

    raise RuntimeError("Lag did not reduced to 0!")


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 4})
async def test_replicate_cluster(df_factory: DflyInstanceFactory, df_seeder_factory):
    """
    Create dragonfly cluster of 2 nodes.
    Create additional dragonfly server in emulated mode.
    Replicate the dragonfly cluster into a single dragonfly node.
    Send traffic before replication start and while replicating.
    Promote the replica to master and check data consistency between cluster and single node.
    """
    replica = df_factory.create(admin_port=next(next_port), cluster_mode="emulated")
    cluster_nodes = [
        df_factory.create(admin_port=next(next_port), cluster_mode="yes") for i in range(2)
    ]

    # Start instances and connect clients
    df_factory.start_all(cluster_nodes + [replica])
    c_nodes = [node.client() for node in cluster_nodes]

    c_replica = replica.client()

    node_ids = await asyncio.gather(*(get_node_id(c) for c in c_nodes))
    config = f"""
      [
        {{
          "slot_ranges": [ {{ "start": 0, "end": LAST_SLOT_CUTOFF }} ],
          "master": {{ "id": "{node_ids[0]}", "ip": "localhost", "port": {cluster_nodes[0].port} }},
          "replicas": []
        }},
        {{
          "slot_ranges": [ {{ "start": NEXT_SLOT_CUTOFF, "end": 16383 }} ],
          "master": {{ "id": "{node_ids[1]}", "ip": "localhost", "port": {cluster_nodes[1].port} }},
          "replicas": []
        }}
      ]
    """

    await push_config(
        config.replace("LAST_SLOT_CUTOFF", "5259").replace("NEXT_SLOT_CUTOFF", "5260"),
        c_nodes,
    )

    # Fill instances with some data
    seeder = df_seeder_factory.create(
        keys=2000, port=cluster_nodes[0].port, cluster_mode=True, mirror_to_fake_redis=True
    )
    await seeder.run(target_deviation=0.1)

    fill_task = asyncio.create_task(seeder.run())

    # Start replication
    await c_replica.execute_command("REPLICAOF localhost " + str(cluster_nodes[0].port) + " 0 5259")
    await c_replica.execute_command(
        "ADDREPLICAOF localhost " + str(cluster_nodes[1].port) + " 5260 16383"
    )

    # give seeder time to run.
    await asyncio.sleep(1.0)
    # Stop seeder
    seeder.stop()
    await fill_task

    # wait for replication to finish
    await asyncio.gather(*(asyncio.create_task(await_no_lag(c)) for c in c_nodes))

    # promote replica to master and compare data
    await c_replica.execute_command("REPLICAOF NO ONE")
    capture = await seeder.capture()
    assert await seeder.compare(capture, replica.port)
    fake_capture = await seeder.capture_fake_redis()
    assert await seeder.compare(fake_capture, replica.port)


async def await_stable_sync(m_client: aioredis.Redis, replica_port, timeout=10):
    start = time.time()

    async def is_stable():
        role = await m_client.execute_command("role")
        return role == [
            "master",
            [["127.0.0.1", str(replica_port), "online"]],
        ]

    while (time.time() - start) < timeout:
        if await is_stable():
            return
        await asyncio.sleep(0.05)

    raise RuntimeError("Failed to reach stable sync")


@dfly_args({"proactor_threads": 4})
async def test_replicate_disconnect_cluster(df_factory: DflyInstanceFactory, df_seeder_factory):
    """
    Create dragonfly cluster of 2 nodes and additional dragonfly server in emulated mode.
    Populate the cluster with data
    Replicate the dragonfly cluster into a single dragonfly node and wait for stable sync
    Break connection between cluster node 0 and replica and reconnect
    Promote replica to master
    Compare cluster data and replica data
    """
    replica = df_factory.create(admin_port=next(next_port), cluster_mode="emulated")
    cluster_nodes = [
        df_factory.create(admin_port=next(next_port), cluster_mode="yes") for i in range(2)
    ]

    # Start instances and connect clients
    df_factory.start_all(cluster_nodes + [replica])
    c_nodes = [node.client() for node in cluster_nodes]

    c_replica = replica.client()

    node_ids = await asyncio.gather(*(get_node_id(c) for c in c_nodes))
    config = f"""
      [
        {{
          "slot_ranges": [ {{ "start": 0, "end": LAST_SLOT_CUTOFF }} ],
          "master": {{ "id": "{node_ids[0]}", "ip": "localhost", "port": {cluster_nodes[0].port} }},
          "replicas": []
        }},
        {{
          "slot_ranges": [ {{ "start": NEXT_SLOT_CUTOFF, "end": 16383 }} ],
          "master": {{ "id": "{node_ids[1]}", "ip": "localhost", "port": {cluster_nodes[1].port} }},
          "replicas": []
        }}
      ]
    """

    await push_config(
        config.replace("LAST_SLOT_CUTOFF", "5259").replace("NEXT_SLOT_CUTOFF", "5260"),
        c_nodes,
    )

    # Fill instances with some data
    seeder = df_seeder_factory.create(
        keys=2000, port=cluster_nodes[0].port, cluster_mode=True, mirror_to_fake_redis=True
    )
    await seeder.run(target_deviation=0.1)

    fill_task = asyncio.create_task(seeder.run())

    proxy = Proxy("127.0.0.1", next(next_port), "127.0.0.1", cluster_nodes[0].port)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # Start replication
    await c_replica.execute_command("REPLICAOF localhost " + str(proxy.port) + " 0 5259")
    await c_replica.execute_command(
        "ADDREPLICAOF localhost " + str(cluster_nodes[1].port) + " 5260 16383"
    )

    # wait for replication to reach stable state on all nodes
    await asyncio.gather(
        *(asyncio.create_task(await_stable_sync(c, replica.port)) for c in c_nodes)
    )

    # break connection between first node and replica
    await proxy.close(proxy_task)
    await asyncio.sleep(3)

    async def is_first_master_conn_down(conn):
        info = await conn.execute_command("INFO REPLICATION")
        print(info)
        statuses = re.findall("master_link_status:(down|up)\r\n", info)
        assert len(statuses) == 2
        assert statuses[0] == "down"
        assert statuses[1] == "up"

    await is_first_master_conn_down(c_replica)

    # start connection again
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    seeder.stop()
    await fill_task

    # wait for stable sync on first master
    await await_stable_sync(c_nodes[0], replica.port)
    # wait for no lag on all cluster nodes
    await asyncio.gather(*(asyncio.create_task(await_no_lag(c)) for c in c_nodes))

    # promote replica to master and compare data
    await c_replica.execute_command("REPLICAOF NO ONE")
    capture = await seeder.capture()
    assert await seeder.compare(capture, replica.port)
    fake_capture = await seeder.capture_fake_redis()
    assert await seeder.compare(fake_capture, replica.port)

    await proxy.close(proxy_task)


def is_offset_eq_master_repl_offset(replication_info: str):
    offset = re.findall("offset=([0-9]+),", replication_info)
    assert len(offset) == 1
    master_repl_offset = re.findall("master_repl_offset:([0-9]+)\r\n", replication_info)
    assert len(master_repl_offset) == 1
    return int(offset[0]) == int(master_repl_offset[0])


async def await_eq_offset(client: aioredis.Redis, timeout=20):
    start = time.time()
    while (time.time() - start) < timeout:
        if is_offset_eq_master_repl_offset(await client.execute_command("info replication")):
            return
        await asyncio.sleep(0.05)

    raise RuntimeError("offset not equal!")


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 4})
async def test_replicate_redis_cluster(redis_cluster, df_factory, df_seeder_factory):
    """
    Create redis cluster of 3 nodes.
    Create dragonfly server in emulated mode.
    Replicate the redis cluster into a single dragonfly node.
    Send traffic before replication start and while replicating.
    Promote the replica to master and check data consistency between cluster and single dragonfly node.
    """
    replica = df_factory.create(admin_port=next(next_port), cluster_mode="emulated")

    # Start instances and connect clients
    df_factory.start_all([replica])

    redis_cluster_nodes = redis_cluster
    node_clients = [
        aioredis.Redis(decode_responses=True, host="localhost", port=node.port)
        for node in redis_cluster_nodes
    ]

    c_replica = replica.client()

    seeder = df_seeder_factory.create(
        keys=2000, port=redis_cluster_nodes[0].port, cluster_mode=True
    )
    await seeder.run(target_deviation=0.1)

    fill_task = asyncio.create_task(seeder.run())

    # Start replication
    await c_replica.execute_command(
        "REPLICAOF localhost " + str(redis_cluster_nodes[0].port) + " 0 5460"
    )
    await asyncio.sleep(0.5)
    await c_replica.execute_command(
        "ADDREPLICAOF localhost " + str(redis_cluster_nodes[1].port) + " 5461 10922"
    )
    await asyncio.sleep(0.5)
    await c_replica.execute_command(
        "ADDREPLICAOF localhost " + str(redis_cluster_nodes[2].port) + " 10923 16383"
    )

    # give seeder time to run.
    await asyncio.sleep(0.5)
    # Stop seeder
    seeder.stop()
    await fill_task

    # wait for replication to finish
    await asyncio.gather(*(asyncio.create_task(await_eq_offset(client)) for client in node_clients))

    await c_replica.execute_command("REPLICAOF NO ONE")
    capture = await seeder.capture()
    assert await seeder.compare(capture, replica.port)


@dfly_args({"proactor_threads": 4, "pause_wait_timeout": 10})
async def test_replicate_disconnect_redis_cluster(redis_cluster, df_factory, df_seeder_factory):
    """
    Create redis cluster of 3 nodes.
    Create dragonfly server in emulated mode.
    Replicate the redis cluster into a single dragonfly node.
    Send traffic before replication start and while replicating.
    Close connection between dfly replica and one of master nodes and reconnect
    Send more traffic
    Promote the replica to master and check data consistency between cluster and single dragonfly node.
    """
    replica = df_factory.create(admin_port=next(next_port), cluster_mode="emulated")

    # Start instances and connect clients
    df_factory.start_all([replica])

    redis_cluster_nodes = redis_cluster
    node_clients = [
        aioredis.Redis(decode_responses=True, host="localhost", port=node.port)
        for node in redis_cluster_nodes
    ]

    c_replica = replica.client()

    seeder = df_seeder_factory.create(
        keys=1000, port=redis_cluster_nodes[0].port, cluster_mode=True
    )
    await seeder.run(target_deviation=0.1)

    fill_task = asyncio.create_task(seeder.run())

    proxy = Proxy("127.0.0.1", next(next_port), "127.0.0.1", redis_cluster_nodes[1].port)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # Start replication
    await c_replica.execute_command(
        "REPLICAOF localhost " + str(redis_cluster_nodes[0].port) + " 0 5460"
    )
    await c_replica.execute_command("ADDREPLICAOF localhost " + str(proxy.port) + " 5461 10922")
    await c_replica.execute_command(
        "ADDREPLICAOF localhost " + str(redis_cluster_nodes[2].port) + " 10923 16383"
    )

    # give seeder time to run.
    await asyncio.sleep(1)

    # break connection between second node and replica
    await proxy.close(proxy_task)
    await asyncio.sleep(3)

    # check second node connection is down
    info = await c_replica.execute_command("INFO REPLICATION")
    statuses = re.findall("master_link_status:(down|up)\r\n", info)
    assert len(statuses) == 3
    assert statuses[0] == "up"
    assert statuses[1] == "down"
    assert statuses[2] == "up"

    # start connection again
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # give seeder more time to run
    await asyncio.sleep(1)

    # check second node connection is up
    info = await c_replica.execute_command("INFO REPLICATION")
    statuses = re.findall("master_link_status:(down|up)\r\n", info)
    assert len(statuses) == 3
    assert statuses[0] == "up"
    assert statuses[1] == "up"
    assert statuses[2] == "up"

    # give seeder time to run.
    await asyncio.sleep(1)

    # Stop seeder
    seeder.stop()
    await fill_task

    # wait for replication to finish
    await asyncio.gather(*(asyncio.create_task(await_eq_offset(client)) for client in node_clients))

    await c_replica.execute_command("REPLICAOF NO ONE")
    capture = await seeder.capture()
    assert await seeder.compare(capture, replica.port)
    await proxy.close(proxy_task)


@pytest.mark.large
@dfly_args({"cluster_mode": "yes"})
async def test_cluster_memory_consumption_migration(df_factory: DflyInstanceFactory):
    # Check data migration from one node to another
    instances = [
        df_factory.create(
            maxmemory="15G",
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="streamer=2",
        )
        for i in range(3)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    for i in range(1, len(instances)):
        nodes[i].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await nodes[0].client.execute_command("DEBUG POPULATE 5000000 test 1000 RAND SLOTS 0 16383")

    await asyncio.sleep(2)

    migration_nodes = len(instances) - 1
    slot_step = 16384 // migration_nodes
    ranges = []
    for i in range(0, migration_nodes):
        ranges.append(i * slot_step)
    ranges.append(16384)

    for i in range(1, len(instances)):
        nodes[0].migrations.append(
            MigrationInfo(
                "127.0.0.1",
                nodes[i].instance.admin_port,
                [(ranges[i - 1], ranges[i] - 1)],
                nodes[i].id,
            )
        )

    logging.debug("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED", 1000)

    nodes[0].migrations = []
    nodes[0].slots = []
    for i in range(1, len(instances)):
        nodes[i].slots = [(ranges[i - 1], ranges[i] - 1)]
    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await check_for_no_state_status([node.admin_client for node in nodes])


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.asyncio
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "migration_buckets_cpu_budget": 1})
async def test_migration_timeout_on_sync(df_factory: DflyInstanceFactory, df_seeder_factory):
    # Timeout set to 3 seconds because we must first saturate the socket before we get the timeout
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            replication_timeout=3000,
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2",
        )
        for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("source node DEBUG POPULATE")

    await DebugPopulateSeeder(key_target=300000, data_size=1000).run(nodes[0].client)

    # we use this seeder to saturate the pending_buf_ in streamer
    seeder = df_seeder_factory.create(port=nodes[0].instance.port, cluster_mode=True)
    fill_task = asyncio.create_task(seeder.run())

    logging.debug("Start migration")
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16383)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await asyncio.sleep(random.randint(0, 50) / 100)
    # to pause migration we need to be in sync state
    await wait_for_status(nodes[1].admin_client, nodes[0].id, "SYNC", 1000)

    logging.debug("debug migration pause")
    await nodes[1].client.execute_command("debug migration pause")

    await wait_for_error(
        nodes[0].admin_client, nodes[1].id, "JournalStreamer write operation timeout", 30
    )

    logging.debug("debug migration resume")
    await nodes[1].client.execute_command("debug migration resume")

    # Stop seeder
    seeder.stop()
    await fill_task

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", 300)
    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED")

    with pytest.raises(MovedError) as e_info:
        await nodes[0].client.get("x")

    assert f"16287 127.0.0.1:{instances[1].port}" == str(e_info.value)

    nodes[0].migrations = []
    # cancel migration for the source node to get the original data from it
    await push_config(json.dumps(generate_config(nodes)), [nodes[0].admin_client])

    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    # finish migration for the target node to get the migrated data from it
    await push_config(json.dumps(generate_config(nodes)), [nodes[1].admin_client])

    source_capture = await DebugPopulateSeeder.capture(nodes[0].client)
    assert (await DebugPopulateSeeder.capture(nodes[1].client)) == source_capture


"""
Test cluster node distributing its slots into 2 other nodes.
In this test we start migrating to the second node only after the first one finished to
reproduce the bug found in issue #4455
"""


@pytest.mark.asyncio
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_migration_one_after_another(df_factory: DflyInstanceFactory, df_seeder_factory):
    # 1. Create cluster of 3 nodes with all slots allocated to first node.
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2,streamer=2",
        )
        for i in range(3)
    ]
    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []
    nodes[2].slots = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("DEBUG POPULATE first node")
    key_num = 100000
    await DebugPopulateSeeder(key_target=key_num, data_size=100).run(nodes[0].client)
    dbsize_node0 = await nodes[0].client.dbsize()
    assert dbsize_node0 > (key_num * 0.95)

    # 2. Start migrating part of the slots from first node to second
    logging.debug("Start first migration")
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16300)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # 3. Wait for migratin finish
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=50)
    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED", timeout=50)

    nodes[0].migrations = []
    nodes[0].slots = [(16301, 16383)]
    nodes[1].slots = [(0, 16300)]
    nodes[2].slots = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # 4. Start migrating remaind slots from first node to third node
    logging.debug("Start second migration")
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[2].instance.admin_port, [(16301, 16383)], nodes[2].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # 5. Wait for migratin finish
    await wait_for_status(nodes[0].admin_client, nodes[2].id, "FINISHED", timeout=10)
    await wait_for_status(nodes[2].admin_client, nodes[0].id, "FINISHED", timeout=10)

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16300)]
    nodes[2].slots = [(16301, 16383)]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # 6. Check all data was migrated
    # Using dbsize to check all the data was migrated to the other nodes.
    # Note: we can not use the seeder capture as we migrate the data to 2 different nodes.
    # TODO: improve the migration conrrectness by running the seeder capture on slot range (requiers changes in capture script).
    dbsize_node1 = await nodes[1].client.dbsize()
    dbsize_node2 = await nodes[2].client.dbsize()
    assert dbsize_node1 + dbsize_node2 == dbsize_node0
    assert dbsize_node2 > 0 and dbsize_node1 > 0


"""
Test cluster node distributing its slots into 3 other nodes.
In this test we randomize the slot ranges that are migrated to each node
For each migration we start migration, wait for it to finish and once it is finished we send migration finalization config
"""


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.asyncio
@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "pause_wait_timeout": 10})
async def test_migration_rebalance_node(df_factory: DflyInstanceFactory, df_seeder_factory):
    # 1. Create cluster of 3 nodes with all slots allocated to first node.
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2,streamer=2",
        )
        for i in range(4)
    ]
    df_factory.start_all(instances)

    def create_random_ranges():
        # Generate 2 random breakpoints within the range
        breakpoints = sorted(random.sample(range(1, 16382), 2))
        ranges = [
            (0, breakpoints[0] - 1),
            (breakpoints[0], breakpoints[1] - 1),
            (breakpoints[1], 16383),
        ]
        return ranges

    # Create 3 random ranges from 0 to 16383
    random_ranges = create_random_ranges()

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = random_ranges
    nodes[1].slots = []
    nodes[2].slots = []
    nodes[3].slots = []
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    key_num = 100000
    logging.debug(f"DEBUG POPULATE first node with number of keys: {key_num}")
    await DebugPopulateSeeder(key_target=key_num, data_size=100).run(nodes[0].client)
    dbsize_node0 = await nodes[0].client.dbsize()
    assert dbsize_node0 > (key_num * 0.95)

    logging.debug("start seeding")
    # Running seeder with pipeline mode when finalizing migrations leads to errors
    # TODO: I believe that changing the seeder to generate pipeline command only on specific slot will fix the problem
    seeder = df_seeder_factory.create(
        keys=50_000,
        port=instances[0].port,
        cluster_mode=True,
        pipeline=False,
        mirror_to_fake_redis=True,
    )
    await seeder.run(target_deviation=0.1)
    seed = asyncio.create_task(seeder.run())

    migration_info = [
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [random_ranges[0]], nodes[1].id),
        MigrationInfo("127.0.0.1", nodes[2].instance.admin_port, [random_ranges[1]], nodes[2].id),
        MigrationInfo("127.0.0.1", nodes[3].instance.admin_port, [random_ranges[2]], nodes[3].id),
    ]

    nodes_lock = asyncio.Lock()

    async def do_migration(index):
        await asyncio.sleep(random.randint(1, 10) / 5)
        async with nodes_lock:
            logging.debug(f"Start migration from node {index}")
            nodes[0].migrations.append(migration_info[index - 1])
            await push_config(
                json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]
            )

        logging.debug(f"wait migration from node {index}")
        await wait_for_status(nodes[0].admin_client, nodes[index].id, "FINISHED", timeout=50)
        await wait_for_status(nodes[index].admin_client, nodes[0].id, "FINISHED", timeout=50)
        logging.debug(f"finished migration from node {index}")
        await asyncio.sleep(random.randint(1, 5) / 5)
        async with nodes_lock:
            logging.debug(f"Finalize migration from node {index}")
            nodes[index].slots = migration_info[index - 1].slots
            nodes[0].slots.remove(migration_info[index - 1].slots[0])
            nodes[0].migrations.remove(migration_info[index - 1])
            await push_config(
                json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]
            )

    all_migrations = [asyncio.create_task(do_migration(i)) for i in range(1, 4)]
    for migration in all_migrations:
        await migration

    logging.debug("stop seeding")
    seeder.stop()
    await seed
    await asyncio.sleep(0.5)  # wait untill all keys with ttl are expired
    capture = await seeder.capture_fake_redis()
    assert await seeder.compare(capture, nodes[1].instance.port)


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_migration_restart(df_factory: DflyInstanceFactory, df_seeder_factory):
    # 1. Start migration, and than restart it with another slots set
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="outgoing_slot_migration=2,cluster_family=2,incoming_slot_migration=2",
        )
        for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug("Start seeder")
    seeder = df_seeder_factory.create(
        keys=50_000,
        port=instances[0].port,
        cluster_mode=True,
    )
    await seeder.run(target_deviation=0.1)
    capture = await seeder.capture()

    logging.debug(f"Start migration")
    nodes[0].migrations.append(
        MigrationInfo(
            "127.0.0.1",
            nodes[1].instance.admin_port,
            [(random.randint(1, 8000), random.randint(8001, 16383))],
            nodes[1].id,
        )
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await asyncio.sleep(random.randint(1, 10) / 5)
    logging.debug(f"Restart migration")
    final_migration_range = (random.randint(1, 8000), random.randint(8001, 16382))
    nodes[0].migrations[0] = MigrationInfo(
        "127.0.0.1", nodes[1].instance.admin_port, [final_migration_range], nodes[1].id
    )
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    logging.debug(f"wait migration to finish")
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=50)
    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FINISHED", timeout=50)

    nodes[0].migrations = []
    nodes[0].slots = [(0, final_migration_range[0] - 1), (final_migration_range[1] + 1, 16383)]
    nodes[1].slots = [final_migration_range]
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    assert await seeder.compare(capture, nodes[0].instance.port)


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cluster_sharded_pub_sub(df_factory: DflyInstanceFactory):
    nodes = [df_factory.create(port=next(next_port)) for i in range(2)]
    df_factory.start_all(nodes)

    c_nodes = [node.client() for node in nodes]

    nodes_info = [(await create_node_info(instance)) for instance in nodes]
    nodes_info[0].slots = [(0, 16383)]
    nodes_info[1].slots = []

    await push_config(json.dumps(generate_config(nodes_info)), [node.client for node in nodes_info])
    # channel name kostas crc is at slot 2883 which is part of the first node.
    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await c_nodes[1].execute_command("SSUBSCRIBE kostas")

    assert str(moved_error.value).endswith(f"2833 127.0.0.1:{nodes[0].port}")

    node_a = ClusterNode("localhost", nodes[0].port)
    node_b = ClusterNode("localhost", nodes[1].port)

    consumer_client = RedisCluster(startup_nodes=[node_a, node_b])
    consumer = consumer_client.pubsub()
    consumer.ssubscribe("kostas")

    await c_nodes[0].execute_command("SPUBLISH kostas hello")
    # We need to sleep cause we use DispatchBrief internally. Otherwise we can't really gurantee
    # that the client received the message
    await asyncio.sleep(2)

    # Consume subscription message result from above
    message = consumer.get_sharded_message(target_node=node_a)
    assert message == {"type": "ssubscribe", "pattern": None, "channel": b"kostas", "data": 1}

    message = consumer.get_sharded_message(target_node=node_a)
    assert message == {"type": "smessage", "pattern": None, "channel": b"kostas", "data": b"hello"}

    consumer.sunsubscribe("kostas")
    await asyncio.sleep(2)
    await c_nodes[0].execute_command("SPUBLISH kostas new_message")
    message = consumer.get_sharded_message(target_node=node_a)
    assert message == {"type": "sunsubscribe", "pattern": None, "channel": b"kostas", "data": 0}


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cluster_sharded_pubsub_shard_commands(df_factory: DflyInstanceFactory):
    nodes = [df_factory.create(port=next(next_port)) for i in range(2)]
    df_factory.start_all(nodes)

    c_nodes = [node.client() for node in nodes]

    nodes_info = [(await create_node_info(instance)) for instance in nodes]
    nodes_info[0].slots = [(0, 16383)]
    nodes_info[1].slots = []

    await push_config(json.dumps(generate_config(nodes_info)), [node.client for node in nodes_info])

    # We are executing SSUBSCRIBE commands and wait for them to be sure that
    # channels are created
    message = await c_nodes[0].execute_command("SSUBSCRIBE pubsub-shard-channel")
    message = await c_nodes[0].execute_command("SSUBSCRIBE shard-channel")

    message = await c_nodes[0].execute_command("PUBSUB SHARDCHANNELS")
    message.sort()
    assert message == ["pubsub-shard-channel", "shard-channel"]

    message = await c_nodes[0].execute_command("PUBSUB SHARDCHANNELS pubsub*")
    assert message == ["pubsub-shard-channel"]

    message = await c_nodes[0].execute_command("PUBSUB SHARDCHANNELS *channel")
    message.sort()
    assert message == ["pubsub-shard-channel", "shard-channel"]

    message = await c_nodes[0].execute_command("PUBSUB SHARDNUMSUB pubsub-shard-channel")
    assert message == ["pubsub-shard-channel", 1]

    message = await c_nodes[0].execute_command(
        "PUBSUB SHARDNUMSUB pubsub-shard-channel shard-channel"
    )
    assert message == ["pubsub-shard-channel", 1, "shard-channel", 1]

    message = await c_nodes[0].execute_command("PUBSUB SHARDNUMSUB")
    assert message == []


@pytest.mark.large
@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cluster_migration_errors_num(df_factory: DflyInstanceFactory):
    # create cluster with several nodes and create migrations from one node to others
    # but config propagated only to source node to get errors for migrations
    # number of errors should be the same as number of target nodes
    nodes = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="cluster_family=2,outgoing_slot_migration=2,incoming_slot_migration=2",
        )
        for i in range(3)
    ]
    df_factory.start_all(nodes)

    c_nodes = [node.client() for node in nodes]

    nodes_info = [(await create_node_info(instance)) for instance in nodes]
    nodes_info[0].slots = [(0, 16383)]
    nodes_info[1].slots = []
    nodes_info[2].slots = []

    await push_config(json.dumps(generate_config(nodes_info)), c_nodes)

    async def wait_for_errors_num(client, err_num, timeout=10):
        cluster_info = lambda: client.info("CLUSTER")

        async for info, breaker in tick_timer(cluster_info, timeout=timeout):
            with breaker:
                assert info["migration_errors_total"] == err_num

    await wait_for_errors_num(c_nodes[0], 0)

    nodes_info[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes_info[1].instance.admin_port, [(0, 100)], nodes_info[1].id)
    )

    await push_config(json.dumps(generate_config(nodes_info)), [c_nodes[0]])

    # the error will be reported after 30 seconds, because config is missing for target node
    await wait_for_errors_num(c_nodes[0], 1, timeout=40)
    # the migration process attempt to start migration in a second so we get more errors
    await wait_for_errors_num(c_nodes[0], 2)


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cluster_sharded_pub_sub_migration(df_factory: DflyInstanceFactory):
    instances = [df_factory.create(port=next(next_port)) for i in range(2)]
    df_factory.start_all(instances)

    c_nodes = [instance.client() for instance in instances]

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    # Setup producer and consumer
    node_a = ClusterNode("localhost", instances[0].port)
    node_b = ClusterNode("localhost", instances[1].port)

    consumer_client = RedisCluster(startup_nodes=[node_a, node_b])
    consumer = consumer_client.pubsub()
    consumer.ssubscribe("kostas")

    # Push new config
    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.port, [(0, 16383)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    await wait_for_status(nodes[0].client, nodes[1].id, "FINISHED")

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    # channel name kostas crc is at slot 2883 which is part of the second now.
    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await c_nodes[0].execute_command("SSUBSCRIBE kostas")

    assert str(moved_error.value).endswith(f"2833 127.0.0.1:{instances[1].port}")

    # Consume subscription message result from above
    message = consumer.get_sharded_message(target_node=node_a)
    assert message == {"type": "ssubscribe", "pattern": None, "channel": b"kostas", "data": 1}
    message = consumer.get_sharded_message(target_node=node_a)
    assert message == {"type": "sunsubscribe", "pattern": None, "channel": b"kostas", "data": 0}


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_readonly_replication(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    # create cluster master and replica
    # For now replica always should work in read-only mode
    # READONLY command returns always OK without any impact
    # In the future we may decide to implement the same behavior as REDIS
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]
    df_factory.start_all(instances)

    nodes = [await create_node_info(n) for n in instances]
    m1_node, r1_node = nodes
    master_nodes = [m1_node]

    m1_node.slots = [(0, 16383)]
    m1_node.replicas = [r1_node]

    logging.debug("Push initial config")
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    logging.debug("create data")
    await m1_node.client.execute_command("SET X 1")

    logging.debug("start replication")
    await r1_node.admin_client.execute_command(f"replicaof localhost {m1_node.instance.admin_port}")

    await wait_available_async(r1_node.admin_client)

    assert await r1_node.client.execute_command("GET X") == "1"
    assert await r1_node.client.execute_command("READONLY")
    assert await r1_node.client.execute_command("GET X") == "1"

    # This behavior can be changed in the future
    assert await r1_node.client.execute_command("GET Y") == None

    m1_node.replicas = []

    logging.debug("Push config without replica")
    await push_config(
        json.dumps(generate_config(master_nodes)), [node.admin_client for node in nodes]
    )

    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await r1_node.client.execute_command("GET X")

    assert str(moved_error.value).endswith(f"7165 127.0.0.1:{instances[0].port}")

    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await r1_node.client.execute_command("GET Y")

    assert str(moved_error.value).endswith(f"3036 127.0.0.1:{instances[0].port}")


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
async def test_cancel_blocking_cmd_during_mygration_finalization(df_factory: DflyInstanceFactory):
    # blocking commands should be canceled during migration finalization
    instances = [df_factory.create(port=next(next_port)) for i in range(2)]
    df_factory.start_all(instances)

    c_nodes = [instance.client() for instance in instances]

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    logging.debug("Start blpop task")
    blpop_task = asyncio.create_task(c_nodes[0].blpop("list", 0))

    await asyncio.sleep(0.5)

    assert not blpop_task.done()

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.port, [(0, 16383)], nodes[1].id)
    )
    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    await wait_for_status(nodes[0].client, nodes[1].id, "FINISHED")

    with pytest.raises(aioredis.ResponseError) as e_info:
        await blpop_task

    assert await c_nodes[1].type("list") == "none"

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]

    logging.debug("remove finished migrations")
    await push_config(json.dumps(generate_config(nodes)), [node.client for node in nodes])

    assert await c_nodes[1].type("list") == "none"


@dfly_args({"cluster_mode": "yes"})
async def test_slot_migration_oom(df_factory):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            proactor_threads=4,
            maxmemory="1024MB",
        ),
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            proactor_threads=2,
            maxmemory="512MB",
        ),
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await nodes[0].client.execute_command("DEBUG POPULATE 100 test 10000000")

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", nodes[1].instance.admin_port, [(0, 16383)], nodes[1].id)
    )

    logging.info("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # Wait for FATAL status
    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FATAL", 300)
    await wait_for_status(nodes[1].admin_client, nodes[0].id, "FATAL")

    # There's a rare timing issue if we don't wait here. Status can be set to FATAL
    # but error message is not still set for slot migration.
    await asyncio.sleep(1)

    # Node_0 slot-migration-status
    status = await nodes[0].admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", nodes[1].id
    )
    # Direction
    assert status[0][0] == "out"
    # Error message
    assert status[0][4] == "Cannot allocate memory: INCOMING_MIGRATION_OOM"

    # Node_1 slot-migration-status
    status = await nodes[1].admin_client.execute_command(
        "DFLYCLUSTER", "SLOT-MIGRATION-STATUS", nodes[0].id
    )
    # Direction
    assert status[0][0] == "in"
    # Error message
    assert status[0][4] == "INCOMING_MIGRATION_OOM"


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"})
async def test_replica_takeover_moved(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    instances = [df_factory.create(port=next(next_port)) for i in range(4)]
    df_factory.start_all(instances)

    nodes = [await create_node_info(n) for n in instances]
    m1, r1, m2, r2 = nodes
    master_nodes = [m1, m2]

    m1.slots = [(0, 9000)]
    m2.slots = [(9001, 16383)]

    m1.replicas = [r1]
    m2.replicas = [r2]

    await push_config(json.dumps(generate_config(master_nodes)), [node.client for node in nodes])

    logging.debug("create data")
    await m1.client.execute_command("SET X 1")
    # Slot number 16022
    await m2.client.execute_command("SET FOOX 1")

    logging.debug("start replication")
    await r1.client.execute_command(f"replicaof localhost {m1.instance.port}")
    await r2.client.execute_command(f"replicaof localhost {m2.instance.port}")

    await wait_available_async(r1.client)

    assert await r1.client.execute_command("GET X") == "1"
    assert await r1.client.execute_command("REPLTAKEOVER 20") == "OK"

    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await m1.client.execute_command("GET X")

    assert str(moved_error.value).endswith(f"7165 127.0.0.1:{r1.instance.port}")

    with pytest.raises((MovedError, aioredis.ResponseError)) as moved_error:
        await m1.client.execute_command("GET FOOX")

    assert str(moved_error.value).endswith(f"16022 127.0.0.1:{m2.instance.port}")

    # Try write command on the new master. It should succeed because during takeover,
    # we updated the config as well
    assert await r1.client.execute_command("SET X 2") == "OK"

    master_nodes = [r1, m2]
    r1.slots = [(0, 9000)]
    nodes.pop(0)
    await push_config(json.dumps(generate_config(master_nodes)), [node.client for node in nodes])

    assert await r1.client.execute_command("GET X") == "2"
    assert await m2.client.execute_command("GET FOOX") == "1"

    await r1.client.execute_command("flushall")
    assert await r1.client.dbsize() == 0
    await r1.client.execute_command("SET newk foo")
    # Now bring back m1 as a replica of r1
    nodes.append(m1)
    r1.replicas = [m1]
    await push_config(json.dumps(generate_config(master_nodes)), [node.client for node in nodes])
    await m1.client.execute_command(f"replicaof localhost {r1.instance.port}")
    await check_all_replicas_finished([m1.client], r1.client)
    assert await m1.client.execute_command("GET newk") == "foo"


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "cluster_search": "yes"})
async def test_SearchRequestDistribution(df_factory: DflyInstanceFactory):
    """
    Create cluster of 3 nodes.
    Send FT.CREATE to first node and check that index was created on all nodes.
    Search for all documents from cluster.
    """

    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="coordinator=2,search_family=3,protocol_client=3",
        )
        for i in range(3)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 5259)]
    nodes[1].slots = [(5260, 10519)]
    nodes[2].slots = [(10520, 16383)]

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    assert (
        await nodes[0].client.execute_command(
            "FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "title", "TEXT"
        )
        == "OK"
    )

    for node in nodes:
        await wait_for_ft_index_creation(node.client, "idx")

    cclient = instances[0].cluster_client()

    docs_num = 100
    for i in range(0, docs_num):
        assert await cclient.execute_command("HSET", f"s{i}", "title", f"test {i}") == 1

    async def search_test():
        res = await nodes[0].client.execute_command(
            "FT.SEARCH", "idx", "@title:test", "text", "LIMIT", "0", "1000"
        )
        assert res[0] == docs_num
        for i in range(0, docs_num):
            assert f"s{i}" in res

    await asyncio.gather(*(search_test() for _ in range(docs_num)))


@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "cluster_search": "yes"})
async def test_SortedSearchRequest(df_factory: DflyInstanceFactory):
    """
    Create cluster of 3 nodes.
    Execute Search request with sorting on indexed field.
    """

    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            vmodule="coordinator=2,search_family=3,protocol_client=3",
        )
        for i in range(3)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 5259)]
    nodes[1].slots = [(5260, 10519)]
    nodes[2].slots = [(10520, 16383)]

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    assert (
        await nodes[0].client.execute_command(
            "FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "title", "TEXT", "size", "NUMERIC"
        )
        == "OK"
    )

    for node in nodes:
        await wait_for_ft_index_creation(node.client, "idx")

    cclient = instances[0].cluster_client()

    docs_num = 100
    for i in range(0, docs_num):
        assert (
            await cclient.execute_command("HSET", f"s{i}", "title", f"test {i}", "size", f"{i}")
            == 2
        )

    async def search_test():
        limit_size = random.randint(1, docs_num // 2)
        offset = random.randint(0, docs_num // 2)
        res = await nodes[0].client.execute_command(
            "FT.SEARCH",
            "idx",
            "@title:test",
            "text",
            "SORTBY",
            "size",
            "ASC",
            "LIMIT",
            f"{offset}",
            f"{limit_size}",
        )
        assert res[0] == docs_num
        for i in range(offset, offset + limit_size):
            assert f"s{i}" in res, f"offset: {offset}, limit_size: {limit_size}, res: {res}"

        for i in range(0, offset):
            assert f"s{i}" not in res

        for i in range(offset + limit_size, docs_num):
            assert f"s{i}" not in res

    await asyncio.gather(*(search_test() for _ in range(2)))


async def verify_keys_match_number_of_index_docs(client, expected_num_keys):
    # Get number of docs in index
    index_info = await client.execute_command(f"FT.INFO idx")
    index_info_num_docs = index_info[9]

    # Get number of keys in database
    keyspace_info = await client.info("keyspace")
    keyspace_keys = keyspace_info["db0"]["keys"]

    assert index_info_num_docs == keyspace_keys
    assert index_info_num_docs == expected_num_keys
    assert keyspace_keys == expected_num_keys


@dfly_args({"proactor_threads": 2, "cluster_mode": "yes", "cluster_search": "yes"})
async def test_remove_docs_on_cluster_migration(df_factory):
    instances = [
        df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
    ]

    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # Create index on both nodes
    await nodes[0].client.execute_command(
        "FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "v", "TEXT"
    )

    # Populate node 0
    keys = 100
    for i in range(0, keys):
        random_string = "".join(random.choices(string.ascii_letters + string.digits, k=1_000))
        await nodes[0].client.execute_command("HSET", f"doc:{i}", "v", random_string)

    # Verify on node 0 that keys are added and index is populated
    await verify_keys_match_number_of_index_docs(nodes[0].client, keys)

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", instances[1].port, [(0, 16383)], nodes[1].id)
    )
    logging.debug("Start migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED")

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("finalize migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await asyncio.sleep(1)

    # Verify on node 1 that keys are moved and index is populated
    await verify_keys_match_number_of_index_docs(nodes[1].client, keys)

    # Verify that node 0 doesn't have any keys and no index docs
    await verify_keys_match_number_of_index_docs(nodes[0].client, 0)


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.opt_only
@dfly_args({"cluster_mode": "yes"})
async def test_cluster_migration_with_tiering(df_factory):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            proactor_threads=2,
            tiered_prefix="/tmp/tiered/cluster_node",
            tiered_offload_threshold="0.2",
            maxmemory="512MB",
        ),
        df_factory.create(
            port=next(next_port), admin_port=next(next_port), proactor_threads=2, maxmemory="1024MB"
        ),
    ]
    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    keys = 1000000
    await nodes[0].client.execute_command(f"DEBUG POPULATE {keys} size 440")

    await asyncio.sleep(5)  # wait for tiering to offload data

    # We need to wait for some tiered entries to verify migration works with tiering.
    async for info, breaker in info_tick_timer(nodes[0].client, section="TIERED"):
        with breaker:
            logging.info(f"Tiered entries: {info['tiered_entries']}")
            assert info["tiered_entries"] >= 10_000

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", instances[1].port, [(0, 16383)], nodes[1].id)
    )

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", 300)

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("finalize migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    info = await nodes[1].client.info("keyspace")
    assert info["db0"]["keys"] == keys

    async for info, breaker in info_tick_timer(nodes[0].client, section="TIERED"):
        with breaker:
            assert info["tiered_entries"] == 0

    await asyncio.sleep(5)  # wait for tiered deletions to finish

    info = await nodes[0].client.info("keyspace")
    assert info["db0"]["keys"] == 0


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.opt_only
@dfly_args({"cluster_mode": "yes"})
async def test_cluster_migration_with_tiering_and_deletes(df_factory: DflyInstanceFactory):
    instances = [
        df_factory.create(
            port=next(next_port),
            admin_port=next(next_port),
            proactor_threads=2,
            tiered_prefix="/tmp/tiered/cluster_node",
            tiered_offload_threshold="0.2",
            maxmemory="512MB",
        ),
        df_factory.create(
            port=next(next_port), admin_port=next(next_port), proactor_threads=2, maxmemory="1024MB"
        ),
    ]
    df_factory.start_all(instances)

    nodes = [(await create_node_info(instance)) for instance in instances]
    nodes[0].slots = [(0, 16383)]
    nodes[1].slots = []

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    keys = 1000000
    await nodes[0].client.execute_command(f"DEBUG POPULATE {keys} key 440")

    # Expect that number of added keys is 1000000
    info = await nodes[0].client.info("keyspace")
    assert info["db0"]["keys"] == keys

    # Wait for some data to be offloaded to tiered storage
    await asyncio.sleep(10)

    # Wait for sufficient tiered entries
    async for info, breaker in info_tick_timer(nodes[0].client, section="TIERED"):
        with breaker:
            tiered_entries = info["tiered_entries"]
            assert tiered_entries >= 50_000

    nodes[0].migrations.append(
        MigrationInfo("127.0.0.1", instances[1].port, [(0, 16383)], nodes[1].id)
    )

    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    # Delete 50k keys during migration to create mutations and verify that they are applied correctly
    delete_expected_num = 50_000
    delete_succeded = 0

    # Indicator that migration is done and we can stop deleting keys
    migration_done = False

    async def delete_job():
        nonlocal delete_succeded
        for i in range(delete_expected_num):
            if migration_done:
                break
            try:
                await nodes[0].client.delete(f"key:{i}")
                delete_succeded += 1
            except Exception as e:
                pass

    delete_task = asyncio.create_task(delete_job())

    await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", 300)
    migration_done = True

    await delete_task

    nodes[0].migrations = []
    nodes[0].slots = []
    nodes[1].slots = [(0, 16383)]
    logging.debug("finalize migration")
    await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])

    async for info, breaker in info_tick_timer(nodes[0].client, section="TIERED"):
        with breaker:
            assert info["tiered_entries"] == 0

    await asyncio.sleep(5)  # wait for tiered deletions to finish

    info = await nodes[0].client.info("keyspace")
    assert info["db0"]["keys"] == 0

    # Verify that mutations are applied on the target node after migration
    info = await nodes[1].client.info("keyspace")
    assert info["db0"]["keys"] == keys - delete_succeded


@dfly_args(
    {
        "proactor_threads": 1,
        "cluster_mode": "yes",
        "cluster_node_id": "0" * 40,
    }
)
async def test_cluster_config_slot_overflow_doesnt_crash(df_factory: DflyInstanceFactory):
    instance = df_factory.create(port=next(next_port))
    df_factory.start_all([instance])
    client = instance.client()
    node_id = "0" * 40

    # Build invalid config JSON manually - 1E383 is a valid JSON number but overflows uint16_t.
    # We must NOT use json.dumps here because Python would reject 1e383 (infinity).
    invalid_config = (
        '[{"slot_ranges":[{"start":0,"end":8191}],'
        '"master":{"id":"' + node_id + '","ip":"127.0.0.1","port":' + str(instance.port) + "},"
        '"replicas":[]},'
        '{"slot_ranges":[{"start":8192,"end":1E383}],'
        '"master":{"id":"' + "1" * 40 + '","ip":"127.0.0.1","port":9999},'
        '"replicas":[]}]'
    )

    pipe = client.pipeline(transaction=False)
    pipe.execute_command("DFLYCLUSTER", "CONFIG", invalid_config)
    pipe.execute_command("CLUSTER", "MYID")
    results = await pipe.execute(raise_on_error=False)

    # CONFIG must return an error (not crash), MYID must still work
    assert isinstance(results[0], Exception)
    assert results[1] == node_id


================================================
FILE: tests/dragonfly/config_test.py
================================================
import pytest
import redis
from redis.asyncio import Redis as RedisClient
from .utility import *
from .instance import DflyStartException


async def test_maxclients(df_factory):
    # Needs some authentication
    with df_factory.create(port=1111, maxclients=1, admin_port=1112) as server:
        async with server.client() as client1:
            assert ["maxclients", "1"] == await client1.execute_command("CONFIG GET maxclients")

            with pytest.raises(redis.exceptions.ConnectionError):
                async with server.client() as client2:
                    await client2.get("test")

            # Check that admin connections are not limited.
            async with RedisClient(port=server.admin_port) as admin_client:
                await admin_client.get("test")

            await client1.execute_command("CONFIG SET maxclients 3")
            assert ["maxclients", "3"] == await client1.execute_command("CONFIG GET maxclients")
            async with server.client() as client2:
                await client2.get("test")


================================================
FILE: tests/dragonfly/conftest.py
================================================
"""
Pytest fixtures to be provided for all tests without import
"""

import asyncio
import logging
import os
import random
import shutil
import subprocess
import sys
import time
import typing
from copy import deepcopy
from pathlib import Path
from tempfile import gettempdir, mkdtemp
from time import sleep
from typing import Dict, List, Union

import pymemcache
import pytest
import pytest_asyncio
import redis
from redis import asyncio as aioredis

from . import PortPicker
from .instance import DflyInstance, DflyParams, DflyInstanceFactory, RedisServer
from .utility import DflySeederFactory, gen_ca_cert, gen_certificate, skip_if_not_in_github

logging.getLogger("asyncio").setLevel(logging.WARNING)
# Suppress "Unclosed ClusterNode" warnings from redis-py topology refreshes (not actionable)
logging.getLogger("asyncio").addFilter(lambda r: "Unclosed ClusterNode" not in r.getMessage())

DATABASE_INDEX = 0
BASE_LOG_DIR = "/tmp/dragonfly_logs/"
FAILED_PATH = "/tmp/failed/"
LAST_LOGS = "/tmp/last_test_log_dir.txt"


def _download_minio_binary(dest: Path):
    """Download MinIO binary to dest if not already cached.

    Downloads to a temporary file first, then renames atomically to avoid
    leaving a corrupt binary on interrupted downloads.
    """
    import platform
    import urllib.request

    system = platform.system().lower()
    arch = platform.machine()
    arch_map = {"x86_64": "amd64", "aarch64": "arm64", "arm64": "arm64"}
    arch = arch_map.get(arch, arch)
    url = f"https://dl.min.io/server/minio/release/{system}-{arch}/minio"
    logging.info(f"Downloading MinIO binary from {url}")
    tmp_dest = dest.with_suffix(".tmp")
    try:
        urllib.request.urlretrieve(url, tmp_dest)
        tmp_dest.chmod(0o755)
        tmp_dest.rename(dest)
    except Exception:
        tmp_dest.unlink(missing_ok=True)
        raise


def _start_minio_server(endpoint):
    """Start MinIO subprocess and configure env vars for S3 tests."""
    import boto3
    from urllib.parse import urlparse

    cache_dir = Path.home() / ".cache" / "dragonfly-tests"
    cache_dir.mkdir(parents=True, exist_ok=True)
    minio_bin = cache_dir / "minio"

    if not minio_bin.exists():
        _download_minio_binary(minio_bin)

    # Normalize scheme-less values (e.g. "localhost:9000") so urlparse
    # correctly populates hostname/port instead of treating it as a path.
    to_parse = endpoint if "://" in endpoint else "http://" + endpoint
    parsed = urlparse(to_parse)
    address = f":{parsed.port or 9000}"
    endpoint = f"{parsed.scheme}://{parsed.hostname}:{parsed.port or 9000}"

    data_dir = Path(mkdtemp(prefix="minio_data_"))
    minio_log = data_dir / "minio.log"
    log_file = open(minio_log, "w")
    proc = subprocess.Popen(
        [str(minio_bin), "server", str(data_dir), "--address", address],
        env={**os.environ, "MINIO_ROOT_USER": "minioadmin", "MINIO_ROOT_PASSWORD": "minioadmin"},
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

    bucket = "dragonfly-test"
    try:
        s3 = boto3.client(
            "s3",
            endpoint_url=endpoint,
            aws_access_key_id="minioadmin",
            aws_secret_access_key="minioadmin",
            region_name="us-east-1",
        )

        for attempt in range(30):
            try:
                s3.create_bucket(Bucket=bucket)
                break
            except Exception:
                if proc.poll() is not None:
                    logs = minio_log.read_text()
                    raise RuntimeError(
                        f"MinIO process exited with code {proc.returncode}.\nLogs:\n{logs}"
                    )
                time.sleep(1)
        else:
            logs = minio_log.read_text()
            raise RuntimeError(f"MinIO did not become ready in time.\nLogs:\n{logs}")
    except Exception:
        proc.terminate()
        log_file.close()
        shutil.rmtree(data_dir, ignore_errors=True)
        raise

    log_file.close()
    os.environ["DRAGONFLY_S3_BUCKET"] = bucket
    os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
    os.environ["AWS_ENDPOINT_URL"] = endpoint
    # Remove any existing session token (e.g. from OIDC auth) as MinIO doesn't support it
    os.environ.pop("AWS_SESSION_TOKEN", None)

    return proc, data_dir


_minio_proc = None
_minio_data_dir = None


# runs on pytest start
def pytest_configure(config):
    global _minio_proc, _minio_data_dir

    # clean everything
    if os.path.exists(FAILED_PATH):
        shutil.rmtree(FAILED_PATH)
    if os.path.exists(BASE_LOG_DIR):
        shutil.rmtree(BASE_LOG_DIR)

    # Start MinIO if MINIO_S3_ENDPOINT is set (must happen before test collection
    # so that @pytest.mark.skipif checking DRAGONFLY_S3_BUCKET sees it)
    endpoint = os.environ.get("MINIO_S3_ENDPOINT")
    if endpoint:
        _minio_proc, _minio_data_dir = _start_minio_server(endpoint)


def pytest_unconfigure(config):
    global _minio_proc, _minio_data_dir

    if _minio_proc is not None:
        _minio_proc.terminate()
        try:
            _minio_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            _minio_proc.kill()
            _minio_proc.wait()
        _minio_proc = None

    if _minio_data_dir is not None:
        shutil.rmtree(_minio_data_dir, ignore_errors=True)
        _minio_data_dir = None


@pytest.fixture(scope="class")
def df_log_dir(request):
    """
    Fixture to provide a log directory for the test class.
    This directory will be created before each test class and cleaned up after.
    """
    # Generate a unique directory name for the test class based on its nodeid
    translator = str.maketrans(":[]{}/ ", "_______", "\"*'")
    unique_dir = request.node.name.translate(translator)
    log_dir = os.path.join(BASE_LOG_DIR, unique_dir)

    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    os.makedirs(log_dir)

    # needs for action.yml to get logs if timedout is happen for test
    last_logs = open(LAST_LOGS, "w")
    last_logs.write(log_dir)
    last_logs.close()

    return log_dir


def determine_scope(fixture_name, config):
    drop_data_after_each_test = config.getoption("--drop-data-after-each-test", False)
    if drop_data_after_each_test:
        return "class"
    return "session"


@pytest.fixture(scope=determine_scope)
def tmp_dir():
    """
    Pytest fixture to provide the test temporary directory for the session
    where the Dragonfly executable will be run and where all test data
    should be stored. The directory will be cleaned up at the end of a session
    """
    tmp_name = mkdtemp()
    yield Path(tmp_name)
    if os.environ.get("DRAGONFLY_KEEP_TMP"):
        logging.info(f"Keeping tmp dir {tmp_name}")
        return
    shutil.rmtree(tmp_name, ignore_errors=True)


@pytest.fixture(scope=determine_scope)
def test_env(tmp_dir: Path):
    """
    Provide the environment the Dragonfly executable is running in as a
    python dictionary
    """
    env = os.environ.copy()
    env["DRAGONFLY_TMP"] = str(tmp_dir)
    return env


@pytest.fixture(scope="class", params=[{}])
def df_seeder_factory(request) -> DflySeederFactory:
    seed = request.config.getoption("--rand-seed")
    if seed is None:
        seed = random.randrange(sys.maxsize)

    random.seed(int(seed))
    logging.debug(f"Random seed: {seed}, check: {random.randrange(100)}")

    return DflySeederFactory(request.config.getoption("--log-seeder"))


def parse_args(args: List[str]) -> Dict[str, Union[str, None]]:
    args_dict = {}
    for arg in args:
        if "=" in arg:
            pos = arg.find("=")
            name, value = arg[:pos], arg[pos + 1 :]
            args_dict[name] = value
        else:
            args_dict[arg] = None
    return args_dict


@pytest_asyncio.fixture(scope="class")
def event_loop():
    loop = asyncio.new_event_loop()
    yield loop
    loop.close()


@pytest_asyncio.fixture(scope="class", params=[{}])
async def df_factory(
    request,
    tmp_dir,
    test_env,
    df_log_dir,
) -> typing.AsyncGenerator[DflyInstanceFactory, None]:
    """
    Create an instance factory with supplied params.
    """
    os.makedirs(os.path.join(gettempdir(), "tiered"), exist_ok=True)
    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.environ.get("DRAGONFLY_PATH", os.path.join(scripts_dir, "../../build-dbg/dragonfly"))

    args = request.param if request.param else {}
    existing = request.config.getoption("--existing-port")
    existing_admin = request.config.getoption("--existing-admin-port")
    existing_mc = request.config.getoption("--existing-mc-port")
    params = DflyParams(
        path=path,
        cwd=tmp_dir,
        gdb=request.config.getoption("--gdb"),
        direct_output=request.config.getoption("--direct-out"),
        buffered_out=request.config.getoption("--buffered-output"),
        args=parse_args(request.config.getoption("--df")),
        existing_port=int(existing) if existing else None,
        existing_admin_port=int(existing_admin) if existing_admin else None,
        existing_mc_port=int(existing_mc) if existing_mc else None,
        env=test_env,
        log_dir=df_log_dir,
    )

    factory = DflyInstanceFactory(params, args)
    yield factory
    await factory.stop_all()


@pytest.fixture(scope="class")
def df_server(df_factory: DflyInstanceFactory) -> typing.Generator[DflyInstance, None, None]:
    """
    Start the default Dragonfly server that will be used for the default pools
    and clients.
    """
    instance = df_factory.create()
    instance.start()

    yield instance
    clients_left = None
    try:
        client = redis.Redis(port=instance.port)
        client.client_setname("mgr")
        sleep(0.1)
        clients_left = [x for x in client.client_list() if x["name"] != "mgr"]

        # Graceful shutdown, and avoid saving on shutdown if possible
        try:
            if instance.proc:
                client.shutdown(nosave=True)
        except Exception:
            pass
    except Exception as e:
        print(e, file=sys.stderr)

    instance.stop()

    # TODO: Investigate spurious open connection with cluster client
    # if not instance['cluster_mode']:
    # TODO: Investigate adding fine grain control over the pool by
    # by adding a cache ontop of the clients connection pool and then evict
    # properly with client.connection_pool.disconnect() avoiding non synced
    # side effects
    # assert clients_left == []
    # else:
    #    print("Cluster clients left: ", len(clients_left))

    if instance["cluster_mode"]:
        print("Cluster clients left: ", len(clients_left))


@pytest.fixture(scope="function")
def cluster_client(df_server):
    """
    Return a cluster client to the default instance with all entries flushed.
    """
    client = redis.RedisCluster(decode_responses=True, host="localhost", port=df_server.port)
    client.client_setname("default-cluster-fixture")
    client.flushall()

    yield client
    client.disconnect_connection_pools()


@pytest_asyncio.fixture(scope="function")
async def async_pool(df_server: DflyInstance):
    pool = aioredis.ConnectionPool(
        host="localhost",
        port=df_server.port,
        db=DATABASE_INDEX,
        decode_responses=True,
        max_connections=32,
    )
    yield pool
    await pool.disconnect(inuse_connections=True)


@pytest_asyncio.fixture(scope="function")
async def async_client(async_pool):
    """
    Return an async client to the default instance with all entries flushed.
    """
    client = aioredis.Redis(connection_pool=async_pool)
    await client.client_setname("default-async-fixture")
    await client.flushall()
    await client.select(DATABASE_INDEX)
    yield client


def pytest_addoption(parser):
    parser.addoption("--gdb", action="store_true", default=False, help="Run instances in gdb")
    parser.addoption("--df", action="append", default=[], help="Add arguments to dragonfly")
    parser.addoption(
        "--buffered-output",
        action="store_true",
        default=False,
        help="Makes instance output buffered, grouping it together",
    )
    parser.addoption(
        "--log-seeder", action="store", default=None, help="Store last generator commands in file"
    )
    parser.addoption(
        "--rand-seed",
        action="store",
        default=None,
        help="Set seed for global random. Makes seeder predictable",
    )
    parser.addoption(
        "--existing-port",
        action="store",
        default=None,
        help="Provide a port to the existing process for the test",
    )
    parser.addoption(
        "--existing-admin-port",
        action="store",
        default=None,
        help="Provide an admin port to the existing process for the test",
    )
    parser.addoption(
        "--existing-mc-port",
        action="store",
        default=None,
        help="Provide a port to the existing memcached process for the test",
    )
    parser.addoption(
        "--direct-out",
        action="store_true",
        default=False,
        help="If true, does not post process dragonfly output",
    )

    parser.addoption("--repeat", action="store", help="Number of times to repeat each test")
    parser.addoption(
        "--drop-data-after-each-test",
        action="store_true",
        default=False,
        help="Remove test data after each test, instead of after each session, "
        "useful when running tests on repeat to avoid filling up disk",
    )


def pytest_generate_tests(metafunc):
    if metafunc.config.option.repeat is not None:
        count = int(metafunc.config.option.repeat)

        # We're going to duplicate these tests by parametrizing them,
        # which requires that each test has a fixture to accept the parameter.
        # We can add a new fixture like so:
        metafunc.fixturenames.append("tmp_ct")

        # Now we parametrize. This is what happens when we do e.g.,
        # @pytest.mark.parametrize('tmp_ct', range(count))
        # def test_foo(): pass
        metafunc.parametrize("tmp_ct", range(count))


@pytest.fixture(scope="session")
def port_picker():
    yield PortPicker()


@pytest.fixture(scope="function")
def memcached_client(df_server: DflyInstance):
    client = pymemcache.Client(f"127.0.0.1:{df_server.mc_port}", default_noreply=False)

    yield client

    client.flush_all()  # clean up after test
    client.quit()


@pytest.fixture(scope="session")
def with_tls_ca_cert_args(tmp_dir):
    ca_key = os.path.join(tmp_dir, "ca-key.pem")
    ca_cert = os.path.join(tmp_dir, "ca-cert.pem")
    gen_ca_cert(ca_key, ca_cert)
    return {"ca_key": ca_key, "ca_cert": ca_cert}


@pytest.fixture(scope="session")
def with_tls_server_args(tmp_dir, with_tls_ca_cert_args):
    tls_server_key = os.path.join(tmp_dir, "df-key.pem")
    tls_server_req = os.path.join(tmp_dir, "df-req.pem")
    tls_server_cert = os.path.join(tmp_dir, "df-cert.pem")

    gen_certificate(
        with_tls_ca_cert_args["ca_key"],
        with_tls_ca_cert_args["ca_cert"],
        tls_server_req,
        tls_server_key,
        tls_server_cert,
    )

    args = {"tls": None, "tls_key_file": tls_server_key, "tls_cert_file": tls_server_cert}
    return args


@pytest.fixture(scope="session")
def with_ca_tls_server_args(with_tls_server_args, with_tls_ca_cert_args):
    args = deepcopy(with_tls_server_args)
    args["tls_ca_cert_file"] = with_tls_ca_cert_args["ca_cert"]
    return args


@pytest.fixture(scope="session")
def with_ca_dir_tls_server_args(with_tls_server_args, with_tls_ca_cert_args):
    args = deepcopy(with_tls_server_args)
    ca_cert = with_tls_ca_cert_args["ca_cert"]
    ca_dir = os.path.dirname(ca_cert)
    # We need this because any program that uses OpenSSL requires directories to be set up like this
    # in order to find the certificates. This command, creates the necessary symlinks to the files
    # such that they can be consumed by OpenSSL when loaded from the directory.
    # For more info see: https://www.openssl.org/docs/man3.0/man1/c_rehash.html
    command = f"c_rehash {ca_dir}"
    subprocess.run(command, shell=True)
    args["tls_ca_cert_dir"] = ca_dir
    return args, ca_cert


@pytest.fixture(scope="session")
def with_tls_client_args(tmp_dir, with_tls_ca_cert_args):
    tls_client_key = os.path.join(tmp_dir, "client-key.pem")
    tls_client_req = os.path.join(tmp_dir, "client-req.pem")
    tls_client_cert = os.path.join(tmp_dir, "client-cert.pem")

    gen_certificate(
        with_tls_ca_cert_args["ca_key"],
        with_tls_ca_cert_args["ca_cert"],
        tls_client_req,
        tls_client_key,
        tls_client_cert,
    )

    args = {"ssl": True, "ssl_keyfile": tls_client_key, "ssl_certfile": tls_client_cert}
    return args


@pytest.fixture(scope="session")
def with_ca_tls_client_args(with_tls_client_args, with_tls_ca_cert_args):
    args = deepcopy(with_tls_client_args)
    args["ssl_ca_certs"] = with_tls_ca_cert_args["ca_cert"]
    return args


def copy_failed_logs(log_dir, report):
    assert log_dir
    test_failed_path = os.path.join(FAILED_PATH, os.path.basename(log_dir))
    if not os.path.exists(test_failed_path):
        os.makedirs(test_failed_path)

    logging.error(f"Test failed {report.nodeid} with logs: ")

    for f in os.listdir(log_dir):
        file = os.path.join(log_dir, f)
        if os.path.isfile(file):
            file = file.rstrip("\n")
            logging.error(f"🪵🪵🪵🪵🪵🪵 {file} 🪵🪵🪵🪵🪵🪵")
            shutil.copy(file, test_failed_path)

    # Clean up
    try:
        os.remove(LAST_LOGS)
    except OSError:
        pass


# tests results we get on the "call" state
# but we can not copy logs until "teardown" state because the server isn't stoped
# so we save result of the "call" state and process it on the "teardown" when the server is stoped
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    report = outcome.get_result()

    if report.when == "call":
        # Store the result of the call phase in the item
        item.call_outcome = report

    if report.when == "teardown":
        call_outcome = getattr(item, "call_outcome", None)
        log_dir = item.funcargs.get("df_log_dir")
        if log_dir:
            if report.failed:
                copy_failed_logs(log_dir, report)
            if call_outcome and call_outcome.failed:
                copy_failed_logs(log_dir, call_outcome)


@pytest.fixture(scope="function")
def redis_server(port_picker) -> RedisServer:
    s = RedisServer(port_picker.get_available_port())
    try:
        s.start()
    except FileNotFoundError as e:
        skip_if_not_in_github()
        raise
    time.sleep(1)
    yield s
    s.stop()


@pytest.fixture(scope="function")
def redis_local_server(port_picker) -> RedisServer:
    s = RedisServer(port_picker.get_available_port())
    time.sleep(1)
    yield s
    s.stop()


================================================
FILE: tests/dragonfly/connection_test.py
================================================
import asyncio
import logging
import random
import socket
import ssl
import string
import time
from dataclasses import dataclass
from threading import Thread

import async_timeout
import pytest
import redis as base_redis
from redis import asyncio as aioredis
from redis.cache import CacheConfig
from redis.backoff import NoBackoff
from redis.retry import Retry
from redis.exceptions import ConnectionError, ResponseError

from . import dfly_args
from .instance import DflyInstance, DflyInstanceFactory
from .utility import tick_timer, assert_eventually

BASE_PORT = 1111


@dataclass(frozen=True)
class CollectedRedisMsg:
    cmd: str
    src: str = "tcp"

    @staticmethod
    def all_from_src(*args, src="tcp"):
        return [CollectedRedisMsg(arg, src) for arg in args]


class CollectingMonitor:
    """Tracks all monitor messages between start() and stop()"""

    def __init__(self, client):
        self.client = client
        self.messages = []
        self._monitor_task = None

    async def _monitor(self):
        async with self.client.monitor() as monitor:
            async for message in monitor.listen():
                self.messages.append(CollectedRedisMsg(message["command"], message["client_type"]))

    async def start(self):
        if self._monitor_task is None:
            self._monitor_task = asyncio.create_task(self._monitor())
        await asyncio.sleep(0.1)

    async def stop(self, timeout=0.1):
        if self._monitor_task:
            # Wait for Dragonfly to send all async monitor messages
            await asyncio.sleep(timeout)
            self._monitor_task.cancel()
            try:
                await self._monitor_task
            except asyncio.CancelledError:
                pass
            self._monitor_task = None

        def should_exclude(cmd: str):
            cmd = cmd.upper()
            return "SELECT" in cmd or "CLIENT SETINFO" in cmd

        while len(self.messages) > 0 and should_exclude(self.messages[0].cmd):
            self.messages = self.messages[1:]
        return self.messages


"""
Test MONITOR command with basic use case
"""


@dfly_args({"proactor_threads": 4})
async def test_monitor_command(async_pool):
    monitor = CollectingMonitor(aioredis.Redis(connection_pool=async_pool))
    await monitor.start()

    c = aioredis.Redis(connection_pool=async_pool)
    await c.set("a", 1)
    await c.get("a")
    await c.lpush("l", "V")
    await c.lpop("l")

    collected = await monitor.stop()
    expected = CollectedRedisMsg.all_from_src("SET a 1", "GET a", "LPUSH l V", "LPOP l")

    assert expected == collected


"""
Test MONITOR command with MULTI/EXEC transaction with squashing
"""


@dfly_args({"proactor_threads": 4, "multi_exec_squash": "true"})
async def test_monitor_command_multi(async_pool):
    monitor = CollectingMonitor(aioredis.Redis(connection_pool=async_pool))
    await monitor.start()

    c = aioredis.Redis(connection_pool=async_pool)
    p = c.pipeline(transaction=True)

    expected = []
    for i in range(100):
        p.lpush(str(i), "V")
        expected.append(f"LPUSH {i} V")

    await p.execute()

    collected = await monitor.stop(0.3)
    expected = CollectedRedisMsg.all_from_src(*expected)

    # The order is random due to squashing
    assert set(expected) == set(collected[1:-1])


"""
Test MONITOR command preserves correct order for MULTI/EXEC sequences
Regression test for https://github.com/dragonflydb/dragonfly/issues/5953
"""


@dfly_args({"proactor_threads": 4})
async def test_monitor_command_multi_exec_order(async_pool):
    monitor = CollectingMonitor(aioredis.Redis(connection_pool=async_pool))
    await monitor.start()

    c = aioredis.Redis(connection_pool=async_pool)
    p = c.pipeline(transaction=True)
    p.ping()
    p.set("key1", "value1")
    p.get("key1")
    await p.execute()

    collected = await monitor.stop()

    # Verify the commands appear in the correct order: MULTI, PING, SET, GET, EXEC
    assert len(collected) == 5
    assert "MULTI" in collected[0].cmd.upper()
    assert "PING" in collected[1].cmd.upper()
    assert "SET" in collected[2].cmd.upper()
    assert "GET" in collected[3].cmd.upper()
    assert "EXEC" in collected[4].cmd.upper()


"""
Test MONITOR command with lua script
https://github.com/dragonflydb/dragonfly/issues/756
"""

TEST_MONITOR_SCRIPT = """
    redis.call('SET', 'A', 1)
    redis.call('GET', 'A')
    redis.call('SADD', 'S', 1, 2, 3)
    redis.call('LPUSH', 'L', 1)
    redis.call('LPOP', 'L')
"""


@dfly_args({"proactor_threads": 4, "lua_auto_async": "false"})
async def test_monitor_command_lua(async_pool):
    monitor = CollectingMonitor(aioredis.Redis(connection_pool=async_pool))
    await monitor.start()

    c = aioredis.Redis(connection_pool=async_pool)
    await c.eval(TEST_MONITOR_SCRIPT, 3, "A", "S", "L")

    collected = await monitor.stop()
    expected = CollectedRedisMsg.all_from_src(
        "SET A 1", "GET A", "SADD S 1 2 3", "LPUSH L 1", "LPOP L", src="lua"
    )

    assert expected == collected[1:]


@dfly_args({"proactor_threads": 1})
async def test_monitor_multi_exec_close(df_server: DflyInstance):
    async def monitor_multi_exec_close():
        client = aioredis.Redis(port=df_server.port, single_connection_client=True)
        try:
            await client.execute_command("MULTI")
            await client.execute_command("MONITOR")
            await client.execute_command("MONITOR")
            await client.execute_command("SET", "a", "1")
            await client.execute_command("EXEC")
        except Exception:
            pass
        finally:
            await client.close()

    for _ in range(200):
        await asyncio.gather(*[monitor_multi_exec_close() for _ in range(10)])

    # If we get here, the server did not crash.
    client = df_server.client()
    assert await client.ping()


"""
Run test in pipeline mode.
This is mostly how this is done with python - its more like a transaction that
the connections is running all commands in its context
"""


async def test_pipeline_support(async_client):
    def generate(max):
        for i in range(max):
            yield f"key{i}", f"value={i}"

    messages = {a: b for a, b in generate(5)}
    assert await run_pipeline_mode(async_client, messages)


async def reader(channel: aioredis.client.PubSub, messages, max: int):
    message_count = len(messages)
    while message_count > 0:
        try:
            async with async_timeout.timeout(1):
                message = await channel.get_message(ignore_subscribe_messages=True)
                if message is not None:
                    message_count = message_count - 1
                    if message["data"] not in messages:
                        return False, f"got unexpected message from pubsub - {message['data']}"
                await asyncio.sleep(0.01)
        except asyncio.TimeoutError:
            pass
    return True, "success"


async def run_pipeline_mode(async_client: aioredis.Redis, messages):
    pipe = async_client.pipeline(transaction=False)
    for key, val in messages.items():
        pipe.set(key, val)
    result = await pipe.execute()

    print(f"got result from the pipeline of {result} with len = {len(result)}")
    if len(result) != len(messages):
        return False, f"number of results from pipe {len(result)} != expected {len(messages)}"
    elif False in result:
        return False, "expecting to successfully get all result good, but some failed"
    else:
        return True, "all command processed successfully"


"""
Test the pipeline command
Open connection to the subscriber and publish on the other end messages
Make sure that we are able to send all of them and that we are getting the
expected results on the subscriber side
"""


async def test_pubsub_command(async_client):
    def generate(max):
        for i in range(max):
            yield f"message number {i}"

    messages = [a for a in generate(5)]
    assert await run_pubsub(async_client, messages, "channel-1")


async def run_pubsub(async_client, messages, channel_name):
    pubsub = async_client.pubsub()
    await pubsub.subscribe(channel_name)

    future = asyncio.create_task(reader(pubsub, messages, len(messages)))
    success = True

    for message in messages:
        res = await async_client.publish(channel_name, message)
        if not res:
            success = False
            break

    await future
    status, message = future.result()

    await pubsub.close()
    if status and success:
        return True, "successfully completed all"
    else:
        return (
            False,
            f"subscriber result: {status}: {message},  publisher publish: success {success}",
        )


async def run_multi_pubsub(async_client, messages, channel_name):
    subs = [async_client.pubsub() for i in range(5)]
    for s in subs:
        await s.subscribe(channel_name)

    tasks = [
        asyncio.create_task(reader(s, messages, random.randint(0, len(messages)))) for s in subs
    ]

    success = True

    for message in messages:
        res = await async_client.publish(channel_name, message)
        if not res:
            success = False
            break

    for f in tasks:
        await f
    results = [f.result() for f in tasks]

    for s in subs:
        await s.close()
    if success:
        for status, message in results:
            if not status:
                return False, f"failed to process {message}"
        return True, "success"
    else:
        return False, "failed to publish"


"""
Test with multiple subscribers for a channel
We want to stress this to see if we have any issue
with the pub sub code since we are "sharing" the message
across multiple connections internally
"""


async def test_multi_pubsub(async_client):
    def generate(max):
        for i in range(max):
            yield f"this is message number {i} from the publisher on the channel"

    messages = [a for a in generate(500)]
    state, message = await run_multi_pubsub(async_client, messages, "my-channel")

    assert state, message


"""
Test PUBSUB NUMSUB command.
"""


async def test_pubsub_subcommand_for_numsub(async_client: aioredis.Redis):
    async def resub(s: "aioredis.PubSub", sub: bool, chan: str):
        if sub:
            await s.subscribe(chan)
        else:
            await s.unsubscribe(chan)
        # Wait for PUSH message to be parsed to make sure upadte was performed
        await s.get_message(timeout=0.1)

    # Subscribe 5 times to chan1
    subs1 = [async_client.pubsub() for i in range(5)]
    await asyncio.gather(*(resub(s, True, "chan1") for s in subs1))
    assert await async_client.pubsub_numsub("chan1") == [("chan1", 5)]

    # Unsubscribe all from chan1
    await asyncio.gather(*(resub(s, False, "chan1") for s in subs1))

    # Make sure numsub drops to 0
    async for numsub, breaker in tick_timer(lambda: async_client.pubsub_numsub("chan1")):
        with breaker:
            assert numsub[0][1] == 0

    # Check empty numsub
    assert await async_client.pubsub_numsub() == []

    subs2 = [async_client.pubsub() for i in range(5)]
    await asyncio.gather(*(resub(s, True, "chan2") for s in subs2))

    subs3 = [async_client.pubsub() for i in range(10)]
    await asyncio.gather(*(resub(s, True, "chan3") for s in subs3))

    assert await async_client.pubsub_numsub("chan2", "chan3") == [("chan2", 5), ("chan3", 10)]

    await asyncio.gather(*(s.unsubscribe() for s in subs2 + subs3))


"""
Test that pubsub clients who are stuck on backpressure from a slow client (the one in the test doesn't read messages at all)
will eventually unblock when it disconnects.
"""


@pytest.mark.large
@dfly_args({"proactor_threads": "1", "publish_buffer_limit": "100"})
async def test_publish_stuck(df_server: DflyInstance, async_client: aioredis.Redis):
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port, limit=10)
    writer.write(b"SUBSCRIBE channel\r\n")
    await writer.drain()

    async def pub_task():
        payload = "msg" * 1000
        p = async_client.pipeline()
        for _ in range(1000):
            p.publish("channel", payload)
        await p.execute()

    publishers = [asyncio.create_task(pub_task()) for _ in range(20)]

    await asyncio.sleep(5)

    # Check we reached the limit
    pub_bytes = int((await async_client.info())["dispatch_queue_subscriber_bytes"])
    assert pub_bytes >= 100

    await asyncio.sleep(0.1)

    # Make sure processing is stalled
    new_pub_bytes = int((await async_client.info())["dispatch_queue_subscriber_bytes"])
    assert new_pub_bytes == pub_bytes

    writer.write(b"QUIT\r\n")
    await writer.drain()
    writer.close()

    # Make sure all publishers unblock eventually
    for pub in asyncio.as_completed(publishers):
        await pub


@pytest.mark.large
@dfly_args({"proactor_threads": "4"})
async def test_pubsub_busy_connections(df_server: DflyInstance):
    sleep = 60

    async def sub_thread():
        i = 0

        async def sub_task():
            nonlocal i
            sleep_task = asyncio.create_task(asyncio.sleep(sleep))
            while not sleep_task.done():
                client = df_server.client()
                pubsub = client.pubsub()
                await pubsub.subscribe("channel")
                # await pubsub.unsubscribe("channel")
                i = i + 1
                await client.close()

        subs = [asyncio.create_task(sub_task()) for _ in range(10)]
        for s in subs:
            await s
        logging.debug(f"Exiting thread after {i} subscriptions")

    async def pub_task():
        pub = df_server.client()
        i = 0
        sleep_task = asyncio.create_task(asyncio.sleep(sleep))
        while not sleep_task.done():
            await pub.publish("channel", f"message-{i}")
            i = i + 1

    def run_in_thread():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(sub_thread())

    threads = []
    for _ in range(10):
        thread = Thread(target=run_in_thread)
        thread.start()
        threads.append(thread)

    await pub_task()

    for thread in threads:
        thread.join()


async def test_subscribers_with_active_publisher(df_server: DflyInstance, max_connections=100):
    # TODO: I am not how to customize the max connections for the pool.
    async_pool = aioredis.ConnectionPool(
        host="localhost",
        port=df_server.port,
        db=0,
        decode_responses=True,
        max_connections=max_connections,
    )

    async def publish_worker():
        client = aioredis.Redis(connection_pool=async_pool)
        for i in range(0, 2000):
            await client.publish("channel", f"message-{i}")
        await client.aclose()

    async def channel_reader(channel: aioredis.client.PubSub):
        for i in range(0, 150):
            try:
                async with async_timeout.timeout(1):
                    message = await channel.get_message(ignore_subscribe_messages=True)
            except asyncio.TimeoutError:
                break

    async def subscribe_worker():
        client = aioredis.Redis(connection_pool=async_pool)
        pubsub = client.pubsub()
        async with pubsub as p:
            await pubsub.subscribe("channel")
            await channel_reader(pubsub)
            await pubsub.unsubscribe("channel")

    # Create a publisher that sends constantly messages to the channel
    # Then create subscribers that will subscribe to already active channel
    pub_task = asyncio.create_task(publish_worker())
    await asyncio.gather(*(subscribe_worker() for _ in range(max_connections - 10)))
    await pub_task
    await async_pool.disconnect()


# This test ensures that no messages are sent after a successful
# acknowledgement of a unsubscribe.
# Low publish_buffer_limit makes publishers block on memory backpressure


@dfly_args({"publish_buffer_limit": 100, "proactor_threads": 2})
async def test_pubsub_unsubscribe(df_server: DflyInstance):
    long_message = "a" * 100_000
    pub_sent = 0
    pub_ready_ev = asyncio.Event()

    async def publisher():
        nonlocal pub_sent
        async with df_server.client(single_connection_client=True) as c:
            for _ in range(32):
                await c.execute_command("PUBLISH", "chan", long_message)
                # Unblock subscriber after a sufficient amount of publish requests accumulated
                pub_sent += 1
                if pub_sent >= 16:
                    pub_ready_ev.set()

    # Get raw connection from the client and subscribe to chan
    cl = df_server.client(single_connection_client=True)
    await cl.ping()
    conn = cl.connection
    await conn.send_command("SUBSCRIBE chan")

    # Flood our only subscriber with large messages to make publishers stop
    tasks = [asyncio.create_task(publisher()) for _ in range(16)]

    # Unsubscribe in the process
    await pub_ready_ev.wait()
    await conn.send_command("UNSUBSCRIBE")

    # No messages should be received after we've read unsubscribe reply
    had_unsub = False
    while True:
        reply = await conn.read_response(timeout=0.2)
        if reply is None:
            break

        if reply[0] == "unsubscribe":
            assert reply[2] == 0  # zero subscriptions left
            had_unsub = True
        else:
            assert not had_unsub, "found message even after all subscriptions were removed"

    assert had_unsub
    await asyncio.gather(*tasks)
    await cl.aclose()


async def produce_expiring_keys(async_client: aioredis.Redis):
    keys = []
    for i in range(10, 50):
        keys.append(f"k{i}")
        await async_client.set(keys[-1], "X", px=200 + i * 10)
    return keys


async def collect_expiring_events(pclient, keys):
    events = []
    async for message in pclient.listen():
        if message["type"] == "subscribe":
            continue

        events.append(message)
        if len(events) >= len(keys):
            break
    return events


@dfly_args({"notify_keyspace_events": "Ex"})
async def test_keyspace_events(async_client: aioredis.Redis):
    pclient = async_client.pubsub()
    await pclient.subscribe("__keyevent@0__:expired")

    keys = await produce_expiring_keys(async_client)

    # We don't support immediate expiration:
    # keys += ['immediate']
    # await async_client.set(keys[-1], 'Y', exat=123) # expired 50 years ago

    events = await collect_expiring_events(pclient, keys)

    assert set(ev["data"] for ev in events) == set(keys)


async def test_keyspace_events_config_set(async_client: aioredis.Redis):
    # nonsense does not make sense as argument, we only accept ex or empty string
    with pytest.raises(ResponseError):
        await async_client.config_set("notify_keyspace_events", "nonsense")

    await async_client.config_set("notify_keyspace_events", "ex")
    pclient = async_client.pubsub()
    await pclient.subscribe("__keyevent@0__:expired")

    keys = await produce_expiring_keys(async_client)

    events = await collect_expiring_events(pclient, keys)

    assert set(ev["data"] for ev in events) == set(keys)

    keys = await produce_expiring_keys(async_client)
    await async_client.config_set("notify_keyspace_events", "")
    with pytest.raises(asyncio.TimeoutError):
        async with async_timeout.timeout(1):
            await collect_expiring_events(pclient, keys)


@dfly_args({"max_busy_read_usec": 50000})
async def test_reply_count(df_server: DflyInstance):
    """Make sure reply aggregations reduce reply counts for common cases"""

    async def get_reply_count():
        metrics = await df_server.metrics()
        return int(metrics["dragonfly_reply"].samples[0].value)

    async def measure(aw):
        before = await get_reply_count()
        await aw
        return await get_reply_count() - before

    async_client = df_server.client()
    await async_client.config_resetstat()
    base = await get_reply_count()
    info_diff = await get_reply_count() - base
    assert info_diff == 0  # no commands yet

    # Warm client buffer up
    await async_client.lpush("warmup", *(i for i in range(500)))
    await async_client.lrange("warmup", 0, -1)

    # Integer list
    await async_client.lpush("list-1", *(i for i in range(100)))
    assert await measure(async_client.lrange("list-1", 0, -1)) == 1

    # Integer set
    await async_client.sadd("set-1", *(i for i in range(100)))
    assert await measure(async_client.smembers("set-1")) <= 2

    # Sorted sets
    await async_client.zadd("zset-1", mapping={str(i): i for i in range(50)})
    assert await measure(async_client.zrange("zset-1", 0, -1, withscores=True)) <= 2

    # Exec call
    e = async_client.pipeline(transaction=True)
    for _ in range(100):
        e.incr("num-1")

    # one - for MULTI-OK, one for the rest. Depends on the squashing efficiency,
    # can be either 1 or 2 replies.
    assert await measure(e.execute()) <= 2

    # Just pipeline
    p = async_client.pipeline(transaction=False)
    for _ in range(100):
        p.incr("num-1")
    assert await measure(p.execute()) <= 2

    # Script result
    assert await measure(async_client.eval('return {1,2,{3,4},5,6,7,8,"nine"}', 0)) == 1

    # Search results
    await async_client.execute_command("FT.CREATE i1 SCHEMA name text")
    for i in range(50):
        await async_client.hset(f"key-{i}", "name", f"name number {i}")
    assert await measure(async_client.ft("i1").search("*")) <= 2


async def test_big_command(df_server, size=8 * 1024):
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port)

    writer.write(f"SET a {'v'*size}\n".encode())
    await writer.drain()

    assert "OK" in (await reader.readline()).decode()

    writer.close()
    await writer.wait_closed()


async def test_subscribe_pipelined(async_client: aioredis.Redis):
    pipe = async_client.pipeline(transaction=False)
    pipe.execute_command("subscribe channel").execute_command("subscribe channel")
    await pipe.echo("bye bye").execute()


async def test_subscribe_in_pipeline(async_client: aioredis.Redis):
    pipe = async_client.pipeline(transaction=False)
    pipe.echo("one")
    pipe.execute_command("SUBSCRIBE ch1")
    pipe.echo("two")
    pipe.execute_command("SUBSCRIBE ch2")
    pipe.echo("three")
    res = await pipe.execute()

    assert res == ["one", ["subscribe", "ch1", 1], "two", ["subscribe", "ch2", 2], "three"]


async def test_send_delay_metric(df_server: DflyInstance):
    client = df_server.client()
    await client.client_setname("client1")
    blob = "A" * 1000
    for j in range(10):
        await client.set(f"key-{j}", blob)

    await client.config_set("pipeline_queue_limit", 100)
    reader, writer = await asyncio.open_connection("localhost", df_server.port)

    async def send_data_noread():
        for j in range(500000):
            writer.write(f"GET key-{j % 10}\n".encode())
            await writer.drain()

    t1 = asyncio.create_task(send_data_noread())

    @assert_eventually
    async def wait_for_large_delay():
        info = await client.info("clients")
        assert int(info["send_delay_ms"]) > 100

    # Check that the delay metric indeed increases as we have a connection
    # that is not reading the data.
    await wait_for_large_delay()
    t1.cancel()
    writer.close()


async def test_match_http(df_server: DflyInstance):
    client = df_server.client()
    reader, writer = await asyncio.open_connection("localhost", df_server.port)
    for i in range(2000):
        writer.write(f"foo bar ".encode())
        await writer.drain()


"""
This test makes sure that Dragonfly can receive blocks of pipelined commands even
while a script is still executing. This is a dangerous scenario because both the dispatch fiber
and the connection fiber are actively using the context. What is more, the script execution injects
its own custom reply builder, which can't be used anywhere else, besides the lua script itself.
"""

BUSY_SCRIPT = """
for i=1,300 do
    redis.call('MGET', 'k1', 'k2', 'k3')
end
"""

PACKET1 = """
MGET s1 s2 s3
EVALSHA {sha} 3 k1 k2 k3
"""

PACKET2 = """
MGET m1 m2 m3
MGET m4 m5 m6
MGET m7 m8 m9\n
"""

PACKET3 = (
    """
PING
"""
    * 500
    + "ECHO DONE\n"
)


async def test_parser_while_script_running(async_client: aioredis.Redis, df_server: DflyInstance):
    sha = await async_client.script_load(BUSY_SCRIPT)

    # Use a raw tcp connection for strict control of sent commands
    # Below we send commands while the previous ones didn't finish
    reader, writer = await asyncio.open_connection("localhost", df_server.port)

    # Send first pipeline packet, last commands is a long executing script
    writer.write(PACKET1.format(sha=sha).encode())
    await writer.drain()

    # Give the script some time to start running
    await asyncio.sleep(0.01)

    # Send another packet that will be received while the script is running
    writer.write(PACKET2.encode())
    # The last batch has to be big enough, so the script will finish before it is fully consumed
    writer.write(PACKET3.encode())
    await writer.drain()

    await reader.readuntil(b"DONE")
    writer.close()
    await writer.wait_closed()


"""
    This test makes sure that we can migrate while handling pipelined commands and don't keep replies
    batched even if the stream suddenly stops.
"""


@dfly_args({"proactor_threads": "4", "pipeline_squash": 0})
async def test_pipeline_batching_while_migrating(
    async_client: aioredis.Redis, df_server: DflyInstance
):
    sha = await async_client.script_load("return redis.call('GET', KEYS[1])")

    reader, writer = await asyncio.open_connection("localhost", df_server.port)

    # First, write a EVALSHA that will ask for migration (75% it's on the wrong shard)
    # and some more pipelined commands that will keep Dragonfly busy
    incrs = "".join("INCR a\r\n" for _ in range(50))
    writer.write((f"EVALSHA {sha} 1 a\r\n" + incrs).encode())
    await writer.drain()
    # We migrate only when the socket wakes up, so send another batch to trigger migration
    writer.write("INCR a\r\n".encode())
    await writer.drain()

    # The data doesn't necessarily arrive in a single batch
    async def read():
        reply = ""
        while not reply.strip().endswith("51"):
            reply = (await reader.read(520)).decode()

    # Make sure we recived all replies
    await asyncio.wait_for(read(), timeout=2.0)

    writer.close()
    await writer.wait_closed()


@dfly_args({"proactor_threads": 1})
async def test_large_cmd(async_client: aioredis.Redis):
    MAX_ARR_SIZE = 65535
    res = await async_client.hset(
        "foo", mapping={f"key{i}": f"val{i}" for i in range(MAX_ARR_SIZE // 2)}
    )
    assert res == MAX_ARR_SIZE // 2

    res = await async_client.mset({f"key{i}": f"val{i}" for i in range(MAX_ARR_SIZE // 2)})
    assert res

    res = await async_client.mget([f"key{i}" for i in range(MAX_ARR_SIZE)])
    assert len(res) == MAX_ARR_SIZE


@dfly_args({"proactor_threads": 1})
async def test_parser_memory_stats(df_server, async_client: aioredis.Redis):
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port, limit=10)
    writer.write(b"*1000\r\n")
    writer.write(b"$4\r\nmget\r\n")
    val = (b"a" * 100) + b"\r\n"
    for i in range(0, 900):
        writer.write(b"$100\r\n" + val)
    await writer.drain()  # writer is pending because the request is not finished.

    @assert_eventually
    async def check_stats():
        stats = await async_client.execute_command("memory stats")
        assert stats["connections.direct_bytes"] > 130000

    await check_stats()


async def test_reject_non_tls_connections_on_tls(with_tls_server_args, df_factory):
    server: DflyInstance = df_factory.create(
        no_tls_on_admin_port="true",
        admin_port=1111,
        port=1211,
        requirepass="XXX",
        **with_tls_server_args,
    )
    server.start()

    client = server.client(password="XXX")
    with pytest.raises(ResponseError):
        await client.dbsize()
    await client.aclose()

    client = server.admin_client(password="XXX")
    assert await client.dbsize() == 0


async def test_tls_insecure(with_ca_tls_server_args, with_tls_client_args, df_factory):
    server = df_factory.create(port=BASE_PORT, **with_ca_tls_server_args)
    server.start()

    client = aioredis.Redis(port=server.port, **with_tls_client_args, ssl_cert_reqs=None)
    assert await client.dbsize() == 0


async def test_tls_full_auth(with_ca_tls_server_args, with_ca_tls_client_args, df_factory):
    server = df_factory.create(port=BASE_PORT, **with_ca_tls_server_args)
    server.start()

    client = aioredis.Redis(port=server.port, **with_ca_tls_client_args)
    assert await client.dbsize() == 0


async def test_tls_reject(
    with_ca_tls_server_args, with_tls_client_args, df_factory: DflyInstanceFactory
):
    server: DflyInstance = df_factory.create(port=BASE_PORT, **with_ca_tls_server_args)
    server.start()

    client = server.client(**with_tls_client_args, ssl_cert_reqs=None)
    await client.ping()
    await client.aclose()

    client = server.client(**with_tls_client_args)
    with pytest.raises(ConnectionError):
        await client.ping()


@dfly_args({"proactor_threads": "4", "pipeline_squash": 1})
async def test_squashed_pipeline_eval(async_client: aioredis.Redis):
    p = async_client.pipeline(transaction=False)
    for _ in range(5):
        # Deliberately lowcase EVAL to test that it is not squashed
        p.execute_command("eval", "return redis.call('set', KEYS[1], 'value')", 1, "key")
    res = await p.execute()
    assert res == ["OK"] * 5


@dfly_args({"proactor_threads": "4", "pipeline_squash": 10})
async def test_squashed_pipeline(async_client: aioredis.Redis):
    p = async_client.pipeline(transaction=False)

    for j in range(50):
        for i in range(10):
            p.incr(f"k{i}")
        p.execute_command("NOTFOUND")

    res = await p.execute(raise_on_error=False)

    for j in range(50):
        assert res[0:10] == [j + 1] * 10
        assert isinstance(res[10], aioredis.ResponseError)
        res = res[11:]


@dfly_args({"proactor_threads": "4", "pipeline_squash": 10})
async def test_squashed_pipeline_seeder(df_server, df_seeder_factory):
    seeder = df_seeder_factory.create(port=df_server.port, keys=10_000)
    await seeder.run(target_deviation=0.1)


"""
This test makes sure that multi transactions can be integrated into pipeline squashing
"""


@dfly_args({"proactor_threads": "4", "pipeline_squash": 1})
async def test_squashed_pipeline_multi(async_client: aioredis.Redis):
    p = async_client.pipeline(transaction=False)
    for _ in range(5):
        # Series of squashable commands
        for _ in range(5):
            p.set("first", "true")
        # Non-squashable
        p.info()
        # Eval without at tx
        p.execute_command("MULTI")
        p.set("second", "true")
        p.execute_command("EXEC")
        # Finishing sequence
        for _ in range(5):
            p.set("third", "true")
    await p.execute()


async def test_unix_domain_socket(df_factory, tmp_dir):
    server = df_factory.create(proactor_threads=1, port=BASE_PORT, unixsocket="./df.sock")
    server.start()

    await asyncio.sleep(0.5)

    r = aioredis.Redis(unix_socket_path=tmp_dir / "df.sock")
    assert await r.ping()


async def test_unix_socket_only(df_factory, tmp_dir):
    server = df_factory.create(proactor_threads=1, port=0, unixsocket="./df.sock")
    # we call _start because we start() wait for the port to become available and
    # we run here a process without a port.
    server._start()

    await asyncio.sleep(1)

    r = aioredis.Redis(unix_socket_path=tmp_dir / "df.sock")
    assert await r.ping()


"""
Test nested pauses. Executing CLIENT PAUSE should be possible even if another write-pause is active.
It should prolong the pause for all current commands.
"""


@pytest.mark.large
async def test_nested_client_pause(async_client: aioredis.Redis):
    async def do_pause():
        await async_client.execute_command("CLIENT", "PAUSE", "1000", "WRITE")

    async def do_write():
        await async_client.execute_command("LPUSH", "l", "1")

    p1 = asyncio.create_task(do_pause())
    await asyncio.sleep(0.1)

    p2 = asyncio.create_task(do_write())
    assert not p2.done()

    await asyncio.sleep(0.5)
    p3 = asyncio.create_task(do_pause())

    await p1
    await asyncio.sleep(0.1)
    assert not p2.done()  # blocked by p3 now

    await p2
    await asyncio.sleep(0.0)
    assert p3.done()
    await p3


@dfly_args({"proactor_threads": "4"})
async def test_blocking_command_client_pause(async_client: aioredis.Redis):
    """
    1. Check client pause success when blocking transaction is running
    2. lpush is paused after running client puase
    3. once puased is finished lpush will run and blpop will pop the pushed value
    """

    async def blpop_command():
        res = await async_client.execute_command("blpop dest7 10")
        assert res == ["dest7", "value"]

    async def brpoplpush_command():
        res = await async_client.execute_command("brpoplpush src dest7 2")
        assert res == "value"

    async def lpush_command():
        await async_client.execute_command("lpush src value")

    blpop = asyncio.create_task(blpop_command())
    brpoplpush = asyncio.create_task(brpoplpush_command())
    await asyncio.sleep(0.1)

    res = await async_client.execute_command("client pause 1000")
    assert res == "OK"

    lpush = asyncio.create_task(lpush_command())
    assert not lpush.done()

    await lpush
    await brpoplpush
    await blpop


async def test_multiple_blocking_commands_client_pause(async_client: aioredis.Redis):
    """
    Check running client pause command simultaneously with running multiple blocking command
    from multiple connections
    """

    async def just_blpop():
        key = "".join(random.choices(string.ascii_letters, k=3))
        await async_client.execute_command(f"blpop {key} 2")

    async def client_pause():
        res = await async_client.execute_command("client pause 1000")
        assert res == "OK"

    tasks = [just_blpop() for _ in range(20)]
    tasks.append(client_pause())

    all = asyncio.gather(*tasks)

    assert not all.done()
    await all


async def test_tls_when_read_write_is_interleaved(
    with_ca_tls_server_args, with_ca_tls_client_args, df_factory
):
    """
    This test covers a deadlock bug in helio and TlsSocket when a client connection renegotiated a
    handshake without reading its pending data from the socket.
    This is a weak test case and from our local experiments it deadlocked 30% of the test runs
    """
    server: DflyInstance = df_factory.create(
        port=1211, **with_ca_tls_server_args, proactor_threads=1
    )

    server.start()

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    ssl_key = with_ca_tls_client_args["ssl_keyfile"]
    ssl_cert = with_ca_tls_client_args["ssl_certfile"]
    ssl_ca_cert = with_ca_tls_client_args["ssl_ca_certs"]

    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.load_verify_locations(ssl_ca_cert)
    context.load_cert_chain(certfile=ssl_cert, keyfile=ssl_key)
    context.verify_mode = ssl.CERT_REQUIRED
    context.maximum_version = ssl.TLSVersion.TLSv1_2

    ssl_sock = context.wrap_socket(s, server_hostname="localhost")
    ssl_sock.connect(("127.0.0.1", server.port))
    ssl_sock.settimeout(0.1)

    tmp = "f" * 1000
    message = f"SET foo {tmp}\r\n".encode()
    ssl_sock.send(message)

    try:
        for i in range(0, 100_000):
            res = random.randint(1, 4)
            message = b""
            for j in range(0, res):
                message = message + b"GET foo\r\n"
            ssl_sock.send(message)
            ssl_sock.do_handshake()
    except:
        # We might have filled the socket buffer, causing further sending will fail
        pass

    # This deadlocks
    client = aioredis.Redis(port=server.port, **with_ca_tls_client_args)
    await client.execute_command("GET foo")


async def test_lib_name_ver(async_client: aioredis.Redis):
    await async_client.execute_command("client setinfo lib-name dragonfly")
    await async_client.execute_command("client setinfo lib-ver 1.2.3.4")

    list = await async_client.execute_command("client list")
    assert len(list) == 1
    assert list[0]["lib-name"] == "dragonfly"
    assert list[0]["lib-ver"] == "1.2.3.4"


async def test_client_info(async_client: aioredis.Redis):
    """Test CLIENT INFO returns info about the current connection only."""
    await async_client.client_setname("test_client_info")

    info = await async_client.execute_command("CLIENT INFO")
    assert isinstance(info, dict)
    assert info["name"] == "test_client_info"

    # Verify CLIENT INFO returns same format as CLIENT LIST but for single connection
    client_list = await async_client.client_list()
    assert len(client_list) == 1
    # CLIENT INFO should contain the same client id as CLIENT LIST
    assert str(info["id"]) == str(client_list[0]["id"])


async def test_hiredis(df_factory):
    server = df_factory.create(proactor_threads=1)
    server.start()
    client = base_redis.Redis(port=server.port, protocol=3, cache_config=CacheConfig())
    client.ping()


@assert_eventually(times=500)
async def wait_for_conn_drop(async_client):
    clients = await async_client.client_list()
    logging.info("wait_for_conn_drop clients: %s", clients)
    assert len(clients) <= 1


@dfly_args({"timeout": 1})
async def test_timeout(df_server: DflyInstance, async_client: aioredis.Redis):
    # TODO investigate why it fails -- client is not stuck.
    if df_server.has_arg("experimental_io_loop_v2"):
        pytest.skip(f"Fails in the assertion below")

    another_client = df_server.client()
    await another_client.ping()
    clients = await async_client.client_list()
    assert len(clients) == 2

    await asyncio.sleep(2)

    await wait_for_conn_drop(async_client)
    info = await async_client.info("clients")
    assert int(info["timeout_disconnects"]) >= 1


@dfly_args({"send_timeout": 3})
async def test_send_timeout(df_server, async_client: aioredis.Redis):
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port)
    writer.write(f"client setname writer_test\n".encode())
    await writer.drain()
    assert "OK" in (await reader.readline()).decode()
    clients = await async_client.client_list()
    assert len(clients) == 2
    size = 1024 * 1024
    writer.write(f"SET a {'v'*size}\n".encode())
    await writer.drain()

    async def get_task():
        while True:
            writer.write(f"GET a\n".encode())
            await writer.drain()
            await asyncio.sleep(0.1)

    get = asyncio.create_task(get_task())

    @assert_eventually(times=600)
    async def wait_for_stuck_on_send():
        clients = await async_client.client_list()
        logging.info("wait_for_stuck_on_send clients: %s", clients)
        phase = next(
            (client["phase"] for client in clients if client["name"] == "writer_test"), None
        )
        assert phase == "send"

    await wait_for_stuck_on_send()
    await wait_for_conn_drop(async_client)
    info = await async_client.info("clients")
    assert int(info["timeout_disconnects"]) >= 1
    logging.info("finished disconnect")
    get.cancel()


# Test that the cache pipeline does not grow or shrink under constant pipeline load.
@dfly_args({"proactor_threads": 1, "pipeline_squash": 9, "max_busy_read_usec": 50000})
async def test_pipeline_cache_only_async_squashed_dispatches(df_factory):
    server = df_factory.create()
    server.start()

    client = server.client()
    await client.ping()  # Make sure the connection and the protocol were established

    async def push_pipeline(size):
        p = client.pipeline(transaction=True)
        for i in range(size):
            p.info()
        res = await p.execute()
        return res

    # Dispatch only async command/pipelines and force squashing. pipeline_cache_bytes,
    # should be zero because:
    # We always dispatch the items that will be squashed, so when `INFO` gets called
    # the cache is empty because the pipeline consumed it throughout its execution
    # high max_busy_read_usec ensures that the connection fiber has enough time to push
    # all the commands to reach the squashing limit.
    for i in range(0, 10):
        # it's actually 11 commands. 8 INFO + 2 from the MULTI/EXEC block that is injected
        # by the client. The minimum to squash is 9 so it will squash the pipeline
        # and INFO ALL should return zero for all the squashed commands in the pipeline
        res = await push_pipeline(8)
        for r in res:
            assert r["pipeline_cache_bytes"] == 0

    # Non zero because we reclaimed/recycled the messages back to the cache
    info = await client.info()
    assert info["pipeline_cache_bytes"] > 0


# Test that the pipeline cache size shrinks on workloads that storm the datastore with
# pipeline commands and then "back off" by gradually reducing the pipeline load such that
# the cache becomes progressively underutilized. At that stage, the pipeline should slowly
# shrink (because it's underutilized).
@pytest.mark.skip("Flaky")
@dfly_args({"proactor_threads": 1})
async def test_pipeline_cache_size(df_server: DflyInstance):
    # Start 1 client.
    good_client = df_server.client()
    bad_actor_client = df_server.client()

    async def push_pipeline(bad_actor_client, size=1):
        # Fill cache.
        p = bad_actor_client.pipeline(transaction=True)
        for i in range(size):
            p.lpush(str(i), "V")
        await p.execute()

    # Establish a baseline for the cache size. We dispatch async here.
    await push_pipeline(bad_actor_client, 32)
    info = await good_client.info()

    old_pipeline_cache_bytes = info["pipeline_cache_bytes"]
    assert old_pipeline_cache_bytes > 0
    assert info["dispatch_queue_bytes"] == 0

    for i in range(30):
        await push_pipeline(bad_actor_client)
        await good_client.execute_command(f"set foo{i} bar")

    info = await good_client.info()

    # Gradually release pipeline.
    assert old_pipeline_cache_bytes > info["pipeline_cache_bytes"]
    assert info["dispatch_queue_bytes"] == 0

    # Now drain the full cache.
    async with async_timeout.timeout(5):
        while info["pipeline_cache_bytes"] != 0:
            await good_client.execute_command(f"set foo{i} bar")
            info = await good_client.info()

    assert info["dispatch_queue_bytes"] == 0


@dfly_args({"proactor_threads": 4, "pipeline_queue_limit": 10})
async def test_pipeline_overlimit(df_server: DflyInstance):
    client = df_server.client()

    await client.set("x", "a" * 1024 * 5)

    async def pipe_overlimit():
        c = df_server.client()
        pipe = c.pipeline()
        for i in range(1000):
            pipe.get("x")
        logging.debug("Executing...")
        res = await pipe.execute()
        logging.debug(f"Executed.")

    pipeline_tasks = [asyncio.create_task(pipe_overlimit()) for _ in range(20)]

    await asyncio.sleep(2)
    await client.config_set("pipeline_queue_limit", 10000)
    for task in pipeline_tasks:
        await task


async def test_client_unpause(df_server: DflyInstance):
    async_client = df_server.client()
    await async_client.client_pause(3000, all=False)

    async def set_foo():
        client = df_server.client()
        async with async_timeout.timeout(2):
            await client.execute_command("SET", "foo", "bar")

    p1 = asyncio.create_task(set_foo())

    await asyncio.sleep(0.5)
    assert not p1.done()

    async with async_timeout.timeout(0.5):
        await async_client.client_unpause()

    async with async_timeout.timeout(0.5):
        await p1
        assert p1.done()

    await async_client.client_pause(1, all=False)
    await asyncio.sleep(2)


async def test_client_pause_b2b(async_client):
    async with async_timeout.timeout(1):
        await async_client.client_pause(2000, all=False)
        await async_client.client_pause(2000, all=False)


async def test_client_unpause_after_pause_all(async_client):
    await async_client.client_pause(2000, all=True)
    # Blocks and waits
    res = await async_client.client_unpause()
    assert res == "OK"
    await async_client.client_pause(2000, all=False)
    res = await async_client.client_unpause()


async def test_client_detached_crash(df_factory):
    server = df_factory.create(proactor_threads=1)
    server.start()
    async_client = server.client()
    await async_client.client_pause(2, all=False)
    server.stop()


async def test_tls_client_kill_preemption(
    with_ca_tls_server_args, with_ca_tls_client_args, df_factory
):
    server = df_factory.create(proactor_threads=4, port=BASE_PORT, **with_ca_tls_server_args)
    server.start()

    client = server.client(
        single_connection_client=True, retry=Retry(NoBackoff(), 0), **with_ca_tls_client_args
    )
    assert await client.dbsize() == 0

    # Get the list of clients
    clients_info = await client.client_list()
    assert len(clients_info) == 1

    kill_id = clients_info[0]["id"]

    async def seed():
        try:
            while True:
                p = client.pipeline(transaction=True)
                for i in range(100):
                    p.lpush(str(i), "V")
                await p.execute()
        except (aioredis.ConnectionError, asyncio.CancelledError):
            pass

    task = asyncio.create_task(seed())

    await asyncio.sleep(0.1)

    cl = aioredis.Redis(port=server.port, **with_ca_tls_client_args)
    await cl.execute_command(f"CLIENT KILL ID {kill_id}")

    # Ensure that the killed client actually disconnects before we cancel the worker task.
    for _ in range(100):
        try:
            await client.ping()
        except aioredis.ConnectionError:
            break
        await asyncio.sleep(0.05)
    else:
        pytest.fail("Killed client did not disconnect")

    # Give the server time to process the kill and write logs
    await asyncio.sleep(0.5)
    task.cancel()
    await task

    server.stop()
    lines = server.find_in_logs("Preempting inside of atomic section, fiber")
    assert len(lines) == 0


@dfly_args({"proactor_threads": 4})
async def test_client_migrate(df_server: DflyInstance):
    """
    Test that we can migrate a client with "CLIENT MIGRATE" command.
    """
    client1 = df_server.client()
    await client1.client_setname("test_migrate")
    resp = await client1.execute_command("DFLY THREAD")
    client_id = await client1.client_id()
    assert resp[1] == 4
    current_tid = resp[0]
    client2 = df_server.client()
    resp = await client2.execute_command("CLIENT", "MIGRATE", client_id, current_tid)
    assert resp == 0  # not migrated as it's the same thread
    dest_tid = (current_tid + 1) % 4
    resp = await client2.execute_command("CLIENT", "MIGRATE", client_id + 999, dest_tid)
    assert resp == 0  # Not migrated as the client does not exist
    resp = await client2.execute_command("CLIENT", "MIGRATE", client_id, dest_tid)
    assert resp == 1  # migrated successfully


async def test_client_migrate_no_conn_leak(df_server: DflyInstance):
    admin = df_server.client()
    resp = await admin.execute_command("DFLY THREAD")
    num_threads = resp[1]

    # Create multiple clients and migrate them all to the same thread.
    # If DecreaseConnStats is called twice per migration (double-decrement bug),
    # the source threads' uint32 counters are invalid.
    num_clients = 20
    clients = []
    client_ids = []
    dest_tid = 0
    for _ in range(num_clients):
        c = df_server.client()
        clients.append(c)
        client_ids.append(await c.client_id())

    info = await admin.info("clients")
    baseline = info["connected_clients"]

    for c, cid in zip(clients, client_ids):
        r = await c.execute_command("DFLY THREAD")
        if r[0] != dest_tid:
            await admin.execute_command("CLIENT", "MIGRATE", cid, dest_tid)

    # Wait for all migrations to complete by polling each client's thread
    for c in clients:
        async for r, breaker in tick_timer(lambda c=c: c.execute_command("DFLY THREAD")):
            with breaker:
                assert r[0] == dest_tid

    # After all migrations complete, connected_clients must stay the same
    info = await admin.info("clients")
    assert (
        info["connected_clients"] == baseline
    ), f"connected_clients changed from {baseline} to {info['connected_clients']} after migrations"

    for c in clients:
        await c.aclose()
    await admin.aclose()


async def test_issue_5931_malformed_protocol_crash(df_server: DflyInstance):
    """
    Regression test for #5931

    The crash.txt file contains malformed RESP protocol that caused the server to crash
    with: "Check failed: RespExpr::STRING == arg.type" in FromArgs()

    This test sends the exact bytes from crash.txt to verify the server handles it
    gracefully without crashing.
    """
    # Open raw TCP connection to send malformed protocol
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port)

    try:
        # Send the exact bytes from crash.txt:
        # *0\r\n$5\r\nMULTI\r\n*3\r\n$3\r\nSET\r\n$1\r\na\r\n$1\r\n1\r<0xf4>)1\r\n$4\r\nEXEC\r\n
        crash_data = b"*0\r\n$5\r\nMULTI\r\n*3\r\n$3\r\nSET\r\n$1\r\na\r\n$1\r\n1\r"
        crash_data += bytes([0xF4])  # Binary byte instead of \n
        crash_data += b")1\r\n$4\r\nEXEC\r\n"

        writer.write(crash_data)
        await writer.drain()

        try:
            response = await asyncio.wait_for(reader.read(1024), timeout=2.0)
            # If we get a response, it should be an error, not a crash
            # The server is still running if we got here
        except asyncio.TimeoutError:
            # Timeout is acceptable - connection might be closed
            pass
        except ConnectionError:
            # Connection closed is acceptable - server detected bad protocol
            pass

    finally:
        writer.close()
        await writer.wait_closed()

    # Verify server is still running by making a normal request
    client = df_server.client()
    await client.ping()
    assert await client.ping() == True


async def test_issue_5949_nil_bulk_string_crash(df_server: DflyInstance):
    """
    Regression test for #5949

    The crash1.txt and crash2.txt files contain malformed RESP protocol with NIL bulk
    strings ($-1) as command arguments, which caused the server to crash with:
    "Check failed: RespExpr::STRING == arg.type" in FromArgs()

    According to RESP protocol spec, NIL bulk strings are valid for server responses
    but NOT for command arguments sent by clients. Commands must be arrays of bulk strings.
    """
    # Open raw TCP connection to send malformed protocol
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port)

    try:
        # Test crash1.txt: MULTI followed by SET with NIL bulk string argument
        # *1\r\n$5\r\nMULTI\r\n*3\r\n$3\r\nSET\r\n$1\r\na\r\n$-1\r\n1\r\n*1\r\n$4\r\nEXEC\r\n
        crash_data = (
            b"*1\r\n$5\r\nMULTI\r\n*3\r\n$3\r\nSET\r\n$1\r\na\r\n$-1\r\n1\r\n*1\r\n$4\r\nEXEC\r\n"
        )

        writer.write(crash_data)
        await writer.drain()

        try:
            response = await asyncio.wait_for(reader.read(1024), timeout=2.0)
            # If we get a response, it should be an error, not a crash
        except asyncio.TimeoutError:
            # Timeout is acceptable - connection might be closed
            pass
        except ConnectionError:
            # Connection closed is acceptable - server detected bad protocol
            pass

    finally:
        writer.close()
        await writer.wait_closed()

    # Verify server is still running by making a normal request
    client = df_server.client()
    await client.ping()
    assert await client.ping() == True


async def test_issue_6165_squash_invalid_syntax(async_client):
    pipe = async_client.pipeline(transaction=False)
    pipe.set("k", "v")
    pipe.execute_command("RENAME bar")
    res = await pipe.execute(raise_on_error=False)

    assert res[0] == True  # SET key1
    assert isinstance(res[1], aioredis.ResponseError)  # INVALID SYNTAX COMMAND

    pip = async_client.pipeline(transaction=False)
    pip.set("k", "v")
    pip.execute_command("ZUNION 2 set1")
    res = await pip.execute(raise_on_error=False)
    assert res[0] == True  # SET key1
    assert isinstance(res[1], aioredis.ResponseError)  # INVALID SYNTAX


@dfly_args({"proactor_threads": "2", "pipeline_squash": 1})
async def test_quit_in_pipeline(df_server: DflyInstance):
    """
    Regression test: when QUIT is pipelined together with other commands
    (e.g. DEL DEL ... DEL QUIT), the server must flush all preceding replies
    before closing the connection.

    Reproduces the BullMQ removeAllQueueData() pattern.
    """
    NUM_KEYS = 9
    client = df_server.client()

    # Setup: create NUM_KEYS keys
    for i in range(NUM_KEYS):
        await client.set(f"{{b}}:pqt:k{i}", "v")

    # Send DEL for all keys + QUIT in one pipeline
    pipe = client.pipeline(transaction=False)
    for i in range(NUM_KEYS):
        pipe.delete(f"{{b}}:pqt:k{i}")
    pipe.execute_command("QUIT")
    res = await pipe.execute()

    assert res[:NUM_KEYS] == [1] * NUM_KEYS, f"Expected {NUM_KEYS} DEL replies, got: {res}"
    assert res[NUM_KEYS] in (b"OK", True), f"Expected QUIT OK reply, got: {res[NUM_KEYS]}"


async def test_tls_partial_header_read(
    with_ca_tls_server_args, with_ca_tls_client_args, df_factory
):
    server = df_factory.create(port=BASE_PORT, **with_ca_tls_server_args)
    server.start()

    # Connect with raw socket and send only 1 byte (less than the 2-byte TLS header check)
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect(("localhost", server.port))
        # Send 1 byte (less than the 2-byte TLS header that dragonfly expects)
        sock.send(b"\x16")

    # If the server crashes due to UB, it will fail. Otherwise this test passes.
    # The server should handle this gracefully without crashing.
    await asyncio.sleep(0.5)  # Give server time to handle the connection

    # Verify server is still alive by making a valid connection
    client = aioredis.Redis(port=server.port, **with_ca_tls_client_args)
    assert await client.ping()


async def test_blocking_command_pipeline_flush(df_server: DflyInstance):
    blpop_timeout = 5
    num_blpops = 3
    push_after = 1.0
    max_allowed_delay = 2.0
    src_key = "__blpop_pipeline_flush_test__"

    pusher = aioredis.Redis(port=df_server.port)
    await pusher.delete(src_key)

    def encode_resp_command(*args):
        encoded_args = [str(a).encode() for a in args]
        header = f"*{len(encoded_args)}\r\n".encode()
        body = b"".join(f"${len(a)}\r\n".encode() + a + b"\r\n" for a in encoded_args)
        return header + body

    pipeline_data = encode_resp_command("SET", src_key + ":dummy", "val")
    pipeline_data += encode_resp_command("PING")
    pipeline_data += b"".join(
        encode_resp_command("BLPOP", src_key, blpop_timeout) for _ in range(num_blpops)
    )

    conn_reader, writer = await asyncio.open_connection("localhost", df_server.port)
    writer.write(pipeline_data)
    await writer.drain()

    async def expect_reply(expected: str, timeout=max_allowed_delay):
        reply = await asyncio.wait_for(conn_reader.readline(), timeout)
        assert reply == f"{expected}\r\n".encode(), f"expected {expected}, got {reply!r}"

    t0 = time.monotonic()
    await expect_reply("+OK")
    await expect_reply("+PONG")
    total_nonblocking_time = time.monotonic() - t0

    assert (
        total_nonblocking_time < max_allowed_delay
    ), f"Non-blocking replies took {total_nonblocking_time:.2f}s, expected < {max_allowed_delay}s."

    async def delayed_push():
        await asyncio.sleep(push_after)
        await pusher.lpush(src_key, "hello")

    push_task = asyncio.create_task(delayed_push())

    t0 = time.monotonic()
    total_timeout = blpop_timeout * num_blpops + 5
    try:
        await expect_reply("*2", total_timeout)
        first_blpop_time = time.monotonic() - t0
        assert (
            first_blpop_time < max_allowed_delay
        ), f"First blocking response took {first_blpop_time:.2f}s, expected < {max_allowed_delay}s"
    finally:
        writer.close()
        await writer.wait_closed()

        await push_task
        await pusher.delete(src_key, src_key + ":dummy")
        await pusher.aclose()


@dfly_args({"proactor_threads": 2, "async_dispatch_quota": 50})
async def test_pubsub_pipeline_starvation(df_server: DflyInstance):
    reader, writer = await asyncio.open_connection("127.0.0.1", df_server.port)
    # Send subscribe and consume the standard 6-line RESP array reply
    # to completely clean the socket buffer before the flood begins.
    writer.write(b"SUBSCRIBE starvation_chan\r\n")
    await writer.drain()
    for _ in range(6):
        await reader.readline()

    # Continuous Flood Task with batches of 500 commands (publisher)
    keep_flooding = True

    async def flood():
        pub = aioredis.Redis(port=df_server.port)
        while keep_flooding:
            pipe = pub.pipeline(transaction=False)
            for _ in range(500):
                pipe.publish("starvation_chan", "hello")
            await pipe.execute()
            # short sleep to yield the event loop but maintain constant pressure
            await asyncio.sleep(0.001)
        await pub.aclose()

    flood_task = asyncio.create_task(flood())

    try:
        # Wait just 10ms for the first wave to hit the server's queue
        await asyncio.sleep(0.01)

        # Inject UNSUBSCRIBE + PING into the active flood.
        # This triggers our quota logic, forcing the server to yield and read the commands from the TCP buffer, preventing input starvation.
        writer.write(b"UNSUBSCRIBE starvation_chan\r\nPING starvation_survived\r\n")
        await writer.drain()

        # Count the PubSub messages that arrive before the PING
        pubsub_messages_before_ping = 0
        ping_found = False
        async with async_timeout.timeout(2.0):
            while True:
                line = await reader.readline()
                if not line:
                    break

                if b"starvation_survived" in line:
                    ping_found = True
                    break

                if b"message" in line:
                    pubsub_messages_before_ping += 1

        # Assert 1: The PING must arrive before the flood is fully drained.
        assert ping_found, "PING was starved and timed out!"

        # Assert 2: the quota logic prioritized the pipeline.
        # If it was truly starving, this would timeout or hit tens of thousands.
        assert (
            pubsub_messages_before_ping <= 1000
        ), f"Starvation detected! Pipeline queued behind {pubsub_messages_before_ping} messages."
    finally:
        keep_flooding = False
        await flood_task
        writer.close()
        await writer.wait_closed()


================================================
FILE: tests/dragonfly/eval_test.py
================================================
import asyncio
import async_timeout
from redis import asyncio as aioredis
import time
import json
import logging
import pytest
import random
import itertools
import random
import string

from .instance import DflyInstance

from . import dfly_args, dfly_multi_test_args

DJANGO_CACHEOPS_SCRIPT = """
local prefix = KEYS[1]
local key = KEYS[2]
local precall_key = KEYS[3]
local data = ARGV[1]
local dnfs = cjson.decode(ARGV[2])
local timeout = tonumber(ARGV[3])

if precall_key ~= prefix and redis.call('exists', precall_key) == 0 then
  -- Cached data was invalidated during the function call. The data is
  -- stale and should not be cached.
  return
end

-- Write data to cache
redis.call('setex', key, timeout, data)


-- A pair of funcs
-- NOTE: we depend here on keys order being stable
local conj_schema = function (conj)
    local parts = {}
    for field, _ in pairs(conj) do
        table.insert(parts, field)
    end

    return table.concat(parts, ',')
end

local conj_cache_key = function (db_table, conj)
    local parts = {}
    for field, val in pairs(conj) do
        table.insert(parts, field .. '=' .. tostring(val))
    end

    return prefix .. 'conj:' .. db_table .. ':' .. table.concat(parts, '&')
end


-- Update schemes and invalidators
for db_table, disj in pairs(dnfs) do
    for _, conj in ipairs(disj) do
        -- Ensure scheme is known
        redis.acall('sadd', prefix .. 'schemes:' .. db_table, conj_schema(conj))

        -- Add new cache_key to list of dependencies
        local conj_key = conj_cache_key(db_table, conj)

        redis.acall('sadd', conj_key, key)
        -- NOTE: an invalidator should live longer than any key it references.
        --       So we update its ttl on every key if needed.
        -- NOTE: if CACHEOPS_LRU is True when invalidators should be left persistent,
        --       so we strip next section from this script.
        -- TOSTRIP
        local conj_ttl = redis.call('ttl', conj_key)
        if conj_ttl < timeout then
            -- We set conj_key life with a margin over key life to call expire rarer
            -- And add few extra seconds to be extra safe
            redis.call('expire', conj_key, timeout * 2 + 10)
        end
        -- /TOSTRIP
    end
end

return 'OK'
"""


def DJANGO_CACHEOPS_SCHEMA(vs):
    return {
        "table_1": [{"f-1": f"v-{vs[0]}"}, {"f-2": f"v-{vs[1]}"}],
        "table_2": [{"f-1": f"v-{vs[2]}"}, {"f-2": f"v-{vs[3]}"}],
    }


"""
Test the main caching script of https://github.com/Suor/django-cacheops.
The script accesses undeclared keys (that are built based on argument data),
so Dragonfly must run in global (1) or non-atomic (4) multi eval mode.
"""


@dfly_multi_test_args(
    {"default_lua_flags": "allow-undeclared-keys", "proactor_threads": 4},
    {"default_lua_flags": "allow-undeclared-keys disable-atomicity", "proactor_threads": 4},
)
async def test_django_cacheops_script(async_client, num_keys=500):
    script = async_client.register_script(DJANGO_CACHEOPS_SCRIPT)

    data = [(f"k-{k}", [random.randint(0, 10) for _ in range(4)]) for k in range(num_keys)]
    for k, vs in data:
        schema = DJANGO_CACHEOPS_SCHEMA(vs)
        assert (
            await script(keys=["", k, ""], args=["a" * 10, json.dumps(schema, sort_keys=True), 100])
            == "OK"
        )

    # Check schema was built correctly
    base_schema = DJANGO_CACHEOPS_SCHEMA([0] * 4)
    for table, fields in base_schema.items():
        schema = await async_client.smembers(f"schemes:{table}")
        fields = set.union(*(set(part.keys()) for part in fields))
        assert schema == fields

    # Check revese mapping is correct
    for k, vs in data:
        assert await async_client.exists(k)
        for table, fields in DJANGO_CACHEOPS_SCHEMA(vs).items():
            for sub_schema in fields:
                conj_key = f"conj:{table}:" + "&".join(
                    "{}={}".format(f, v) for f, v in sub_schema.items()
                )
                assert await async_client.sismember(conj_key, k)


ASYNQ_ENQUEUE_SCRIPT = """
if redis.call("EXISTS", KEYS[1]) == 1 then
	return 0
end
redis.call("HSET", KEYS[1],
           "msg", ARGV[1],
           "state", "pending",
           "pending_since", ARGV[3])
redis.call("LPUSH", KEYS[2], ARGV[2])
return 1
"""

ASYNQ_DEQUE_SCRIPT = """
if redis.call("EXISTS", KEYS[2]) == 0 then
	local id = redis.call("RPOPLPUSH", KEYS[1], KEYS[3])
	if id then
		local key = ARGV[2] .. id
		redis.call("HSET", key, "state", "active")
		redis.call("HDEL", key, "pending_since")
		redis.call("ZADD", KEYS[4], ARGV[1], id)
		return redis.call("HGET", key, "msg")
	end
end
return nil
"""

"""
Test the main queueing scripts of https://github.com/hibiken/asynq.
The deque script accesses undeclared keys (that are popped from a list),
so Dragonfly must run in global (1) or non-atomic (4) multi eval mode.

Running the deque script in non-atomic mode can introduce inconsistency to an outside observer.
For example, an item can be already placed into the active queue (RPUSH KEYS[3]), buts its state in the hash
wasn't yet updated to active. Because we only access keys that we popped from the list (RPOPLPUSH is still atomic by itself),
the task system should work reliably.
"""


@dfly_multi_test_args(
    {"default_lua_flags": "allow-undeclared-keys", "proactor_threads": 4},
    {"default_lua_flags": "allow-undeclared-keys disable-atomicity", "proactor_threads": 4},
)
async def test_golang_asynq_script(async_pool, num_queues=10, num_tasks=100):
    async def enqueue_worker(queue):
        client = aioredis.Redis(connection_pool=async_pool)
        enqueue = client.register_script(ASYNQ_ENQUEUE_SCRIPT)

        task_ids = 2 * list(range(num_tasks))
        random.shuffle(task_ids)
        res = [
            await enqueue(
                keys=[f"asynq:{{{queue}}}:t:{task_id}", f"asynq:{{{queue}}}:pending"],
                args=[f"{task_id}", task_id, int(time.time())],
            )
            for task_id in task_ids
        ]

        assert sum(res) == num_tasks

    # Start filling the queues
    jobs = [asyncio.create_task(enqueue_worker(f"q-{queue}")) for queue in range(num_queues)]

    collected = 0

    async def dequeue_worker():
        nonlocal collected
        client = aioredis.Redis(connection_pool=async_pool)
        dequeue = client.register_script(ASYNQ_DEQUE_SCRIPT)

        while collected < num_tasks * num_queues:
            # pct = round(collected/(num_tasks*num_queues), 2)
            # print(f'\r    \r{pct}', end='', flush=True)
            for queue in (f"q-{queue}" for queue in range(num_queues)):
                prefix = f"asynq:{{{queue}}}:t:"
                msg = await dequeue(
                    keys=[
                        f"asynq:{{{queue}}}:" + t for t in ["pending", "paused", "active", "lease"]
                    ],
                    args=[int(time.time()), prefix],
                )
                if msg is not None:
                    collected += 1
                    assert await client.hget(prefix + msg, "state") == "active"

    # Run many contending workers
    await asyncio.gather(*(dequeue_worker() for _ in range(num_queues * 2)))

    for job in jobs:
        await job


ERROR_CALL_SCRIPT_TEMPLATE = [
    "redis.{}('LTRIM', 'l', 'a', 'b')",  # error only on evaluation
    "redis.{}('obviously wrong')",  # error immediately on preprocessing
]


@dfly_args({"proactor_threads": 1})
@pytest.mark.asyncio
async def test_eval_error_propagation(async_client):
    CMDS = ["call", "pcall", "acall", "apcall"]

    for cmd, template in itertools.product(CMDS, ERROR_CALL_SCRIPT_TEMPLATE):
        does_abort = "p" not in cmd
        try:
            await async_client.eval(template.format(cmd), 1, "l")
            if does_abort:
                assert False, "Eval must have thrown an error: " + cmd
        except aioredis.RedisError as e:
            if not does_abort:
                assert False, "Error should have been ignored: " + cmd


@dfly_args({"proactor_threads": 1, "default_lua_flags": "allow-undeclared-keys"})
async def test_global_eval_in_multi(async_client: aioredis.Redis):
    GLOBAL_SCRIPT = """
        return redis.call('GET', 'any-key');
    """

    await async_client.set("any-key", "works")

    pipe = async_client.pipeline(transaction=True)
    pipe.set("another-key", "ok")
    pipe.eval(GLOBAL_SCRIPT, 0)
    res = await pipe.execute()

    print(res)
    assert res[1] == "works"


@dfly_args({"proactor_threads": 4, "lua_auto_async": None})
async def test_lua_auto_async(async_client: aioredis.Redis):
    TEST_SCRIPT = """
        for i = 1, 100 do
            redis.call('LPUSH', KEYS[(i % 4) + 1], 'W')
        end
    """

    await async_client.eval(TEST_SCRIPT, 4, "a", "b", "c", "d")

    flushes = (await async_client.info("transaction"))["eval_squashed_flushes"]
    assert 3 <= flushes <= 5  # all 100 commands are executed in a few batches


"""
Ensure liveness even with only a single interpreter in scenarios where EVAL and EVAL inside multi run concurrently while also contending for keys
"""


@dfly_args({"proactor_threads": 2, "interpreter_per_thread": 1})
async def test_one_interpreter(async_client: aioredis.Redis):
    sha = await async_client.script_load("redis.call('GET', KEYS[1])")
    all_keys = [string.ascii_lowercase[i] for i in range(5)]
    total_runs = 100

    async def run(transaction):
        for _ in range(total_runs):
            p = async_client.pipeline(transaction=transaction)
            pkeys = random.choices(all_keys, k=3)
            for key in pkeys:
                p.evalsha(sha, 1, key)
            await p.execute()

    max_blocked = 0

    async def measure_blocked():
        nonlocal max_blocked
        while True:
            max_blocked = max(
                max_blocked, (await async_client.info("STATS"))["blocked_on_interpreter"]
            )
            await asyncio.sleep(0.01)

    tm = [asyncio.create_task(run(True)) for _ in range(10)]
    ts = [asyncio.create_task(run(False)) for _ in range(10)]
    # block_measure = asyncio.create_task(measure_blocked())

    async with async_timeout.timeout(5):
        await asyncio.gather(*(tm + ts))

    # block_measure.cancel()

    # At least some connection was seen blocked
    # Flaky: release build is too fast and never blocks
    # assert max_blocked > 0


"""
Tests migrate/close interaction for the connection
Reproduces #2569
"""


@dfly_args({"proactor_threads": "4", "pipeline_squash": 0})
async def test_migrate_close_connection(async_client: aioredis.Redis, df_server: DflyInstance):
    sha = await async_client.script_load("return redis.call('GET', KEYS[1])")

    async def run():
        reader, writer = await asyncio.open_connection("localhost", df_server.port)

        # write a EVALSHA that will ask for migration (75% it's on the wrong shard)
        writer.write((f"EVALSHA {sha} 1 a\r\n").encode())
        await writer.drain()

        # disconnect the client connection
        writer.close()
        await writer.wait_closed()

    tasks = [asyncio.create_task(run()) for _ in range(50)]
    await asyncio.gather(*tasks)


@pytest.mark.opt_only
@dfly_args({"proactor_threads": 4, "interpreter_per_thread": 4, "lua_mem_gc_threshold": 60000000})
async def test_fill_memory_gc(async_client: aioredis.Redis):
    SCRIPT = """
        local res = {{}}
        for j = 1, 100 do
          for i = 1, 10000 do
            table.insert(res, tostring(i) .. 'data')
          end
        end
    """

    await asyncio.gather(*(async_client.eval(SCRIPT, 0) for _ in range(5)))

    info = await async_client.info("memory")
    # if this assert fails, we likely run gc after script invocations, remove this test
    assert info["used_memory_lua"] > 50 * 1e6

    await async_client.execute_command("SCRIPT GC")
    info = await async_client.info("memory")
    assert info["used_memory_lua"] < 10 * 1e6


@dfly_args({"proactor_threads": 4, "interpreter_per_thread": 4, "lua_mem_gc_threshold": 100000000})
async def test_gc_force_flag(async_client: aioredis.Redis):
    SCRIPT = """
        local res = {{}}
        for j = 1, 10 do
          for i = 1, 1000 do
            table.insert(res, tostring(i) .. 'data')
          end
        end
    """
    for i in range(0, 1000):
        await asyncio.gather(*(async_client.eval(SCRIPT, 0) for _ in range(5)))

    info = await async_client.info("memory")
    assert info["used_memory_lua"] > 1e6

    stats = await async_client.info("stats")
    assert stats["lua_interpreter_return"] == 5000
    assert stats["lua_force_gc_calls"] == 0
    assert stats["lua_gc_duration_total_sec"] == 0
    assert stats["lua_gc_freed_memory_total"] == 0

    await async_client.execute_command("SCRIPT", "GC")

    info = await async_client.info("memory")
    assert info["used_memory_lua"] < 4 * 1e6

    await async_client.execute_command("CONFIG", "SET", "lua_mem_gc_threshold", "1000")

    for i in range(0, 1000):
        await asyncio.gather(*(async_client.eval(SCRIPT, 0) for _ in range(5)))

    info = await async_client.info("memory")
    assert info["used_memory_lua"] < 4 * 1e6

    stats = await async_client.info("stats")
    assert stats["lua_interpreter_return"] >= 10000
    assert stats["lua_force_gc_calls"] > 0
    assert stats["lua_gc_duration_total_sec"] > 0
    assert stats["lua_gc_freed_memory_total"] > 0


@dfly_args({"proactor_threads": 1})
@pytest.mark.asyncio
async def test_StackOverflowByHincrbyfloat(df_server: DflyInstance):
    client = df_server.client()

    await client.execute_command("HSET myhash field 1.0")
    await client.eval("return redis.pcall('HINCRBYFLOAT', KEYS[1], 'field', '1.5')", 1, "myhash")
    assert "2.5" == await client.execute_command("HGET myhash field")


================================================
FILE: tests/dragonfly/generic_test.py
================================================
import logging
import pytest
import redis
import asyncio
from redis import asyncio as aioredis

from . import dfly_multi_test_args, dfly_args
from .instance import DflyInstance, DflyStartException
from .utility import batch_fill_data, gen_test_data, EnvironCntx
from .seeder import DebugPopulateSeeder


@dfly_multi_test_args({"keys_output_limit": 512}, {"keys_output_limit": 1024})
class TestKeys:
    async def test_max_keys(self, async_client: aioredis.Redis, df_server):
        max_keys = df_server["keys_output_limit"]
        pipe = async_client.pipeline()
        batch_fill_data(pipe, gen_test_data(max_keys * 3))
        await pipe.execute()
        keys = await async_client.keys()
        assert len(keys) in range(max_keys, max_keys + 512)


@pytest.fixture(scope="function")
def export_dfly_password() -> str:
    pwd = "flypwd"
    with EnvironCntx(DFLY_requirepass=pwd):
        yield pwd


async def test_password(df_factory, export_dfly_password):
    with df_factory.create() as dfly:
        # Expect password form environment variable
        with pytest.raises(redis.exceptions.AuthenticationError):
            async with aioredis.Redis(port=dfly.port) as client:
                await client.ping()
        async with aioredis.Redis(password=export_dfly_password, port=dfly.port) as client:
            await client.ping()

    # --requirepass should take precedence over environment variable
    requirepass = "requirepass"
    with df_factory.create(requirepass=requirepass) as dfly:
        # Expect password form flag
        with pytest.raises(redis.exceptions.AuthenticationError):
            async with aioredis.Redis(port=dfly.port, password=export_dfly_password) as client:
                await client.ping()
        async with aioredis.Redis(password=requirepass, port=dfly.port) as client:
            await client.ping()


"""
Make sure that multi-hop transactions can't run OOO.
"""

MULTI_HOPS = """
for i = 0, ARGV[1] do
  redis.call('INCR', KEYS[1])
end
"""


@dfly_args({"proactor_threads": 1})
async def test_txq_ooo(async_client: aioredis.Redis, df_server):
    async def task1(k, h):
        c = aioredis.Redis(port=df_server.port)
        for _ in range(100):
            await c.eval(MULTI_HOPS, 1, k, h)

    async def task2(k, n):
        c = aioredis.Redis(port=df_server.port)
        for _ in range(100):
            pipe = c.pipeline(transaction=False)
            pipe.lpush(k, 1)
            for _ in range(n):
                pipe.blpop(k, 0.001)
            await pipe.execute()

    await asyncio.gather(
        task1("i1", 2), task1("i2", 3), task2("l1", 2), task2("l1", 2), task2("l1", 5)
    )


@dfly_args({"proactor_threads": 2, "num_shards": 2})
async def test_blocking_multiple_dbs(async_client: aioredis.Redis, df_server: DflyInstance):
    active = True

    # A task to trigger the flow that eventually looses a transaction
    # blmove is used to trigger a global deadlock, but we could use any
    # command - the effect would be - a deadlocking locally that connection
    async def blmove_task_loose(num):
        async def run(id):
            c = df_server.client()
            await c.lpush(f"key{id}", "val")
            while active:
                await c.blmove(f"key{id}", f"key{id}", 0, "LEFT", "LEFT")
                await asyncio.sleep(0.01)

        tasks = []
        for i in range(num):
            tasks.append(run(i))

        await asyncio.gather(*tasks)

    # A task that creates continuation_trans_ by constantly timing out on
    # an empty set. We could probably use any 2-hop operation like rename.
    async def task_blocking(num):
        async def block(id):
            c = df_server.client()
            while active:
                await c.blmove(f"{{{id}}}from", f"{{{id}}}to", 0.1, "LEFT", "LEFT")

        tasks = []
        for i in range(num):
            tasks.append(block(i))
        await asyncio.gather(*tasks)

    # produce is constantly waking up consumers. It is used to trigger the
    # flow that creates wake ups on a differrent database in the
    # middle of continuation transaction.
    async def tasks_produce(num, iters):
        LPUSH_SCRIPT = """
            redis.call('LPUSH', KEYS[1], "val")
        """

        async def produce(id):
            c = df_server.client(db=1)  # important to be on a different db
            for i in range(iters):
                # Must be a lua script and not multi-exec for some reason.
                await c.eval(LPUSH_SCRIPT, 1, f"list{{{id}}}")

        tasks = []
        for i in range(num):
            task = asyncio.create_task(produce(i))
            tasks.append(task)

        await asyncio.gather(*tasks)
        logging.info("Finished producing")

    # works with producer to constantly block and wake up
    async def tasks_consume(num, iters):
        async def drain(id, iters):
            client = df_server.client(db=1)
            for _ in range(iters):
                await client.blmove(f"list{{{id}}}", f"sink{{{id}}}", 0, "LEFT", "LEFT")

        tasks = []
        for i in range(num):
            task = asyncio.create_task(drain(i, iters))
            tasks.append(task)

        await asyncio.gather(*tasks)
        logging.info("Finished consuming")

    num_keys = 32
    num_iters = 200
    async_task1 = asyncio.create_task(blmove_task_loose(num_keys))
    async_task2 = asyncio.create_task(task_blocking(num_keys))
    logging.info("Starting tasks")
    await asyncio.gather(
        tasks_consume(num_keys, num_iters),
        tasks_produce(num_keys, num_iters),
    )
    logging.info("Finishing tasks")
    active = False
    await asyncio.gather(async_task1, async_task2)


async def test_arg_from_environ_overwritten_by_cli(df_factory):
    with EnvironCntx(DFLY_port="6378"):
        with df_factory.create(port=6377):
            client = aioredis.Redis(port=6377)
            await client.ping()


async def test_arg_from_environ(df_factory):
    with EnvironCntx(DFLY_requirepass="pass"):
        with df_factory.create() as dfly:
            # Expect password from environment variable
            with pytest.raises(redis.exceptions.AuthenticationError):
                client = aioredis.Redis(port=dfly.port)
                await client.ping()

            client = aioredis.Redis(password="pass", port=dfly.port)
            await client.ping()


async def test_unknown_dfly_env(df_factory, export_dfly_password):
    with EnvironCntx(DFLY_abcdef="xyz"):
        dfly = df_factory.create()
        with pytest.raises(DflyStartException):
            dfly.start()
        dfly.set_proc_to_none()


async def test_restricted_commands(df_factory):
    # Restrict GET and SET, then verify non-admin clients are blocked from
    # using these commands, though admin clients can use them.
    with df_factory.create(restricted_commands="get,set", admin_port=1112) as server:
        async with aioredis.Redis(port=server.port) as client:
            with pytest.raises(redis.exceptions.ResponseError):
                await client.get("foo")

            with pytest.raises(redis.exceptions.ResponseError):
                await client.set("foo", "bar")

        async with aioredis.Redis(port=server.admin_port) as admin_client:
            await admin_client.get("foo")
            await admin_client.set("foo", "bar")


@pytest.mark.asyncio
async def test_reply_guard_oom(df_factory, df_seeder_factory):
    master = df_factory.create(
        proactor_threads=1,
        cache_mode="true",
        maxmemory="256mb",
        enable_heartbeat_eviction="false",
        rss_oom_deny_ratio=2,
    )
    df_factory.start_all([master])
    c_master = master.client()
    await c_master.execute_command("DEBUG POPULATE 6000 size 40000")

    seeder = df_seeder_factory.create(
        port=master.port, keys=5000, val_size=1000, stop_on_failure=False
    )
    await seeder.run(target_deviation=0.1)

    info = await c_master.info("stats")
    assert info["evicted_keys"] > 0, "Weak testcase: policy based eviction was not triggered."


@pytest.mark.asyncio
async def test_denyoom_commands(df_factory):
    df_server = df_factory.create(proactor_threads=1, maxmemory="256mb", oom_deny_commands="get")
    df_server.start()
    client = df_server.client()
    await client.execute_command("DEBUG POPULATE 7000 size 44000")

    min_deny = 256 * 1024 * 1024  # 256mb
    info = await client.info("memory")
    print(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')
    assert info["used_memory"] > min_deny, "Weak testcase: too little used memory"

    # reject set due to oom
    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("set x y")

    # reject get because it is set in oom_deny_commands
    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("get x")

    # mget should not be rejected
    await client.execute_command("mget x")


@pytest.mark.parametrize("type", ["LIST", "HASH", "SET", "ZSET", "STRING", "STREAM"])
@dfly_args({"proactor_threads": 4})
@pytest.mark.asyncio
async def test_rename_huge_values(df_factory, type):
    df_server = df_factory.create()
    df_server.start()
    client = df_server.client()

    logging.debug(f"Generating huge {type}")
    seeder = DebugPopulateSeeder(
        key_target=1,
        data_size=10_000_000,
        collection_size=10_000,
        variance=1,
        samples=1,
        types=[type],
    )
    await seeder.run(client)
    source_data = await DebugPopulateSeeder.capture(client)
    logging.debug(f"src {source_data}")

    # Rename multiple times to make sure the key moves between shards
    orig_name = (await client.execute_command("keys *"))[0]
    old_name = orig_name
    new_name = ""
    for i in range(10):
        new_name = f"new:{i}"
        await client.execute_command(f"rename {old_name} {new_name}")
        old_name = new_name
    await client.execute_command(f"rename {new_name} {orig_name}")
    target_data = await DebugPopulateSeeder.capture(client)

    assert source_data == target_data


@pytest.mark.asyncio
async def test_key_bump_ups(df_factory):
    master = df_factory.create(
        proactor_threads=2,
        cache_mode="true",
    )
    df_factory.start_all([master])
    c_master = master.client()

    await c_master.execute_command("DEBUG POPULATE 18000 KEY 32 RAND")

    info = await c_master.info("stats")
    assert info["bump_ups"] == 0

    keys = await c_master.execute_command("SCAN 0")
    keys = keys[1][0:10]

    # Bump keys
    for key in keys:
        await c_master.execute_command("GET " + key)
    info = await c_master.info("stats")
    assert info["bump_ups"] <= 10

    # Multi get bump
    await c_master.execute_command("MGET " + " ".join(keys))
    info = await c_master.info("stats")
    assert info["bump_ups"] >= 10 and info["bump_ups"] <= 20
    last_bump_ups = info["bump_ups"]

    for key in keys:
        await c_master.execute_command("DEL " + key)

    # DEL should not bump up any key
    info = await c_master.info("stats")
    assert last_bump_ups == info["bump_ups"]

    #  Find key that has slot > 0 and bump it
    while True:
        keys = await c_master.execute_command("SCAN 0")
        key = keys[1][0]

        debug_key_info = await c_master.execute_command("DEBUG OBJECT " + key)
        slot_id = int(dict(map(lambda s: s.split(":"), debug_key_info.split()))["slot"])
        if slot_id == 0:
            # delete the key and continue
            await c_master.execute_command("DEL " + key)
            continue

        await c_master.execute_command("GET " + key)
        debug_key_info = await c_master.execute_command("DEBUG OBJECT " + key)
        new_slot_id = int(dict(map(lambda s: s.split(":"), debug_key_info.split()))["slot"])
        assert new_slot_id + 1 == slot_id
        break


@pytest.mark.debug_only
@pytest.mark.asyncio
async def test_command_empty_key(df_factory):
    df_server = df_factory.create()
    df_server.start()
    client = df_server.client()
    res = await client.lpush("", "a")
    assert res == 1
    res = await client.execute_command("KEYS *")
    assert len(res) == 1


================================================
FILE: tests/dragonfly/http_conf_test.py
================================================
import aiohttp
import json
from . import dfly_args
from .instance import DflyInstance


def get_http_session(*args):
    if args:
        return aiohttp.ClientSession(auth=aiohttp.BasicAuth(*args))
    return aiohttp.ClientSession()


@dfly_args({"proactor_threads": "1", "requirepass": "XXX"})
async def test_password(df_server: DflyInstance):
    async with get_http_session() as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 401
    async with get_http_session("default", "wrongpassword") as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 401
    async with get_http_session("default", "XXX") as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 200


@dfly_args({"proactor_threads": "1", "requirepass": "XXX", "admin_port": 1113})
async def test_skip_metrics(df_server: DflyInstance):
    async with get_http_session("whoops", "whoops") as session:
        resp = await session.get(f"http://localhost:{df_server.port}/metrics")
        assert resp.status == 200
    async with get_http_session("whoops", "whoops") as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/metrics")
        assert resp.status == 200


async def test_no_password_main_port(df_server: DflyInstance):
    async with get_http_session("default", "XXX") as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 200
    async with get_http_session("random") as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 200
    async with get_http_session() as session:
        resp = await session.get(f"http://localhost:{df_server.port}/")
        assert resp.status == 200


@dfly_args(
    {
        "proactor_threads": "1",
        "requirepass": "XXX",
        "admin_port": 1113,
        "primary_port_http_enabled": True,
        "admin_nopass": True,
    }
)
async def test_no_password_on_admin(df_server: DflyInstance):
    async with get_http_session("default", "XXX") as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 200
    async with get_http_session("random") as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 200
    async with get_http_session() as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 200


@dfly_args({"proactor_threads": "1", "requirepass": "XXX", "admin_port": 1113})
async def test_password_on_admin(df_server: DflyInstance):
    async with get_http_session("default", "badpass") as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 401
    async with get_http_session() as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 401
    async with get_http_session("default", "XXX") as session:
        resp = await session.get(f"http://localhost:{df_server.admin_port}/")
        assert resp.status == 200


@dfly_args({"proactor_threads": "1", "expose_http_api": "true"})
async def test_no_password_on_http_api(df_server: DflyInstance):
    async with get_http_session("default", "XXX") as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 200
    async with get_http_session("random") as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 200
    async with get_http_session() as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 200


@dfly_args({"proactor_threads": "1", "expose_http_api": "true"})
async def test_http_api(df_server: DflyInstance):
    client = df_server.client()
    async with get_http_session() as session:
        body = '["set", "foo", "МайяХилли", "ex", "100"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            assert text.strip() == '{"result":"OK"}'

        body = '["get", "foo"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            assert text.strip() == '{"result":"МайяХилли"}'

        body = '["foo", "bar"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            assert text.strip() == '{"error": "unknown command `FOO`"}'

    assert await client.ttl("foo") > 0


@dfly_args({"proactor_threads": "1", "expose_http_api": "true", "requirepass": "XXX"})
async def test_password_on_http_api(df_server: DflyInstance):
    async with get_http_session("default", "badpass") as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 401
    async with get_http_session() as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 401
    async with get_http_session("default", "XXX") as session:
        resp = await session.post(f"http://localhost:{df_server.port}/api", json=["ping"])
        assert resp.status == 200


def get_json_object(json_str):
    try:
        json_obj = json.loads(json_str)
        return json_obj
    except ValueError:
        return None


@dfly_args({"proactor_threads": "1", "expose_http_api": "true", "slowlog_log_slower_than": 0})
async def test_http_api_json_response(df_server: DflyInstance):
    client = df_server.client()
    async with get_http_session() as session:
        body = '["set", "foo","bar"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            json_object = get_json_object(text)
            assert json_object != None
            assert json_object == {"result": "OK"}

        body = '["get", "foo"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            json_object = get_json_object(text)
            assert json_object != None
            assert json_object == {"result": "bar"}

        body = '["slowlog", "get"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            json_object = get_json_object(text)
            assert json_object != None
            # Compare commands
            assert json_object["result"][0][3] == ["GET", "foo"]
            assert json_object["result"][1][3] == ["SET", "foo", "bar"]

        body = '["hset", "myhash", "k1", "1", "k2", "2"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            json_object = get_json_object(text)
            assert json_object != None
            assert json_object == {"result": 2}

        body = '["hkeys", "myhash"]'
        async with session.post(f"http://localhost:{df_server.port}/api", data=body) as resp:
            assert resp.status == 200
            text = await resp.text()
            json_object = get_json_object(text)
            assert json_object != None
            assert json_object["result"] == ["k1", "k2"]


================================================
FILE: tests/dragonfly/instance.py
================================================
import dataclasses
import os
import threading
import time
import subprocess
import random
import aiohttp
import logging
from dataclasses import dataclass
from typing import Dict, Optional, List, Union
import re
import psutil
import itertools
from prometheus_client.parser import text_string_to_metric_families
from redis.asyncio import Redis as RedisClient
from redis.asyncio import RedisCluster as RedisCluster
import signal


START_DELAY = 0.8
START_GDB_DELAY = 5.0


@dataclass
class DflyParams:
    path: str
    cwd: str
    gdb: bool
    direct_output: bool
    buffered_out: bool
    args: Dict[str, Union[str, None]]
    existing_port: int
    existing_admin_port: int
    existing_mc_port: int
    env: any
    log_dir: str


class Colors:
    CLEAR = "\\o33[0m"
    COLORS = [f"\\o33[0;{i}m" for i in range(31, 37)]
    last_color = -1

    @classmethod
    def next(clz):
        clz.last_color = (clz.last_color + 1) % len(clz.COLORS)
        return clz.COLORS[clz.last_color]


class DflyStartException(Exception):
    pass


def symbolize_stack_trace(binary_path, lines):
    addr2line_proc = subprocess.Popen(
        ["/usr/bin/addr2line", "-fCa", "-e", binary_path], stdin=subprocess.PIPE
    )
    for line in lines:
        addr2line_proc.stdin.write(line.encode())

    addr2line_proc.stdin.close()
    addr2line_proc.wait()


def read_sedout(pipe, stacktrace):
    try:
        seen = set()
        pattern = r"@\s*(0x[0-9a-fA-F]+)"
        matcher = re.compile(pattern)

        for line in iter(pipe.readline, b""):
            # Deduplicate output - we somewhere duplicate the output, probably due
            # to tty redirections.
            if line not in seen:
                seen.add(line)
                print(line)
                res = matcher.search(line)
                if res:
                    stacktrace.append(res.group(1) + "\n")
    except ValueError:
        pass
    finally:
        pipe.close()


class DflyInstance:
    """
    Represents a runnable and stoppable Dragonfly instance
    with fixed arguments.
    """

    def __init__(self, params: DflyParams, args):
        self.args = args
        self.args.update(params.args)
        self.params = params
        self.proc: Optional[subprocess.Popen] = None
        self._client: Optional[RedisClient] = None
        self.log_files: List[str] = []
        self.dynamic_port = False
        self.sed_proc = None
        self.clients = []

        if self.params.existing_port:
            self._port = self.params.existing_port
        elif "port" in self.args:
            self._port = int(self.args["port"])
        else:
            # Tell DF to choose a random open port.
            # We'll find out what port it is using lsof.
            self.args["port"] = -1
            self._port = None
            self.dynamic_port = True

        # Some tests check the log files, so make sure the log files
        # exist even when people try to debug their test.
        if "logtostderr" in self.args:
            del self.args["logtostderr"]
            self.args["alsologtostderr"] = None

        # Run with num_shards = (proactor_threads - 1) if possible, so help expose bugs
        if "num_shards" not in self.args:
            threads = psutil.cpu_count()
            if "proactor_threads" in self.args:
                threads = int(self.args["proactor_threads"])
            if threads > 1:
                self.args["num_shards"] = threads - 1

    def __del__(self):
        if self.proc:
            self.stop()
        assert self.proc == None

    def client(self, *args, **kwargs) -> RedisClient:
        host = "localhost" if self["bind"] is None else self["bind"]
        client = RedisClient(host=host, port=self.port, decode_responses=True, *args, **kwargs)
        self.clients.append(client)
        return client

    def admin_client(self, *args, **kwargs) -> RedisClient:
        client = RedisClient(
            port=self.admin_port,
            single_connection_client=True,
            decode_responses=True,
            *args,
            **kwargs,
        )
        self.clients.append(client)
        return client

    def cluster_client(self, *args, **kwargs) -> RedisCluster:
        client = RedisCluster(
            host="localhost", port=self.port, decode_responses=True, *args, **kwargs
        )
        self.clients.append(client)
        return client

    async def close_clients(self):
        for client in self.clients:
            await client.aclose() if hasattr(client, "aclose") else await client.close()

    def __enter__(self):
        self.start()
        return self

    def __repr__(self):
        return f":{self.port}"

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.stop()

    def start(self):
        if self.params.existing_port:
            return

        self._start()
        self._wait_for_server()

    def _wait_for_server(self):
        if self.params.existing_port:
            return
        # Give Dragonfly time to start and detect possible failure causes
        # Gdb starts slowly
        delay = START_DELAY if not self.params.gdb else START_GDB_DELAY

        # Wait until the process is listening on the port.
        s = time.time()
        while time.time() - s < delay:
            self._check_status()
            try:
                self.get_port_from_psutil()
                logging.info(
                    f"Process {self.proc.pid} started after {time.time() - s:.2f} seconds. port={self.port}"
                )
                break
            except RuntimeError:
                time.sleep(0.05)
        else:
            raise DflyStartException("Process didn't start listening on port in time")

        self.log_files = self.get_logs_from_psutil()

        # Remove first 6 lines - our default header with log locations (as it carries no useful information)
        # Next, replace log-level + date with port and colored arrow
        sed_format = f"1,6d;s/[^ ]*/{self.port}{Colors.next()}➜{Colors.CLEAR}/"
        sed_cmd = ["sed", "-u", "-e", sed_format]
        if self.params.buffered_out:
            sed_cmd.remove("-u")
        if not self.params.direct_output:
            self.sed_proc = subprocess.Popen(
                sed_cmd,
                stdin=self.proc.stdout,
                stdout=subprocess.PIPE,
                bufsize=1,
                universal_newlines=True,
            )
            self.stacktrace = []
            self.sed_thread = threading.Thread(
                target=read_sedout, args=(self.sed_proc.stdout, self.stacktrace), daemon=True
            )
            self.sed_thread.start()

    def set_proc_to_none(self):
        self.proc = None

    def stop(self, kill=False):
        proc, self.proc = self.proc, None
        if proc is None:
            return

        logging.debug(f"Stopping instance on {self._port}")
        try:
            if kill:
                proc.kill()
            else:
                proc.terminate()
                proc.communicate(timeout=120)
                # if the return code is 0 it means normal termination
                # if the return code is negative it means termination by signal
                # if the return code is positive it means abnormal exit
                if proc.returncode != 0:
                    raise Exception(
                        f"Dragonfly did not terminate gracefully, exit code {proc.returncode}, "
                        f"pid: {proc.pid}"
                    )

        except subprocess.TimeoutExpired:
            # We need to send SIGUSR1 to DF such that it prints the stacktrace
            proc.send_signal(signal.SIGUSR1)
            # Then we sleep for 5 seconds such that DF has enough time to print the stacktraces
            # We can't really synchronize here because SIGTERM and SIGKILL do not block even if
            # sigaction explicitly blocks other incoming signals until it handles SIGUSR1.
            # Even worse, on SIGTERM and SIGKILL none of the handlers registered via sigaction
            # are guranteed to run
            time.sleep(5)
            logging.debug(f"Unable to kill the process on port {self._port}")
            logging.debug(f"INFO LOGS of DF are:")
            self.print_info_logs_to_debug_log()
            proc.kill()
            proc.communicate()
            raise Exception("Unable to terminate DragonflyDB gracefully, it was killed")
        finally:
            if self.sed_proc:
                self.sed_proc.communicate()
                self.sed_thread.join()
                symbolize_stack_trace(proc.args[0], self.stacktrace)

    def _start(self):
        if self.params.existing_port:
            return

        if self.dynamic_port:
            self._port = None

        all_args = self.format_args(self.args)
        real_path = os.path.realpath(self.params.path)

        run_cmd = [self.params.path, *all_args]
        if self.params.gdb:
            run_cmd = ["gdb", "--ex", "r", "--args"] + run_cmd

        self.proc = subprocess.Popen(
            run_cmd,
            cwd=self.params.cwd,
            stdout=None if self.params.direct_output else subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        logging.info(f"Starting {real_path} {' '.join(all_args)}, pid {self.proc.pid}")

    def _check_status(self):
        if not self.params.existing_port:
            return_code = self.proc.poll()
            if return_code is not None:
                # log stdout of the failed process
                logging.error("Dragonfly process error:\n%s", self.proc.stdout.read().decode())
                self.proc = None
                raise DflyStartException(f"Failed to start instance, return code {return_code}")

    def __getitem__(self, k):
        return self.args.get(k)

    @property
    def port(self) -> int:
        if self._port is None:
            self._port = self.get_port_from_psutil()
        return self._port

    @property
    def admin_port(self) -> Optional[int]:
        if self.params.existing_admin_port:
            return self.params.existing_admin_port
        if "admin_port" in self.args:
            return int(self.args["admin_port"])
        return None

    @property
    def mc_port(self) -> Optional[int]:
        if self.params.existing_mc_port:
            return self.params.existing_mc_port
        if "memcached_port" in self.args:
            return int(self.args["memcached_port"])
        return None

    def get_port_from_psutil(self) -> int:
        if self.proc is None:
            raise RuntimeError("port is not available yet")
        p = psutil.Process(self.proc.pid)

        # If running with gdb, look for port on child
        children = p.children()
        if len(children) == 1 and children[0].name() == "dragonfly":
            p = children[0]

        ports = set()
        try:
            for connection in p.connections():
                if connection.status == "LISTEN":
                    ports.add(connection.laddr.port)
        except psutil.AccessDenied:
            raise RuntimeError("Access denied")

        ports.difference_update({self.admin_port, self.mc_port})
        assert len(ports) < 2, "Open ports detection found too many ports"
        if ports:
            return ports.pop()
        raise RuntimeError("Couldn't parse port")

    def get_logs_from_psutil(self) -> List[str]:
        p = psutil.Process(self.proc.pid)
        rv = []
        for file in p.open_files():
            if ".log." in file.path and "dragonfly" in file.path:
                rv.append(file.path)
        return rv

    def print_info_logs_to_debug_log(self):
        logs = self.log_files
        sed_format = f"s/[^ ]*/{self.port}{Colors.next()}➜{Colors.CLEAR}/"
        sed_cmd = ["sed", "-e", sed_format]
        for log in logs:
            if "INFO" in log:
                with open(log) as file:
                    print(f"🪵🪵🪵🪵🪵🪵 LOG name {log} 🪵🪵🪵🪵🪵🪵")
                    subprocess.call(sed_cmd, stdin=file)

    @staticmethod
    def format_args(args):
        out = []
        for k, v in args.items():
            if v is not None:
                out.append(f"--{k}={v}")
            else:
                out.append(f"--{k}")
        return out

    async def metrics(self):
        session = aiohttp.ClientSession()
        resp = await session.get(f"http://localhost:{self.port}/metrics")
        data = await resp.text(encoding="utf-8")
        await session.close()
        return {
            metric_family.name: metric_family
            for metric_family in text_string_to_metric_families(data)
        }

    def find_in_logs(self, pattern):
        if self.proc is not None:
            raise RuntimeError("Must close server first")

        results = []
        matcher = re.compile(pattern)
        for path in self.log_files:
            for line in open(path):
                if matcher.search(line):
                    results.append(line)
        return results

    @property
    def rss(self):
        if self.proc is None:
            return 0
        process = psutil.Process(self.proc.pid)
        mem_info = process.memory_info()
        return mem_info.rss

    def has_arg(self, arg):
        return arg in self.args


class DflyInstanceFactory:
    """
    A factory for creating dragonfly instances with pre-supplied arguments.
    """

    def __init__(self, params: DflyParams, args):
        self.args = args
        self.params = params
        self.instances = []

    def create(self, existing_port=None, path=None, version=100, **kwargs) -> DflyInstance:
        args = {**self.args, **kwargs}
        args.setdefault("dbfilename", "")
        args.setdefault("noversion_check", None)
        # MacOs does not set it automatically, so we need to set it manually
        args.setdefault("maxmemory", "8G")
        vmod = "dragonfly_connection=1,db_slice=1,listener_interface=1,main_service=1,rdb_save=1,replica=1,cluster_family=1,engine_shard=1,dflycmd=1,snapshot=1,streamer=1"
        args.setdefault("vmodule", vmod)
        args.setdefault("jsonpathv2")
        if version > 1.27:
            args.setdefault("omit_basic_usage")

        if version > 1.31:
            args.setdefault("latency_tracking")

        args.setdefault("log_dir", self.params.log_dir)

        if version >= 1.21 and "serialization_max_chunk_size" not in args:
            args.setdefault("serialization_max_chunk_size", 300000)

        if version > 1.36:
            args.setdefault("serialize_hnsw_index", "true")
            args.setdefault("deserialize_hnsw_index", "true")

        if version >= 1.26:
            args.setdefault("fiber_safety_margin=4096")

        # When a custom S3 endpoint is configured (e.g. MinIO), pass it to Dragonfly
        s3_endpoint = os.environ.get("MINIO_S3_ENDPOINT")
        if s3_endpoint:
            from urllib.parse import urlparse

            # Normalize scheme-less values (e.g. "localhost:9000") so urlparse
            # correctly populates hostname/port instead of treating it as a path.
            to_parse = s3_endpoint if "://" in s3_endpoint else "http://" + s3_endpoint
            parsed = urlparse(to_parse)
            endpoint_host = parsed.hostname or ""
            if parsed.port:
                endpoint_host = f"{endpoint_host}:{parsed.port}"
            if endpoint_host:
                args.setdefault("s3_endpoint", endpoint_host)
                args.setdefault("s3_use_https", "false" if parsed.scheme == "http" else "true")

        for k, v in args.items():
            args[k] = v.format(**self.params.env) if isinstance(v, str) else v

        if existing_port is not None:
            params = dataclasses.replace(self.params, existing_port=existing_port)
        else:
            params = self.params

        if path is not None:
            params = dataclasses.replace(self.params, path=path)

        if version < 1.35:
            params.args.pop("experimental_io_loop_v2", None)

        instance = DflyInstance(params, args)
        self.instances.append(instance)
        return instance

    def start_all(self, instances: List[DflyInstance]):
        """Start multiple instances in parallel"""
        for instance in instances:
            instance._start()

        for instance in instances:
            instance._wait_for_server()

    async def stop_all(self):
        """Stop all launched instances."""
        exceptions = []  # To collect exceptions
        for instance in self.instances:
            try:  # ioloop might be no longer running
                await instance.close_clients()
            except Exception as e:
                pass

            try:
                instance.stop()
            except Exception as e:
                exceptions.append(e)  # Collect the exception
        if exceptions:
            first_exception = exceptions[0]
            raise Exception(
                f"One or more errors occurred while stopping instances. "
                f"First exception: {first_exception}"
            ) from first_exception

    def __repr__(self) -> str:
        return f"Factory({self.args})"


class RedisServer:
    def __init__(self, port):
        self.port = port
        self.proc = None

    def start(self, redis7=None, **kwargs):
        servers = ["redis-server-7.2.2"]
        if not redis7:
            servers += ["redis-server-6.2.11", "valkey-server-8.0.1"]
        command = [
            random.choice(servers),
            f"--port {self.port}",
            "--save ''",
            "--appendonly no",
            "--protected-mode no",
            "--repl-diskless-sync yes",
            "--repl-diskless-sync-delay 0",
        ]
        # Convert kwargs to command-line arguments
        for key, value in kwargs.items():
            if value is None:
                command.append(f"--{key}")
            else:
                command.append(f"--{key} {value}")

        self.proc = subprocess.Popen(command)
        logging.debug(self.proc.args)

    def stop(self):
        self.proc.terminate()
        try:
            self.proc.wait(timeout=10)
        except Exception as e:
            pass


================================================
FILE: tests/dragonfly/json_test.py
================================================
import pytest
import redis
from redis import asyncio as aioredis
from .utility import *
from json import JSONDecoder, JSONEncoder, dumps

jane = {"name": "Jane", "Age": 33, "Location": "Chawton"}

json_num = {"a": {"a": 1, "b": 2, "c": 3}}


async def get_set_json(connection: aioredis.Redis, key, value, path="$"):
    encoder = JSONEncoder()
    await connection.execute_command("json.set", key, path, encoder.encode(value))
    result = await connection.execute_command("json.get", key, path)
    decoder = JSONDecoder()
    return decoder.decode(result)


async def test_basic_json_get_set(async_client: aioredis.Redis):
    key_name = "test-json-key"
    result = await get_set_json(connection=async_client, key=key_name, value=jane)
    assert result, "failed to set JSON value"
    the_type = await async_client.type(key_name)
    assert the_type == "ReJSON-RL"
    assert len(result) == 1
    assert result[0]["name"] == "Jane"
    assert result[0]["Age"] == 33


async def test_access_json_value_as_string(async_client: aioredis.Redis):
    key_name = "test-json-key"
    result = await get_set_json(async_client, key_name, value=jane)
    assert result is not None, "failed to set JSON value"
    # make sure that we have valid JSON here
    the_type = await async_client.type(key_name)
    assert the_type == "ReJSON-RL"
    # you cannot access this key as string
    with pytest.raises(redis.exceptions.ResponseError) as e:
        result = await async_client.get(key_name)

    assert e.value.args[0] == "WRONGTYPE Operation against a key holding the wrong kind of value"


async def test_reset_key_to_string(async_client: aioredis.Redis):
    key_name = "test-json-key"
    result = await get_set_json(async_client, key=key_name, value=jane)
    assert result is not None, "failed to set JSON value"
    # make sure that we have valid JSON here
    the_type = await async_client.type(key_name)
    assert the_type == "ReJSON-RL"

    # set the key to be string - this is legal
    await async_client.set(key_name, "some random value")
    result = await async_client.get(key_name)
    assert result == "some random value"

    # For JSON set the update the root path, we are allowing
    # to change the type to JSON and override it
    result = await get_set_json(async_client, key=key_name, value=jane)
    the_type = await async_client.type(key_name)
    assert the_type == "ReJSON-RL"


async def test_update_value(async_client: aioredis.Redis):
    key_name = "test-json-key"
    result = await get_set_json(async_client, key=key_name, value=json_num)
    assert result is not None, "failed to set JSON value"
    # make sure that we have valid JSON here
    the_type = await async_client.type(key_name)
    assert the_type == "ReJSON-RL"
    result = await get_set_json(async_client, value="0", key=key_name, path="$.a.*")
    assert len(result) == 3
    # make sure that all the values under 'a' where set to 0
    assert result == ["0", "0", "0"]

    # Ensure that after we're changing this into STRING type, it will no longer work
    await async_client.set(key_name, "some random value")
    assert await async_client.type(key_name) == "string"
    with pytest.raises(redis.exceptions.ResponseError) as e:
        await get_set_json(async_client, value="0", key=key_name, path="$.a.*")

    assert e.value.args[0] == "WRONGTYPE Operation against a key holding the wrong kind of value"
    assert await async_client.type(key_name) == "string"


@pytest.mark.parametrize(
    "description,expected_value,expected_type",
    (
        ("array", "[]", "array"),
        ("string", dumps("dragonfly"), "string"),
        ("number", dumps(3.50), "number"),
        ("object", dumps({"dragon": "fly"}, separators=(",", ":")), "object"),
        ("boolean true", "true", "boolean"),
        ("boolean false", "false", "boolean"),
    ),
)
@pytest.mark.asyncio
async def test_arrappend(async_client: aioredis.Redis, description, expected_value, expected_type):
    key_name = "test-json-key"

    await async_client.execute_command("json.set", key_name, "$", "[]")
    await async_client.execute_command("json.arrappend", key_name, "$", expected_value)

    # make sure the value is as expected
    first_element = await async_client.execute_command("json.get", key_name, "$[0]")
    assert first_element == "[{}]".format(expected_value)

    # make sure the type is as expected
    actual_type = await async_client.execute_command("json.type", key_name, "$[0]")
    assert actual_type[0] == expected_type


================================================
FILE: tests/dragonfly/list_family_test.py
================================================
import asyncio
from redis import asyncio as aioredis

import pytest


@pytest.mark.parametrize("index", range(50))
class TestBlPop:
    async def async_blpop(client: aioredis.Redis):
        return await client.blpop(["list1{t}", "list2{t}", "list2{t}", "list1{t}"], 0.5)

    async def blpop_mult_keys(async_client: aioredis.Redis, key: str, val: str):
        task = asyncio.create_task(TestBlPop.async_blpop(async_client))
        await async_client.lpush(key, val)
        result = await asyncio.wait_for(task, 3)
        assert result[1] == val
        watched = await async_client.execute_command("DEBUG WATCHED")
        assert watched == ["awaked", [], "watched", []]

    async def test_blpop_multiple_keys(self, async_client: aioredis.Redis, index):
        await TestBlPop.blpop_mult_keys(async_client, "list1{t}", "a")
        await TestBlPop.blpop_mult_keys(async_client, "list2{t}", "b")


================================================
FILE: tests/dragonfly/management_test.py
================================================
import pytest
import asyncio
from redis import asyncio as aioredis
from redis.exceptions import ResponseError


@pytest.mark.asyncio
async def test_config_cmd(async_client: aioredis.Redis):
    with pytest.raises(ResponseError):
        await async_client.config_set("foo", "bar")
    await async_client.config_set("requirepass", "foobar") == "OK"
    res = await async_client.config_get("*")
    assert len(res) > 0
    assert res["requirepass"] == "foobar"


================================================
FILE: tests/dragonfly/memcache_meta.py
================================================
import pytest
from .instance import DflyInstance
from . import dfly_args
from meta_memcache import (
    Key,
    ServerAddress,
    CacheClient,
    connection_pool_factory_builder,
)
from meta_memcache.protocol import RequestFlags, Success

DEFAULT_ARGS = {"memcached_port": 11211, "proactor_threads": 4}


@pytest.fixture(scope="function")
def meta_client(df_server: DflyInstance):
    result = CacheClient.cache_client_from_servers(
        servers=[
            ServerAddress(host="localhost", port=DEFAULT_ARGS.get("memcached_port")),
        ],
        connection_pool_factory_fn=connection_pool_factory_builder(recv_timeout=5),
    )
    yield result


@dfly_args(DEFAULT_ARGS)
class TestMetaMode:
    def test_basic(self, meta_client: CacheClient):
        pool = meta_client

        assert pool.set("key1", "value1", 100)
        assert pool.set("key1", "value2", 0)
        assert pool.get("key1") == "value2"

        request_flags = RequestFlags(return_value=False)
        response = pool.meta_get(Key("key1"), flags=request_flags)
        assert isinstance(response, Success)
        assert pool.get("key2") is None
        assert pool.delete("key1")
        assert pool.delete("key1") is False

        assert pool.set("cask", "v", 100)
        value, cas_token = pool.get_cas("cask")
        assert value == "v" and cas_token == 0

        k = Key("cask")
        response = pool.meta_multiget([k], RequestFlags(return_cas_token=True, return_value=True))
        assert k in response
        assert response[k].flags.cas_token == 0 and response[k].value == "v"

    def test_gat(self, meta_client: CacheClient):
        resp = meta_client.meta_set(
            Key("k1"), "value1", None, RequestFlags(return_ttl=True, cache_ttl=5)
        )
        assert isinstance(resp, Success)
        val = meta_client.meta_get(Key("k1"), RequestFlags(cache_ttl=15, return_ttl=True))

        # Note the correct behavior is to return previous TTL before it was updated by GAT,
        # but Dragonfly currently returns the updated TTL.
        assert val.flags.ttl == 15  # returns updated ttl


================================================
FILE: tests/dragonfly/memory_test.py
================================================
import asyncio
import logging
import random
import string
import time

import pytest
import redis

from . import dfly_args
from .instance import DflyInstanceFactory
from .utility import tmp_file_name


@pytest.mark.large
@pytest.mark.opt_only
@pytest.mark.parametrize(
    "type, keys, val_size, elements",
    [
        ("JSON", 200_000, 100, 100),
        ("SET", 280_000, 100, 100),
        ("HASH", 250_000, 100, 100),
        ("ZSET", 250_000, 100, 100),
        ("LIST", 300_000, 100, 100),
        ("STRING", 3_500_000, 1000, 1),
        ("STREAM", 280_000, 100, 100),
    ],
)
# We limit to 5gb just in case to sanity check the gh runner. Otherwise, if we ask for too much
# memory it might force the gh runner to run out of memory (since OOM killer might not even
# get a chance to run).
async def test_rss_used_mem_gap(df_factory: DflyInstanceFactory, type, keys, val_size, elements):
    dbfilename = f"dump_{tmp_file_name()}"
    instance = df_factory.create(
        proactor_threads=2,
        maxmemory="5gb",
        dbfilename=dbfilename,
        compression_mode=0,
        serialization_max_chunk_size=8192,
        num_shards=2,
    )
    instance.start()
    # Create a Dragonfly and fill it up with `type` until it reaches `min_rss`, then make sure that
    # the gap between used_memory and rss is no more than `max_unaccounted_ratio`.
    min_rss = 3 * 1024 * 1024 * 1024  # 3gb
    max_unaccounted = 200 * 1024 * 1024  # 200mb
    if type == "JSON":
        # For json data type, the interned string pool stores data on the default heap, not mimalloc.
        max_unaccounted *= 2

    # There is a big rss spike when this test is ran in one the gh runners (not the self hosted)
    # and it fails. This rss spike is not observed locally or on our self host runner so
    # this adjustment is mostly for CI
    if type == "STREAM":
        max_unaccounted = max_unaccounted * 3

    client = instance.client()
    await asyncio.sleep(1)  # Wait for another RSS heartbeat update in Dragonfly

    cmd = f"DEBUG POPULATE {keys} k {val_size} RAND TYPE {type} ELEMENTS {elements}"
    logging.info(f"Running {cmd}")
    await client.execute_command(cmd)

    await asyncio.sleep(2)  # Wait for another RSS heartbeat update in Dragonfly

    async def check_memory():
        info = await client.info("memory")
        logging.info(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')
        assert info["used_memory"] > min_rss, "Weak testcase: too little used memory"
        delta = info["used_memory_rss"] - info["used_memory"]
        # It could be the case that the machine is configured to use swap if this assertion fails
        assert delta > 0, info
        assert delta < max_unaccounted, info

        if type != "STRING" and type != "JSON":
            # STRINGs keep some of the data inline, so not all of it is accounted in object_used_memory
            # We have a very small over-accounting bug in JSON
            assert info["object_used_memory"] > keys * elements * val_size
            assert info["used_memory"] > info["object_used_memory"]

    await check_memory()

    assert await client.execute_command("SAVE", "DF") == True
    assert await client.execute_command("DFLY", "LOAD", f"{dbfilename}-summary.dfs") == "OK"

    await check_memory()

    # FLUSHALL sync waits for flush to finish and decommit memory, so send INFO immediately after
    p = client.pipeline(transaction=False)
    p.execute_command("FLUSHALL", "SYNC")  # flushall(asynchronous=False) will just issue FLUSHALL$
    p.info("memory")

    info = (await p.execute())[-1]
    assert info["used_memory"] < 4 * 1_000_000  # Table memory
    assert info["used_memory_rss"] < min_rss / 10  # RSS must have been freed


@pytest.mark.asyncio
@dfly_args(
    {
        "maxmemory": "512mb",
        "proactor_threads": 2,
        "rss_oom_deny_ratio": 0.5,
    }
)
@pytest.mark.parametrize("admin_port", [0, 1112])
async def test_rss_oom_ratio(df_factory: DflyInstanceFactory, admin_port):
    """
    Test dragonfly rejects denyoom commands and new connections when rss memory is above maxmemory*rss_oom_deny_ratio
    Test dragonfly does not rejects when rss memory goes below threshold
    """
    df_server = df_factory.create(admin_port=admin_port)
    df_server.start()

    client = df_server.client()
    await client.execute_command("DEBUG POPULATE 10000 key 40000 RAND")

    await asyncio.sleep(1)  # Wait for another RSS heartbeat update in Dragonfly

    new_client = df_server.admin_client() if admin_port else df_server.client()
    await new_client.ping()

    info = await new_client.info("memory")
    logging.debug(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')

    reject_limit = 256 * 1024 * 1024  # 256mb
    assert info["used_memory_rss"] > reject_limit

    # get command from existing connection should not be rejected
    await client.execute_command("get x")

    # reject set due to oom
    with pytest.raises(redis.exceptions.ResponseError):
        await client.execute_command("set x y")

    if admin_port:
        # new client create should also fail if admin port was set
        client = df_server.client()
        with pytest.raises(redis.exceptions.ConnectionError):
            await client.ping()

    # flush to free memory
    await new_client.flushall()

    await asyncio.sleep(2)  # Wait for another RSS heartbeat update in Dragonfly

    info = await new_client.info("memory")
    logging.debug(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')
    assert info["used_memory_rss"] < reject_limit

    # new client create shoud not fail after memory usage decrease
    client = df_server.client()
    await client.execute_command("set x y")


@pytest.mark.large
@pytest.mark.asyncio
@dfly_args(
    {
        "maxmemory": "512mb",
        "proactor_threads": 1,
    }
)
async def test_eval_with_oom(df_factory: DflyInstanceFactory):
    """
    Test running eval commands when dragonfly returns OOM on write commands and check rss memory
    This test was writen after detecting memory leak in script runs on OOM state
    """
    df_server = df_factory.create()
    df_server.start()

    client = df_server.client()
    await client.execute_command("DEBUG POPULATE 20000 key 40000 RAND")

    await asyncio.sleep(1)  # Wait for another RSS heartbeat update in Dragonfly

    info = await client.info("memory")
    logging.debug(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')

    reject_limit = 512 * 1024 * 1024  # 256mb
    assert info["used_memory"] > reject_limit
    rss_before_eval = info["used_memory_rss"]

    pipe = client.pipeline(transaction=False)
    MSET_SCRIPT = """
        redis.call('MSET', KEYS[1], ARGV[1], KEYS[2], ARGV[2])
    """

    for _ in range(20):
        for _ in range(8000):
            pipe.eval(MSET_SCRIPT, 2, "x1", "y1", "x2", "y2")
        # reject mset due to oom
        with pytest.raises(redis.exceptions.ResponseError):
            await pipe.execute()

    await asyncio.sleep(1)  # Wait for another RSS heartbeat update in Dragonfly

    info = await client.info("memory")
    logging.debug(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')
    assert rss_before_eval * 1.01 > info["used_memory_rss"]


@pytest.mark.parametrize("heartbeat_rss_eviction", [True, False])
async def test_eviction_on_rss_treshold(df_factory: DflyInstanceFactory, heartbeat_rss_eviction):
    max_memory = 1024 * 1024**2  # 10242mb

    df_server = df_factory.create(
        proactor_threads=3,
        cache_mode="yes",
        maxmemory=max_memory,
        enable_heartbeat_eviction="false",
        enable_heartbeat_rss_eviction=heartbeat_rss_eviction,
    )
    df_server.start()
    client = df_server.client()

    data_fill_size = int(0.70 * max_memory)  # 70% of max_memory

    val_size = 1024 * 5  # 5 kb
    num_keys = data_fill_size // val_size

    await client.execute_command("DEBUG", "POPULATE", num_keys, "key", val_size)

    # Create huge list which can be used with LRANGE to increase RSS memory only
    for name in ["list_1", "list_2"]:
        for i in range(1, 1000):
            rand_str = "".join(random.choices(string.ascii_letters, k=val_size))
            await client.execute_command(f"LPUSH {name} {rand_str}")

    # Make them STICK so we don't evict them
    await client.execute_command(f"STICK list_1")
    await client.execute_command(f"STICK list_2")

    await client.execute_command("CONFIG SET enable_heartbeat_eviction true")

    memory_info_before = await client.info("memory")

    # This will increase only RSS memory above treshold
    p = client.pipeline()
    for _ in range(50):
        p.execute_command("LRANGE list_1 0 -1")
        p.execute_command("LRANGE list_2 0 -1")
    await p.execute()

    # Wait for some time
    await asyncio.sleep(3)
    memory_info_after = await client.info("memory")
    stats_info_after = await client.info("stats")

    if heartbeat_rss_eviction:
        # We should see used memory deacrease and number of some number of evicted keys
        assert memory_info_after["used_memory"] < memory_info_before["used_memory"]
        assert stats_info_after["evicted_keys"]
    else:
        # If heartbeat rss eviction is disabled there should be no chage
        assert memory_info_after["used_memory"] == memory_info_before["used_memory"]
        assert stats_info_after["evicted_keys"] == 0


# Github issue #5891
async def test_no_rss_eviction_overflow_on_expired_keys(df_factory: DflyInstanceFactory):
    max_memory = 256 * 1024**2  # 256MB
    df_server = df_factory.create(
        proactor_threads=1, cache_mode="yes", maxmemory=max_memory, vmodule="engine_shard=2"
    )
    df_server.start()
    client = df_server.client()

    data_fill_size = int(0.20 * max_memory)  # 20% of max_memory

    val_size = 1024 * 50  # 50 kb for key
    num_keys = data_fill_size // val_size

    for i in range(0, 5):
        pipe = client.pipeline(transaction=False)
        step_keys = num_keys + i * 10
        await pipe.execute_command("DEBUG", "POPULATE", step_keys, "key_1", val_size)
        await pipe.execute_command("DEBUG", "POPULATE", step_keys + i * 10, "key_2", val_size)
        for i in range(step_keys):
            if i % 2 == 0:
                await pipe.execute_command(f"EXPIRE key_1:{i} 1")
            else:
                await pipe.execute_command(f"EXPIRE key_2:{i} 1")
        await pipe.execute()
        await asyncio.sleep(2)

    await client.execute_command("FLUSHALL")

    # New keys should be added
    await client.execute_command("DEBUG", "POPULATE", num_keys, "key", val_size)
    # Wait so heartbeat eviction
    await asyncio.sleep(5)

    keyspace_info = await client.info("keyspace")
    assert keyspace_info["db0"]["keys"] == num_keys


@pytest.mark.asyncio
async def test_throttle_on_commands_squashing_replies_bytes(df_factory: DflyInstanceFactory):
    df = df_factory.create(
        proactor_threads=2,
        squashed_reply_size_limit=100_000_000,
        vmodule="dragonfly_connection=5",
    )
    df.start()

    client = df.client()
    # 100mb
    await client.execute_command("debug populate 64 test 3125 rand type hash elements 500")

    async def poll():
        # At any point we should not cross this limit
        # 2x the reply_size_limit, 200mb
        assert df.rss < 200_000_000
        cl = df.client()
        pipe = cl.pipeline(transaction=False)
        for i in range(64):
            pipe.execute_command(f"hgetall test:{i}")

        await pipe.execute()

    tasks = []
    for i in range(20):
        tasks.append(asyncio.create_task(poll()))

    for task in tasks:
        await task

    df.stop()
    found = df.find_in_logs("Commands squashing current reply size is overlimit")
    assert len(found) > 0


@pytest.mark.asyncio
async def test_remove_docs_on_eviction(df_factory):
    max_memory = 256 * 1024**2  # 256MB
    df_server = df_factory.create(
        proactor_threads=1,
        cache_mode="yes",
        maxmemory=max_memory,
        vmodule="engine_shard=2",
        eviction_memory_budget_threshold=0.99,
        enable_heartbeat_rss_eviction="no",
    )
    df_server.start()
    client = df_server.client()

    await client.execute_command(
        "FT.CREATE", "idx", "ON", "HASH", "PREFIX", "1", "doc:", "SCHEMA", "v", "TEXT"
    )

    i = 0
    while True:
        random_string = "".join(random.choices(string.ascii_letters + string.digits, k=1_000))
        await client.execute_command("HSET", f"doc:{i}", "v", random_string)
        stats_info = await client.info("stats")
        # Done when see at least 50 evictions
        if stats_info["evicted_keys"] > 50:
            break
        i = i + 1

    # Give some time to eviction stabilize
    await asyncio.sleep(1)

    # Get number of docs in index
    index_info = await client.execute_command(f"FT.INFO idx")
    index_info_num_docs = index_info[9]

    # Get number of keys in database
    keyspace_info = await client.info("keyspace")
    keyspace_keys = keyspace_info["db0"]["keys"]

    assert index_info_num_docs == keyspace_keys


@pytest.mark.asyncio
async def test_memory_shrink_basic(df_factory: DflyInstanceFactory):
    df_server = df_factory.create(proactor_threads=2)
    df_server.start()
    client = df_server.client()

    # Create sparse set - add many elements then delete most
    for i in range(10000):
        await client.sadd("myset", f"elem_{i}")

    # Delete 99% to make it sparse (10000 -> 100)
    for i in range(9900):
        await client.srem("myset", f"elem_{i}")

    # Shrink the set and verify bytes saved
    bytes_saved = await client.execute_command("SHRINK", "myset")
    assert bytes_saved > 0, f"Expected bytes_saved > 0, got {bytes_saved}"

    # Shrinking again should return 0 (already optimal)
    bytes_saved_again = await client.execute_command("SHRINK", "myset")
    assert bytes_saved_again == 0, f"Expected 0, got {bytes_saved_again}"

    # Non-existent key returns null
    result = await client.execute_command("SHRINK", "nonexistent")
    assert result is None


@pytest.mark.asyncio
async def test_memory_shrink_with_scan(df_factory: DflyInstanceFactory):
    df_server = df_factory.create(proactor_threads=1)
    df_server.start()
    client = df_server.client()

    # Create set with many elements
    for i in range(100):
        await client.sadd("set:0", *[f"elem_{j}" for j in range(i * 10, (i + 1) * 10)])

    # Start SCAN
    cursor, keys = await client.sscan("set:0", 0, count=50)

    # Shrink during scan
    await client.execute_command("SHRINK", "set:0")

    # Continue and complete scan
    all_keys = set(keys)
    while cursor != 0:
        cursor, keys = await client.sscan("set:0", cursor, count=50)
        all_keys.update(keys)

    assert len(all_keys) == 1000


@pytest.mark.asyncio
async def test_expiry_heartbeat_responsiveness(df_factory: DflyInstanceFactory):
    df_server = df_factory.create(proactor_threads=1)
    df_server.start()
    client = df_server.client()

    await client.execute_command("DEBUG", "POPULATE", 50000, "key", 1, "EXPIRE", 3, 4)
    await asyncio.sleep(2.5)
    worst_ping = 0
    deadline = time.monotonic() + 60
    while await client.dbsize() > 0:
        t0 = time.monotonic()
        assert t0 < deadline, "All keys did not expire in 60 seconds"
        await client.ping()
        worst_ping = max(time.monotonic() - t0, worst_ping)
        await asyncio.sleep(0.05)
    assert (
        worst_ping < 0.5
    ), f"Worst PING latency {worst_ping:.3f}s exceeded 500ms during mass expiry"


================================================
FILE: tests/dragonfly/proxy.py
================================================
import asyncio
import random


class Proxy:
    def __init__(self, host, port, remote_host, remote_port):
        self.host = host
        self.port = port
        self.remote_host = remote_host
        self.remote_port = remote_port
        self.stop_connections = []
        self.server = None

    async def handle(self, reader, writer):
        try:
            remote_reader, remote_writer = await asyncio.open_connection(
                self.remote_host, self.remote_port
            )
        except OSError:
            writer.close()
            await writer.wait_closed()
            return

        async def forward(reader, writer):
            while True:
                data = await reader.read(1024)
                if not data:
                    break
                writer.write(data)
                await writer.drain()
            writer.close()

        task1 = asyncio.ensure_future(forward(reader, remote_writer))
        task2 = asyncio.ensure_future(forward(remote_reader, writer))

        def cleanup():
            task1.cancel()
            task2.cancel()
            writer.close()
            remote_writer.close()

        self.stop_connections.append(cleanup)

        try:
            await asyncio.gather(task1, task2)
        except (asyncio.CancelledError, ConnectionResetError):
            pass
        finally:
            cleanup()
            if cleanup in self.stop_connections:
                self.stop_connections.remove(cleanup)

    async def start(self):
        self.server = await asyncio.start_server(self.handle, self.host, self.port)

        if self.port == 0:
            _, port = self.server.sockets[0].getsockname()[:2]
            self.port = port

    async def serve(self):
        async with self.server:
            await self.server.serve_forever()

    def drop_connection(self):
        """
        Randomly drop one connection
        """
        if self.stop_connections:
            cb = random.choice(self.stop_connections)
            self.stop_connections.remove(cb)
            cb()

    async def close(self, task=None):
        if self.server is not None:
            self.server.close()
            self.server = None

        for cb in self.stop_connections:
            cb()
        self.stop_connections = []

        if not task == None:
            try:
                await task
            except asyncio.exceptions.CancelledError:
                pass


================================================
FILE: tests/dragonfly/pymemcached_test.py
================================================
import logging
import random
import socket
import ssl
import time

from pymemcache.client.base import Client as MCClient

from . import dfly_args
from .instance import DflyInstance

DEFAULT_ARGS = {"memcached_port": 11212, "proactor_threads": 4}


def read_response(client, expected_len):
    response = b""
    while len(response) < expected_len:
        data = client.recv(1024)
        if not data:
            break
        response += data
    return response


# Generic basic tests
@dfly_args(DEFAULT_ARGS)
class TestMemcached:
    def test_basic(self, memcached_client: MCClient):
        assert not memcached_client.default_noreply

        # set -> replace -> add -> get
        assert memcached_client.set("key1", "value1")
        assert memcached_client.replace("key1", "value2")
        assert not memcached_client.add("key1", "value3")
        assert memcached_client.get("key1") == b"value2"

        # add -> get
        assert memcached_client.add("key2", "value1")
        assert memcached_client.get("key2") == b"value1"

        # delete
        assert memcached_client.delete("key1")
        assert not memcached_client.delete("key3")
        assert memcached_client.get("key1") is None

        # prepend append
        assert memcached_client.set("key4", "B")
        assert memcached_client.prepend("key4", "A")
        assert memcached_client.append("key4", "C")
        assert memcached_client.get("key4") == b"ABC"

        # incr
        memcached_client.set("key5", 0)
        assert memcached_client.incr("key5", 1) == 1
        assert memcached_client.incr("key5", 1) == 2
        assert memcached_client.decr("key5", 1) == 1

        assert memcached_client.gets("key5") == (b"1", b"0")

    # Noreply (and pipeline) tests
    async def test_noreply_pipeline(self, df_server: DflyInstance, memcached_client: MCClient):
        """
        With the noreply option the python client doesn't wait for replies,
        so all the commands are pipelined. Assert pipelines work correctly and the
        succeeding regular command receives a reply (it should join the pipeline as last).
        """

        client = df_server.client()
        for attempts in range(2):
            keys = [f"k{i}" for i in range(1000)]
            values = [f"d{i}" for i in range(len(keys))]

            for k, v in zip(keys, values):
                memcached_client.set(k, v, noreply=True)

            # quick follow up before the pipeline finishes
            assert memcached_client.get("k10") == b"d10"
            # check all commands were executed
            assert memcached_client.get_many(keys) == {k: v.encode() for k, v in zip(keys, values)}

            info = await client.info()
            if info["total_pipelined_commands"] > 100:
                return
            logging.warning(
                f"Have not identified pipelining at attempt {attempts} Info: \n" + str(info)
            )
            await client.flushall()

        assert False, "Pipelining not detected"

    def test_noreply_alternating(self, memcached_client: MCClient):
        """
        Assert alternating noreply works correctly, will cause many dispatch queue emptyings.
        """
        for i in range(200):
            if i % 2 == 0:
                memcached_client.set(f"k{i}", "D1", noreply=True)
                memcached_client.set(f"k{i}", "D2", noreply=True)
                memcached_client.set(f"k{i}", "D3", noreply=True)
            assert memcached_client.add(f"k{i}", "DX", noreply=False) == (i % 2 != 0)

    def test_length_in_set_command(self, df_server: DflyInstance, memcached_client: MCClient):
        """
        Test parser correctly reads value based on length and complains about bad chunks
        """
        cases = [b"NOTFOUR", b"FOUR", b"F4\r\n", b"\r\n\r\n"]

        for case in cases:
            client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client.connect(("127.0.0.1", int(df_server["memcached_port"])))

            logging.info(f"Case {case}")
            client.sendall(b"set foo 0 0 4\r\n" + case + b"\r\n")
            response = client.recv(256).decode()
            if len(case) == 4:
                assert response == "STORED\r\n"
            else:
                # response should follow up with ERROR due to OUR\r\n being
                # parsed as unknown command but we can not guarantee that
                # it will be read in the same recv call, so just check the prefix.
                assert response.startswith("CLIENT_ERROR bad data chunk\r\n")

            client.close()

    def test_pipeline_get_then_stats_version(self, df_server: DflyInstance):
        """
        Verify GET pipelined before STATS or VERSION doesn't crash the server.
        """
        port = int(df_server["memcached_port"])

        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client.settimeout(5)
        client.connect(("127.0.0.1", port))
        client.sendall(b"get nokey\r\nversion\r\n")
        response = read_response(client, len(b"END\r\nVERSION 1.6.0 DF\r\n"))
        client.close()
        assert response == b"END\r\nVERSION 1.6.0 DF\r\n"

        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client.settimeout(5)
        client.connect(("127.0.0.1", port))
        client.sendall(b"get nokey\r\nstats\r\n")
        # Read until both GET's END and STATS' END are received before closing.
        response = b""
        while response.count(b"END\r\n") < 2:
            response += client.recv(4096)
        client.close()
        assert response.startswith(b"END\r\nSTAT ")

    def test_error_in_pipeline(self, df_server: DflyInstance, memcached_client: MCClient):
        """
        Verify correct responses to  "get x\r\ngetaa\r\nget y z\r\n"
        """
        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client.settimeout(5)
        client.connect(("127.0.0.1", int(df_server["memcached_port"])))

        client.sendall(b"get x\r\ngetaa\r\nget y z\r\n")

        expected = b"END\r\nERROR\r\nEND\r\n"
        response = read_response(client, len(expected))
        client.close()

        assert response == expected

    def test_large_request(self, memcached_client):
        assert memcached_client.set(b"key1", b"d" * 4096, noreply=False)
        assert memcached_client.set(b"key2", b"d" * 4096 * 2, noreply=False)

    def test_version(self, memcached_client: MCClient):
        """
        php-memcached client expects version to be in the format of "n.n.n", so we return 1.5.0 emulating an old memcached server.
        Our real version is being returned in the stats command.
        Also verified manually that php client parses correctly the version string that ends with "DF".
        """
        assert b"1.6.0 DF" == memcached_client.version()
        stats = memcached_client.stats()
        version = stats[b"version"].decode("utf-8")
        assert version.startswith("v") or version == "dev"

    def test_flags(self, memcached_client: MCClient):
        for i in range(1, 20):
            flags = random.randrange(50, 1000)
            memcached_client.set("a", "real-value", flags=flags, noreply=True)

            res = memcached_client.raw_command("get a", "END\r\n").split()
            # workaround sometimes memcached_client.raw_command returns empty str
            if len(res) > 0:
                assert res[2].decode() == str(flags)

    def test_expiration(self, memcached_client: MCClient):
        assert not memcached_client.default_noreply

        assert memcached_client.set("key1", "value1", 2)
        assert memcached_client.set("key2", "value2", int(time.time()) + 2)
        assert memcached_client.set("key3", "value3", int(time.time()) + 200)
        assert memcached_client.get("key1") == b"value1"
        assert memcached_client.get("key2") == b"value2"
        assert memcached_client.get("key3") == b"value3"
        assert memcached_client.set("key3", "value3", int(time.time()) - 200)
        assert memcached_client.get("key3") is None
        time.sleep(2)
        assert memcached_client.get("key1") is None
        assert memcached_client.get("key2") is None
        assert memcached_client.get("key3") is None

    def test_pipeline_cas_crash(self, df_server: DflyInstance, memcached_client: MCClient):
        """
        Tests that an unsupported/invalid command (CAS) sent in a pipeline
        after an async command (GETS) does not crash the server
        and correctly buffers the error reply in order.
        """
        client_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client_sock.settimeout(5)
        client_sock.connect(("127.0.0.1", int(df_server["memcached_port"])))

        # Command sequence:
        # 1. SET (sync)
        # 2. GETS (async - forces the next command to not be the head)
        # 3. CAS (hits the default block, triggering the early error)
        payload = (
            b"set mykey 0 0 5\r\nvalue\r\n" b"gets mykey\r\n" b"cas mykey 0 0 5 12345\r\nvalue\r\n"
        )
        client_sock.sendall(payload)

        response = b""
        while b"CLIENT_ERROR bad command line format\r\n" not in response:
            data = client_sock.recv(4096)
            if not data:
                break
            response += data
        client_sock.close()

        # Ensure strict ordering: STORED -> GETS (VALUE + END) -> CLIENT_ERROR
        idx_stored = response.find(b"STORED\r\n")
        idx_value = response.find(b"VALUE mykey")
        idx_error = response.find(b"CLIENT_ERROR bad command line format")
        # Look for the GETS terminator specifically AFTER the value
        idx_end = response.find(b"END\r\n", idx_value)

        assert idx_stored != -1 and idx_value != -1 and idx_error != -1 and idx_end != -1
        assert (
            idx_stored < idx_value < idx_end < idx_error
        ), f"Responses out of order/interleaved: {response}"

        # Final sanity check to ensure the connection/server is still healthy
        assert memcached_client.set("sanity_check", "alive")
        assert memcached_client.get("sanity_check") == b"alive"


@dfly_args(DEFAULT_ARGS)
def test_memcached_tls_no_requirepass(df_factory, with_tls_server_args, with_tls_ca_cert_args):
    """
    Test for issue #5084: ability to use TLS for Memcached without requirepass.

    Dragonfly required a password to be set when using TLS, but the Memcached protocol
    does not support password authentication. This test verifies that we can start
    the server with TLS enabled but without specifying requirepass and with the Memcached port.
    """
    # Create arguments for TLS without specifying requirepass
    server_args = {**DEFAULT_ARGS, **with_tls_server_args, "requirepass": "test_password"}

    # Create and start the server - it should not crash
    server = df_factory.create(**server_args)
    server.start()

    # Give the server time to start
    time.sleep(1)

    # Create SSL context for client
    ssl_context = ssl.create_default_context()
    ssl_context.load_verify_locations(with_tls_ca_cert_args["ca_cert"])
    ssl_context.check_hostname = False

    # Disable certificate verification (since we don't provide a client certificate)
    ssl_context.verify_mode = ssl.CERT_NONE

    # Output port information for diagnostics
    logging.info(f"Connecting to memcached port: {server.mc_port} on host: 127.0.0.1")

    # Connect to Memcached over TLS
    client = MCClient(("127.0.0.1", server.mc_port), tls_context=ssl_context)

    # Test basic operations
    assert client.set("foo", "bar")
    assert client.get("foo") == b"bar"


================================================
FILE: tests/dragonfly/redis_replication_test.py
================================================
import time
import pytest
import asyncio
from redis import asyncio as aioredis
import subprocess
from .utility import *
from .instance import DflyInstanceFactory
from .proxy import Proxy


# Checks that master redis and dragonfly replica are synced by writing a random key to master
# and waiting for it to exist in replica. Foreach db in 0..dbcount-1.
async def await_synced(c_master: aioredis.Redis, c_replica: aioredis.Redis, dbcount=1):
    rnd_str = "".join(random.choices(string.ascii_letters, k=10))
    key = "sync_key/" + rnd_str
    for db in range(dbcount):
        await c_master.set(key, "dummy")
        logging.debug(f"set {key} MASTER db = {db}")
        timeout = 30
        while timeout > 0:
            v = await c_replica.get(key)
            logging.debug(f"get {key} from REPLICA db = {db} got {v}")
            if v is not None:
                break
            repl_state = await c_master.info("replication")
            logging.debug(f"replication info: {repl_state}")
            await asyncio.sleep(1)

            timeout -= 1
        await c_master.close()
        await c_replica.close()
        assert timeout > 0, "Timeout while waiting for replica to sync"


async def await_synced_all(c_master, c_replicas):
    for c_replica in c_replicas:
        await await_synced(c_master, c_replica)


async def check_data(seeder, replicas, c_replicas):
    capture = await seeder.capture()
    for replica, c_replica in zip(replicas, c_replicas):
        await wait_available_async(c_replica)
        assert await seeder.compare(capture, port=replica.port)


# Start replication
async def run_replication(client: aioredis.Redis, port):
    res = await client.execute_command("REPLICAOF localhost " + str(port))
    assert res == "OK"
    await wait_available_async(client)


async def replicate_all(replicas, port):
    await asyncio.gather(*(asyncio.create_task(run_replication(c, port)) for c in replicas))


full_sync_replication_specs = [
    ([1], dict(keys=100, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([1], dict(keys=5000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([2], dict(keys=5000, dbcount=4, unsupported_types=[ValueType.JSON])),
]


@pytest.mark.parametrize("t_replicas, seeder_config", full_sync_replication_specs)
async def test_replication_full_sync(
    df_factory, df_seeder_factory, redis_server, t_replicas, seeder_config, port_picker
):
    master = redis_server
    c_master = aioredis.Redis(port=master.port)
    assert await c_master.ping()

    seeder = df_seeder_factory.create(port=master.port, **seeder_config)
    await seeder.run(target_deviation=0.1)

    replica = df_factory.create(
        port=port_picker.get_available_port(), proactor_threads=t_replicas[0]
    )
    replica.start()
    c_replica = replica.client()
    assert await c_replica.ping()

    await run_replication(c_replica, master.port)
    await await_synced(c_master, c_replica, seeder_config["dbcount"])

    capture = await seeder.capture()
    assert await seeder.compare(capture, port=replica.port)


stable_sync_replication_specs = [
    ([1], dict(keys=100, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([1], dict(keys=10_000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([2], dict(keys=10_000, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([2], dict(keys=10_000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([8], dict(keys=10_000, dbcount=4, unsupported_types=[ValueType.JSON])),
]


@pytest.mark.parametrize("t_replicas, seeder_config", stable_sync_replication_specs)
async def test_replication_stable_sync(
    df_factory, df_seeder_factory, redis_server, t_replicas, seeder_config, port_picker
):
    master = redis_server
    c_master = aioredis.Redis(port=master.port)
    assert await c_master.ping()

    replica = df_factory.create(
        port=port_picker.get_available_port(), proactor_threads=t_replicas[0]
    )
    replica.start()
    c_replica = replica.client()
    assert await c_replica.ping()

    await c_replica.execute_command("REPLICAOF", "localhost", master.port)
    await wait_available_async(c_replica)

    seeder = df_seeder_factory.create(port=master.port, **seeder_config)
    await seeder.run(target_ops=1000)

    await await_synced(c_master, c_replica, seeder_config["dbcount"])

    capture = await seeder.capture()
    assert await seeder.compare(capture, port=replica.port)


# Threads for each dragonfly replica, Seeder Config.
replication_specs = [
    ([1], dict(keys=1000, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([6, 6, 6], dict(keys=4_000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([2, 2], dict(keys=4_000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([8, 8], dict(keys=4_000, dbcount=2, unsupported_types=[ValueType.JSON])),
    ([1] * 8, dict(keys=500, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([1], dict(keys=100, dbcount=4, unsupported_types=[ValueType.JSON])),
]


@pytest.mark.parametrize("t_replicas, seeder_config", replication_specs)
async def test_redis_replication_all(
    df_factory: DflyInstanceFactory,
    df_seeder_factory,
    redis_server,
    t_replicas,
    seeder_config,
    port_picker,
):
    master = redis_server
    c_master = aioredis.Redis(port=master.port)
    assert await c_master.ping()

    replicas = [
        df_factory.create(port=port_picker.get_available_port(), proactor_threads=t)
        for i, t in enumerate(t_replicas)
    ]

    # Fill master with test data
    seeder = df_seeder_factory.create(port=master.port, **seeder_config)
    await seeder.run(target_deviation=0.1)

    # Start replicas
    df_factory.start_all(replicas)

    c_replicas = [replica.client() for replica in replicas]

    # Start data stream
    stream_task = asyncio.create_task(seeder.run())
    await asyncio.sleep(0.0)

    await replicate_all(c_replicas, master.port)

    # Wait for streaming to finish
    assert (
        not stream_task.done()
    ), "Weak testcase. Increase number of streamed iterations to surpass full sync"
    seeder.stop()
    await stream_task

    # Check data after full sync
    await await_synced_all(c_master, c_replicas)
    await check_data(seeder, replicas, c_replicas)

    # Stream more data in stable state
    await seeder.run(target_ops=2000)

    # Check data after stable state stream
    await await_synced_all(c_master, c_replicas)
    await check_data(seeder, replicas, c_replicas)


master_disconnect_cases = [
    ([6], 1, dict(keys=4_000, dbcount=1, unsupported_types=[ValueType.JSON])),
    ([1, 4, 6], 3, dict(keys=1_000, dbcount=2, unsupported_types=[ValueType.JSON])),
]


@pytest.mark.parametrize("t_replicas, t_disconnect, seeder_config", master_disconnect_cases)
async def test_redis_master_restart(
    df_factory,
    df_seeder_factory,
    redis_server,
    t_replicas,
    t_disconnect,
    seeder_config,
    port_picker,
):
    master = redis_server
    c_master = aioredis.Redis(port=master.port)
    assert await c_master.ping()

    replicas = [
        df_factory.create(port=port_picker.get_available_port(), proactor_threads=t)
        for i, t in enumerate(t_replicas)
    ]

    # Fill master with test data
    seeder = df_seeder_factory.create(port=master.port, **seeder_config)
    await seeder.run(target_deviation=0.1)

    # Start replicas
    df_factory.start_all(replicas)

    c_replicas = [replica.client() for replica in replicas]

    # Start data stream
    stream_task = asyncio.create_task(seeder.run())
    await asyncio.sleep(0.0)

    await replicate_all(c_replicas, master.port)

    # Wait for streaming to finish
    assert (
        not stream_task.done()
    ), "Weak testcase. Increase number of streamed iterations to surpass full sync"
    seeder.stop()
    await stream_task

    for _ in range(t_disconnect):
        master.stop()
        await asyncio.sleep(1)
        master.start()
        await asyncio.sleep(1)
        # fill master with data
        await seeder.run(target_deviation=0.1)

    # Check data after stable state stream
    await wait_available_async(c_replicas)
    await await_synced_all(c_master, c_replicas)
    await check_data(seeder, replicas, c_replicas)


master_disconnect_cases = [
    ([6], dict(keys=4_000, dbcount=1, unsupported_types=[ValueType.JSON])),
    pytest.param(
        [1, 4, 6],
        dict(keys=1_000, dbcount=2, unsupported_types=[ValueType.JSON]),
        marks=pytest.mark.large,
    ),
]


@pytest.mark.parametrize("t_replicas, seeder_config", master_disconnect_cases)
async def test_disconnect_master(
    df_factory,
    df_seeder_factory,
    redis_server,
    t_replicas,
    seeder_config,
    port_picker,
):
    master = redis_server
    c_master = aioredis.Redis(port=master.port)
    assert await c_master.ping()

    proxy = Proxy("127.0.0.1", 1114, "127.0.0.1", master.port)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    replicas = [
        df_factory.create(port=port_picker.get_available_port(), proactor_threads=t)
        for i, t in enumerate(t_replicas)
    ]

    # Fill master with test data
    seeder = df_seeder_factory.create(port=master.port, **seeder_config)
    await seeder.run(target_deviation=0.1)

    # Start replicas
    df_factory.start_all(replicas)

    c_replicas = [replica.client() for replica in replicas]

    # Start data stream
    stream_task = asyncio.create_task(seeder.run())
    await asyncio.sleep(0.5)

    await replicate_all(c_replicas, proxy.port)

    # Break the connection between master and replica
    await proxy.close(proxy_task)
    await asyncio.sleep(2)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # finish streaming data
    await asyncio.sleep(1)
    seeder.stop()
    await stream_task

    # Check data after stable state stream
    await wait_available_async(c_replicas)
    await await_synced_all(c_master, c_replicas)
    await check_data(seeder, replicas, c_replicas)

    await proxy.close(proxy_task)


================================================
FILE: tests/dragonfly/replication_test.py
================================================
import os
import platform
import shutil
import signal
import struct
import tarfile
import time
import urllib.request
from itertools import chain, repeat

import async_timeout
import pymemcache

from . import dfly_args
from .instance import DflyInstanceFactory, DflyInstance
from .proxy import Proxy
from .seeder import DebugPopulateSeeder, HnswSearchSeeder
from .seeder import Seeder as SeederV2
from .utility import *

ADMIN_PORT = 1211

DISCONNECT_CRASH_FULL_SYNC = 0
DISCONNECT_CRASH_STABLE_SYNC = 1
DISCONNECT_NORMAL_STABLE_SYNC = 2

M_OPT = [pytest.mark.opt_only]
M_SLOW = [pytest.mark.large]
M_STRESS = [pytest.mark.large, pytest.mark.opt_only]
M_NOT_EPOLL = [pytest.mark.exclude_epoll]


"""
Test full replication pipeline. Test full sync with streaming changes and stable state streaming.
"""


@pytest.mark.parametrize(
    "t_master, t_replicas, seeder_config, stream_target",
    [
        # Quick general test that replication is working
        (1, 3 * [1], dict(key_target=1_000), 500),
        # A lot of huge values
        (2, 2 * [1], dict(key_target=5_000, huge_value_target=30), 500),
        (4, [4, 4], dict(key_target=10_000), 1_000),
        pytest.param(6, [6, 6, 6], dict(key_target=100_000), 20_000, marks=M_OPT),
        # Skewed tests with different thread ratio
        pytest.param(8, 6 * [1], dict(key_target=5_000), 2_000, marks=M_SLOW),
        pytest.param(2, [8, 8], dict(key_target=10_000), 2_000, marks=M_SLOW),
        # Everything is big because data size is 10k
        pytest.param(
            2, [2], dict(key_target=1_000, data_size=10_000, huge_value_target=0), 100, marks=M_SLOW
        ),
        # Stress test
        pytest.param(8, [8, 8], dict(key_target=1_000_000, units=16), 50_000, marks=M_STRESS),
    ],
)
@pytest.mark.parametrize("mode", [({}), ({"cache_mode": "true"})])
@pytest.mark.parametrize("background_snapshotting", [False, True])
# Disabled cache_mode until #5371 is fixed
# @pytest.mark.parametrize("point_in_time_replication", [True, False])
async def test_replication_all(
    df_factory: DflyInstanceFactory,
    t_master,
    t_replicas,
    seeder_config,
    stream_target,
    mode,
    background_snapshotting,
    # point_in_time_replication,
):
    args = {}
    if mode:
        args["cache_mode"] = "true"
        args["maxmemory"] = str(t_master * 256) + "mb"

    if background_snapshotting:
        args["background_heartbeat"] = None
        args["background_snapshotting"] = None

    master = df_factory.create(
        admin_port=ADMIN_PORT,
        proactor_threads=t_master,
        # point_in_time_snapshot=point_in_time_replication,
        **args,
    )
    replicas = [
        df_factory.create(admin_port=ADMIN_PORT + i + 1, proactor_threads=t)
        for i, t in enumerate(t_replicas)
    ]

    from_admin_port = random.choice([True, False])

    # Start instances and connect clients
    df_factory.start_all([master] + replicas)
    c_master = master.client()
    c_replicas = [replica.client() for replica in replicas]

    # Fill master with test data
    seeder = SeederV2(**seeder_config, huge_value_add_only=True)
    await seeder.run(c_master, target_deviation=0.01)

    # Start data stream
    stream_task = asyncio.create_task(seeder.run(c_master))
    await asyncio.sleep(0.0)

    # Start replication
    master_port = master.port if not from_admin_port else master.admin_port
    await asyncio.gather(
        *(
            asyncio.create_task(c.execute_command("REPLICAOF localhost " + str(master_port)))
            for c in c_replicas
        )
    )

    # Wait for all replicas to transition into stable sync
    async with async_timeout.timeout(240):
        await wait_for_replicas_state(*c_replicas)

    # Stop streaming data once every replica is in stable sync
    await seeder.stop(c_master)
    await stream_task

    # Check data after full sync
    async def check():
        await check_all_replicas_finished(c_replicas, c_master)
        hashes = await asyncio.gather(*(SeederV2.capture(c) for c in [c_master] + c_replicas))
        assert len(set(hashes)) == 1

    await check()
    # Stream more data in stable state
    await seeder.run(c_master, target_ops=stream_target)

    # Check data after stable state stream
    await check()

    info = await c_master.info()
    preemptions = info["big_value_preemptions"]
    key_capacity = info["prime_capacity"]
    compressed_blobs = info["compressed_blobs"]
    logging.debug(
        f"Compressed blobs {compressed_blobs} .Capacity {key_capacity}. Preemptions {preemptions}"
    )

    assert preemptions >= seeder.huge_value_target * 0.5
    assert compressed_blobs > 0
    # Because data size could be 10k and for that case there will be almost a preemption
    # per bucket.
    if seeder.data_size < 1000:
        # We care that we preempt less times than the total buckets such that we can be
        # sure that we test both flows (with and without preemptions). Preemptions on 3%
        # of buckets seems like a big number but that depends on a few parameters like
        # the size of the hug value and the serialization max chunk size. For the test cases here,
        # it's usually close to 1% but there are some that are close to 3.
        assert preemptions <= (key_capacity * 0.03)


"""
Test disconnecting replicas during different phases while constantly streaming changes to master.

This test is targeted at the master cancellation mechanism that should qickly stop operations for a
disconnected replica.

Three types are tested:
1. Replicas crashing during full sync state
2. Replicas crashing during stable sync state
3. Replicas disconnecting normally with REPLICAOF NO ONE during stable state
"""

# 1. Number of master threads
# 2. Number of threads for each replica that crashes during full sync
# 3. Number of threads for each replica that crashes during stable sync
# 4. Number of threads for each replica that disconnects normally
# 5. Number of distinct keys that are constantly streamed
disconnect_cases = [
    # balanced
    (8, [4, 4], [4, 4], [4], 4_000),
    (4, [2] * 4, [2] * 4, [2, 2], 2_000),
    # full sync heavy
    (8, [4] * 4, [], [], 4_000),
    # stable state heavy
    (8, [], [4] * 4, [], 4_000),
    # disconnect only
    (8, [], [], [4] * 4, 4_000),
]


@pytest.mark.parametrize("t_master, t_crash_fs, t_crash_ss, t_disonnect, n_keys", disconnect_cases)
async def test_disconnect_replica(
    df_factory: DflyInstanceFactory,
    df_seeder_factory,
    t_master,
    t_crash_fs,
    t_crash_ss,
    t_disonnect,
    n_keys,
):
    master = df_factory.create(
        proactor_threads=t_master, vmodule="replica=2,dflycmd=2,server_family=2"
    )
    replicas = [
        (
            df_factory.create(proactor_threads=t, vmodule="replica=2,dflycmd=2,server_family=2"),
            crash_fs,
        )
        for i, (t, crash_fs) in enumerate(
            chain(
                zip(t_crash_fs, repeat(DISCONNECT_CRASH_FULL_SYNC)),
                zip(t_crash_ss, repeat(DISCONNECT_CRASH_STABLE_SYNC)),
                zip(t_disonnect, repeat(DISCONNECT_NORMAL_STABLE_SYNC)),
            )
        )
    ]

    logging.debug("Start master")
    master.start()
    c_master = master.client(single_connection_client=True)

    logging.debug("Start replicas and create clients")
    df_factory.start_all([replica for replica, _ in replicas])

    c_replicas = [(replica, replica.client(), crash_type) for replica, crash_type in replicas]

    def replicas_of_type(tfunc):
        return [args for args in c_replicas if tfunc(args[2])]

    logging.debug("Start data fill loop")
    seeder = df_seeder_factory.create(port=master.port, keys=n_keys, dbcount=2)
    fill_task = asyncio.create_task(seeder.run())

    logging.debug("Run full sync")

    async def full_sync(replica: DflyInstance, c_replica, crash_type):
        await c_replica.execute_command("REPLICAOF localhost " + str(master.port))
        if crash_type == 0:
            await asyncio.sleep(random.random() / 100 + 0.01)
            await c_replica.aclose()
            replica.stop(kill=True)
        else:
            await wait_available_async(c_replica)

    await asyncio.gather(*(full_sync(*args) for args in c_replicas))

    # Wait for master to stream a bit more
    await asyncio.sleep(0.1)

    # Check master survived full sync crashes
    assert await c_master.ping()

    # Check phase-2 replicas survived
    for _, c_replica, _ in replicas_of_type(lambda t: t > 0):
        assert await c_replica.ping()

    logging.debug("Run stable state crashes")

    async def stable_sync(replica, c_replica, crash_type):
        await asyncio.sleep(random.random() / 100)
        await c_replica.aclose()
        replica.stop(kill=True)

    await asyncio.gather(*(stable_sync(*args) for args in replicas_of_type(lambda t: t == 1)))

    # Check master survived all crashes
    assert await c_master.ping()

    # Check phase 3 replica survived
    for _, c_replica, _ in replicas_of_type(lambda t: t > 1):
        assert await c_replica.ping()

    logging.debug("Check master survived all crashes")
    assert await c_master.ping()

    # Check disconnects
    async def disconnect(replica, c_replica, crash_type):
        await asyncio.sleep(random.random() / 100)
        await c_replica.execute_command("REPLICAOF NO ONE")

    logging.debug("disconnect replicas")
    await asyncio.gather(*(disconnect(*args) for args in replicas_of_type(lambda t: t == 2)))

    await asyncio.sleep(0.5)

    logging.debug("Check phase 3 replica survived")
    for replica, c_replica, _ in replicas_of_type(lambda t: t == 2):
        assert await c_replica.ping()
        await c_replica.aclose()

    logging.debug("Stop streaming")
    seeder.stop()
    await fill_task

    logging.debug("Check master survived all disconnects")
    assert await c_master.ping()


"""
Test stopping master during different phases.

This test is targeted at the replica cancellation mechanism that should quickly abort a failed operation
and revert to connection retry state.

Three types are tested:
1. Master crashing during full sync state
2. Master crashing in a random state.
3. Master crashing during stable sync state

"""

# 1. Number of master threads
# 2. Number of threads for each replica
# 3. Number of times a random crash happens
# 4. Number of keys transferred (the more, the higher the propability to not miss full sync)
master_crash_cases = [
    (6, [6], 3, 2_000),
    (4, [4, 4, 4], 3, 2_000),
]


@pytest.mark.large
@pytest.mark.parametrize("t_master, t_replicas, n_random_crashes, n_keys", master_crash_cases)
async def test_disconnect_master(
    df_factory, df_seeder_factory, t_master, t_replicas, n_random_crashes, n_keys
):
    master = df_factory.create(port=1111, proactor_threads=t_master)
    replicas = [df_factory.create(proactor_threads=t) for i, t in enumerate(t_replicas)]

    df_factory.start_all(replicas)
    c_replicas = [replica.client() for replica in replicas]

    seeder = df_seeder_factory.create(port=master.port, keys=n_keys, dbcount=2)

    async def crash_master_fs():
        await asyncio.sleep(random.random() / 10)
        master.stop(kill=True)

    async def start_master():
        await asyncio.sleep(0.2)
        master.start()
        async with master.client() as c_master:
            assert await c_master.ping()
            seeder.reset()
            await seeder.run(target_deviation=0.1)

    await start_master()

    # Crash master during full sync, but with all passing initial connection phase
    await asyncio.gather(
        *(
            c_replica.execute_command("REPLICAOF localhost " + str(master.port))
            for c_replica in c_replicas
        )
    )
    await crash_master_fs()

    await asyncio.sleep(1 + len(replicas) * 0.5)

    for _ in range(n_random_crashes):
        await start_master()
        await asyncio.sleep(random.random() + len(replicas) * random.random() / 10)
        # Crash master in some random state for each replica
        master.stop(kill=True)

    await start_master()
    await asyncio.sleep(1 + len(replicas) * 0.5)  # Replicas check every 500ms.
    capture = await seeder.capture()
    for replica, c_replica in zip(replicas, c_replicas):
        await wait_available_async(c_replica)
        assert await seeder.compare(capture, port=replica.port)

    # Crash master during stable state
    master.stop(kill=True)

    await start_master()
    await asyncio.sleep(1 + len(replicas) * 0.5)
    capture = await seeder.capture()
    for c_replica in c_replicas:
        await wait_available_async(c_replica)
        assert await seeder.compare(capture, port=replica.port)


"""
Test re-connecting replica to different masters.
"""

rotating_master_cases = [(4, [4, 4, 4, 4], dict(keys=2_000, dbcount=4))]


@pytest.mark.large
@pytest.mark.parametrize("t_replica, t_masters, seeder_config", rotating_master_cases)
async def test_rotating_masters(df_factory, df_seeder_factory, t_replica, t_masters, seeder_config):
    replica = df_factory.create(proactor_threads=t_replica)
    masters = [df_factory.create(proactor_threads=t) for i, t in enumerate(t_masters)]
    df_factory.start_all([replica] + masters)

    seeders = [df_seeder_factory.create(port=m.port, **seeder_config) for m in masters]

    c_replica = replica.client()

    await asyncio.gather(*(seeder.run(target_deviation=0.1) for seeder in seeders))

    fill_seeder = None
    fill_task = None

    for master, seeder in zip(masters, seeders):
        if fill_task is not None:
            fill_seeder.stop()
            fill_task.cancel()

        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        await wait_available_async(c_replica)

        capture = await seeder.capture()
        assert await seeder.compare(capture, port=replica.port)

        fill_task = asyncio.create_task(seeder.run())
        fill_seeder = seeder

    if fill_task is not None:
        fill_seeder.stop()
        fill_task.cancel()


@pytest.mark.large
async def test_cancel_replication_immediately(df_factory, df_seeder_factory: DflySeederFactory):
    """
    Issue 100 replication commands. This checks that the replication state
    machine can handle cancellation well.
    """
    COMMANDS_TO_ISSUE = 100

    replica = df_factory.create()
    master = df_factory.create()
    df_factory.start_all([replica, master])

    seeder = df_seeder_factory.create(port=master.port)
    c_replica = replica.client(socket_timeout=80)

    await seeder.run(target_deviation=0.1)

    async def ping_status():
        while True:
            await c_replica.info()
            await asyncio.sleep(0.05)

    async def replicate():
        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        return True

    ping_job = asyncio.create_task(ping_status())
    replication_commands = [asyncio.create_task(replicate()) for _ in range(COMMANDS_TO_ISSUE)]

    num_successes = 0
    for result in asyncio.as_completed(replication_commands, timeout=80):
        num_successes += await result

    logging.info(f"succeses: {num_successes}")
    assert COMMANDS_TO_ISSUE == num_successes

    await wait_available_async(c_replica)
    capture = await seeder.capture()
    logging.info(f"number of items captured {len(capture)}")
    assert await seeder.compare(capture, replica.port)

    ping_job.cancel()

    replica.stop()
    lines = replica.find_in_logs("Stopping replication")
    # Cancelled 99 times by REPLICAOF command and once by Shutdown() because
    # we stopped the instance
    assert len(lines) == COMMANDS_TO_ISSUE


"""
Test flushall command. Set data to master send flashall and set more data.
Check replica keys at the end.
"""


async def test_flushall(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=2)

    master.start()
    replica.start()

    # Connect replica to master
    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    n_keys = 1000

    def gen_test_data(start, end):
        for i in range(start, end):
            yield f"key-{i}", f"value-{i}"

    c_master = master.client()
    pipe = c_master.pipeline(transaction=False)
    # Set simple keys 0..n_keys on master
    batch_fill_data(client=pipe, gen=gen_test_data(0, n_keys), batch_size=3)
    # flushall
    pipe.flushall()
    # Set simple keys n_keys..n_keys*2 on master
    batch_fill_data(client=pipe, gen=gen_test_data(n_keys, n_keys * 2), batch_size=3)

    await pipe.execute()
    # Check replica finished executing the replicated commands
    await check_all_replicas_finished([c_replica], c_master)

    # Check replica keys 0..n_keys-1 dont exist
    pipe = c_replica.pipeline(transaction=False)
    for i in range(n_keys):
        pipe.get(f"key-{i}")
    vals = await pipe.execute()
    assert all(v is None for v in vals)

    # Check replica keys n_keys..n_keys*2-1 exist
    for i in range(n_keys, n_keys * 2):
        pipe.get(f"key-{i}")
    vals = await pipe.execute()
    assert all(v is not None for v in vals)


"""
Test journal rewrites.
"""


@dfly_args({"proactor_threads": 4})
async def test_rewrites(df_factory):
    CLOSE_TIMESTAMP = int(time.time()) + 100
    CLOSE_TIMESTAMP_MS = CLOSE_TIMESTAMP * 1000

    master = df_factory.create()
    replica = df_factory.create()

    master.start()
    replica.start()

    # Connect clients, connect replica to master
    c_master = master.client()
    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # Create monitor and bind utility functions
    m_replica = c_replica.monitor()

    async def get_next_command():
        mcmd = (await m_replica.next_command())["command"]
        # skip select command
        while mcmd == "SELECT 0" or mcmd.startswith("CLIENT SETINFO"):
            mcmd = (await m_replica.next_command())["command"]
        print("Got:", mcmd)
        return mcmd

    async def is_match_rsp(rx):
        mcmd = await get_next_command()
        print(mcmd, rx)
        return re.match(rx, mcmd)

    async def skip_cmd():
        await is_match_rsp(r".*")

    async def skip_cmds(n):
        for _ in range(n):
            await skip_cmd()

    async def check(cmd, rx):
        await c_master.execute_command(cmd)
        match = await is_match_rsp(rx)
        assert match

    async def check_list(cmd, rx_list):
        print("master cmd:", cmd)
        await c_master.execute_command(cmd)
        for rx in rx_list:
            match = await is_match_rsp(rx)
            assert match

    async def check_list_ooo(cmd, rx_list):
        print("master cmd:", cmd)
        await c_master.execute_command(cmd)
        expected_cmds = len(rx_list)
        for i in range(expected_cmds):
            mcmd = await get_next_command()
            # check command matches one regex from list
            match_rx = list(filter(lambda rx: re.match(rx, mcmd), rx_list))
            assert len(match_rx) == 1
            rx_list.remove(match_rx[0])

    async def check_expire(key):
        ttl1 = await c_master.ttl(key)
        ttl2 = await c_replica.ttl(key)
        await skip_cmd()
        assert abs(ttl1 - ttl2) <= 1

    async with m_replica:
        # CHECK EXPIRE, PEXPIRE, PEXPIRE turn into EXPIREAT
        await c_master.set("k-exp", "v")
        await skip_cmd()
        await check("EXPIRE k-exp 100", r"PEXPIREAT k-exp (.*?)")
        await check_expire("k-exp")
        await check("PEXPIRE k-exp 50000", r"PEXPIREAT k-exp (.*?)")
        await check_expire("k-exp")
        await check(f"EXPIREAT k-exp {CLOSE_TIMESTAMP}", rf"PEXPIREAT k-exp {CLOSE_TIMESTAMP_MS}")

        # Check SPOP turns into SREM or SDEL
        await c_master.sadd("k-set", "v1", "v2", "v3")
        await skip_cmd()
        await check("SPOP k-set 1", r"SREM k-set (v1|v2|v3)")
        await check("SPOP k-set 2", r"DEL k-set")

        # Check SET + {EX/PX/EXAT} + {XX/NX/GET} arguments turns into SET PXAT
        await check(f"SET k v EX 100 NX GET", r"SET k v PXAT (.*?)")
        await check_expire("k")
        await check(f"SET k v PX 50000", r"SET k v PXAT (.*?)")
        await check_expire("k")
        # Exact expiry is skewed
        await check(f"SET k v XX EXAT {CLOSE_TIMESTAMP}", rf"SET k v PXAT (.*?)")
        await check_expire("k")

        # Check SET + KEEPTTL doesn't loose KEEPTTL
        await check(f"SET k v KEEPTTL", r"SET k v KEEPTTL")

        # Check SETEX/PSETEX turn into SET PXAT
        await check("SETEX k 100 v", r"SET k v PXAT (.*?)")
        await check_expire("k")
        await check("PSETEX k 500000 v", r"SET k v PXAT (.*?)")
        await check_expire("k")

        # Check GETEX turns into PEXPIREAT or PERSIST
        await check("GETEX k PERSIST", r"PERSIST k")
        await check_expire("k")
        await check("GETEX k EX 100", r"PEXPIREAT k (.*?)")
        await check_expire("k")

        # Check SDIFFSTORE turns into DEL and SADD
        await c_master.sadd("set1", "v1", "v2", "v3")
        await c_master.sadd("set2", "v1", "v2")
        await skip_cmd()
        await skip_cmd()
        await check_list("SDIFFSTORE k set1 set2", [r"DEL k", r"SADD k v3"])

        # Check SINTERSTORE turns into DEL and SADD
        await check_list("SINTERSTORE k set1 set2", [r"DEL k", r"SADD k (.*?)"])

        # Check SMOVE turns into SREM and SADD
        await check_list_ooo("SMOVE set1 set2 v3", [r"SREM set1 v3", r"SADD set2 v3"])

        # Check SUNIONSTORE turns into DEL and SADD
        await check_list_ooo("SUNIONSTORE k set1 set2", [r"DEL k", r"SADD k (.*?)"])

        # Check ZDIFFSTORE turns into DEL and ZADD
        await c_master.execute_command("zadd zet1 1 v1 2 v2 3 v3")
        await c_master.execute_command("zadd zet2 1 v1 2 v2")
        await skip_cmd()
        await skip_cmd()
        await check_list("ZDIFFSTORE k 2 zet1 zet2", [r"DEL k", r"ZADD k 3 v3"])

        # Check ZINTERSTORE turns into DEL and ZADD
        await check_list("ZINTERSTORE k 2 zet1 zet2", [r"DEL k", r"ZADD k (.*?)"])

        # Check ZRANGESTORE turns into SREM and ZADD
        await check_list_ooo("ZRANGESTORE k zet1 2 -1", [r"DEL k", r"ZADD k 3 v3"])

        # Check ZUNIONSTORE turns into DEL and ZADD
        await check_list_ooo("ZUNIONSTORE k 2 zet1 zet2", [r"DEL k", r"ZADD k (.*?)"])

        await c_master.set("k1", "1000")
        await c_master.set("k2", "1100")
        await skip_cmd()
        await skip_cmd()
        # Check BITOP turns into SET
        await check("BITOP OR kdest k1 k2", r"SET kdest 1100")
        # See gh issue #3528
        await c_master.execute_command(f"HSET foo bar val")
        await skip_cmd()
        await check("BITOP NOT foo tmp", r"DEL foo")
        await c_master.execute_command(f"HSET foo bar val")
        await skip_cmd()
        await c_master.set("k3", "-")
        await skip_cmd()
        await check("BITOP NOT foo k3", r"SET foo \\xd2")

        # Check there is no rewrite for LMOVE on single shard
        await c_master.lpush("list", "v1", "v2", "v3", "v4")
        await skip_cmd()
        # Check LMOVE/BLMOVE turns into POP PUSH
        await check_list_ooo("LMOVE list list LEFT RIGHT", [r"LPOP list", r"RPUSH list v4"])
        await check_list_ooo("BLMOVE list list RIGHT LEFT 0", [r"RPOP list", r"LPUSH list v4"])

        # Check RPOPLPUSH turns into RPOP LPUSH
        await check_list_ooo("RPOPLPUSH list list", [r"RPOP list", r"LPUSH list v1"])
        # Check BRPOPLPUSH turns into RPOP LPUSH
        await check_list_ooo("BRPOPLPUSH list list 0", [r"RPOP list", r"LPUSH list v2"])
        # Check BLPOP turns into LPOP
        await check("BLPOP list list1 0", r"LPOP list")
        # Check BRPOP turns into RPOP
        await check("BRPOP list 0", r"RPOP list")

        await c_master.lpush("list1s", "v1", "v2", "v3", "v4")
        await skip_cmd()
        # Check LMOVE turns into LPUSH LPOP on multi shard
        await check_list_ooo("LMOVE list1s list2s LEFT LEFT", [r"LPUSH list2s v4", r"LPOP list1s"])
        # Check RPOPLPUSH turns into LPUSH RPOP on multi shard
        await check_list_ooo("RPOPLPUSH list1s list2s", [r"LPUSH list2s v1", r"RPOP list1s"])
        # Check BRPOPLPUSH turns into LPUSH RPOP on multi shard
        await check_list_ooo("BRPOPLPUSH list1s list2s 0", [r"LPUSH list2s v2", r"RPOP list1s"])

        await check("LMPOP 2 list list1s LEFT", r"LPOP list")
        await check("BLMPOP 0 2 list1s list RIGHT", r"RPOP list1s")

        # MOVE runs as global command, check only one journal entry is sent
        await check("MOVE list2s 2", r"MOVE list2s 2")

        await c_master.set("renamekey", "1000", px=50000)
        await skip_cmd()
        # Check RENAME turns into DEL and RESTORE
        await check_list_ooo(
            "RENAME renamekey renamed",
            [r"DEL renamekey", r"RESTORE renamed (.*?) (.*?) REPLACE ABSTTL"],
        )
        await check_expire("renamed")
        # Check RENAMENX turns into DEL and RESTORE
        await check_list_ooo(
            "RENAMENX renamed renamekey",
            [r"DEL renamed", r"RESTORE renamekey (.*?) (.*?) REPLACE ABSTTL"],
        )
        await check_expire("renamekey")

        # Test autojournaling in the multi-mode
        await c_master.execute_command("XADD k-stream * field value")
        await c_master.execute_command("SADD k-one-element-set value1 value2")
        sha = await c_master.script_load(
            "redis.call('XTRIM', KEYS[1], 'MINID', '0'); return redis.call('SPOP', KEYS[2]);"
        )
        await skip_cmds(3)
        # The first call to XTRIM triggers autojournaling.
        # The SPOP command is executed with CO::NO_AUTOJOURNALING.
        # This test ensures that the SPOP command is still properly replicated
        await check_list_ooo(
            f"EVALSHA {sha} 2 k-stream k-one-element-set",
            [r"XTRIM k-stream MINID 0", r"SREM k-one-element-set value[12]"],
        )

        # TODO next Z-tests won't work with no-point-in-time replication
        # check BZMPOP turns into ZPOPMAX and ZPOPMIN command
        await c_master.zadd("key", {"a": 1, "b": 2, "c": 3})
        await skip_cmd()
        await check("BZMPOP 0 3 key3 key2 key MAX COUNT 3", r"ZPOPMAX key 3")

        await c_master.zadd("key", {"a": 1, "b": 2, "c": 3})
        await skip_cmd()
        await check("BZMPOP 0 3 key3 key2 key MIN", r"ZPOPMIN key 1")

        # Check ZMPOP turns into ZPOPMAX and ZPOPMIN commands
        await c_master.zadd("key", {"a": 1, "b": 2, "c": 3})
        await skip_cmd()
        await check("ZMPOP 3 key3 key2 key MIN COUNT 3", r"ZPOPMIN key 3")

        await c_master.zadd("key", {"a": 1, "b": 2, "c": 3})
        await skip_cmd()
        await check("ZMPOP 3 key3 key2 key MAX", r"ZPOPMAX key 1")

        # Check XREADGROUP turns into XGROUP SETID + XCLAIM (for non-NOACK)
        await c_master.execute_command("XGROUP CREATE mystream mygroup $ MKSTREAM")
        await skip_cmd()
        await c_master.execute_command("XADD mystream * field1 value1")
        await skip_cmd()
        # XREADGROUP without NOACK should journal XCLAIM + XGROUP SETID
        await c_master.execute_command("XREADGROUP GROUP mygroup consumer1 STREAMS mystream >")
        # Consumer creation
        assert await is_match_rsp("XGROUP CREATECONSUMER mystream mygroup consumer1")
        # Expect XCLAIM for the message + XGROUP SETID with ENTRIESREAD
        assert await is_match_rsp(
            r"XCLAIM mystream mygroup consumer1 0 (.*?) TIME \d+ RETRYCOUNT 1 FORCE JUSTID LASTID (.*?)"
        )
        assert await is_match_rsp(r"XGROUP SETID mystream mygroup (.*?) ENTRIESREAD 1")

        # Check XREADGROUP with NOACK only journals XGROUP SETID
        await c_master.execute_command("XADD mystream * field2 value2")
        await skip_cmd()
        await c_master.execute_command(
            "XREADGROUP GROUP mygroup consumer1 NOACK STREAMS mystream >"
        )
        # With NOACK, only XGROUP SETID should be journaled (no XCLAIM)
        assert await is_match_rsp(r"XGROUP SETID mystream mygroup (.*?) ENTRIESREAD 2")


"""
Test automatic replication of expiry.
"""


@dfly_args({"proactor_threads": 4})
async def test_expiry(df_factory: DflyInstanceFactory, n_keys=1000):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # Connect replica to master
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # Set keys
    pipe = c_master.pipeline(transaction=False)
    batch_fill_data(pipe, gen_test_data(n_keys))
    await pipe.execute()

    # Check replica finished executing the replicated commands
    await check_all_replicas_finished([c_replica], c_master)
    # Check keys are on replica
    res = await c_replica.mget(k for k, _ in gen_test_data(n_keys))
    assert all(v is not None for v in res)

    # Set key different expries times in ms
    pipe = c_master.pipeline(transaction=True)
    for k, _ in gen_test_data(n_keys):
        ms = random.randint(20, 500)
        pipe.pexpire(k, ms)
    await pipe.execute()

    # send more traffic for differnt dbs while keys are expired
    for i in range(8):
        is_multi = i % 2
        async with aioredis.Redis(port=master.port, db=i) as c_master_db:
            pipe = c_master_db.pipeline(transaction=is_multi)
            # Set simple keys n_keys..n_keys*2 on master
            start_key = n_keys * (i + 1)
            end_key = start_key + n_keys
            batch_fill_data(client=pipe, gen=gen_test_data(end_key, start_key), batch_size=20)

            await pipe.execute()

    # Wait for master to expire keys
    await asyncio.sleep(3.0)

    # Check all keys with expiry have been deleted
    res = await c_master.mget(k for k, _ in gen_test_data(n_keys))
    assert all(v is None for v in res)

    # Check replica finished executing the replicated commands
    await check_all_replicas_finished([c_replica], c_master)
    res = await c_replica.mget(k for k, _ in gen_test_data(n_keys))
    assert all(v is None for v in res)

    # Set expired keys again
    pipe = c_master.pipeline(transaction=False)
    batch_fill_data(pipe, gen_test_data(n_keys))
    for k, _ in gen_test_data(n_keys):
        pipe.pexpire(k, 500)
    await pipe.execute()
    await asyncio.sleep(1.0)
    # Disconnect from master
    await c_replica.execute_command("REPLICAOF NO ONE")
    # Check replica expires keys on its own
    await asyncio.sleep(1.0)
    res = await c_replica.mget(k for k, _ in gen_test_data(n_keys))
    assert all(v is None for v in res)


@dfly_args({"proactor_threads": 4})
async def test_simple_scripts(df_factory: DflyInstanceFactory):
    master = df_factory.create()
    replicas = [df_factory.create() for _ in range(2)]
    df_factory.start_all([master] + replicas)

    c_replicas = [replica.client() for replica in replicas]
    c_master = master.client()

    # Connect replicas and wait for sync to finish
    for c_replica in c_replicas:
        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await check_all_replicas_finished([c_replica], c_master)

    # Generate some scripts and run them
    keys = ["a", "b", "c", "d", "e"]
    for i in range(len(keys) + 1):
        script = ""
        subkeys = keys[:i]
        for key in subkeys:
            script += f"redis.call('INCR', '{key}')"
            script += f"redis.call('INCR', '{key}')"

        await c_master.eval(script, len(subkeys), *subkeys)

    # Wait for replicas
    await check_all_replicas_finished([c_replica], c_master)

    for c_replica in c_replicas:
        assert (await c_replica.mget(keys)) == ["10", "8", "6", "4", "2"]


"""
Test script replication.

Fill multiple lists with values and rotate them one by one with LMOVE until they're at the same place again.
"""

# t_master, t_replicas, num_ops, num_keys, num_parallel, flags
script_cases = [
    (4, [4, 4, 4], 50, 5, 5, ""),
    (4, [4, 4, 4], 50, 5, 5, "disable-atomicity"),
]

script_test_s1 = """
{flags}
local N = ARGV[1]

-- fill each list with its k value
for i, k in pairs(KEYS) do
  for j = 1, N do
    redis.call('LPUSH', k, i-1)
  end
end

-- rotate #KEYS times
for l = 1, #KEYS do
  for j = 1, N do
    for i, k in pairs(KEYS) do
      redis.call('LMOVE', k, KEYS[i%#KEYS+1], 'LEFT', 'RIGHT')
    end
  end
end


return 'OK'
"""


@pytest.mark.parametrize("t_master, t_replicas, num_ops, num_keys, num_par, flags", script_cases)
async def test_scripts(df_factory, t_master, t_replicas, num_ops, num_keys, num_par, flags):
    master = df_factory.create(proactor_threads=t_master)
    replicas = [df_factory.create(proactor_threads=t) for i, t in enumerate(t_replicas)]

    df_factory.start_all([master] + replicas)

    c_master = master.client()
    c_replicas = [replica.client() for replica in replicas]
    for c_replica in c_replicas:
        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        await wait_available_async(c_replica)

    script = script_test_s1.format(flags=f"--!df flags={flags}" if flags else "")
    sha = await c_master.script_load(script)

    key_sets = [[f"{i}-{j}" for j in range(num_keys)] for i in range(num_par)]

    rsps = await asyncio.gather(
        *(c_master.evalsha(sha, len(keys), *keys, num_ops) for keys in key_sets)
    )
    assert rsps == ["OK"] * num_par

    await check_all_replicas_finished(c_replicas, c_master)

    for c_replica in c_replicas:
        for key_set in key_sets:
            for j, k in enumerate(key_set):
                l = await c_replica.lrange(k, 0, -1)
                assert l == [f"{j}"] * num_ops


@dfly_args({"proactor_threads": 4})
async def test_auth_master(df_factory, n_keys=20):
    masterpass = "requirepass"
    replicapass = "replicapass"
    master = df_factory.create(requirepass=masterpass)
    replica = df_factory.create(logtostdout=True, masterauth=masterpass, requirepass=replicapass)

    df_factory.start_all([master, replica])

    c_master = master.client(password=masterpass)
    c_replica = replica.client(password=replicapass)

    # Connect replica to master
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # Set keys
    pipe = c_master.pipeline(transaction=False)
    batch_fill_data(pipe, gen_test_data(n_keys))
    await pipe.execute()

    # Check replica finished executing the replicated commands
    await check_all_replicas_finished([c_replica], c_master)
    # Check keys are on replica
    res = await c_replica.mget(k for k, _ in gen_test_data(n_keys))
    assert all(v is not None for v in res)
    await c_master.connection_pool.disconnect()
    await c_replica.connection_pool.disconnect()


SCRIPT_TEMPLATE = "return {}"


@dfly_args({"proactor_threads": 2})
async def test_script_transfer(df_factory):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # Load some scripts into master ahead
    scripts = []
    for i in range(0, 10):
        sha = await c_master.script_load(SCRIPT_TEMPLATE.format(i))
        scripts.append(sha)

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # transfer in stable state
    for i in range(10, 20):
        sha = await c_master.script_load(SCRIPT_TEMPLATE.format(i))
        scripts.append(sha)

    await check_all_replicas_finished([c_replica], c_master)
    await c_replica.execute_command("REPLICAOF NO ONE")

    for i, sha in enumerate(scripts):
        assert await c_replica.evalsha(sha, 0) == i
    await c_master.connection_pool.disconnect()
    await c_replica.connection_pool.disconnect()


@dfly_args({"proactor_threads": 4})
async def test_role_command(df_factory, n_keys=20):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    assert await c_master.execute_command("role") == ["master", []]
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # It may take a bit more time to actually propagate the role change
    # See https://github.com/dragonflydb/dragonfly/pull/2111
    await asyncio.sleep(1)

    assert await c_master.execute_command("role") == [
        "master",
        [["127.0.0.1", str(replica.port), "online"]],
    ]
    assert await c_replica.execute_command("role") == [
        "slave",
        "localhost",
        str(master.port),
        "online",
    ]

    # This tests that we react fast to socket shutdowns and don't hang on
    # things like the ACK or execution fibers.
    master.stop()
    await asyncio.sleep(0.1)
    assert await c_replica.execute_command("role") == [
        "slave",
        "localhost",
        str(master.port),
        "connecting",
    ]

    await c_master.connection_pool.disconnect()
    await c_replica.connection_pool.disconnect()


def parse_lag(replication_info: str):
    lags = re.findall("lag=([0-9]+)\r\n", replication_info)
    assert len(lags) == 1
    return int(lags[0])


async def get_metric_value(inst, metric_name, sample_index=0):
    return (await inst.metrics())[metric_name].samples[sample_index].value


async def assert_lag_condition(inst, client, condition):
    """
    Since lag is a bit random, and we want stable tests, we check
    10 times in quick succession and validate that the condition
    is satisfied at least once.
    We check both `INFO REPLICATION` redis protocol and the `/metrics`
    prometheus endpoint.
    """
    for _ in range(10):
        lag = await get_metric_value(inst, "dragonfly_connected_replica_lag_records")
        if condition(lag):
            break
        print("current prometheus lag =", lag)
        await asyncio.sleep(0.05)
    else:
        assert False, "Lag from prometheus metrics has never satisfied condition!"
    for _ in range(10):
        lag = parse_lag(await client.execute_command("info replication"))
        if condition(lag):
            break
        print("current lag =", lag)
        await asyncio.sleep(0.05)
    else:
        assert False, "Lag has never satisfied condition!"


async def get_replica_reconnects_count(replica_inst):
    return await get_metric_value(replica_inst, "dragonfly_replica_reconnect_count")


async def assert_replica_reconnections(replica_inst, initial_reconnects_count):
    """
    Asserts that the replica has attempted to reconnect at least once.
    """
    reconnects_count = await get_replica_reconnects_count(replica_inst)
    if reconnects_count > initial_reconnects_count:
        return

    assert (
        False
    ), f"Expected reconnect count to increase by at least 1, but it did not. Initial dragonfly_replica_reconnect_count: {initial_reconnects_count}, current count: {reconnects_count}"


@dfly_args({"proactor_threads": 2})
async def test_replication_info(df_factory: DflyInstanceFactory, df_seeder_factory, n_keys=2000):
    master = df_factory.create()
    replica = df_factory.create(replication_acks_interval=100)
    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)
    await assert_lag_condition(master, c_master, lambda lag: lag == 0)

    seeder = df_seeder_factory.create(port=master.port, keys=n_keys, dbcount=2)
    fill_task = asyncio.create_task(seeder.run(target_ops=3000))
    await assert_lag_condition(master, c_master, lambda lag: lag > 30)
    seeder.stop()

    await fill_task
    await wait_available_async(c_replica)
    await assert_lag_condition(master, c_master, lambda lag: lag == 0)

    await c_master.connection_pool.disconnect()
    await c_replica.connection_pool.disconnect()


"""
Test flushall command that's invoked while in full sync mode.
This can cause an issue because it will be executed on each shard independently.
More details in https://github.com/dragonflydb/dragonfly/issues/1231
"""


@pytest.mark.large
@pytest.mark.exclude_epoll
async def test_flushall_in_full_sync(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=2)

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    # Fill master with test data
    seeder = DebugPopulateSeeder(key_target=100_000)
    await seeder.run(c_master)

    # Start replication and wait for full sync
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    async with async_timeout.timeout(3):
        await wait_for_replicas_state(c_replica, state="full_sync", timeout=0.05)

    syncid, _ = await c_replica.execute_command("DEBUG REPLICA OFFSET")

    # Issue FLUSHALL and record replica role at the same instant
    _, role = await asyncio.gather(c_master.execute_command("FLUSHALL"), c_replica.role())

    # Print warning if replication was too quick
    if role[3] != "full_sync":
        logging.error("!!! Full sync finished too fast. Adjust test parameters !!!")
        return

    # Run a few more commands on top
    post_seeder = SeederV2(key_target=100)
    await post_seeder.run(c_master, target_deviation=0.1)

    await check_all_replicas_finished([c_replica], c_master)

    # Check replica data consisten
    hash1, hash2 = await asyncio.gather(*(SeederV2.capture(c) for c in (c_master, c_replica)))
    assert hash1 == hash2

    # Make sure that a new sync ID is present, meaning replication restarted following FLUSHALL.
    new_syncid, _ = await c_replica.execute_command("DEBUG REPLICA OFFSET")
    assert new_syncid != syncid


"""
Test read-only scripts work with replication. EVAL_RO and the 'no-writes' flags are currently not supported.
"""

READONLY_SCRIPT = """
redis.call('GET', 'A')
redis.call('EXISTS', 'B')
return redis.call('GET', 'WORKS')
"""

WRITE_SCRIPT = """
redis.call('SET', 'A', 'ErrroR')
"""


async def test_readonly_script(df_factory):
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.set("WORKS", "YES")

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    await c_replica.eval(READONLY_SCRIPT, 3, "A", "B", "WORKS") == "YES"

    with pytest.raises(aioredis.ResponseError) as roe:
        await c_replica.eval(WRITE_SCRIPT, 1, "A")


take_over_cases = [
    [2, 2],
    [2, 4],
    [4, 2],
    [8, 8],
]


@pytest.mark.exclude_epoll
@pytest.mark.parametrize("master_threads, replica_threads", take_over_cases)
async def test_take_over_counters(df_factory, master_threads, replica_threads):
    master = df_factory.create(proactor_threads=master_threads)
    replica1 = df_factory.create(proactor_threads=replica_threads)
    replica2 = df_factory.create(proactor_threads=replica_threads)
    replica3 = df_factory.create(proactor_threads=replica_threads)
    df_factory.start_all([master, replica1, replica2, replica3])
    c_master = master.client()
    c1 = replica1.client()
    c_blocking = master.client()
    c2 = replica2.client()
    c3 = replica3.client()

    await c1.execute_command(f"REPLICAOF localhost {master.port}")
    await c2.execute_command(f"REPLICAOF localhost {master.port}")
    await c3.execute_command(f"REPLICAOF localhost {master.port}")

    await wait_available_async(c1)

    async def counter(key):
        value = 0
        await c_master.execute_command(f"SET {key} 0")
        start = time.time()
        while time.time() - start < 20:
            try:
                value = await c_master.execute_command(f"INCR {key}")
            except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e:
                break
        else:
            assert False, "The incrementing loop should be exited with a connection error"
        return key, value

    async def block_during_takeover():
        "Add a blocking command during takeover to make sure it doesn't block it."
        start = time.time()
        # The command should just be canceled
        assert await c_blocking.execute_command("BLPOP BLOCKING_KEY1 BLOCKING_KEY2 100") is None
        # And it should happen in reasonable amount of time.
        assert time.time() - start < 10

    async def delayed_takeover():
        await asyncio.sleep(1)
        await c1.execute_command(f"REPLTAKEOVER 5")

    _, _, *results = await asyncio.gather(
        delayed_takeover(), block_during_takeover(), *[counter(f"key{i}") for i in range(16)]
    )
    assert await c1.execute_command("role") == ["master", []]

    for key, client_value in results:
        replicated_value = await c1.get(key)
        assert client_value == int(replicated_value)


@pytest.mark.exclude_epoll
@pytest.mark.parametrize("master_threads, replica_threads", take_over_cases)
async def test_take_over_seeder(
    request, df_factory, df_seeder_factory, master_threads, replica_threads
):
    master = df_factory.create(
        proactor_threads=master_threads, dbfilename=f"dump_{tmp_file_name()}", admin_port=ADMIN_PORT
    )
    replica = df_factory.create(proactor_threads=replica_threads)
    df_factory.start_all([master, replica])

    seeder = df_seeder_factory.create(port=master.port, keys=1000, dbcount=5, stop_on_failure=False)

    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.admin_port}")
    await wait_available_async(c_replica)

    fill_task = asyncio.create_task(seeder.run())

    stop_info = False

    async def info_replication():
        my_client = replica.client()
        while not stop_info:
            await my_client.info("replication")
            await asyncio.sleep(0.5)

    info_task = asyncio.create_task(info_replication())

    # Give the seeder a bit of time.
    await asyncio.sleep(3)
    logging.debug("running repltakover")
    await c_replica.execute_command(f"REPLTAKEOVER 30 SAVE")
    logging.debug("after running repltakover")
    seeder.stop()
    await fill_task

    assert await c_replica.execute_command("role") == ["master", []]
    stop_info = True
    await info_task

    @assert_eventually
    async def assert_master_exists():
        assert master.proc.poll() == 0, "Master process did not exit correctly."

    await assert_master_exists()

    master.start()
    c_master = master.client()
    await wait_available_async(c_master)

    capture = await seeder.capture(port=master.port)
    assert await seeder.compare(capture, port=replica.port)


@pytest.mark.parametrize("master_threads, replica_threads", [[4, 4]])
async def test_take_over_read_commands(df_factory, master_threads, replica_threads):
    master = df_factory.create(proactor_threads=master_threads)
    replica = df_factory.create(proactor_threads=replica_threads)
    df_factory.start_all([master, replica])

    c_master = master.client(socket_timeout=1, socket_connect_timeout=1)
    await c_master.execute_command("SET foo bar")

    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    async def prompt():
        client = replica.client()
        master_alive = True
        for i in range(10):
            # TODO remove try block when we no longer shut down master after take over
            if master_alive:
                try:
                    res = await c_master.execute_command("GET foo")
                    assert res == "bar"
                    res = await c_master.execute_command("CONFIG SET aclfile myfile")
                    assert res == "OK"
                except:
                    master_alive = False
            res = await client.execute_command("GET foo")
            assert res == "bar"

    promt_task = asyncio.create_task(prompt())
    await c_replica.execute_command(f"REPLTAKEOVER 5")

    assert await c_replica.execute_command("role") == ["master", []]
    await promt_task


async def test_take_over_timeout(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)
    df_factory.start_all([master, replica])

    seeder = df_seeder_factory.create(port=master.port, keys=1000, dbcount=5, stop_on_failure=False)

    c_master = master.client()
    c_replica = replica.client()

    logging.debug(f"PORTS ARE:  {master.port} {replica.port}")

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    fill_task = asyncio.create_task(seeder.run(target_ops=3000))

    # Give the seeder a bit of time.
    await asyncio.sleep(1)
    try:
        await c_replica.execute_command(f"REPLTAKEOVER 0")
    except redis.exceptions.ResponseError as e:
        # Should fail with detailed error message
        assert str(e).startswith("Couldn't execute takeover")
        # Verify it includes diagnostic information
        assert ":" in str(e), "Error message should include diagnostic details"
    else:
        assert False, "Takeover should not succeed."
    seeder.stop()
    await fill_task

    assert await c_master.execute_command("role") == [
        "master",
        [["127.0.0.1", str(replica.port), "online"]],
    ]
    assert await c_replica.execute_command("role") == [
        "slave",
        "localhost",
        str(master.port),
        "online",
    ]


# 1. Number of master threads
# 2. Number of threads for each replica
replication_cases = [(8, 8)]


@pytest.mark.parametrize("t_master, t_replica", replication_cases)
async def test_no_tls_on_admin_port(
    df_factory: DflyInstanceFactory,
    df_seeder_factory,
    t_master,
    t_replica,
    with_tls_server_args,
):
    # 1. Spin up dragonfly without tls, debug populate
    master = df_factory.create(
        no_tls_on_admin_port="true",
        admin_port=ADMIN_PORT,
        **with_tls_server_args,
        requirepass="XXX",
        proactor_threads=t_master,
    )
    master.start()
    c_master = master.admin_client(password="XXX")
    await c_master.execute_command("DEBUG POPULATE 100")
    db_size = await c_master.execute_command("DBSIZE")
    assert 100 == db_size

    # 2. Spin up a replica and initiate a REPLICAOF
    replica = df_factory.create(
        no_tls_on_admin_port="true",
        admin_port=ADMIN_PORT + 1,
        **with_tls_server_args,
        proactor_threads=t_replica,
        requirepass="XXX",
        masterauth="XXX",
    )
    replica.start()
    c_replica = replica.admin_client(password="XXX")
    res = await c_replica.execute_command("REPLICAOF localhost " + str(master.admin_port))
    assert "OK" == res
    await check_all_replicas_finished([c_replica], c_master)

    # 3. Verify that replica dbsize == debug populate key size -- replication works
    db_size = await c_replica.execute_command("DBSIZE")
    assert 100 == db_size


# 1. Number of master threads
# 2. Number of threads for each replica
# 3. Admin port
replication_cases = [(8, 8, False), (8, 8, True)]


@pytest.mark.parametrize("t_master, t_replica, test_admin_port", replication_cases)
async def test_tls_replication(
    df_factory,
    df_seeder_factory,
    t_master,
    t_replica,
    test_admin_port,
    with_ca_tls_server_args,
    with_ca_tls_client_args,
):
    # 1. Spin up dragonfly tls enabled, debug populate
    master = df_factory.create(
        tls_replication="true",
        **with_ca_tls_server_args,
        port=1111,
        admin_port=ADMIN_PORT,
        proactor_threads=t_master,
    )
    master.start()
    c_master = master.client(**with_ca_tls_client_args)
    await c_master.execute_command("DEBUG POPULATE 100")
    db_size = await c_master.execute_command("DBSIZE")
    assert 100 == db_size

    proxy = Proxy(
        "127.0.0.1", 1114, "127.0.0.1", master.port if not test_admin_port else master.admin_port
    )
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # 2. Spin up a replica and initiate a REPLICAOF
    replica = df_factory.create(
        tls_replication="true",
        **with_ca_tls_server_args,
        proactor_threads=t_replica,
    )
    replica.start()
    c_replica = replica.client(**with_ca_tls_client_args)
    res = await c_replica.execute_command("REPLICAOF localhost " + str(proxy.port))
    assert "OK" == res
    await check_all_replicas_finished([c_replica], c_master)

    # 3. Verify that replica dbsize == debug populate key size -- replication works
    db_size = await c_replica.execute_command("DBSIZE")
    assert 100 == db_size

    # 4. Break the connection between master and replica
    await proxy.close(proxy_task)
    await asyncio.sleep(3)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    # Check replica gets new keys
    await c_master.execute_command("SET MY_KEY 1")
    db_size = await c_master.execute_command("DBSIZE")
    assert 101 == db_size

    await check_all_replicas_finished([c_replica], c_master)
    db_size = await c_replica.execute_command("DBSIZE")
    assert 101 == db_size

    await proxy.close(proxy_task)


@dfly_args({"proactor_threads": 2})
async def test_tls_replication_without_ca(
    df_factory,
    df_seeder_factory,
    with_tls_server_args,
    with_ca_tls_client_args,
):
    # 1. Spin up dragonfly tls enabled, debug populate
    master = df_factory.create(tls_replication="true", **with_tls_server_args, requirepass="hi")
    master.start()
    # Somehow redis-py forces to verify the certificate and it fails
    # TODO investigate why and remove with_ca_tls_clients_args
    c_master = master.client(password="hi", **with_ca_tls_client_args)
    await c_master.execute_command("DEBUG POPULATE 100")

    # 2. Spin up a replica and initiate a REPLICAOF
    replica = df_factory.create(
        tls_replication="true", **with_tls_server_args, masterauth="hi", requirepass="hi"
    )
    replica.start()

    c_replica = replica.client(password="hi", **with_ca_tls_client_args)

    res = await c_replica.execute_command("REPLICAOF localhost " + str(master.port))
    assert "OK" == res
    await check_all_replicas_finished([c_replica], c_master)
    assert 100 == await c_replica.execute_command("dbsize")


@pytest.mark.exclude_epoll
async def test_ipv6_replication(df_factory: DflyInstanceFactory):
    """Test that IPV6 addresses work for replication, ::1 is 127.0.0.1 localhost"""
    master = df_factory.create(proactor_threads=1, bind="::1", port=1111)
    replica = df_factory.create(proactor_threads=1, bind="::1", port=1112)

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    assert await c_master.ping()
    assert await c_replica.ping()
    assert await c_replica.execute_command("REPLICAOF", master["bind"], master["port"]) == "OK"


# busy wait for 'replica' instance to have replication status 'status'
async def wait_for_replica_status(
    replica: aioredis.Redis, status: str, wait_for_seconds=0.01, timeout=20
):
    start = time.time()
    while (time.time() - start) < timeout:
        await asyncio.sleep(wait_for_seconds)

        info = await replica.info("replication")
        if info["master_link_status"] == status:
            return
    raise RuntimeError("Client did not become available in time!")


async def test_replicaof_flag(df_factory):
    # tests --replicaof works under normal conditions
    master = df_factory.create(
        proactor_threads=2,
    )

    # set up master
    master.start()
    c_master = master.client()
    await c_master.set("KEY", "VALUE")
    db_size = await c_master.dbsize()
    assert 1 == db_size

    replica = df_factory.create(
        proactor_threads=2,
        replicaof=f"localhost:{master.port}",  # start to replicate master
    )

    # set up replica. check that it is replicating
    replica.start()
    c_replica = replica.client()

    await wait_available_async(c_replica)  # give it time to startup
    # wait until we have a connection
    await check_all_replicas_finished([c_replica], c_master)

    dbsize = await c_replica.dbsize()
    assert 1 == dbsize

    val = await c_replica.get("KEY")
    assert "VALUE" == val


async def test_replicaof_flag_replication_waits(df_factory):
    # tests --replicaof works when we launch replication before the master
    BASE_PORT = 1111
    replica = df_factory.create(
        proactor_threads=2,
        replicaof=f"localhost:{BASE_PORT}",  # start to replicate master
    )

    # set up replica first
    replica.start()
    c_replica = replica.client()
    await wait_for_replica_status(c_replica, status="down")

    # check that it is in replica mode, yet status is down
    info = await c_replica.info("replication")
    assert info["role"] == "slave"
    assert info["master_host"] == "localhost"
    assert info["master_port"] == BASE_PORT
    assert info["master_link_status"] == "down"

    # set up master
    master = df_factory.create(
        port=BASE_PORT,
        proactor_threads=2,
    )

    master.start()
    c_master = master.client()
    await c_master.set("KEY", "VALUE")
    db_size = await c_master.dbsize()
    assert 1 == db_size

    # check that replication works now
    await wait_for_replica_status(c_replica, status="up")
    await check_all_replicas_finished([c_replica], c_master)

    dbsize = await c_replica.dbsize()
    assert 1 == dbsize

    val = await c_replica.get("KEY")
    assert "VALUE" == val


async def test_replicaof_flag_disconnect(df_factory):
    # test stopping replication when started using --replicaof
    master = df_factory.create(
        proactor_threads=2,
    )

    # set up master
    master.start()
    c_master = master.client()
    await wait_available_async(c_master)

    await c_master.set("KEY", "VALUE")
    db_size = await c_master.dbsize()
    assert 1 == db_size

    replica = df_factory.create(
        proactor_threads=2,
        replicaof=f"localhost:{master.port}",  # start to replicate master
    )

    # set up replica. check that it is replicating
    replica.start()

    c_replica = replica.client()
    await wait_available_async(c_replica)
    await check_all_replicas_finished([c_replica], c_master)

    dbsize = await c_replica.dbsize()
    assert 1 == dbsize

    val = await c_replica.get("KEY")
    assert "VALUE" == val

    await c_replica.replicaof("no", "one")  # disconnect

    role = await c_replica.role()
    assert role[0] == "master"


async def test_df_crash_on_memcached_error(df_factory):
    master = df_factory.create(
        memcached_port=11211,
        proactor_threads=2,
    )

    replica = df_factory.create(
        memcached_port=master.mc_port + 1,
        proactor_threads=2,
    )

    master.start()
    replica.start()

    c_master = master.client()
    await wait_available_async(c_master)

    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    memcached_client = pymemcache.Client(f"127.0.0.1:{replica.mc_port}")

    with pytest.raises(pymemcache.exceptions.MemcacheServerError):
        memcached_client.set("key", "data", noreply=False)


async def test_df_crash_on_replicaof_flag(df_factory):
    master = df_factory.create(
        proactor_threads=2,
    )
    master.start()

    replica = df_factory.create(proactor_threads=2, replicaof=f"127.0.0.1:{master.port}")
    replica.start()

    c_master = master.client()
    c_replica = replica.client()

    await wait_available_async(c_master)
    await wait_available_async(c_replica)

    res = await c_replica.execute_command("SAVE DF myfile")
    assert "OK" == res

    res = await c_replica.execute_command("DBSIZE")
    assert res == 0


async def test_network_disconnect(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=6)
    replica = df_factory.create(proactor_threads=4)

    df_factory.start_all([replica, master])
    seeder = df_seeder_factory.create(port=master.port)

    async with replica.client() as c_replica:
        await seeder.run(target_deviation=0.1)

        proxy = Proxy("127.0.0.1", 1111, "127.0.0.1", master.port)
        await proxy.start()
        task = asyncio.create_task(proxy.serve())
        try:
            await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")

            for _ in range(10):
                await asyncio.sleep(random.randint(0, 10) / 10)
                proxy.drop_connection()

            # Give time to detect dropped connection and reconnect
            await asyncio.sleep(1.0)
            await wait_available_async(c_replica)

            capture = await seeder.capture()
            assert await seeder.compare(capture, replica.port)
        finally:
            await proxy.close(task)


async def test_network_disconnect_active_stream(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=4, shard_repl_backlog_len=4000)
    replica = df_factory.create(proactor_threads=4)

    df_factory.start_all([replica, master])
    seeder = df_seeder_factory.create(port=master.port)

    async with replica.client() as c_replica, master.client() as c_master:
        await seeder.run(target_deviation=0.1)

        proxy = Proxy("127.0.0.1", 1112, "127.0.0.1", master.port)
        await proxy.start()
        task = asyncio.create_task(proxy.serve())
        try:
            await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")

            fill_task = asyncio.create_task(seeder.run(target_ops=4000))

            for _ in range(3):
                await asyncio.sleep(random.randint(10, 20) / 10)
                proxy.drop_connection()

            seeder.stop()
            await fill_task

            # Give time to detect dropped connection and reconnect
            await asyncio.sleep(1.0)
            await wait_available_async(c_replica)

            logging.debug(await c_replica.execute_command("INFO REPLICATION"))
            logging.debug(await c_master.execute_command("INFO REPLICATION"))

            capture = await seeder.capture()
            assert await seeder.compare(capture, replica.port)
        finally:
            await proxy.close(task)


async def test_network_disconnect_small_buffer(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=4, shard_repl_backlog_len=1)
    replica = df_factory.create(proactor_threads=4)

    df_factory.start_all([replica, master])
    seeder = df_seeder_factory.create(port=master.port)

    async with replica.client() as c_replica, master.client() as c_master:
        await seeder.run(target_deviation=0.1)

        proxy = Proxy("127.0.0.1", 1113, "127.0.0.1", master.port)
        await proxy.start()
        task = asyncio.create_task(proxy.serve())

        try:
            await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")

            # Wait for the two nodes to be in sync (stable state replication)
            await wait_available_async(c_replica)

            # Now start seeding and dropping
            fill_task = asyncio.create_task(seeder.run())

            for _ in range(3):
                await asyncio.sleep(random.randint(5, 10) / 10)
                proxy.drop_connection()

            seeder.stop()
            await fill_task

            # Give time to detect dropped connection and reconnect
            await asyncio.sleep(1.0)
            await wait_available_async(c_replica)

            # logging.debug(await c_replica.execute_command("INFO REPLICATION"))
            # logging.debug(await c_master.execute_command("INFO REPLICATION"))
            capture = await seeder.capture()
            assert await seeder.compare(capture, replica.port)
        finally:
            await proxy.close(task)

    info = await c_replica.info("replication")
    master.stop()
    lines = master.find_in_logs("Partial sync requested from stale LSN")
    assert len(lines) > 0


async def test_replica_reconnections_after_network_disconnect(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=6)
    replica = df_factory.create(proactor_threads=4)

    df_factory.start_all([replica, master])
    seeder = df_seeder_factory.create(port=master.port)

    async with replica.client() as c_replica:
        await seeder.run(target_deviation=0.1)

        proxy = Proxy("127.0.0.1", 1115, "127.0.0.1", master.port)
        await proxy.start()
        task = asyncio.create_task(proxy.serve())
        try:
            await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")

            # Wait replica to be up and synchronized with master
            await wait_available_async(c_replica)

            initial_reconnects_count = await get_replica_reconnects_count(replica)

            # Fully drop the server
            await proxy.close(task)

            # After dropping the connection replica should try to reconnect
            await wait_for_replica_status(c_replica, status="down")
            await asyncio.sleep(2)

            # Restart the proxy
            await proxy.start()
            task = asyncio.create_task(proxy.serve())

            # Wait replica to be reconnected and synchronized with master
            await wait_available_async(c_replica)

            capture = await seeder.capture()
            assert await seeder.compare(capture, replica.port)

            # Assert replica reconnects metrics increased
            await assert_replica_reconnections(replica, initial_reconnects_count)

        finally:
            await proxy.close(task)


async def test_search(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=4)

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # First, create an index on replica
    await c_replica.execute_command("FT.CREATE", "idx-r", "SCHEMA", "f1", "numeric")
    for i in range(0, 10):
        await c_replica.hset(f"k{i}", mapping={"f1": i})
    assert (await c_replica.ft("idx-r").search("@f1:[5 9]")).total == 5

    # Second, create an index on master
    await c_master.execute_command("FT.CREATE", "idx-m", "SCHEMA", "f2", "numeric")
    for i in range(0, 10):
        await c_master.hset(f"k{i}", mapping={"f2": i * 2})
    assert (await c_master.ft("idx-m").search("@f2:[6 10]")).total == 3

    # Replicate
    await c_replica.execute_command("REPLICAOF", "localhost", master.port)
    await wait_available_async(c_replica)

    # Check master index was picked up and original index was deleted
    assert (await c_replica.execute_command("FT._LIST")) == ["idx-m"]

    # Check query from master runs on replica
    assert (await c_replica.ft("idx-m").search("@f2:[6 10]")).total == 3

    # Set a new key
    await c_master.hset("kNEW", mapping={"f2": 100})
    await asyncio.sleep(0.1)

    assert (await c_replica.ft("idx-m").search("@f2:[100 100]")).docs[0].id == "kNEW"

    # Create a new aux index on master
    await c_master.execute_command("FT.CREATE", "idx-m2", "SCHEMA", "f2", "numeric", "sortable")
    await asyncio.sleep(0.1)

    from redis.commands.search.query import Query

    assert (await c_replica.ft("idx-m2").search(Query("*").sort_by("f2").paging(0, 1))).docs[
        0
    ].id == "k0"


@dfly_args({"proactor_threads": 4})
async def test_search_with_stream(df_factory: DflyInstanceFactory):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # fill master with hsets and create index
    p = c_master.pipeline(transaction=False)
    for i in range(10_000):
        p.hset(f"k{i}", mapping={"name": f"name of {i}"})
    await p.execute()

    await c_master.execute_command("FT.CREATE i1 SCHEMA name text")

    # start replication and issue one add command and delete commands on master in parallel
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await c_master.hset("secret-key", mapping={"name": "new-secret"})
    for i in range(1_000):
        await c_master.delete(f"k{i}")

    # expect replica to see only 10k - 1k + 1 = 9001 keys in it's index
    await wait_available_async(c_replica)
    await check_all_replicas_finished([c_replica], c_master)
    assert await c_replica.execute_command("FT.SEARCH i1 * LIMIT 0 0") == [9_001]
    assert await c_replica.execute_command('FT.SEARCH i1 "secret"') == [
        1,
        "secret-key",
        ["name", "new-secret"],
    ]


# @pytest.mark.large
async def test_client_pause_with_replica(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=4)
    df_factory.start_all([master, replica])

    seeder = df_seeder_factory.create(port=master.port)

    c_master = master.client()
    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    fill_task = asyncio.create_task(seeder.run())

    # Give the seeder a bit of time.
    await asyncio.sleep(1)
    # block the seeder for 4 seconds
    await c_master.execute_command("client pause 4000 write")
    stats = await c_master.info("CommandStats")
    await asyncio.sleep(0.5)
    stats_after_sleep = await c_master.info("CommandStats")
    # Check no commands are executed except info and replconf called from replica
    for cmd, cmd_stats in stats_after_sleep.items():
        if cmd in ["cmdstat_info", "cmdstat_replconf", "cmdstat_multi"]:
            continue
        assert stats[cmd] == cmd_stats, cmd

    await asyncio.sleep(6)
    seeder.stop()
    await fill_task
    stats_after_pause_finish = await c_master.info("CommandStats")
    more_exeuted = False
    for cmd, cmd_stats in stats_after_pause_finish.items():
        if "cmdstat_info" != cmd and "cmdstat_replconf" != cmd_stats and stats[cmd] != cmd_stats:
            more_exeuted = True
    assert more_exeuted

    capture = await seeder.capture(port=master.port)
    assert await seeder.compare(capture, port=replica.port)


@pytest.mark.debug_only
@dfly_args({"proactor_threads": 2})
async def test_replicaof_reject_on_load(df_factory, df_seeder_factory):
    master = df_factory.create()
    replica = df_factory.create(dbfilename=f"dump_{tmp_file_name()}")
    df_factory.start_all([master, replica])

    c_replica = replica.client()

    await c_replica.execute_command(f"DEBUG POPULATE 1000 key 500 RAND type set elements 500")

    replica.stop()
    replica.start()
    # Disable retries so that BusyLoadingError is raised immediately.
    # redis-py >= 7 retries on ConnectionError by default, and BusyLoadingError
    # inherits from ConnectionError, causing the REPLICAOF to be silently
    # retried until loading finishes.
    from redis.retry import Retry
    from redis.backoff import NoBackoff

    c_replica = replica.client(retry=Retry(NoBackoff(), 0))

    @assert_eventually
    async def check_replica_isloading():
        persistence = await c_replica.info("PERSISTENCE")
        assert persistence["loading"] == 1

    # If this fails adjust load of DEBUG POPULATE above.
    await check_replica_isloading()

    # Check replica of not alowed while loading snapshot
    # Keep in mind that if the exception has not been raised, it doesn't mean
    # that there is a bug because it could be the case that while executing
    # INFO PERSISTENCE df is in loading state but when we call REPLICAOF df
    # is no longer in loading state and the assertion false is triggered.
    with pytest.raises(aioredis.BusyLoadingError):
        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # Check one we finish loading snapshot replicaof success
    await wait_available_async(c_replica, timeout=180)
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")


async def test_heartbeat_eviction_propagation(df_factory):
    master = df_factory.create(
        proactor_threads=1, cache_mode="true", maxmemory="256mb", enable_heartbeat_eviction="false"
    )
    replica = df_factory.create(proactor_threads=1)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # fill the master to use about 233mb > 256mb * 0.9, which will trigger heartbeat eviction.
    await c_master.execute_command("DEBUG POPULATE 233 size 1048576")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # now enable heart beat eviction
    await c_master.execute_command("CONFIG SET enable_heartbeat_eviction true")

    while True:
        info = await c_master.info("stats")
        evicted_1 = info["evicted_keys"]
        time.sleep(2)
        info = await c_master.info("stats")
        evicted_2 = info["evicted_keys"]
        if evicted_2 == evicted_1:
            break
        else:
            print("waiting for eviction to finish...", end="\r", flush=True)

    await check_all_replicas_finished([c_replica], c_master)
    keys_master = await c_master.execute_command("keys *")
    keys_replica = await c_replica.execute_command("keys *")
    assert set(keys_master) == set(keys_replica)


async def test_policy_based_eviction_propagation(df_factory, df_seeder_factory):
    master = df_factory.create(
        proactor_threads=2,
        cache_mode="true",
        maxmemory="512mb",
        enable_heartbeat_eviction="false",
        rss_oom_deny_ratio=1.3,
    )
    replica = df_factory.create(proactor_threads=2)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("DEBUG POPULATE 6000 size 88000")

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    seeder = df_seeder_factory.create(
        port=master.port, keys=600, val_size=1000, stop_on_failure=False
    )
    await seeder.run(target_deviation=0.1)

    info = await c_master.info("stats")
    assert (
        info["evicted_keys"] > 0
    ), f"Weak testcase: policy based eviction was not triggered. {await c_master.info()}"

    await check_all_replicas_finished([c_replica], c_master)

    # KEYS may trigger lazy expiry on master, generating DELs not yet received by replica.
    # Fetch master keys first, then re-sync to ensure replica applies any resulting DELs.
    keys_master = await c_master.execute_command("keys k*")
    await check_all_replicas_finished([c_replica], c_master)
    keys_replica = await c_replica.execute_command("keys k*")

    assert set(keys_replica).difference(keys_master) == set()
    assert set(keys_master).difference(keys_replica) == set()


async def test_journal_doesnt_yield_issue_2500(df_factory, df_seeder_factory):
    """
    Issues many SETEX commands through a Lua script so that no yields are done between them.
    In parallel, connect a replica, so that these SETEX commands write their custom journal log.
    This makes sure that no Fiber context switch while inside a shard callback.
    """
    master = df_factory.create()
    replica = df_factory.create()
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    async def send_setex():
        script = """
        local charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"

        local random_string = function(length)
            local str = ''
            for i=1,length do
                str = str .. charset:sub(math.random(1, #charset))
            end
            return str
        end

        for i = 1, 200 do
            -- 200 iterations to make sure SliceSnapshot dest queue is full
            -- 100 bytes string to make sure serializer is big enough
            redis.call('SETEX', KEYS[1], 1000, random_string(100))
        end
        """

        for i in range(10):
            await asyncio.gather(
                *[c_master.eval(script, 1, random.randint(0, 1_000)) for j in range(3)]
            )

    stream_task = asyncio.create_task(send_setex())
    await asyncio.sleep(0.1)

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    assert not stream_task.done(), "Weak testcase. finished sending commands before replication."

    await wait_available_async(c_replica)
    await stream_task

    await check_all_replicas_finished([c_replica], c_master)
    keys_master = await c_master.execute_command("keys *")
    keys_replica = await c_replica.execute_command("keys *")
    assert set(keys_master) == set(keys_replica)


@pytest.mark.large
async def test_saving_replica(df_factory):
    master = df_factory.create(proactor_threads=1)
    replica = df_factory.create(proactor_threads=1, dbfilename=f"dump_{tmp_file_name()}")
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("DEBUG POPULATE 100000 key 4048 RAND")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    async def save_replica():
        await c_replica.execute_command("save")

    save_task = asyncio.create_task(save_replica())
    while not await is_saving(c_replica):  # wait for replica start saving
        assert "rdb_changes_since_last_success_save:0" not in await c_replica.execute_command(
            "info persistence"
        ), "Weak test case, finished saving too quickly"
        await asyncio.sleep(0.1)
    await c_replica.execute_command("replicaof no one")
    assert await is_saving(c_replica)
    await save_task
    assert not await is_saving(c_replica)


async def test_start_replicating_while_save(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=4, dbfilename=f"dump_{tmp_file_name()}")
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_replica.execute_command("DEBUG POPULATE 100000 key 4096 RAND")

    async def save_replica():
        await c_replica.execute_command("save")

    save_task = asyncio.create_task(save_replica())
    while not await is_saving(c_replica):  # wait for server start saving
        assert "rdb_changes_since_last_success_save:0" not in await c_replica.execute_command(
            "info persistence"
        ), "Weak test case, finished saving too quickly"
        await asyncio.sleep(0.1)
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    assert await is_saving(c_replica)
    await save_task
    assert not await is_saving(c_replica)


async def test_user_acl_replication(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=4)
    df_factory.start_all([master, replica])

    c_master = master.client()
    await c_master.execute_command("ACL SETUSER tmp >tmp ON +ping +dfly +replconf")
    await c_master.execute_command("SET foo bar")
    assert 1 == await c_master.execute_command("DBSIZE")

    c_replica = replica.client()
    await c_replica.execute_command("CONFIG SET masteruser tmp")
    await c_replica.execute_command("CONFIG SET masterauth tmp")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    await wait_available_async(c_replica)
    assert 1 == await c_replica.execute_command("DBSIZE")

    # revoke acl's from tmp
    await c_master.execute_command("ACL SETUSER tmp -replconf")
    async for info, breaker in info_tick_timer(c_replica, section="REPLICATION"):
        with breaker:
            assert info["master_link_status"] == "down"

    await c_master.execute_command("SET bar foo")

    # reinstate and let replication continue
    await c_master.execute_command("ACL SETUSER tmp +replconf")
    await check_all_replicas_finished([c_replica], c_master, 5)
    assert 2 == await c_replica.execute_command("DBSIZE")


@pytest.mark.parametrize("break_conn", [False, True])
async def test_replica_reconnect(df_factory, break_conn):
    """
    Test replica does not connect to master if master restarted
    step1: create master and replica
    step2: stop master and start again with the same port
    step3: check replica is not replicating the restarted master
    step4: issue new replicaof command
    step5: check replica replicates master
    """
    # Connect replica to master
    master = df_factory.create(proactor_threads=1)
    replica = df_factory.create(proactor_threads=1, break_replication_on_master_restart=break_conn)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.set("k", "12345")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)
    assert (await c_replica.info("REPLICATION"))["master_link_status"] == "up"

    # kill existing master, create master with different repl_id but same port
    master_port = master.port
    master.stop()

    await asyncio.sleep(1)

    repl_info = await c_replica.info("REPLICATION")
    assert repl_info["master_link_status"] == "down", str(repl_info)

    master = df_factory.create(proactor_threads=1, port=master_port)
    df_factory.start_all([master])
    await asyncio.sleep(1)  # We sleep for 0.5s in replica.cc before reconnecting

    # Assert that replica did not reconnected to master with different repl_id
    if break_conn:
        assert await c_master.execute_command("get k") == None
        assert await c_replica.execute_command("get k") == "12345"
        assert await c_master.execute_command("set k 6789")
        assert await c_replica.execute_command("get k") == "12345"
        assert (await c_replica.info("REPLICATION"))["master_link_status"] == "down"
    else:
        assert await c_master.execute_command("get k") == None
        assert await c_replica.execute_command("get k") == None
        assert await c_master.execute_command("set k 6789")
        await check_all_replicas_finished([c_replica], c_master)
        assert await c_replica.execute_command("get k") == "6789"
        assert (await c_replica.info("REPLICATION"))["master_link_status"] == "up"

    # Force re-replication, assert that it worked
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)
    assert await c_replica.execute_command("get k") == "6789"


async def test_announce_ip_port(df_factory):
    master = df_factory.create()
    replica = df_factory.create(replica_announce_ip="overrode-host", announce_port="1337")

    master.start()
    replica.start()

    # Connect clients, connect replica to master
    c_master = master.client()
    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    role, node = await c_master.execute_command("role")
    assert role == "master"
    host, port, _ = node[0]
    assert host == "overrode-host"
    assert port == "1337"


async def test_replication_timeout_on_full_sync(df_factory: DflyInstanceFactory, df_seeder_factory):
    # setting replication_timeout to a very small value to force the replica to timeout
    master = df_factory.create(
        replication_timeout=100, vmodule="replica=2,dflycmd=2,snapshot=1,rdb_save=1,rdb_load=1"
    )
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("debug", "populate", "200000", "foo", "5000", "RAND")
    seeder = df_seeder_factory.create(port=master.port)
    seeder_task = asyncio.create_task(seeder.run())

    await asyncio.sleep(0.5)  # wait for seeder running

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # wait for full sync
    async with async_timeout.timeout(3):
        await wait_for_replicas_state(c_replica, state="full_sync", timeout=0.05)

    await c_replica.execute_command(
        "debug replica pause"
    )  # pause replica to trigger reconnect on master

    await asyncio.sleep(1)

    await c_replica.execute_command("debug replica resume")  # resume replication

    await asyncio.sleep(1)  # replica will start resync
    seeder.stop()
    await seeder_task

    await check_all_replicas_finished([c_replica], c_master, timeout=60)
    await assert_replica_reconnections(replica, 0)


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 1})
async def test_master_stalled_disconnect(df_factory: DflyInstanceFactory):
    # disconnect after 1 second of being blocked
    master = df_factory.create(replication_timeout=1000)
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("debug", "populate", "200000", "foo", "500", "RAND")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    @assert_eventually
    async def check_replica_connected():
        repl_info = await c_master.info("replication")
        assert "slave0" in repl_info

    @assert_eventually
    async def check_replica_disconnected():
        repl_info = await c_master.info("replication")
        assert "slave0" not in repl_info

    await check_replica_connected()
    await c_replica.execute_command("DEBUG REPLICA PAUSE")
    await check_replica_connected()  # still connected
    await asyncio.sleep(1)  # wait for the master to recognize it's being blocked
    await check_replica_disconnected()


def download_dragonfly_release(version):
    path = f"/tmp/old_df/{version}"
    binary = f"{path}/dragonfly-x86_64"
    if os.path.isfile(binary):
        return binary

    # Cleanup in case there's partial files
    if os.path.exists(path):
        shutil.rmtree(path)

    os.makedirs(path)
    gzfile = f"{path}/dragonfly.tar.gz"
    logging.debug(f"Downloading Dragonfly release into {gzfile}...")

    # Download
    urllib.request.urlretrieve(
        f"https://github.com/dragonflydb/dragonfly/releases/download/{version}/dragonfly-x86_64.tar.gz",
        gzfile,
    )

    # Extract
    file = tarfile.open(gzfile)
    file.extractall(path)
    file.close()

    # Return path
    return binary


@pytest.mark.parametrize(
    "cluster_mode, announce_ip, announce_port",
    [
        ("", "localhost", 7000),
        ("emulated", "", 0),
        ("emulated", "localhost", 7000),
    ],
)
async def test_replicate_old_master(
    df_factory: DflyInstanceFactory, cluster_mode, announce_ip, announce_port
):
    cpu = platform.processor()
    if cpu != "x86_64":
        pytest.skip(f"Supported only on x64, running on {cpu}")

    dfly_version = "v1.19.2"
    released_dfly_path = download_dragonfly_release(dfly_version)
    master = df_factory.create(
        version=1.19,
        path=released_dfly_path,
        cluster_mode=cluster_mode,
    )
    replica = df_factory.create(
        cluster_mode=cluster_mode,
        cluster_announce_ip=announce_ip,
        announce_port=announce_port,
    )

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    assert (
        f"df-{dfly_version}"
        == (await c_master.execute_command("info", "server"))["dragonfly_version"]
    )
    assert dfly_version != (await c_replica.execute_command("info", "server"))["dragonfly_version"]

    await c_master.execute_command("set", "k1", "v1")

    assert await c_replica.execute_command(f"REPLICAOF localhost {master.port}") == "OK"
    await wait_available_async(c_replica)

    assert await c_replica.execute_command("get", "k1") == "v1"


# This Test was intorduced in response to a bug when replicating empty hashmaps (encoded as
# ziplists) created with HSET, HSETEX, HDEL and then replicated 2 times.
# For more information plz refer to the issue on gh:
# https://github.com/dragonflydb/dragonfly/issues/3504
@dfly_args({"proactor_threads": 1})
async def test_empty_hash_map_replicate_old_master(df_factory):
    cpu = platform.processor()
    if cpu != "x86_64":
        pytest.skip(f"Supported only on x64, running on {cpu}")

    dfly_version = "v1.21.2"
    released_dfly_path = download_dragonfly_release(dfly_version)
    # old versions
    instances = [df_factory.create(path=released_dfly_path, version=1.21) for i in range(3)]
    # new version
    instances.append(df_factory.create())

    df_factory.start_all(instances)

    old_c_master = instances[0].client()
    # Create an empty hashmap
    await old_c_master.execute_command("HSET foo a_field a_value")
    await old_c_master.execute_command("HSETEX foo 2 b_field b_value")
    await old_c_master.execute_command("HDEL foo a_field")

    @assert_eventually
    async def check_if_empty():
        assert await old_c_master.execute_command("HGETALL foo") == []

    await check_if_empty()
    assert await old_c_master.execute_command(f"EXISTS foo") == 1
    await old_c_master.aclose()

    async def assert_body(client, result=1, state="online", node_role="slave"):
        async with async_timeout.timeout(10):
            await wait_for_replicas_state(client, state=state, node_role=node_role)

        assert await client.execute_command(f"EXISTS foo") == result
        assert await client.execute_command("REPLTAKEOVER 1") == "OK"

    index = 0
    last_old_replica = 2

    # Adjacent pairs
    for a, b in zip(instances, instances[1:]):
        logging.debug(index)
        client_b = b.client()
        assert await client_b.execute_command(f"REPLICAOF localhost {a.port}") == "OK"

        if index != last_old_replica:
            await assert_body(client_b, state="stable_sync", node_role="replica")
        else:
            await assert_body(client_b, result=0)

        index = index + 1
        await client_b.aclose()


# This Test was intorduced in response to a bug when replicating empty hash maps with
# HSET, HSETEX, HDEL and then loaded via replication.
# For more information plz refer to the issue on gh:
# https://github.com/dragonflydb/dragonfly/issues/3504
@dfly_args({"proactor_threads": 1})
async def test_empty_hashmap_loading_bug(df_factory: DflyInstanceFactory):
    cpu = platform.processor()
    if cpu != "x86_64":
        pytest.skip(f"Supported only on x64, running on {cpu}")

    dfly_version = "v1.21.2"
    released_dfly_path = download_dragonfly_release(dfly_version)

    master = df_factory.create(path=released_dfly_path, version=1.21)
    master.start()

    c_master = master.client()
    # Create an empty hashmap
    await c_master.execute_command("HSET foo a_field a_value")
    await c_master.execute_command("HSETEX foo 2 b_field b_value")
    await c_master.execute_command("HDEL foo a_field")

    @assert_eventually
    async def check_if_empty():
        assert await c_master.execute_command("HGETALL foo") == []

    await check_if_empty()
    assert await c_master.execute_command(f"EXISTS foo") == 1

    replica = df_factory.create()
    replica.start()
    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)
    assert await c_replica.execute_command(f"dbsize") == 0


async def test_replicate_search_index_to_old_replica(df_factory: DflyInstanceFactory):
    """
    Test that a new master with search indices (including HNSW vector index) can
    replicate to a v1.35 replica. This verifies backward compatibility of replication
    when search indices are defined, ensuring the replica receives the data without
    errors from new RDB AUX fields (search-index, hnsw-index-metadata, HNSW opcodes).
    """
    cpu = platform.processor()
    if cpu != "x86_64":
        pytest.skip(f"Supported only on x64, running on {cpu}")

    dfly_version = "v1.35.1"
    released_dfly_path = download_dragonfly_release(dfly_version)

    # New master (current version) with search index
    master = df_factory.create(proactor_threads=2)
    # Old replica (v1.35)
    replica = df_factory.create(
        version=1.35,
        path=released_dfly_path,
        proactor_threads=2,
    )

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # Create a search index with HNSW vector field on the new master
    await c_master.execute_command(
        "FT.CREATE",
        "test_idx",
        "ON",
        "HASH",
        "PREFIX",
        "1",
        "item:",
        "SCHEMA",
        "name",
        "TEXT",
        "price",
        "NUMERIC",
        "SORTABLE",
        "category",
        "TAG",
        "embedding",
        "VECTOR",
        "HNSW",
        "6",
        "TYPE",
        "FLOAT32",
        "DIM",
        "2",
        "DISTANCE_METRIC",
        "L2",
    )

    # Insert test data with vector embeddings
    for i in range(100):
        category = "electronics" if i % 2 == 0 else "clothing"
        embedding = struct.pack("<2f", float(i), float(i * 2))
        await c_master.hset(
            f"item:{i}",
            mapping={
                "name": f"Product {i}",
                "price": str(i * 10),
                "category": category,
                "embedding": embedding,
            },
        )

    # Verify data and index on master
    assert await c_master.dbsize() == 100
    master_idx = c_master.ft("test_idx")
    text_result = await master_idx.search("Product 50")
    assert text_result.total >= 1

    # Verify KNN search on master
    query_vec = struct.pack("<2f", 50.0, 100.0)
    knn_result = await c_master.execute_command(
        "FT.SEARCH", "test_idx", "*=>[KNN 2 @embedding $vec]", "PARAMS", "2", "vec", query_vec
    )
    assert knn_result[0] >= 1
    assert "item:50" in knn_result

    # Start replication from new master to old replica
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # Verify data replicated successfully
    assert await c_replica.dbsize() == 100
    assert await c_replica.hget("item:0", "name") == "Product 0"
    assert await c_replica.hget("item:99", "name") == "Product 99"

    # Verify KNN search works on old replica (index rebuilt from replicated data)
    knn_result = await c_replica.execute_command(
        "FT.SEARCH", "test_idx", "*=>[KNN 2 @embedding $vec]", "PARAMS", "2", "vec", query_vec
    )
    assert knn_result[0] >= 1
    assert "item:50" in knn_result


async def test_replicating_mc_flags(df_factory):
    master = df_factory.create(memcached_port=11211, proactor_threads=1)
    replica = df_factory.create(
        memcached_port=11212, proactor_threads=1, dbfilename=f"dump_{tmp_file_name()}"
    )
    df_factory.start_all([master, replica])

    c_mc_master = pymemcache.Client(f"127.0.0.1:{master.mc_port}", default_noreply=False)

    c_replica = replica.client()

    assert c_mc_master.set("key1", "value0", noreply=True)
    assert c_mc_master.set("key2", "value2", noreply=True, expire=3600, flags=123456)
    assert c_mc_master.replace("key1", "value1", expire=4000, flags=2, noreply=True)

    c_master = master.client()
    for i in range(3, 100):
        await c_master.set(f"key{i}", f"value{i}")

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    c_mc_replica = pymemcache.Client(f"127.0.0.1:{replica.mc_port}", default_noreply=False)

    async def check_flag(key, flag):
        res = c_mc_replica.raw_command("get " + key, "END\r\n").split()
        # workaround sometimes memcached_client.raw_command returns empty str
        if len(res) > 2:
            assert res[2].decode() == str(flag)

    await check_flag("key1", 2)
    await check_flag("key2", 123456)

    for i in range(1, 100):
        assert c_mc_replica.get(f"key{i}") == str.encode(f"value{i}")


async def test_double_take_over(df_factory, df_seeder_factory):
    master = df_factory.create(proactor_threads=4, dbfilename="", admin_port=ADMIN_PORT)
    replica = df_factory.create(proactor_threads=4, dbfilename="", admin_port=ADMIN_PORT + 1)
    df_factory.start_all([master, replica])

    seeder = df_seeder_factory.create(port=master.port, keys=1000, dbcount=5, stop_on_failure=False)
    await seeder.run(target_deviation=0.1)

    capture = await seeder.capture(port=master.port)

    c_replica = replica.client()

    logging.debug("start replication")
    await c_replica.execute_command(f"REPLICAOF localhost {master.admin_port}")
    await wait_available_async(c_replica)

    logging.debug("running repltakover")
    await c_replica.execute_command(f"REPLTAKEOVER 10")
    assert await c_replica.execute_command("role") == ["master", []]

    @assert_eventually
    async def check_master_status():
        assert master.proc.poll() == 0, "Master process did not exit correctly."

    await check_master_status()

    logging.debug("restart previous master")
    master.start()
    c_master = master.client()

    logging.debug("start second replication")
    await c_master.execute_command(f"REPLICAOF localhost {replica.admin_port}")
    await wait_available_async(c_master)

    logging.debug("running second repltakover")
    await c_master.execute_command(f"REPLTAKEOVER 10")
    assert await c_master.execute_command("role") == ["master", []]

    assert await seeder.compare(capture, port=master.port)


async def test_replica_of_replica(df_factory):
    # Can't connect a replica to a replica, but OK to connect 2 replicas to the same master
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)
    replica2 = df_factory.create(proactor_threads=2)

    df_factory.start_all([master, replica, replica2])

    c_replica = replica.client()
    c_replica2 = replica2.client()

    assert await c_replica.execute_command(f"REPLICAOF localhost {master.port}") == "OK"

    with pytest.raises(redis.exceptions.ResponseError):
        await c_replica2.execute_command(f"REPLICAOF localhost {replica.port}")

    assert await c_replica2.execute_command(f"REPLICAOF localhost {master.port}") == "OK"


@pytest.mark.large
async def test_replication_timeout_on_full_sync_heartbeat_expiry(
    df_factory: DflyInstanceFactory, df_seeder_factory
):
    # Timeout set to 3 seconds because we must first saturate the socket such that subsequent
    # writes block. Otherwise, we will break the flows before Heartbeat actually deadlocks.
    master = df_factory.create(
        proactor_threads=2, replication_timeout=3000, vmodule="replica=2,dflycmd=2"
    )
    replica = df_factory.create()

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("debug", "populate", "100000", "foo", "5000", "RAND")

    c_master = master.client()
    c_replica = replica.client()

    seeder = ExpirySeeder()
    seeder_task = asyncio.create_task(seeder.run(c_master))
    await seeder.wait_until_n_inserts(50000)
    seeder.stop()
    await seeder_task

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # wait for full sync
    async with async_timeout.timeout(3):
        await wait_for_replicas_state(c_replica, state="full_sync", timeout=0.05)

    await c_replica.execute_command("debug replica pause")

    # Dragonfly would get stuck here without the bug fix. When replica does not read from the
    # socket, Heartbeat() will block on the journal write for the expired items and shard_handler
    # would never be called and break replication. More details on #3936.

    await asyncio.sleep(6)

    await c_replica.execute_command("debug replica resume")  # resume replication

    await asyncio.sleep(1)  # replica will start resync

    await check_all_replicas_finished([c_replica], c_master, 60)
    await assert_replica_reconnections(replica, 0)


@pytest.mark.exclude_epoll
@dfly_args({"proactor_threads": 1})
async def test_memory_on_big_string_loading(df_factory):
    """
    In this test we want to make sure there is no spike in rss while loading big string value
    1. insert 1 big value to master
    2. replicate master
    3. check rss peak memory on replica node
    """
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    logging.debug("Populate with one big string")
    await c_master.execute_command("DEBUG POPULATE 1 key 200000000 RAND")

    async def get_memory(client, field):
        info = await client.info("memory")
        return info[field]

    logging.debug("Start replication and wait for full sync")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)

    await c_replica.execute_command("memory decommit")
    await asyncio.sleep(1)
    replica_peak_memory = await get_memory(c_replica, "used_memory_peak_rss")
    replica_used_memory = await get_memory(c_replica, "used_memory_rss")

    logging.info(f"Replica Used memory {replica_used_memory}, peak memory {replica_peak_memory}")
    assert replica_peak_memory < 1.1 * replica_used_memory

    # Check replica data consistent
    replica_data = await DebugPopulateSeeder.capture(c_replica)
    master_data = await DebugPopulateSeeder.capture(c_master)
    assert master_data == replica_data


@pytest.mark.exclude_epoll
@pytest.mark.parametrize(
    "element_size, elements_number",
    [(16, 30000), (30000, 16)],
)
@dfly_args({"proactor_threads": 1})
async def test_big_containers(df_factory, element_size, elements_number):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    logging.debug("Fill master with test data")
    seeder = DebugPopulateSeeder(
        key_target=50,
        data_size=element_size * elements_number,
        collection_size=elements_number,
        variance=1,
        samples=1,
        types=["LIST", "SET", "ZSET", "HASH", "STREAM"],
    )
    await seeder.run(c_master)

    async def get_memory(client, field):
        info = await client.info("memory")
        return info[field]

    await asyncio.sleep(1)  # wait for heartbeat to update rss memory
    used_memory = await get_memory(c_master, "used_memory_rss")

    logging.debug("Start replication and wait for full sync")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)

    peak_memory = await get_memory(c_master, "used_memory_peak_rss")

    logging.info(f"Used memory {used_memory}, peak memory {peak_memory}")
    assert peak_memory < 1.1 * used_memory

    await c_replica.execute_command("memory decommit")
    await asyncio.sleep(1)
    replica_peak_memory = await get_memory(c_replica, "used_memory_peak_rss")
    replica_used_memory = await get_memory(c_replica, "used_memory_rss")

    logging.info(f"Replica Used memory {replica_used_memory}, peak memory {replica_peak_memory}")
    assert replica_peak_memory < 1.1 * replica_used_memory

    # Check replica data consistent
    replica_data = await DebugPopulateSeeder.capture(c_replica)
    master_data = await DebugPopulateSeeder.capture(c_master)
    assert master_data == replica_data


async def test_master_too_big(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=2, maxmemory="600mb")

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()
    await c_master.execute_command("DEBUG POPULATE 1000000 key 1000 RAND")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    # We should never sync due to used memory too high during full sync
    with pytest.raises(TimeoutError):
        await wait_available_async(c_replica, timeout=10)


@dfly_args({"proactor_threads": 4})
async def test_stream_approximate_trimming(df_factory):
    master = df_factory.create()
    replica = df_factory.create()

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)

    # Step 1: Populate master with 100 streams, each containing 200 entries
    num_streams = 100
    entries_per_stream = 200

    for i in range(num_streams):
        stream_name = f"stream{i}"
        for j in range(entries_per_stream):
            await c_master.execute_command("XADD", stream_name, "*", f"field{j}", f"value{j}")

    # Step 2: Trim each stream to a random size between 70 and 200
    for i in range(num_streams):
        stream_name = f"stream{i}"
        trim_size = random.randint(70, entries_per_stream)
        await c_master.execute_command("XTRIM", stream_name, "MAXLEN", "~", trim_size)

    # Wait for replica sync
    await check_all_replicas_finished([c_replica], c_master)

    # Check replica data consistent
    master_data = await DebugPopulateSeeder.capture(c_master)
    replica_data = await DebugPopulateSeeder.capture(c_replica)
    assert master_data == replica_data

    # Step 3: Trim all streams to 0
    for i in range(num_streams):
        stream_name = f"stream{i}"
        await c_master.execute_command("XTRIM", stream_name, "MAXLEN", "0")

    # Wait for replica sync
    await check_all_replicas_finished([c_replica], c_master)

    # Check replica data consistent
    master_data = await DebugPopulateSeeder.capture(c_master)
    replica_data = await DebugPopulateSeeder.capture(c_replica)
    assert master_data == replica_data


@dfly_args({"proactor_threads": 2})
async def test_replicaof_does_not_flush_if_it_fails_to_connect(df_factory):
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("SET foo bar")
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await check_all_replicas_finished([c_replica], c_master)

    res = await c_replica.execute_command("dbsize")
    assert res == 1
    with pytest.raises(redis.exceptions.ResponseError):
        await c_replica.execute_command(f"REPLICAOF localhost {replica.port}")
    res = await c_replica.execute_command("dbsize")
    assert res == 1


@dfly_args({"proactor_threads": 2})
async def test_replicaof_inside_multi(df_factory):
    master = df_factory.create()
    replica = df_factory.create()
    df_factory.start_all([master, replica])

    async def replicate_inside_multi():
        try:
            c_master = master.client()
            p = c_master.pipeline(transaction=True)
            for i in range(5):
                p.execute_command("dbsize")
            p.execute_command(f"replicaof localhost {replica.port}")
            await p.execute()
            return True
        except redis.exceptions.ResponseError:
            return False

    MULTI_COMMANDS_TO_ISSUE = 30
    replication_commands = [
        asyncio.create_task(replicate_inside_multi()) for _ in range(MULTI_COMMANDS_TO_ISSUE)
    ]

    num_successes = 0
    for result in asyncio.as_completed(replication_commands, timeout=80):
        num_successes += await result

    logging.info(f"succeses: {num_successes}")
    assert MULTI_COMMANDS_TO_ISSUE == num_successes


@pytest.mark.large
async def test_preempt_in_atomic_section_of_heartbeat(df_factory: DflyInstanceFactory):
    master = df_factory.create(proactor_threads=1, serialization_max_chunk_size=100000000000)
    replicas = [df_factory.create(proactor_threads=1) for i in range(2)]

    # Start instances and connect clients
    df_factory.start_all([master] + replicas)
    c_master = master.client()
    c_replicas = [replica.client() for replica in replicas]

    total = 100000
    await c_master.execute_command(f"DEBUG POPULATE {total} tmp 100 TYPE SET ELEMENTS 100")

    thresehold = 50000
    for i in range(thresehold):
        rand = random.randint(1, 10)
        await c_master.execute_command(f"EXPIRE tmp:{i} {rand} NX")

    seeder = SeederV2(key_target=10_000)
    fill_task = asyncio.create_task(seeder.run(master.client()))

    for replica in c_replicas:
        await replica.execute_command(f"REPLICAOF LOCALHOST {master.port}")

    async with async_timeout.timeout(240):
        await wait_for_replicas_state(*c_replicas)

    await fill_task


@pytest.mark.large
async def test_bug_in_json_memory_tracking(df_factory: DflyInstanceFactory):
    """
    This test reproduces a bug in the JSON memory tracking.
    """
    random.seed(42)

    master = df_factory.create(
        proactor_threads=2,
        serialization_max_chunk_size=1,
        vmodule="replica=2,dflycmd=2,snapshot=1,rdb_save=1,rdb_load=1,journal_slice=2",
    )
    replicas = [df_factory.create(proactor_threads=2) for i in range(2)]

    # Start instances and connect clients
    df_factory.start_all([master] + replicas)
    c_master = master.client()
    c_replicas = [replica.client() for replica in replicas]

    total = 100000
    await c_master.execute_command(f"DEBUG POPULATE {total} tmp 1000 TYPE SET ELEMENTS 100")

    threshold = 25000
    for i in range(threshold):
        rand = random.randint(1, 4)
        await c_master.execute_command(f"EXPIRE tmp:{i} {rand} NX")

    seeder = SeederV2(key_target=50_000)
    fill_task = asyncio.create_task(seeder.run(master.client()))
    await asyncio.sleep(0.2)

    for replica in c_replicas:
        await replica.execute_command(f"REPLICAOF LOCALHOST {master.port}")

    async with async_timeout.timeout(240):
        await wait_for_replicas_state(*c_replicas)

    await seeder.stop(c_master)
    await fill_task


@pytest.mark.large
@pytest.mark.opt_only
@dfly_args({"proactor_threads": 2, "serialization_max_chunk_size": 5000, "compression_mode": "0"})
async def test_big_huge_streaming_restart(df_factory: DflyInstanceFactory):
    """
    Restart replicating instance with huge values. Tests that interrupting the streaming process doesn't hinder retrying replication
    """

    master, replica = df_factory.create(), df_factory.create(proactor_threads=1)
    df_factory.start_all([master, replica])
    c_master, c_replica = master.client(), replica.client()

    # Create huge values
    await c_master.execute_command(
        "debug", "populate", "2", "test", "1000", "rand", "type", "zset", "elements", "1000000"
    )

    # Restart replication a few times
    for _ in range(3):
        assert await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        await asyncio.sleep(random.random() + 0.5)

    # Wait for it to finish finally
    async with async_timeout.timeout(60):
        await wait_for_replicas_state(c_replica)

    # Check that everything is in sync
    hashes = await asyncio.gather(*(SeederV2.capture(c) for c in [c_master, c_replica]))
    assert len(set(hashes)) == 1

    # No in-between errors occured
    replica.stop()
    lines = replica.find_in_logs("Duplicate zset fields detected")
    assert len(lines) == 0


@pytest.mark.large
async def test_replica_snapshot_with_big_values_while_seeding(df_factory: DflyInstanceFactory):
    proactors = 4
    master = df_factory.create(proactor_threads=proactors, dbfilename="")
    replica = df_factory.create(proactor_threads=proactors, dbfilename="")
    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    # 50% big values
    seeder_config = dict(key_target=8_000, huge_value_target=4_000)
    # Fill instance with test data
    seeder = SeederV2(**seeder_config)
    await seeder.run(c_master, target_deviation=0.01)

    assert await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    async with async_timeout.timeout(60):
        await wait_for_replicas_state(c_replica)

    # Start data stream
    stream_task = asyncio.create_task(seeder.run(c_master))
    await asyncio.sleep(1)

    file_name = tmp_file_name()
    assert await c_replica.execute_command(f"SAVE DF {file_name}") == "OK"
    await seeder.stop(c_master)
    await stream_task

    await check_all_replicas_finished([c_replica], c_master)

    # Check that everything is in sync
    hashes = await asyncio.gather(*(SeederV2.capture(c) for c in [c_master, c_replica]))
    assert len(set(hashes)) == 1

    replica.stop()
    lines = replica.find_in_logs("Exit SnapshotSerializer")
    assert len(lines) == (proactors - 1)
    for line in lines:
        # We test the serializtion path of command execution
        side_saved = extract_int_after_prefix("side_saved ", line)
        assert side_saved > 0

    # Check that the produced rdb is loaded correctly
    node = df_factory.create(dbfilename=file_name)
    node.start()
    c_node = node.client()
    await wait_available_async(c_node)
    assert await c_node.execute_command("dbsize") > 0
    await c_node.execute_command("FLUSHALL")


@pytest.mark.parametrize(
    "use_takeover, backlog_len",
    [(False, 2), (False, 1), (True, 1), (True, 10)],
)
async def test_partial_replication_on_same_source_master(df_factory, use_takeover, backlog_len):
    master = df_factory.create()
    replica1 = df_factory.create(shard_repl_backlog_len=backlog_len)
    replica2 = df_factory.create()

    df_factory.start_all([master, replica1, replica2])
    c_master = master.client()
    c_replica1 = replica1.client()
    c_replica2 = replica2.client()

    logging.debug("Fill master with test data")
    seeder = DebugPopulateSeeder(key_target=50)
    await seeder.run(c_master)

    logging.debug("Start replication and wait for full sync")
    await c_replica1.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica1)
    await c_replica2.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica2)

    # Send some traffic
    seeder = SeederV2(key_target=8_000)
    await seeder.run(c_master, target_deviation=0.01)

    # Wait for all journal changes propagate to replicas
    await check_all_replicas_finished([c_replica1, c_replica2], c_master)

    if use_takeover:
        # Promote first replica to master
        await c_replica1.execute_command(f"REPLTAKEOVER 5")
        if backlog_len > 1:
            await c_replica1.execute_command("SET bar foo")
            await c_replica1.execute_command("SET foo bar")

    else:
        # Promote first replica to master
        await c_replica1.execute_command(f"REPLICAOF NO ONE")
        await c_master.set("x", "y")
        await c_master.set("x", "y")
        await check_all_replicas_finished([c_replica2], c_master)

    # Start replication with new master
    await c_replica2.execute_command(f"REPLICAOF localhost {replica1.port}")

    await check_all_replicas_finished([c_replica2], c_replica1)
    # Validate data
    if use_takeover:
        hash1, hash2 = await asyncio.gather(
            *(SeederV2.capture(c) for c in (c_replica1, c_replica2))
        )
        assert hash1 == hash2
        s1 = await c_replica1.execute_command("dbsize")
        s2 = await c_replica1.execute_command("dbsize")
        assert s1 == s2

    # Check we can takeover to the second replica
    await c_replica2.execute_command(f"REPLTAKEOVER 5")

    replica1.stop()
    replica2.stop()
    if use_takeover:
        # Check logs for partial replication
        lines = replica2.find_in_logs(f"Started partial sync with localhost:{replica1.port}")
        assert len(lines) == 1
        # Check no full sync logs
        lines = replica2.find_in_logs(f"Started full sync with localhost:{replica1.port}")
        assert len(lines) == 0
    else:
        lines = replica2.find_in_logs(f"Started full sync with localhost:{replica1.port}")
        assert len(lines) == 1
        # No partial sync after NO ONE
        lines = replica2.find_in_logs(f"Started partial sync with localhost:{replica1.port}")
        assert len(lines) == 0


async def test_partial_replication_on_same_source_master_with_replica_lsn_inc(df_factory):
    server1 = df_factory.create()
    server2 = df_factory.create()
    server3 = df_factory.create()
    server4 = df_factory.create()

    df_factory.start_all([server1, server2, server3, server4])
    c_s2 = server2.client()
    c_s3 = server3.client()
    c_s4 = server4.client()

    logging.debug("Start replication and wait for full sync")
    await c_s2.execute_command(f"REPLICAOF localhost {server1.port}")
    await wait_for_replicas_state(c_s2)
    await c_s3.execute_command(f"REPLICAOF localhost {server1.port}")
    await wait_for_replicas_state(c_s3)

    # Promote server 2 to master
    await c_s2.execute_command(f"REPLTAKEOVER 20")
    # Make server 4 replica of server 2
    await c_s4.execute_command(f"REPLICAOF localhost {server2.port}")
    # Send some write command for lsn inc
    for i in range(100):
        await c_s2.set(i, "val")
    # Make server 3 replica of server 2
    await c_s3.execute_command(f"REPLICAOF localhost {server2.port}")

    await check_all_replicas_finished([c_s3], c_s2)
    await check_all_replicas_finished([c_s4], c_s2)

    s2_sz = await c_s2.dbsize()
    s3_sz = await c_s3.dbsize()
    assert s2_sz == 100
    assert s2_sz == s3_sz

    s4_sz = await c_s4.dbsize()
    assert s3_sz == s4_sz

    server3.stop()
    # Check logs for partial replication
    lines = server3.find_in_logs(f"Started partial sync with localhost:{server2.port}")
    assert len(lines) == 1


async def test_replicate_hset_with_expiry(df_factory: DflyInstanceFactory):
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)

    master.start()
    replica.start()

    cm = master.client()
    await cm.execute_command("HSETEX key 86400 name 1234")

    cr = replica.client()
    await cr.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(cr)

    result = await cr.hgetall("key")

    assert "name" in result
    assert result["name"] == "1234"


async def test_bug_5221(df_factory):
    master = df_factory.create(
        proactor_threads=1,
        cache_mode="true",
        maxmemory="256mb",
        enable_heartbeat_eviction="true",
        eviction_memory_budget_threshold=0.9,
    )
    replica = df_factory.create(proactor_threads=4)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()
    await c_replica.execute_command(f"replicaof localhost {master.port}")

    # Fill master with test data
    seeder = SeederV2(key_target=22000, data_size=1000)
    await seeder.run(c_master, target_deviation=0.01)
    await asyncio.sleep(1)
    await seeder.run(c_master, target_deviation=0.01)
    res = await c_master.execute_command("dbsize")
    assert res > 0


@pytest.mark.parametrize("proactors", [1, 4, 6])
@pytest.mark.parametrize("backlog_len", [1, 256, 1024, 1300])
async def test_partial_sync(df_factory, proactors, backlog_len):
    keys = 5_000
    if proactors > 1:
        keys = 10_000

    # We use lock_on_hashtag because we want to seed enough elements to one flow/journal such that
    # the partial sync stales.
    master = df_factory.create(
        proactor_threads=proactors, shard_repl_backlog_len=backlog_len, lock_on_hashtags=True
    )
    replica = df_factory.create(proactor_threads=proactors)

    df_factory.start_all([replica, master])

    async def stream(client, total):
        for i in range(0, total):
            prefix = "{prefix}"
            # Seed to one shard only. This will eventually cause one of the flows to become stale.
            await client.execute_command(f"SET {prefix}foo{i} bar{i}")

    async with replica.client() as c_replica, master.client() as c_master:
        seeder = SeederV2(key_target=keys)
        await seeder.run(c_master, target_deviation=0.01)

        proxy = Proxy("127.0.0.1", 1113, "127.0.0.1", master.port)
        await proxy.start()
        task = asyncio.create_task(proxy.serve())

        try:
            await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")
            # Reach stable sync
            await wait_for_replicas_state(c_replica)
            # Stream some elements
            await stream(c_master, backlog_len)

            proxy.drop_connection()
            # Give time to detect dropped connection and reconnect
            await asyncio.sleep(1.0)
            # Partial synced here
            await check_all_replicas_finished([c_replica], c_master)
            hash1, hash2 = await asyncio.gather(
                *(SeederV2.capture(c) for c in (c_master, c_replica))
            )
            assert hash1 == hash2

            await proxy.close()
            # Whoops we moved too much, no partial sync here
            await stream(c_master, backlog_len + 10)
            await proxy.start()
            await asyncio.sleep(1.0)

            await check_all_replicas_finished([c_replica], c_master)

            hash1, hash2 = await asyncio.gather(
                *(SeederV2.capture(c) for c in (c_master, c_replica))
            )
            assert hash1 == hash2
        finally:
            await proxy.close(task)

    master.stop()
    replica.stop()
    # Partial sync worked
    lines = master.find_in_logs("Partial sync requested from LSN")
    # Because we run with num_shards = proactors - 1
    total_attempts = 1
    if proactors > 1:
        total_attempts = proactors - 1 + proactors - 2
    assert len(lines) == total_attempts
    # Second partial sync failed because of stale LSN
    lines = master.find_in_logs("Partial sync requested from stale LSN")
    assert len(lines) == 1


async def test_mc_gat_replication(df_factory):
    master = df_factory.create(memcached_port=11211, proactor_threads=1)
    replica = df_factory.create(memcached_port=11212, proactor_threads=1)
    df_factory.start_all([master, replica])

    cm = pymemcache.Client(f"127.0.0.1:{master.mc_port}", default_noreply=False)

    key = "foo"
    value = b"bar"
    not_found = b"NOTFOUND"
    assert cm.set(key, value, noreply=True)

    async with replica.client() as cl:
        await cl.execute_command(f"REPLICAOF localhost {master.port}")
        await wait_available_async(cl)

    async def state_transitioned_stable(
        init: bytes,
        expected: bytes,
        duration_sec=5,
        sleep_sec=1,
    ):
        """
        Asserts that the state goes from initial to expected and then stays at expected, observing state for duration_sec
        """
        _start = time.time()
        transitioned = False
        state = None
        while time.time() - _start < duration_sec:
            state = cr.get(key, not_found)
            if not transitioned and state == expected:
                transitioned = True
            if transitioned:
                assert (
                    state == expected
                ), f"state moved back to initial after transition {state=} {init=} {expected=}"
            else:
                assert state == init, f"unexpected state: {state=} {init=}"
            await asyncio.sleep(sleep_sec)
        return state == expected

    cr = pymemcache.Client(f"127.0.0.1:{replica.mc_port}", default_noreply=False)

    assert await state_transitioned_stable(not_found, value)

    # Force the key to be removed by setting expiry in the past. Memcache treats expiry > 1 month as absolute from
    # epoch, so 1 month + 1 second expires the key
    month_plus_one = 60 * 60 * 24 * 30 + 1

    # GAT|GATS are not directly exposed in the python client API
    assert cm._fetch_cmd(b"gat", [str(month_plus_one), key], expect_cas=False) == {}

    # The replica should eventually sync the delete operation
    assert await state_transitioned_stable(value, not_found)

    assert cm.set(key, value, noreply=True)
    # expiry is set as now + 1000 seconds, which ensures the key will remain for the duration of the test
    assert cm._fetch_cmd(b"gat", [str(1000), key], expect_cas=False) == {key: value}

    # once the value is synced to the replica, assert that it remains stable and is not removed by setting expiry
    assert await state_transitioned_stable(not_found, value)

    result = cm._fetch_cmd(b"gats", [str(1000), key], expect_cas=True)
    assert len(result) == 1 and key in result, f"missing expected key: {result=}"
    expected_cas_ver = b"0"
    assert result[key] == (value, expected_cas_ver), f"unexpected result for key: {result=}"


@pytest.mark.skip("Fails constantly on CI")
@pytest.mark.large
@pytest.mark.parametrize("serialization_max_size", [1, 64000])
async def test_replication_onmove_flow(df_factory, serialization_max_size):
    master = df_factory.create(
        proactor_threads=2,
        cache_mode=True,
        point_in_time_snapshot=False,
        serialization_max_chunk_size=serialization_max_size,
    )
    replica = df_factory.create(proactor_threads=2)

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    key_target = 100000
    # Fill master with test data
    await c_master.execute_command(f"DEBUG POPULATE {key_target} key 32 RAND TYPE hash ELEMENTS 10")
    logging.debug("finished populate")

    stop_event = asyncio.Event()

    async def get_keys():
        while not stop_event.is_set():
            pipe = c_master.pipeline(transaction=False)
            for _ in range(50):
                id = random.randint(0, key_target)
                pipe.hlen(f"key:{id}")
            await pipe.execute()

    get_task = asyncio.create_task(get_keys())
    await asyncio.sleep(0.1)

    # Start replication and wait for full sync
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)

    info = await c_master.info("stats")
    assert info["bump_ups"] >= 100

    await check_all_replicas_finished([c_replica], c_master)
    stop_event.set()
    await get_task

    # Check replica data consisten
    hash1, hash2 = await asyncio.gather(*(SeederV2.capture(c) for c in (c_master, c_replica)))
    assert hash1 == hash2

    master.stop()
    lines = master.find_in_logs("Exit SnapshotSerializer")
    assert len(lines) > 0
    for line in lines:
        # We test the full sync on moved path execution
        moved_saved = extract_int_after_prefix("moved_saved ", line)
        logging.debug(f"Moved saves {moved_saved}")
        assert moved_saved > 0


@pytest.mark.large
@dfly_args({"proactor_threads": 1})
async def test_big_strings(df_factory):
    master = df_factory.create(
        proactor_threads=1, serialization_max_chunk_size=1, vmodule="snapshot=1"
    )
    replica = df_factory.create(proactor_threads=1)

    df_factory.start_all([master, replica])
    c_master = master.client()
    c_replica = replica.client()

    # 200kb
    value_size = 200_000

    async def get_memory(client, field):
        info = await client.info("memory")
        return info[field]

    capacity = await get_memory(c_master, "prime_capacity")

    seeder = DebugPopulateSeeder(
        key_target=int(capacity * 0.7),
        data_size=value_size,
        collection_size=1,
        variance=1,
        samples=1,
        types=["STRING"],
    )
    await seeder.run(c_master)

    # sanity
    capacity = await get_memory(c_master, "prime_capacity")
    assert capacity < 8000

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica)

    # Check if replica data is consistent
    replica_data = await DebugPopulateSeeder.capture(c_replica)
    master_data = await DebugPopulateSeeder.capture(c_master)
    assert master_data == replica_data

    replica.stop()
    master.stop()

    lines = master.find_in_logs("Serialization peak bytes: ")
    assert len(lines) == 1
    # We test the serializtion path of command execution
    line = lines[0]
    peak_bytes = extract_int_after_prefix("Serialization peak bytes: ", line)
    assert peak_bytes < value_size


@pytest.mark.large
async def test_takeover_bug_wrong_replica_checked_in_logs(df_factory):
    master = df_factory.create(proactor_threads=4, vmodule="dflycmd=1")
    replicas = [df_factory.create(proactor_threads=2) for _ in range(3)]
    df_factory.start_all([master] + replicas)

    c_master = master.client()
    clients = [r.client() for r in replicas]

    # Connect all replicas
    for c in clients:
        await c.execute_command(f"REPLICAOF localhost {master.port}")
    await asyncio.gather(*[wait_available_async(c) for c in clients])

    # Disconnect replica[1] to create lag
    await clients[1].execute_command("REPLICAOF NO ONE")

    # Write data that replica[1] will miss
    pipe = c_master.pipeline()
    for i in range(10000):
        pipe.set(f"k{i}", "x" * 100)
    await pipe.execute()

    # Reconnect replica[1] and immediately takeover from replica[0]
    await clients[1].execute_command(f"REPLICAOF localhost {master.port}")

    await check_all_replicas_finished(clients, c_master)

    await clients[0].execute_command("REPLTAKEOVER 10")

    # Check master logs
    master.stop(kill=False)

    timeout_logs = master.find_in_logs(
        f"Couldn't synchronize with replica for takeover in time: 127.0.0.1:{replicas[0].port}"
    )
    assert not timeout_logs


@pytest.mark.large
async def test_takeover_timeout_on_unresponsive_master(df_factory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=2)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # Setup replication
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # Write some data
    for i in range(10):
        await c_master.set(f"key{i}", f"val{i}")
    await asyncio.sleep(0.2)

    # PAUSE master process (SIGSTOP) - socket stays open but doesn't respondExpand commentComment on line R3629Code has comments. Press enter to view.
    os.kill(master.proc.pid, signal.SIGSTOP)
    logging.info(f"Paused master process {master.proc.pid}")

    # Try takeover with 5 second timeout
    # BUG: This will hang forever because SendNextPhaseRequest has no timeout
    # FIXED: Should return error within ~15 seconds (5 + buffer)
    start_time = time.time()
    try:
        await asyncio.wait_for(
            c_replica.execute_command("REPLTAKEOVER 5"),
            timeout=20,  # Should complete within 20 seconds
        )
        elapsed = time.time() - start_time
        logging.info(f"Takeover completed in {elapsed:.1f}s")
    except asyncio.TimeoutError:
        elapsed = time.time() - start_time
        pytest.fail(
            f"BUG: REPLTAKEOVER hung for {elapsed:.1f}s without timeout. "
            f"SendNextPhaseRequest in replica.cc has no socket timeout."
        )
    except Exception as e:
        # Expected: connection error or timeout error
        elapsed = time.time() - start_time
        logging.info(f"Takeover failed after {elapsed:.1f}s: {e}")
        # Should fail quickly, not hang
        assert elapsed < 20, f"Took too long: {elapsed:.1f}s"
    finally:
        # Resume master so it can be stopped properly
        try:
            os.kill(master.proc.pid, signal.SIGCONT)
        except Exception:
            pass


async def test_replica_of_self(async_client):
    port = async_client.connection_pool.connection_kwargs["port"]
    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command(f"replicaof localhost {port}")

    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command(f"replicaof 127.0.0.1 {port}")


@dfly_args({"replicaof_no_one_start_journal": True, "proactor_threads": 2})
async def test_repl_offset(df_factory):
    master = df_factory.create()
    replica1 = df_factory.create()
    replica2 = df_factory.create()
    replica3 = df_factory.create()

    df_factory.start_all([master, replica1, replica2, replica3])
    c_master = master.client()
    c_replica1 = replica1.client()
    c_replica2 = replica2.client()
    c_replica3 = replica3.client()

    seeder = DebugPopulateSeeder(key_target=50)
    await seeder.run(c_master)

    await c_replica1.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica1)
    await c_replica2.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica2)
    await c_replica3.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_for_replicas_state(c_replica3)

    seeder = SeederV2(key_target=50)
    await seeder.run(c_master, target_deviation=0.01)

    # Wait for all journal changes propagate to replicas
    await check_all_replicas_finished([c_replica1, c_replica2, c_replica3], c_master)

    # Promote first replica to master
    await c_replica1.execute_command(f"REPLTAKEOVER 5")

    # issue 4183
    async def with_timeout_link_down(client):
        async with async_timeout.timeout(2):
            while True:
                info = await client.info("replication")
                if info["master_link_status"] == "down":
                    assert info["slave_repl_offset"] > 0
                    break
                await asyncio.sleep(0.1)

    await with_timeout_link_down(c_replica2)
    assert "OK" == await c_replica2.execute_command("replicaof no one")

    # Partial sync here
    await c_replica3.execute_command(f"REPLICAOF localhost {replica2.port}")
    # Full sync here
    await c_replica1.execute_command(f"REPLICAOF localhost {replica2.port}")

    await check_all_replicas_finished([c_replica1, c_replica3], c_replica2)

    info = await c_replica3.info("replication")
    # 1 repl flow per proactor.
    proactors = 2
    # if `replicaof no one` on `c_replica2` does not preserve the journal offsets,
    # then the assertion below shall fail. In that case, replicas perform a full sync first
    # and as there are no journal changes the slave offsets are 2 (1 per shard).
    assert info["slave_repl_offset"] > proactors
    assert info["psync_successes"] == 1

    await c_replica1.execute_command(f"REPLTAKEOVER 5")
    await with_timeout_link_down(c_replica3)


async def test_partial_sync_with_different_shard_sizes(df_factory):
    master = df_factory.create(proactor_threads=3)
    replica1 = df_factory.create(proactor_threads=4)
    replica2 = df_factory.create(proactor_threads=5)
    replica3 = df_factory.create(proactor_threads=6)

    df_factory.start_all([replica1, replica2, replica3, master])

    c_replica1 = replica1.client()
    c_replica2 = replica2.client()
    c_replica3 = replica3.client()

    c_master = master.client()

    await c_master.execute_command("debug populate 5000")

    await c_replica1.execute_command(f"replicaof localhost {master.port}")
    await c_replica2.execute_command(f"replicaof localhost {master.port}")
    await c_replica3.execute_command(f"replicaof localhost {master.port}")

    seeder = SeederV2(key_target=100)
    await seeder.run(c_master, target_deviation=0.01)

    await check_all_replicas_finished([c_replica1, c_replica2, c_replica3], c_master)

    await c_replica1.execute_command("repltakeover 5")
    await c_replica2.execute_command(f"replicaof localhost {replica1.port}")
    await c_replica3.execute_command(f"replicaof localhost {replica1.port}")

    await check_all_replicas_finished([c_replica2, c_replica3], c_replica1)

    for replica in (replica1, replica2, replica3):
        replica.stop()

    lines = replica2.find_in_logs(f"Started partial sync with localhost:{replica1.port}")
    assert len(lines) == 0
    lines = replica3.find_in_logs(f"Started partial sync with localhost:{replica1.port}")
    assert len(lines) == 0


@pytest.mark.large
async def test_replica_reconnection_leaks_connections(df_factory: DflyInstanceFactory):
    master = df_factory.create(proactor_threads=4)
    replica = df_factory.create(proactor_threads=4)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    info = await c_master.info("clients")
    baseline = info["connected_clients"]

    num_cycles = 20
    for _ in range(num_cycles):
        await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
        await wait_for_replicas_state(c_replica)
        await c_replica.execute_command("REPLICAOF NO ONE")

    # Wait for connected_clients to stabilize (stop changing)
    prev = None
    async for info, breaker in info_tick_timer(c_master, "clients", timeout=10):
        with breaker:
            curr = info["connected_clients"]
            assert curr == prev
        prev = curr

    leaked = prev - baseline
    assert leaked == 0, f"connected_clients leaked {leaked} after {num_cycles} reconnect cycles"

    await c_master.aclose()
    await c_replica.aclose()


@dfly_args({"proactor_threads": 2})
async def test_xreadgroup_replication(df_factory):
    master = df_factory.create()
    replica = df_factory.create()

    master.start()
    replica.start()

    c_master = master.client()
    c_replica = replica.client()

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    async def compare_group_info(stream_key, expected_pending, expected_entries_read):
        master_info = await c_master.execute_command(f"XINFO GROUPS {stream_key}")
        replica_info = await c_replica.execute_command(f"XINFO GROUPS {stream_key}")

        # Parse group info (format: [name, consumers, pending, last-delivered-id, entries-read, lag])
        assert len(master_info) == len(replica_info)

        for m_group, r_group in zip(master_info, replica_info):
            m_dict = dict(zip(m_group[::2], m_group[1::2]))
            r_dict = dict(zip(r_group[::2], r_group[1::2]))

            assert m_dict["last-delivered-id"] == r_dict["last-delivered-id"]
            assert m_dict["entries-read"] == r_dict["entries-read"]
            assert m_dict["entries-read"] == expected_entries_read
            assert m_dict["pending"] == r_dict["pending"]
            assert m_dict["pending"] == expected_pending
            assert m_dict["consumers"] == r_dict["consumers"]

    # Case 1: Non-blocking path, NOACK
    await c_master.execute_command("XGROUP CREATE mystream mygroup $ MKSTREAM")
    await c_master.execute_command("XADD mystream * tmp tmp")
    await c_master.execute_command("XREADGROUP GROUP mygroup worker1 NOACK STREAMS mystream >")

    await check_all_replicas_finished([c_replica], c_master)
    await compare_group_info("mystream", 0, 1)

    # Case 2: Non-blocking path, with PEL
    await c_master.execute_command("XADD mystream * tmp tmp")
    await c_master.execute_command("XADD mystream * tmp tmp")
    await c_master.execute_command("XREADGROUP GROUP mygroup worker1 STREAMS mystream >")

    await check_all_replicas_finished([c_replica], c_master)
    await compare_group_info("mystream", 2, 3)

    # Case 3: Blocking path, NOACK

    # Start blocking XREADGROUP in background
    read_task = asyncio.create_task(
        c_master.execute_command(
            "XREADGROUP GROUP mygroup worker1 NOACK BLOCK 0 STREAMS mystream >"
        )
    )
    # Let the blocking command start
    await asyncio.sleep(0.1)
    await c_master.execute_command("XADD mystream * tmp tmp")

    await read_task

    await check_all_replicas_finished([c_replica], c_master)
    await compare_group_info("mystream", 2, 4)

    # Case 4: Blocking path, with PEL

    # Start blocking XREADGROUP in background
    read_task = asyncio.create_task(
        c_master.execute_command("XREADGROUP GROUP mygroup worker1 BLOCK 0 STREAMS mystream >")
    )

    await asyncio.sleep(0.1)
    await c_master.execute_command("XADD mystream * tmp tmp")
    await read_task

    await check_all_replicas_finished([c_replica], c_master)
    await compare_group_info("mystream", 3, 5)

    await c_master.execute_command("flushall")
    # Create consumer
    await c_master.execute_command("XGROUP CREATE mystream mygroup $ MKSTREAM")
    await c_master.execute_command("XADD mystream 2000-0 tmp tmp")
    # Add to PEL but don't ack
    await c_master.execute_command("XREADGROUP GROUP mygroup worker1 STREAMS mystream >")
    await c_master.execute_command("XREADGROUP GROUP mygroup worker2 STREAMS mystream 2000-0")

    await check_all_replicas_finished([c_replica], c_master)
    await compare_group_info("mystream", 1, 1)


"""
Test replication with mismatched dbnum between master and replica.
"""


@dfly_args({"proactor_threads": 2})
async def test_replication_replica_smaller_dbnum_shared_dbs_only(
    df_factory: DflyInstanceFactory,
):
    """
    Replica dbnum < Master dbnum, but master only uses DBs within
    the replica's range. Replication should succeed.
    """
    master = df_factory.create(dbnum=8)
    replica = df_factory.create(dbnum=4)

    df_factory.start_all([master, replica])

    c_master = master.client()

    # Populate data only in DBs 0-3 (within replica's dbnum range)
    for db in range(4):
        c = master.client(db=db)
        for i in range(50):
            await c.set(f"key:{db}:{i}", f"val:{db}:{i}")
        await c.close()

    # Start replication
    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    async with async_timeout.timeout(10):
        await wait_for_replicas_state(c_replica)

    await check_all_replicas_finished([c_replica], c_master)

    # Verify all data is present in the replica across shared DBs
    for db in range(4):
        c_m = master.client(db=db)
        c_r = replica.client(db=db)
        for i in range(50):
            assert await c_r.get(f"key:{db}:{i}") == await c_m.get(f"key:{db}:{i}")
        await c_m.close()
        await c_r.close()


@dfly_args({"proactor_threads": 2})
async def test_replication_replica_larger_dbnum(
    df_factory: DflyInstanceFactory,
):
    """
    Replica dbnum > Master dbnum. Replication should succeed;
    the replica's extra DBs remain empty.
    """
    master = df_factory.create(dbnum=4)
    replica = df_factory.create(dbnum=8)

    df_factory.start_all([master, replica])

    c_master = master.client()

    # Populate all DBs on the master (0-3)
    for db in range(4):
        c = master.client(db=db)
        for i in range(50):
            await c.set(f"key:{db}:{i}", f"val:{db}:{i}")
        await c.close()

    # Start replication
    c_replica = replica.client()
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    async with async_timeout.timeout(10):
        await wait_for_replicas_state(c_replica)

    await check_all_replicas_finished([c_replica], c_master)

    # Verify master's data is present in the replica
    for db in range(4):
        c_m = master.client(db=db)
        c_r = replica.client(db=db)
        for i in range(50):
            assert await c_r.get(f"key:{db}:{i}") == await c_m.get(f"key:{db}:{i}")
        await c_m.close()
        await c_r.close()

    # Verify the replica's extra DBs (4-7) are empty
    for db in range(4, 8):
        c_r = replica.client(db=db)
        assert await c_r.dbsize() == 0
        await c_r.close()


# BF.RESERVE with error_rate=0.00001 and capacity=1e9 creates a single bloom filter
# of exactly 2^32 bytes (4 GiB). The chunked RDB loader used `unsigned` for the total
# filter size, which silently overflowed to 0 and broke the RDB stream.
@pytest.mark.large
async def test_sbf_chunked_replication_over_4gb(df_factory: DflyInstanceFactory):
    master = df_factory.create(
        proactor_threads=1,
        maxmemory="6G",
        rdb_sbf_chunked="true",
    )
    replica = df_factory.create(
        proactor_threads=1,
        maxmemory="6G",
    )

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    await c_master.execute_command("BF.RESERVE", "bf", "0.00001", "1000000000")
    await c_master.execute_command("BF.ADD", "bf", "hello")

    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")

    async with async_timeout.timeout(240):
        await wait_for_replicas_state(c_replica)

    await check_all_replicas_finished([c_replica], c_master)

    assert await c_replica.execute_command("BF.EXISTS", "bf", "hello") == 1


@pytest.mark.parametrize(
    "master_threads, replica_threads",
    [[3, 4], [4, 4], [4, 3]],
)
async def test_hnsw_search_replication_with_network_disruptions(
    df_factory: DflyInstanceFactory,
    master_threads: int,
    replica_threads: int,
):
    """
    Test HNSW search index replication under continuous traffic and a network disruption.

    Creates a master with an HNSW vector index, starts concurrent write traffic and
    search queries, replicates through a proxy, and drops the connection at a random
    moment within the first 10 seconds (may hit full sync or stable sync).
    """
    master = df_factory.create(proactor_threads=master_threads)
    replica = df_factory.create(proactor_threads=replica_threads)
    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    seeder = HnswSearchSeeder(num_initial_docs=500)
    await seeder.create_index(c_master)
    await seeder.seed_initial_docs(c_master)

    proxy = Proxy("127.0.0.1", 0, "127.0.0.1", master.port)
    await proxy.start()
    proxy_task = asyncio.create_task(proxy.serve())

    traffic_task = asyncio.create_task(seeder.run_traffic(c_master))
    search_task = asyncio.create_task(seeder.run_search_queries(c_master))
    replica_search_task = asyncio.create_task(seeder.run_search_queries(c_replica))
    await c_replica.execute_command(f"REPLICAOF localhost {proxy.port}")

    try:
        await asyncio.sleep(random.uniform(0, 10))
        proxy.drop_connection()

        # Give time to detect dropped connection and reconnect
        await asyncio.sleep(1.0)

        await wait_available_async(c_replica)
        seeder.stop()
        await traffic_task
        await search_task
        await replica_search_task

        # Log replica FT.INFO for debugging if assertion fails later
        info = await c_replica.execute_command("FT.INFO", seeder.index_name)
        logging.info(f"Replica FT.INFO: {info}")

        await check_all_replicas_finished([c_replica], c_master)
        await seeder.verify(c_master, c_replica)

    finally:
        seeder.stop()
        traffic_task.cancel()
        search_task.cancel()
        replica_search_task.cancel()
        await proxy.close(proxy_task)


async def test_rm_replication(df_factory: DflyInstanceFactory):
    """Test that RM command propagates deletions to replica and is rejected on replica."""
    master = df_factory.create(proactor_threads=2)
    replica = df_factory.create(proactor_threads=2)

    master.start()
    replica.start()

    c_master = master.client()
    c_replica = replica.client()

    # Populate master with keys before replication starts
    for i in range(20):
        await c_master.set(f"key:{i}", f"val{i}")
    for i in range(5):
        await c_master.set(f"other:{i}", f"val{i}")

    # Set up replication
    await c_replica.execute_command(f"REPLICAOF localhost {master.port}")
    await wait_available_async(c_replica)

    # Verify replica has all keys
    assert await c_replica.dbsize() == 25
    logging.info("Replica has all keys")

    # Run RM on master with a MATCH filter to delete only "key:*" keys
    cursor = 0
    while True:
        result = await c_master.execute_command("RM", cursor, "MATCH", "key:*")
        cursor = int(result[0])
        if cursor == 0:
            break

    # Master should have only "other:*" keys left
    assert await c_master.dbsize() == 5

    # Wait for replication to propagate
    await check_all_replicas_finished([c_replica], c_master)

    # Replica should reflect deletions
    assert await c_replica.dbsize() == 5
    for i in range(5):
        assert await c_replica.exists(f"other:{i}") == 1
    for i in range(20):
        assert await c_replica.exists(f"key:{i}") == 0

    # RM must be rejected on replica (it's a write command)
    with pytest.raises((aioredis.ResponseError, aioredis.ReadOnlyError)):
        await c_replica.execute_command("RM", 0)


================================================
FILE: tests/dragonfly/requirements.txt
================================================
async-timeout>=4.0.3
attrs>=22.1.0
Deprecated>=1.2.13
iniconfig>=1.1.1
packaging>=23.1
pluggy>=1.0.0
py>=1.11.0
pyparsing>=3.0.9
pytest>=7.1.2
redis>=5.2.1
tomli>=2.0.1
wrapt>=1.14.1
pytest-asyncio==0.20.1
pytest-repeat>=0.9.3
pymemcache>=4.0.0
meta_memcache>=2
prometheus_client>=0.17.0
aiohttp>=3.10.2
numpy
pytest-json-report>=1.5.0
psutil>=5.9.5
boto3>=1.28.55
redis-om>=0.3.3
pytest-emoji>=0.2.0
pytest-icdiff>=0.8
pytest-timeout>=2.2.0
asyncio>=3.4.3
fakeredis[json]>=2.26.2
hiredis==2.4.0
PyYAML>=6.0
valkey>=6.0.2
celery>=5.3.0
# bullmq>=2.0.0


================================================
FILE: tests/dragonfly/search_benchmark_test.py
================================================
import logging
import time
import pytest

from . import dfly_args
from .instance import DflyInstance
from .search_benchmark_utils import (
    generate_document_columns,
    create_search_index,
    generate_document_data,
    run_query_load_test,
    set_random_seed,
    INDEX_KEY,
    DOCUMENT_KEY,
)


@dfly_args({"proactor_threads": 4})
@pytest.mark.opt_only
@pytest.mark.large
class TestSearchBenchmark:
    random_seed = 42
    num_documents = 3000
    chunk_size = 1000

    @pytest.fixture(scope="class")
    async def prepared_benchmark_data(self, df_server: DflyInstance):
        set_random_seed(self.random_seed)

        logging.info(f"Preparing benchmark data on port {df_server.port}")
        client = df_server.client()

        # Basic connectivity check
        assert await client.ping() == True

        # Schema Generation
        logging.info("Schema Generation - generating columns and creating search index")
        document_columns = generate_document_columns()
        await create_search_index(client, document_columns)

        # Verify the index was created
        index_info = await client.execute_command(f"FT.INFO {INDEX_KEY}")
        assert index_info is not None
        logging.info(f"Search index '{INDEX_KEY}' created with {len(document_columns)} columns")

        # Data Generation
        logging.info(
            f"Data Generation - generating {self.num_documents:,} documents with full column data"
        )
        stage_start = time.time()
        document_ids = await generate_document_data(
            client=client,
            columns=document_columns,
            num_documents=self.num_documents,
            chunk_size=self.chunk_size,  # Chunk size for batch processing
        )

        # Verify data was generated
        assert len(document_ids) == self.num_documents

        # Verify some documents were stored
        sample_document_id = document_ids[0]
        document_key = DOCUMENT_KEY.format(documentId=sample_document_id)
        stored_document = await client.hgetall(document_key)
        assert stored_document is not None
        assert stored_document["DocumentId"] == sample_document_id
        stage_duration = time.time() - stage_start
        logging.info(
            f"Preparation stage completed in {stage_duration:.2f}s: {len(document_ids)} documents generated and stored"
        )

        await client.aclose()

        return {
            "document_columns": document_columns,
            "document_ids": document_ids,
            "num_documents": self.num_documents,
            "setup_duration": stage_duration,
        }

    async def _run_benchmark(
        self,
        df_server: DflyInstance,
        prepared_benchmark_data,
        num_queries: int,
        num_concurrent_clients: int,
        test_name: str,
    ):
        logging.info(f"Starting {test_name} test on port {df_server.port}")
        logging.info(
            f"Parameters: {prepared_benchmark_data['num_documents']} documents, {num_queries} queries, {num_concurrent_clients} concurrent clients"
        )

        client = df_server.client()

        # Basic connectivity check
        assert await client.ping() == True

        # Query Load Testing
        logging.info(
            f"Query Load Testing - running {num_queries:,} queries with {num_concurrent_clients} concurrent clients"
        )
        stage_start = time.time()
        total_completed = await run_query_load_test(
            df_server=df_server,
            columns=prepared_benchmark_data["document_columns"],
            document_ids=prepared_benchmark_data["document_ids"],
            total_queries=num_queries,
            num_concurrent_clients=num_concurrent_clients,
        )

        # Verify queries completed
        assert total_completed == num_queries
        stage_duration = time.time() - stage_start
        logging.info(
            f"Query Load Testing completed in {stage_duration:.2f}s: {total_completed} queries executed successfully"
        )

        # Final summary
        logging.info(
            f"Benchmark Timings Summary -> Data Generation: {prepared_benchmark_data['setup_duration']:.2f}s | Query Load: {stage_duration:.2f}s"
        )

        # Command statistics
        cmd_stats = await client.info("commandstats")
        logging.info("Command Statistics:")
        for key, value in cmd_stats.items():
            if key.startswith("cmdstat_") and "ft." in key.lower():
                command = key[8:]  # Remove "cmdstat_" prefix
                logging.info(f"  {command}: {value}")

        # Latency statistics
        latency_stats = await client.info("latencystats")
        logging.info("Latency Statistics:")
        for key, value in latency_stats.items():
            if "ft." in key.lower():
                logging.info(f"  {key}: {value}")

        # Memory statistics
        memory_stats = await client.info("memory")
        logging.info("Memory Statistics:")
        important_memory_keys = [
            "used_memory",
            "used_memory_human",
            "used_memory_rss",
            "used_memory_rss_human",
            "used_memory_peak",
            "used_memory_peak_human",
        ]
        for key in important_memory_keys:
            if key in memory_stats:
                logging.info(f"  {key}: {memory_stats[key]}")

        logging.info(f"{test_name} completed successfully")

        # Close client
        await client.aclose()

    async def test_standard_benchmark(self, df_server: DflyInstance, prepared_benchmark_data):
        """Standard benchmark test - 100 queries with 10 concurrent clients."""
        await self._run_benchmark(df_server, prepared_benchmark_data, 100, 10, "Standard Benchmark")

    async def test_small_benchmark(self, df_server: DflyInstance, prepared_benchmark_data):
        """Small benchmark test - 50 queries with 5 concurrent clients."""
        await self._run_benchmark(df_server, prepared_benchmark_data, 50, 5, "Small Benchmark")


================================================
FILE: tests/dragonfly/search_benchmark_utils.py
================================================
import asyncio
import logging
import random
import string
import uuid
import math
from typing import List, Tuple
from redis import asyncio as aioredis
from redis.commands.search.query import Query


def set_random_seed(seed: int):
    random.seed(seed)


INDEX_KEY = "idx:DocumentBase"
DOCUMENT_KEY = "DocumentBase:{documentId}"


# Simple data types for generation
COLUMN_TYPES = {
    "TEXT": {
        "dragonfly_type": "TEXT",
        "generator": lambda: random.choice(PRE_GENERATED_STRINGS),
    },
    "NUMERIC": {
        "dragonfly_type": "NUMERIC",
        "generator": lambda: random.randint(1, 100),
    },
    "TAG": {
        "dragonfly_type": "TAG",
        "generator": lambda: random.choice(PRE_GENERATED_UIDS),
    },
    "BIT": {
        "dragonfly_type": "NUMERIC",
        "generator": lambda: random.choice([0, 1]),
    },
}


PRE_GENERATED_STRINGS = []
PRE_GENERATED_UIDS = []


def _initialize_pre_generated_data(size: int):
    global PRE_GENERATED_STRINGS, PRE_GENERATED_UIDS

    # Clear previous data and generate new
    PRE_GENERATED_STRINGS.clear()
    PRE_GENERATED_UIDS.clear()

    PRE_GENERATED_STRINGS.extend(
        [
            "".join(random.choices(string.ascii_letters, k=k))
            for _ in range(size)
            for k in range(5, 11)  # lengths 5–10
        ]
    )

    PRE_GENERATED_UIDS.extend([str(uuid.uuid4()) for _ in range(size)])


async def generate_document_data(
    client: aioredis.Redis,
    columns: List[Tuple[str, str]],
    num_documents: int,
    chunk_size: int = 1000,
) -> List[str]:
    # Initialize pre-generated data
    _initialize_pre_generated_data(num_documents)

    # Generate document IDs
    document_ids = [str(uuid.uuid4()) for _ in range(num_documents)]

    # Process in chunks for better performance
    chunks_count = math.ceil(num_documents / chunk_size)

    tasks = []
    for chunk_number in range(chunks_count):
        start_idx = chunk_number * chunk_size
        end_idx = min((chunk_number + 1) * chunk_size, num_documents)
        chunk_document_ids = document_ids[start_idx:end_idx]

        task = asyncio.create_task(_generate_documents_chunk(client, chunk_document_ids, columns))
        tasks.append(task)

    await asyncio.gather(*tasks)
    return document_ids


async def _generate_documents_chunk(
    client: aioredis.Redis, document_ids: List[str], columns: List[Tuple[str, str]]
):
    pipeline = client.pipeline()

    for document_id in document_ids:
        document = {"DocumentId": document_id}

        # Generate values for all columns except DocumentId
        for column_name, column_type in columns:
            if column_name == "DocumentId":
                continue

            value = COLUMN_TYPES[column_type]["generator"]()
            if value is not None:
                document[column_name] = value

        doc_key = DOCUMENT_KEY.format(documentId=document_id)
        pipeline.hset(doc_key, mapping=document)

    await pipeline.execute()


def generate_search_query(columns: List[Tuple[str, str]], document_ids: List[str]) -> Query:
    column_names = [name for name, _ in columns]
    num_columns = random.randint(int(len(column_names) / 3.5), int(len(column_names) / 2))
    selected_columns = random.sample(column_names, num_columns)

    if random.random() < 0.5:
        query = Query("*").return_fields(*selected_columns)
        query = query.paging(0, 50)
        return query

    reliable_filter_columns = [name for name, col_type in columns if col_type in ["NUMERIC", "BIT"]]

    if reliable_filter_columns and random.random() < 0.5:
        filter_column = random.choice(reliable_filter_columns)
        filter_column_type = next(col_type for name, col_type in columns if name == filter_column)
        filter_str = create_simple_numeric_filter(filter_column, filter_column_type)
        filter_string = filter_str if filter_str else "*"
    else:
        filter_string = "*"

    query = Query(filter_string).return_fields(*selected_columns)
    query = query.paging(0, 50)
    return query


def create_simple_numeric_filter(property_name: str, property_type: str) -> str:
    if property_type == "NUMERIC":
        return f"@{property_name}: [1 100]"
    elif property_type == "BIT":
        bit_value = random.choice([0, 1])
        return f"@{property_name}: [{bit_value} {bit_value}]"
    else:
        return "*"


async def run_query_client(
    client_id: int,
    df_server,
    columns: List[Tuple[str, str]],
    document_ids: List[str],
    num_queries: int,
) -> int:
    client = df_server.client()

    query_count = 0
    success_count = 0

    try:
        for i in range(num_queries):
            try:
                query = generate_search_query(columns, document_ids)
                results = await client.ft(INDEX_KEY).search(query)
                success_count += 1

            except Exception as e:
                logging.error(f"Client {client_id}: ERROR in query {i}: {e}")

            query_count += 1

    finally:
        if query_count > 0:
            final_success_rate = (success_count / query_count) * 100
            logging.info(
                f"Client {client_id} completed: {success_count}/{query_count} successful queries ({final_success_rate:.1f}%)"
            )
        await client.aclose()

    return success_count


async def run_query_load_test(
    df_server,
    columns: List[Tuple[str, str]],
    document_ids: List[str],
    total_queries: int,
    num_concurrent_clients: int,
) -> int:
    queries_per_client = total_queries // num_concurrent_clients

    tasks = []
    for client_id in range(num_concurrent_clients):
        task = asyncio.create_task(
            run_query_client(client_id, df_server, columns, document_ids, queries_per_client)
        )
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    total_completed = sum(results)
    return total_completed


def generate_document_columns(num_columns: int = 700) -> List[Tuple[str, str]]:
    max_text_fields = 128

    # Available types for generation
    available_types = ["TEXT", "NUMERIC", "BIT", "TAG"]

    columns = []
    existing_names = set()
    text_field_count = 0

    # Standard columns
    standard_columns = [
        ("DocumentId", "TAG"),
        ("Name", "TEXT"),
        ("DocumentNumber", "TEXT"),
        ("Revenue", "NUMERIC"),
        ("NumberOfEmployees", "NUMERIC"),
        ("CreatedOn", "NUMERIC"),
        ("ModifiedOn", "NUMERIC"),
        ("IsPrivate", "BIT"),
        ("StateCode", "NUMERIC"),
        ("StatusCode", "NUMERIC"),
    ]

    columns.extend(standard_columns)
    existing_names.update(name for name, _ in standard_columns)
    text_field_count = sum(1 for _, col_type in standard_columns if col_type == "TEXT")

    while len(columns) < num_columns:
        # Generate unique name
        candidate_name = (
            f"lv_{''.join(random.choices(string.ascii_lowercase, k=random.randint(5, 15)))}"
        )

        if candidate_name in existing_names:
            continue

        # Choose type
        if text_field_count >= max_text_fields:
            column_type = random.choice([t for t in available_types if t != "TEXT"])
        else:
            column_type = random.choice(available_types)
            if column_type == "TEXT":
                text_field_count += 1

        columns.append((candidate_name, column_type))
        existing_names.add(candidate_name)

    logging.info(f"Created {len(columns)} columns, with {text_field_count} TEXT fields")
    return columns


async def create_search_index(client: aioredis.Redis, columns: List[Tuple[str, str]]) -> None:
    text_field_count = sum(1 for _, col_type in columns if col_type == "TEXT")

    if text_field_count > 128:
        raise ValueError(
            f"Too many TEXT fields: {text_field_count}. RediSearch supports a maximum of 128 TEXT fields."
        )

    logging.info(
        f"Creating index with {len(columns)} columns, including {text_field_count} TEXT fields"
    )

    # Create schema directly
    schema_parts = []
    for name, col_type in columns:
        dragonfly_type = COLUMN_TYPES[col_type]["dragonfly_type"]
        schema_parts.append(f"{name} {dragonfly_type}")

    schema_create_command = (
        f"FT.CREATE {INDEX_KEY} ON HASH PREFIX 1 DocumentBase: SCHEMA {' '.join(schema_parts)}"
    )
    await client.execute_command(schema_create_command)


================================================
FILE: tests/dragonfly/search_test.py
================================================
"""
Test compatibility with the redis-py client search module.
Search correctness should be ensured with unit tests.
"""

import copy

import numpy as np
from redis.commands.search.field import TextField, NumericField, TagField, VectorField, GeoField

try:
    from redis.commands.search.indexDefinition import IndexDefinition, IndexType
except ModuleNotFoundError:
    from redis.commands.search.index_definition import IndexDefinition, IndexType
from redis.commands.search.query import Query

from . import dfly_args
from .utility import *

TEST_DATA = [
    {
        "title": "First article",
        "content": "Long description",
        "views": 100,
        "topic": "world, science",
    },
    {
        "title": "Second article",
        "content": "Small text",
        "views": 200,
        "topic": "national, policits",
    },
    {
        "title": "Third piece",
        "content": "Brief description",
        "views": 300,
        "topic": "health, lifestyle",
    },
    {
        "title": "Last piece",
        "content": "Interesting text",
        "views": 400,
        "topic": "world, business",
    },
]

BASIC_TEST_SCHEMA = [
    TextField("title"),
    TextField("content"),
    NumericField("views"),
    TagField("topic"),
]


def fix_schema_naming(itype: IndexType, idx_list: list):
    """Copy all schema fields and for json types, change name to json $.path and add alias"""
    if itype == IndexType.HASH:
        return idx_list
    copies = [copy.copy(idx) for idx in idx_list]
    for idx in copies:
        idx.as_name = idx.name
        idx.name = "$." + idx.name
    return copies


async def index_test_data(async_client: aioredis.Redis, itype: IndexType, prefix=""):
    for i, e in enumerate(TEST_DATA):
        if itype == IndexType.HASH:
            await async_client.hset(prefix + str(i), mapping=e)
        else:
            await async_client.json().set(prefix + str(i), "$", e)


def doc_to_str(index_type, doc):
    if not type(doc) is dict:
        doc = doc.__dict__

    if "json" in doc:
        return json.dumps(json.loads(doc["json"]), sort_keys=True)

    if index_type == IndexType.JSON:
        return json.dumps(doc, sort_keys=True)

    doc = dict(doc)  # copy to remove fields
    doc.pop("id", None)
    doc.pop("payload", None)

    return "//".join(sorted(doc))


def contains_test_data(itype, res, td_indices):
    if res.total != len(td_indices):
        return False

    docset = {doc_to_str(itype, doc) for doc in res.docs}

    for td_entry in (TEST_DATA[tdi] for tdi in td_indices):
        if not doc_to_str(itype, td_entry) in docset:
            return False

    return True


@dfly_args({"proactor_threads": 4})
async def test_management(async_client: aioredis.Redis):
    SCHEMA_1 = [TextField("f1"), NumericField("f2", sortable=True)]
    SCHEMA_2 = [
        NumericField("f3", no_index=True, sortable=True),
        TagField("f4"),
        VectorField(
            "f5",
            algorithm="HNSW",
            attributes={"TYPE": "FLOAT32", "DIM": 1, "DISTANCE_METRIC": "L2", "INITIAL_CAP": 100},
        ),
    ]

    i1 = async_client.ft("i1")
    i2 = async_client.ft("i2")

    await i1.create_index(SCHEMA_1, definition=IndexDefinition(prefix=["p1"]))
    await i2.create_index(SCHEMA_2, definition=IndexDefinition(prefix=["p2"]))

    # Fill indices with 10 and 15 docs respectively
    for i in range(10):
        await async_client.hset(f"p1-{i}", mapping={"f1": "ok", "f2": 11})
    for i in range(15):
        await async_client.hset(
            f"p2-{i}",
            mapping={"f3": 12, "f4": "hmm", "f5": np.array(0).astype(np.float32).tobytes()},
        )

    assert sorted(await async_client.execute_command("FT._LIST")) == ["i1", "i2"]

    i1info = await i1.info()
    assert i1info["index_definition"] == [
        "key_type",
        "HASH",
        "prefixes",
        ["p1"],
        "default_score",
        1,
    ]
    assert i1info["num_docs"] == 10
    assert sorted(i1info["attributes"]) == [
        ["identifier", "f1", "attribute", "f1", "type", "TEXT"],
        [
            "identifier",
            "f2",
            "attribute",
            "f2",
            "type",
            "NUMERIC",
            "SORTABLE",
            "blocksize",
            "10000",
        ],
    ]

    i2info = await i2.info()
    assert i2info["index_definition"] == [
        "key_type",
        "HASH",
        "prefixes",
        ["p2"],
        "default_score",
        1,
    ]
    assert i2info["num_docs"] == 15
    assert sorted(i2info["attributes"]) == [
        [
            "identifier",
            "f3",
            "attribute",
            "f3",
            "type",
            "NUMERIC",
            "NOINDEX",
            "SORTABLE",
            "blocksize",
            "10000",
        ],
        ["identifier", "f4", "attribute", "f4", "type", "TAG"],
        ["identifier", "f5", "attribute", "f5", "type", "VECTOR"],
    ]

    await i1.dropindex()
    await i2.dropindex()

    assert await async_client.execute_command("FT._LIST") == []


@dfly_args({"proactor_threads": 4})
@pytest.mark.parametrize("index_type", [IndexType.HASH, IndexType.JSON])
async def test_basic(async_client: aioredis.Redis, index_type):
    i1 = async_client.ft("i1-" + str(index_type))

    await i1.create_index(
        fix_schema_naming(index_type, BASIC_TEST_SCHEMA),
        definition=IndexDefinition(index_type=index_type),
    )
    await index_test_data(async_client, index_type)

    res = await i1.search("article")
    assert contains_test_data(index_type, res, [0, 1])

    res = await i1.search("text")
    assert contains_test_data(index_type, res, [1, 3])

    res = await i1.search("brief piece")
    assert contains_test_data(index_type, res, [2])

    res = await i1.search("@title:(article|last) @content:text")
    assert contains_test_data(index_type, res, [1, 3])

    res = await i1.search("@views:[200 300]")
    assert contains_test_data(index_type, res, [1, 2])

    res = await i1.search("@views:[0 150] | @views:[350 500]")
    assert contains_test_data(index_type, res, [0, 3])

    res = await i1.search("@topic:{world}")
    assert contains_test_data(index_type, res, [0, 3])

    res = await i1.search("@topic:{business}")
    assert contains_test_data(index_type, res, [3])

    res = await i1.search("@topic:{world | national}")
    assert contains_test_data(index_type, res, [0, 1, 3])

    res = await i1.search("@topic:{science | health}")
    assert contains_test_data(index_type, res, [0, 2])

    await i1.dropindex()


@dfly_args({"proactor_threads": 4})
async def test_big_json(async_client: aioredis.Redis):
    i1 = async_client.ft("i1")
    gen_arr = lambda base: {"blob": [base + str(i) for i in range(100)]}

    await i1.create_index(
        [TextField(name="$.blob", as_name="items")],
        definition=IndexDefinition(index_type=IndexType.JSON),
    )

    await async_client.json().set("k1", "$", gen_arr("alex"))
    await async_client.json().set("k2", "$", gen_arr("bob"))

    res = await i1.search("alex55")
    assert res.docs[0].id == "k1"

    res = await i1.search("bob77")
    assert res.docs[0].id == "k2"

    res = await i1.search("alex11 | bob22")
    assert res.total == 2

    await i1.dropindex()


async def knn_query(idx, query, vector):
    params = {"vec": np.array(vector, dtype=np.float32).tobytes()}
    result = await idx.search(query, params)
    return {doc["id"] for doc in result.docs}


async def knn_query_with_limit(idx, query, vector, limit):
    params = {"vec": np.array(vector, dtype=np.float32).tobytes()}
    result = await idx.search(Query(query).paging(0, limit), params)
    return {doc["id"] for doc in result.docs}


@dfly_args({"proactor_threads": 4})
@pytest.mark.parametrize("index_type", [IndexType.HASH, IndexType.JSON])
@pytest.mark.parametrize("algo_type", ["FLAT", "HNSW"])
async def test_knn(async_client: aioredis.Redis, index_type, algo_type):
    i2 = async_client.ft("i2-" + str(index_type))

    vector_field = VectorField(
        "pos",
        algorithm=algo_type,
        attributes={
            "TYPE": "FLOAT32",
            "DIM": 1,
            "DISTANCE_METRIC": "L2",
            "INITIAL_CAP": 100,
        },
    )

    await i2.create_index(
        fix_schema_naming(index_type, [TagField("even"), vector_field]),
        definition=IndexDefinition(index_type=index_type),
    )

    pipe = async_client.pipeline()
    for i in range(100):
        even = "yes" if i % 2 == 0 else "no"
        if index_type == IndexType.HASH:
            pos = np.array(i, dtype=np.float32).tobytes()
            pipe.hset(f"k{i}", mapping={"even": even, "pos": pos})
        else:
            pipe.json().set(f"k{i}", "$", {"even": even, "pos": [float(i)]})
    await pipe.execute()

    assert await knn_query(i2, "* => [KNN 3 @pos $vec]", [50.0]) == {"k49", "k50", "k51"}

    assert await knn_query(i2, "@even:{yes} => [KNN 3 @pos $vec]", [20.0]) == {"k18", "k20", "k22"}

    assert await knn_query(i2, "@even:{no} => [KNN 4 @pos $vec]", [30.0]) == {
        "k27",
        "k29",
        "k31",
        "k33",
    }

    assert await knn_query(i2, "@even:{yes} => [KNN 3 @pos $vec]", [10.0] == {"k8", "k10", "k12"})
    await i2.dropindex()


NUM_DIMS = 10
NUM_POINTS = 100


@dfly_args({"proactor_threads": 4})
@pytest.mark.parametrize("index_type", [IndexType.HASH, IndexType.JSON])
@pytest.mark.parametrize("algo_type", ["HNSW", "FLAT"])
async def test_multidim_knn(async_client: aioredis.Redis, index_type, algo_type):
    vector_field = VectorField(
        "pos",
        algorithm=algo_type,
        attributes={
            "TYPE": "FLOAT32",
            "DIM": NUM_DIMS,
            "DISTANCE_METRIC": "L2",
        },
    )

    i3 = async_client.ft("i3-" + str(index_type))
    await i3.create_index(
        fix_schema_naming(index_type, [vector_field]),
        definition=IndexDefinition(index_type=index_type),
    )

    # Use fixed seed for deterministic results
    np.random.seed(42)

    def rand_point():
        return np.random.uniform(0, 10, NUM_DIMS).astype(np.float32)

    # Generate points and send to DF
    points = [rand_point() for _ in range(NUM_POINTS)]
    points = list(enumerate(points))

    pipe = async_client.pipeline(transaction=False)
    for i, point in points:
        if index_type == IndexType.HASH:
            pipe.hset(f"k{i}", mapping={"pos": point.tobytes()})
        else:
            pipe.json().set(f"k{i}", "$", {"pos": point.tolist()})
    await pipe.execute()

    # Run 10 random queries
    for _ in range(10):
        center = rand_point()
        limit = np.random.randint(
            1, NUM_POINTS // 10 + 1
        )  # +1 because numpy's randint is exclusive

        expected_ids = [
            f"k{i}"
            for i, point in sorted(points, key=lambda p: np.linalg.norm(center - p[1]))[:limit]
        ]

        if algo_type == "HNSW":
            # We need to search all points because results can be different between expected_ids that
            # distance is  calculated on all points and hnsw which is approximate greedy search
            knn_limit = NUM_POINTS
            got_ids = await knn_query_with_limit(
                i3, f"* => [KNN {knn_limit} @pos $vec]", center, limit
            )
        else:
            got_ids = await knn_query(i3, f"* => [KNN {limit} @pos $vec]", center)

        assert set(expected_ids) == set(got_ids)

    await i3.dropindex()


@dfly_args({"proactor_threads": 4})
async def test_knn_score_return(async_client: aioredis.Redis):
    i1 = async_client.ft("i1")
    vector_field = VectorField(
        "pos",
        algorithm="FLAT",
        attributes={
            "DIM": 1,
            "DISTANCE_METRIC": "L2",
            "INITIAL_CAP": 100,
        },
    )

    await i1.create_index(
        [vector_field],
        definition=IndexDefinition(index_type=IndexType.HASH),
    )

    pipe = async_client.pipeline()
    for i in range(100):
        pipe.hset(f"k{i}", mapping={"pos": np.array(i, dtype=np.float32).tobytes()})
    await pipe.execute()

    params = {"vec": np.array([1.0], dtype=np.float32).tobytes()}
    result = await i1.search("* => [KNN 3 @pos $vec AS distance]", params)

    assert result.total == 3
    assert [d["distance"] for d in result.docs] == ["0", "1", "1"]

    result = await i1.search(
        Query("* => [KNN 3 @pos $vec AS distance]").return_fields("pos"), params
    )
    assert not any(hasattr(d, "distance") for d in result.docs)

    await i1.dropindex()


@dfly_args({"proactor_threads": 4, "dbfilename": "search-data"})
async def test_index_persistence(df_server):
    client = aioredis.Redis(port=df_server.port)

    # Build two indices and fill them with data

    SCHEMA_1 = [TextField("title"), NumericField("views", sortable=True), TagField("topic")]
    SCHEMA_2 = [
        TextField("name"),
        NumericField("age", sortable=True),
        TagField("job", separator=":", case_sensitive=True),
        VectorField(
            "pos",
            algorithm="HNSW",
            attributes={"TYPE": "FLOAT32", "DIM": 1, "DISTANCE_METRIC": "L2", "INITIAL_CAP": 100},
        ),
    ]

    i1 = client.ft("i1")
    await i1.create_index(
        fix_schema_naming(IndexType.JSON, SCHEMA_1),
        stopwords=["interesting", "stopwords"],
        definition=IndexDefinition(index_type=IndexType.JSON, prefix=["blog-"]),
    )

    i2 = client.ft("i2")
    await i2.create_index(
        fix_schema_naming(IndexType.HASH, SCHEMA_2),
        definition=IndexDefinition(index_type=IndexType.HASH, prefix=["people-"]),
    )

    for i in range(150):
        await client.json().set(
            f"blog-{i}",
            ".",
            {"title": f"Post {i}", "views": i * 10, "topic": "even" if i % 2 == 0 else "odd"},
        )

    for i in range(200):
        await client.hset(
            f"people-{i}",
            mapping={
                "name": f"Name {i}",
                "age": i,
                "job": "newsagent" if i % 2 == 0 else "writer",
                "pos": np.array(i / 200.0).astype(np.float32).tobytes(),
            },
        )

    info_1 = await i1.info()
    info_2 = await i2.info()
    assert info_1["num_docs"] == 150
    assert info_2["num_docs"] == 200

    # stop & start server

    df_server.stop()
    df_server.start()

    client = aioredis.Redis(port=df_server.port)
    await wait_available_async(client)

    # Check indices were loaded

    assert {i.decode() for i in await client.execute_command("FT._LIST")} == {"i1", "i2"}

    i1 = client.ft("i1")
    i2 = client.ft("i2")

    info_1_new = await i1.info()
    info_2_new = await i2.info()

    def build_fields_set(info):
        fields = set()
        for field in info["attributes"]:
            fields.add(tuple(field))
        return fields

    assert build_fields_set(info_1) == build_fields_set(info_1_new)
    assert build_fields_set(info_2) == build_fields_set(info_2_new)

    assert info_1["index_definition"] == info_1_new["index_definition"]
    assert info_2["index_definition"] == info_2_new["index_definition"]

    assert info_1["num_docs"] == info_1_new["num_docs"]
    assert info_2["num_docs"] == info_2_new["num_docs"]

    # Check basic queries run correctly

    assert (await i1.search("@views:[0 90]")).total == 10
    assert (await i1.search("@views:[100 190] @topic:{even}")).total == 5

    assert (await i2.search("@job:{writer}")).total == 100
    assert (await i2.search("@job:{writer} @age:[100 200]")).total == 50
    assert (await i2.search("@job:{wRiTeR}")).total == 0

    # Check fields are sortable
    assert (await i1.search(Query("*").sort_by("views", asc=True).paging(0, 1))).docs[0][
        "id"
    ] == "blog-0"
    assert (await i2.search(Query("*").sort_by("age", asc=False).paging(0, 1))).docs[0][
        "age"
    ] == "199"

    # Check stopwords were loaded
    await client.json().set("blog-sw1", ".", {"title": "some stopwords"})
    assert (await i1.search("some")).total == 1
    assert (await i1.search("stopwords")).total == 0

    await i1.dropindex()
    await i2.dropindex()


@dfly_args({"proactor_threads": 4})
def test_redis_om(df_server):
    try:
        import redis_om
    except ModuleNotFoundError:
        skip_if_not_in_github("redis-om python library not installed")
        raise

    client = redis.Redis(port=df_server.port, decode_responses=True)

    class TestCar(redis_om.HashModel, index=True):
        producer: str = redis_om.Field(index=True)
        description: str = redis_om.Field(index=True, full_text_search=True)
        speed: int = redis_om.Field(index=True, sortable=True)

        class Meta:
            database = client

    def extract_producers(testset):
        return sorted([car.producer for car in testset])

    def make_car(producer, description, speed):
        return TestCar(producer=producer, description=description, speed=speed)

    CARS = [
        make_car("BMW", "Very fast and elegant", 200),
        make_car("Audi", "Fast & stylish", 170),
        make_car("Mercedes", "High class but expensive!", 150),
        make_car("Honda", "Good allrounder with flashy looks", 120),
        make_car("Peugeot", "Good allrounder for the whole family", 100),
        make_car("Mini", "Fashinable cooper for the big city", 80),
        make_car("John Deere", "It's not a car, it's a tractor in fact!", 50),
    ]

    for car in CARS:
        car.save()

    redis_om.Migrator().run()

    # Wait for async indexing of existing documents to complete
    for index_name in client.execute_command("FT._LIST"):
        timeout = time.time() + 10
        while int(client.ft(index_name).info()["indexing"]) == 1:
            if time.time() > timeout:
                raise TimeoutError(f"Indexing {index_name} did not complete within 10 seconds")
            time.sleep(0.05)

    # Get all cars
    assert extract_producers(TestCar.find().all()) == extract_producers(CARS)

    # Get all cars of a specific producer
    assert extract_producers(
        TestCar.find((TestCar.producer == "Peugeot") | (TestCar.producer == "Mini"))
    ) == ["Mini", "Peugeot"]

    # Get only fast cars
    assert extract_producers(TestCar.find(TestCar.speed >= 150).all()) == extract_producers(
        [c for c in CARS if c.speed >= 150]
    )

    # Get only slow cars
    assert extract_producers(TestCar.find(TestCar.speed < 100).all()) == extract_producers(
        [c for c in CARS if c.speed < 100]
    )

    # Get all cars which are fast based on description
    assert extract_producers(TestCar.find(TestCar.description % "fast")) == ["Audi", "BMW"]

    # Get all cars which are not marked as extensive by descriptions
    assert extract_producers(
        TestCar.find(~(TestCar.description % "expensive")).all()
    ) == extract_producers([c for c in CARS if c.producer != "Mercedes"])

    # Get a fast allrounder
    assert extract_producers(
        TestCar.find((TestCar.speed >= 110) & (TestCar.description % "allrounder"))
    ) == ["Honda"]

    # What's the slowest car
    assert extract_producers([TestCar.find().sort_by("speed").first()]) == ["John Deere"]

    # What's the fastest car
    assert extract_producers([TestCar.find().sort_by("-speed").first()]) == ["BMW"]

    for index_name in client.execute_command("FT._LIST"):
        client.ft(index_name).dropindex()


@dfly_args({"proactor_threads": 4, "dbfilename": "synonym-persistence"})
async def test_synonym_persistence(df_server):
    """Test that synonyms are persisted across server restarts"""
    client = aioredis.Redis(port=df_server.port)

    # Create index and add documents
    idx = client.ft("idx")
    await idx.create_index([TextField("txt")], definition=IndexDefinition(prefix=["d:"]))
    await client.hset("d:1", mapping={"txt": "car"})
    await client.hset("d:2", mapping={"txt": "automobile"})

    # Add synonyms and verify they work
    await client.execute_command("FT.SYNUPDATE", "idx", "grp", "car", "automobile")
    assert (await idx.search(Query("car"))).total == 2

    # Restart server
    df_server.stop()
    df_server.start()
    client = aioredis.Redis(port=df_server.port)
    await wait_available_async(client)

    idx = client.ft("idx")

    # Verify synonyms still work after restart
    assert (await idx.search(Query("car"))).total == 2


@dfly_args({"proactor_threads": 4})
async def test_ft_info_concurrent_create_drop(df_server):
    """
    Test that FT.INFO doesn't crash when called concurrently with FT.CREATE/FT.DROPINDEX.
    The bug was a DCHECK failure when some shards have the index while others don't.
    """
    ITERATIONS = 500

    async def create_drop_worker(port):
        client = aioredis.Redis(port=port)
        for _ in range(ITERATIONS):
            try:
                await client.execute_command(
                    "FT.CREATE", "idx", "ON", "HASH", "SCHEMA", "f", "TEXT"
                )
            except Exception:
                pass  # Index might already exist
            try:
                await client.execute_command("FT.DROPINDEX", "idx")
            except Exception:
                pass  # Index might not exist
        await client.close()

    async def info_worker(port):
        client = aioredis.Redis(port=port)
        for _ in range(ITERATIONS):
            try:
                await client.execute_command("FT.INFO", "idx")
            except Exception:
                pass  # Index might not exist - that's OK
        await client.close()

    # Run multiple workers concurrently with separate connections
    port = df_server.port
    tasks = [
        create_drop_worker(port),
        create_drop_worker(port),
        create_drop_worker(port),
        create_drop_worker(port),
        create_drop_worker(port),
        info_worker(port),
        info_worker(port),
        info_worker(port),
        info_worker(port),
        info_worker(port),
    ]

    # If there's a crash, this will fail
    await asyncio.gather(*tasks)

    # Verify server is still alive
    client = aioredis.Redis(port=port)
    assert await client.ping()
    await client.close()


@pytest.mark.parametrize(
    "master_threads,replica_threads",
    [
        (4, 4),  # Same thread count
        (4, 3),  # Master has more threads
        (3, 4),  # Replica has more threads
    ],
)
async def test_replicate_all_index_types(df_factory, master_threads, replica_threads):
    """
    Test that all index types (text, numeric, tag, geo, and vector) can be replicated
    via full sync rebuild on the replica side. Uses 10000 elements for stress testing.
    Tests with different thread counts between master and replica to ensure proper
    shard handling during replication.
    """
    from .instance import DflyInstanceFactory

    master = df_factory.create(proactor_threads=master_threads)
    # logbuflevel=-1 forces glog to flush every log line immediately, so INFO messages
    # are visible in the log file when we read it (before the process exits).
    replica = df_factory.create(proactor_threads=replica_threads, logbuflevel=-1)

    df_factory.start_all([master, replica])

    c_master = master.client()
    c_replica = replica.client()

    # Create an index with all field types on master
    await c_master.execute_command(
        "FT.CREATE",
        "all_types_idx",
        "ON",
        "HASH",
        "PREFIX",
        "1",
        "item:",
        "SCHEMA",
        "name",
        "TEXT",
        "price",
        "NUMERIC",
        "SORTABLE",
        "category",
        "TAG",
        "location",
        "GEO",
        "embedding",
        "VECTOR",
        "HNSW",
        "6",
        "TYPE",
        "FLOAT32",
        "DIM",
        "2",
        "DISTANCE_METRIC",
        "L2",
    )

    # Insert 10000 test documents
    NUM_DOCS = 10000
    pipe = c_master.pipeline(transaction=False)
    for i in range(NUM_DOCS):
        lat = 37.0 + (i % 100) * 0.01  # Varying latitudes
        lon = -122.0 + (i // 100) * 0.01  # Varying longitudes
        category = "electronics" if i % 3 == 0 else ("clothing" if i % 3 == 1 else "food")
        embedding = np.array([float(i % 100), float(i // 100)], dtype=np.float32).tobytes()
        pipe.hset(
            f"item:{i}",
            mapping={
                "name": f"Product {i}",
                "price": i,
                "category": category,
                "location": f"{lon},{lat}",
                "embedding": embedding,
            },
        )
        # Execute in batches to avoid memory issues
        if i % 1000 == 999:
            await pipe.execute()
            pipe = c_master.pipeline(transaction=False)
    await pipe.execute()

    # Verify searches work on master
    master_idx = c_master.ft("all_types_idx")

    # Text search
    text_result = await master_idx.search("Product 100")
    assert text_result.total >= 1

    # Numeric search
    numeric_result = await master_idx.search("@price:[1000 2000]")
    assert numeric_result.total == 1001  # prices 1000-2000

    # Tag search - every 3rd item is electronics (0, 3, 6, ...)
    tag_result = await master_idx.search(Query("@category:{electronics}").paging(0, 0))
    expected_electronics = (NUM_DOCS + 2) // 3  # ceil(10000/3)
    assert tag_result.total == expected_electronics

    # Geo search - search around (-122.0, 37.0) with 10km radius
    geo_result = await master_idx.search("@location:[-122.0 37.0 10 km]")
    assert geo_result.total > 0

    # Vector search (KNN)
    query_vec = np.array([50.0, 50.0], dtype=np.float32).tobytes()
    knn_result = await c_master.execute_command(
        "FT.SEARCH",
        "all_types_idx",
        "*=>[KNN 10 @embedding $vec]",
        "PARAMS",
        "2",
        "vec",
        query_vec,
    )
    assert knn_result[0] == 10  # Exactly 10 results for KNN 10

    # Start replication
    await c_replica.execute_command("REPLICAOF", "localhost", master.port)
    await wait_available_async(c_replica)

    # Verify index exists on replica
    indices = await c_replica.execute_command("FT._LIST")
    assert b"all_types_idx" in indices or "all_types_idx" in indices

    replica_idx = c_replica.ft("all_types_idx")

    # Verify all search types work on replica

    # Text search
    replica_text = await replica_idx.search("Product 100")
    assert replica_text.total >= 1

    # Numeric search
    replica_numeric = await replica_idx.search("@price:[1000 2000]")
    assert replica_numeric.total == 1001

    # Tag search
    replica_tag = await replica_idx.search(Query("@category:{electronics}").paging(0, 0))
    assert replica_tag.total == expected_electronics

    # Geo search
    replica_geo = await replica_idx.search("@location:[-122.0 37.0 10 km]")
    assert replica_geo.total == geo_result.total

    # Vector search (KNN) - verify same results as master
    replica_knn = await c_replica.execute_command(
        "FT.SEARCH",
        "all_types_idx",
        "*=>[KNN 10 @embedding $vec]",
        "PARAMS",
        "2",
        "vec",
        query_vec,
    )
    assert replica_knn[0] == 10

    # Extract and compare document keys from KNN results (sorted because order may vary
    # slightly due to floating-point distance ties).
    # Format: [count, key1, fields1, key2, fields2, ...]
    master_knn_keys = sorted([knn_result[i] for i in range(1, len(knn_result), 2)])
    replica_knn_keys = sorted([replica_knn[i] for i in range(1, len(replica_knn), 2)])
    assert master_knn_keys == replica_knn_keys, (
        f"KNN results differ between master and replica: "
        f"master={master_knn_keys}, replica={replica_knn_keys}"
    )

    # Verify the HNSW index was actually restored from the serialized graph (not rebuilt
    # from scratch). Check replica's INFO log for the restoration message.
    info_logs = [f for f in replica.log_files if "INFO" in f]
    assert info_logs, "Could not find replica INFO log file"
    with open(info_logs[0], "r") as f:
        log_content = f.read()
    if master_threads == replica_threads:
        assert (
            "Restored HNSW index" in log_content
        ), "Expected HNSW index to be restored from serialized graph (same shard count)"
    else:
        assert (
            "global_ids remapped" in log_content
        ), "Expected HNSW index to be restored with global_id remapping (different shard count)"
    rebuild_lines = [
        l.strip()
        for l in log_content.splitlines()
        if "Will rebuild from scratch" in l and "HNSW" in l
    ]
    assert (
        not rebuild_lines
    ), "HNSW index fell back to rebuild from scratch unexpectedly:\n" + "\n".join(rebuild_lines)


@dfly_args({"proactor_threads": 4})
async def test_vector_search_with_geo_and_tags(async_client: aioredis.Redis):
    """
    Test combining vector search (KNN) with geo radius filter and category tags.
    This tests complex queries that use multiple index types together with 10000 elements.
    """
    idx = async_client.ft("combined_idx")

    # Create index with vector, geo, and tag fields
    await idx.create_index(
        [
            TextField("name"),
            TagField("category"),
            GeoField("location"),
            VectorField(
                "embedding",
                algorithm="HNSW",
                attributes={
                    "TYPE": "FLOAT32",
                    "DIM": 3,
                    "DISTANCE_METRIC": "L2",
                    "INITIAL_CAP": 10000,
                },
            ),
        ],
        definition=IndexDefinition(index_type=IndexType.HASH, prefix=["place:"]),
    )

    # Insert 10000 places with varying locations and categories
    NUM_PLACES = 10000
    categories = ["restaurant", "cafe", "bar", "shop", "hotel"]

    pipe = async_client.pipeline(transaction=False)
    for i in range(NUM_PLACES):
        # Distribute locations across a grid
        lat = 37.0 + (i % 100) * 0.01  # 100 different latitudes
        lon = -122.5 + (i // 100) * 0.01  # 100 different longitudes
        category = categories[i % len(categories)]
        # Create embeddings that form clusters based on category
        cat_offset = (i % len(categories)) * 10
        embedding = np.array(
            [float(i % 100) + cat_offset, float(i // 100), float(i % 10)], dtype=np.float32
        )
        pipe.hset(
            f"place:{i}",
            mapping={
                "name": f"Place {i}",
                "category": category,
                "location": f"{lon},{lat}",
                "embedding": embedding.tobytes(),
            },
        )
        # Execute in batches
        if i % 1000 == 999:
            await pipe.execute()
            pipe = async_client.pipeline(transaction=False)
    await pipe.execute()

    # Test 1: Vector search only - find places with embeddings closest to a point
    query_vec = np.array([50.0, 50.0, 5.0], dtype=np.float32).tobytes()
    result = await async_client.execute_command(
        "FT.SEARCH",
        "combined_idx",
        "*=>[KNN 10 @embedding $vec]",
        "PARAMS",
        "2",
        "vec",
        query_vec,
        "RETURN",
        "1",
        "name",
    )
    assert result[0] == 10

    # Test 2: Vector search filtered by tag - only restaurants (every 5th item starting from 0)
    result = await async_client.execute_command(
        "FT.SEARCH",
        "combined_idx",
        "@category:{restaurant}=>[KNN 10 @embedding $vec]",
        "PARAMS",
        "2",
        "vec",
        query_vec,
        "RETURN",
        "2",
        "name",
        "category",
    )
    assert result[0] == 10
    # Verify all results are restaurants
    result_str = str(result)
    for cat in ["cafe", "bar", "shop", "hotel"]:
        # The category field should not contain other categories
        assert (
            f"'category', '{cat}'" not in result_str and f"b'category', b'{cat}'" not in result_str
        )

    # COMMENTED OUT: Test 3 - Triggers DCHECK failure due to unsorted geo results
    # See: src/core/search/indices.cc:622 - GeoIndex::RadiusSearch doesn't sort results
    # This causes DCHECK failure at src/core/search/search.cc:402 when combining filters
    # TODO: Uncomment after fixing GeoIndex::RadiusSearch to sort results
    #
    # # Test 3: Vector search filtered by geo - only places near center (within 5km)
    # result = await async_client.execute_command(
    #     "FT.SEARCH",
    #     "combined_idx",
    #     "@location:[-122.0 37.5 5 km]=>[KNN 20 @embedding $vec]",
    #     "PARAMS",
    #     "2",
    #     "vec",
    #     query_vec,
    #     "RETURN",
    #     "2",
    #     "name",
    #     "location",
    # )
    # # Should find places within the geo radius
    # assert result[0] >= 1
    # assert result[0] <= 20

    # COMMENTED OUT: Test 4 - Triggers DCHECK failure due to unsorted geo results
    # See: src/core/search/indices.cc:622 - GeoIndex::RadiusSearch doesn't sort results
    # This causes DCHECK failure at src/core/search/search.cc:402 when combining geo + tag filters
    # TODO: Uncomment after fixing GeoIndex::RadiusSearch to sort results
    #
    # # Test 4: Combined - vector search with both geo AND tag filters
    # # Find cafes (category index 1) near a specific location
    # query_vec_cafe = np.array([60.0, 50.0, 5.0], dtype=np.float32).tobytes()  # Near cafe cluster
    # result = await async_client.execute_command(
    #     "FT.SEARCH",
    #     "combined_idx",
    #     "@category:{cafe} @location:[-122.0 37.5 20 km]=>[KNN 10 @embedding $vec]",
    #     "PARAMS",
    #     "2",
    #     "vec",
    #     query_vec_cafe,
    #     "RETURN",
    #     "2",
    #     "name",
    #     "category",
    # )
    # # Should find cafes within the geo and vector constraints
    # assert result[0] >= 1
    # result_str = str(result)
    # # Should not contain other categories
    # assert "restaurant" not in result_str.lower() or "category" not in result_str

    # COMMENTED OUT: Test 5 - Triggers DCHECK failure due to unsorted geo results
    # See: src/core/search/indices.cc:622 - GeoIndex::RadiusSearch doesn't sort results
    # This causes DCHECK failure at src/core/search/search.cc:402 when combining geo + tag filters
    # TODO: Uncomment after fixing GeoIndex::RadiusSearch to sort results
    #
    # # Test 5: Tag search with geo filter (no vector)
    # result = await idx.search(
    #     Query("@category:{restaurant} @location:[-122.0 37.5 50 km]").paging(0, 0)
    # )
    # # Should find restaurants within 50km radius
    # assert result.total >= 1

    # Test 6: Count documents per category
    for cat in categories:
        result = await idx.search(Query(f"@category:{{{cat}}}").paging(0, 0))
        expected_count = NUM_PLACES // len(categories)
        assert (
            result.total == expected_count
        ), f"Expected {expected_count} {cat}s, got {result.total}"

    await idx.dropindex()


================================================
FILE: tests/dragonfly/seeder/README.md
================================================
## Seeder library

Please use the testing frameworks factories to obtain proper seeder instances!

### 1. Static seeder

The DebugPopulateSeeder is a thin wrapper around `DEBUG POPULATE` with a little bit of fuzziness for collection sizes. It should be preffered for generating "static" data for snapshotting, memory consumption tests, etc.

```python
s = DebugPopulateSeeder(key_target=10_000)
await s.run(client) # Creates around 10k keys
```

### 2. Checking consistency

Use `SeederBase.capture()` (accessed via `DebugPopulateSeeder` or `Seeder`) to calculate a "state hashes" based on all the data inside an instance. Equal data produces equal hashes (equal hashes don't guarantee equal data but what are the odds...).

```python
# Fill master with ~10k keys
s = DebugPopulateSeeder(key_target=10_000)
await seeder.run(master)

# "Replicate" or other operations
replicate(master, replica)

# Ensure master and replica have same state hashes
master_hashes, replica_hashes = await asyncio.gather(
    DebugPopulateSeeder.capture(master), # note it's a static method
    DebugPopulateSeeder.capture(replica)
)
assert master_hashes == replica_hashes
```

### 3. Dynamic seeder

Contrary to the static seeder, the normal seeder issues a more complicated mix of commands, supports deleting keys and sending modification traffic.
The seeder tries to maintain a specific number of keys, quickly filling or emptying the instance to reach the target. Once reached, it will issue a balanced load of all kinds of operations.

```python
# Configure how many keys we want
s = Seeder(key_target=10_000)

# Fill instance with keys until it's 10k +- 1%
# Will create many new keys with data and reach equilibrium
await s.run(client, target_deviation=0.01)
assert abs(client.dbsize() - 10_000) <= 100

# Run 5k operations, balanced mix of create/delete/modify
await s.run(client, target_ops=5000)

# Now we want only 500 keys, issue many deletes
s.change_key_target(500)
await s.run(client, target_deviation=0.01)
```

### 4. Working with load

A seeders `run(client)` can be called without any target. It can only be stopped with

```python
# Fill instance with keys
s = Seeder()
await seeder.run(client, target_deviation=0.01)

# Start seeder without target
# Because the instance reached its key target, the seeder
# will issue a balanced mix of modifications/additions/deletions
seeding_task = asyncio.create_task(s.run(client))

# Do operations under fuzzy load
save(client)

await s.stop(client) # request stop, no immediate effect
await seeding_task # wait for actual stop and cleanup
```


================================================
FILE: tests/dragonfly/seeder/__init__.py
================================================
import asyncio
import random
import logging
import re
import typing
import math
import redis
import redis.asyncio as aioredis
from dataclasses import dataclass
import time
import sys

import numpy as np

try:
    from importlib import resources as impresources
except ImportError:
    # CI runs on python < 3.8
    import importlib_resources as impresources


class SeederBase:
    UID_COUNTER = 1  # multiple generators should not conflict on keys
    CACHED_SCRIPTS = {}
    DEFAULT_TYPES = ["STRING", "LIST", "SET", "HASH", "ZSET", "JSON", "STREAM"]

    def __init__(self, types: typing.Optional[typing.List[str]] = None, seed=None):
        self.uid = SeederBase.UID_COUNTER
        SeederBase.UID_COUNTER += 1
        self.types = types if types is not None else SeederBase.DEFAULT_TYPES

        self.seed = random.randrange(sys.maxsize)
        if seed is not None:
            self.seed = seed

        random.seed(int(self.seed))
        logging.debug(f"Random seed: {self.seed}, check: {random.randrange(100)}")

    @classmethod
    async def capture(
        clz, client: aioredis.Redis, types: typing.Optional[typing.List[str]] = None
    ) -> typing.Tuple[int]:
        """Generate hash capture for all data stored in instance pointed by client"""

        sha = await client.script_load(clz._load_script("hash"))
        types_to_capture = types if types is not None else clz.DEFAULT_TYPES
        return tuple(
            await asyncio.gather(
                *(clz._run_capture(client, sha, data_type) for data_type in types_to_capture)
            )
        )

    @staticmethod
    async def _run_capture(client, sha, data_type):
        s = time.time()
        res = await client.evalsha(sha, 0, data_type)
        logging.debug(f"hash capture of {data_type} took {time.time() - s}")
        return res

    @staticmethod
    def _read_file(fname):
        try:
            script_file = impresources.files(__package__) / fname
            with script_file.open("rt") as f:
                return f.read()
        except AttributeError:
            return impresources.read_text(__package__, fname)

    @classmethod
    def _load_script(clz, fname):
        if fname in clz.CACHED_SCRIPTS:
            return clz.CACHED_SCRIPTS[fname]

        script = clz._read_file(f"script-{fname}.lua")
        requested = re.findall(r"-- import:(.*?) --", script)
        for request in requested:
            lib = clz._read_file(f"script-{request}.lua")
            script = script.replace(f"-- import:{request} --", lib)

        clz.CACHED_SCRIPTS[fname] = script
        return script


class DebugPopulateSeeder(SeederBase):
    """Wrapper around DEBUG POPULATE with fuzzy key sizes and a balanced type mix"""

    def __init__(
        self,
        key_target=10_000,
        data_size=100,
        variance=5,
        samples=10,
        collection_size=None,
        types: typing.Optional[typing.List[str]] = None,
        seed=None,
    ):
        SeederBase.__init__(self, types, seed)
        self.key_target = key_target
        self.data_size = data_size
        self.variance = variance
        self.samples = samples

        if collection_size is None:
            self.collection_size = data_size ** (1 / 3)
        else:
            self.collection_size = collection_size

    async def run(self, client: aioredis.Redis):
        """Run with specified options until key_target is met"""
        samples = [
            (dtype, f"k-s{self.uid}u{i}-") for i, dtype in enumerate(self.types * self.samples)
        ]

        # Handle samples in chuncks of 24 to not overload client pool and instance
        chunk_size = 24
        for i in range(0, len(samples), chunk_size):
            await asyncio.gather(
                *(
                    self._run_unit(client, dtype, prefix)
                    for dtype, prefix in samples[i : i + chunk_size]
                )
            )

    async def _run_unit(self, client: aioredis.Redis, dtype: str, prefix: str):
        key_target = self.key_target // (self.samples * len(self.types))
        if dtype == "STRING":
            dsize = random.uniform(self.data_size / self.variance, self.data_size * self.variance)
            csize = 1
        else:
            csize = self.collection_size
            csize = math.ceil(random.uniform(csize / self.variance, csize * self.variance))
            dsize = self.data_size // csize

        args = ["DEBUG", "POPULATE", key_target, prefix, math.ceil(dsize)]
        args += ["RAND", "TYPE", dtype, "ELEMENTS", csize]
        return await client.execute_command(*args)


class Seeder(SeederBase):
    @dataclass
    class Unit:
        prefix: str
        type: str
        counter: int
        stop_key: str

    units: typing.List[Unit]

    def __init__(
        self,
        units=10,
        key_target=10_000,
        data_size=100,
        collection_size=None,
        types: typing.Optional[typing.List[str]] = None,
        huge_value_target=5,
        huge_value_size=100000,
        seed=None,
        huge_value_add_only=False,
    ):
        SeederBase.__init__(self, types, seed)
        self.key_target = key_target
        self.data_size = data_size
        if collection_size is None:
            self.collection_size = math.ceil(data_size ** (1 / 3))
        else:
            self.collection_size = collection_size

        self.huge_value_add_only = huge_value_add_only
        self.huge_value_target = huge_value_target
        self.huge_value_size = huge_value_size

        self.units = [
            Seeder.Unit(
                prefix=f"k-s{self.uid}u{i}-",
                type=self.types[i % len(self.types)],
                counter=0,
                stop_key=f"_s{self.uid}u{i}-stop",
            )
            for i in range(units)
        ]

    async def run(self, client: aioredis.Redis, target_ops=None, target_deviation=None):
        """Run seeder until one of the targets or until stopped if none are set"""

        using_stopkey = target_ops is None and target_deviation is None
        args = [
            self.key_target / len(self.units),
            target_ops if target_ops is not None else 0,
            target_deviation if target_deviation is not None else -1,
            self.data_size,
            self.collection_size,
            int(self.huge_value_add_only),
            self.huge_value_target / len(self.units),
            self.huge_value_size,
            self.seed,
        ]

        sha = await client.script_load(Seeder._load_script("generate"))
        for unit in self.units:
            # Must be serial, otherwise cluster clients throws an exception
            await self._run_unit(client, sha, unit, using_stopkey, args)

    async def stop(self, client: aioredis.Redis):
        """Request seeder seeder if it's running without a target, future returned from start() must still be awaited"""

        for unit in self.units:
            # Must be serial, otherwise cluster clients throws an exception
            await client.set(unit.stop_key, "X")

    def change_key_target(self, target: int):
        """Change key target, applied only on succeeding runs"""

        self.key_target = max(target, 100)  # math breaks with low values

    @staticmethod
    async def _run_unit(client: aioredis.Redis, sha: str, unit: Unit, using_stopkey, args):
        await client.delete(unit.stop_key)

        s = time.time()

        args = [
            unit.prefix,
            unit.type,
            unit.counter,
            unit.stop_key if using_stopkey else "",
        ] + args

        result = await client.evalsha(sha, 0, *args)
        result = result.split()
        unit.counter = int(result[0])
        huge_entries = int(result[1])

        msg = f"running unit {unit.prefix}/{unit.type} took {time.time() - s}, target {args[4+0]}"
        if huge_entries > 0:
            msg = f"{msg}. Total huge entries {huge_entries} added."

        logging.debug(msg)


class HnswSearchSeeder:

    def __init__(
        self,
        index_name="hnsw_idx",
        prefix="doc:",
        num_dims=4,
        num_initial_docs=200,
        seed=42,
    ):
        self.index_name = index_name
        self.prefix = prefix
        self.num_dims = num_dims
        self.num_initial_docs = num_initial_docs
        self.seed = seed

        self._doc_counter = 0
        self._stop_event = asyncio.Event()

    def _make_embedding(self):
        return np.random.uniform(-10, 10, self.num_dims).astype(np.float32)

    async def create_index(self, client: aioredis.Redis):
        await client.execute_command(
            "FT.CREATE",
            self.index_name,
            "ON",
            "HASH",
            "PREFIX",
            "1",
            self.prefix,
            "SCHEMA",
            "title",
            "TEXT",
            "doc_id",
            "TAG",
            "embedding",
            "VECTOR",
            "HNSW",
            "6",
            "TYPE",
            "FLOAT32",
            "DIM",
            str(self.num_dims),
            "DISTANCE_METRIC",
            "L2",
        )

    async def seed_initial_docs(self, client: aioredis.Redis):
        pipe = client.pipeline(transaction=False)
        for i in range(self.num_initial_docs):
            emb = self._make_embedding()
            pipe.hset(
                f"{self.prefix}{i}",
                mapping={
                    "title": f"Product {i}",
                    "doc_id": str(i),
                    "embedding": emb.tobytes(),
                },
            )
        await pipe.execute()
        self._doc_counter = self.num_initial_docs

    def stop(self):
        self._stop_event.set()

    async def _search_knn(self, client, query_vec, k=5):
        """Run a KNN search and return (total_count, set_of_doc_ids)."""
        r = await client.execute_command(
            "FT.SEARCH",
            self.index_name,
            "*=>[KNN {k} @embedding $vec]".format(k=k),
            "PARAMS",
            "2",
            "vec",
            query_vec,
            "LIMIT",
            "0",
            str(k),
        )
        doc_ids = set(r[i] for i in range(1, len(r), 2))
        return r[0], doc_ids

    async def _search_knn_filtered(self, client, query_vec, doc_id, k=5):
        """Run a filtered KNN search for a specific document by its doc_id TAG.

        With a TAG filter, Dragonfly bypasses KNN approximate search and just
        checks presence in the index, making this a reliable existence check.
        """
        doc_key = doc_id if isinstance(doc_id, str) else doc_id.decode()
        doc_num = doc_key[len(self.prefix) :] if doc_key.startswith(self.prefix) else doc_key
        r = await client.execute_command(
            "FT.SEARCH",
            self.index_name,
            "@doc_id:{{{id}}}=>[KNN {k} @embedding $vec]".format(id=doc_num, k=k),
            "PARAMS",
            "2",
            "vec",
            query_vec,
            "LIMIT",
            "0",
            str(k),
        )
        return r[0] > 0

    async def verify(self, *clients: aioredis.Redis, num_queries=10):
        if len(clients) < 2:
            raise ValueError("Need at least two clients to compare")

        sizes = [await c.dbsize() for c in clients]
        for i in range(1, len(sizes)):
            assert (
                sizes[0] == sizes[i]
            ), f"dbsize mismatch: client[0]={sizes[0]} vs client[{i}]={sizes[i]}"

        # HNSW is approximate, so KNN results between master and replica may differ.
        # For any document that appears on one side but not the other, we run a
        # filtered KNN search using the doc_id TAG. With a filter, Dragonfly skips
        # approximate KNN and just checks index presence, so this reliably verifies
        # that the replica has indexed all documents.
        k = 5

        for q in range(num_queries):
            query_vec = self._make_embedding().tobytes()
            results = []
            for c in clients:
                total, doc_ids = await self._search_knn(c, query_vec, k)
                results.append((total, doc_ids))

            assert results[0][0] > 0, "KNN search returned no results on master"

            for i in range(1, len(results)):
                master_ids = results[0][1]
                replica_ids = results[i][1]

                # Check documents found on master but not on replica
                missing_on_replica = master_ids - replica_ids
                truly_missing = []
                for doc_id in missing_on_replica:
                    if not await self._search_knn_filtered(clients[i], query_vec, doc_id, 1):
                        truly_missing.append(doc_id)

                assert not truly_missing, (
                    f"Query {q}: documents {truly_missing} found on master but "
                    f"not indexed on replica (client[{i}]). "
                    f"Master results: {sorted(master_ids)}, "
                    f"Replica results: {sorted(replica_ids)}"
                )

                # Check documents found on replica but not on master
                missing_on_master = replica_ids - master_ids
                truly_missing = []
                for doc_id in missing_on_master:
                    if not await self._search_knn_filtered(clients[0], query_vec, doc_id, k):
                        truly_missing.append(doc_id)

                assert not truly_missing, (
                    f"Query {q}: documents {truly_missing} found on replica "
                    f"(client[{i}]) but not indexed on master. "
                    f"Master results: {sorted(master_ids)}, "
                    f"Replica results: {sorted(replica_ids)}"
                )

    async def run_traffic(self, client: aioredis.Redis, sleep_interval=0.01):
        self._stop_event.clear()
        while not self._stop_event.is_set():
            op = random.choice(["insert", "update", "delete"])
            try:
                if op == "insert":
                    emb = self._make_embedding()
                    await client.hset(
                        f"{self.prefix}{self._doc_counter}",
                        mapping={
                            "title": f"Product {self._doc_counter}",
                            "doc_id": str(self._doc_counter),
                            "embedding": emb.tobytes(),
                        },
                    )
                    self._doc_counter += 1
                elif op == "update":
                    key_id = random.randint(0, max(self._doc_counter - 1, 0))
                    key = f"{self.prefix}{key_id}"
                    if not await client.exists(key):
                        continue
                    emb = self._make_embedding()
                    await client.hset(key, mapping={"embedding": emb.tobytes()})
                elif op == "delete":
                    key_id = random.randint(0, max(self._doc_counter - 1, 0))
                    await client.delete(f"{self.prefix}{key_id}")
            except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError):
                await asyncio.sleep(sleep_interval)
            await asyncio.sleep(sleep_interval)

    async def run_search_queries(self, client: aioredis.Redis, sleep_interval=0.05):
        while not self._stop_event.is_set():
            try:
                query_vec = self._make_embedding().tobytes()
                await client.execute_command(
                    "FT.SEARCH",
                    self.index_name,
                    "*=>[KNN 5 @embedding $vec]",
                    "PARAMS",
                    "2",
                    "vec",
                    query_vec,
                    "LIMIT",
                    "0",
                    "5",
                )
            except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError):
                pass
            await asyncio.sleep(sleep_interval)


================================================
FILE: tests/dragonfly/seeder/script-generate.lua
================================================
--!df flags=disable-atomicity

--[[
Script for quickly generating various data
]] --
-- import:genlib --
-- import:utillib --

-- inputs: unit identifiers
local prefix = ARGV[1]
local type = ARGV[2]
local key_counter = tonumber(ARGV[3])
local stop_key = ARGV[4]
-- inputs: task specific
local key_target = tonumber(ARGV[5])
local total_ops = tonumber(ARGV[6])
local min_dev = tonumber(ARGV[7])
local data_size = tonumber(ARGV[8])
local collection_size = tonumber(ARGV[9])
local huge_value_keys_add_only = tonumber(ARGV[10])
-- Probability of each key in key_target to be a big value
local huge_value_target = tonumber(ARGV[11])
local huge_value_size = tonumber(ARGV[12])
-- Seed
local seed = tonumber(ARGV[13])
math.randomseed(seed)

-- collect all keys belonging to this script
-- assumes exclusive ownership
local keys = LU_collect_keys(prefix, type)

LG_funcs.init(data_size, collection_size, huge_value_target, huge_value_size)
local addfunc = LG_funcs['add_' .. string.lower(type)]
local modfunc = LG_funcs['mod_' .. string.lower(type)]
local huge_entries = LG_funcs["get_huge_entries"]
local is_huge_entry = LG_funcs["is_huge_entry"]
-- Keep track of total number of keys including huge value keys. Intialize
-- to number of keys that currently exists.
local total_keys = #keys

local function action_add()
    local key = prefix .. tostring(key_counter)
    local op_type = string.lower(type)
    local is_next_huge_entry = false
    key_counter = key_counter + 1
    total_keys = total_keys + 1

    if huge_value_keys_add_only == 1 then
        is_next_huge_entry = is_huge_entry(op_type)
    end

    table.insert(keys, key)
    addfunc(key, keys)


   -- If we allow adding only huge value keys we will now remove it from
    -- table so it wouldn't be selected for any action_del / action_mod
    if is_next_huge_entry then
        table.remove(keys)
    end
end

local function action_mod()
    local key = keys[math.random(#keys)]
    modfunc(key, keys)
end

local function action_del()
    total_keys = total_keys - 1
    local key_idx = math.random(#keys)
    keys[key_idx], keys[#keys] = keys[#keys], keys[key_idx]
    local key = table.remove(keys)
    redis.acall('DEL', key)
end

-- set equilibrium point as key target, see intensity calculations below
local real_target = key_target
key_target = key_target / 0.956

-- accumulative probabilities: [add, add + delete, modify = 1-( add + delete) ]
local p_add = 0
local p_del = 0

local counter = 0
while true do
    counter = counter + 1

    -- break if we reached target ops
    if total_ops > 0 and counter > total_ops then
        break
    end

    -- break if we reached our target deviation
    if min_dev > 0 and math.abs(total_keys - real_target) / real_target < min_dev then
        break
    end

    -- break if stop key was set (every 100 ops to not slow down)
    if stop_key ~= '' and counter % 100 == 0 and redis.call('EXISTS', stop_key) then
        break
    end

    -- fast path, if we have less than half of the target, always grow
    if total_keys * 2 < key_target then
        action_add()
        goto continue
    end

    -- update probability only every 10 iterations
    if counter % 10 == 0 then
        -- calculate intensity (not normalized probabilities)
        -- please see attached plots in PR to understand convergence
        -- https://github.com/dragonflydb/dragonfly/pull/2556

        -- the add intensity is monotonically decreasing with keycount growing,
        -- the delete intensity is monotonically increasing with keycount growing,
        -- the point where the intensities are equal is the equilibrium point,
        -- based on the formulas it's ~0.956 * key_target
        local i_add = math.max(0, 1 - (total_keys / key_target) ^ 16)
        local i_del = (total_keys / key_target) ^ 16

        -- we are only interested in large amounts of modification commands when we are in an
        -- equilibrium, where there are no low intensities
        local i_mod = math.max(0, 7 * math.min(i_add, i_del) ^ 3)

        -- transform intensities to [0, 1] probability ranges
        local sum = i_add + i_del + i_mod
        p_add = i_add / sum
        p_del = p_add + i_del / sum
    end

    -- generate random action
    local p = math.random()
    if p < p_add then
        action_add()
    elseif p < p_del then
        action_del()
    else
        action_mod()
    end

    ::continue::
end

-- clear stop key
if stop_key ~= '' then
    redis.call('DEL', stop_key)
end

return tostring(key_counter) .. " " .. tostring(huge_entries())


================================================
FILE: tests/dragonfly/seeder/script-genlib.lua
================================================
local LG_funcs = {}

function LG_funcs.init(dsize, csize, large_val_count, large_val_sz)
    LG_funcs.dsize = dsize
    LG_funcs.csize = csize
    LG_funcs.esize = math.ceil(dsize / csize)
    LG_funcs.huge_value_target = large_val_count
    LG_funcs.huge_value_size = large_val_sz
end

local huge_entries = 0


local function is_huge_entry()
    if huge_entries >= LG_funcs.huge_value_target then
        return false
    else
        huge_entries = huge_entries + 1
        return true
    end
end


local function randstr()
    local str
    local is_huge = is_huge_entry()
    if is_huge then
        str = dragonfly.randstr(LG_funcs.huge_value_size)
    else
        str = dragonfly.randstr(LG_funcs.esize)
    end
    return str
end

local function randstr_sequence()
    local strs
    local is_huge = is_huge_entry()
    if is_huge then
        strs = dragonfly.randstr(LG_funcs.huge_value_size, LG_funcs.csize)
    else
        strs = dragonfly.randstr(LG_funcs.esize, LG_funcs.csize)
    end
    return strs
end

-- strings
-- store blobs of random chars

function LG_funcs.add_string(key)
    redis.apcall('SET', key, dragonfly.randstr(LG_funcs.dsize))
end

function LG_funcs.mod_string(key)
    -- APPEND and SETRANGE are the only modifying operations for strings,
    -- issue APPEND rarely to not grow data too much
    if math.random() < 0.05 then
        redis.apcall('APPEND', key, '+')
    else
        local replacement = dragonfly.randstr(LG_funcs.dsize // 2)
        redis.apcall('SETRANGE', key, math.random(0, LG_funcs.dsize // 2), replacement)
    end
end

-- lists
-- store list of random blobs of default container/element sizes

function LG_funcs.add_list(key, keys)
    redis.apcall('LPUSH', key, unpack(randstr_sequence()))
end

function LG_funcs.mod_list(key, keys)
    -- equally likely pops and pushes, we rely on the list size being large enough
    -- to "highly likely" not get emptied out by consequitve pops
    local action = math.random(1, 4)
    if action == 1 then
        redis.apcall('RPOP', key)
    elseif action == 2 then
        redis.apcall('LPOP', key)
    elseif action == 3 then
      redis.apcall('LPUSH', key, randstr())
    else
      redis.apcall('RPUSH', key, randstr())
    end
end

-- sets
-- store sets of blobs of default container/element sizes

function LG_funcs.add_set(key, keys)
    if #keys > 100 and math.random() < 0.05 then
        -- we assume that elements overlap with a very low proabiblity, so
        -- SDIFF is expected to be equal to the origin set.
        -- Repeating this operation too often can lead to two equal sets being chosen
        local i1 = math.random(#keys)
        local i2 = math.random(#keys)
        while i1 == i2 do
            i2 = math.random(#keys)
        end
        redis.apcall('SDIFFSTORE', key, keys[i1], keys[i2])
    else
        redis.apcall('SADD', key, unpack(randstr_sequence()))
    end
end

function LG_funcs.mod_set(key, keys)
     -- equally likely pops and additions
    if math.random() < 0.5 then
        redis.apcall('SPOP', key)
    else
        redis.apcall('SADD', key, randstr())
    end
end


-- hashes
-- store  {to_string(i): value for i in [1, csize]},
-- where `value` is a random string for even indices and a number for odd indices

function LG_funcs.add_hash(key, keys)
    local blobs = randstr_sequence()
    local limit = LG_funcs.csize

    local htable = {}
    for i = 1, limit do
        htable[i * 2 - 1] = tostring(i)
        htable[i * 2] = blobs[i]
    end

    redis.apcall('HSET', key, unpack(htable))
end

function LG_funcs.mod_hash(key, keys)
    local idx = math.random(LG_funcs.csize)
    redis.apcall('HSET', key, tostring(idx), randstr())
end

-- sorted sets

function LG_funcs.add_zset(key, keys)
    -- TODO: We don't support ZDIFFSTORE
    local blobs = randstr_sequence()
    local ztable = {}

    local limit = LG_funcs.csize

    for i = 1, limit do
        ztable[i * 2 - 1] = tostring(i)
        ztable[i * 2] = blobs[i]
    end
    redis.apcall('ZADD', key, unpack(ztable))
end

function LG_funcs.mod_zset(key, keys)
    local action = math.random(1, 4)
    if action <= 2 then
        local size = LG_funcs.csize * 2
        redis.apcall('ZADD', key, math.random(0, size), randstr())
    elseif action == 3 then
        redis.apcall('ZPOPMAX', key)
    else
        redis.apcall('ZPOPMIN', key)
    end
end

-- json
-- store single list of integers inside object

function LG_funcs.add_json(key)
    -- generate single list of counters
    local seed = math.random(100)
    local counters = {}
    for i = 1, LG_funcs.csize do
        counters[i] = ((i + seed) * 123) % 701
    end
    redis.apcall('JSON.SET', key, '$', cjson.encode({counters = counters}))
end

function LG_funcs.mod_json(key, dbsize)
    local action = math.random(1, 4)
    if action == 1 then
        redis.apcall('JSON.ARRAPPEND', key, '$.counters', math.random(701))
    elseif action == 2 then
        redis.apcall('JSON.ARRPOP', key, '$.counters')
    elseif action == 3 then
        redis.apcall('JSON.NUMMULTBY', key, '$.counters[' .. math.random(LG_funcs.csize ) .. ']', 2)
    else
        redis.apcall('JSON.NUMINCRBY', key, '$.counters[' .. math.random(LG_funcs.csize ) .. ']', 1)
    end
end

-- streams
-- store sequences of timestamped events

function LG_funcs.add_stream(key)
    local entries = {}

    local limit = LG_funcs.csize
    local blobs = randstr_sequence()

    for i = 1, limit do
        table.insert(entries, tostring(i))
        table.insert(entries, blobs[i])
    end

    redis.apcall('XADD', key, '*', unpack(entries))
end

function LG_funcs.mod_stream(key)
    local action = math.random(1, 3)
    if action <= 2 then
        local size = LG_funcs.csize * 2
        redis.apcall('XADD', key, '*', math.random(0, size), randstr())
    else
        local maxlen = math.random(0, 100)
        redis.apcall('XTRIM', key, 'MAXLEN', '~', maxlen)
    end
end

function LG_funcs.get_huge_entries()
  return huge_entries
end

-- Check if next entry generate huge value keys
function LG_funcs.is_huge_entry(type)
    -- These types doesn't generate huge value
    if type == "string" or type == "json" then
        return false
    else
        return huge_entries < LG_funcs.huge_value_target
    end
end


================================================
FILE: tests/dragonfly/seeder/script-hash.lua
================================================
--!df flags=disable-atomicity
--[[
Script for quickly computing single 64bit hash for keys of types specified in ARGV[].
Keys of every type are sorted lexicographically to ensure consistent order.
]]--

-- import:hashlib --
-- import:utillib --

-- inputs
local type = ARGV[1]

local OUT_HASH = 0

local function process(type)
    local keys = LU_collect_keys('', type)
    local hfunc = LH_funcs[type]

    -- sort to provide consistent order
    table.sort(keys)

    if type == 'string' then
        -- batch with MGET to reduce per-key round trips (important for tiering)
        local batch_size = 16
        for i = 1, #keys, batch_size do
            local batch = {}
            for j = i, math.min(i + batch_size - 1, #keys) do
                table.insert(batch, keys[j])
            end
            OUT_HASH = dragonfly.ihash(OUT_HASH, false, 'MGET', table.unpack(batch))
        end
    else
        for _, key in ipairs(keys) do
            -- hand hash over to callback
            OUT_HASH = hfunc(key, OUT_HASH)
        end
    end
end

process(string.lower(type))

return OUT_HASH


================================================
FILE: tests/dragonfly/seeder/script-hashlib.lua
================================================
local LH_funcs = {}

function LH_funcs.string(key, hash)
    -- add value to hash
    return dragonfly.ihash(hash, false, 'GET', key)
end

function LH_funcs.list(key, hash)
    -- add values to hash
    return dragonfly.ihash(hash, false, 'LRANGE', key, 0, -1)
end

function LH_funcs.set(key, hash)
    -- add values to hash, sort before to avoid ambiguity
    return dragonfly.ihash(hash, true, 'SMEMBERS', key)
end

function LH_funcs.zset(key, hash)
    -- add values to hash, ZRANGE returns always sorted values
    return dragonfly.ihash(hash, false, 'ZRANGE', key, 0, -1, 'WITHSCORES')
end

function LH_funcs.hash(key, hash)
    -- add values to hash, first convert to key-value pairs and sort
    return dragonfly.ihash(hash, true, 'HGETALL', key)
end

function LH_funcs.json(key, hash)
    -- add values to hash, note JSON.GET returns just a string
    return dragonfly.ihash(hash, false, 'JSON.GET', key)
end

function LH_funcs.stream(key, hash)
    return dragonfly.ihash(hash, false, 'XRANGE', key, '-', '+')
end


================================================
FILE: tests/dragonfly/seeder/script-utillib.lua
================================================
-- collect all keys into table specific type on specific prefix. Uses SCAN--
local function LU_collect_keys(prefix, type)
    -- SCAN wants this weird type name for json
    if string.lower(type) == 'json' then
        type = 'ReJSON-RL'
    end

    local pattern = prefix .. "*"
    local cursor = "0"
    local keys = {}
    repeat
        local result = redis.call("SCAN", cursor, "COUNT", 500, "TYPE", type, "MATCH", pattern)
        cursor = result[1]
        local scan_keys = result[2]
        for i, key in ipairs(scan_keys) do
            table.insert(keys, key)
        end
    until cursor == "0"
    return keys
end


================================================
FILE: tests/dragonfly/seeder_test.py
================================================
import asyncio
import async_timeout
import string
from redis import asyncio as aioredis
from . import dfly_args
from .seeder import Seeder, DebugPopulateSeeder
from .instance import DflyInstanceFactory, DflyInstance
from .utility import *


@dfly_args({"proactor_threads": 4})
async def test_static_seeder(async_client: aioredis.Redis):
    s = DebugPopulateSeeder(key_target=10_000, data_size=100)
    await s.run(async_client)

    assert abs(await async_client.dbsize() - 10_000) <= 70


@dfly_args({"proactor_threads": 4})
async def test_static_collection_size(async_client: aioredis.Redis):
    async def check_list():
        keys = await async_client.keys()
        for key in keys:
            assert await async_client.llen(key) == 1
            assert len(await async_client.lpop(key)) == 10_000

    s = DebugPopulateSeeder(
        key_target=10, data_size=10_000, variance=1, samples=1, collection_size=1, types=["LIST"]
    )
    await s.run(async_client)
    await check_list()

    await async_client.flushall()

    s = Seeder(
        units=1,
        key_target=10,
        data_size=10_000,
        collection_size=1,
        types=["LIST"],
        huge_value_target=0,
        huge_value_size=0,
    )
    await s.run(async_client)


@dfly_args({"proactor_threads": 4})
async def test_seeder_key_target(async_client: aioredis.Redis):
    """Ensure seeder reaches its key targets"""
    s = Seeder(units=len(Seeder.DEFAULT_TYPES) * 2, key_target=5000)

    # Ensure tests are not reasonably slow
    async with async_timeout.timeout(20):
        # Fill with 5k keys, 1% derivation = 50
        await s.run(async_client, target_deviation=0.01)
        assert abs(await async_client.dbsize() - 5000) <= 50

        # Run 1k ops, ensure key balance stays the "more or less" the same
        await s.run(async_client, target_ops=1000)
        assert abs(await async_client.dbsize() - 5000) <= 100

        # Run one second until stopped
        task = asyncio.create_task(s.run(async_client))
        await asyncio.sleep(1.0)
        await s.stop(async_client)
        await task

        # Change key target, 100 is actual minimum because "math breaks"
        s.change_key_target(0)
        await s.run(async_client, target_deviation=0.5)  # don't set low precision with low values
        assert await async_client.dbsize() < 200

        # Get cmdstat calls
        info = await async_client.info("ALL")
        calls = {
            k.split("_")[1]: v["calls"]
            for k, v in info.items()
            if k.startswith("cmdstat_") and v["calls"] > 50
        }
        assert len(calls) > 15  # we use at least 15 different commands


@dfly_args({"proactor_threads": 4})
async def test_seeder_capture(async_client: aioredis.Redis):
    """Ensure same data produces same state hashes"""

    async def set_data():
        p = async_client.pipeline()
        p.mset(mapping={f"string{i}": f"{i}" for i in range(100)})
        p.lpush("list1", *list(string.ascii_letters))
        p.sadd("set1", *list(string.ascii_letters))
        p.hset("hash1", mapping={f"{i}": l for i, l in enumerate(string.ascii_letters)})
        p.zadd("zset1", mapping={l: i for i, l in enumerate(string.ascii_letters)})
        await p.execute()

    # Capture with filled data
    await set_data()
    capture = await Seeder.capture(async_client)

    # Check hashes are 0 without data
    await async_client.flushall()
    assert all(h == 0 for h in (await Seeder.capture(async_client)))

    # Check setting the same data results in same hashes
    await set_data()
    assert capture == await Seeder.capture(async_client)

    # Check changing the data gives different hahses
    await async_client.lpush("list1", "NEW")
    assert capture != await Seeder.capture(async_client)

    # Undo our change
    await async_client.lpop("list1")
    assert capture == await Seeder.capture(async_client)

    # Do another change
    await async_client.spop("set1")
    assert capture != await Seeder.capture(async_client)


@pytest.mark.asyncio
@dfly_args({"proactor_threads": 2})
async def test_seeder_fake_redis(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    instance = df_factory.create()
    df_factory.start_all([instance])

    seeder = df_seeder_factory.create(
        keys=100, port=instance.port, unsupported_types=[ValueType.JSON], mirror_to_fake_redis=True
    )

    await seeder.run(target_ops=5_000)

    capture = await seeder.capture_fake_redis()

    assert await seeder.compare(capture, instance.port)


@pytest.mark.asyncio
@dfly_args({"proactor_threads": 2})
async def test_seeder_huge_value(
    df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
    instance = df_factory.create()
    df_factory.start_all([instance])

    expected_huge_value_count = 10
    seeder = df_seeder_factory.create(
        keys=100,
        port=instance.port,
        huge_value_count=expected_huge_value_count,
        huge_value_size=240_000,
    )

    def custom_command_generation_probability():
        return [
            0.0,
            0.0,
            100.0,
        ]  # We will only execute GROW commands

    # Provide custom function for command generation probability
    seeder.gen.size_change_probs = custom_command_generation_probability

    await seeder.run(target_ops=100)

    client = instance.client()

    keys = await client.execute_command("KEYS *")
    huge_val_keys_count = 0

    for key in keys:
        key_size = await client.execute_command(f"MEMORY USAGE {key}")
        # Count all keys that have memory - i.e. contain huge strings
        if key_size != None and key_size > 100_000:
            huge_val_keys_count += 1

    assert huge_val_keys_count == expected_huge_value_count


================================================
FILE: tests/dragonfly/sentinel_test.py
================================================
import pathlib
import subprocess
from typing import Awaitable
from redis import asyncio as aioredis
import pytest
import time
import asyncio
from datetime import datetime
from sys import stderr
import logging

from .utility import assert_eventually, wait_available_async

from .instance import DflyInstanceFactory
from . import dfly_args


# Helper function to parse some sentinel cli commands output as key value dictionaries.
# Output is expected be of even number of lines where each pair of consecutive lines results in a single key value pair.
# If new_dict_key is not empty, encountering it in the output will start a new dictionary, this let us return multiple
# dictionaries, for example in the 'slaves' command, one dictionary for each slave.
def stdout_as_list_of_dicts(cp: subprocess.CompletedProcess, new_dict_key=""):
    lines = cp.stdout.splitlines()
    res = []
    d = None
    if new_dict_key == "":
        d = dict()
        res.append(d)
    for i in range(0, len(lines), 2):
        if (lines[i]) == new_dict_key:  # assumes output never has '' as a key
            d = dict()
            res.append(d)
        d[lines[i]] = lines[i + 1]
    return res


def wait_for(func, pred, timeout_sec, timeout_msg=""):
    while not pred(func()):
        assert timeout_sec > 0, timeout_msg
        timeout_sec = timeout_sec - 1
        time.sleep(1)


async def await_for(func, pred, timeout_sec, timeout_msg=""):
    done = False
    while not done:
        val = func()
        if isinstance(val, Awaitable):
            val = await val
        done = pred(val)
        assert timeout_sec > 0, timeout_msg
        timeout_sec = timeout_sec - 1
        await asyncio.sleep(1)


@assert_eventually
async def assert_master_became_replica(client):
    repl_info = await client.info("replication")
    assert repl_info["role"] == "slave"


class Sentinel:
    def __init__(self, port, master_port, config_dir) -> None:
        self.config_file = pathlib.Path(config_dir).joinpath("sentinel.conf")
        self.port = port
        self.image = "bitnami/redis-sentinel:latest"
        self.container_name = "sentinel_test_py_sentinel"
        self.default_deployment = "my_deployment"
        self.initial_master_port = master_port
        self.proc = None

    def start(self):
        config = [
            f"port {self.port}",
            f"sentinel monitor {self.default_deployment} 127.0.0.1 {self.initial_master_port} 1",
            f"sentinel down-after-milliseconds {self.default_deployment} 3000",
            f"slave-priority 100",
        ]
        self.config_file.write_text("\n".join(config))

        logging.info(self.config_file.read_text())

        self.proc = subprocess.Popen(
            ["redis-server-6.2.11", f"{self.config_file.absolute()}", "--sentinel"]
        )

    def stop(self):
        self.proc.terminate()
        self.proc.wait(timeout=10)

    def run_cmd(
        self, args, sentinel_cmd=True, capture_output=False, assert_ok=True
    ) -> subprocess.CompletedProcess:
        run_args = ["redis-cli", "-p", f"{self.port}"]
        if sentinel_cmd:
            run_args = run_args + ["sentinel"]
        run_args = run_args + args
        cp = subprocess.run(run_args, capture_output=capture_output, text=True)
        if assert_ok:
            assert cp.returncode == 0, f"Command failed: {run_args}"
        return cp

    def wait_ready(self):
        wait_for(
            lambda: self.run_cmd(["ping"], sentinel_cmd=False, assert_ok=False),
            lambda cp: cp.returncode == 0,
            timeout_sec=10,
            timeout_msg="Timeout waiting for sentinel to become ready.",
        )

    def master(self, deployment="") -> dict:
        if deployment == "":
            deployment = self.default_deployment
        cp = self.run_cmd(["master", deployment], capture_output=True)
        return stdout_as_list_of_dicts(cp)[0]

    def slaves(self, deployment="") -> dict:
        if deployment == "":
            deployment = self.default_deployment
        cp = self.run_cmd(["slaves", deployment], capture_output=True)
        return stdout_as_list_of_dicts(cp)

    def live_master_port(self, deployment=""):
        if deployment == "":
            deployment = self.default_deployment
        cp = self.run_cmd(["get-master-addr-by-name", deployment], capture_output=True)
        return int(cp.stdout.splitlines()[1])

    def failover(self, deployment=""):
        if deployment == "":
            deployment = self.default_deployment
        self.run_cmd(
            [
                "failover",
                deployment,
            ]
        )


@pytest.fixture(
    scope="function"
)  # Sentinel has state which we don't want carried over form test to test.
def sentinel(tmp_dir, port_picker) -> Sentinel:
    s = Sentinel(port_picker.get_available_port(), port_picker.get_available_port(), tmp_dir)
    s.start()
    s.wait_ready()
    yield s
    s.stop()


@pytest.mark.asyncio
@pytest.mark.large
async def test_failover(df_factory: DflyInstanceFactory, sentinel, port_picker):
    master = df_factory.create(port=sentinel.initial_master_port)
    replica = df_factory.create(port=port_picker.get_available_port())

    master.start()
    replica.start()

    master_client = aioredis.Redis(port=master.port)
    replica_client = aioredis.Redis(port=replica.port)
    logging.info("master: " + str(master.port) + " replica: " + str(replica.port))

    await replica_client.execute_command("REPLICAOF localhost " + str(master.port))

    assert sentinel.live_master_port() == master.port

    # Verify sentinel picked up replica.
    await await_for(
        lambda: sentinel.master(),
        lambda m: m["num-slaves"] == "1",
        timeout_sec=15,
        timeout_msg="Timeout waiting for sentinel to pick up replica.",
    )
    sentinel.failover()

    # Verify sentinel switched.
    await await_for(
        lambda: sentinel.live_master_port(),
        lambda p: p == replica.port,
        timeout_sec=10,
        timeout_msg="Timeout waiting for sentinel to report replica as master.",
    )
    assert sentinel.slaves()[0]["port"] == str(master.port)

    # Verify we can now write to replica and read replicated value from master.
    assert await replica_client.set("key", "value"), "Failed to set key on promoted replica."

    logging.info("key was set on promoted replica, awaiting get on promoted replica. ")

    await assert_master_became_replica(master_client)
    await wait_available_async(master_client)

    try:
        await await_for(
            lambda: master_client.get("key"),
            lambda val: val == b"value",
            10,
            "Timeout waiting for key to exist in replica.",
        )
    except AssertionError:
        syncid, r_offset = await master_client.execute_command("DEBUG REPLICA OFFSET")
        replicaoffset_cmd = "DFLY REPLICAOFFSET " + syncid.decode()
        m_offset = await replica_client.execute_command(replicaoffset_cmd)
        logging.info(f"{syncid.decode()} {r_offset} {m_offset}")
        logging.info("replica client role:")
        logging.info(await replica_client.execute_command("role"))
        logging.info("master client role:")
        logging.info(await master_client.execute_command("role"))
        logging.info("replica client info:")
        logging.info(await replica_client.info())
        logging.info("master client info:")
        logging.info(await master_client.info())
        replica_val = await replica_client.get("key")
        master_val = await master_client.get("key")
        logging.info(f"replica val: {replica_val}")
        logging.info(f"master val: {master_val}")
        raise


@pytest.mark.asyncio
@pytest.mark.large
async def test_master_failure(df_factory, sentinel, port_picker):
    master = df_factory.create(port=sentinel.initial_master_port)
    replica = df_factory.create(port=port_picker.get_available_port())

    master.start()
    replica.start()

    replica_client = aioredis.Redis(port=replica.port)

    await replica_client.execute_command("REPLICAOF localhost " + str(master.port))

    assert sentinel.live_master_port() == master.port

    # Verify sentinel picked up replica.
    await await_for(
        lambda: sentinel.master(),
        lambda m: m["num-slaves"] == "1",
        timeout_sec=15,
        timeout_msg="Timeout waiting for sentinel to pick up replica.",
    )

    # Simulate master failure.
    master.stop()

    # Verify replica promoted.
    await await_for(
        lambda: sentinel.live_master_port(),
        lambda p: p == replica.port,
        timeout_sec=300,
        timeout_msg="Timeout waiting for sentinel to report replica as master.",
    )

    # Verify we can now write to replica.
    await replica_client.set("key", "value")
    assert await replica_client.get("key") == b"value"


@dfly_args({"info_replication_valkey_compatible": True})
@pytest.mark.asyncio
async def test_priority_on_failover(df_factory, sentinel, port_picker):
    master = df_factory.create(port=sentinel.initial_master_port)
    # lower priority is the best candidate for sentinel
    low_priority_repl = df_factory.create(
        port=port_picker.get_available_port(), replica_priority=20
    )
    mid_priority_repl = df_factory.create(
        port=port_picker.get_available_port(), replica_priority=60
    )
    high_priority_repl = df_factory.create(
        port=port_picker.get_available_port(), replica_priority=80
    )

    master.start()
    low_priority_repl.start()
    mid_priority_repl.start()
    high_priority_repl.start()

    high_client = aioredis.Redis(port=high_priority_repl.port)
    await high_client.execute_command("REPLICAOF localhost " + str(master.port))

    mid_client = aioredis.Redis(port=mid_priority_repl.port)
    await mid_client.execute_command("REPLICAOF localhost " + str(master.port))

    low_client = aioredis.Redis(port=low_priority_repl.port)
    await low_client.execute_command("REPLICAOF localhost " + str(master.port))

    assert sentinel.live_master_port() == master.port

    # Verify sentinel picked up replica.
    await await_for(
        lambda: sentinel.master(),
        lambda m: m["num-slaves"] == "3",
        timeout_sec=15,
        timeout_msg="Timeout waiting for sentinel to pick up replica.",
    )

    # Simulate master failure.
    master.stop()

    # Verify replica promoted.
    await await_for(
        lambda: sentinel.live_master_port(),
        lambda p: p == low_priority_repl.port,
        timeout_sec=30,
        timeout_msg="Timeout waiting for sentinel to report replica as master.",
    )


================================================
FILE: tests/dragonfly/server_family_test.py
================================================
import platform

import aiohttp
from prometheus_client.samples import Sample
from pymemcache import Client

from redis.exceptions import ResponseError

from . import dfly_args
from .instance import DflyInstance
from .utility import *


@pytest.fixture(scope="class")
def connection(df_server: DflyInstance):
    return redis.Connection(port=df_server.port)


class TestServer:
    def test_quit(self, connection: redis.Connection):
        connection.send_command("QUIT")
        assert connection.read_response() == b"OK"

        with pytest.raises(redis.exceptions.ConnectionError) as e:
            connection.read_response()

    def test_quit_after_sub(self, connection):
        connection.send_command("SUBSCRIBE", "foo")
        connection.read_response()

        connection.send_command("QUIT")
        assert connection.read_response() == b"OK"

        with pytest.raises(redis.exceptions.ConnectionError) as e:
            connection.read_response()

    async def test_multi_exec(self, async_client: aioredis.Redis):
        pipeline = async_client.pipeline()
        pipeline.set("foo", "bar")
        pipeline.get("foo")
        val = await pipeline.execute()
        assert val == [True, "bar"]


"""
see https://github.com/dragonflydb/dragonfly/issues/457
For now we would not allow for eval command inside multi
As this would create to level transactions (in effect recursive call
to Schedule function).
When this issue is fully fixed, this test would failed, and then it should
change to match the fact that we supporting this operation.
For now we are expecting to get an error
"""


async def test_multi_eval(async_client: aioredis.Redis):
    pipeline = async_client.pipeline()
    pipeline.set("foo", "bar")
    pipeline.get("foo")
    pipeline.eval("return 43", 0)
    val = await pipeline.execute()
    assert val == [True, "bar", 43]


async def test_connection_name(async_client: aioredis.Redis):
    name = await async_client.execute_command("CLIENT GETNAME")
    assert name == "default-async-fixture"
    await async_client.execute_command("CLIENT SETNAME test_conn_name")
    name = await async_client.execute_command("CLIENT GETNAME")
    assert name == "test_conn_name"


async def test_get_databases(async_client: aioredis.Redis):
    """
    make sure that the config get databases command is working
    to ensure compatibility with UI frameworks like AnotherRedisDesktopManager
    """
    dbnum = await async_client.config_get("databases")
    assert dbnum == {"databases": "16"}


async def test_client_kill(df_factory):
    with df_factory.create(port=1111, admin_port=1112) as instance:
        instance: DflyInstance
        from redis.backoff import NoBackoff
        from redis.asyncio.retry import Retry

        client = instance.client(retry=Retry(NoBackoff(), 0))
        admin_client = instance.admin_client()
        await admin_client.ping()

        # This creates `client_conn` as a non-auto-reconnect client
        async with client.client() as client_conn:
            assert len(await client_conn.execute_command("CLIENT LIST")) == 2
            assert len(await admin_client.execute_command("CLIENT LIST")) == 2

            # Can't kill admin from regular connection
            with pytest.raises(ResponseError) as e_info:
                await client_conn.execute_command("CLIENT KILL LADDR 127.0.0.1:1112")

            assert len(await admin_client.execute_command("CLIENT LIST")) == 2
            await admin_client.execute_command("CLIENT KILL LADDR 127.0.0.1:1111")
            assert len(await admin_client.execute_command("CLIENT LIST")) == 1
            with pytest.raises(redis.exceptions.ConnectionError) as e_info:
                await client_conn.ping()


async def test_scan(async_client: aioredis.Redis):
    """
    make sure that the scan command is working with python
    """

    def gen_test_data():
        for i in range(10):
            yield f"key-{i}", f"value-{i}"

    for key, val in gen_test_data():
        res = await async_client.set(key, val)
        assert res is not None
        cur, keys = await async_client.scan(cursor=0, match=key, count=2)
        assert cur == 0
        assert len(keys) == 1
        assert keys[0] == key


def configure_slowlog_parsing(async_client: aioredis.Redis):
    def parse_slowlog_get(response, **options):
        logging.info(f"slowlog response: {response}")

        def stringify(item):
            if isinstance(item, bytes):
                return item.decode()
            if isinstance(item, list):
                return [stringify(i) for i in item]
            return item

        def parse_item(item):
            item = stringify(item)
            result = {"id": item[0], "start_time": int(item[1]), "duration": int(item[2])}
            result["command"] = " ".join(item[3])
            result["client_address"] = item[4]
            result["client_name"] = item[5]
            return result

        return [parse_item(item) for item in response]

    async_client.set_response_callback("SLOWLOG GET", parse_slowlog_get)
    return async_client


@pytest.mark.asyncio
@dfly_args({"slowlog_log_slower_than": 0, "slowlog_max_len": 3})
async def test_slowlog_client_name_and_ip(df_factory, async_client: aioredis.Redis):
    df = df_factory.create()
    df.start()
    expected_clientname = "dragonfly"

    await async_client.client_setname(expected_clientname)
    async_client = configure_slowlog_parsing(async_client)

    client_list = await async_client.client_list()
    addr = client_list[0]["addr"]

    slowlog = await async_client.slowlog_get(1)
    assert slowlog[0]["client_name"] == expected_clientname
    assert slowlog[0]["client_address"] == addr


@pytest.mark.asyncio
@dfly_args({"slowlog_log_slower_than": 0, "slowlog_max_len": 3})
async def test_blocking_commands_should_not_show_up_in_slow_log(
    df_factory, async_client: aioredis.Redis
):
    await async_client.slowlog_reset()
    df = df_factory.create()
    df.start()
    async_client = configure_slowlog_parsing(async_client)

    await async_client.blpop("mykey", 0.5)
    reply = await async_client.slowlog_get()

    # blpop does not show up, only the previous reset
    assert reply[0]["command"] == "SLOWLOG RESET"


@dfly_args({"memcached_port": 11211, "admin_port": 1112})
async def test_metric_labels(
    df_server: DflyInstance, async_client: aioredis.Redis, memcached_client: Client
):
    result = await async_client.set("foo", "bar")
    assert result, "Failed to set key"

    result = await async_client.get("foo")
    assert result == "bar", "Failed to read value"

    def match_label_value(s: Sample, name, func):
        assert "listener" in s.labels
        if s.labels["listener"] == name:
            assert func(s.value)

    metrics = await df_server.metrics()
    for sample in metrics["dragonfly_commands_processed"].samples:
        match_label_value(sample, "main", lambda v: v > 0)
        match_label_value(sample, "other", lambda v: v == 0)
    for sample in metrics["dragonfly_connected_clients"].samples:
        match_label_value(sample, "main", lambda v: v == 1)
        match_label_value(sample, "other", lambda v: v == 0)

    # Memcached client also counts as main
    memcached_client.set("foo", "bar")

    metrics = await df_server.metrics()
    for sample in metrics["dragonfly_commands_processed"].samples:
        match_label_value(sample, "main", lambda v: v > 0)
        match_label_value(sample, "other", lambda v: v == 0)
    for sample in metrics["dragonfly_connected_clients"].samples:
        match_label_value(sample, "main", lambda v: v == 2)
        match_label_value(sample, "other", lambda v: v == 0)

    # admin client counts as other
    async with aioredis.Redis(port=1112) as admin:
        await admin.ping()

        metrics = await df_server.metrics()
        for sample in metrics["dragonfly_commands_processed"].samples:
            match_label_value(sample, "main", lambda v: v > 0)
            # memcached listener processes command as other
            match_label_value(sample, "other", lambda v: v > 0)
        for sample in metrics["dragonfly_connected_clients"].samples:
            match_label_value(sample, "main", lambda v: v == 2)
            match_label_value(sample, "other", lambda v: v == 1)


async def test_latency_stats(async_client: aioredis.Redis):
    for _ in range(100):
        await async_client.set("foo", "bar")
        await async_client.get("foo")
        await async_client.get("bar")
        await async_client.hgetall("missing")

    latency_stats = await async_client.info("LATENCYSTATS")
    for expected in {"hgetall", "set", "get"}:
        key = f"latency_percentiles_usec_{expected}"
        assert key in latency_stats
        assert latency_stats[key].keys() == {"p50", "p99", "p99.9"}

    await async_client.config_resetstat()
    latency_stats = await async_client.info("LATENCYSTATS")
    # Only stats for the `config resetstat` command should remain in stats
    assert (
        len(latency_stats) == 1 and "latency_percentiles_usec_config" in latency_stats
    ), f"unexpected latency stats after reset: {latency_stats}"


@dfly_args({"latency_tracking": False})
async def test_latency_stats_disabled(async_client: aioredis.Redis):
    for _ in range(100):
        await async_client.set("foo", "bar")
    assert await async_client.info("LATENCYSTATS") == {}


@pytest.mark.skipif(
    platform.machine() != "x86_64" or platform.system() != "Linux",
    reason="Validate metrics only on one platform to simplify download",
)
async def test_metrics_sanity_check(df_server: DflyInstance):
    lint_errors = frozenset(
        (
            "no help text",
            """should have "_total" suffix""",
            """should not have "_count" suffix""",
            "metric names should not contain abbreviated units",
        )
    )

    async with aiohttp.ClientSession() as s:
        metrics_url = f"http://localhost:{df_server.port}/metrics"
        async with s.get(metrics_url, raise_for_status=True) as response:
            metrics = await response.text()
    result = subprocess.run(
        ["promtool", "check", "metrics"],
        input=metrics,
        capture_output=True,
        text=True,
    )

    actual_errors = []
    if result.returncode != 0:
        for e in result.stderr.splitlines():
            if any(e.endswith(error) for error in lint_errors):
                logging.debug(f"ignored linting error: {e}")
            else:
                actual_errors.append(e)

    for error in actual_errors:
        logging.error(f"found error: {error}")

    assert actual_errors == []


@pytest.mark.opt_only
@dfly_args({"proactor_threads": "2"})
async def test_huffman_tables_built(df_server: DflyInstance):
    async_client = df_server.client()
    # Insert enough data to trigger background huffman table building
    key_name = "keyfooobarrsoooooooooooooooooooooooooooooooooooooooooooooooo"
    await async_client.execute_command("DEBUG", "POPULATE", "1000000", key_name, "14")

    @assert_eventually(times=500)
    async def check_metrics():
        metrics = await df_server.metrics()
        m = metrics["dragonfly_huffman_tables_built"]
        assert m.samples[0].value > 0

    await check_metrics()


================================================
FILE: tests/dragonfly/set_test.py
================================================
import pytest
from redis import asyncio as aioredis
from .instance import DflyInstance, DflyInstanceFactory
import logging
import asyncio


@pytest.mark.asyncio
async def test_sscan_regression(df_factory: DflyInstanceFactory):
    df = df_factory.create(
        proactor_threads=2,
    )
    df.start()

    client = df.client()

    await client.execute_command(f"SADD key el1 el2")

    element = "a*" * 3

    cursor = await client.execute_command(f"SSCAN key 0 match {element}.pt")
    length = len(cursor[1])
    # Takes 3 seconds
    res = await client.execute_command("SLOWLOG GET 100")
    assert res == []


@pytest.mark.asyncio
async def test_spop_with_null_byte_members(df_factory: DflyInstanceFactory):
    df = df_factory.create(proactor_threads=1)

    df.start()

    client = df.client()

    num_members = 10

    for i in range(num_members):
        await client.sadd("set", "b'MEMBER\x01\x02\x00_KEY{i}'".format(i=i))

    assert await client.scard("set") == num_members

    await client.spop("set")

    assert await client.scard("set") == num_members - 1


================================================
FILE: tests/dragonfly/shutdown_test.py
================================================
import pytest
import asyncio
import redis
from redis import asyncio as aioredis
from pathlib import Path

from . import dfly_args
from .utility import wait_available_async

BASIC_ARGS = {"dir": "{DRAGONFLY_TMP}/"}


@pytest.mark.skip(
    reason="Currently we can not guarantee that on shutdown if command is executed and value is written we response before breaking the connection"
)
@dfly_args({"proactor_threads": "4"})
class TestDflyAutoLoadSnapshot:
    """
    Test automatic loading of dump files on startup with timestamp.
    When command is executed if a value is written we should send the response before shutdown
    """

    @pytest.mark.asyncio
    async def test_gracefull_shutdown(self, df_factory):
        df_args = {"dbfilename": "dump", **BASIC_ARGS, "port": 1111}

        df_server = df_factory.create(**df_args)
        df_server.start()
        client = aioredis.Redis(port=df_server.port)

        async def counter(key):
            value = 0
            await client.execute_command(f"SET {key} 0")
            while True:
                try:
                    value = await client.execute_command(f"INCR {key}")
                except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e:
                    break
            return key, value

        async def delayed_takeover():
            await asyncio.sleep(1)
            await client.execute_command("SHUTDOWN")
            await client.connection_pool.disconnect()

        _, *results = await asyncio.gather(
            delayed_takeover(), *[counter(f"key{i}") for i in range(16)]
        )

        df_server.start()
        client = aioredis.Redis(port=df_server.port)

        for key, acknowleged_value in results:
            value_from_snapshot = await client.get(key)
            assert acknowleged_value == int(value_from_snapshot)

        await client.connection_pool.disconnect()


@dfly_args({"proactor_threads": "2"})
class TestShutdownOptions:
    @pytest.mark.asyncio
    async def test_shutdown_abort_and_invalid_option(self, df_factory):
        df_args = {"dbfilename": "dump", **BASIC_ARGS, "port": 1121}
        df_server = df_factory.create(**df_args)
        df_server.start()

        client = aioredis.Redis(port=df_server.port)

        # ABORT should be rejected and server should remain responsive
        with pytest.raises(redis.exceptions.ResponseError):
            await client.execute_command("SHUTDOWN ABORT")

        pong = await client.ping()
        assert pong is True

        # Invalid option -> syntax error
        with pytest.raises(redis.exceptions.ResponseError):
            await client.execute_command("SHUTDOWN FOO")

        await client.connection_pool.disconnect()
        df_server.stop()

    @pytest.mark.asyncio
    async def test_shutdown_safe_persists_snapshot(self, df_factory, tmp_path):
        # Ensure snapshot dir exists and is used
        snap_dir = tmp_path
        df_args = {"dbfilename": "dump", "dir": str(snap_dir) + "/", "port": 1122}

        df_server = df_factory.create(**df_args)
        df_server.start()

        client = aioredis.Redis(port=df_server.port)
        await client.set("safe_key", "safe_value")

        # SHUTDOWN SAFE should save synchronously and then stop
        try:
            await client.execute_command("SHUTDOWN SAFE")
        except Exception:
            # Connection may be dropped as part of shutdown; this is acceptable
            pass

        await client.connection_pool.disconnect()

        # Restart and verify data persisted
        df_server.start()
        client = aioredis.Redis(port=df_server.port)
        await wait_available_async(client)
        val = await client.get("safe_key")
        assert val == b"safe_value"
        await client.connection_pool.disconnect()
        df_server.stop()

    @pytest.mark.asyncio
    async def test_shutdown_save_persists_snapshot(self, df_factory, tmp_path):
        # SAVE should follow the same synchronous path as SAFE
        snap_dir = tmp_path
        df_args = {"dbfilename": "dump", "dir": str(snap_dir) + "/", "port": 1123}

        df_server = df_factory.create(**df_args)
        df_server.start()

        client = aioredis.Redis(port=df_server.port)
        await client.set("save_key", "save_value")

        try:
            await client.execute_command("SHUTDOWN SAVE")
        except Exception:
            pass

        await client.connection_pool.disconnect()

        df_server.start()
        client = aioredis.Redis(port=df_server.port)
        await wait_available_async(client)
        val = await client.get("save_key")
        assert val == b"save_value"
        await client.connection_pool.disconnect()
        df_server.stop()


================================================
FILE: tests/dragonfly/snapshot_test.py
================================================
import pytest
import logging
import os
import glob
import asyncio
from async_timeout import timeout
import redis
from redis import asyncio as aioredis
from pathlib import Path
import boto3
from .instance import DflyInstanceFactory, RedisServer
from random import randint as rand
import string
import random
from pymemcache.client.base import Client as MCClient

from . import dfly_args
from .utility import assert_eventually, wait_available_async, is_saving, tmp_file_name

from .seeder import DebugPopulateSeeder

BASIC_ARGS = {"dir": "{DRAGONFLY_TMP}/", "proactor_threads": 4}
FILE_FORMATS = ["RDB", "DF"]

# Should be used where text auxiliary mechanisms like filenames
LIGHTWEIGHT_SEEDER_ARGS = dict(key_target=100, data_size=100, variance=1, samples=1)


def find_main_file(path: Path, pattern):
    return next(iter(glob.glob(str(path) + "/" + pattern)), None)


async def get_metric_value(inst, metric_name, sample_index=0):
    return (await inst.metrics())[metric_name].samples[sample_index].value


async def assert_metric_value(inst, metric_name, expected_value):
    actual_value = await get_metric_value(inst, metric_name)
    assert (
        actual_value == expected_value
    ), f"Expected {metric_name} to be {expected_value}, got ${actual_value}"


@pytest.mark.opt_only
@pytest.mark.parametrize("format", FILE_FORMATS)
@pytest.mark.parametrize(
    "seeder_opts",
    [
        # Many small keys, high variance
        dict(key_target=50_000, data_size=100, variance=10, samples=50),
        # A few large keys, high variance
        dict(key_target=1000, data_size=5_000, variance=10, samples=10),
    ],
)
@dfly_args({**BASIC_ARGS})
async def test_consistency(df_factory, format: str, seeder_opts: dict):
    """
    Test consistency over a large variety of data with different sizes
    """
    dbfilename = f"dump_{tmp_file_name()}"
    instance = df_factory.create(dbfilename=dbfilename)
    instance.start()
    async_client = instance.client()
    await DebugPopulateSeeder(**seeder_opts).run(async_client)

    start_capture = await DebugPopulateSeeder.capture(async_client)

    # save + flush + load
    await async_client.execute_command("SAVE", format)
    assert await async_client.flushall()
    await async_client.execute_command(
        "DFLY",
        "LOAD",
        f"{dbfilename}.rdb" if format == "RDB" else f"{dbfilename}-summary.dfs",
    )

    assert (await DebugPopulateSeeder.capture(async_client)) == start_capture


@pytest.mark.parametrize("format", FILE_FORMATS)
@dfly_args({**BASIC_ARGS})
async def test_multidb(df_factory, format: str):
    """
    Test serialization of multiple logical databases
    """
    dbfilename = f"dump_{tmp_file_name()}"
    instance = df_factory.create(dbfilename=dbfilename)
    instance.start()
    async_client = instance.client()
    start_captures = []
    for dbid in range(10):
        db_client = instance.client(db=dbid)
        await DebugPopulateSeeder(key_target=1000).run(db_client)
        start_captures.append(await DebugPopulateSeeder.capture(db_client))

    # save + flush + load
    await async_client.execute_command("SAVE", format)
    assert await async_client.flushall()
    await async_client.execute_command(
        "DFLY",
        "LOAD",
        f"{dbfilename}.rdb" if format == "RDB" else f"{dbfilename}-summary.dfs",
    )

    for dbid in range(10):
        db_client = instance.client(db=dbid)
        assert (await DebugPopulateSeeder.capture(db_client)) == start_captures[dbid]


@pytest.mark.parametrize(
    "save_type, dbfilename, pattern",
    [
        ("rdb", "test-autoload1-{{timestamp}}", "test-autoload1-*.rdb"),
        ("df", "test-autoload2-{{timestamp}}", "test-autoload2-*-summary.dfs"),
        ("rdb", "test-autoload3-{{timestamp}}.rdb", "test-autoload3-*.rdb"),
        ("rdb", "test-autoload4", "test-autoload4.rdb"),
        ("df", "test-autoload5", "test-autoload5-summary.dfs"),
        ("rdb", "test-autoload6.rdb", "test-autoload6.rdb"),
    ],
)
async def test_dbfilenames(
    df_factory, tmp_dir: Path, save_type: str, dbfilename: str, pattern: str
):
    df_args = {**BASIC_ARGS, "dbfilename": dbfilename, "port": 1111}

    if save_type == "rdb":
        df_args["nodf_snapshot_format"] = None

    start_capture = None

    with df_factory.create(**df_args) as df_server:
        async with df_server.client() as client:
            await wait_available_async(client)

            # We use the seeder just to check we don't loose any files (and thus keys)
            await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(client)
            start_capture = await DebugPopulateSeeder.capture(client)

            await client.execute_command("SAVE " + save_type)

    file = find_main_file(tmp_dir, pattern)
    assert file is not None
    assert os.path.basename(file).startswith(dbfilename.split("{{")[0])

    with df_factory.create(**df_args) as df_server:
        async with df_server.client() as client:
            await wait_available_async(client)
            assert await DebugPopulateSeeder.capture(client) == start_capture


@dfly_args(
    {
        **BASIC_ARGS,
        "dbfilename": "test-redis-load-rdb",
    }
)
async def test_redis_load_snapshot(
    async_client: aioredis.Redis, df_server, redis_local_server: RedisServer, tmp_dir: Path
):
    """
    Test redis server loading dragonfly snapshot rdb format
    """
    await DebugPopulateSeeder(
        **LIGHTWEIGHT_SEEDER_ARGS, types=["STRING", "LIST", "SET", "HASH", "ZSET", "STREAM"]
    ).run(async_client)

    await async_client.lpush("list", "A" * 10_000)

    await async_client.execute_command("SAVE", "rdb")
    dbsize = await async_client.dbsize()

    await async_client.connection_pool.disconnect()
    df_server.stop()

    redis_local_server.start(dir=tmp_dir, redis7=True, dbfilename="test-redis-load-rdb.rdb")
    await asyncio.sleep(1)
    c_master = aioredis.Redis(port=redis_local_server.port)
    await c_master.ping()

    assert await c_master.dbsize() == dbsize


@pytest.mark.large
@dfly_args({**BASIC_ARGS, "dbfilename": "test-cron", "snapshot_cron": "* * * * *"})
async def test_cron_snapshot(tmp_dir: Path, async_client: aioredis.Redis):
    await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(async_client)

    file = None
    async with timeout(65):
        while file is None:
            await asyncio.sleep(1)
            file = find_main_file(tmp_dir, "test-cron-summary.dfs")

    assert file is not None, os.listdir(tmp_dir)


@pytest.mark.skip("Fails and also causes all TLS tests to fail")
@pytest.mark.large
@dfly_args({**BASIC_ARGS, "dbfilename": "test-failed-saving", "snapshot_cron": "* * * * *"})
async def test_cron_snapshot_failed_saving(df_server, tmp_dir: Path, async_client: aioredis.Redis):
    await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(async_client)

    backups_total = await get_metric_value(df_server, "dragonfly_backups")
    failed_backups_total = await get_metric_value(df_server, "dragonfly_failed_backups")

    file = None
    async with timeout(65):
        while file is None:
            await asyncio.sleep(1)
            file = find_main_file(tmp_dir, "test-failed-saving-summary.dfs")

    assert file is not None, os.listdir(tmp_dir)

    await assert_metric_value(df_server, "dragonfly_backups", backups_total + 1)
    await assert_metric_value(df_server, "dragonfly_failed_backups", failed_backups_total)

    # Remove all files from directory
    for dir_file in tmp_dir.iterdir():
        os.unlink(dir_file)

    # Make directory read-only
    os.chmod(tmp_dir, 0o555)

    # Wait for the next SAVE command
    await asyncio.sleep(65)
    file = find_main_file(tmp_dir, "test-failed-saving-summary.dfs")

    # Make directory writable again
    os.chmod(tmp_dir, 0o777)

    assert file is None, os.listdir(tmp_dir)

    await assert_metric_value(df_server, "dragonfly_backups", backups_total + 2)
    await assert_metric_value(df_server, "dragonfly_failed_backups", failed_backups_total + 1)


@pytest.mark.large
@dfly_args({**BASIC_ARGS, "dbfilename": "test-cron-set"})
async def test_set_cron_snapshot(tmp_dir: Path, async_client: aioredis.Redis):
    await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(async_client)

    await async_client.config_set("snapshot_cron", "* * * * *")

    file = None
    async with timeout(65):
        while file is None:
            await asyncio.sleep(1)
            file = find_main_file(tmp_dir, "test-cron-set-summary.dfs")

    assert file is not None


@dfly_args(
    {**BASIC_ARGS, "dbfilename": "test-save-rename-command", "rename_command": "save=save-foo"}
)
async def test_shutdown_save_with_rename(df_server):
    """Checks that on shutdown we save snapshot"""
    client = df_server.client()

    await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(client)
    start_capture = await DebugPopulateSeeder.capture(client)

    await client.connection_pool.disconnect()
    df_server.stop()
    df_server.start()
    client = df_server.client()

    await wait_available_async(client)
    assert await DebugPopulateSeeder.capture(client) == start_capture

    await client.connection_pool.disconnect()


@pytest.mark.opt_only
async def test_parallel_snapshot(async_client):
    """Dragonfly does not allow simultaneous save operations, send 2 save operations and make sure one is rejected"""

    await async_client.execute_command("debug", "populate", "1000000", "askldjh", "1000", "RAND")

    async def save():
        try:
            await async_client.execute_command("save", "rdb", "dump")
            return True
        except Exception as e:
            return False

    save_successes = sum(await asyncio.gather(*(save() for _ in range(2))), 0)
    assert save_successes == 1, "Only one SAVE must be successful"


@pytest.mark.opt_only
async def test_parallel_snapshot_race_condition(async_client):
    await async_client.execute_command("debug", "populate", "300000", "racekey", "2000", "RAND")

    async def save_operation(operation_id):
        try:
            await async_client.execute_command("save", "rdb", "dump")
            return f"success_{operation_id}"
        except Exception as e:
            return f"failed_{operation_id}_{type(e).__name__}"

    # Fire many concurrent operations to maximize collision probability
    # The more concurrent operations, the higher chance of hitting the race window
    num_concurrent = 3

    # Multiple rounds to increase overall probability
    for round_num in range(2):
        tasks = [save_operation(f"r{round_num}_op{i}") for i in range(num_concurrent)]

        # Execute all operations simultaneously to hit race condition
        results = await asyncio.gather(*tasks, return_exceptions=True)

        successes = [r for r in results if isinstance(r, str) and r.startswith("success_")]
        failures = [r for r in results if isinstance(r, str) and r.startswith("failed_")]
        exceptions = [r for r in results if not isinstance(r, str)]

        # Exactly one should succeed, rest should fail gracefully
        assert (
            len(successes) == 1
        ), f"Round {round_num}: Expected exactly 1 success, got {len(successes)} successes, {len(failures)} failures, {len(exceptions)} exceptions. Results: {results}"

        # Short delay between rounds
        await asyncio.sleep(0.05)


async def test_path_escapes(df_factory):
    """Test that we don't allow path escapes. We just check that df_server.start()
    fails because we don't have a much better way to test that."""

    df_server = df_factory.create(dbfilename="../../../../etc/passwd")
    with pytest.raises(Exception):
        df_server.start()


@dfly_args({**BASIC_ARGS, "dbfilename": "test-info-persistence"})
async def test_info_persistence_field(async_client):
    """Test is_loading field on INFO PERSISTENCE during snapshot loading"""

    await DebugPopulateSeeder(**LIGHTWEIGHT_SEEDER_ARGS).run(async_client)

    # Wait for snapshot to finish loading and try INFO PERSISTENCE
    await wait_available_async(async_client)
    assert "loading:0" in (await async_client.execute_command("INFO PERSISTENCE"))


def delete_s3_objects(bucket, prefix):
    client = boto3.client("s3")
    resp = client.list_objects_v2(
        Bucket=bucket,
        Prefix=prefix,
    )
    keys = []
    for obj in resp["Contents"]:
        keys.append({"Key": obj["Key"]})
    client.delete_objects(
        Bucket=bucket,
        Delete={"Objects": keys},
    )


# If DRAGONFLY_S3_BUCKET is configured, AWS credentials must also be
# configured.
@pytest.mark.skipif(
    "DRAGONFLY_S3_BUCKET" not in os.environ or os.environ["DRAGONFLY_S3_BUCKET"] == "",
    reason="AWS S3 snapshots bucket is not configured",
)
async def test_exit_on_s3_snapshot_load_err(df_factory):
    invalid_s3_dir = "s3://{DRAGONFLY_S3_BUCKET}" + "_invalid_bucket_"
    df_server = df_factory.create(dir=invalid_s3_dir, dbfilename="db")
    with pytest.raises(Exception):
        df_server.start()
        df_server.stop()


# If DRAGONFLY_S3_BUCKET is configured, AWS credentials must also be
# configured.
@pytest.mark.skipif(
    "DRAGONFLY_S3_BUCKET" not in os.environ or os.environ["DRAGONFLY_S3_BUCKET"] == "",
    reason="AWS S3 snapshots bucket is not configured",
)
@dfly_args({**BASIC_ARGS, "dir": "s3://{DRAGONFLY_S3_BUCKET}{DRAGONFLY_TMP}", "dbfilename": ""})
async def test_s3_snapshot(async_client, tmp_dir):
    seeder = DebugPopulateSeeder(key_target=10_000)
    await seeder.run(async_client)

    start_capture = await DebugPopulateSeeder.capture(async_client)

    try:
        # save + flush + load
        await async_client.execute_command("SAVE DF snapshot")
        assert await async_client.flushall()
        await async_client.execute_command(
            "DFLY LOAD "
            + os.environ["DRAGONFLY_S3_BUCKET"]
            + str(tmp_dir)
            + "/snapshot-summary.dfs"
        )

        assert await DebugPopulateSeeder.capture(async_client) == start_capture

    finally:
        delete_s3_objects(
            os.environ["DRAGONFLY_S3_BUCKET"],
            str(tmp_dir)[1:],
        )


# If DRAGONFLY_S3_BUCKET is configured, AWS credentials must also be
# configured.
@pytest.mark.skipif(
    "DRAGONFLY_S3_BUCKET" not in os.environ or os.environ["DRAGONFLY_S3_BUCKET"] == "",
    reason="AWS S3 snapshots bucket is not configured",
)
@dfly_args(
    {
        **BASIC_ARGS,
        "dir": "s3://{DRAGONFLY_S3_BUCKET}{DRAGONFLY_TMP}",
        "dbfilename": "snapshot-{{Y}}{{m}}{{d}}-{{timestamp}}",
    }
)
async def test_s3_reload_snapshot_after_restart(df_factory, tmp_dir):
    # this test checks that after saving to s3, stopping the server and starting a new one
    # we can load the snapshot from s3 correctly.
    try:
        instance = df_factory.create()
        instance.start()
        async_client = instance.client()
        seeder = DebugPopulateSeeder(key_target=10_000)
        await seeder.run(async_client)
        start_capture = await DebugPopulateSeeder.capture(async_client)
        # instance stop generates snapshot on exit
        instance.stop()

        new_instance = df_factory.create()
        new_instance.start()
        new_async_client = new_instance.client()

        await wait_available_async(new_async_client)

        assert await DebugPopulateSeeder.capture(new_async_client) == start_capture

    finally:
        delete_s3_objects(
            os.environ["DRAGONFLY_S3_BUCKET"],
            str(tmp_dir)[1:],
        )


# If DRAGONFLY_S3_BUCKET is configured, AWS credentials must also be
# configured.
@pytest.mark.skipif(
    "DRAGONFLY_S3_BUCKET" not in os.environ or os.environ["DRAGONFLY_S3_BUCKET"] == "",
    reason="AWS S3 snapshots bucket is not configured",
)
@dfly_args({**BASIC_ARGS})
async def test_s3_save_local_dir(async_client, tmp_dir):
    seeder = DebugPopulateSeeder(key_target=10_000)
    await seeder.run(async_client)

    try:
        # SAVE to S3 bucket with `s3_dump` as filename prefix
        await async_client.execute_command(
            "SAVE", "DF", "s3://" + os.environ["DRAGONFLY_S3_BUCKET"] + str(tmp_dir), "s3_dump"
        )

    finally:
        delete_s3_objects(
            os.environ["DRAGONFLY_S3_BUCKET"],
            str(tmp_dir)[1:] + "/s3_dump",
        )


@dfly_args({**BASIC_ARGS, "dbfilename": "test-shutdown"})
class TestDflySnapshotOnShutdown:
    SEEDER_ARGS = dict(key_target=10_000)

    """Test multi file snapshot"""

    async def _get_info_memory_fields(self, client):
        res = await client.execute_command("INFO MEMORY")
        fields = {}
        for line in res.splitlines():
            if line.startswith("#"):
                continue
            k, v = line.split(":")
            if k == "object_used_memory" or k.startswith("type_used_memory_"):
                fields.update({k: int(v)})
        return fields

    async def _delete_all_keys(self, client: aioredis.Redis):
        while True:
            keys = await client.keys()
            if len(keys) == 0:
                break
            await client.delete(*keys)

    async def test_memory_counters(self, async_client: aioredis.Redis):
        memory_counters = await self._get_info_memory_fields(async_client)
        assert memory_counters == {"object_used_memory": 0}

        seeder = DebugPopulateSeeder(**self.SEEDER_ARGS)
        await seeder.run(async_client)

        memory_counters = await self._get_info_memory_fields(async_client)
        assert all(value > 0 for value in memory_counters.values())

        await self._delete_all_keys(async_client)
        memory_counters = await self._get_info_memory_fields(async_client)
        assert memory_counters == {"object_used_memory": 0}

    async def test_snapshot(self, df_server, async_client):
        """Checks that:
        1. After reloading the snapshot file the data is the same
        2. Memory counters after loading should be non zero
        3. Memory counters after deleting all keys loaded by snapshot - this validates the memory
           counting when loading from snapshot."""

        seeder = DebugPopulateSeeder(**self.SEEDER_ARGS)
        await seeder.run(async_client)
        start_capture = await DebugPopulateSeeder.capture(async_client)

        memory_before = await self._get_info_memory_fields(async_client)

        await async_client.connection_pool.disconnect()
        df_server.stop()
        df_server.start()

        async_client = df_server.client()
        await wait_available_async(async_client)

        assert await DebugPopulateSeeder.capture(async_client) == start_capture

        memory_after = await self._get_info_memory_fields(async_client)
        for counter, value in memory_before.items():
            # Counters should be non zero.
            assert memory_after[counter] > 0

        await self._delete_all_keys(async_client)
        memory_empty = await self._get_info_memory_fields(async_client)
        assert memory_empty == {"object_used_memory": 0}


@pytest.mark.parametrize("format", FILE_FORMATS)
@dfly_args({**BASIC_ARGS, "dbfilename": "info-while-snapshot"})
async def test_infomemory_while_snapshotting(df_factory, format: str):
    instance = df_factory.create(dbfilename=f"dump_{tmp_file_name()}")
    instance.start()
    async_client = instance.client()
    await async_client.execute_command("DEBUG POPULATE 10000 key 4048 RAND")

    async def save():
        await async_client.execute_command("SAVE", format)

    save_finished = False

    async def info_in_loop():
        while not save_finished:
            await async_client.execute_command("INFO MEMORY")
            await asyncio.sleep(0.1)

    save_task = asyncio.create_task(save())
    info_task = asyncio.create_task(info_in_loop())

    await save_task
    save_finished = True
    await info_task


@dfly_args({**BASIC_ARGS, "dbfilename": "test-bgsave"})
async def test_bgsave_and_save(async_client: aioredis.Redis):
    await async_client.execute_command("DEBUG POPULATE 200000")

    await async_client.execute_command("BGSAVE")
    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("BGSAVE")

    while await is_saving(async_client):
        await asyncio.sleep(0.1)
    await async_client.execute_command("BGSAVE")
    with pytest.raises(redis.exceptions.ResponseError):
        await async_client.execute_command("SAVE")

    while await is_saving(async_client):
        await asyncio.sleep(0.1)
    await async_client.execute_command("SAVE")


@pytest.mark.exclude_epoll
@dfly_args(
    {
        **BASIC_ARGS,
        "dbfilename": "tiered-entries",
        "tiered_prefix": "/tmp/tiered/backing",
        "tiered_offload_threshold": "1.0",  # ask offloading loop to offload as much as possible
    }
)
async def test_tiered_entries(async_client: aioredis.Redis):
    """This test makes sure tieried entries are correctly persisted"""

    # With variance 4: 512 - 8192 we include small and large values
    await DebugPopulateSeeder(key_target=5000, data_size=1024, variance=4, types=["STRING"]).run(
        async_client
    )

    # Compute the capture, this brings all items back to memory... so we'll wait for offloading
    start_capture = await DebugPopulateSeeder.capture(async_client)

    # Wait until the total_stashes counter stops increasing, meaning offloading finished
    last_writes, current_writes = 0, -1
    while last_writes != current_writes:
        await asyncio.sleep(0.1)
        last_writes = current_writes
        current_writes = (await async_client.info("TIERED"))["tiered_total_stashes"]

    # Save + flush + load
    await async_client.execute_command("SAVE", "DF")
    assert await async_client.flushall()
    await async_client.execute_command(
        "DFLY",
        "LOAD",
        "tiered-entries-summary.dfs",
    )

    # Compare captures
    assert await DebugPopulateSeeder.capture(async_client) == start_capture


@pytest.mark.skip
@pytest.mark.large
@pytest.mark.opt_only
@dfly_args(
    {
        **BASIC_ARGS,
        "maxmemory": "2G",
        "dbfilename": "tiered-entries",
        "tiered_prefix": "/tmp/tiered/backing",
        "tiered_offload_threshold": "0.5",  # ask to keep below 0.5 * 2G
        "tiered_storage_write_depth": 1000,
        "tiered_experimental_cooling": "false",
    }
)
async def test_tiered_entries_throttle(async_client: aioredis.Redis):
    """
    This test ensures that tiered entries are correctly persisted and loaded back
    when memory is limited and tiered storage throttling is enabled.
    """

    # Populate the database with a large number of string keys to exceed the in-memory threshold
    # and trigger tiered storage offloading/throttling. Each key is 4KB, total ~3GB.
    await DebugPopulateSeeder(
        key_target=750_000, data_size=4096, samples=20, variance=1, types=["STRING"]
    ).run(async_client)

    # Capture the initial state of the database for later comparison
    logging.info("Seeder completed, starting capture")
    start_capture = await DebugPopulateSeeder.capture(async_client)

    # Check memory usage after population. The peak memory should remain below the set limit (2.3GB).
    # This validates that tiered storage throttling is working as expected.
    # TODO: investigate why it sometimes exceeds the expected limit.
    info = await async_client.info("ALL")
    assert info["used_memory_peak"] < 2300e6

    logging.info("Memory usage check completed, starting save and load")
    await async_client.execute_command("SAVE", "DF")
    assert await async_client.flushall()
    await async_client.execute_command(
        "DFLY",
        "LOAD",
        "tiered-entries-summary.dfs",
    )

    logging.info("Save and load completed, starting consistency checks after reload")
    # After reload, check that memory usage is still within the expected bounds.
    # This ensures that loading from tiered storage does not violate memory constraints.
    # TODO: investigate high error margin.
    info = await async_client.info("ALL")
    assert info["used_memory_peak"] < 2300e6

    assert await DebugPopulateSeeder.capture(async_client) == start_capture


@pytest.mark.large
async def test_rdb_load_with_tiering_6823(df_factory: DflyInstanceFactory):
    """
    Regression test for RDB load with tiering. Verifies that loading a snapshot
    into a tiered instance produces correct memory accounting (no underflow)
    and preserves data integrity. Covers #6823.
    """
    dbfilename = f"dump_{tmp_file_name()}"

    # 1. Create a non-tiered instance, populate with DEBUG POPULATE and save a DF snapshot.
    plain = df_factory.create(
        proactor_threads=4,
        dbfilename=dbfilename,
    )
    plain.start()
    plain_client = plain.client()

    await plain_client.execute_command("DEBUG POPULATE 50000 key 8192 RAND")
    num_keys = await plain_client.dbsize()

    await plain_client.execute_command("SAVE", "DF")
    plain.stop()

    # 2. Start a tiered instance and load the snapshot. Before the fix this would crash
    #    with "Check failed: obj_memory_usage + size >= 0" in AccountObjectMemory.
    tiered = df_factory.create(
        proactor_threads=1,
        dbfilename="",
        maxmemory="256MB",
        tiered_prefix="/tmp/tiered/rdb_load_test",
        tiered_offload_threshold="0.9",
        tiered_experimental_cooling="false",
        tiered_storage_write_depth=10,
    )
    tiered.start()
    tiered_client = tiered.client()

    assert await tiered_client.execute_command("DFLY", "LOAD", f"{dbfilename}-summary.dfs") == "OK"

    # Wait for tiering to stash entries
    @assert_eventually(timeout=30)
    async def assert_tiered_reached():
        info = await tiered_client.info("TIERED")
        assert info["tiered_entries"] > 40_000

    await assert_tiered_reached()

    info = await tiered_client.info("memory")
    used_mem = info["used_memory"]
    obj_mem = info["object_used_memory"]
    assert used_mem > 20_000_000 and used_mem < 300_000_000
    assert obj_mem > 20_000_000 and obj_mem < 300_000_000

    assert info["num_entries"] == num_keys


@dfly_args({"serialization_max_chunk_size": 4096, "proactor_threads": 1})
@pytest.mark.parametrize(
    "cont_type",
    [("HASH"), ("SET"), ("ZSET"), ("LIST"), ("STREAM")],
)
@pytest.mark.large
async def test_big_value_serialization_memory_limit(df_factory, cont_type):
    dbfilename = f"dump_{tmp_file_name()}"
    instance = df_factory.create(dbfilename=dbfilename)
    instance.start()
    client = instance.client()

    one_gb = 1_000_000_000
    elements = 1000
    element_size = 1_000_000  # 1mb

    await client.execute_command(
        f"debug populate 1 prefix {element_size} TYPE {cont_type} RAND ELEMENTS {elements}"
    )
    await asyncio.sleep(1)

    info = await client.info("ALL")
    assert info["used_memory_peak_rss"] < (one_gb * 1.2)
    # if we execute SAVE below without big value serialization we trigger the assertion below.
    # note the peak would reach (one_gb * 3) without it.
    await client.execute_command("SAVE")
    info = await client.info("ALL")

    assert info["used_memory_peak_rss"] < (one_gb * 1.3)

    await client.execute_command("FLUSHALL")
    await client.aclose()


@dfly_args(
    {
        "dir": "{DRAGONFLY_TMP}/",
        "memcached_port": 11211,
        "proactor_threads": 4,
        "dbfilename": "test-MC-flags",
    }
)
async def test_mc_flags_saving(memcached_client: MCClient, async_client: aioredis.Redis):
    async def check_flag(key, flag):
        res = memcached_client.raw_command("get " + key, "END\r\n").split()
        # workaround sometimes memcached_client.raw_command returns empty str
        if len(res) > 2:
            assert res[2].decode() == str(flag)

    assert memcached_client.set("key1", "value1", noreply=True)
    assert memcached_client.set("key2", "value1", noreply=True, expire=3600, flags=123456)
    assert memcached_client.replace("key1", "value2", expire=4000, flags=2, noreply=True)

    await check_flag("key1", 2)
    await check_flag("key2", 123456)

    await async_client.execute_command("SAVE", "DF")
    assert await async_client.flushall()

    await async_client.execute_command(
        "DFLY",
        "LOAD",
        "test-MC-flags-summary.dfs",
    )

    await check_flag("key1", 2)
    await check_flag("key2", 123456)


================================================
FILE: tests/dragonfly/test_dash_gc.py
================================================
import asyncio
from redis import asyncio as aioredis
from . import dfly_args
from .seeder import Seeder
import logging


@dfly_args({"proactor_threads": 2, "maxmemory": "1G"})
async def test_gc_merges_segments_and_shrinks_capacity(async_client: aioredis.Redis):
    value_size = 50
    target_keys = 10_000
    value = "x" * value_size

    batch_size = 100
    for batch_start in range(0, target_keys, batch_size):
        batch_end = min(batch_start + batch_size, target_keys)
        pipeline = async_client.pipeline()
        for i in range(batch_start, batch_end):
            pipeline.set(f"key{i}", value)
        await pipeline.execute()

    await asyncio.sleep(0.5)

    stats_before = await async_client.info("MEMORY")

    # Delete 90% of keys to create very sparse segments
    keys_to_delete = [f"key{i}" for i in range(target_keys) if i % 10 != 0]
    keys_left = [f"key{i}" for i in range(target_keys) if i % 10 == 0]

    for batch_start in range(0, len(keys_to_delete), 1000):
        await async_client.delete(*keys_to_delete[batch_start : batch_start + 1000])

    # Run GC with aggressive threshold to trigger merges
    segments_merged = await async_client.execute_command("DEBUG", "COMPACT-TABLE", "0.5")

    stats_after = await async_client.info("MEMORY")
    assert segments_merged > 0
    # Fewer segments means fewer buckets, so the table's total capacity must shrink
    assert stats_after["prime_capacity"] < stats_before["prime_capacity"], (
        f"Table capacity should shrink after GC: before={stats_before['prime_capacity']}, "
        f"after={stats_after['prime_capacity']}"
    )

    logging.info(
        f"COMPACT-TABLE merged {segments_merged} segments, "
        f"capacity {stats_before['prime_capacity']} -> {stats_after['prime_capacity']}"
    )

    for key in keys_left:
        res = await async_client.get(key)
        assert res == value


@dfly_args({"proactor_threads": 1, "maxmemory": "2G"})
async def test_gc_concurrent_with_seeding(async_client: aioredis.Redis):
    """
    Verify COMPACT-TABLE running concurrently with data insertion doesn't corrupt seeded data.

    a) Grow the dash table via DEBUG POPULATE with a prefix
    b) Delete all populated keys to create sparse segments
    c) Run DEBUG COMPACT-TABLE concurrently with Seeder
    d) Assert all data seeded by Seeder exists in the dash table
    """
    # a) Grow the dash table by seeding a large number of keys with a prefix
    populate_prefix = "gc-init-"
    await async_client.execute_command("DEBUG", "POPULATE", 100_000, populate_prefix, 50)

    # b) Delete all keys with the populate prefix to leave the segments sparse
    cursor = 0
    while True:
        cursor, keys = await async_client.scan(cursor, match=f"{populate_prefix}*", count=1000)
        if keys:
            await async_client.delete(*keys)
        if cursor == 0:
            break

    assert await async_client.dbsize() == 0

    # c) Run COMPACT-TABLE concurrently with Seeder so GC reclaims sparse segments
    #    while new data is being written
    key_target = 5_000
    seeder = Seeder(key_target=key_target, data_size=100)

    async def run_gc():
        for _ in range(10):
            await async_client.execute_command("DEBUG", "COMPACT-TABLE", "0.5")
            await asyncio.sleep(0.05)

    await asyncio.gather(
        seeder.run(async_client, target_deviation=0.05),
        run_gc(),
    )

    # d) Capture a reference snapshot of the data seeder wrote, then run GC again
    #    and verify the full dataset is unchanged (no corruption or partial loss).
    capture_before = await Seeder.capture(async_client)
    assert all(h != 0 for h in capture_before), "Seeder should have written data for all types"

    for _ in range(5):
        await async_client.execute_command("DEBUG", "COMPACT-TABLE", "0.5")
        await asyncio.sleep(0.05)

    capture_after = await Seeder.capture(async_client)
    assert (
        capture_before == capture_after
    ), "Data should be identical after GC: seeder dataset must survive concurrent GC runs"


================================================
FILE: tests/dragonfly/tiering_test.py
================================================
import async_timeout
import asyncio
import itertools
import logging
import pytest
import random
import redis.asyncio as aioredis

from . import dfly_args
from .seeder import DebugPopulateSeeder, Seeder as SeederV2
from .utility import (
    info_tick_timer,
    wait_for_replicas_state,
    check_all_replicas_finished,
    LogMonitor,
)
from .instance import DflyInstance, DflyInstanceFactory

BASIC_ARGS = {
    "proactor_threads": 4,
    "tiered_prefix": "/tmp/tiered/backing",
    "tiered_offload_threshold": "1.0",  # offload immediately
    "tiered_storage_write_depth": 1000,
    "maxmemory": "1G",
}


@pytest.mark.large
@pytest.mark.opt_only
@dfly_args({**BASIC_ARGS, "tiered_experimental_cooling": "false"})
async def test_basic_memory_usage(async_client: aioredis.Redis):
    """
    Loading 1GB of mixed size strings (256b-16kb) will keep most of them on disk and thus RAM remains almost unused
    """

    seeder = DebugPopulateSeeder(
        key_target=200_000, data_size=2048, variance=8, samples=100, types=["STRING"]
    )
    await seeder.run(async_client)

    # Wait for tiering stashes
    async for info, breaker in info_tick_timer(async_client, section="TIERED", timeout=60):
        with breaker:
            assert info["tiered_entries"] > 195_000

    info = await async_client.info("ALL")
    assert info["num_entries"] == 200_000

    assert (
        info["tiered_allocated_bytes"] > 195_000 * 2048 * 0.8
    )  # 0.8 just to be sure because it fluctuates due to variance

    assert info["used_memory"] < 50 * 1024 * 1024
    assert (
        info["used_memory_rss"] < 500 * 1024 * 1024
    )  # the grown table itself takes up lots of space


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.opt_only
@dfly_args(
    {
        **BASIC_ARGS,
    }
)
async def test_mixed_append(async_client: aioredis.Redis):
    """
    Issue conflicting mixed APPEND calls for a limited subset of keys with aggressive offloading in the background.
    Make sure no appends were lost
    """

    # Generate operations and shuffle them, key number `k` will have `k` append operations
    key_range = list(range(100, 300))
    ops = list(itertools.chain(*map(lambda k: itertools.repeat(k, k), key_range)))
    random.shuffle(ops)

    # Split list into n workers and run it
    async def run(sub_ops):
        p = async_client.pipeline(transaction=False)
        for k in sub_ops:
            p.append(f"k{k}", 10 * "x")
        await p.execute()

    n = 20
    await asyncio.gather(*(run(ops[i::n]) for i in range(n)))

    async for info, breaker in info_tick_timer(async_client, section="TIERED"):
        with breaker:
            assert info["tiered_entries"] > len(key_range) / 5

    # Verify lengths
    p = async_client.pipeline(transaction=False)
    for k in key_range:
        p.strlen(f"k{k}")
    res = await p.execute()

    assert res == [10 * k for k in key_range]


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.opt_only
@dfly_args(
    {
        "proactor_threads": 2,
        "tiered_prefix": "/tmp/tiered/backing_master",
        "maxmemory": "512MB",
        "cache_mode": True,
        "tiered_offload_threshold": "0.6",
        "tiered_upload_threshold": "0.2",
        "tiered_storage_write_depth": 1500,
    }
)
async def test_replication(
    async_client: aioredis.Redis, df_server: DflyInstance, df_factory: DflyInstanceFactory
):
    """
    Test replication with tiered storage for strings
    """

    # Fill master with values
    seeder = DebugPopulateSeeder(key_target=400000, data_size=2000, samples=100, types=["STRING"])
    await seeder.run(async_client)

    # Start replica
    replica = df_factory.create(
        proactor_threads=2,
        cache_mode=True,
        maxmemory="512MB",
        tiered_prefix="/tmp/tiered/backing_replica",
        tiered_offload_threshold="0.5",
        tiered_storage_write_depth=1500,
    )
    replica.start()
    replica_client = replica.client()

    # Get some keys and start tasks that append to values
    keys = await async_client.keys()

    async def fill_job():
        for i, key in enumerate(keys):
            await async_client.append(key, f":{i}:")
            await asyncio.sleep(0.005)  # limit qps

    fill_tasks = [asyncio.create_task(fill_job()) for _ in range(3)]

    # Start replication
    await replica_client.replicaof("localhost", df_server.port)
    logging.info("Waiting for replica to sync")

    # Wait for replication to finish
    try:
        async with async_timeout.timeout(500):
            await wait_for_replicas_state(replica_client)
    except asyncio.TimeoutError:
        master_info = await async_client.info("ALL")
        replica_info = await replica_client.info("ALL")
        pytest.fail(
            f"Replica did not sync in time. \nmaster: {master_info} \n\nreplica: {replica_info}"
        )

    # cancel filler and wait for replica to catch up
    for task in fill_tasks:
        task.cancel()
    await asyncio.gather(*fill_tasks, return_exceptions=True)
    await check_all_replicas_finished([replica_client], async_client, timeout=500)

    #
    # Check that everything is in sync
    hashes = await asyncio.gather(
        *(SeederV2.capture(c, types=["STRING"]) for c in [async_client, replica_client])
    )

    if len(set(hashes)) != 1:
        for key in keys:
            key_master = await async_client.get(key)
            key_replica = await replica_client.get(key)
            assert key_master == key_replica
        assert False, "Inconsistency detected, but key not determined"


@pytest.mark.large
@pytest.mark.exclude_epoll
@pytest.mark.opt_only
@dfly_args(
    {
        **BASIC_ARGS,
        "proactor_threads": 2,
        "maxmemory": "512MB",
        "serialization_max_chunk_size": 64000,
        "tiered_experimental_cooling": False,
    }
)
async def test_tiered_replication_with_hashes(
    async_client: aioredis.Redis, df_server: DflyInstance, df_factory: DflyInstanceFactory
):
    """
    Test replication from a tiered master with large string and hash data.
    Verifies that the replica does not encounter internal RDB loading errors.
    """

    # Fill master with data
    await async_client.execute_command("DEBUG POPULATE 200000 key 3000")
    await async_client.execute_command("DEBUG POPULATE 200 hash 70 RAND TYPE HASH ELEMENTS 900")

    # Start replica
    replica = df_factory.create(
        proactor_threads=1,
        dbfilename="",
    )
    replica.start()
    replica_client = replica.client()

    # Monitor replica logs for RDB loading errors in the background
    monitor = LogMonitor(replica, "Internal error when loading RDB")
    monitor.start()

    # Start replication
    await replica_client.replicaof("localhost", df_server.port)
    logging.info("Waiting for replica to sync")

    # Wait for replication to finish or RDB error
    try:
        async with async_timeout.timeout(500):
            wait_task = asyncio.create_task(wait_for_replicas_state(replica_client))
            done, _ = await asyncio.wait(
                [wait_task, monitor.task], return_when=asyncio.FIRST_COMPLETED
            )
            if monitor.task in done:
                wait_task.cancel()
                await asyncio.gather(wait_task, return_exceptions=True)
                monitor.assert_no_match()
            if wait_task in done:
                wait_task.result()  # propagate exceptions
    except asyncio.TimeoutError:
        master_info = await async_client.info("ALL")
        replica_info = await replica_client.info("ALL")
        pytest.fail(
            f"Replica did not sync in time. \nmaster: {master_info} \n\nreplica: {replica_info}"
        )
    finally:
        await monitor.stop()

    await check_all_replicas_finished([replica_client], async_client, timeout=500)
    monitor.assert_no_match()


================================================
FILE: tests/dragonfly/tls_conf_test.py
================================================
import pytest
import redis
from .utility import *
from .instance import DflyStartException


async def test_tls_no_auth(df_factory, with_tls_server_args):
    # Needs some authentication
    server = df_factory.create(**with_tls_server_args)
    with pytest.raises(DflyStartException):
        server.start()


async def test_tls_no_key(df_factory):
    # Needs a private key and certificate.
    server = df_factory.create(tls=None, requirepass="XXX")
    with pytest.raises(DflyStartException):
        server.start()


async def test_tls_password(df_factory, with_tls_server_args, with_tls_ca_cert_args):
    with df_factory.create(requirepass="XXX", **with_tls_server_args) as server:
        async with server.client(
            ssl=True, password="XXX", ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
        ) as client:
            await client.ping()


async def test_tls_client_certs(
    df_factory, with_ca_tls_server_args, with_tls_client_args, with_tls_ca_cert_args
):
    with df_factory.create(**with_ca_tls_server_args) as server:
        async with server.client(
            **with_tls_client_args, ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
        ) as client:
            await client.ping()


async def test_client_tls_no_auth(df_factory):
    server = df_factory.create(tls_replication=None)
    with pytest.raises(DflyStartException):
        server.start()


async def test_client_tls_password(df_factory):
    with df_factory.create(tls_replication=None, masterauth="XXX"):
        pass


async def test_client_tls_cert(df_factory, with_tls_server_args):
    key_args = with_tls_server_args.copy()
    key_args.pop("tls")
    with df_factory.create(tls_replication=None, **key_args):
        pass


async def test_config_enable_tls_with_ca_dir(
    df_factory, with_ca_dir_tls_server_args, with_tls_client_args
):
    server_args, ca_cert = with_ca_dir_tls_server_args
    server_args["tls"] = "true"

    with df_factory.create(**server_args) as server:
        async with server.client(**with_tls_client_args, ssl_ca_certs=ca_cert) as client:
            await client.execute_command("SET foo 44")
            res = await client.execute_command("GET foo")
            assert res == "44"


async def test_config_update_tls_certs(
    df_factory, with_tls_server_args, with_tls_ca_cert_args, tmp_dir
):
    # Generate new certificates.
    ca_key = os.path.join(tmp_dir, "ca-key-new.pem")
    ca_cert = os.path.join(tmp_dir, "ca-cert-new.pem")
    gen_ca_cert(ca_key, ca_cert)
    tls_server_key = os.path.join(tmp_dir, "df-key-new.pem")
    tls_server_req = os.path.join(tmp_dir, "df-req-new.pem")
    tls_server_cert = os.path.join(tmp_dir, "df-cert-new.pem")
    gen_certificate(
        ca_key,
        ca_cert,
        tls_server_req,
        tls_server_key,
        tls_server_cert,
    )

    with df_factory.create(requirepass="XXX", **with_tls_server_args) as server:
        async with server.client(
            ssl=True, password="XXX", ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
        ) as client:
            await client.config_set(
                "tls_key_file",
                tls_server_key,
            )
            await client.config_set("tls_cert_file", tls_server_cert)
            # Note must still set `tls true` to reload the TLS context.
            await client.config_set("tls", "true")

            # The existing connection should still work.
            await client.ping()

        # Connecting with the old CA should fail.
        with pytest.raises(redis.exceptions.ConnectionError):
            async with server.client(
                ssl=True, password="XXX", ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
            ) as client:
                await client.ping()

        # Connecting with the new CA should succeed.
        async with server.client(ssl=True, password="XXX", ssl_ca_certs=ca_cert) as client:
            await client.ping()


async def test_config_enable_tls(
    df_factory, with_ca_tls_server_args, with_tls_client_args, with_tls_ca_cert_args
):
    with df_factory.create() as server:
        async with server.client() as client:
            await client.ping()

            # Note the order here matters as flags are applied in order.
            await client.config_set(
                "tls_key_file",
                with_ca_tls_server_args["tls_key_file"],
            )
            await client.config_set(
                "tls_cert_file",
                with_ca_tls_server_args["tls_cert_file"],
            )
            await client.config_set(
                "tls_ca_cert_file",
                with_ca_tls_server_args["tls_ca_cert_file"],
            )
            await client.config_set(
                "tls",
                "true",
            )

            # The existing client should still be connected.
            await client.ping()

        # Connecting without TLS should fail.
        with pytest.raises(redis.exceptions.ConnectionError):
            async with server.client() as client_unauth:
                await client_unauth.ping()

        # Connecting with TLS should succeed.
        async with server.client(
            **with_tls_client_args, ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
        ) as client_tls:
            await client_tls.ping()


async def test_config_disable_tls(
    df_factory, with_ca_tls_server_args, with_tls_client_args, with_tls_ca_cert_args
):
    with df_factory.create(**with_ca_tls_server_args) as server:
        async with server.client(
            **with_tls_client_args, ssl_ca_certs=with_tls_ca_cert_args["ca_cert"]
        ) as client_tls:
            await client_tls.config_set("tls", "false")

        # Connecting without TLS should succeed.
        async with server.client() as client_unauth:
            await client_unauth.ping()


================================================
FILE: tests/dragonfly/utility.py
================================================
import asyncio
import functools
import itertools
import logging
import sys
import wrapt
from redis import asyncio as aioredis
import redis
import random
import string
import time
import difflib
import json
import subprocess
import pytest
import os
import fakeredis
from typing import Iterable, Union
from enum import Enum
import re


def tmp_file_name():
    return "".join(random.choices(string.ascii_letters, k=10))


def chunked(n, iterable):
    """Transform iterable into iterator of chunks of size n"""
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk


def eprint(*args, **kwargs):
    """Print to stderr"""
    print(*args, file=sys.stderr, **kwargs)


def gen_test_data(n, start=0, seed=None):
    for i in range(start, n):
        yield "k-" + str(i), "v-" + str(i) + ("-" + str(seed) if seed else "")


def batch_fill_data(client, gen, batch_size=100):
    for group in chunked(batch_size, gen):
        client.mset({k: v for k, v, in group})


async def tick_timer(func, timeout=5, step=0.1):
    """
    Async generator with automatic break when all asserts pass

    for object, breaker in tick_timer():
        with breaker:
            assert conditions on object

    If the generator times out, the last failed assert is raised
    """

    class ticker_breaker:
        def __init__(self):
            self.exc = None
            self.entered = False

        def __enter__(self):
            self.entered = True

        def __exit__(self, exc_type, exc_value, trace):
            if exc_value:
                self.exc = exc_value
                return True

    last_error = None
    start = time.time()
    while time.time() - start < timeout:
        breaker = ticker_breaker()
        yield (await func(), breaker)
        if breaker.entered and not breaker.exc:
            return

        last_error = breaker.exc
        await asyncio.sleep(step)

    if last_error:
        raise TimeoutError("Timed out!") from last_error
    raise TimeoutError("Timed out!")


async def info_tick_timer(client: aioredis.Redis, section=None, **kwargs):
    async for x in tick_timer(lambda: client.info(section), **kwargs):
        yield x


# wait for a process becomes "responsive":
# for a master - waits that it finishes loading a snapshot if it's budy doing so,
# and for replica it waits until it finishes its full sync stage and reaches the stable sync state.
async def wait_available_async(
    clients: Union[aioredis.Redis, Iterable[aioredis.Redis]], timeout=120
):
    if not isinstance(clients, aioredis.Redis):
        # Syntactic sugar to seamlessly handle an array of clients.
        return await asyncio.gather(*(wait_available_async(c) for c in clients))

    """Block until instance exits loading phase"""
    # First we make sure that ping passes
    start = time.time()
    while (time.time() - start) < timeout:
        try:
            await clients.ping()
            break
        except aioredis.BusyLoadingError as e:
            assert "Dragonfly is loading the dataset in memory" in str(e)
    timeout -= time.time() - start
    if timeout <= 0:
        raise TimeoutError("Timed out!")

    # Secondly for replicas, we make sure they reached stable state replicaton
    async for info, breaker in info_tick_timer(clients, "REPLICATION", timeout=timeout):
        with breaker:
            assert info["role"] == "master" or "slave_repl_offset" in info, info


class SizeChange(Enum):
    SHRINK = 0
    NO_CHANGE = 1
    GROW = 2


class ValueType(Enum):
    STRING = 0
    LIST = 1
    SET = 2
    HSET = 3
    ZSET = 4
    JSON = 5


class CommandGenerator:
    """Class for generating complex command sequences"""

    def __init__(
        self,
        target_keys,
        val_size,
        huge_val_count,
        huge_val_size,
        batch_size,
        max_multikey,
        unsupported_types=[],
    ):
        self.key_cnt_target = target_keys
        self.val_size = val_size
        self.batch_size = min(batch_size, target_keys)
        self.max_multikey = max_multikey
        self.unsupported_types = unsupported_types

        # Generate sorted list of random samples in target_keys range
        self.huge_val_sample = sorted(random.sample(range(target_keys), huge_val_count))
        self.huge_val_size = huge_val_size

        # Key management
        self.key_sets = [set() for _ in ValueType]
        self.key_cursor = 0
        self.key_cnt = 0

        # Grow factors
        self.diff_speed = 5
        self.base_diff_prob = 0.2
        self.min_diff_prob = 0.1

    def keys(self):
        return itertools.chain(*self.key_sets)

    def keys_and_types(self):
        return ((k, t) for t in list(ValueType) for k in self.set_for_type(t))

    def set_for_type(self, t: ValueType):
        return self.key_sets[t.value]

    def add_key(self, t: ValueType):
        """Add new key of type t"""
        k, self.key_cursor = self.key_cursor, self.key_cursor + 1
        self.set_for_type(t).add(k)
        return k

    def random_type(self):
        return random.choice([t for t in ValueType if (t not in self.unsupported_types)])

    def randomize_nonempty_set(self):
        """Return random non-empty set and its type"""
        if not any(self.key_sets):
            return None, None

        t = self.random_type()
        s = self.set_for_type(t)

        if len(s) == 0:
            return self.randomize_nonempty_set()
        else:
            return s, t

    def randomize_key(self, t=None, pop=False):
        """Return random key and its type"""
        if t is None:
            s, t = self.randomize_nonempty_set()
        else:
            s = self.set_for_type(t)

        if s is None or len(s) == 0:
            return None, None

        k = s.pop()
        if not pop:
            s.add(k)

        return k, t

    def generate_val(self, t: ValueType, idx):
        """Generate filler value of configured size for type t"""

        # If current key count matches huge val sample than we will create one element with huge val size.
        generate_huge_val = False
        if len(self.huge_val_sample) and self.huge_val_sample[0] == (self.key_cnt + idx):
            generate_huge_val = True
            # Remove this sample from list
            self.huge_val_sample.pop(0)

        def rand_str(k=3, s=""):
            # Use small k value to reduce mem usage and increase number of ops
            return s.join(random.choices(string.ascii_letters, k=k))

        if t == ValueType.STRING:
            # Random string for MSET
            return (rand_str(self.huge_val_size if generate_huge_val else self.val_size),)
        elif t == ValueType.LIST:
            # Random sequence k-letter elements for LPUSH
            list_size = self.val_size // 4
            element_size = (
                self.huge_val_size // list_size if generate_huge_val else self.val_size // list_size
            )
            return tuple(rand_str(element_size) for i in range(list_size))
        elif t == ValueType.SET:
            # Random sequence of k-letter elements for SADD
            set_size = self.val_size // 4
            element_size = (
                self.huge_val_size // set_size if generate_huge_val else self.val_size // set_size
            )
            return tuple(rand_str(element_size) for i in range(set_size))
        elif t == ValueType.HSET:
            # Random sequence of k-letter keys + int and two start values for HSET
            hset_size = self.val_size // 5
            element_size = (
                self.huge_val_size // hset_size if generate_huge_val else self.val_size // hset_size
            )
            elements = (
                (
                    rand_str(element_size),
                    random.randint(0, self.val_size),
                )
                for i in range(hset_size)
            )
            return ("v0", 0, "v1", 0) + tuple(itertools.chain(*elements))
        elif t == ValueType.ZSET:
            # Random sequnce of k-letter members and int score for ZADD
            # The length of the sequence will vary between val_size/4 and 130.
            # This ensures that we test both the ZSET implementation with listpack and the our custom BPtree.
            value_sizes = [self.val_size // 4, 130]
            probabilities = [8, 1]
            zset_size = random.choices(value_sizes, probabilities)[0]
            element_size = (
                self.huge_val_size // zset_size if generate_huge_val else self.val_size // zset_size
            )
            elements = (
                (
                    random.randint(0, self.val_size),
                    rand_str(element_size),
                )
                for i in range(zset_size)
            )
            return tuple(itertools.chain(*elements))
        elif t == ValueType.JSON:
            # Json object with keys:
            # - arr (array of random strings)
            # - ints (array of objects {i:random integer})
            # - i (random integer)
            json_size = self.val_size // 6
            element_size = (
                self.huge_val_size // json_size if generate_huge_val else self.val_size // json_size
            )
            ints = [{"i": random.randint(0, 100)} for i in range(json_size)]
            strs = [rand_str(element_size) for i in range(json_size)]
            return "$", json.dumps({"arr": strs, "ints": ints, "i": random.randint(0, 100)})
        else:
            assert False, "Invalid ValueType"

    def gen_shrink_cmd(self):
        """
        Generate command that shrinks data: DEL of random keys or almost immediate <=50ms PEXPIRE.
        """
        if random.random() < 0.3:
            key, _ = self.randomize_key(pop=True)
            if key == None:
                return None, 0
            return ("PEXPIRE", f"k{key}", f"{random.randint(0, 50)}"), -1
        else:
            keys_gen = (
                self.randomize_key(pop=True) for _ in range(random.randint(1, self.max_multikey))
            )
            keys = [f"k{k}" for k, _ in keys_gen if k is not None]

            if len(keys) == 0:
                return None, 0
            return ("DEL", *keys), -len(keys)

    UPDATE_ACTIONS = [
        ("APPEND {k} {val}", ValueType.STRING),
        ("SETRANGE {k} 10 {val}", ValueType.STRING),
        ("LPUSH {k} {val}", ValueType.LIST),
        ("LPOP {k}", ValueType.LIST),
        ("SADD {k} {val}", ValueType.SET),
        # ("SPOP {k}", ValueType.SET),  # Disabled because it is inconsistent
        ("HSETNX {k} v0 {val}", ValueType.HSET),
        ("HINCRBY {k} v1 1", ValueType.HSET),
        ("ZPOPMIN {k} 1", ValueType.ZSET),
        ("ZADD {k} 0 {val}", ValueType.ZSET),
        ("JSON.NUMINCRBY {k} $..i 1", ValueType.JSON),
        ("JSON.ARRPOP {k} $.arr", ValueType.JSON),
        ('JSON.ARRAPPEND {k} $.arr "{val}"', ValueType.JSON),
    ]

    def gen_update_cmd(self):
        """
        Generate command that makes no change to keyset: random of UPDATE_ACTIONS.
        """
        cmd, t = random.choice(self.UPDATE_ACTIONS)
        k, _ = self.randomize_key(t)
        val = "".join(random.choices(string.ascii_letters, k=3))
        return cmd.format(k=f"k{k}", val=val).split() if k is not None else None, 0

    GROW_ACTINONS = {
        ValueType.STRING: "MSET",
        ValueType.LIST: "LPUSH",
        ValueType.SET: "SADD",
        ValueType.HSET: "HMSET",
        ValueType.ZSET: "ZADD",
        ValueType.JSON: "JSON.MSET",
    }

    def gen_grow_cmd(self):
        """
        Generate command that grows keyset: Initialize key of random type with filler value.
        """
        # TODO: Implement COPY in Dragonfly.
        t = self.random_type()
        if t in [ValueType.STRING, ValueType.JSON]:
            count = random.randint(1, self.max_multikey)
        else:
            count = 1

        keys = (self.add_key(t) for _ in range(count))
        payload = itertools.chain(
            *((f"k{k}",) + self.generate_val(t, idx) for idx, k in enumerate(keys))
        )
        filtered_payload = filter(lambda p: p is not None, payload)

        return (self.GROW_ACTINONS[t],) + tuple(filtered_payload), count

    def make(self, action):
        """Create command for action and return it together with number of keys added (removed)"""
        if action == SizeChange.SHRINK:
            return self.gen_shrink_cmd()
        elif action == SizeChange.NO_CHANGE:
            return self.gen_update_cmd()
        else:
            return self.gen_grow_cmd()

    def reset(self):
        self.key_sets = [set() for _ in ValueType]
        self.key_cursor = 0
        self.key_cnt = 0

    def size_change_probs(self):
        """Calculate probabilities of size change actions"""
        # Relative distance to key target
        dist = (self.key_cnt_target - self.key_cnt) / self.key_cnt_target
        # Shrink has a roughly twice as large expected number of changed keys than grow
        return [
            max(self.base_diff_prob - self.diff_speed * dist, self.min_diff_prob),
            15.0,
            max(self.base_diff_prob + 2 * self.diff_speed * dist, self.min_diff_prob),
        ]

    def generate(self):
        """Generate next batch of commands, return it and ratio of current keys to target"""
        changes = []
        cmds = []
        while len(cmds) < self.batch_size:
            # Re-calculating changes in small groups
            if len(changes) == 0:
                changes = random.choices(list(SizeChange), weights=self.size_change_probs(), k=20)

            cmd, delta = self.make(changes.pop())
            if cmd is not None:
                cmds.append(cmd)
                self.key_cnt += delta
        return cmds, self.key_cnt / self.key_cnt_target


class DataCapture:
    """
    Captured state of single database.
    """

    def __init__(self, entries):
        self.entries = entries

    def compare(self, other):
        if self.entries == other.entries:
            return True

        self._print_diff(other)
        return False

    def _print_diff(self, other):
        eprint("=== DIFF ===")
        printed = 0
        diff = difflib.ndiff(self.entries, other.entries)
        for line in diff:
            if line.startswith(" "):
                continue
            eprint(line)
            if printed >= 20:
                eprint("... omitted ...")
                break
            printed += 1
        eprint("=== END DIFF ===")


class DflySeeder:
    """
    Data seeder with support for multiple types and commands.

    Usage:

    Create a seeder with target number of keys (100k) of specified size (200) and work on 5 dbs,

        seeder = new DflySeeder(keys=100_000, value_size=200, dbcount=5)

    Stop when we are in 5% of target number of keys (i.e. above 95_000)
    Because its probabilistic we might never reach exactly 100_000.

        await seeder.run(target_deviation=0.05)

    Run 3000 commands in stable state, crate a capture and compare it to
    replica on port 1112

        await seeder.run(target_op=3000)
        capture = await seeder.capture()
        assert await seeder.compare(capture, port=1112)
    """

    def __init__(
        self,
        port=6379,
        keys=1000,
        val_size=50,
        huge_value_count=5,
        huge_value_size=100000,
        batch_size=100,
        max_multikey=5,
        dbcount=1,
        multi_transaction_probability=0.3,
        log_file=None,
        unsupported_types=[],
        stop_on_failure=True,
        cluster_mode=False,
        mirror_to_fake_redis=False,
        pipeline=True,
    ):
        if cluster_mode:
            max_multikey = 1
            multi_transaction_probability = 0
            unsupported_types.append(ValueType.JSON)  # Cluster aio client doesn't support JSON

        self.cluster_mode = cluster_mode
        self.gen = CommandGenerator(
            keys,
            val_size,
            huge_value_count,
            huge_value_size,
            batch_size,
            max_multikey,
            unsupported_types,
        )
        self.port = port
        self.dbcount = dbcount
        self.multi_transaction_probability = multi_transaction_probability
        self.stop_flag = False
        self.stop_on_failure = stop_on_failure
        self.fake_redis = None
        self.use_pipeline = pipeline

        self.log_file = log_file
        if self.log_file is not None:
            open(self.log_file, "w").close()

        if mirror_to_fake_redis:
            logging.debug("Creating FakeRedis instance")
            self.fake_redis = fakeredis.FakeAsyncRedis()
            self.use_pipeline = False

    async def run(self, target_ops=None, target_deviation=None):
        """
        Run a seeding cycle on all dbs either until stop(), a fixed number of commands (target_ops)
        or until reaching an allowed deviation from the target number of keys (target_deviation)
        """
        logging.debug(f"Running ops:{target_ops} deviation:{target_deviation}")
        self.stop_flag = False
        queues = [asyncio.Queue(maxsize=3) for _ in range(self.dbcount)]
        producer = asyncio.create_task(
            self._generator_task(queues, target_ops=target_ops, target_deviation=target_deviation)
        )
        consumers = [
            asyncio.create_task(self._executor_task(i, queue)) for i, queue in enumerate(queues)
        ]

        time_start = time.time()

        cmdcount = await producer
        for consumer in consumers:
            await consumer

        took = time.time() - time_start
        qps = round(cmdcount * self.dbcount / took, 2)
        logging.debug(f"Filling took: {took}, QPS: {qps}")

    def stop(self):
        """Stop all invocations to run"""
        self.stop_flag = True

    def reset(self):
        """Reset internal state. Needs to be called after flush or restart"""
        self.gen.reset()

    async def capture_fake_redis(self):
        keys = sorted(list(self.gen.keys_and_types()))
        # TODO: support multiple databases
        assert self.dbcount == 1
        assert self.fake_redis != None
        capture = DataCapture(await self._capture_entries(self.fake_redis, keys))
        return [capture]

    async def capture(self, port=None):
        """Create DataCapture for all dbs"""

        if port is None:
            port = self.port
        logging.debug(f"Starting capture from {port=}")
        keys = sorted(list(self.gen.keys_and_types()))

        captures = await asyncio.gather(
            *(self._capture_db(port=port, target_db=db, keys=keys) for db in range(self.dbcount))
        )
        return captures

    async def compare(self, initial_captures, port=6379):
        """Compare data capture with all dbs of instance and return True if all dbs are correct"""
        print(f"comparing capture to {port}")
        target_captures = await self.capture(port=port)

        for db, target_capture, initial_capture in zip(
            range(self.dbcount), target_captures, initial_captures
        ):
            print(f"comparing capture to {port}, db: {db}")
            if not initial_capture.compare(target_capture):
                eprint(f">>> Inconsistent data on port {port}, db {db}")
                return False
        return True

    def target(self, key_cnt):
        self.gen.key_cnt_target = key_cnt

    def _make_client(self, **kwargs):
        if self.cluster_mode:
            return aioredis.RedisCluster(host="127.0.0.1", **kwargs)
        else:
            return aioredis.Redis(**kwargs)

    async def _close_client(self, client):
        if not self.cluster_mode:
            await client.connection_pool.disconnect()
        await client.aclose()

    async def _capture_db(self, port, target_db, keys):
        client = self._make_client(port=port, db=target_db)
        capture = DataCapture(await self._capture_entries(client, keys))

        await self._close_client(client)

        return capture

    async def _generator_task(self, queues, target_ops=None, target_deviation=None):
        cpu_time = 0
        submitted = 0
        batches = 0
        deviation = 0.0

        file = None
        if self.log_file:
            file = open(self.log_file, "a")

        def should_run():
            if self.stop_flag:
                return False
            if target_ops is not None and submitted >= target_ops:
                return False
            if target_deviation is not None and (
                deviation > 1 or abs(1 - deviation) < target_deviation
            ):
                return False
            return True

        def stringify_cmd(cmd):
            if isinstance(cmd, tuple):
                return " ".join(str(c) for c in cmd)
            else:
                return str(cmd)

        while should_run():
            start_time = time.time()
            blob, deviation = self.gen.generate()
            is_multi_transaction = random.random() < self.multi_transaction_probability
            tx_data = (blob, is_multi_transaction)
            cpu_time += time.time() - start_time

            await asyncio.gather(*(q.put(tx_data) for q in queues))
            submitted += len(blob)
            batches += 1

            if file is not None:
                pattern = "MULTI\n{}\nEXEC\n" if is_multi_transaction else "{}\n"
                file.write(pattern.format("\n".join(stringify_cmd(cmd) for cmd in blob)))

            print(".", end="", flush=True)
            await asyncio.sleep(0.0)

        print("\ncpu time", cpu_time, "batches", batches, "commands", submitted)

        await asyncio.gather(*(q.put(None) for q in queues))
        for q in queues:
            await q.join()

        if file is not None:
            file.flush()

        return submitted

    async def _executor_task(self, db, queue):
        client = self._make_client(port=self.port, db=db)

        while True:
            tx_data = await queue.get()
            if tx_data is None:
                queue.task_done()
                break

            try:
                if self.use_pipeline:
                    pipe = client.pipeline(transaction=tx_data[1])
                    for cmd in tx_data[0]:
                        pipe.execute_command(*cmd)
                    await pipe.execute()
                else:
                    for cmd in tx_data[0]:
                        dfly_resp = await client.execute_command(*cmd)
                        # To mirror consistently to Fake Redis we must only send to it successful
                        # commands. We can't use pipes because they might succeed partially.
                        if self.fake_redis is not None:
                            fake_resp = await self.fake_redis.execute_command(*cmd)
                            assert dfly_resp == fake_resp
            except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e:
                if self.stop_on_failure:
                    await self._close_client(client)
                    raise SystemExit(e)
            except Exception as e:
                await self._close_client(client)
                raise SystemExit(e)
            queue.task_done()

        await self._close_client(client)

    CAPTURE_COMMANDS = {
        ValueType.STRING: lambda pipe, k: pipe.get(k),
        ValueType.LIST: lambda pipe, k: pipe.lrange(k, 0, -1),
        ValueType.SET: lambda pipe, k: pipe.smembers(k),
        ValueType.HSET: lambda pipe, k: pipe.hgetall(k),
        ValueType.ZSET: lambda pipe, k: pipe.zrange(k, start=0, end=-1, withscores=True),
        ValueType.JSON: lambda pipe, k: pipe.execute_command("JSON.GET", k, "$"),
    }

    CAPTURE_EXTRACTORS = {
        ValueType.STRING: lambda res, tostr: (tostr(res),),
        ValueType.LIST: lambda res, tostr: (tostr(s) for s in res),
        ValueType.SET: lambda res, tostr: sorted(tostr(s) for s in res),
        ValueType.HSET: lambda res, tostr: sorted(
            tostr(k) + "=" + tostr(v) for k, v in res.items()
        ),
        ValueType.ZSET: lambda res, tostr: (tostr(s) + "-" + str(f) for (s, f) in res),
        ValueType.JSON: lambda res, tostr: (tostr(res),),
    }

    async def _capture_entries(self, client, keys):
        def tostr(b):
            return b.decode("utf-8") if isinstance(b, bytes) else str(b)

        entries = []
        for group in chunked(self.gen.batch_size * 2, keys):
            pipe = client.pipeline(transaction=False)
            for k, t in group:
                self.CAPTURE_COMMANDS[t](pipe, f"k{k}")

            results = await pipe.execute()
            for (k, t), res in zip(group, results):
                out = f"{t.name} k{k}: " + " ".join(self.CAPTURE_EXTRACTORS[t](res, tostr))
                entries.append(out)

        return entries


class DflySeederFactory:
    """
    Used to pass params to a DflySeeder.
    """

    def __init__(self, log_file=None):
        self.log_file = log_file

    def __repr__(self) -> str:
        return f"DflySeederFactory(log_file={self.log_file})"

    def create(self, **kwargs):
        return DflySeeder(log_file=self.log_file, **kwargs)


def gen_ca_cert(ca_key_path, ca_cert_path):
    # We first need to generate the tls certificates to be used by the server

    # Generate CA (certificate authority) key and self-signed certificate
    # In production, CA should be generated by a third party authority
    # Expires in one day and is not encrtypted (-nodes)
    # X.509 format for the key
    step = rf"openssl req -x509 -newkey rsa:4096 -days 1 -nodes -keyout {ca_key_path} -out {ca_cert_path} "
    step += '-subj "/C=GR/ST=SKG/L=Thessaloniki/O=KK/OU=AcmeStudios/CN=localhost/emailAddress=acme@gmail.com"'
    subprocess.run(step, shell=True)


def gen_certificate(
    ca_key_path, ca_certificate_path, certificate_request_path, private_key_path, certificate_path
):
    # Generate Dragonfly's private key and certificate signing request (CSR)
    step1 = rf"openssl req -newkey rsa:4096 -nodes -keyout {private_key_path} -out {certificate_request_path} "
    step1 += '-subj "/C=GR/ST=SKG/L=Thessaloniki/O=KK/OU=Comp/CN=localhost/emailAddress=does_not_exist@gmail.com"'
    subprocess.run(step1, shell=True)

    # Use CA's private key to sign dragonfly's CSR and get back the signed certificate
    step2 = rf"openssl x509 -req -in {certificate_request_path} -days 1 -CA {ca_certificate_path} -CAkey {ca_key_path} -CAcreateserial -out {certificate_path}"
    subprocess.run(step2, shell=True)


class EnvironCntx:
    def __init__(self, **kwargs):
        self.updates = kwargs
        self.undo = {}

    def __enter__(self):
        for k, v in self.updates.items():
            if k in os.environ:
                self.undo[k] = os.environ[k]
            os.environ[k] = v

    def __exit__(self, exc_type, exc_value, exc_traceback):
        for k, v in self.updates.items():
            if k in self.undo:
                os.environ[k] = self.undo[k]
            else:
                del os.environ[k]


async def is_saving(c_client: aioredis.Redis):
    return "saving:1" in (await c_client.execute_command("INFO PERSISTENCE"))


def assert_eventually(wrapped=None, *, times=100, timeout=None):
    if wrapped is None:
        return functools.partial(assert_eventually, times=times, timeout=timeout)

    @wrapt.decorator
    async def wrapper(wrapped, instance, args, kwargs):
        max_attempts = times
        if timeout is not None:  # If timeout is set, we will ignore times and use timeout.
            start = time.time()
            max_attempts = 1 << 32  # Effectively infinite

        for attempt in range(max_attempts):
            try:
                result = await wrapped(*args, **kwargs)
                return result
            except AssertionError:
                if timeout is not None and (time.time() - start) > timeout:
                    raise
                if attempt == max_attempts - 1:
                    raise
                await asyncio.sleep(0.1)

    return wrapper(wrapped)


def skip_if_not_in_github(reason: str = "Redis server not found"):
    if os.getenv("GITHUB_ACTIONS") == None:
        pytest.skip(reason)


class ExpirySeeder:
    def __init__(self, stop_on_failure=True, timeout=3):
        self.stop_flag = False
        self.i = 0
        self.batch_size = 200
        self.stop_on_failure = stop_on_failure
        self.timeout = timeout

    async def run(self, client):
        while not self.stop_flag:
            try:
                pipeline = client.pipeline(transaction=False)
                for i in range(0, self.batch_size):
                    pipeline.execute_command(f"SET tmp{self.i} bar{self.i} EX {self.timeout}")
                    self.i = self.i + 1
                await pipeline.execute()
            except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e:
                if self.stop_on_failure:
                    return
                else:
                    raise SystemExit(e)

    async def wait_until_n_inserts(self, count):
        while not self.i > count:
            await asyncio.sleep(0.5)

    def stop(self):
        self.stop_flag = True


def extract_int_after_prefix(prefix, line):
    match = re.search(prefix + "(\\d+)", line)
    assert match
    return int(match.group(1))


async def wait_for_replicas_state(*clients, state="online", node_role="slave", timeout=0.05):
    """Wait until all clients (replicas) reach passed state"""
    while len(clients) > 0:
        await asyncio.sleep(timeout)
        roles = await asyncio.gather(*(c.role() for c in clients))
        clients = [c for c, role in zip(clients, roles) if role[0] != node_role or role[3] != state]


async def check_replica_finished_exec(c_replica: aioredis.Redis, m_offset):
    role = await c_replica.role()
    if role[0] != "slave" or role[3] != "online":
        return False
    syncid, r_offset = await c_replica.execute_command("DEBUG REPLICA OFFSET")

    logging.debug(f"  offset {syncid} {r_offset} {m_offset}")
    return r_offset == m_offset


async def check_all_replicas_finished(c_replicas, c_master, timeout=20):
    logging.debug("Waiting for replicas to finish")

    waiting_for = list(c_replicas)
    start = time.time()
    while (time.time() - start) < timeout:
        if not waiting_for:
            logging.debug("All replicas finished after %s seconds", time.time() - start)
            return
        await asyncio.sleep(0.2)
        m_offset = await c_master.execute_command("DFLY REPLICAOFFSET")
        finished_list = await asyncio.gather(
            *(check_replica_finished_exec(c, m_offset) for c in waiting_for)
        )

        # Remove clients that finished from waiting list
        waiting_for = [c for (c, finished) in zip(waiting_for, finished_list) if not finished]

    first_r: aioredis.Redis = waiting_for[0]
    logging.error("Replica not finished, role %s", await first_r.role())
    raise RuntimeError("Not all replicas finished in time!")


class LogMonitor:
    """
    Monitors an instance's INFO log files for a specific pattern in the background.

    Usage:
        monitor = LogMonitor(instance, "Internal error when loading RDB")
        monitor.start()
        # ... do work ...
        await monitor.stop()       # stops polling
        monitor.assert_no_match()  # raises AssertionError if pattern was found

    Can also be used with asyncio.wait to fail fast:
        done, _ = await asyncio.wait(
            [work_task, monitor.task], return_when=asyncio.FIRST_COMPLETED
        )
        if monitor.task in done:
            monitor.assert_no_match()
    """

    def __init__(self, instance, pattern: str, poll_interval: float = 0.5):
        self.instance = instance
        self.pattern = pattern
        self.poll_interval = poll_interval
        self.matched_lines = []
        self._stop_event = asyncio.Event()
        self.task = None

    def start(self):
        self.task = asyncio.create_task(self._poll())

    async def _poll(self):
        file_positions = {}
        while not self._stop_event.is_set():
            for log_path in self.instance.log_files:
                if "INFO" not in log_path:
                    continue
                pos = file_positions.get(log_path, 0)
                try:
                    with open(log_path, "r") as f:
                        f.seek(pos)
                        new_content = f.read()
                        file_positions[log_path] = f.tell()
                except FileNotFoundError:
                    continue
                for line in new_content.splitlines():
                    if self.pattern in line:
                        self.matched_lines.append(line.strip())
                        self._stop_event.set()
                        return
            await asyncio.sleep(self.poll_interval)

    async def stop(self):
        self._stop_event.set()
        if self.task:
            self.task.cancel()
            try:
                await self.task
            except asyncio.CancelledError:
                pass

    def assert_no_match(self):
        assert not self.matched_lines, f"Log pattern '{self.pattern}' found:\n" + "\n".join(
            self.matched_lines
        )


================================================
FILE: tests/dragonfly/valkey_search/README.md
================================================
# Valkey-Search Integration Tests for Dragonfly

Integration tests from [valkey-search](https://github.com/valkey-io/valkey-search) project, adapted to run on Dragonfly without modifying the original test code.

## Prerequisites

1. Build Dragonfly

2. Install Python dependencies:
   ```bash
   pip install -r tests/dragonfly/requirements.txt
   ```

## Setup

1. Sync tests from valkey-search:
   ```bash
   cd tests/dragonfly/valkey_search
   ./sync-valkey-search-tests.sh
   ```

2. Set environment variables:
   ```bash
   export DRAGONFLY_PATH="/path/to/dragonfly/build-dbg/dragonfly"
   export ROOT_DIR="/path/to/dragonfly/tests/dragonfly/valkey_search"
   ```

## Running Tests

```bash
# All tests
pytest tests/dragonfly/valkey_search/integration/ -v

# Specific test file
pytest tests/dragonfly/valkey_search/integration/test_ft_create.py -v

# Specific test
pytest tests/dragonfly/valkey_search/integration/test_ft_create.py::TestSearchFTCreateCMD::test_ft_create_fails_on_replica_cmd -v
```

## Structure

```
tests/dragonfly/valkey_search/
 __init__.py                          # Mock framework for valkey-search imports
 conftest.py                          # Pytest configuration
 util.py                              # Utility functions (waiters)
 valkey_search_test_case_dragonfly.py # Dragonfly adapter (real replicas, clusters)
 sync-valkey-search-tests.sh          # Script to sync tests
 integration/                         # Synced from valkey-search (not in git)
```

## How It Works

1. **Infrastructure files** (committed to git) provide compatibility layer
2. **Test files** (in `integration/`, not in git) are synced from valkey-search
3. **Mock framework** (`__init__.py`) replaces valkey-search imports with Dragonfly equivalents
4. **Adapter** (`valkey_search_test_case_dragonfly.py`) creates real Dragonfly instances with replicas
5. **Original tests run unchanged** - all adaptation happens in infrastructure layer
6. **Python 3.8 compatibility** - sync script patches all `.py` files to add `from __future__ import annotations`


================================================
FILE: tests/dragonfly/valkey_search/__init__.py
================================================
"""
Valkey-search integration tests for Dragonfly

This module automatically adapts original valkey-search tests to run on Dragonfly
by replacing valkeytestframework imports with Dragonfly equivalents.
"""

import sys
import types
import os
from . import util

# Check if integration directory exists before attempting import
_integration_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "integration")
INTEGRATION_AVAILABLE = os.path.isdir(_integration_dir)

if not INTEGRATION_AVAILABLE:
    pass  # pytest_ignore_collect in conftest.py will skip these tests
else:
    from .integration import compatibility

    # Add current directory to path for imports
    current_dir = os.path.dirname(os.path.abspath(__file__))
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)

    # Import the Dragonfly-specific test case classes
    with open(os.path.join(current_dir, "valkey_search_test_case_dragonfly.py")) as f:
        exec(f.read())

    # Create a mock module for valkey_search_test_case
    mock_module = types.ModuleType("valkey_search_test_case")
    mock_module.ValkeySearchTestCaseBase = ValkeySearchTestCaseBase
    mock_module.ValkeySearchTestCaseDebugMode = ValkeySearchTestCaseDebugMode
    mock_module.ValkeySearchClusterTestCase = ValkeySearchClusterTestCase
    mock_module.ValkeySearchClusterTestCaseDebugMode = ValkeySearchClusterTestCaseDebugMode
    mock_module.Node = Node
    mock_module.ReplicationGroup = ReplicationGroup

    # Replace the module in sys.modules
    sys.modules["valkey_search_test_case"] = mock_module

    # Also need to provide valkeytestframework modules
    valkey_test_framework = types.ModuleType("valkeytestframework")

    valkey_test_case = types.ModuleType("valkeytestframework.valkey_test_case")
    valkey_test_case.ValkeyTestCase = ValkeyTestCase
    valkey_test_case.ReplicationTestCase = ReplicationTestCase
    valkey_test_case.ValkeyServerHandle = ValkeyServerHandle

    util_module = types.ModuleType("valkeytestframework.util")
    waiters_module = types.ModuleType("valkeytestframework.util.waiters")

    waiters_module.wait_for_true = util.waiters.wait_for_true
    waiters_module.wait_for_equal = util.waiters.wait_for_equal
    waiters_module.wait_for_not_equal = util.waiters.wait_for_not_equal
    waiters_module.wait_for_condition = util.waiters.wait_for_condition
    util_module.waiters = waiters_module

    # Also add direct util module access
    sys.modules["util"] = util_module
    sys.modules["util.waiters"] = waiters_module

    conftest_module = types.ModuleType("valkeytestframework.conftest")
    conftest_module.resource_port_tracker = types.ModuleType("resource_port_tracker")

    # Setup compatibility as a module in sys.modules
    sys.modules["compatibility"] = compatibility

    # Also set up the submodules
    if hasattr(compatibility, "data_sets"):
        sys.modules["compatibility.data_sets"] = compatibility.data_sets

    # Add all modules to sys.modules
    sys.modules["valkeytestframework"] = valkey_test_framework
    sys.modules["valkeytestframework.valkey_test_case"] = valkey_test_case
    sys.modules["valkeytestframework.util"] = util_module
    sys.modules["valkeytestframework.util.waiters"] = waiters_module
    sys.modules["valkeytestframework.conftest"] = conftest_module


================================================
FILE: tests/dragonfly/valkey_search/conftest.py
================================================
"""
Pytest configuration for valkey-search tests on Dragonfly
"""

import os
import pytest
from .. import dfly_args

# Check if integration directory exists
_integration_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "integration")
INTEGRATION_AVAILABLE = os.path.isdir(_integration_dir)


def pytest_ignore_collect(collection_path, config):
    """Skip collection of integration tests if directory is not synced."""
    if not INTEGRATION_AVAILABLE and "integration" in str(collection_path):
        return True
    return None


# List of tests to skip - add test node IDs here
# Example format: "integration/test_file.py::TestClass::test_method"
SKIP_TESTS = [
    "integration/compatibility_test.py::TestAnswersCMD::test_answers",
    "integration/test_cancel.py::TestCancelCMD::test_timeoutCMD",
    "integration/test_cancel.py::TestCancelCME::test_timeoutCME",
    "integration/test_eviction.py::TestEviction::test_eviction_with_search_index",
    "integration/test_fanout_base.py::TestFanoutBase::test_fanout_retry",
    "integration/test_fanout_base.py::TestFanoutBase::test_fanout_shutdown",
    "integration/test_fanout_base.py::TestFanoutBase::test_fanout_timeout",
    "integration/test_flushall.py::TestFlushAllCME::test_flushallCME",
    "integration/test_ft_create_consistency.py::TestFTCreateConsistency::test_create_force_index_name_error_retry",
    "integration/test_ft_create_consistency.py::TestFTCreateConsistency::test_duplicate_creation",
    "integration/test_ft_create_consistency.py::TestFTCreateConsistency::test_concurrent_creation",
    "integration/test_ft_create_consistency.py::TestFTCreateConsistency::test_create_timeout",
    "integration/test_ft_dropindex_consistency.py::TestFTDropindexConsistency::test_dropindex_synchronize_handle_message_first",
    "integration/test_ft_dropindex_consistency.py::TestFTDropindexConsistency::test_dropindex_synchronize_consistency_check_first",
    "integration/test_info.py::TestVSSBasic::test_info_fields_present",
    "integration/test_info_cluster.py::TestFTInfoCluster::test_ft_info_cluster_success",
    "integration/test_info_cluster.py::TestFTInfoCluster::test_ft_info_cluster_force_index_name_error_retry",
    "integration/test_info_cluster.py::TestFTInfoCluster::test_ft_info_cluster_retry",
    "integration/test_info_primary.py::TestFTInfoPrimary::test_ft_info_primary_success",
    "integration/test_info_primary.py::TestFTInfoPrimary::test_ft_info_primary_force_index_name_error_retry",
    "integration/test_info_primary.py::TestFTInfoPrimary::test_ft_info_primary_retry",
    "integration/test_oom_handling.py::TestSearchOOMHandlingCME::test_search_oom_cme",
    "integration/test_oom_handling.py::TestSearchOOMHandlingCMD::test_search_oom_cmd",
    "integration/test_query_parser.py::TestQueryParser::test_query_string_depth_limit",
    "integration/test_query_parser.py::TestQueryParser::test_query_string_terms_count_limit",
    "integration/test_reclaimable_memory.py::TestReclaimableMemory::test_reclaimable_memory_with_vector_operations",
    "integration/test_reclaimable_memory.py::TestReclaimableMemory::test_reclaimable_memory_multiple_indexes",
    "integration/test_skip_index_load.py::TestRDBCorruptedIndex::test_corrupted_rdb_skip_index_load_succeeds",
    "integration/test_valkey_search_acl.py::TestCommandsACLs::test_acl_specific_search_commands_permissions",
    "integration/test_valkey_search_acl.py::TestCommandsACLs::test_index_with_several_prefixes_permissions",
    "integration/test_valkey_search_acl.py::TestCommandsACLs::test_valkey_search_cmds_categories",
]


# Apply dfly_args to all test classes in this directory
def pytest_collection_modifyitems(items):
    """Apply dfly_args decorator to all test classes and skip marked tests"""
    for item in items:
        if item.cls and not hasattr(item.cls, "_dfly_args_applied"):
            # Apply the decorator to the class
            decorated_class = dfly_args({"proactor_threads": 4})(item.cls)
            item.cls._dfly_args_applied = True

        # Skip tests that are in the skip list
        # Get the relative path from valkey_search directory
        item_path = str(item.nodeid)
        for skip_pattern in SKIP_TESTS:
            if skip_pattern in item_path:
                item.add_marker(pytest.mark.skip(reason=f"Test skipped: {skip_pattern}"))


================================================
FILE: tests/dragonfly/valkey_search/sync-valkey-search-tests.sh
================================================
#!/bin/bash
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INTEGRATION_DIR="$SCRIPT_DIR/integration"
VALKEY_SEARCH_REPO="https://github.com/valkey-io/valkey-search.git"
TEMP_DIR=$(mktemp -d)

# Accept optional tag/revision parameter
TAG_OR_REV="${1:-}"

if [ -n "$TAG_OR_REV" ]; then
  echo "Syncing valkey-search tests from tag/revision: $TAG_OR_REV"
else
  echo "Syncing valkey-search tests from latest commit..."
fi

# Remove old integration directory
rm -rf "$INTEGRATION_DIR"

# Clone to temp directory
if [ -n "$TAG_OR_REV" ]; then
  # Clone with full history and checkout specific tag/revision
  git clone "$VALKEY_SEARCH_REPO" "$TEMP_DIR" >/dev/null 2>&1
  pushd "$TEMP_DIR" >/dev/null
  git checkout "$TAG_OR_REV" >/dev/null 2>&1
  popd >/dev/null
else
  # Clone only the latest commit (shallow clone)
  git clone --depth=1 "$VALKEY_SEARCH_REPO" "$TEMP_DIR" >/dev/null 2>&1
fi

# Copy integration directory
cp -r "$TEMP_DIR/integration" "$INTEGRATION_DIR"

# Patch all Python files for Python 3.8 compatibility
# Add 'from __future__ import annotations' to support modern type hints
echo "Patching Python files for Python 3.8 compatibility..."
find "$INTEGRATION_DIR" -name "*.py" -type f | while read -r file; do
  # Check if the file doesn't already have 'from __future__ import annotations'
  if ! grep -q "from __future__ import annotations" "$file"; then
    sed -i '1i from __future__ import annotations' "$file"
  fi
done

# Cleanup
rm -rf "$TEMP_DIR"

echo "Done. Synced $(find "$INTEGRATION_DIR" -name '*test*.py' | wc -l) test files."


================================================
FILE: tests/dragonfly/valkey_search/util.py
================================================
"""
Utility module for valkey-search tests running on Dragonfly
Provides waiters functionality compatible with valkeytestframework.util.waiters
"""

import time


class waiters:
    """Waiters utility class for test synchronization"""

    @staticmethod
    def wait_for_true(func, timeout=30, interval=0.1):
        """
        Wait for a function to return True

        Args:
            func: Function to call repeatedly until it returns True
            timeout: Maximum time to wait in seconds (default: 30)
            interval: Time between checks in seconds (default: 0.1)

        Returns:
            True if function returned True within timeout, False otherwise
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                if func():
                    return True
            except Exception:
                # Ignore exceptions during polling
                pass
            time.sleep(interval)
        return False

    @staticmethod
    def wait_for_equal(func, value, timeout=30, interval=0.1):
        """
        Wait for a function to return a specific value

        Args:
            func: Function to call repeatedly
            value: Expected return value
            timeout: Maximum time to wait in seconds (default: 30)
            interval: Time between checks in seconds (default: 0.1)

        Returns:
            True if function returned expected value within timeout, False otherwise
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                if func() == value:
                    return True
            except Exception:
                # Ignore exceptions during polling
                pass
            time.sleep(interval)
        return False

    @staticmethod
    def wait_for_not_equal(func, value, timeout=30, interval=0.1):
        """
        Wait for a function to return a value different from the specified one

        Args:
            func: Function to call repeatedly
            value: Value that should NOT be returned
            timeout: Maximum time to wait in seconds (default: 30)
            interval: Time between checks in seconds (default: 0.1)

        Returns:
            True if function returned different value within timeout, False otherwise
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                if func() != value:
                    return True
            except Exception:
                # Ignore exceptions during polling
                pass
            time.sleep(interval)
        return False

    @staticmethod
    def wait_for_condition(condition_func, timeout=30, interval=0.1):
        """
        Wait for a condition function to return True
        Alias for wait_for_true for compatibility
        """
        return waiters.wait_for_true(condition_func, timeout, interval)


# For backward compatibility with direct import style
wait_for_true = waiters.wait_for_true
wait_for_equal = waiters.wait_for_equal
wait_for_not_equal = waiters.wait_for_not_equal
wait_for_condition = waiters.wait_for_condition


================================================
FILE: tests/dragonfly/valkey_search/valkey_search_test_case_dragonfly.py
================================================
"""
Dragonfly adapter for valkey_search_test_case.py
Creates real Dragonfly instances with replicas and clusters
"""

import os
import time
import pytest
import valkey
from valkey import ResponseError
from valkey.client import Valkey
from typing import List
import random
import string
import logging

# Import Dragonfly test infrastructure
from ..instance import DflyInstance, DflyInstanceFactory

LOGS_DIR = "/tmp/dragonfly-valkey-test-logs"

if "LOGS_DIR" in os.environ:
    LOGS_DIR = os.environ["LOGS_DIR"]


class Node:
    """This class represents a Dragonfly instance as a valkey server node"""

    def __init__(
        self,
        client=None,
        server=None,
        logfile=None,
        df_instance=None,
    ):
        self.client: Valkey = client
        self.server = server
        self.logfile: str = logfile
        self.df_instance: DflyInstance = df_instance

    def does_logfile_contains(self, pattern: str) -> bool:
        # For Dragonfly, simplified log checking
        return True


class ValkeyServerHandle:
    """Adapter for Dragonfly instance to look like ValkeyServerHandle"""

    def __init__(self, df_instance: DflyInstance):
        self.df_instance = df_instance
        self.bind_ip = "127.0.0.1"
        self.port = df_instance.port if df_instance else 6379

    def pid(self):
        return self.df_instance.proc.pid if self.df_instance and self.df_instance.proc else None

    def get_new_client(self):
        return valkey.Valkey(host=self.bind_ip, port=self.port, decode_responses=False)


class ReplicationGroup:
    """Replication group for Dragonfly"""

    def __init__(
        self,
        primary,
        replicas=None,
    ):
        self.primary: Node = primary
        self.replicas: List[Node] = replicas or []
        self._setup_done = False

    def setup_replications_cluster(self):
        # For cluster mode - not needed for single master/replica
        pass

    def setup_replications_cmd(self):
        """Setup replication using REPLICAOF command"""
        if self._setup_done or not self.replicas:
            return

        primary_ip = "localhost"
        primary_port = self.primary.df_instance.port

        # Configure each replica
        for replica in self.replicas:
            try:
                # Use REPLICAOF to setup replication
                result = replica.client.execute_command(f"REPLICAOF {primary_ip} {primary_port}")
                logging.debug(f"Setup replica on port {replica.df_instance.port}: {result}")
            except Exception as e:
                logging.error(f"Failed to setup replica: {e}")

        self._setup_done = True
        self._wait_for_replication()

    def _wait_for_replication(self):
        """Wait for replicas to sync"""
        # Give replicas time to connect
        time.sleep(0.5)

        # Check if replicas are connected
        try:
            info = self.primary.client.info("replication")
            connected_slaves = info.get("connected_slaves", 0)
            logging.debug(f"Connected slaves: {connected_slaves}, expected: {len(self.replicas)}")
        except Exception as e:
            logging.debug(f"Could not check replication status: {e}")

    def _check_all_replicas_are_connected(self):
        try:
            return self.primary.client.info("replication")["connected_slaves"] == len(self.replicas)
        except:
            return False

    def _check_is_replica_online(self, name) -> bool:
        try:
            replica_status = self.primary.client.info("replication")[name]
            return replica_status["state"] == "online"
        except:
            return False  # Assume offline if we can't check

    def get_replica_connection(self, index) -> Valkey:
        if index < len(self.replicas):
            return self.replicas[index].client
        raise IndexError(f"No replica at index {index}")

    def get_primary_connection(self) -> Valkey:
        return self.primary.client

    @staticmethod
    def cleanup(rg):
        """Cleanup Dragonfly instances"""
        # Cleanup is handled by Dragonfly fixtures
        pass


class ValkeySearchTestCaseCommon:
    """Common base class for tests"""

    pass


class ValkeyTestCase(ValkeySearchTestCaseCommon):
    """Base test case class"""

    pass


class ReplicationTestCase(ValkeyTestCase):
    """Replication test case"""

    pass


class ValkeySearchTestCaseBase(ValkeySearchTestCaseCommon):
    """Base test case for valkey-search tests running on Dragonfly"""

    @pytest.fixture(autouse=True)
    def setup_test(self, request, df_factory: DflyInstanceFactory):
        """Setup test with Dragonfly instances"""
        # Get replica count from parametrize if provided
        replica_count = 0
        if hasattr(request, "param") and "replica_count" in request.param:
            replica_count = request.param["replica_count"]

        # Create primary instance
        primary_df = df_factory.create(proactor_threads=4)
        primary_df.start()

        primary_client = valkey.Valkey(
            host="127.0.0.1", port=primary_df.port, decode_responses=False
        )

        primary_server = ValkeyServerHandle(primary_df)
        primary_node = Node(
            client=primary_client, server=primary_server, logfile=None, df_instance=primary_df
        )

        # Create replica instances
        replicas: List[Node] = []
        for i in range(replica_count):
            replica_df = df_factory.create(proactor_threads=4)
            replica_df.start()

            replica_client = valkey.Valkey(
                host="127.0.0.1", port=replica_df.port, decode_responses=False
            )

            replica_server = ValkeyServerHandle(replica_df)
            replica_node = Node(
                client=replica_client, server=replica_server, logfile=None, df_instance=replica_df
            )
            replicas.append(replica_node)

        # Setup replication group
        self.rg = ReplicationGroup(primary=primary_node, replicas=replicas)

        # Configure replication
        if replica_count > 0:
            self.rg.setup_replications_cmd()

        self.server = self.rg.primary.server
        self.client = self.rg.primary.client
        self.nodes: List[Node] = [self.rg.primary] + self.rg.replicas

        yield

        # Cleanup is handled by df_factory

    def verify_error_response(self, client, cmd, expected_err_reply):
        try:
            if isinstance(cmd, str):
                cmd_args = cmd.split()
            else:
                cmd_args = cmd
            client.execute_command(*cmd_args)
            assert False, f"Expected error '{expected_err_reply}' but command succeeded"
        except ResponseError as e:
            error_str = str(e)
            assert (
                expected_err_reply in error_str
            ), f"Actual error message: '{error_str}' doesn't contain expected: '{expected_err_reply}'"
            return error_str

    def verify_server_key_count(self, client, expected_num_keys):
        actual_num_keys = client.dbsize()
        assert (
            actual_num_keys == expected_num_keys
        ), f"Actual key number {actual_num_keys} is different from expected key number {expected_num_keys}"

    def generate_random_string(self, length=7):
        """Creates a random string with specified length."""
        characters = string.ascii_letters + string.digits
        random_string = "".join(random.choice(characters) for _ in range(length))
        return random_string

    def parse_valkey_info(self, section):
        mem_info = self.client.execute_command("INFO " + section)
        if isinstance(mem_info, bytes):
            mem_info = mem_info.decode("utf-8")
        lines = mem_info.split("\\r\\n")
        stats_dict = {}
        for line in lines:
            if ":" in line:
                key, value = line.split(":", 1)
                stats_dict[key.strip()] = value.strip()
        return stats_dict

    def start_new_server(self, is_primary=True) -> Node:
        """Return existing or create new server"""
        if is_primary:
            return self.rg.primary
        elif self.rg.replicas:
            return self.rg.replicas[0]
        else:
            # No replicas configured
            return self.rg.primary

    def get_replica_connection(self, index) -> Valkey:
        return self.rg.get_replica_connection(index)

    def get_primary_connection(self) -> Valkey:
        return self.rg.get_primary_connection()


class ValkeySearchTestCaseDebugMode(ValkeySearchTestCaseBase):
    """Debug mode variant"""

    pass


class ValkeySearchClusterTestCase(ValkeySearchTestCaseCommon):
    """Cluster test case - simplified for single Dragonfly instance"""

    CLUSTER_SIZE = 1  # Simplified to single node
    REPLICAS_COUNT = 0

    @pytest.fixture(autouse=True)
    def setup_test(self, request, df_factory: DflyInstanceFactory):
        """Setup cluster test with Dragonfly instances"""

        # Get replica count from parametrize if provided
        replica_count = 0
        if hasattr(request, "param") and "replica_count" in request.param:
            replica_count = request.param["replica_count"]

        # Create primary instance
        primary_df = df_factory.create(proactor_threads=4)
        primary_df.start()

        primary_client = valkey.Valkey(
            host="127.0.0.1", port=primary_df.port, decode_responses=False
        )

        primary_server = ValkeyServerHandle(primary_df)
        primary_node = Node(
            client=primary_client, server=primary_server, logfile=None, df_instance=primary_df
        )

        # Create replica instances
        replicas: List[Node] = []
        for i in range(replica_count):
            replica_df = df_factory.create(proactor_threads=4)
            replica_df.start()

            replica_client = valkey.Valkey(
                host="127.0.0.1", port=replica_df.port, decode_responses=False
            )

            replica_server = ValkeyServerHandle(replica_df)
            replica_node = Node(
                client=replica_client, server=replica_server, logfile=None, df_instance=replica_df
            )
            replicas.append(replica_node)

        rg = ReplicationGroup(primary=primary_node, replicas=replicas)

        # Configure replication
        if replica_count > 0:
            rg.setup_replications_cmd()

        self.replication_groups = [rg]
        self.nodes: List[Node] = [rg.primary] + rg.replicas

        yield

        # Cleanup handled by df_factory

    def get_primary(self, index):
        return self.replication_groups[index].primary.server

    def get_primary_port(self, index):
        return self.replication_groups[index].primary.server.port

    def new_client_for_primary(self, index):
        return self.replication_groups[index].primary.server.get_new_client()

    def client_for_primary(self, index):
        return self.replication_groups[index].primary.client

    def get_all_primary_clients(self) -> List[Valkey]:
        return [rg.primary.client for rg in self.replication_groups]

    def get_replication_group(self, index):
        return self.replication_groups[index]

    def new_cluster_client(self):
        """Return regular client for single-node"""
        return self.replication_groups[0].primary.client


class ValkeySearchClusterTestCaseDebugMode(ValkeySearchClusterTestCase):
    """Debug mode cluster variant"""

    pass


================================================
FILE: tests/fakeredis/README.md
================================================
Running FakeRedis tests on Dragonfly
====================================

FakeRedis is a Python library that provides a full implementation of the Redis protocol. It is useful for testing Redis
clients and for running Redis commands in Python code without having a running Redis server.

The tests in this directory are running against FakeRedis and against a dragonfly instance.
The results are then compared to ensure that the two implementations are consistent.

## Prerequisites

- Python 3.10 or above is required to run the tests.
- Poetry is required to install the dependencies.
- A dragonfly instance running on port 6380.

## Setup environment

1. Install Poetry by following the instructions at https://python-poetry.org/docs/#installation.
2. From the root directory of the tests (`dragonfly/tests/fakeredis`) run `poetry env use python3.10` (or higher) to
   create a virtual environment for Python 3.10.
3. Run `poetry install` to install the dependencies.
4. Run `poetry run pytest -v` to run all the tests.
5. Or alternatively, run `poetry run pytest -v test/{test-name}` to run a specific set of tests.

## Tests

- `test_connection.py`: Tests for the connection parameters to the Dragonfly server.
- `test_zadd.py`: Considering the various options for the ZADD command, it has its own set of tests.
- `test_json/*.py`: Tests for the JSON commands.
- `test_stack/*.py`: Tests for the stack commands, bloom filter, cuckoo filter, CMS, TDigest, time-series, top-k.
- `test_mixins/*.py`: Tests for various generic commands: bitmap, geospacial, hash, list, pubsub, scripting, streams,
  string, etc.
- `test_hypothesis.py`: Hypothesis tests for the mixins commands. These tests are using [hypothesis][1] and generate
  random tests with edge cases. Note these tests take significantly more time to run.

## General info

- `@pytest.mark.unsupported_server_types("dragonfly")` decorator indicates to pytest that the test should not run on
  dragonfly.
  - Some tests are skipped the commands are CURRENTLY not supported (e.g., `GEORARIUS`).
  - Others are skipped because they cause an expected behavior, and usually marked with TODO comment as well.

[1]: https://hypothesis.readthedocs.io/en/latest/


================================================
FILE: tests/fakeredis/pyproject.toml
================================================
[build-system]
requires = ["poetry_core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "dragonfly-fakeredis-tests"
packages = [
    { include = "test" },
]
version = "0.1.0"
description = "Tests running against dragonfly and fakeredis to check compatibility"
authors = [
    "Daniel Moran <daniel@moransoftware.ca>",
]
maintainers = [
    "Daniel Moran <daniel@moransoftware.ca>",
]

[tool.poetry.dependencies]
python = "^3.10"
redis = ">=5"
fakeredis = { version = "^2.26.1", extras = ["json", "bf", "cf", "lua"] }
hypothesis = "^6.111"
pytest = "^8.3"
pytest-timeout = "^2.3.1"
pytest-asyncio = "^0.24"
pytest-cov = "^5.0"
pytest-mock = "^3.14"
pytest-html = "^4.1"

[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "fake: run tests only with fake redis",
    "real: run tests with a locally running real Redis server",
    "disconnected",
    "min_server",
    "max_server",
    "decode_responses",
    "unsupported_server_types",
]
asyncio_mode = "strict"
generate_report_on_test = true
render_collapsed = "failed,error"
addopts = [
    "--self-contained-html",
    "--import-mode=importlib",
]


================================================
FILE: tests/fakeredis/test/__init__.py
================================================


================================================
FILE: tests/fakeredis/test/conftest.py
================================================
from typing import Callable, Tuple, Union, Optional

import fakeredis
import pytest
import pytest_asyncio
import redis
from fakeredis._server import _create_version


def _check_lua_module_supported() -> bool:
    redis = fakeredis.FakeRedis(lua_modules={"cjson"})
    try:
        redis.eval("return cjson.encode({})", 0)
        return True
    except Exception:
        return False


@pytest_asyncio.fixture(scope="session")
def real_redis_version() -> Tuple[str, Union[None, Tuple[int, ...]]]:
    """Returns server's version or None if server is not running"""
    client = None
    try:
        client = redis.StrictRedis("localhost", port=6380, db=2)
        client_info = client.info()
        server_type = "dragonfly" if "dragonfly_version" in client_info else "redis"
        server_version = (
            client_info["redis_version"] if server_type != "dragonfly" else (7, 0)
        )
        server_version = _create_version(server_version) or (7,)
        return server_type, server_version
    except redis.ConnectionError:
        pytest.exit("Redis is not running")
        return "redis", (6,)
    finally:
        if hasattr(client, "close"):
            client.close()  # Absent in older versions of redis-py


@pytest_asyncio.fixture(name="fake_server")
def _fake_server(request) -> fakeredis.FakeServer:
    min_server_marker = request.node.get_closest_marker("min_server")
    server_version = min_server_marker.args[0] if min_server_marker else "7"
    server = fakeredis.FakeServer(version=server_version)
    server.connected = request.node.get_closest_marker("disconnected") is None
    return server


@pytest_asyncio.fixture
def r(request, create_redis) -> redis.Redis:
    rconn = create_redis(db=2)
    connected = request.node.get_closest_marker("disconnected") is None
    if connected:
        rconn.flushall()
    yield rconn
    if connected:
        rconn.flushall()
    if hasattr(r, "close"):
        rconn.close()  # Older versions of redis-py don't have this method


def _marker_version_value(request, marker_name: str):
    marker_value = request.node.get_closest_marker(marker_name)
    if marker_value is None:
        return (0,) if marker_name == "min_server" else (100,)
    return _create_version(marker_value.args[0])


@pytest_asyncio.fixture(
    name="create_redis",
    params=[
        pytest.param("StrictRedis", marks=pytest.mark.real),
        pytest.param("FakeStrictRedis", marks=pytest.mark.fake),
    ],
)
def _create_redis(request) -> Callable[[int], redis.Redis]:
    cls_name = request.param
    server_type, server_version = request.getfixturevalue("real_redis_version")
    if not cls_name.startswith("Fake") and not server_version:
        pytest.skip("Redis is not running")
    unsupported_server_types = request.node.get_closest_marker(
        "unsupported_server_types"
    )
    if unsupported_server_types and server_type in unsupported_server_types.args:
        pytest.skip(f"Server type {server_type} is not supported")
    min_server = _marker_version_value(request, "min_server")
    max_server = _marker_version_value(request, "max_server")
    if server_version < min_server:
        pytest.skip(
            f"Redis server {min_server} or more required but {server_version} found"
        )
    if server_version > max_server:
        pytest.skip(
            f"Redis server {max_server} or less required but {server_version} found"
        )
    decode_responses = request.node.get_closest_marker("decode_responses") is not None
    lua_modules_marker = request.node.get_closest_marker("load_lua_modules")
    lua_modules = set(lua_modules_marker.args) if lua_modules_marker else None
    if lua_modules and not _check_lua_module_supported():
        pytest.skip("LUA modules not supported by fakeredis")

    def factory(db=2):
        if cls_name.startswith("Fake"):
            fake_server = request.getfixturevalue("fake_server")
            cls = getattr(fakeredis, cls_name)
            return cls(
                db=db,
                decode_responses=decode_responses,
                server=fake_server,
                lua_modules=lua_modules,
            )
        # Real
        cls = getattr(redis, cls_name)
        return cls("localhost", port=6380, db=db, decode_responses=decode_responses)

    return factory


@pytest_asyncio.fixture(
    name="async_redis",
    params=[
        pytest.param("fake", marks=pytest.mark.fake),
        pytest.param("real", marks=pytest.mark.real),
    ],
)
async def _req_aioredis2(request) -> redis.asyncio.Redis:
    server_type, server_version = request.getfixturevalue("real_redis_version")
    if request.param != "fake" and not server_version:
        pytest.skip("Redis is not running")
    unsupported_server_types = request.node.get_closest_marker(
        "unsupported_server_types"
    )
    if unsupported_server_types and server_type in unsupported_server_types.args:
        pytest.skip(f"Server type {server_type} is not supported")
    min_server_marker = _marker_version_value(request, "min_server")
    max_server_marker = _marker_version_value(request, "max_server")
    if server_version < min_server_marker:
        pytest.skip(
            f"Redis server {min_server_marker} or more required but {server_version} found"
        )
    if server_version > max_server_marker:
        pytest.skip(
            f"Redis server {max_server_marker} or less required but {server_version} found"
        )
    lua_modules_marker = request.node.get_closest_marker("load_lua_modules")
    lua_modules = set(lua_modules_marker.args) if lua_modules_marker else None
    if lua_modules and not _check_lua_module_supported():
        pytest.skip("LUA modules not supported by fakeredis")
    fake_server: Optional[fakeredis.FakeServer]
    if request.param == "fake":
        fake_server = request.getfixturevalue("fake_server")
        ret = fakeredis.FakeAsyncRedis(server=fake_server, lua_modules=lua_modules)
    else:
        ret = redis.asyncio.Redis(host="localhost", port=6380, db=2)
        fake_server = None
    if not fake_server or fake_server.connected:
        await ret.flushall()

    yield ret

    if not fake_server or fake_server.connected:
        await ret.flushall()
    await ret.connection_pool.disconnect()


================================================
FILE: tests/fakeredis/test/test_asyncredis.py
================================================
import asyncio
import sys


if sys.version_info >= (3, 11):
    from asyncio import timeout as async_timeout
else:
    from async_timeout import timeout as async_timeout
import pytest
import pytest_asyncio
import redis
import redis.asyncio

from fakeredis import FakeServer, aioredis
from test import testtools

pytestmark = []
pytestmark.extend(
    [
        pytest.mark.asyncio,
    ]
)


@pytest_asyncio.fixture
async def conn(async_redis: redis.asyncio.Redis):
    """A single connection, rather than a pool."""
    async with async_redis.client() as conn:
        yield conn


async def test_ping(async_redis: redis.asyncio.Redis):
    pong = await async_redis.ping()
    assert pong is True


async def test_types(async_redis: redis.asyncio.Redis):
    await async_redis.hset(
        "hash", mapping={"key1": "value1", "key2": "value2", "key3": 123}
    )
    result = await async_redis.hgetall("hash")
    assert result == {b"key1": b"value1", b"key2": b"value2", b"key3": b"123"}


async def test_transaction(async_redis: redis.asyncio.Redis):
    async with async_redis.pipeline(transaction=True) as tr:
        tr.set("key1", "value1")
        tr.set("key2", "value2")
        ok1, ok2 = await tr.execute()
    assert ok1
    assert ok2
    result = await async_redis.get("key1")
    assert result == b"value1"


async def test_transaction_fail(async_redis: redis.asyncio.Redis):
    await async_redis.set("foo", "1")
    async with async_redis.pipeline(transaction=True) as tr:
        await tr.watch("foo")
        await async_redis.set("foo", "2")  # Different connection
        tr.multi()
        tr.get("foo")
        with pytest.raises(redis.asyncio.WatchError):
            await tr.execute()


async def test_pubsub(async_redis, event_loop):
    queue = asyncio.Queue()

    async def reader(ps):
        while True:
            message = await ps.get_message(ignore_subscribe_messages=True, timeout=5)
            if message is not None:
                if message.get("data") == b"stop":
                    break
                queue.put_nowait(message)

    async with async_timeout(5), async_redis.pubsub() as ps:
        await ps.subscribe("channel")
        task = event_loop.create_task(reader(ps))
        await async_redis.publish("channel", "message1")
        await async_redis.publish("channel", "message2")
        result1 = await queue.get()
        result2 = await queue.get()
        assert result1 == {
            "channel": b"channel",
            "pattern": None,
            "type": "message",
            "data": b"message1",
        }
        assert result2 == {
            "channel": b"channel",
            "pattern": None,
            "type": "message",
            "data": b"message2",
        }
        await async_redis.publish("channel", "stop")
        await task


@pytest.mark.slow
async def test_pubsub_timeout(async_redis: redis.asyncio.Redis):
    async with async_redis.pubsub() as ps:
        await ps.subscribe("channel")
        await ps.get_message(timeout=0.5)  # Subscription message
        message = await ps.get_message(timeout=0.5)
        assert message is None


@pytest.mark.slow
async def test_pubsub_disconnect(async_redis: redis.asyncio.Redis):
    async with async_redis.pubsub() as ps:
        await ps.subscribe("channel")
        await ps.connection.disconnect()
        message = await ps.get_message(timeout=0.5)  # Subscription message
        assert message is not None
        message = await ps.get_message(timeout=0.5)
        assert message is None


async def test_blocking_ready(async_redis, conn):
    """Blocking command which does not need to block."""
    await async_redis.rpush("list", "x")
    result = await conn.blpop("list", timeout=1)
    assert result == (b"list", b"x")


@pytest.mark.slow
async def test_blocking_timeout(conn):
    """Blocking command that times out without completing."""
    result = await conn.blpop("missing", timeout=1)
    assert result is None


@pytest.mark.slow
async def test_blocking_unblock(async_redis, conn, event_loop):
    """Blocking command that gets unblocked after some time."""

    async def unblock():
        await asyncio.sleep(0.1)
        await async_redis.rpush("list", "y")

    task = event_loop.create_task(unblock())
    result = await conn.blpop("list", timeout=1)
    assert result == (b"list", b"y")
    await task


async def test_wrongtype_error(async_redis: redis.asyncio.Redis):
    await async_redis.set("foo", "bar")
    with pytest.raises(redis.asyncio.ResponseError, match="^WRONGTYPE"):
        await async_redis.rpush("foo", "baz")


async def test_syntax_error(async_redis: redis.asyncio.Redis):
    with pytest.raises(
        redis.asyncio.ResponseError,
        match="^wrong number of arguments for 'get' command$",
    ):
        await async_redis.execute_command("get")


@testtools.run_test_if_lupa
class TestScripts:
    async def test_no_script_error(self, async_redis: redis.asyncio.Redis):
        with pytest.raises(redis.exceptions.NoScriptError):
            await async_redis.evalsha("0123456789abcdef0123456789abcdef", 0)

    @pytest.mark.min_server("7")
    async def test_failed_script_error7(self, async_redis):
        await async_redis.set("foo", "bar")
        with pytest.raises(redis.asyncio.ResponseError):
            await async_redis.eval('return redis.call("ZCOUNT", KEYS[1])', 1, "foo")


async def test_type(async_redis: redis.asyncio.Redis):
    await async_redis.set("string_key", "value")
    await async_redis.lpush("list_key", "value")
    await async_redis.sadd("set_key", "value")
    await async_redis.zadd("zset_key", {"value": 1})
    await async_redis.hset("hset_key", "key", "value")

    assert b"string" == await async_redis.type("string_key")  # noqa: E721
    assert b"list" == await async_redis.type("list_key")  # noqa: E721
    assert b"set" == await async_redis.type("set_key")  # noqa: E721
    assert b"zset" == await async_redis.type("zset_key")  # noqa: E721
    assert b"hash" == await async_redis.type("hset_key")  # noqa: E721
    assert b"none" == await async_redis.type("none_key")  # noqa: E721


async def test_xdel(async_redis: redis.asyncio.Redis):
    stream = "stream"

    # deleting from an empty stream doesn't do anything
    assert await async_redis.xdel(stream, 1) == 0

    m1 = await async_redis.xadd(stream, {"foo": "bar"})
    m2 = await async_redis.xadd(stream, {"foo": "bar"})
    m3 = await async_redis.xadd(stream, {"foo": "bar"})

    # xdel returns the number of deleted elements
    assert await async_redis.xdel(stream, m1) == 1
    assert await async_redis.xdel(stream, m2, m3) == 2


async def test_connection_with_username_and_password():
    server = FakeServer()
    r = aioredis.FakeRedis(server=server, username="username", password="password")

    test_value = "this_is_a_test"
    await r.hset("test:key", "test_hash", test_value)
    result = await r.hget("test:key", "test_hash")
    assert result.decode() == test_value


@pytest.mark.asyncio
async def test_cause_fakeredis_bug(async_redis):
    if sys.version_info < (3, 11):
        return

    async def worker_task():
        assert await async_redis.rpush("list1", "list1_val") == 1  # 1
        assert await async_redis.blpop("list2") == (b"list2", b"list2_val")  # 4
        assert await async_redis.set("foo", "bar") is True  # 5

    async with asyncio.TaskGroup() as tg:
        tg.create_task(worker_task())
        assert await async_redis.blpop("list1") == (b"list1", b"list1_val")  # 2
        assert await async_redis.rpush("list2", "list2_val") == 1  # 3

    # await async_redis.get("foo")  # uncomment to make test pass
    assert await async_redis.get("foo") == b"bar"


================================================
FILE: tests/fakeredis/test/test_hypotesis_joint/__init__.py
================================================


================================================
FILE: tests/fakeredis/test/test_hypotesis_joint/test_joint.py
================================================
import hypothesis.strategies as st

from .. import test_hypothesis as tests
from ..test_hypothesis.base import BaseTest, common_commands, commands
from ..test_hypothesis.test_string import string_commands

bad_commands = (
    # redis-py splits the command on spaces, and hangs if that ends up being an empty list
    commands(
        st.text().filter(lambda x: bool(x.split())), st.lists(st.binary() | st.text())
    )
)


class TestJoint(BaseTest):
    create_command_strategy = (
        tests.TestString.create_command_strategy
        | tests.TestHash.create_command_strategy
        | tests.TestList.create_command_strategy
        | tests.TestSet.create_command_strategy
        | tests.TestZSet.create_command_strategy
    )
    command_strategy = (
        tests.TestServer.server_commands
        | tests.TestConnection.connection_commands
        | string_commands
        | tests.TestHash.hash_commands
        | tests.TestList.list_commands
        | tests.TestSet.set_commands
        | tests.TestZSet.zset_commands
        | common_commands
        | bad_commands
    )


================================================
FILE: tests/fakeredis/test/test_hypothesis/__init__.py
================================================
__all__ = [
    "TestConnection",
    "TestHash",
    "TestList",
    "TestServer",
    "TestSet",
    "TestString",
    "TestTransaction",
    "TestZSet",
]

from .test_connection import TestConnection
from .test_hash import TestHash
from .test_list import TestList
from .test_server import TestServer
from .test_set import TestSet
from .test_string import TestString
from .test_transaction import TestTransaction
from .test_zset import TestZSet


================================================
FILE: tests/fakeredis/test/test_hypothesis/_server_info.py
================================================
from typing import Tuple, Union

import pytest
import redis


def server_info() -> Tuple[str, Union[None, Tuple[int, ...]]]:
    """Returns server's version or None if server is not running"""
    client = None
    try:
        client = redis.Redis("localhost", port=6380, db=2)
        client_info = client.info()
        server_type = "dragonfly" if "dragonfly_version" in client_info else "redis"
        server_version = (7, 0)
        return server_type, server_version
    except redis.ConnectionError as e:
        print(e)
        pytest.exit("Redis is not running")
        return "redis", (6,)
    finally:
        if hasattr(client, "close"):
            client.close()  # Absent in older versions of redis-py


server_type, redis_ver = server_info()


================================================
FILE: tests/fakeredis/test/test_hypothesis/base.py
================================================
import functools
import math
import string
import sys
from typing import Any, List, Tuple, Type, Optional

import fakeredis
import hypothesis
import hypothesis.stateful
import hypothesis.strategies as st
import pytest
import redis
from hypothesis.stateful import rule, initialize, precondition
from hypothesis.strategies import SearchStrategy

from ._server_info import redis_ver

self_strategy = st.runner()

MAX_INT = 2_147_483_647
MIN_INT = -2_147_483_648


@st.composite
def sample_attr(draw, name):
    """Strategy for sampling a specific attribute from a state machine"""
    machine = draw(self_strategy)
    values = getattr(machine, name)
    position = draw(st.integers(min_value=0, max_value=len(values) - 1))
    return values[position]


keys = sample_attr("keys")
fields = sample_attr("fields")
values = sample_attr("values")
scores = sample_attr("scores")

eng_text = st.builds(
    lambda x: x.encode(), st.text(alphabet=string.ascii_letters, min_size=1)
)
ints = st.integers(min_value=MIN_INT, max_value=MAX_INT)
int_as_bytes = st.builds(lambda x: str(_default_normalize(x)).encode(), ints)
floats = st.floats(
    width=32, allow_nan=False, allow_subnormal=False, allow_infinity=False
)
float_as_bytes = st.builds(lambda x: repr(_default_normalize(x)).encode(), floats)
counts = st.integers(min_value=-3, max_value=3) | ints
# Redis has an integer overflow bug in swapdb, so we confine the numbers to
# a limited range (https://github.com/antirez/redis/issues/5737).
dbnums = st.integers(min_value=0, max_value=3) | st.integers(
    min_value=-1000, max_value=1000
)
# The filter is to work around https://github.com/antirez/redis/issues/5632
patterns = st.text(
    alphabet=st.sampled_from("[]^$*.?-azAZ\\\r\n\t")
) | st.binary().filter(lambda x: b"\0" not in x)

# Redis has integer overflow bugs in time computations, which is why we set a maximum.
expires_seconds = st.integers(min_value=5, max_value=1_000)
expires_ms = st.integers(min_value=5_000, max_value=50_000)


class WrappedException:
    """Wraps an exception for the purposes of comparison."""

    def __init__(self, exc):
        self.wrapped = exc

    def __str__(self):
        return str(self.wrapped)

    def __repr__(self):
        return "WrappedException({!r})".format(self.wrapped)

    def __eq__(self, other):
        if not isinstance(other, WrappedException):
            return NotImplemented
        if type(self.wrapped) != type(other.wrapped):  # noqa: E721
            return False
        return True
        # return self.wrapped.args == other.wrapped.args

    def __ne__(self, other):
        if not isinstance(other, WrappedException):
            return NotImplemented
        return not self == other


def _wrap_exceptions(obj):
    if isinstance(obj, list):
        return [_wrap_exceptions(item) for item in obj]
    elif isinstance(obj, Exception):
        return WrappedException(obj)
    else:
        return obj


def _sort_list(lst):
    if isinstance(lst, list):
        return sorted(lst)
    else:
        return lst


def _normalize_if_number(x):
    if isinstance(x, list):
        return [_normalize_if_number(i) for i in x]
    try:
        res = float(x)
        return x if math.isnan(res) else res
    except ValueError:
        return x


def _flatten(args):
    if isinstance(args, (list, tuple)):
        for arg in args:
            yield from _flatten(arg)
    elif args is not None:
        yield args


def _default_normalize(x: Any) -> Any:
    if redis_ver >= (7,) and (isinstance(x, float) or isinstance(x, int)):
        return 0 + x

    return x


class Command:
    def __init__(self, *args):
        args = list(_flatten(args))
        args = [_default_normalize(x) for x in args]
        self.args = tuple(args)

    def __repr__(self):
        parts = [repr(arg) for arg in self.args]
        return "Command({})".format(", ".join(parts))

    @staticmethod
    def encode(arg):
        encoder = redis.connection.Encoder("utf-8", "replace", False)
        return encoder.encode(arg)

    @property
    def normalize(self):
        command = self.encode(self.args[0]).lower() if self.args else None
        # Functions that return a list in arbitrary order
        unordered = {
            b"keys",
            b"sort",
            b"hgetall",
            b"hkeys",
            b"hvals",
            b"sdiff",
            b"sinter",
            b"sunion",
            b"smembers",
            b"hexpire",
        }
        if command in unordered:
            return _sort_list
        else:
            return _normalize_if_number

    @property
    def testable(self) -> bool:
        """Whether this command is suitable for a test.

        The fuzzer can create commands with behavior that is non-deterministic, not supported, or which hits redis bugs.
        """
        N = len(self.args)
        if N == 0:
            return False
        command = self.encode(self.args[0]).lower()
        if not command.split():
            return False
        if command == b"keys" and N == 2 and self.args[1] != b"*":
            return False
        # Redis will ignore a NULL character in some commands but not others,
        # e.g., it recognises EXEC\0 but not MULTI\00.
        # Rather than try to reproduce this quirky behavior, just skip these tests.
        if b"\0" in command:
            return False
        return True


def zero_or_more(*args) -> List[SearchStrategy]:
    return [st.none() | st.just(arg) for arg in args]


def commands(*args, **kwargs):
    return st.builds(functools.partial(Command, **kwargs), *args)


# # TODO: all expiry-related commands
common_commands = (
    commands(st.sampled_from(["del", "persist", "type", "unlink"]), keys)
    | commands(st.just("exists"), st.lists(keys))
    | commands(st.just("keys"), st.just("*"))
    # Disabled for now due to redis giving wrong answers
    # (https://github.com/antirez/redis/issues/5632)
    # | commands(st.just('keys'), patterns)
    | commands(st.just("move"), keys, dbnums)
    | commands(st.sampled_from(["rename", "renamenx"]), keys, keys)
    # TODO: find a better solution to sort instability than throwing
    #  away the sort entirely with normalize. This also prevents us
    #  using LIMIT.
    | commands(st.just("sort"), keys, *zero_or_more("asc", "desc", "alpha"))
)


@hypothesis.settings(max_examples=1000)
class CommonMachine(hypothesis.stateful.RuleBasedStateMachine):
    create_command_strategy = st.nothing()

    def __init__(self):
        super().__init__()
        try:
            self.real = redis.StrictRedis("localhost", port=6380, db=2)
            self.real.ping()
        except redis.ConnectionError:
            pytest.skip("redis is not running")
        if self.real.info("server").get("arch_bits") != 64:
            self.real.connection_pool.disconnect()
            pytest.skip("redis server is not 64-bit")
        self.fake = fakeredis.FakeStrictRedis(
            server=fakeredis.FakeServer(version=redis_ver), port=6380, db=2
        )
        # Disable the response parsing so that we can check the raw values returned
        self.fake.response_callbacks.clear()
        self.real.response_callbacks.clear()
        self.transaction_normalize = []
        self.keys = []
        self.fields = []
        self.values = []
        self.scores = []
        self.initialized_data = False
        try:
            self.real.execute_command("discard")
        except redis.ResponseError:
            pass
        self.real.flushall()

    def teardown(self) -> None:
        self.real.connection_pool.disconnect()
        self.fake.connection_pool.disconnect()
        super().teardown()

    @staticmethod
    def _evaluate(
        client: redis.Redis, command
    ) -> Tuple[Any, Optional[Type[Exception]]]:
        try:
            result = client.execute_command(*command.args)
            if result != "QUEUED":
                result = command.normalize(result)
            exc = None
        except Exception as e:
            result = exc = e
        return _wrap_exceptions(result), exc

    def _compare(self, command: Command) -> None:
        fake_result, fake_exc = self._evaluate(self.fake, command)
        real_result, real_exc = self._evaluate(self.real, command)

        if fake_exc is not None and real_exc is None:
            print(
                f"{fake_exc} raised on only on fake when running {command}",
                file=sys.stderr,
            )
            raise fake_exc
        elif real_exc is not None and fake_exc is None:
            assert real_exc == fake_exc, f"Expected exception {real_exc} not raised"
        elif (
            real_exc is None
            and isinstance(real_result, list)
            and command.args
            and command.args[0].lower() == "exec"
        ):
            assert fake_result is not None
            # Transactions need to use the normalize functions of the component commands.
            assert len(self.transaction_normalize) == len(real_result)
            assert len(self.transaction_normalize) == len(fake_result)
            for n, r, f in zip(self.transaction_normalize, real_result, fake_result):
                assert n(f) == n(r)
            self.transaction_normalize = []
        elif isinstance(fake_result, list):
            assert len(fake_result) == len(real_result), (
                f"Discrepancy when running command {command}, fake({fake_result}) != real({real_result})",
            )
            for i in range(len(fake_result)):
                assert fake_result[i] == real_result[i] or (
                    type(fake_result[i]) is float
                    and fake_result[i] == pytest.approx(real_result[i])
                ), f"Discrepancy when running command {command}, fake({fake_result}) != real({real_result})"

        else:
            assert fake_result == real_result or (
                type(fake_result) is float and fake_result == pytest.approx(real_result)
            ), f"Discrepancy when running command {command}, fake({fake_result}) != real({real_result})"
            if real_result == b"QUEUED":
                # Since redis removes the distinction between simple strings and
                # bulk strings, this might not actually indicate that we're in a
                # transaction. But it is extremely unlikely that hypothesis will
                # find such examples.
                self.transaction_normalize.append(command.normalize)
        if len(command.args) == 1 and Command.encode(command.args[0]).lower() in (
            b"discard",
            b"exec",
        ):
            self.transaction_normalize = []

    @initialize(
        attrs=st.fixed_dictionaries(
            dict(
                keys=st.lists(eng_text, min_size=2, max_size=5, unique=True),
                fields=st.lists(eng_text, min_size=2, max_size=5, unique=True),
                values=st.lists(
                    eng_text | int_as_bytes | float_as_bytes,
                    min_size=2,
                    max_size=5,
                    unique=True,
                ),
                scores=st.lists(
                    floats,
                    min_size=2,
                    max_size=5,
                    unique=True,
                ),
            )
        )
    )
    def init_attrs(self, attrs):
        for key, value in attrs.items():
            setattr(self, key, value)

    # hypothesis doesn't allow ordering of @initialize, so we have to put
    # preconditions on rules to ensure we call init_data exactly once and
    # after init_attrs.
    @precondition(lambda self: not self.initialized_data)
    @rule(
        commands=self_strategy.flatmap(
            lambda self: st.lists(self.create_command_strategy)
        )
    )
    def init_data(self, commands) -> None:
        for command in commands:
            self._compare(command)
        self.initialized_data = True

    @precondition(lambda self: self.initialized_data)
    @rule(command=self_strategy.flatmap(lambda self: self.command_strategy))
    def one_command(self, command: Command) -> None:
        self._compare(command)


class BaseTest:
    """Base class for test classes."""

    command_strategy: SearchStrategy
    create_command_strategy = st.nothing()

    @pytest.mark.slow
    def test(self):
        class Machine(CommonMachine):
            create_command_strategy = self.create_command_strategy
            command_strategy = self.command_strategy

        # hypothesis.settings.register_profile(
        #     "debug", max_examples=10, verbosity=hypothesis.Verbosity.debug
        # )
        hypothesis.settings.register_profile(
            "debug", verbosity=hypothesis.Verbosity.debug
        )
        hypothesis.settings.load_profile("debug")
        hypothesis.stateful.run_state_machine_as_test(Machine)


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_connection.py
================================================
import hypothesis.strategies as st

from .base import BaseTest, commands, values, common_commands


class TestConnection(BaseTest):
    # TODO: tests for select
    connection_commands = (
        commands(st.just("echo"), values)
        | commands(st.just("ping"), st.lists(values, max_size=2))
        # | commands(st.just("swapdb"), dbnums, dbnums)
    )
    command_strategy = connection_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_hash.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    values,
    keys,
    common_commands,
    fields,
    ints,
    expires_seconds,
)


class TestHash(BaseTest):
    hash_commands = (
        commands(st.just("hset"), keys, st.lists(st.tuples(fields, values)))
        | commands(st.just("hdel"), keys, st.lists(fields))
        | commands(st.just("hexists"), keys, fields)
        | commands(st.just("hget"), keys, fields)
        | commands(st.sampled_from(["hgetall", "hkeys", "hvals"]), keys)
        | commands(st.just("hincrby"), keys, fields, ints)
        | commands(st.just("hlen"), keys)
        | commands(st.just("hmget"), keys, st.lists(fields))
        | commands(st.just("hset"), keys, st.lists(st.tuples(fields, values)))
        | commands(st.just("hsetnx"), keys, fields, values)
        | commands(st.just("hstrlen"), keys, fields)
        | commands(
            st.just("hpersist"),
            st.just("fields"),
            st.just(2),
            st.lists(fields, min_size=2, max_size=2),
        )
        | commands(
            st.just("hexpire"),
            keys,
            expires_seconds,
            st.just("fields"),
            st.just(2),
            st.lists(fields, min_size=2, max_size=2, unique=True),
        )
    )
    create_command_strategy = commands(
        st.just("hset"), keys, st.lists(st.tuples(fields, values), min_size=1)
    )
    command_strategy = hash_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_list.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    values,
    keys,
    common_commands,
    counts,
    ints,
)


class TestList(BaseTest):
    # TODO: blocking commands
    list_commands = (
        commands(st.just("lindex"), keys, counts)
        | commands(
            st.just("linsert"),
            keys,
            st.sampled_from(["before", "after", "BEFORE", "AFTER"]) | st.binary(),
            values,
            values,
        )
        | commands(st.just("llen"), keys)
        | commands(
            st.sampled_from(["lpop", "rpop"]),
            keys,
            st.just(None) | st.just([]) | ints,
        )
        | commands(
            st.sampled_from(["lpush", "lpushx", "rpush", "rpushx"]),
            keys,
            st.lists(values),
        )
        | commands(st.just("lrange"), keys, counts, counts)
        | commands(st.just("lrem"), keys, counts, values)
        | commands(st.just("lset"), keys, counts, values)
        | commands(st.just("ltrim"), keys, counts, counts)
        | commands(st.just("rpoplpush"), keys, keys)
    )
    create_command_strategy = commands(
        st.just("rpush"), keys, st.lists(values, min_size=1)
    )
    command_strategy = list_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_server.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    common_commands,
    keys,
    values,
)
from .test_string import string_commands


class TestServer(BaseTest):
    # TODO: real redis raises an error if there is a save already in progress.
    #  Find a better way to test this. commands(st.just('bgsave'))
    server_commands = (
        commands(st.just("dbsize"))
        | commands(st.sampled_from(["flushdb", "flushall"]))
        # TODO: result is non-deterministic
        # | commands(st.just('lastsave'))
        | commands(st.just("save"))
    )
    create_command_strategy = commands(st.just("set"), keys, values)
    command_strategy = server_commands | string_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_set.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    keys,
    common_commands,
    fields,
)


class TestSet(BaseTest):
    set_commands = (
        commands(st.just("sadd"), keys, st.lists(fields))
        | commands(st.just("scard"), keys)
        | commands(st.sampled_from(["sdiff", "sinter", "sunion"]), st.lists(keys))
        | commands(
            st.sampled_from(["sdiffstore", "sinterstore", "sunionstore"]),
            keys,
            st.lists(keys),
        )
        | commands(st.just("sismember"), keys, fields)
        | commands(st.just("smembers"), keys)
        | commands(st.just("smove"), keys, keys, fields)
        | commands(st.just("srem"), keys, st.lists(fields))
    )
    create_command_strategy = commands(
        st.just("sadd"), keys, st.lists(fields, min_size=1)
    )
    command_strategy = set_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_string.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    values,
    keys,
    common_commands,
    counts,
    int_as_bytes,
    zero_or_more,
    ints,
    expires_seconds,
    expires_ms,
)

optional_bitcount_range = st.just(()) | st.tuples(int_as_bytes, int_as_bytes)
# todo: Should be addressed
# str_len = st.integers(min_value=-3, max_value=3) | st.integers(
#     min_value=-2147483647, max_value=2147483648
# )
str_len = st.integers(min_value=-3, max_value=3) | st.integers(
    min_value=-3000, max_value=3000
)

string_commands = (
    commands(st.just("append"), keys, values)
    | commands(st.just("bitcount"), keys, optional_bitcount_range)
    | commands(st.sampled_from(["incr", "decr"]), keys)
    | commands(st.sampled_from(["incrby", "decrby"]), keys, values)
    | commands(st.just("get"), keys)
    | commands(st.just("getbit"), keys, counts)
    | commands(
        st.just("setbit"),
        keys,
        counts,
        st.integers(min_value=0, max_value=1) | ints,
    )
    | commands(st.sampled_from(["substr", "getrange"]), keys, str_len, counts)
    | commands(st.just("getset"), keys, values)
    | commands(st.just("mget"), st.lists(keys))
    | commands(st.sampled_from(["mset", "msetnx"]), st.lists(st.tuples(keys, values)))
    | commands(
        st.just("set"),
        keys,
        values,
        *zero_or_more("nx", "xx", "keepttl"),
    )
    | commands(st.just("setex"), keys, expires_seconds, values)
    | commands(st.just("psetex"), keys, expires_ms, values)
    | commands(st.just("setnx"), keys, values)
    | commands(st.just("setrange"), keys, str_len, values)
    | commands(st.just("strlen"), keys)
)


class TestString(BaseTest):
    create_command_strategy = commands(st.just("set"), keys, values)
    command_strategy = string_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_transaction.py
================================================
import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    values,
    keys,
    common_commands,
    counts,
    zero_or_more,
    ints,
    expires_seconds,
    expires_ms,
)
from .test_string import TestString


class TestTransaction(BaseTest):
    transaction_commands = (
        commands(st.sampled_from(["multi", "discard", "exec", "unwatch"]))
        | commands(st.just("watch"), keys)
        | commands(st.just("append"), keys, values)
        | commands(st.just("bitcount"), keys)
        | commands(st.just("bitcount"), keys, values, values)
        | commands(st.sampled_from(["incr", "decr"]), keys)
        | commands(st.sampled_from(["incrby", "decrby"]), keys, values)
        | commands(st.just("get"), keys)
        | commands(st.just("getbit"), keys, counts)
        | commands(
            st.just("setbit"),
            keys,
            counts,
            st.integers(min_value=0, max_value=1) | ints,
        )
        | commands(st.sampled_from(["substr", "getrange"]), keys, counts, counts)
        | commands(st.just("getset"), keys, values)
        | commands(st.just("mget"), st.lists(keys))
        | commands(
            st.sampled_from(["mset", "msetnx"]), st.lists(st.tuples(keys, values))
        )
        | commands(
            st.just("set"),
            keys,
            values,
            *zero_or_more("nx", "xx", "keepttl"),
        )
        | commands(st.just("setex"), keys, expires_seconds, values)
        | commands(st.just("psetex"), keys, expires_ms, values)
        | commands(st.just("setnx"), keys, values)
        | commands(st.just("setrange"), keys, counts, values)
        | commands(st.just("strlen"), keys)
    )
    create_command_strategy = TestString.create_command_strategy
    command_strategy = transaction_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_hypothesis/test_zset.py
================================================
import operator

import hypothesis.strategies as st

from .base import (
    BaseTest,
    commands,
    keys,
    common_commands,
    counts,
    fields,
    zero_or_more,
    scores,
    Command,
    float_as_bytes,
)

score_tests = scores | st.builds(lambda x: b"(" + repr(x).encode(), scores)
limits = st.just(()) | st.tuples(st.just("limit"), counts, counts)
string_tests = st.sampled_from([b"+", b"-"]) | st.builds(
    operator.add, st.sampled_from([b"(", b"["]), fields
)
zset_no_score_create_commands = commands(
    st.just("zadd"), keys, st.lists(st.tuples(st.just(0), fields), min_size=1)
)
zset_no_score_commands = (  # TODO: test incr
    commands(
        st.just("zadd"),
        keys,
        *zero_or_more("nx", "xx", "ch", "incr"),
        st.lists(st.tuples(st.just(0), fields)),
    )
    | commands(st.just("zlexcount"), keys, string_tests, string_tests)
    | commands(
        st.sampled_from(["zrangebylex", "zrevrangebylex"]),
        keys,
        string_tests,
        string_tests,
        limits,
    )
    | commands(st.just("zremrangebylex"), keys, string_tests, string_tests)
)


def optional(arg):
    return st.none() | st.just(arg)


def build_zstore(command, dest, sources, weights, aggregate) -> Command:
    args = [command, dest, len(sources)]
    args += [source[0] for source in sources]
    if weights:
        args.append("weights")
        args += [source[1] for source in sources]
    if aggregate:
        args += ["aggregate", aggregate]
    return Command(args)


class TestZSet(BaseTest):
    zset_commands = (
        commands(
            st.just("zadd"),
            keys,
            *zero_or_more("nx", "xx", "ch", "incr"),
            st.lists(st.tuples(scores, fields)),
        )
        | commands(st.just("zcard"), keys)
        | commands(st.just("zcount"), keys, score_tests, score_tests)
        | commands(st.just("zincrby"), keys, scores, fields)
        | commands(
            st.sampled_from(["zrange", "zrevrange"]),
            keys,
            counts,
            counts,
            optional("withscores"),
        )
        | commands(
            st.sampled_from(["zrangebyscore", "zrevrangebyscore"]),
            keys,
            score_tests,
            score_tests,
            limits,
            optional("withscores"),
        )
        | commands(st.sampled_from(["zrank", "zrevrank"]), keys, fields)
        | commands(st.just("zrem"), keys, st.lists(fields))
        | commands(st.just("zremrangebyrank"), keys, counts, counts)
        | commands(st.just("zremrangebyscore"), keys, score_tests, score_tests)
        | commands(st.just("zscore"), keys, fields)
        | st.builds(
            build_zstore,
            command=st.sampled_from(["zunionstore", "zinterstore"]),
            dest=keys,
            sources=st.lists(st.tuples(keys, float_as_bytes)),
            weights=st.booleans(),
            aggregate=st.sampled_from([None, "sum", "min", "max"]),
        )
    )
    # TODO: zscan, zpopmin/zpopmax, bzpopmin/bzpopmax, probably more
    create_command_strategy = commands(
        st.just("zadd"), keys, st.lists(st.tuples(scores, fields), min_size=1)
    )
    command_strategy = zset_commands | common_commands


class TestZSetNoScores(BaseTest):
    create_command_strategy = zset_no_score_create_commands
    command_strategy = zset_no_score_commands | common_commands


================================================
FILE: tests/fakeredis/test/test_issues.py
================================================
import pytest
import redis.client


def test_causes_crash(r: redis.Redis):
    key = b"}W\xfa\x87\xf4"
    key2 = b"\xf3\xba\x00\xa1\x1c\xac\x01A\x8b\xc4\xe9\xe2\xa8"
    r.rpush(key, b"!\xef\x9e\xd2", b"1175417134")
    r.rpoplpush(key, key)
    r.lrange(key, -1, 14795)
    with pytest.raises(redis.ResponseError):
        r.rename(key2, key2)
    r.lrange(key, 2, 0)
    r.sort(key, alpha=True)
    r.llen(key)
    r.keys("*")
    r.keys("*")
    r.lindex(key, 1)
    r.exists(key, key2, key, key2, key2, key2, key2, key)
    r.linsert(key, "AFTER", b"inf", b"!\xef\x9e\xd2")
    with pytest.raises(redis.ResponseError):
        r.linsert(
            key,
            b"W8\xe9&",
            b"-43950",
            b"-43950",
        )
    r.rpoplpush(key, key)
    with pytest.raises(redis.ResponseError):
        r.exists()
    r.lrem(key2, -56700, b"-6.816602725023744e+16")
    r.lrem(key, -3, b"1175417134")
    r.llen(key2)
    r.lrem(key, -3, b"!\xef\x9e\xd2")


def test_another_test_causes_crash(r: redis.Redis):
    key1 = b"\xc2\xdb"
    key2 = b"z`\xf8,\xe2\x02\xb3\x85\xc5"
    key3 = b"\xf4<\xe1\xb6\xcb\xde\xaf"
    key4 = b"\xad"
    r.rpush(key1, b"i\x05\x0b\xb1")
    r.rpush(
        key2,
        b"i\x05\x0b\xb1",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
    )
    r.rpush(
        key3,
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
    )
    r.rpush(
        key1,
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
    )
    r.rpush(key2, b"i\x05\x0b\xb1")

    r.lpop(b"")
    with pytest.raises(redis.ResponseError):
        r.rpushx(key4)
    r.move(b"", 1)
    with pytest.raises(redis.ResponseError):
        r.move(key3, -730)
    r.ltrim(key3, -51547, -2)
    r.rpoplpush(key4, b"")

    r.rpush(key4, b"i\x05\x0b\xb1", b"\\h\xf2", b"\\h\xf2")
    r.persist(key2)
    r.exists(key1, key1, key1)

    r.ltrim(b"", -12584, -3)
    r.lrem(key4, -1, b"i\x05\x0b\xb1")
    with pytest.raises(redis.ResponseError):
        r.linsert(key2, b"\xa5", b"\\h\xf2", b"i\x05\x0b\xb1")
    r.linsert(key2, "BEFORE", b"\\h\xf2", b"\\h\xf2")
    r.ltrim(key2, 1, -2_147_483_648)
    r.ltrim(key1, -4200252, 1)
    r.rpush(
        b"",
        b"i\x05\x0b\xb1",
        b"i\x05\x0b\xb1",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
    )
    with pytest.raises(redis.ResponseError):
        r.rpop(key1, -2_147_483_648)
    r.lrem(key1, 77, b"i\x05\x0b\xb1")
    r.rpoplpush(b"", key2)
    r.ltrim(b"", 0, 1)
    r.unlink(b"")
    r.ltrim(key1, 0, 0)
    r.lrem(key3, 31029, b"\\h\xf2")
    r.lrange(key1, -2, -91)
    r.rpoplpush(key1, key2)
    r.rpush(
        key1,
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"\\h\xf2",
        b"i\x05\x0b\xb1",
        b"i\x05\x0b\xb1",
        b"\\h\xf2",
    )
    r.ltrim(key1, 0, 18)
    r.keys("*")
    with pytest.raises(redis.ResponseError):
        r.move(key4, 993)
    r.lrange(b"", 0, 38001)
    with pytest.raises(redis.ResponseError):
        r.sort(key4)
    r.lindex(key1, -2)
    r.rpoplpush(key4, key4)
    r.lrem(key4, -18528, b"\\h\xf2")


================================================
FILE: tests/fakeredis/test/test_json/__init__.py
================================================


================================================
FILE: tests/fakeredis/test/test_json/test_json.py
================================================
"""
Tests for `fakeredis-py`'s emulation of Redis's JSON.GET command subset.
"""

from __future__ import annotations

import json
from test import testtools

import pytest
import redis
from redis.commands.json.path import Path

json_tests = pytest.importorskip("jsonpath_ng")


def test_jsonget(r: redis.Redis):
    data = {"x": "bar", "y": {"x": 33}}
    r.json().set("foo", Path.root_path(), data)
    assert r.json().get("foo") == data
    assert r.json().get("foo", Path("$..x")) == ["bar", 33]

    data2 = {"x": "bar"}
    r.json().set(
        "foo2",
        Path.root_path(),
        data2,
    )
    assert r.json().get("foo2") == data2
    assert r.json().get("foo2", "$") == [
        data2,
    ]
    assert r.json().get("foo2", Path("$.a"), Path("$.x")) == {"$.a": [], "$.x": ["bar"]}

    assert r.json().get("non-existing-key") is None

    r.json().set(
        "foo2",
        Path.root_path(),
        {"x": "bar", "y": {"x": 33}},
    )
    assert r.json().get("foo2") == {"x": "bar", "y": {"x": 33}}
    assert r.json().get("foo2", Path("$..x")) == ["bar", 33]

    r.json().set(
        "foo",
        Path.root_path(),
        {"x": "bar"},
    )
    assert r.json().get("foo") == {"x": "bar"}
    assert r.json().get("foo", Path("$.a"), Path("$.x")) == {"$.a": [], "$.x": ["bar"]}
    assert r.json().get("unknown", "$") is None


def test_json_setgetdeleteforget(r: redis.Redis):
    data = {"x": "bar"}
    assert r.json().set("foo", Path.root_path(), data) == 1
    assert r.json().get("foo") == data
    assert r.json().get("baz") is None
    assert r.json().delete("foo") == 1
    assert r.json().forget("foo") == 0  # second delete
    assert r.exists("foo") == 0


def test_json_delete_with_dollar(r: redis.Redis):
    doc1 = {"a": 1, "nested": {"a": 2, "b": 3}}
    assert r.json().set("doc1", Path.root_path(), doc1)
    assert r.json().delete("doc1", "$..a") == 2
    assert r.json().get("doc1", Path.root_path()) == {"nested": {"b": 3}}

    doc2 = {"a": {"a": 2, "b": 3}, "b": ["a", "b"], "nested": {"b": [True, "a", "b"]}}
    r.json().set("doc2", "$", doc2)
    assert r.json().delete("doc2", "$..a") == 1
    assert r.json().get("doc2", Path.root_path()) == {
        "nested": {"b": [True, "a", "b"]},
        "b": ["a", "b"],
    }

    doc3 = [
        {
            "ciao": ["non ancora"],
            "nested": [
                {"ciao": [1, "a"]},
                {"ciao": [2, "a"]},
                {"ciaoc": [3, "non", "ciao"]},
                {"ciao": [4, "a"]},
                {"e": [5, "non", "ciao"]},
            ],
        }
    ]
    assert r.json().set("doc3", Path.root_path(), doc3)
    assert r.json().delete("doc3", '$.[0]["nested"]..ciao') == 3

    doc3val = [
        [
            {
                "ciao": ["non ancora"],
                "nested": [
                    {},
                    {},
                    {"ciaoc": [3, "non", "ciao"]},
                    {},
                    {"e": [5, "non", "ciao"]},
                ],
            }
        ]
    ]
    assert r.json().get("doc3", Path.root_path()) == doc3val[0]

    # Test default path
    assert r.json().delete("doc3") == 1
    assert r.json().get("doc3", Path.root_path()) is None

    r.json().delete("not_a_document", "..a")


def test_json_et_non_dict_value(r: redis.Redis):
    r.json().set(
        "str",
        Path.root_path(),
        "str_val",
    )
    assert r.json().get("str") == "str_val"

    r.json().set("bool", Path.root_path(), True)
    assert r.json().get("bool") is True

    r.json().set("bool", Path.root_path(), False)
    assert r.json().get("bool") is False


def test_jsonset_existential_modifiers_should_succeed(r: redis.Redis):
    obj = {"foo": "bar"}
    assert r.json().set("obj", Path.root_path(), obj)

    # Test that flags prevent updates when conditions are unmet
    assert (
        r.json().set(
            "obj",
            Path("foo"),
            "baz",
            nx=True,
        )
        is None
    )
    assert r.json().get("obj") == obj

    assert (
        r.json().set(
            "obj",
            Path("qaz"),
            "baz",
            xx=True,
        )
        is None
    )
    assert r.json().get("obj") == obj

    # Test that flags allow updates when conditions are met
    assert r.json().set("obj", Path("foo"), "baz", xx=True) == 1
    assert r.json().set("obj", Path("foo2"), "qaz", nx=True) == 1
    assert r.json().get("obj") == {"foo": "baz", "foo2": "qaz"}

    # Test with raw
    obj = {"foo": "bar"}
    testtools.raw_command(r, "json.set", "obj", "$", json.dumps(obj))
    assert r.json().get("obj") == obj


def test_jsonset_flags_should_be_mutually_exclusive(r: redis.Redis):
    with pytest.raises(Exception):
        r.json().set("obj", Path("foo"), "baz", nx=True, xx=True)
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(
            r, "json.set", "obj", "$", json.dumps({"foo": "bar"}), "NX", "XX"
        )


def test_json_unknown_param(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(
            r, "json.set", "obj", "$", json.dumps({"foo": "bar"}), "unknown"
        )


def test_jsonmget(r: redis.Redis):
    # Test mget with multi paths
    r.json().set(
        "doc1",
        "$",
        {"a": 1, "b": 2, "nested": {"a": 3}, "c": None, "nested2": {"a": None}},
    )
    r.json().set(
        "doc2",
        "$",
        {"a": 4, "b": 5, "nested": {"a": 6}, "c": None, "nested2": {"a": [None]}},
    )
    r.json().set(
        "doc3",
        "$",
        {
            "a": 5,
            "b": 5,
            "nested": {"a": 8},
            "c": None,
            "nested2": {"a": {"b": "nested3"}},
        },
    )
    # Compare also to single JSON.GET
    assert r.json().get("doc1", Path("$..a")) == [1, 3, None]
    assert r.json().get("doc2", "$..a") == [4, 6, [None]]
    assert r.json().get("doc3", "$..a") == [5, 8, {"b": "nested3"}]

    # Test mget with single path
    assert r.json().mget(["doc1"], "$..a") == [[1, 3, None]]

    # Test mget with multi path
    assert r.json().mget(["doc1", "doc2", "doc3"], "$..a") == [
        [1, 3, None],
        [4, 6, [None]],
        [5, 8, {"b": "nested3"}],
    ]

    # Test missing key
    assert r.json().mget(["doc1", "missing_doc"], "$..a") == [[1, 3, None], None]

    assert r.json().mget(["missing_doc1", "missing_doc2"], "$..a") == [None, None]


def test_jsonmget_should_succeed(r: redis.Redis):
    r.json().set("1", Path.root_path(), 1)
    r.json().set("2", Path.root_path(), 2)

    assert r.json().mget(["1"], Path.root_path()) == [1]

    assert r.json().mget([1, 2], Path.root_path()) == [1, 2]


def test_jsonclear(r: redis.Redis):
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )

    assert 1 == r.json().clear(
        "arr",
        Path.root_path(),
    )
    assert [] == r.json().get("arr")


def test_jsonclear_dollar(r: redis.Redis):
    data = {
        "nested1": {"a": {"foo": 10, "bar": 20}},
        "a": ["foo"],
        "nested2": {"a": "claro"},
        "nested3": {"a": {"baz": 50}},
    }
    r.json().set("doc1", "$", data)
    # Test multi
    assert r.json().clear("doc1", "$..a") == 3

    assert r.json().get("doc1", "$") == [
        {"nested1": {"a": {}}, "a": [], "nested2": {"a": "claro"}, "nested3": {"a": {}}}
    ]

    # Test single
    r.json().set("doc1", "$", data)
    assert r.json().clear("doc1", "$.nested1.a") == 1
    assert r.json().get("doc1", "$") == [
        {
            "nested1": {"a": {}},
            "a": ["foo"],
            "nested2": {"a": "claro"},
            "nested3": {"a": {"baz": 50}},
        }
    ]

    # Test missing path (defaults to root)
    assert r.json().clear("doc1") == 1
    assert r.json().get("doc1", "$") == [{}]


def test_jsonclear_no_doc(r: redis.Redis):
    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().clear("non_existing_doc", "$..a")


def test_jsonstrlen(r: redis.Redis):
    data = {"x": "bar", "y": {"x": 33}}
    r.json().set("foo", Path.root_path(), data)
    assert r.json().strlen("foo", Path("$..x")) == [3, None]

    r.json().set("foo2", Path.root_path(), "data2")
    assert r.json().strlen("foo2") == 5
    assert r.json().strlen("foo2", Path.root_path()) == 5

    r.json().set("foo3", Path.root_path(), {"x": "string"})
    assert r.json().strlen("foo3", Path("$.x")) == [
        6,
    ]

    assert r.json().strlen("non-existing") is None

    r.json().set("str", Path.root_path(), "foo")
    assert r.json().strlen("str", Path.root_path()) == 3
    # Test multi
    r.json().set(
        "doc1", "$", {"a": "foo", "nested1": {"a": "hello"}, "nested2": {"a": 31}}
    )
    assert r.json().strlen("doc1", "$..a") == [3, 5, None]

    res2 = r.json().strappend("doc1", "bar", "$..a")
    res1 = r.json().strlen("doc1", "$..a")
    assert res1 == res2

    # Test single
    assert r.json().strlen("doc1", "$.nested1.a") == [8]
    assert r.json().strlen("doc1", "$.nested2.a") == [None]

    # Test missing key
    # Note: Dragonfly returns NIL in the accordance to the official docs
    # with pytest.raises(redis.ResponseError):
    #    r.json().strlen("non_existing_doc", "$..a")


def test_toggle(r: redis.Redis):
    r.json().set("bool", Path.root_path(), False)
    assert r.json().toggle("bool", Path.root_path())
    assert r.json().toggle("bool", Path.root_path()) is False

    r.json().set("num", Path.root_path(), 1)

    with pytest.raises(redis.exceptions.ResponseError):
        r.json().toggle("num", Path.root_path())


def test_toggle_dollar(r: redis.Redis):
    data = {
        "a": ["foo"],
        "nested1": {"a": False},
        "nested2": {"a": 31},
        "nested3": {"a": True},
    }
    r.json().set("doc1", "$", data)
    # Test multi
    assert r.json().toggle("doc1", "$..a") == [None, 1, None, 0]
    data["nested1"]["a"] = True
    data["nested3"]["a"] = False
    assert r.json().get("doc1", "$") == [data]

    # Test missing key
    with pytest.raises(redis.exceptions.ResponseError):
        r.json().toggle("non_existing_doc", "$..a")


def test_json_commands_in_pipeline(r: redis.Redis):
    p = r.json().pipeline()
    p.set("foo", Path.root_path(), "bar")
    p.get("foo")
    p.delete("foo")
    assert [True, "bar", 1] == p.execute()
    assert r.keys() == []
    assert r.get("foo") is None

    # now with a true, json object
    r.flushdb()
    p = r.json().pipeline()
    d = {"hello": "world", "oh": "snap"}

    with pytest.deprecated_call():
        p.jsonset("foo", Path.root_path(), d)
        p.jsonget("foo")

    p.exists("not-a-real-key")
    p.delete("foo")

    assert [True, d, 0, 1] == p.execute()
    assert r.keys() == []
    assert r.get("foo") is None


def test_strappend(r: redis.Redis):
    # Test single
    r.json().set("json-key", Path.root_path(), "foo")
    assert r.json().strappend("json-key", "bar") == 6
    assert "foobar" == r.json().get("json-key", Path.root_path())

    # Test multi
    r.json().set(
        "doc1",
        Path.root_path(),
        {
            "a": "foo",
            "nested1": {"a": "hello"},
            "nested2": {"a": 31},
        },
    )
    assert r.json().strappend("doc1", "bar", "$..a") == [6, 8, None]
    assert r.json().get("doc1") == {
        "a": "foobar",
        "nested1": {"a": "hellobar"},
        "nested2": {"a": 31},
    }

    # Test single
    assert r.json().strappend(
        "doc1",
        "baz",
        "$.nested1.a",
    ) == [11]
    assert r.json().get("doc1") == {
        "a": "foobar",
        "nested1": {"a": "hellobarbaz"},
        "nested2": {"a": 31},
    }

    # Test missing key
    with pytest.raises(redis.exceptions.ResponseError):
        r.json().strappend("non_existing_doc", "$..a", "err")

    # Test multi
    r.json().set(
        "doc2",
        Path.root_path(),
        {
            "a": "foo",
            "nested1": {"a": "hello"},
            "nested2": {"a": "hi"},
        },
    )
    assert r.json().strappend("doc2", "bar", "$.*.a") == [8, 5]
    assert r.json().get("doc2") == {
        "a": "foo",
        "nested1": {"a": "hellobar"},
        "nested2": {"a": "hibar"},
    }

    # Test missing path
    r.json().set(
        "doc1",
        Path.root_path(),
        {
            "a": "foo",
            "nested1": {"a": "hello"},
            "nested2": {"a": 31},
        },
    )
    with pytest.raises(redis.exceptions.ResponseError):
        r.json().strappend("doc1", "add", "piu")

    # Test raw command with no arguments
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "json.strappend", "")


@pytest.mark.decode_responses(True)
def test_decode_null(r: redis.Redis):
    assert r.json().get("abc") is None


def test_decode_response_disabaled_null(r: redis.Redis):
    assert r.json().get("abc") is None


def test_json_get_jset(r: redis.Redis):
    assert r.json().set("foo", Path.root_path(), "bar") == 1
    assert "bar" == r.json().get("foo")
    assert r.json().get("baz") is None
    assert 1 == r.json().delete("foo")
    assert r.exists("foo") == 0


def test_nonascii_setgetdelete(r: redis.Redis):
    assert r.json().set(
        "not-ascii",
        Path.root_path(),
        "hyvää-élève",
    )
    assert "hyvää-élève" == r.json().get(
        "not-ascii",
        no_escape=True,
    )
    assert 1 == r.json().delete("not-ascii")
    assert r.exists("not-ascii") == 0


def test_json_setbinarykey(r: redis.Redis):
    data = {"hello": "world", b"some": "value"}

    with pytest.raises(TypeError):
        r.json().set("some-key", Path.root_path(), data)

    assert r.json().set("some-key", Path.root_path(), data, decode_keys=True)


def test_set_file(r: redis.Redis):
    # Standard Library Imports
    import json
    import tempfile

    obj = {"hello": "world"}
    jsonfile = tempfile.NamedTemporaryFile(suffix=".json")
    with open(jsonfile.name, "w+") as fp:
        fp.write(json.dumps(obj))

    no_json_file = tempfile.NamedTemporaryFile()
    no_json_file.write(b"Hello World")

    assert r.json().set_file("test", Path.root_path(), jsonfile.name)
    assert r.json().get("test") == obj
    with pytest.raises(json.JSONDecodeError):
        r.json().set_file("test2", Path.root_path(), no_json_file.name)


def test_set_path(r: redis.Redis):
    # Standard Library Imports
    import json
    import tempfile

    root = tempfile.mkdtemp()
    jsonfile = tempfile.NamedTemporaryFile(mode="w+", dir=root, delete=False)
    no_json_file = tempfile.NamedTemporaryFile(mode="a+", dir=root, delete=False)
    jsonfile.write(json.dumps({"hello": "world"}))
    jsonfile.close()
    no_json_file.write("hello")

    result = {jsonfile.name: True, no_json_file.name: False}
    assert r.json().set_path(Path.root_path(), root) == result
    assert r.json().get(jsonfile.name.rsplit(".")[0]) == {"hello": "world"}


def test_type(r: redis.Redis):
    r.json().set("1", Path.root_path(), 1)

    assert r.json().type("1", Path.root_path()) == b"integer"
    assert r.json().type("1") == b"integer"  # noqa: E721

    meta_data = {
        "object": {},
        "array": [],
        "string": "str",
        "integer": 42,
        "number": 1.2,
        "boolean": False,
        "null": None,
    }
    data = {k: {"a": meta_data[k]} for k in meta_data}
    r.json().set("doc1", "$", data)

    # Dragonfly does not guarantee the traversal order for multi field traversal
    # json.type api assumes a predefined order and is not designed very well.
    # Test multi by comparing unordered sets
    assert set(r.json().type("doc1", "$..a")) == set(
        [k.encode() for k in meta_data.keys()]
    )  # noqa: E721

    # Test single
    assert r.json().type("doc1", "$.integer.a") == [b"integer"]  # noqa: E721
    assert r.json().type("doc1") == b"object"  # noqa: E721

    # Test missing key
    assert r.json().type("non_existing_doc", "..a") is None


def test_objlen(r: redis.Redis):
    # Test missing key, and path
    with pytest.raises(redis.ResponseError):
        r.json().objlen("non_existing_doc", "$..a")

    obj = {"foo": "bar", "baz": "qaz"}

    r.json().set("obj", Path.root_path(), obj)
    assert len(obj) == r.json().objlen("obj", Path.root_path())

    r.json().set("obj", Path.root_path(), obj)
    assert len(obj) == r.json().objlen("obj")
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": {"foo": 10, "bar": 20}},
            "nested2": {"a": {"baz": 50}},
        },
    )
    # Test multi
    assert r.json().objlen("doc1", "$..a") == [None, 2, 1]
    # Test single
    assert r.json().objlen("doc1", "$.nested1.a") == [2]

    assert r.json().objlen("doc1", "$.nowhere") == []

    # Test legacy
    assert r.json().objlen("doc1", ".*.a") == 2

    # Test single
    assert r.json().objlen("doc1", ".nested2.a") == 1

    # Test missing key
    assert r.json().objlen("non_existing_doc", "..a") is None

    # Test missing path
    # with pytest.raises(exceptions.ResponseError):
    r.json().objlen("doc1", ".nowhere")


def test_objkeys(r: redis.Redis):
    obj = {"foo": "bar", "baz": "qaz"}
    r.json().set("obj", Path.root_path(), obj)
    keys = r.json().objkeys("obj", Path.root_path())
    keys.sort()
    exp = list(obj.keys())
    exp.sort()
    assert exp == keys

    r.json().set("obj", Path.root_path(), obj)

    # Dragonfly does not guarantee the order (implementation detail)
    assert set(r.json().objkeys("obj")) == obj.keys()

    assert r.json().objkeys("fakekey") is None

    r.json().set(
        "doc1",
        "$",
        {
            "nested1": {"a": {"foo": 10, "bar": 20}},
            "a": ["foo"],
            "nested2": {"a": {"baz": 50}},
        },
    )

    # Test single
    assert set(r.json().objkeys("doc1", "$.nested1.a")[0]) == {b"foo", b"bar"}

    # Test legacy
    assert set(r.json().objkeys("doc1", ".*.a")) == {"foo", "bar"}
    # Test single
    assert r.json().objkeys("doc1", ".nested2.a") == ["baz"]

    # Test missing key
    assert r.json().objkeys("non_existing_doc", "..a") is None

    # Test non existing doc
    with pytest.raises(redis.ResponseError):
        assert r.json().objkeys("non_existing_doc", "$..a") == []

    assert r.json().objkeys("doc1", "$..nowhere") == []


def test_numincrby(r: redis.Redis):
    r.json().set("num", Path.root_path(), 1)

    assert 2 == r.json().numincrby("num", Path.root_path(), 1)
    assert 2.5 == r.json().numincrby("num", Path.root_path(), 0.5)
    assert 1.25 == r.json().numincrby("num", Path.root_path(), -1.25)
    # Test NUMINCRBY
    r.json().set("doc1", "$", {"a": "b", "b": [{"a": 2}, {"a": 5.0}, {"a": "c"}]})
    # Test multi
    assert r.json().numincrby("doc1", "$..a", 2) == [None, 4, 7.0, None]

    assert r.json().numincrby("doc1", "$..a", 2.5) == [None, 6.5, 9.5, None]
    # Test single
    assert r.json().numincrby("doc1", "$.b[1].a", 2) == [11.5]

    assert r.json().numincrby("doc1", "$.b[2].a", 2) == [None]
    assert r.json().numincrby("doc1", "$.b[1].a", 3.5) == [15.0]


def test_nummultby(r: redis.Redis):
    r.json().set("num", Path.root_path(), 1)

    with pytest.deprecated_call():
        assert r.json().nummultby("num", Path.root_path(), 2) == 2
        assert r.json().nummultby("num", Path.root_path(), 2.5) == 5
        assert r.json().nummultby("num", Path.root_path(), 0.5) == 2.5

    r.json().set("doc1", "$", {"a": "b", "b": [{"a": 2}, {"a": 5.0}, {"a": "c"}]})

    # test list
    with pytest.deprecated_call():
        assert r.json().nummultby("doc1", "$..a", 2) == [None, 4, 10, None]
        assert r.json().nummultby("doc1", "$..a", 2.5) == [None, 10.0, 25.0, None]

    # Test single
    with pytest.deprecated_call():
        assert r.json().nummultby("doc1", "$.b[1].a", 2) == [50.0]
        assert r.json().nummultby("doc1", "$.b[2].a", 2) == [None]
        assert r.json().nummultby("doc1", "$.b[1].a", 3) == [150.0]

    # test missing keys
    with pytest.raises(redis.ResponseError):
        r.json().numincrby("non_existing_doc", "$..a", 2)
        r.json().nummultby("non_existing_doc", "$..a", 2)

    # Test legacy NUMINCRBY
    r.json().set("doc1", "$", {"a": "b", "b": [{"a": 2}, {"a": 5.0}, {"a": "c"}]})
    assert r.json().numincrby("doc1", ".b[0].a", 3) == 5

    # Test legacy NUMMULTBY
    r.json().set("doc1", "$", {"a": "b", "b": [{"a": 2}, {"a": 5.0}, {"a": "c"}]})

    with pytest.deprecated_call():
        assert r.json().nummultby("doc1", ".b[0].a", 3) == 6


@testtools.run_test_if_redispy_ver("gte", "4.6")
@pytest.mark.min_server("7.1")
def test_json_merge(r: redis.Redis):
    # Test with root path $
    assert r.json().set(
        "person_data",
        "$",
        {"person1": {"personal_data": {"name": "John"}}},
    )
    assert r.json().merge(
        "person_data", "$", {"person1": {"personal_data": {"hobbies": "reading"}}}
    )
    assert r.json().get("person_data") == {
        "person1": {"personal_data": {"name": "John", "hobbies": "reading"}}
    }

    # Test with root path path $.person1.personal_data
    assert r.json().merge(
        "person_data", "$.person1.personal_data", {"country": "Israel"}
    )
    assert r.json().get("person_data") == {
        "person1": {
            "personal_data": {"name": "John", "hobbies": "reading", "country": "Israel"}
        }
    }

    # Test with null value to delete a value
    assert r.json().merge("person_data", "$.person1.personal_data", {"name": None})
    assert r.json().get("person_data") == {
        "person1": {"personal_data": {"country": "Israel", "hobbies": "reading"}}
    }


@testtools.run_test_if_redispy_ver("gte", "4.6")
@pytest.mark.min_server("7.1")
def test_mset(r: redis.Redis):
    r.json().mset([("1", Path.root_path(), 1), ("2", Path.root_path(), 2)])

    assert r.json().mget(["1"], Path.root_path()) == [1]
    assert r.json().mget(["1", "2"], Path.root_path()) == [1, 2]


================================================
FILE: tests/fakeredis/test/test_json/test_json_arr_commands.py
================================================
import pytest
import redis
from redis.commands.json.path import Path

from test.testtools import raw_command

json_tests = pytest.importorskip("jsonpath_ng")


def test_arrlen(r: redis.Redis):
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert (
        r.json().arrlen(
            "arr",
            Path.root_path(),
        )
        == 5
    )
    assert r.json().arrlen("arr") == 5
    assert r.json().arrlen("fake-key") is None

    r.json().set(
        "doc1",
        Path.root_path(),
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )

    assert r.json().arrlen("doc1", "$..a") == [1, 3, None]
    assert r.json().arrlen("doc1", "$.nested1.a") == [3]

    r.json().set(
        "doc2",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", 1, 1, None, "world"]},
            "nested2": {"a": 31},
        },
    )
    assert r.json().arrlen("doc2", "$..a") == [1, 5, None]
    assert r.json().arrlen("doc2", ".nested1.a") == 5
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )

    # Test multi
    assert r.json().arrlen("doc1", "$..a") == [1, 3, None]
    assert r.json().arrappend("doc1", "$..a", "non", "abba", "stanza") == [
        4,
        6,
        None,
    ]

    r.json().clear("doc1", "$.a")
    assert r.json().arrlen("doc1", "$..a") == [0, 6, None]
    # Test single
    assert r.json().arrlen("doc1", "$.nested1.a") == [6]

    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrappend("non_existing_doc", "$..a")

    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi (return result of last path)
    assert r.json().arrlen("doc1", "$..a") == [1, 3, None]
    assert r.json().arrappend("doc1", "..a", "non", "abba", "stanza") == 6

    # Test single
    assert r.json().arrlen("doc1", ".nested1.a") == 6

    # Test missing key
    assert r.json().arrlen("non_existing_doc", "..a") is None


def test_arrappend(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.json().arrappend("non-existing-key", Path.root_path(), 2)

    r.json().set("arr", Path.root_path(), [1])
    assert r.json().arrappend("arr", Path.root_path(), 2) == 2
    assert r.json().arrappend("arr", Path.root_path(), 3, 4) == 4
    assert r.json().arrappend("arr", Path.root_path(), *[5, 6, 7]) == 7
    assert r.json().get("arr") == [1, 2, 3, 4, 5, 6, 7]
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi
    assert r.json().arrappend("doc1", "$..a", "bar", "racuda") == [3, 5, None]
    assert r.json().get("doc1", "$") == [
        {
            "a": ["foo", "bar", "racuda"],
            "nested1": {"a": ["hello", None, "world", "bar", "racuda"]},
            "nested2": {"a": 31},
        }
    ]
    assert r.json().arrappend("doc1", "$.nested1.a", "baz") == [6]

    # Test legacy
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi (all paths are updated, but return result of last path)
    assert r.json().arrappend("doc1", "..a", "bar", "racuda") == 5

    assert r.json().get("doc1", "$") == [
        {
            "a": ["foo", "bar", "racuda"],
            "nested1": {"a": ["hello", None, "world", "bar", "racuda"]},
            "nested2": {"a": 31},
        }
    ]
    # Test single
    assert r.json().arrappend("doc1", ".nested1.a", "baz") == 6
    assert r.json().get("doc1", "$") == [
        {
            "a": ["foo", "bar", "racuda"],
            "nested1": {"a": ["hello", None, "world", "bar", "racuda", "baz"]},
            "nested2": {"a": 31},
        }
    ]

    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrappend("non_existing_doc", "$..a")


def test_arrindex(r: redis.Redis):
    r.json().set(
        "foo",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )

    assert r.json().arrindex("foo", Path.root_path(), 1) == 1
    assert r.json().arrindex("foo", Path.root_path(), 1, 2) == -1

    r.json().set(
        "store",
        "$",
        {
            "store": {
                "book": [
                    {
                        "category": "reference",
                        "author": "Nigel Rees",
                        "title": "Sayings of the Century",
                        "price": 8.95,
                        "size": [10, 20, 30, 40],
                    },
                    {
                        "category": "fiction",
                        "author": "Evelyn Waugh",
                        "title": "Sword of Honour",
                        "price": 12.99,
                        "size": [50, 60, 70, 80],
                    },
                    {
                        "category": "fiction",
                        "author": "Herman Melville",
                        "title": "Moby Dick",
                        "isbn": "0-553-21311-3",
                        "price": 8.99,
                        "size": [5, 10, 20, 30],
                    },
                    {
                        "category": "fiction",
                        "author": "J. R. R. Tolkien",
                        "title": "The Lord of the Rings",
                        "isbn": "0-395-19395-8",
                        "price": 22.99,
                        "size": [5, 6, 7, 8],
                    },
                ],
                "bicycle": {"color": "red", "price": 19.95},
            }
        },
    )

    # Temporary disable filter expressions tests
    #
    # assert r.json().get("store", "$.store.book[?(@.price<10)].size") == [
    #     [10, 20, 30, 40],
    #     [5, 10, 20, 30],
    # ]
    # assert r.json().arrindex("store", "$.store.book[?(@.price<10)].size", "20") == [
    #     -1,
    #     -1,
    # ]

    # Test index of int scalar in multi values
    r.json().set(
        "test_num",
        ".",
        [
            {"arr": [0, 1, 3.0, 3, 2, 1, 0, 3]},
            {"nested1_found": {"arr": [5, 4, 3, 2, 1, 0, 1, 2, 3.0, 2, 4, 5]}},
            {"nested2_not_found": {"arr": [2, 4, 6]}},
            {"nested3_scalar": {"arr": "3"}},
            [
                {"nested41_not_arr": {"arr_renamed": [1, 2, 3]}},
                {"nested42_empty_arr": {"arr": []}},
            ],
        ],
    )

    assert r.json().get("test_num", "$..arr") == [
        [0, 1, 3.0, 3, 2, 1, 0, 3],
        [5, 4, 3, 2, 1, 0, 1, 2, 3.0, 2, 4, 5],
        [2, 4, 6],
        "3",
        [],
    ]

    assert r.json().arrindex("test_num", "$..nonexistingpath", 3) == []
    assert r.json().arrindex("test_num", "$..arr", 3) == [3, 2, -1, None, -1]

    # Test index of double scalar in multi values
    assert r.json().arrindex("test_num", "$..arr", 3.0) == [2, 8, -1, None, -1]

    # Test index of string scalar in multi values
    r.json().set(
        "test_string",
        ".",
        [
            {"arr": ["bazzz", "bar", 2, "baz", 2, "ba", "baz", 3]},
            {
                "nested1_found": {
                    "arr": [None, "baz2", "buzz", 2, 1, 0, 1, "2", "baz", 2, 4, 5]
                }
            },
            {"nested2_not_found": {"arr": ["baz2", 4, 6]}},
            {"nested3_scalar": {"arr": "3"}},
            [
                {"nested41_arr": {"arr_renamed": [1, "baz", 3]}},
                {"nested42_empty_arr": {"arr": []}},
            ],
        ],
    )
    assert r.json().get("test_string", "$..arr") == [
        ["bazzz", "bar", 2, "baz", 2, "ba", "baz", 3],
        [None, "baz2", "buzz", 2, 1, 0, 1, "2", "baz", 2, 4, 5],
        ["baz2", 4, 6],
        "3",
        [],
    ]

    assert r.json().arrindex("test_string", "$..arr", "baz") == [
        3,
        8,
        -1,
        None,
        -1,
    ]

    assert r.json().arrindex("test_string", "$..arr", "baz", 2) == [
        3,
        8,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "baz", 4) == [
        6,
        8,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "baz", -5) == [
        3,
        8,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "baz", 4, 7) == [
        6,
        -1,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "baz", 4, -1) == [
        6,
        8,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "baz", 4, 0) == [
        6,
        8,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "5", 7, -1) == [
        -1,
        -1,
        -1,
        None,
        -1,
    ]
    assert r.json().arrindex("test_string", "$..arr", "5", 7, 0) == [
        -1,
        -1,
        -1,
        None,
        -1,
    ]

    # Test index of None scalar in multi values
    r.json().set(
        "test_None",
        ".",
        [
            {"arr": ["bazzz", "None", 2, None, 2, "ba", "baz", 3]},
            {
                "nested1_found": {
                    "arr": ["zaz", "baz2", "buzz", 2, 1, 0, 1, "2", None, 2, 4, 5]
                }
            },
            {"nested2_not_found": {"arr": ["None", 4, 6]}},
            {"nested3_scalar": {"arr": None}},
            [
                {"nested41_arr": {"arr_renamed": [1, None, 3]}},
                {"nested42_empty_arr": {"arr": []}},
            ],
        ],
    )
    assert r.json().get("test_None", "$..arr") == [
        ["bazzz", "None", 2, None, 2, "ba", "baz", 3],
        ["zaz", "baz2", "buzz", 2, 1, 0, 1, "2", None, 2, 4, 5],
        ["None", 4, 6],
        None,
        [],
    ]

    # Test with none-scalar value
    # assert r.json().arrindex("test_None", "$..nested42_empty_arr.arr", {"arr": []}) == [-1]

    # Test legacy (path begins with dot)
    # Test index of int scalar in single value
    assert r.json().arrindex("test_num", ".[0].arr", 3) == 3
    assert r.json().arrindex("test_num", ".[0].arr", 9) == -1

    with pytest.raises(redis.ResponseError):
        r.json().arrindex("test_num", ".[0].arr_not", 3)
    # Test index of string scalar in single value
    assert r.json().arrindex("test_string", ".[0].arr", "baz") == 3
    assert r.json().arrindex("test_string", ".[0].arr", "faz") == -1
    # Test index of None scalar in single value
    assert r.json().arrindex("test_None", ".[0].arr", "None") == 1
    assert r.json().arrindex("test_None", "..nested2_not_found.arr", "None") == 0


def test_arrinsert(r: redis.Redis):
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 4],
    )

    assert r.json().arrinsert("arr", Path.root_path(), 1, *[1, 2, 3]) == 5
    assert r.json().get("arr") == [0, 1, 2, 3, 4]

    # test prepends
    r.json().set("val2", Path.root_path(), [5, 6, 7, 8, 9])
    assert r.json().arrinsert("val2", Path.root_path(), 0, ["some", "thing"]) == 6
    assert r.json().get("val2") == [["some", "thing"], 5, 6, 7, 8, 9]
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi
    assert r.json().arrinsert("doc1", "$..a", "1", "bar", "racuda") == [3, 5, None]

    assert r.json().get("doc1", "$") == [
        {
            "a": ["foo", "bar", "racuda"],
            "nested1": {"a": ["hello", "bar", "racuda", None, "world"]},
            "nested2": {"a": 31},
        }
    ]
    # Test single
    assert r.json().arrinsert("doc1", "$.nested1.a", -2, "baz") == [6]
    assert r.json().get("doc1", "$") == [
        {
            "a": ["foo", "bar", "racuda"],
            "nested1": {"a": ["hello", "bar", "racuda", "baz", None, "world"]},
            "nested2": {"a": 31},
        }
    ]

    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrappend("non_existing_doc", "$..a")


def test_arrpop(r: redis.Redis):
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert raw_command(r, "json.arrpop", "arr") == b"4"

    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrpop("arr", Path.root_path(), 4) == 4
    assert r.json().arrpop("arr", Path.root_path(), -1) == 3
    assert r.json().arrpop("arr", Path.root_path()) == 2
    assert r.json().arrpop("arr", Path.root_path(), 0) == 0
    assert r.json().get("arr") == [1]

    # test out of bounds
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrpop("arr", Path.root_path(), 99) == 4

    # none test
    r.json().set(
        "arr",
        Path.root_path(),
        [],
    )
    assert r.json().arrpop("arr") is None

    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )

    # # Test legacy
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi (all paths are updated, but return result of last path)
    assert r.json().arrpop("doc1", "..a", "1") is None
    assert r.json().get("doc1", "$") == [
        {"a": [], "nested1": {"a": ["hello", "world"]}, "nested2": {"a": 31}}
    ]

    # # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrpop("non_existing_doc", "..a")


def test_arrtrim(r: redis.Redis):
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )

    assert r.json().arrtrim("arr", Path.root_path(), 1, 3) == 3
    assert r.json().get("arr") == [1, 2, 3]

    # <0 test, should be 0 equivalent
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrtrim("arr", Path.root_path(), -1, 3) == 0

    # testing stop > end
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrtrim("arr", Path.root_path(), 3, 99) == 2

    # start > array size and stop
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrtrim("arr", Path.root_path(), 9, 1) == 0

    # all larger
    r.json().set(
        "arr",
        Path.root_path(),
        [0, 1, 2, 3, 4],
    )
    assert r.json().arrtrim("arr", Path.root_path(), 9, 11) == 0

    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )
    # Test multi
    assert r.json().arrtrim("doc1", "$..a", "1", -1) == [0, 2, None]
    assert r.json().get("doc1", "$") == [
        {"a": [], "nested1": {"a": [None, "world"]}, "nested2": {"a": 31}}
    ]

    r.json().set(
        "doc1", "$", {"a": [], "nested1": {"a": [None, "world"]}, "nested2": {"a": 31}}
    )
    assert r.json().arrtrim("doc1", "$..a", "1", "1") == [0, 1, None]
    assert r.json().get("doc1", "$") == [
        {"a": [], "nested1": {"a": ["world"]}, "nested2": {"a": 31}}
    ]
    # Test single
    assert r.json().arrtrim("doc1", "$.nested1.a", 1, 0) == [0]
    assert r.json().get("doc1", "$") == [
        {"a": [], "nested1": {"a": []}, "nested2": {"a": 31}}
    ]

    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrtrim("non_existing_doc", "..a", "0", 1)

    # Test legacy
    r.json().set(
        "doc1",
        "$",
        {
            "a": ["foo"],
            "nested1": {"a": ["hello", None, "world"]},
            "nested2": {"a": 31},
        },
    )

    # Test multi (all paths are updated, but return result of last path)
    assert r.json().arrtrim("doc1", "..a", "1", "-1") == 2

    # Test single
    assert r.json().arrtrim("doc1", ".nested1.a", "1", "1") == 1
    assert r.json().get("doc1", "$") == [
        {"a": [], "nested1": {"a": ["world"]}, "nested2": {"a": 31}}
    ]

    # Test missing key
    with pytest.raises(redis.ResponseError):
        r.json().arrtrim("non_existing_doc", "..a", 1, 1)


================================================
FILE: tests/fakeredis/test/test_json/test_json_commands.py
================================================
"""Tests for `fakeredis-py`'s emulation of Redis's JSON command subset."""

from __future__ import annotations

from typing import (
    Any,
    Dict,
    List,
    Tuple,
)

import pytest

json_tests = pytest.importorskip("jsonpath_ng")


SAMPLE_DATA = {
    "a": ["foo"],
    "nested1": {"a": ["hello", None, "world"]},
    "nested2": {"a": 31},
}


@pytest.fixture(scope="function")
def json_data() -> Dict[str, Any]:
    """A module-scoped "blob" of JSON-encodable data."""
    return {
        "L1": {
            "a": {
                "A1_B1": 10,
                "A1_B2": False,
                "A1_B3": {
                    "A1_B3_C1": None,
                    "A1_B3_C2": [
                        "A1_B3_C2_D1_1",
                        "A1_B3_C2_D1_2",
                        -19.5,
                        "A1_B3_C2_D1_4",
                        "A1_B3_C2_D1_5",
                        {"A1_B3_C2_D1_6_E1": True},
                    ],
                    "A1_B3_C3": [1],
                },
                "A1_B4": {"A1_B4_C1": "foo"},
            }
        },
        "L2": {
            "a": {
                "A2_B1": 20,
                "A2_B2": False,
                "A2_B3": {
                    "A2_B3_C1": None,
                    "A2_B3_C2": [
                        "A2_B3_C2_D1_1",
                        "A2_B3_C2_D1_2",
                        -37.5,
                        "A2_B3_C2_D1_4",
                        "A2_B3_C2_D1_5",
                        {"A2_B3_C2_D1_6_E1": False},
                    ],
                    "A2_B3_C3": [2],
                },
                "A2_B4": {"A2_B4_C1": "bar"},
            }
        },
    }


def load_types_data(nested_key_name: str) -> Tuple[Dict[str, Any], List[bytes]]:
    """Generate a structure with sample of all types"""
    type_samples = {
        "object": {},
        "array": [],
        "string": "str",
        "integer": 42,
        "number": 1.2,
        "boolean": False,
        "null": None,
    }
    jdata = {}

    for k, v in type_samples.items():
        jdata[f"nested_{k}"] = {nested_key_name: v}

    return jdata, [k.encode() for k in type_samples.keys()]


================================================
FILE: tests/fakeredis/test/test_mixins/__init__.py
================================================


================================================
FILE: tests/fakeredis/test/test_mixins/test_bitmap_commands.py
================================================
import pytest
import redis
import redis.client

from test.testtools import raw_command


def test_getbit(r: redis.Redis):
    r.setbit("foo", 3, 1)
    assert r.getbit("foo", 0) == 0
    assert r.getbit("foo", 1) == 0
    assert r.getbit("foo", 2) == 0
    assert r.getbit("foo", 3) == 1
    assert r.getbit("foo", 4) == 0
    assert r.getbit("foo", 100) == 0


def test_getbit_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.getbit("foo", 1)


@pytest.mark.min_server("7")
@pytest.mark.skip("Fails on FakeRedis")
def test_bitcount_error(r: redis.Redis):
    with pytest.raises(redis.ResponseError) as e:
        raw_command(r, b"BITCOUNT", b"", b"", b"")
    assert str(e.value) == "value is not an integer or out of range"


@pytest.mark.min_server("7")
def test_bitcount_does_not_exist(r: redis.Redis):
    res = raw_command(r, b"BITCOUNT", b"", 0, 0)
    assert res == 0


def test_multiple_bits_set(r: redis.Redis):
    r.setbit("foo", 1, 1)
    r.setbit("foo", 3, 1)
    r.setbit("foo", 5, 1)

    assert r.getbit("foo", 0) == 0
    assert r.getbit("foo", 1) == 1
    assert r.getbit("foo", 2) == 0
    assert r.getbit("foo", 3) == 1
    assert r.getbit("foo", 4) == 0
    assert r.getbit("foo", 5) == 1
    assert r.getbit("foo", 6) == 0


def test_unset_bits(r: redis.Redis):
    r.setbit("foo", 1, 1)
    r.setbit("foo", 2, 0)
    r.setbit("foo", 3, 1)
    assert r.getbit("foo", 1) == 1
    r.setbit("foo", 1, 0)
    assert r.getbit("foo", 1) == 0
    r.setbit("foo", 3, 0)
    assert r.getbit("foo", 3) == 0


def test_get_set_bits(r: redis.Redis):
    # set bit 5
    assert not r.setbit("a", 5, True)
    assert r.getbit("a", 5)
    # unset bit 4
    assert not r.setbit("a", 4, False)
    assert not r.getbit("a", 4)
    # set bit 4
    assert not r.setbit("a", 4, True)
    assert r.getbit("a", 4)
    # set bit 5 again
    assert r.setbit("a", 5, True)
    assert r.getbit("a", 5)


def test_setbits_and_getkeys(r: redis.Redis):
    # The bit operations and the get commands
    # should play nicely with each other.
    r.setbit("foo", 1, 1)
    assert r.get("foo") == b"@"
    r.setbit("foo", 2, 1)
    assert r.get("foo") == b"`"
    r.setbit("foo", 3, 1)
    assert r.get("foo") == b"p"
    r.setbit("foo", 9, 1)
    assert r.get("foo") == b"p@"
    r.setbit("foo", 54, 1)
    assert r.get("foo") == b"p@\x00\x00\x00\x00\x02"


def test_setbit_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.setbit("foo", 0, 1)


def test_setbit_expiry(r: redis.Redis):
    r.set("foo", b"0x00", ex=10)
    r.setbit("foo", 1, 1)
    assert r.ttl("foo") > 0


def test_bitcount(r: redis.Redis):
    r.delete("foo")
    assert r.bitcount("foo") == 0
    r.setbit("foo", 1, 1)
    assert r.bitcount("foo") == 1
    r.setbit("foo", 8, 1)
    assert r.bitcount("foo") == 2
    assert r.bitcount("foo", 1, 1) == 1
    r.setbit("foo", 57, 1)
    assert r.bitcount("foo") == 3
    r.set("foo", " ")
    assert r.bitcount("foo") == 1
    r.set("key", "foobar")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitcount", "key", "1", "2", "dsd")
    assert r.bitcount("key") == 26
    assert r.bitcount("key", start=0, end=0) == 4
    assert r.bitcount("key", start=1, end=1) == 6


@pytest.mark.min_server("7")
def test_bitcount_mode_redis7(r: redis.Redis):
    r.set("key", "foobar")
    assert r.bitcount("key", start=1, end=1, mode="byte") == 6
    assert r.bitcount("key", start=5, end=30, mode="bit") == 17
    with pytest.raises(redis.ResponseError):
        r.bitcount("key", start=5, end=30, mode="dscd")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitcount", "key", "1", "2", "dsd", "cd")


def test_bitcount_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.bitcount("foo")


def test_bitop(r: redis.Redis):
    r.set("key1", "foobar")
    r.set("key2", "abcdef")

    assert r.bitop("and", "dest", "key1", "key2") == 6
    assert r.get("dest") == b"`bc`ab"

    assert r.bitop("not", "dest1", "key1") == 6
    assert r.get("dest1") == b"\x99\x90\x90\x9d\x9e\x8d"

    assert r.bitop("or", "dest-or", "key1", "key2") == 6
    assert r.get("dest-or") == b"goofev"

    assert r.bitop("xor", "dest-xor", "key1", "key2") == 6
    assert r.get("dest-xor") == b"\x07\r\x0c\x06\x04\x14"


def test_bitop_errors(r: redis.Redis):
    r.set("key1", "foobar")
    r.set("key2", "abcdef")
    r.sadd("key-set", "member1")
    with pytest.raises(redis.ResponseError):
        r.bitop("not", "dest", "key1", "key2")
    with pytest.raises(redis.ResponseError):
        r.bitop("badop", "dest", "key1", "key2")
    with pytest.raises(redis.ResponseError):
        r.bitop("and", "dest", "key1", "key-set")
    with pytest.raises(redis.ResponseError):
        r.bitop("and", "dest")


def test_bitpos(r: redis.Redis):
    key = "key:bitpos"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitpos(key, 0) == 12
    assert r.bitpos(key, 0, 2, -1) == 16
    assert r.bitpos(key, 0, -2, -1) == 12
    r.set(key, b"\x00\xff\xf0")
    assert r.bitpos(key, 1, 0) == 8
    assert r.bitpos(key, 1, 1) == 8
    r.set(key, b"\x00\x00\x00")
    assert r.bitpos(key, 1) == -1
    r.set(key, b"\xff\xf0\x00")


@pytest.mark.min_server("7")
def test_bitops_mode_redis7(r: redis.Redis):
    key = "key:bitpos"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitpos(key, 0, 8, -1, "bit") == 12
    assert r.bitpos(key, 1, 8, -1, "bit") == 8
    with pytest.raises(redis.ResponseError):
        assert r.bitpos(key, 0, 8, -1, "bad_mode") == 12


def test_bitpos_wrong_arguments(r: redis.Redis):
    key = "key:bitpos:wrong:args"
    r.set(key, b"\xff\xf0\x00")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitpos", key, "7")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitpos", key, 1, "6", "5", "BYTE", "6")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitpos", key)


def test_bitfield_wrong_arguments(r: redis.Redis):
    key = "key:bitfield:wrong:args"
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "foo")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "overflow")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "overflow", "foo")


def test_bitfield_get(r: redis.Redis):
    key = "key:bitfield_get"
    r.set(key, b"\xff\xf0\x00")
    for i in range(0, 12):
        assert r.bitfield(key).get("u1", i).get("i1", i).execute() == [1, -1]
    for i in range(12, 25):
        for j in range(1, 63):
            assert r.bitfield(key).get(f"u{j}", i).get(f"i{j}", i).execute() == [0, 0]

    for i in range(0, 11):
        assert r.bitfield(key).get("u2", i).get("i2", i).execute() == [3, -1]
    assert r.bitfield(key).get("u2", 11).get("i2", 11).execute() == [2, -2]
    assert r.bitfield(key).get("u8", 0).get("u8", 8).get("u8", 16).execute() == [
        0xFF,
        0xF0,
        0,
    ]
    assert r.bitfield(key).get("i8", 0).get("i8", 8).get("i8", 16).execute() == [
        ~0,
        ~0x0F,
        0,
    ]

    assert r.bitfield(key).get("u32", 8).get("u8", 100).execute() == [0xF000_0000, 0]

    r.set(key, b"\x01\x23\x45\x67\x89\xab\xcd\xef")
    for enc in ("i16", "u16"):
        assert r.bitfield(key).get(enc, 0).execute() == [0x0123]
        assert r.bitfield(key).get(enc, 4).execute() == [0x1234]
        assert r.bitfield(key).get(enc, 8).execute() == [0x2345]

        assert r.bitfield(key).get(enc, 1).execute() == [0x0246]
        assert r.bitfield(key).get(enc, 5).execute() == [0x2468]
        assert r.bitfield(key).get(enc, 9).execute() == [0x468A]

        assert r.bitfield(key).get(enc, 2).execute() == [0x048D]
        assert r.bitfield(key).get(enc, 6).execute() == [0x48D1]

    assert r.bitfield(key).get("u16", 10).get("i16", 10).execute() == [
        0x8D15,
        0xD15 - 0x8000,
    ]
    assert r.bitfield(key).get("u32", 16).get("u48", 8).execute() == [
        0x456789AB,
        0x2345_6789_ABCD,
    ]
    assert r.bitfield(key).get("i32", 16).get("i48", 8).execute() == [
        0x456789AB,
        0x2345_6789_ABCD,
    ]
    assert r.bitfield(key).get("u63", 1).execute() == [0x123456789_ABCDEF]
    assert r.bitfield(key).get("i63", 1).execute() == [0x123456789_ABCDEF]
    assert r.bitfield(key).get("i64", 0).execute() == [0x123456789_ABCDEF]
    assert raw_command(r, "bitfield", key, "get", "i16", 0) == [0x0123]


def test_bitfield_set(r: redis.Redis):
    key = "key:bitfield_set"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key).set("u8", 0, 0x55).set("u8", 16, 0xAA).execute() == [0xFF, 0]
    assert r.get(key) == b"\x55\xf0\xaa"
    assert r.bitfield(key).set("u1", 0, 1).set("u1", 16, 2).execute() == [0, 1]
    assert r.get(key) == b"\xd5\xf0\x2a"
    assert r.bitfield(key).set("i1", 31, 1).set("i1", 30, 1).execute() == [0, 0]
    assert r.get(key) == b"\xd5\xf0\x2a\x03"
    assert r.bitfield(key).set("u36", 4, 0xBADC0FFE).execute() == [0x5_F02A_0300]
    assert r.get(key) == b"\xd0\xba\xdc\x0f\xfe"
    assert r.bitfield(key, "WRAP").set("u12", 8, 0xFFF).execute() == [0xBAD]
    assert r.get(key) == b"\xd0\xff\xfc\x0f\xfe"


def test_bitfield_set_sat(r: redis.Redis):
    key = "key:bitfield_set"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key, "SAT").set("u8", 4, 0x123).set("u8", 8, 0x55).execute() == [
        0xFF,
        0xF0,
    ]
    assert r.get(key) == b"\xff\x55\x00"
    assert r.bitfield(key, "SAT").set("u12", 0, -1).set("u1", 1, 2).execute() == [
        0xFF5,
        1,
    ]
    assert r.get(key) == b"\xff\xf5\x00"
    assert r.bitfield(key, "SAT").set("i4", 0, 8).set("i4", 4, 7).execute() == [-1, -1]
    assert r.get(key) == b"\x77\xf5\x00"
    assert r.bitfield(key, "SAT").set("i4", 4, -8).set("i4", 0, -9).execute() == [7, 7]
    assert r.get(key) == b"\x88\xf5\x00"
    assert r.bitfield(key, "SAT").set("i60", 0, -(1 << 62) + 1).execute() == [
        0x88F5000_00000000 - (1 << 60)
    ]
    assert r.get(key) == b"\x80" + b"\0" * 7
    assert r.bitfield(key, "SAT").set("u60", 0, -(1 << 63) + 1).execute() == [1 << 59]
    assert r.get(key) == b"\xff" * 7 + b"\xf0"


def test_bitfield_set_fail(r: redis.Redis):
    key = "key:bitfield_set"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key, "FAIL").set("u8", 4, 0x123).set("u8", 8, 0x55).execute() == [
        None,
        0xF0,
    ]
    assert r.get(key) == b"\xff\x55\x00"
    assert r.bitfield(key, "FAIL").set("u12", 0, -1).set("u1", 1, 2).execute() == [
        None,
        None,
    ]
    assert r.get(key) == b"\xff\x55\x00"
    assert r.bitfield(key, "FAIL").set("i4", 0, 8).set("i4", 4, 7).execute() == [
        None,
        -1,
    ]
    assert r.get(key) == b"\xf7\x55\x00"
    assert r.bitfield(key, "FAIL").set("i4", 4, -8).set("i4", 0, -9).execute() == [
        7,
        None,
    ]
    assert r.get(key) == b"\xf8\x55\x00"


def test_bitfield_incr(r: redis.Redis):
    key = "key:bitfield_incr"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key).incrby("u8", 0, 0x55).incrby("u8", 16, 0xAA).execute() == [
        0x54,
        0xAA,
    ]
    assert r.get(key) == b"\x54\xf0\xaa"
    assert r.bitfield(key).incrby("u1", 0, 1).incrby("u1", 16, 2).execute() == [1, 1]
    assert r.get(key) == b"\xd4\xf0\xaa"
    assert r.bitfield(key).incrby("i1", 31, 1).incrby("i1", 30, 1).execute() == [-1, -1]
    assert r.get(key) == b"\xd4\xf0\xaa\x03"
    assert r.bitfield(key).incrby("u36", 4, 0xBADC0FFE).execute() == [0x5_AB86_12FE]
    assert r.get(key) == b"\xd5\xab\x86\x12\xfe"
    assert r.bitfield(key, "WRAP").incrby("u12", 8, 0xFFF).execute() == [0xAB7]
    assert r.get(key) == b"\xd5\xab\x76\x12\xfe"


def test_bitfield_incr_sat(r: redis.Redis):
    key = "key:bitfield_incr_sat"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key, "SAT").incrby("u8", 4, 0x123).incrby("u8", 8, 0x55).execute() == [
        0xFF,
        0xFF,
    ]
    assert r.get(key) == b"\xff\xff\x00"
    assert r.bitfield(key, "SAT").incrby("u12", 0, -1).incrby("u1", 1, 2).execute() == [
        0xFFE,
        1,
    ]
    assert r.get(key) == b"\xff\xef\x00"
    assert r.bitfield(key, "SAT").incrby("i4", 0, 8).incrby("i4", 4, 7).execute() == [
        7,
        6,
    ]
    assert r.get(key) == b"\x76\xef\x00"
    assert r.bitfield(key, "SAT").incrby("i4", 4, -8).incrby("i4", 0, -9).execute() == [
        -2,
        -2,
    ]
    assert r.get(key) == b"\xee\xef\x00"
    assert r.bitfield(key, "SAT").incrby("i60", 0, -(1 << 62) + 1).execute() == [-(1 << 59)]
    assert r.get(key) == b"\x80" + b"\0" * 7
    assert r.bitfield(key, "SAT").set("u60", 0, -(1 << 63) + 1).execute() == [1 << 59]
    assert r.get(key) == b"\xff" * 7 + b"\xf0"


def test_bitfield_incr_fail(r: redis.Redis):
    key = "key:bitfield_incr_fail"
    r.set(key, b"\xff\xf0\x00")
    assert r.bitfield(key, "FAIL").incrby("u8", 4, 0x123).incrby("u8", 8, 0x55).execute() == [
        None,
        None,
    ]
    assert r.get(key) == b"\xff\xf0\x00"
    assert r.bitfield(key, "FAIL").incrby("u12", 0, -1).incrby("u1", 1, 2).execute() == [
        0xFFE,
        None,
    ]
    assert r.get(key) == b"\xff\xe0\x00"
    assert r.bitfield(key, "FAIL").incrby("i4", 0, 8).incrby("i4", 4, 7).execute() == [
        7,
        6,
    ]
    assert r.get(key) == b"\x76\xe0\x00"
    assert r.bitfield(key, "FAIL").incrby("i4", 4, -8).incrby("i4", 0, -9).execute() == [-2, -2]
    assert r.get(key) == b"\xee\xe0\x00"


def test_bitfield_get_wrong_arguments(r: redis.Redis):
    key = "key:bitfield_get:wrong:args"
    r.set(key, b"\xff\xf0\x00")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "get")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "get", "i16")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "get", "i16", -1)
    for encoding in ("I8", "i-42", "i5?", "u0", "i0", "i65", "u64", "i 60"):
        with pytest.raises(redis.ResponseError):
            raw_command(r, "bitfield", key, "get", encoding, 0)


def test_bitfield_set_wrong_arguments(r: redis.Redis):
    key = "key:bitfield_set:wrong:args"
    r.set(key, b"\xff\xf0\x00")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "set")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "set", "i16")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "set", "i16", -1)
    with pytest.raises(redis.ResponseError):
        raw_command(r, "bitfield", key, "set", "i16", 0, "foo")
    for encoding in ("I8", "i-42", "i5?", "u0", "i0", "i65", "u64", "i 60"):
        with pytest.raises(redis.ResponseError):
            raw_command(r, "bitfield", key, "set", encoding, 0, 0)


================================================
FILE: tests/fakeredis/test/test_mixins/test_connection.py
================================================
import pytest
import redis
import redis.client
from fakeredis import _msgs as msgs
from redis.exceptions import ResponseError

from test import testtools
from test.testtools import raw_command


def test_ping(r: redis.Redis):
    assert r.ping()
    assert testtools.raw_command(r, "ping", "test") == b"test"
    with pytest.raises(
        redis.ResponseError, match=msgs.WRONG_ARGS_MSG6.format("ping")[4:]
    ):
        raw_command(r, "ping", "arg1", "arg2")


def test_echo(r: redis.Redis):
    assert r.echo(b"hello") == b"hello"
    assert r.echo("hello") == b"hello"


def test_unknown_command(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        raw_command(r, "0 3 3")


@pytest.mark.decode_responses
class TestDecodeResponses:
    def test_decode_str(self, r):
        r.set("foo", "bar")
        assert r.get("foo") == "bar"

    def test_decode_set(self, r):
        r.sadd("foo", "member1")
        assert set(r.smembers("foo")) == {"member1"}

    def test_decode_list(self, r):
        r.rpush("foo", "a", "b")
        assert r.lrange("foo", 0, -1) == ["a", "b"]

    def test_decode_dict(self, r):
        r.hset("foo", "key", "value")
        assert r.hgetall("foo") == {"key": "value"}

    def test_decode_error(self, r):
        r.set("foo", "bar")
        with pytest.raises(ResponseError) as exc_info:
            r.hset("foo", "bar", "baz")
        assert isinstance(exc_info.value.args[0], str)


================================================
FILE: tests/fakeredis/test/test_mixins/test_generic_commands.py
================================================
from datetime import datetime, timedelta
from time import sleep, time

import pytest
import redis
from fakeredis import _msgs as msgs
from redis.exceptions import ResponseError

from test.testtools import raw_command


@pytest.mark.slow
def test_expireat_should_expire_key_by_datetime(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.expireat("foo", datetime.now() + timedelta(seconds=1))
    sleep(1.5)
    assert r.get("foo") is None
    assert r.expireat("bar", datetime.now()) is False


@pytest.mark.slow
def test_expireat_should_expire_key_by_timestamp(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.expireat("foo", int(time() + 1))
    sleep(1.5)
    assert r.get("foo") is None
    assert r.expire("bar", 1) is False


def test_expireat_should_return_true_for_existing_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.expireat("foo", int(time() + 1)) is True


def test_expireat_should_return_false_for_missing_key(r: redis.Redis):
    assert r.expireat("missing", int(time() + 1)) is False


def test_del_operator(r: redis.Redis):
    r["foo"] = "bar"
    del r["foo"]
    assert r.get("foo") is None


def test_expire_should_not_handle_floating_point_values(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError, match="value is not an integer or out of range"):
        r.expire("something_new", 1.2)
        r.pexpire("something_new", 1000.2)
        r.expire("some_unused_key", 1.2)
        r.pexpire("some_unused_key", 1000.2)


def test_ttl_should_return_minus_one_for_non_expiring_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.ttl("foo") == -1


def test_sort_range_offset_range(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "4")
    r.rpush("foo", "3")

    assert r.sort("foo", start=0, num=2) == [b"1", b"2"]


def test_sort_range_offset_range_and_desc(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "4")
    r.rpush("foo", "3")

    assert r.sort("foo", start=0, num=1, desc=True) == [b"4"]


def test_sort_range_offset_norange(r: redis.Redis):
    with pytest.raises(redis.RedisError):
        r.sort("foo", start=1)


def test_sort_range_with_large_range(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "4")
    r.rpush("foo", "3")
    # num=20 even though len(foo) is 4.
    assert r.sort("foo", start=1, num=20) == [b"2", b"3", b"4"]


def test_sort_descending(r: redis.Redis):
    r.rpush("foo", "1")
    r.rpush("foo", "2")
    r.rpush("foo", "3")
    assert r.sort("foo", desc=True) == [b"3", b"2", b"1"]


def test_sort_alpha(r: redis.Redis):
    r.rpush("foo", "2a")
    r.rpush("foo", "1b")
    r.rpush("foo", "2b")
    r.rpush("foo", "1a")

    assert r.sort("foo", alpha=True) == [b"1a", b"1b", b"2a", b"2b"]


def test_sort_foo(r: redis.Redis):
    r.rpush("foo", "2a")
    r.rpush("foo", "1b")
    r.rpush("foo", "2b")
    r.rpush("foo", "1a")
    with pytest.raises(redis.ResponseError):
        r.sort("foo", alpha=False)


def test_sort_empty(r: redis.Redis):
    assert r.sort("foo") == []


def test_sort_wrong_type(r: redis.Redis):
    r.set("string", "3")
    with pytest.raises(redis.ResponseError):
        r.sort("string")


@pytest.mark.unsupported_server_types("dragonfly")
def test_sort_with_store_option(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "4")
    r.rpush("foo", "3")

    assert r.sort("foo", store="bar") == 4
    assert r.lrange("bar", 0, -1) == [b"1", b"2", b"3", b"4"]


@pytest.mark.unsupported_server_types("dragonfly")
def test_sort_with_by_and_get_option(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "4")
    r.rpush("foo", "3")

    r["weight_1"] = "4"
    r["weight_2"] = "3"
    r["weight_3"] = "2"
    r["weight_4"] = "1"

    r["data_1"] = "one"
    r["data_2"] = "two"
    r["data_3"] = "three"
    r["data_4"] = "four"

    assert r.sort("foo", by="weight_*", get="data_*") == [
        b"four",
        b"three",
        b"two",
        b"one",
    ]
    assert r.sort("foo", by="weight_*", get="#") == [b"4", b"3", b"2", b"1"]
    assert r.sort("foo", by="weight_*", get=("data_*", "#")) == [
        b"four",
        b"4",
        b"three",
        b"3",
        b"two",
        b"2",
        b"one",
        b"1",
    ]
    assert r.sort("foo", by="weight_*", get="data_1") == [None, None, None, None]
    # Test sort with different parameters order
    assert raw_command(r, "sort", "foo", "get", "data_*", "by", "weight_*", "get", "#") == [
        b"four",
        b"4",
        b"three",
        b"3",
        b"two",
        b"2",
        b"one",
        b"1",
    ]


@pytest.mark.unsupported_server_types("dragonfly")
def test_sort_with_hash(r: redis.Redis):
    r.rpush("foo", "middle")
    r.rpush("foo", "eldest")
    r.rpush("foo", "youngest")
    r.hset("record_youngest", "age", 1)
    r.hset("record_youngest", "name", "baby")

    r.hset("record_middle", "age", 10)
    r.hset("record_middle", "name", "teen")

    r.hset("record_eldest", "age", 20)
    r.hset("record_eldest", "name", "adult")

    assert r.sort("foo", by="record_*->age") == [b"youngest", b"middle", b"eldest"]
    assert r.sort("foo", by="record_*->age", get="record_*->name") == [
        b"baby",
        b"teen",
        b"adult",
    ]


def test_sort_with_set(r: redis.Redis):
    r.sadd("foo", "3")
    r.sadd("foo", "1")
    r.sadd("foo", "2")
    assert r.sort("foo") == [b"1", b"2", b"3"]


def test_ttl_should_return_minus_two_for_non_existent_key(r: redis.Redis):
    assert r.get("foo") is None
    assert r.ttl("foo") == -2


def test_type(r: redis.Redis):
    r.set("string_key", "value")
    r.lpush("list_key", "value")
    r.sadd("set_key", "value")
    r.zadd("zset_key", {"value": 1})
    r.hset("hset_key", "key", "value")

    assert r.type("string_key") == b"string"  # noqa: E721
    assert r.type("list_key") == b"list"  # noqa: E721
    assert r.type("set_key") == b"set"  # noqa: E721
    assert r.type("zset_key") == b"zset"  # noqa: E721
    assert r.type("hset_key") == b"hash"  # noqa: E721
    assert r.type("none_key") == b"none"  # noqa: E721


def test_unlink(r: redis.Redis):
    r.set("foo", "bar")
    r.unlink("foo")
    assert r.get("foo") is None


def test_dump_missing(r: redis.Redis):
    assert r.dump("foo") is None


def test_dump_restore(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    r.restore("baz", 0, dump)
    assert r.get("baz") == b"bar"
    assert r.ttl("baz") == -1


def test_dump_restore_ttl(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    r.restore("baz", 2000, dump)
    assert r.get("baz") == b"bar"
    assert 1000 <= r.pttl("baz") <= 2000


def test_dump_restore_replace(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    r.set("foo", "baz")
    r.restore("foo", 0, dump, replace=True)
    assert r.get("foo") == b"bar"


def test_restore_exists(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    with pytest.raises(redis.exceptions.ResponseError):
        r.restore("foo", 0, dump)


def test_restore_invalid_dump(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    with pytest.raises(redis.exceptions.ResponseError):
        r.restore("baz", 0, dump[:-1])


def test_restore_invalid_ttl(r: redis.Redis):
    r.set("foo", "bar")
    dump = r.dump("foo")
    with pytest.raises(redis.exceptions.ResponseError):
        r.restore("baz", -1, dump)


def test_set_then_get(r: redis.Redis):
    assert r.set("foo", "bar") is True
    assert r.get("foo") == b"bar"


def test_exists(r: redis.Redis):
    assert "foo" not in r
    r.set("foo", "bar")
    assert "foo" in r
    with pytest.raises(redis.ResponseError, match=msgs.WRONG_ARGS_MSG6.format("exists")[4:]):
        raw_command(r, "exists")


@pytest.mark.slow
def test_expire_should_expire_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.expire("foo", 1)
    sleep(1.5)
    assert r.get("foo") is None
    assert r.expire("bar", 1) is False


def test_expire_should_throw_error(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    with pytest.raises(ResponseError):
        r.expire("foo", 1, nx=True, xx=True)
    with pytest.raises(ResponseError):
        r.expire("foo", 1, gt=True, lt=True)


@pytest.mark.max_server("7")
def test_expire_extra_params_return_error(r: redis.Redis):
    with pytest.raises(redis.exceptions.ResponseError):
        r.expire("foo", 1, nx=True)


def test_expire_should_return_true_for_existing_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.expire("foo", 1) is True


def test_expire_should_return_false_for_missing_key(r: redis.Redis):
    assert r.expire("missing", 1) is False


@pytest.mark.slow
def test_expire_should_expire_key_using_timedelta(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.expire("foo", timedelta(seconds=1))
    sleep(1.5)
    assert r.get("foo") is None
    assert r.expire("bar", 1) is False


@pytest.mark.slow
def test_expire_should_expire_immediately_with_millisecond_timedelta(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.expire("foo", timedelta(milliseconds=750))
    assert r.get("foo") is None
    assert r.expire("bar", 1) is False


def test_watch_expire(r: redis.Redis):
    """EXPIRE should mark a key as changed for WATCH."""
    r.set("foo", "bar")
    with r.pipeline() as p:
        p.watch("foo")
        r.expire("foo", 10000)
        p.multi()
        p.get("foo")
        with pytest.raises(redis.exceptions.WatchError):
            p.execute()


@pytest.mark.slow
def test_pexpire_should_expire_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.pexpire("foo", 150)
    sleep(0.2)
    assert r.get("foo") is None
    assert r.pexpire("bar", 1) == 0


def test_pexpire_should_return_truthy_for_existing_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.pexpire("foo", 1)


def test_pexpire_should_return_falsey_for_missing_key(r: redis.Redis):
    assert not r.pexpire("missing", 1)


@pytest.mark.slow
def test_pexpire_should_expire_key_using_timedelta(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.pexpire("foo", timedelta(milliseconds=750))
    sleep(0.5)
    assert r.get("foo") == b"bar"
    sleep(0.5)
    assert r.get("foo") is None
    assert r.pexpire("bar", 1) == 0


@pytest.mark.slow
def test_pexpireat_should_expire_key_by_datetime(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.pexpireat("foo", datetime.now() + timedelta(milliseconds=150))
    sleep(0.2)
    assert r.get("foo") is None
    assert r.pexpireat("bar", datetime.now()) == 0


@pytest.mark.slow
def test_pexpireat_should_expire_key_by_timestamp(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.pexpireat("foo", int(time() * 1000 + 150))
    sleep(0.2)
    assert r.get("foo") is None
    assert r.expire("bar", 1) is False


def test_pexpireat_should_return_true_for_existing_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.pexpireat("foo", int(time() * 1000 + 150))


def test_pexpireat_should_return_false_for_missing_key(r: redis.Redis):
    assert not r.pexpireat("missing", int(time() * 1000 + 150))


def test_pttl_should_return_minus_one_for_non_expiring_key(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.pttl("foo") == -1


def test_pttl_should_return_minus_two_for_non_existent_key(r: redis.Redis):
    assert r.get("foo") is None
    assert r.pttl("foo") == -2


def test_randomkey_returns_none_on_empty_db(r: redis.Redis):
    assert r.randomkey() is None


def test_randomkey_returns_existing_key(r: redis.Redis):
    r.set("foo", 1)
    r.set("bar", 2)
    r.set("baz", 3)
    assert r.randomkey().decode() in ("foo", "bar", "baz")


def test_persist(r: redis.Redis):
    r.set("foo", "bar", ex=20)
    assert r.persist("foo") == 1
    assert r.ttl("foo") == -1
    assert r.persist("foo") == 0


def test_watch_persist(r: redis.Redis):
    """PERSIST should mark a variable as changed."""
    r.set("foo", "bar", ex=10000)
    with r.pipeline() as p:
        p.watch("foo")
        r.persist("foo")
        p.multi()
        p.get("foo")
        with pytest.raises(redis.exceptions.WatchError):
            p.execute()


def test_set_existing_key_persists(r: redis.Redis):
    r.set("foo", "bar", ex=20)
    r.set("foo", "foo")
    assert r.ttl("foo") == -1


def test_set_non_str_keys(r: redis.Redis):
    assert r.set(2, "bar") is True
    assert r.get(2) == b"bar"
    assert r.get("2") == b"bar"


def test_getset_not_exist(r: redis.Redis):
    val = r.getset("foo", "bar")
    assert val is None
    assert r.get("foo") == b"bar"


def test_get_float_type(r: redis.Redis):  # Test for issue #58
    r.set("key", 123)
    assert r.get("key") == b"123"
    r.incr("key")
    assert r.get("key") == b"124"


def test_set_float_value(r: redis.Redis):
    x = 1.23456789123456789
    r.set("foo", x)
    assert float(r.get("foo")) == x


@pytest.mark.min_server("7")
def test_expire_should_not_expire__when_no_expire_is_set(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.expire("foo", 1, xx=True) == 0


@pytest.mark.min_server("7")
def test_expire_should_not_expire__when_expire_is_set(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.expire("foo", 1, nx=True) == 1
    assert r.expire("foo", 2, nx=True) == 0


@pytest.mark.min_server("7")
def test_expire_should_expire__when_expire_is_greater(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.expire("foo", 100) == 1
    assert r.get("foo") == b"bar"
    assert r.expire("foo", 200, gt=True) == 1


@pytest.mark.min_server("7")
def test_expire_should_expire__when_expire_is_lessthan(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    assert r.expire("foo", 20) == 1
    assert r.expire("foo", 10, lt=True) == 1


def test_rename(r: redis.Redis):
    r.set("foo", "unique value")
    assert r.rename("foo", "bar")
    assert r.get("foo") is None
    assert r.get("bar") == b"unique value"


def test_rename_nonexistent_key(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.rename("foo", "bar")


def test_renamenx_doesnt_exist(r: redis.Redis):
    r.set("foo", "unique value")
    assert r.renamenx("foo", "bar")
    assert r.get("foo") is None
    assert r.get("bar") == b"unique value"


def test_rename_does_exist(r: redis.Redis):
    r.set("foo", "unique value")
    r.set("bar", "unique value2")
    assert not r.renamenx("foo", "bar")
    assert r.get("foo") == b"unique value"
    assert r.get("bar") == b"unique value2"


def test_rename_expiry(r: redis.Redis):
    r.set("foo", "value1", ex=10)
    r.set("bar", "value2")
    r.rename("foo", "bar")
    assert r.ttl("bar") > 0


def test_keys(r: redis.Redis):
    r.set("", "empty")
    r.set("abc\n", "")
    r.set("abc\\", "")
    r.set("abcde", "")
    r.set(b"\xfe\xcd", "")
    assert sorted(r.keys()) == [b"", b"abc\n", b"abc\\", b"abcde", b"\xfe\xcd"]
    assert r.keys("??") == [b"\xfe\xcd"]
    # empty pattern not the same as no pattern
    assert r.keys("") == [b""]
    # ? must match \n
    assert sorted(r.keys("abc?")) == [b"abc\n", b"abc\\"]
    # must be anchored at both ends
    assert r.keys("abc") == []
    assert r.keys("bcd") == []
    # wildcard test
    assert r.keys("a*de") == [b"abcde"]
    # positive groups
    assert sorted(r.keys("abc[d\n]*")) == [b"abc\n", b"abcde"]
    assert r.keys("abc[c-e]?") == [b"abcde"]

    # Not working in Dragonfly with reverse range
    # assert r.keys("abc[e-c]?") == [b"abcde"]
    assert r.keys("abc[e-e]?") == []
    assert r.keys("abcd[ef") == [b"abcde"]
    assert r.keys("abcd[]") == []
    # negative groups
    assert r.keys("abc[^d\\\\]*") == [b"abc\n"]
    assert r.keys("abc[^]e") == [b"abcde"]
    # escaping
    assert r.keys(r"abc\?e") == []
    assert r.keys(r"abc\de") == [b"abcde"]
    assert r.keys(r"abc[\d]e") == [b"abcde"]
    # some escaping cases that redis handles strangely
    assert r.keys("abc\\") == [b"abc\\"]
    # assert r.keys(r"abc[\c-e]e") == [] dragonfly matches abcde
    # assert r.keys(r"abc[c-\e]e") == [] dragonfly matches abcde


def test_contains(r: redis.Redis):
    assert not r.exists("foo")
    r.set("foo", "bar")
    assert r.exists("foo")


def test_delete(r: redis.Redis):
    r["foo"] = "bar"
    assert r.delete("foo") == 1
    assert r.get("foo") is None


@pytest.mark.slow
def test_delete_expire(r: redis.Redis):
    r.set("foo", "bar", ex=1)
    r.delete("foo")
    r.set("foo", "bar")
    sleep(2)
    assert r.get("foo") == b"bar"


def test_delete_multiple(r: redis.Redis):
    r["one"] = "one"
    r["two"] = "two"
    r["three"] = "three"
    # Since redis>=2.7.6 returns number of deleted items.
    assert r.delete("one", "two") == 2
    assert r.get("one") is None
    assert r.get("two") is None
    assert r.get("three") == b"three"
    assert r.delete("one", "two") == 0
    # If any keys are deleted, True is returned.
    assert r.delete("two", "three", "three") == 1
    assert r.get("three") is None


def test_delete_nonexistent_key(r: redis.Redis):
    assert r.delete("foo") == 0


def test_basic_sort(r: redis.Redis):
    r.rpush("foo", "2")
    r.rpush("foo", "1")
    r.rpush("foo", "3")

    assert r.sort("foo") == [b"1", b"2", b"3"]
    assert raw_command(r, "sort", "foo", "asc") == [b"1", b"2", b"3"]


def test_key_patterns(r: redis.Redis):
    r.mset({"one": 1, "two": 2, "three": 3, "four": 4})
    assert sorted(r.keys("*o*")) == [b"four", b"one", b"two"]
    assert r.keys("t??") == [b"two"]
    assert sorted(r.keys("*")) == [b"four", b"one", b"three", b"two"]
    assert sorted(r.keys()) == [b"four", b"one", b"three", b"two"]


# seems like a rather peculiar behavior of Redis, maybe a bug? Disabling for Dragonfly for now.
@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")
def test_watch_when_setbit_does_not_change_value(r: redis.Redis):
    r.set("foo", b"0")

    with r.pipeline() as p:
        p.watch("foo")
        assert r.setbit("foo", 0, 0) == 0
        assert p.multi() is None
        assert p.execute() == []


def test_from_hypothesis_redis7(r: redis.Redis):
    r.set("foo", b"0")
    assert r.setbit("foo", 0, 0) == 0
    assert r.append("foo", b"") == 1

    r.set(b"", b"")
    assert r.setbit(b"", 0, 0) == 0
    assert r.get(b"") == b"\x00"


================================================
FILE: tests/fakeredis/test/test_mixins/test_geo_commands.py
================================================
from typing import Dict, Any

import pytest
import redis

from test import testtools


def test_geoadd_ch(r: redis.Redis):
    values = (2.1909389952632, 41.433791470673, "place1")
    assert r.geoadd("a", values) == 1
    values = (
        2.1909389952632,
        31.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    assert r.geoadd("a", values, ch=True) == 2
    assert r.zrange("a", 0, -1) == [b"place1", b"place2"]


def test_geoadd(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    assert r.geoadd("barcelona", values) == 2
    assert r.zcard("barcelona") == 2

    values = (2.1909389952632, 41.433791470673, "place1")
    assert r.geoadd("a", values) == 1

    with pytest.raises(redis.DataError):
        r.geoadd("barcelona", (1, 2))
    with pytest.raises(redis.DataError):
        r.geoadd("t", values, ch=True, nx=True, xx=True)
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "geoadd", "barcelona", "1", "2")
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(
            r,
            "geoadd",
            "barcelona",
            "nx",
            "xx",
            *values,
        )


def test_geoadd_xx(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    assert r.geoadd("a", values) == 2
    values = (
        2.1909389952632,
        41.433791470673,
        b"place1",
        2.1873744593677,
        41.406342043777,
        b"place2",
        2.1804738294738,
        41.405647879212,
        b"place3",
    )
    assert r.geoadd("a", values, nx=True) == 1
    assert r.zrange("a", 0, -1) == [b"place3", b"place2", b"place1"]


def test_geohash(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    r.geoadd("barcelona", values)
    assert r.geohash("barcelona", "place1", "place2", "place3") == [
        "sp3e9yg3kd0",
        "sp3e9cbc3t0",
        None,
    ]


def test_geopos(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    r.geoadd("barcelona", values)
    # small errors may be introduced.
    assert r.geopos("barcelona", "place1", "place4", "place2") == [
        pytest.approx((2.1909389952632, 41.433791470673), 0.00001),
        None,
        pytest.approx((2.1873744593677, 41.406342043777), 0.00001),
    ]


def test_geodist(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    assert r.geoadd("barcelona", values) == 2
    assert r.geodist("barcelona", "place1", "place2") == pytest.approx(
        3067.4157, 0.0001
    )


def test_geodist_units(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    r.geoadd("barcelona", values)
    assert r.geodist("barcelona", "place1", "place2", "km") == pytest.approx(
        3.0674, 0.0001
    )
    assert r.geodist("barcelona", "place1", "place2", "mi") == pytest.approx(
        1.906, 0.0001
    )
    assert r.geodist("barcelona", "place1", "place2", "ft") == pytest.approx(
        10063.6998, 0.0001
    )
    with pytest.raises(redis.RedisError):
        assert r.geodist("x", "y", "z", "inches")


def test_geodist_missing_one_member(r: redis.Redis):
    values = (2.1909389952632, 41.433791470673, "place1")
    r.geoadd("barcelona", values)
    assert r.geodist("barcelona", "place1", "missing_member", "km") is None


@pytest.mark.unsupported_server_types("dragonfly")
@pytest.mark.parametrize(
    "long,lat,radius,extra,expected",
    [
        (2.191, 41.433, 1000, {}, [b"place1"]),
        (2.187, 41.406, 1000, {}, [b"place2"]),
        (1, 2, 1000, {}, []),
        (2.191, 41.433, 1, {"unit": "km"}, [b"place1"]),
        (2.191, 41.433, 3000, {"count": 1}, [b"place1"]),
    ],
)
def test_georadius(
    r: redis.Redis,
    long: float,
    lat: float,
    radius: float,
    extra: Dict[str, Any],
    expected,
):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )
    r.geoadd("barcelona", values)
    assert r.georadius("barcelona", long, lat, radius, **extra) == expected


@pytest.mark.unsupported_server_types("dragonfly")
@pytest.mark.parametrize(
    "member,radius,extra,expected",
    [
        ("place1", 1000, {}, [b"place1"]),
        ("place2", 1000, {}, [b"place2"]),
        ("place1", 1, {"unit": "km"}, [b"place1"]),
        ("place1", 3000, {"count": 1}, [b"place1"]),
    ],
)
def test_georadiusbymember(
    r: redis.Redis, member: str, radius: float, extra: Dict[str, Any], expected
):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        b"place2",
    )
    r.geoadd("barcelona", values)
    assert r.georadiusbymember("barcelona", member, radius, **extra) == expected
    assert r.georadiusbymember(
        "barcelona", member, radius, **extra, store_dist="extract"
    ) == len(expected)
    assert r.zcard("extract") == len(expected)


@pytest.mark.unsupported_server_types("dragonfly")
def test_georadius_with(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )

    r.geoadd("barcelona", values)
    # test a bunch of combinations to test the parse response function.
    res = r.georadius(
        "barcelona",
        2.191,
        41.433,
        1,
        unit="km",
        withdist=True,
        withcoord=True,
    )
    assert res == [
        pytest.approx(
            [b"place1", 0.0881, pytest.approx((2.1909, 41.4337), 0.0001)], 0.001
        )
    ]

    res = r.georadius(
        "barcelona", 2.191, 41.433, 1, unit="km", withdist=True, withcoord=True
    )
    assert res == [
        pytest.approx(
            [b"place1", 0.0881, pytest.approx((2.1909, 41.4337), 0.0001)], 0.001
        )
    ]

    res = r.georadius("barcelona", 2.191, 41.433, 1, unit="km", withcoord=True)
    assert res == [[b"place1", pytest.approx((2.1909, 41.4337), 0.0001)]]

    # test no values.
    assert (
        r.georadius(
            "barcelona",
            2,
            1,
            1,
            unit="km",
            withdist=True,
            withcoord=True,
        )
        == []
    )


@pytest.mark.unsupported_server_types("dragonfly")
def test_georadius_count(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        "place1",
        2.1873744593677,
        41.406342043777,
        "place2",
    )

    r.geoadd("barcelona", values)

    assert (
        r.georadius("barcelona", 2.191, 41.433, 3000, count=1, store="barcelona") == 1
    )
    assert r.georadius("barcelona", 2.191, 41.433, 3000, store_dist="extract") == 1
    assert r.zcard("extract") == 1
    res = r.georadius("barcelona", 2.191, 41.433, 3000, count=1, any=True)
    assert (res == [b"place2"]) or res == [b"place1"]

    values = (
        13.361389,
        38.115556,
        "Palermo",
        15.087269,
        37.502669,
        "Catania",
    )

    r.geoadd("Sicily", values)
    assert (
        testtools.raw_command(
            r,
            "GEORADIUS",
            "Sicily",
            "15",
            "37",
            "200",
            "km",
            "STOREDIST",
            "neardist",
            "STORE",
            "near",
        )
        == 2
    )
    assert r.zcard("near") == 2
    assert r.zcard("neardist") == 0


def test_georadius_errors(r: redis.Redis):
    values = (
        13.361389,
        38.115556,
        "Palermo",
        15.087269,
        37.502669,
        "Catania",
    )

    r.geoadd("Sicily", values)

    with pytest.raises(redis.DataError):  # Unsupported unit
        r.georadius("barcelona", 2.191, 41.433, 3000, unit="dsf")
    with pytest.raises(redis.ResponseError):  # Unsupported unit
        testtools.raw_command(
            r,
            "GEORADIUS",
            "Sicily",
            "15",
            "37",
            "200",
            "ddds",
            "STOREDIST",
            "neardist",
            "STORE",
            "near",
        )

    bad_values = (
        13.361389,
        38.115556,
        "Palermo",
        15.087269,
        "Catania",
    )
    with pytest.raises(redis.DataError):
        r.geoadd("newgroup", bad_values)
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "geoadd", "newgroup", *bad_values)


@pytest.mark.unsupported_server_types("dragonfly")
def test_geosearch(r: redis.Redis):
    values = (
        2.1909389952632,
        41.433791470673,
        b"place1",
        2.1873744593677,
        41.406342043777,
        b"place2",
        2.583333,
        41.316667,
        b"place3",
    )
    r.geoadd("barcelona", values)
    assert r.geosearch("barcelona", longitude=2.191, latitude=41.433, radius=1000) == [
        b"place1"
    ]
    assert r.geosearch("barcelona", longitude=2.187, latitude=41.406, radius=1000) == [
        b"place2"
    ]
    # assert r.geosearch("barcelona", longitude=2.191, latitude=41.433, height=1000, width=1000) == [b"place1"]
    assert set(r.geosearch("barcelona", member="place3", radius=100, unit="km")) == {
        b"place2",
        b"place1",
        b"place3",
    }
    # test count
    assert r.geosearch(
        "barcelona", member="place3", radius=100, unit="km", count=2
    ) == [
        b"place3",
        b"place2",
    ]
    assert r.geosearch(
        "barcelona", member="place3", radius=100, unit="km", count=1, any=True
    )[0] in [
        b"place1",
        b"place3",
        b"place2",
    ]


================================================
FILE: tests/fakeredis/test/test_mixins/test_hash_commands.py
================================================
import pytest
import redis
import redis.client

from test import testtools


def test_hstrlen_missing(r: redis.Redis):
    assert r.hstrlen("foo", "doesnotexist") == 0

    r.hset("foo", "key", "value")
    assert r.hstrlen("foo", "doesnotexist") == 0


def test_hstrlen(r: redis.Redis):
    r.hset("foo", "key", "value")
    assert r.hstrlen("foo", "key") == 5


def test_hset_then_hget(r: redis.Redis):
    assert r.hset("foo", "key", "value") == 1
    assert r.hget("foo", "key") == b"value"


def test_hset_update(r: redis.Redis):
    assert r.hset("foo", "key", "value") == 1
    assert r.hset("foo", "key", "value") == 0


def test_hset_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hset("foo", "key", "value")


def test_hgetall(r: redis.Redis):
    assert r.hset("foo", "k1", "v1") == 1
    assert r.hset("foo", "k2", "v2") == 1
    assert r.hset("foo", "k3", "v3") == 1
    assert r.hgetall("foo") == {b"k1": b"v1", b"k2": b"v2", b"k3": b"v3"}


def test_hgetall_empty_key(r: redis.Redis):
    assert r.hgetall("foo") == {}


def test_hgetall_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hgetall("foo")


def test_hexists(r: redis.Redis):
    r.hset("foo", "bar", "v1")
    assert r.hexists("foo", "bar") == 1
    assert r.hexists("foo", "baz") == 0
    assert r.hexists("bar", "bar") == 0


def test_hexists_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hexists("foo", "key")


def test_hkeys(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    r.hset("foo", "k2", "v2")
    assert set(r.hkeys("foo")) == {b"k1", b"k2"}
    assert set(r.hkeys("bar")) == set()


def test_hkeys_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hkeys("foo")


def test_hlen(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    r.hset("foo", "k2", "v2")
    assert r.hlen("foo") == 2


def test_hlen_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hlen("foo")


def test_hvals(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    r.hset("foo", "k2", "v2")
    assert set(r.hvals("foo")) == {b"v1", b"v2"}
    assert set(r.hvals("bar")) == set()


def test_hvals_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hvals("foo")


def test_hmget(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    r.hset("foo", "k2", "v2")
    r.hset("foo", "k3", "v3")
    # Normal case.
    assert r.hmget("foo", ["k1", "k3"]) == [b"v1", b"v3"]
    assert r.hmget("foo", "k1", "k3") == [b"v1", b"v3"]
    # Key does not exist.
    assert r.hmget("bar", ["k1", "k3"]) == [None, None]
    assert r.hmget("bar", "k1", "k3") == [None, None]
    # Some keys in the hash do not exist.
    assert r.hmget("foo", ["k1", "k500"]) == [b"v1", None]
    assert r.hmget("foo", "k1", "k500") == [b"v1", None]


def test_hmget_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hmget("foo", "key1", "key2")


def test_hdel(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    r.hset("foo", "k2", "v2")
    r.hset("foo", "k3", "v3")
    assert r.hget("foo", "k1") == b"v1"
    assert r.hdel("foo", "k1") == 1
    assert r.hget("foo", "k1") is None
    assert r.hdel("foo", "k1") == 0
    # Since redis>=2.7.6 returns number of deleted items.
    assert r.hdel("foo", "k2", "k3") == 2
    assert r.hget("foo", "k2") is None
    assert r.hget("foo", "k3") is None
    assert r.hdel("foo", "k2", "k3") == 0


def test_hdel_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hdel("foo", "key")


def test_hincrby(r: redis.Redis):
    r.hset("foo", "counter", 0)
    assert r.hincrby("foo", "counter") == 1
    assert r.hincrby("foo", "counter") == 2
    assert r.hincrby("foo", "counter") == 3


def test_hincrby_with_no_starting_value(r: redis.Redis):
    assert r.hincrby("foo", "counter") == 1
    assert r.hincrby("foo", "counter") == 2
    assert r.hincrby("foo", "counter") == 3


def test_hincrby_with_range_param(r: redis.Redis):
    assert r.hincrby("foo", "counter", 2) == 2
    assert r.hincrby("foo", "counter", 2) == 4
    assert r.hincrby("foo", "counter", 2) == 6


def test_hincrby_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hincrby("foo", "key", 2)


def test_hincrbyfloat(r: redis.Redis):
    r.hset("foo", "counter", 0.0)
    assert r.hincrbyfloat("foo", "counter") == 1.0
    assert r.hincrbyfloat("foo", "counter") == 2.0
    assert r.hincrbyfloat("foo", "counter") == 3.0


def test_hincrbyfloat_with_no_starting_value(r: redis.Redis):
    assert r.hincrbyfloat("foo", "counter") == 1.0
    assert r.hincrbyfloat("foo", "counter") == 2.0
    assert r.hincrbyfloat("foo", "counter") == 3.0


def test_hincrbyfloat_with_range_param(r: redis.Redis):
    assert r.hincrbyfloat("foo", "counter", 0.1) == pytest.approx(0.1)
    assert r.hincrbyfloat("foo", "counter", 0.1) == pytest.approx(0.2)
    assert r.hincrbyfloat("foo", "counter", 0.1) == pytest.approx(0.3)


def test_hincrbyfloat_on_non_float_value_raises_error(r: redis.Redis):
    r.hset("foo", "counter", "cat")
    with pytest.raises(redis.ResponseError):
        r.hincrbyfloat("foo", "counter")


def test_hincrbyfloat_with_non_float_amount_raises_error(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.hincrbyfloat("foo", "counter", "cat")


def test_hincrbyfloat_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hincrbyfloat("foo", "key", 0.1)


def test_hincrbyfloat_precision(r: redis.Redis):
    x = 1.23456789123456789
    assert r.hincrbyfloat("foo", "bar", x) == x
    assert float(r.hget("foo", "bar")) == x


def test_hsetnx(r: redis.Redis):
    assert r.hsetnx("foo", "newkey", "v1") == 1
    assert r.hsetnx("foo", "newkey", "v1") == 0
    assert r.hget("foo", "newkey") == b"v1"


def test_hmset_empty_raises_error(r: redis.Redis):
    with pytest.raises(redis.DataError):
        r.hmset("foo", {})


@testtools.run_test_if_redispy_ver("lte", "4.6")
def test_hmset_redispy4(r: redis.Redis):
    r.hset("foo", "k1", "v1")
    assert r.hmset("foo", {"k2": "v2", "k3": "v3"}) is True


def test_hmset_wrong_type(r: redis.Redis):
    r.zadd("foo", {"bar": 1})
    with pytest.raises(redis.ResponseError):
        r.hmset("foo", {"key": "value"})


def test_empty_hash(r: redis.Redis):
    r.hset("foo", "bar", "baz")
    r.hdel("foo", "bar")
    assert not r.exists("foo")


def test_hset_removing_last_field_delete_key(r: redis.Redis):
    r.hset(b"3L", b"f1", b"v1")
    r.hdel(b"3L", b"f1")
    assert r.keys("*") == []


def test_hscan(r: redis.Redis):
    # Set up the data
    name = "hscan-test"
    for ix in range(20):
        k = "key:%s" % ix
        v = "result:%s" % ix
        r.hset(name, k, v)
    expected = r.hgetall(name)
    assert len(expected) == 20  # Ensure we know what we're testing

    # Test that we page through the results and get everything out
    results = {}
    cursor = "0"
    while cursor != 0:
        cursor, data = r.hscan(name, cursor, count=6)
        results.update(data)
    assert expected == results

    # Test the iterator version
    results = {}
    for key, val in r.hscan_iter(name, count=6):
        results[key] = val
    assert expected == results

    # Now test that the MATCH functionality works
    results = {}
    cursor = "0"
    while cursor != 0:
        cursor, data = r.hscan(name, cursor, match="*7", count=100)
        results.update(data)
    assert b"key:7" in results
    assert b"key:17" in results
    assert len(results) == 2

    # Test the match on iterator
    results = {}
    for key, val in r.hscan_iter(name, match="*7"):
        results[key] = val
    assert b"key:7" in results
    assert b"key:17" in results
    assert len(results) == 2


def test_hrandfield(r: redis.Redis):
    assert r.hrandfield("key") is None
    hash = {b"a": 1, b"b": 2, b"c": 3, b"d": 4, b"e": 5}
    r.hset("key", mapping=hash)
    assert r.hrandfield("key") is not None
    assert len(r.hrandfield("key", 0)) == 0
    res = r.hrandfield("key", 2)
    assert len(res) == 2
    assert res[0] in set(hash.keys())
    assert res[1] in set(hash.keys())
    # with values
    res = r.hrandfield("key", 2, True)
    assert len(res) == 4
    assert res[0] in set(hash.keys())
    assert res[1] in {str(x).encode() for x in hash.values()}
    assert res[2] in set(hash.keys())
    assert res[3] in {str(x).encode() for x in hash.values()}
    # without duplications
    assert len(r.hrandfield("key", 10)) == 5
    # with duplications
    assert len(r.hrandfield("key", -10)) == 10

    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "HRANDFIELD", "key", 3, "WITHVALUES", 3)


================================================
FILE: tests/fakeredis/test/test_mixins/test_list_commands.py
================================================
import threading
from time import sleep

import pytest
import redis
import redis.client

from .. import testtools


def _push_thread(r: redis.Redis) -> threading.Thread:
    def run():
        sleep(0.5)
        r.rpush("foo", "value1")
        sleep(0.5)
        # Will wake the condition variable
        r.set("bar", "go back to sleep some more")
        r.rpush("foo", "value2")

    thread = threading.Thread(target=run)
    thread.start()
    return thread


def test_lpush_then_lrange_all(r: redis.Redis):
    assert r.lpush("foo", "bar") == 1
    assert r.lpush("foo", "baz") == 2
    assert r.lpush("foo", "bam", "buzz") == 4
    assert r.lrange("foo", 0, -1) == [b"buzz", b"bam", b"baz", b"bar"]


def test_lpush_then_lrange_portion(r: redis.Redis):
    r.lpush("foo", "one")
    r.lpush("foo", "two")
    r.lpush("foo", "three")
    r.lpush("foo", "four")
    assert r.lrange("foo", 0, 2) == [b"four", b"three", b"two"]
    assert r.lrange("foo", 0, 3) == [b"four", b"three", b"two", b"one"]


def test_lrange_negative_indices(r: redis.Redis):
    r.rpush("foo", "a", "b", "c")
    assert r.lrange("foo", -1, -2) == []
    assert r.lrange("foo", -2, -1) == [b"b", b"c"]


def test_lpush_key_does_not_exist(r: redis.Redis):
    assert r.lrange("foo", 0, -1) == []


def test_lpush_with_nonstr_key(r: redis.Redis):
    r.lpush(1, "one")
    r.lpush(1, "two")
    r.lpush(1, "three")
    assert r.lrange(1, 0, 2) == [b"three", b"two", b"one"]
    assert r.lrange("1", 0, 2) == [b"three", b"two", b"one"]


def test_lpush_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lpush("foo", "element")


def test_llen(r: redis.Redis):
    r.lpush("foo", "one")
    r.lpush("foo", "two")
    r.lpush("foo", "three")
    assert r.llen("foo") == 3


def test_llen_no_exist(r: redis.Redis):
    assert r.llen("foo") == 0


def test_llen_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.llen("foo")


def test_lrem_positive_count(r: redis.Redis):
    r.lpush("foo", "same")
    r.lpush("foo", "same")
    r.lpush("foo", "different")
    r.lrem("foo", 2, "same")
    assert r.lrange("foo", 0, -1) == [b"different"]


def test_lrem_negative_count(r: redis.Redis):
    r.lpush("foo", "removeme")
    r.lpush("foo", "three")
    r.lpush("foo", "two")
    r.lpush("foo", "one")
    r.lpush("foo", "removeme")
    r.lrem("foo", -1, "removeme")
    # Should remove it from the end of the list,
    # leaving the 'removeme' from the front of the list alone.
    assert r.lrange("foo", 0, -1) == [b"removeme", b"one", b"two", b"three"]


def test_lrem_zero_count(r: redis.Redis):
    r.lpush("foo", "one")
    r.lpush("foo", "one")
    r.lpush("foo", "one")
    r.lrem("foo", 0, "one")
    assert r.lrange("foo", 0, -1) == []


def test_lrem_default_value(r: redis.Redis):
    r.lpush("foo", "one")
    r.lpush("foo", "one")
    r.lpush("foo", "one")
    r.lrem("foo", 0, "one")
    assert r.lrange("foo", 0, -1) == []


def test_lrem_does_not_exist(r: redis.Redis):
    r.lpush("foo", "one")
    r.lrem("foo", 0, "one")
    # These should be noops.
    r.lrem("foo", -2, "one")
    r.lrem("foo", 2, "one")


def test_lrem_return_value(r: redis.Redis):
    r.lpush("foo", "one")
    count = r.lrem("foo", 0, "one")
    assert count == 1
    assert r.lrem("foo", 0, "one") == 0


def test_lrem_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lrem("foo", 0, "element")


def test_rpush(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("foo", "three")
    r.rpush("foo", "four", "five")
    assert r.lrange("foo", 0, -1) == [b"one", b"two", b"three", b"four", b"five"]


def test_rpush_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.rpush("foo", "element")


def test_lpop(r: redis.Redis):
    assert r.rpush("foo", "one") == 1
    assert r.rpush("foo", "two") == 2
    assert r.rpush("foo", "three") == 3
    assert r.lpop("foo") == b"one"
    assert r.lpop("foo") == b"two"
    assert r.lpop("foo") == b"three"


def test_lpop_empty_list(r: redis.Redis):
    r.rpush("foo", "one")
    r.lpop("foo")
    assert r.lpop("foo") is None
    # Verify what happens if we try to pop from a key
    # we've never seen before.
    assert r.lpop("noexists") is None


def test_lpop_zero_elem(r: redis.Redis):
    r.rpush(b"\x00", b"")
    assert r.lpop(b"\x00", 0) == []


def test_lpop_zero_non_existing_list(r: redis.Redis):
    assert r.lpop(b"", 0) is None


def test_lpop_zero_wrong_type(r: redis.Redis):
    r.set(b"", b"")
    with pytest.raises(redis.ResponseError):
        r.lpop(b"", 0)


def test_lpop_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lpop("foo")


@pytest.mark.min_server("6.2")
def test_lpop_count(r: redis.Redis):
    assert r.rpush("foo", "one") == 1
    assert r.rpush("foo", "two") == 2
    assert r.rpush("foo", "three") == 3
    assert testtools.raw_command(r, "lpop", "foo", 2) == [b"one", b"two"]
    # See https://github.com/redis/redis/issues/9680
    raw = testtools.raw_command(r, "rpop", "foo", 0)
    assert raw is None or raw == []  # https://github.com/redis/redis/pull/10095


@pytest.mark.min_server("6.2")
def test_lpop_count_negative(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "lpop", "foo", -1)


def test_lset(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("foo", "three")
    r.lset("foo", 0, "four")
    r.lset("foo", -2, "five")
    assert r.lrange("foo", 0, -1) == [b"four", b"five", b"three"]


def test_lset_index_out_of_range(r: redis.Redis):
    r.rpush("foo", "one")
    with pytest.raises(redis.ResponseError):
        r.lset("foo", 3, "three")


def test_lset_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lset("foo", 0, "element")


def test_rpushx(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpushx("foo", "two")
    r.rpushx("bar", "three")
    assert r.lrange("foo", 0, -1) == [b"one", b"two"]
    assert r.lrange("bar", 0, -1) == []


def test_rpushx_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.rpushx("foo", "element")


def test_ltrim(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("foo", "three")
    r.rpush("foo", "four")

    assert r.ltrim("foo", 1, 3)
    assert r.lrange("foo", 0, -1) == [b"two", b"three", b"four"]
    assert r.ltrim("foo", 1, -1)
    assert r.lrange("foo", 0, -1) == [b"three", b"four"]


def test_ltrim_with_non_existent_key(r: redis.Redis):
    assert r.ltrim("foo", 0, -1)


def test_ltrim_expiry(r: redis.Redis):
    r.rpush("foo", "one", "two", "three")
    r.expire("foo", 10)
    r.ltrim("foo", 1, 2)
    assert r.ttl("foo") > 0


def test_ltrim_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.ltrim("foo", 1, -1)


def test_lindex(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    assert r.lindex("foo", 0) == b"one"
    assert r.lindex("foo", 4) is None
    assert r.lindex("bar", 4) is None


def test_lindex_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lindex("foo", 0)


def test_lpushx(r: redis.Redis):
    r.lpush("foo", "two")
    r.lpushx("foo", "one")
    r.lpushx("bar", "one")
    assert r.lrange("foo", 0, -1) == [b"one", b"two"]
    assert r.lrange("bar", 0, -1) == []


def test_lpushx_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.lpushx("foo", "element")


def test_rpop(r: redis.Redis):
    assert r.rpop("foo") is None
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    assert r.rpop("foo") == b"two"
    assert r.rpop("foo") == b"one"
    assert r.rpop("foo") is None


def test_rpop_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.rpop("foo")


@pytest.mark.min_server("6.2")
def test_rpop_count(r: redis.Redis):
    assert r.rpush("foo", "one") == 1
    assert r.rpush("foo", "two") == 2
    assert r.rpush("foo", "three") == 3
    assert testtools.raw_command(r, "rpop", "foo", 2) == [b"three", b"two"]
    # See https://github.com/redis/redis/issues/9680
    raw = testtools.raw_command(r, "rpop", "foo", 0)
    assert raw is None or raw == []  # https://github.com/redis/redis/pull/10095


@pytest.mark.min_server("6.2")
def test_rpop_count_negative(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "rpop", "foo", -1)


def test_linsert_before(r: redis.Redis):
    r.rpush("foo", "hello")
    r.rpush("foo", "world")
    assert r.linsert("foo", "before", "world", "there") == 3
    assert r.lrange("foo", 0, -1) == [b"hello", b"there", b"world"]
    assert r.linsert("empty_list", "before", "world", "there") == 0


def test_linsert_after(r: redis.Redis):
    r.rpush("foo", "hello")
    r.rpush("foo", "world")
    assert r.linsert("foo", "after", "hello", "there") == 3
    assert r.lrange("foo", 0, -1) == [b"hello", b"there", b"world"]


def test_linsert_bad_command(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "LINSERT", "x", "NOT_BEFORE", "pivot", "val")


def test_linsert_no_pivot(r: redis.Redis):
    r.rpush("foo", "hello")
    r.rpush("foo", "world")
    assert r.linsert("foo", "after", "goodbye", "bar") == -1
    assert r.lrange("foo", 0, -1) == [b"hello", b"world"]


def test_linsert_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.linsert("foo", "after", "bar", "element")


def test_rpoplpush(r: redis.Redis):
    assert r.rpoplpush("foo", "bar") is None
    assert r.lpop("bar") is None
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("bar", "one")

    assert r.rpoplpush("foo", "bar") == b"two"
    assert r.lrange("foo", 0, -1) == [b"one"]
    assert r.lrange("bar", 0, -1) == [b"two", b"one"]

    # Catch instances where we store bytes and strings inconsistently
    # and thus bar = ['two', b'one']
    assert r.lrem("bar", -1, "two") == 1


def test_rpoplpush_to_nonexistent_destination(r: redis.Redis):
    r.rpush("foo", "one")
    assert r.rpoplpush("foo", "bar") == b"one"
    assert r.rpop("bar") == b"one"


def test_rpoplpush_expiry(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("bar", "two")
    r.expire("bar", 10)
    r.rpoplpush("foo", "bar")
    assert r.ttl("bar") > 0


def test_rpoplpush_one_to_self(r: redis.Redis):
    r.rpush("list", "element")
    assert r.brpoplpush("list", "list") == b"element"
    assert r.lrange("list", 0, -1) == [b"element"]


def test_rpoplpush_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    r.rpush("list", "element")
    with pytest.raises(redis.ResponseError):
        r.rpoplpush("foo", "list")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]
    with pytest.raises(redis.ResponseError):
        r.rpoplpush("list", "foo")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]


def test_blpop_single_list(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("foo", "three")
    assert r.blpop(["foo"], timeout=1) == (b"foo", b"one")


def test_blpop_test_multiple_lists(r: redis.Redis):
    r.rpush("baz", "zero")
    assert r.blpop(["foo", "baz"], timeout=1) == (b"baz", b"zero")
    assert not r.exists("baz")

    r.rpush("foo", "one")
    r.rpush("foo", "two")
    # bar has nothing, so the returned value should come
    # from foo.
    assert r.blpop(["bar", "foo"], timeout=1) == (b"foo", b"one")
    r.rpush("bar", "three")
    # bar now has something, so the returned value should come
    # from bar.
    assert r.blpop(["bar", "foo"], timeout=1) == (b"bar", b"three")
    assert r.blpop(["bar", "foo"], timeout=1) == (b"foo", b"two")


def test_blpop_allow_single_key(r: redis.Redis):
    # blpop converts single key arguments to a one element list.
    r.rpush("foo", "one")
    assert r.blpop("foo", timeout=1) == (b"foo", b"one")


@pytest.mark.slow
def test_blpop_block(r: redis.Redis):
    thread = _push_thread(r)
    try:
        assert r.blpop("foo") == (b"foo", b"value1")
        assert r.blpop("foo", timeout=5) == (b"foo", b"value2")
    finally:
        thread.join()


@pytest.mark.slow
def test_blpop_block_float(r: redis.Redis):
    thread = _push_thread(r)
    try:
        assert testtools.raw_command(r, "blpop", "foo", 0) == [b"foo", b"value1"]
        assert testtools.raw_command(r, "blpop", "foo", 1.1) == [b"foo", b"value2"]
    finally:
        thread.join()


@pytest.mark.slow
def test_brpop_block(r: redis.Redis):
    thread = _push_thread(r)
    try:
        assert r.brpop("foo") == (b"foo", b"value1")
        assert r.brpop("foo", timeout=5) == (b"foo", b"value2")
    finally:
        thread.join()


def test_blpop_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.blpop("foo", timeout=1)


def test_blpop_transaction(r: redis.Redis):
    p = r.pipeline()
    p.multi()
    p.blpop("missing", timeout=1000)
    result = p.execute()
    # Blocking commands behave like non-blocking versions in transactions
    assert result == [None]


def test_brpop_test_multiple_lists(r: redis.Redis):
    r.rpush("baz", "zero")
    assert r.brpop(["foo", "baz"], timeout=1) == (b"baz", b"zero")
    assert not r.exists("baz")

    r.rpush("foo", "one")
    r.rpush("foo", "two")
    assert r.brpop(["bar", "foo"], timeout=1) == (b"foo", b"two")


def test_brpop_single_key(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    assert r.brpop("foo", timeout=1) == (b"foo", b"two")


def test_brpop_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.brpop("foo", timeout=1)


def test_brpoplpush_multi_keys(r: redis.Redis):
    assert r.lpop("bar") is None
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    assert r.brpoplpush("foo", "bar", timeout=1) == b"two"
    assert r.lrange("bar", 0, -1) == [b"two"]

    # Catch instances where we store bytes and strings inconsistently
    # and thus bar = ['two']
    assert r.lrem("bar", -1, "two") == 1


@pytest.mark.unsupported_server_types("dragonfly")  # TODO Should this be supported?
def test_brpoplpush_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    r.rpush("list", "element")
    with pytest.raises(redis.ResponseError):
        r.brpoplpush("foo", "list")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]
    with pytest.raises(redis.ResponseError):
        r.brpoplpush("list", "foo")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]


@pytest.mark.slow
def test_blocking_operations_when_empty(r: redis.Redis):
    assert r.blpop(["foo"], timeout=1) is None
    assert r.blpop(["bar", "foo"], timeout=1) is None
    assert r.brpop("foo", timeout=1) is None
    assert r.brpoplpush("foo", "bar", timeout=1) is None


def test_empty_list(r: redis.Redis):
    r.rpush("foo", "bar")
    r.rpop("foo")
    assert not r.exists("foo")


def test_lmove_to_nonexistent_destination(r: redis.Redis):
    r.rpush("foo", "one")
    assert r.lmove("foo", "bar", "RIGHT", "LEFT") == b"one"
    assert r.rpop("bar") == b"one"


def test_lmove_expiry(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("bar", "two")
    r.expire("bar", 10)
    r.lmove("foo", "bar", "RIGHT", "LEFT")
    assert r.ttl("bar") > 0


def test_lmove_wrong_type(r: redis.Redis):
    r.rpush("foo", "one")
    r.rpush("bar", "two")
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "LMOVE", "foo", "bar", "left", "NOT_LEFT_OR_RIGHT")

    r.set("foo", "bar")
    r.rpush("list", "element")
    with pytest.raises(redis.ResponseError):
        r.lmove("foo", "list", "RIGHT", "LEFT")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]
    with pytest.raises(redis.ResponseError):
        r.lmove("list", "foo", "RIGHT", "LEFT")
    assert r.get("foo") == b"bar"
    assert r.lrange("list", 0, -1) == [b"element"]


def test_lmove(r: redis.Redis):
    assert r.lmove("foo", "bar", "RIGHT", "LEFT") is None
    assert r.lpop("bar") is None
    r.rpush("foo", "one")
    r.rpush("foo", "two")
    r.rpush("bar", "one")

    # RPOPLPUSH
    assert r.lmove("foo", "bar", "RIGHT", "LEFT") == b"two"
    assert r.lrange("foo", 0, -1) == [b"one"]
    assert r.lrange("bar", 0, -1) == [b"two", b"one"]
    # LPOPRPUSH
    assert r.lmove("bar", "bar", "LEFT", "RIGHT") == b"two"
    assert r.lrange("bar", 0, -1) == [b"one", b"two"]
    # RPOPRPUSH
    r.rpush("foo", "three")
    assert r.lmove("foo", "bar", "RIGHT", "RIGHT") == b"three"
    assert r.lrange("foo", 0, -1) == [b"one"]
    assert r.lrange("bar", 0, -1) == [b"one", b"two", b"three"]
    # LPOPLPUSH
    assert r.lmove("bar", "foo", "LEFT", "LEFT") == b"one"
    assert r.lrange("foo", 0, -1) == [b"one", b"one"]
    assert r.lrange("bar", 0, -1) == [b"two", b"three"]

    # Catch instances where we store bytes and strings inconsistently
    # and thus bar = ['two', b'one']
    assert r.lrem("bar", -1, "two") == 1


def test_blmove(r: redis.Redis):
    r.rpush("a", "one", "two", "three", "four")
    assert r.blmove("a", "b", 5)
    assert r.blmove("a", "b", 1, "RIGHT", "LEFT")


def test_lpos(r: redis.Redis):
    assert r.rpush("a", "a", "b", "c", "1", "2", "3", "c", "c") == 8
    assert r.lpos("a", "a") == 0
    assert r.lpos("a", "c") == 2

    assert r.lpos("a", "c", rank=1) == 2
    assert r.lpos("a", "c", rank=2) == 6
    assert r.lpos("a", "c", rank=4) is None
    assert r.lpos("a", "c", rank=-1) == 7
    assert r.lpos("a", "c", rank=-2) == 6

    assert r.lpos("a", "c", count=0) == [2, 6, 7]
    assert r.lpos("a", "c", count=1) == [2]
    assert r.lpos("a", "c", count=2) == [2, 6]
    assert r.lpos("a", "c", count=100) == [2, 6, 7]

    assert r.lpos("a", "c", count=0, rank=2) == [6, 7]
    assert r.lpos("a", "c", count=2, rank=-1) == [7, 6]

    assert r.lpos("axxx", "c", count=0, rank=2) == []
    assert r.lpos("axxx", "c") is None

    assert r.lpos("a", "x", count=2) == []
    assert r.lpos("a", "x") is None

    assert r.lpos("a", "a", count=0, maxlen=1) == [0]
    assert r.lpos("a", "c", count=0, maxlen=1) == []
    assert r.lpos("a", "c", count=0, maxlen=3) == [2]
    assert r.lpos("a", "c", count=0, maxlen=3, rank=-1) == [7, 6]
    assert r.lpos("a", "c", count=0, maxlen=7, rank=2) == [6]


@pytest.mark.unsupported_server_types("dragonfly")
@pytest.mark.min_server("7")
def test_blmpop(r: redis.Redis):
    r.rpush("a", "1", "2", "3", "4", "5")
    res = [b"a", [b"1", b"2"]]
    assert r.blmpop(1, "2", "b", "a", direction="LEFT", count=2) == res
    with pytest.raises(TypeError):
        r.blmpop(1, "2", "b", "a", count=2)
    r.rpush("b", "6", "7", "8", "9")
    assert r.blmpop(0, "2", "b", "a", direction="LEFT") == [b"b", [b"6"]]
    assert r.blmpop(1, "2", "foo", "bar", direction="RIGHT") is None


@pytest.mark.unsupported_server_types("dragonfly")
@pytest.mark.min_server("7")
def test_lmpop(r: redis.Redis):
    r.rpush("foo", "1", "2", "3", "4", "5")
    result = [b"foo", [b"1", b"2"]]
    assert r.lmpop("2", "bar", "foo", direction="LEFT", count=2) == result
    with pytest.raises(redis.ResponseError):
        r.lmpop("2", "bar", "foo", direction="up", count=2)
    r.rpush("bar", "a", "b", "c", "d")
    assert r.lmpop("2", "bar", "foo", direction="LEFT") == [b"bar", [b"a"]]


================================================
FILE: tests/fakeredis/test/test_mixins/test_pubsub_commands.py
================================================
import threading
import time
import uuid
from queue import Queue
from time import sleep
from typing import Optional, Dict, Any

import pytest
import redis
from redis.client import PubSub

from .. import testtools


def wait_for_message(
    pubsub: PubSub, timeout=0.5, ignore_subscribe_messages=False
) -> Optional[Dict[str, Any]]:
    now = time.time()
    timeout = now + timeout
    while now < timeout:
        message = pubsub.get_message(
            ignore_subscribe_messages=ignore_subscribe_messages
        )
        if message is not None:
            return message
        time.sleep(0.01)
        now = time.time()
    return None


def make_message(_type, channel, data, pattern=None):
    return {
        "type": _type,
        "pattern": pattern and pattern.encode("utf-8") or None,
        "channel": channel and channel.encode("utf-8") or None,
        "data": data.encode("utf-8") if isinstance(data, str) else data,
    }


def test_ping_pubsub(r: redis.Redis):
    p = r.pubsub()
    p.subscribe("channel")
    p.parse_response()  # Consume the subscribe command reply
    p.ping()
    assert p.parse_response() == [b"pong", b""]
    p.ping("test")
    assert p.parse_response() == [b"pong", b"test"]


@pytest.mark.slow
def test_pubsub_subscribe(r: redis.Redis):
    pubsub = r.pubsub()
    pubsub.subscribe("channel")
    sleep(1)
    expected_message = {
        "type": "subscribe",
        "pattern": None,
        "channel": b"channel",
        "data": 1,
    }
    message = pubsub.get_message()
    keys = list(pubsub.channels.keys())

    key = keys[0]
    key = key if type(key) is bytes else bytes(key, encoding="utf-8")

    assert len(keys) == 1
    assert key == b"channel"
    assert message == expected_message


@pytest.mark.slow
def test_pubsub_numpat(r: redis.Redis):
    p = r.pubsub()
    p.psubscribe("*oo", "*ar", "b*z")
    for i in range(3):
        assert wait_for_message(p)["type"] == "psubscribe"
    assert r.pubsub_numpat() == 3


@pytest.mark.slow
def test_pubsub_psubscribe(r: redis.Redis):
    pubsub = r.pubsub()
    pubsub.psubscribe("channel.*")
    sleep(1)
    expected_message = {
        "type": "psubscribe",
        "pattern": None,
        "channel": b"channel.*",
        "data": 1,
    }

    message = pubsub.get_message()
    keys = list(pubsub.patterns.keys())
    assert len(keys) == 1
    assert message == expected_message


@pytest.mark.slow
def test_pubsub_unsubscribe(r: redis.Redis):
    pubsub = r.pubsub()
    pubsub.subscribe("channel-1", "channel-2", "channel-3")
    sleep(1)
    expected_message = {
        "type": "unsubscribe",
        "pattern": None,
        "channel": b"channel-1",
        "data": 2,
    }
    pubsub.get_message()
    pubsub.get_message()
    pubsub.get_message()

    # unsubscribe from one
    pubsub.unsubscribe("channel-1")
    sleep(1)
    message = pubsub.get_message()
    keys = list(pubsub.channels.keys())
    assert message == expected_message
    assert len(keys) == 2

    # unsubscribe from multiple
    pubsub.unsubscribe()
    sleep(1)
    pubsub.get_message()
    pubsub.get_message()
    keys = list(pubsub.channels.keys())
    assert message == expected_message
    assert len(keys) == 0


@pytest.mark.slow
def test_pubsub_punsubscribe(r: redis.Redis):
    pubsub = r.pubsub()
    pubsub.psubscribe("channel-1.*", "channel-2.*", "channel-3.*")
    sleep(1)
    expected_message = {
        "type": "punsubscribe",
        "pattern": None,
        "channel": b"channel-1.*",
        "data": 2,
    }
    pubsub.get_message()
    pubsub.get_message()
    pubsub.get_message()

    # unsubscribe from one
    pubsub.punsubscribe("channel-1.*")
    sleep(1)
    message = pubsub.get_message()
    keys = list(pubsub.patterns.keys())
    assert message == expected_message
    assert len(keys) == 2

    # unsubscribe from multiple
    pubsub.punsubscribe()
    sleep(1)
    pubsub.get_message()
    pubsub.get_message()
    keys = list(pubsub.patterns.keys())
    assert len(keys) == 0


@pytest.mark.slow
def test_pubsub_listen(r: redis.Redis):
    def _listen(pubsub, q):
        count = 0
        for message in pubsub.listen():
            q.put(message)
            count += 1
            if count == 4:
                pubsub.close()

    channel = "ch1"
    patterns = ["ch1*", "ch[1]", "ch?"]
    pubsub = r.pubsub()
    pubsub.subscribe(channel)
    pubsub.psubscribe(*patterns)
    sleep(1)
    msgs = [pubsub.get_message() for _ in range(4)]
    assert msgs[0]["type"] == "subscribe"
    for i in range(1, 4):
        assert msgs[i]["type"] == "psubscribe"

    q = Queue()
    t = threading.Thread(target=_listen, args=(pubsub, q))
    t.start()
    msg = "hello world"
    r.publish(channel, msg)
    t.join()

    msgs = [q.get() for _ in range(4)]

    bpatterns = [pattern.encode() for pattern in patterns]
    bpatterns.append(channel.encode())
    msg = msg.encode()
    for item in msgs:
        assert item["data"] == msg
        assert item["channel"] in bpatterns


@pytest.mark.slow
def test_pubsub_listen_handler(r: redis.Redis):
    def _handler(message):
        calls.append(message)

    channel = "ch1"
    patterns = {"ch?": _handler}
    calls = []

    pubsub = r.pubsub()
    pubsub.subscribe(ch1=_handler)
    pubsub.psubscribe(**patterns)
    sleep(1)
    msg1 = pubsub.get_message()
    msg2 = pubsub.get_message()
    assert msg1["type"] == "subscribe"
    assert msg2["type"] == "psubscribe"
    msg = "hello world"
    r.publish(channel, msg)
    sleep(1)
    for i in range(2):
        msg = pubsub.get_message()
        assert msg is None  # get_message returns None when handler is used
    pubsub.close()
    calls.sort(key=lambda call: call["type"])
    assert calls == [
        {"pattern": None, "channel": b"ch1", "data": b"hello world", "type": "message"},
        {
            "pattern": b"ch?",
            "channel": b"ch1",
            "data": b"hello world",
            "type": "pmessage",
        },
    ]


@pytest.mark.slow
def test_pubsub_ignore_sub_messages_listen(r: redis.Redis):
    def _listen(pubsub, q):
        count = 0
        for message in pubsub.listen():
            q.put(message)
            count += 1
            if count == 4:
                pubsub.close()

    channel = "ch1"
    patterns = ["ch1*", "ch[1]", "ch?"]
    pubsub = r.pubsub(ignore_subscribe_messages=True)
    pubsub.subscribe(channel)
    pubsub.psubscribe(*patterns)
    sleep(1)

    q = Queue()
    t = threading.Thread(target=_listen, args=(pubsub, q))
    t.start()
    msg = "hello world"
    r.publish(channel, msg)
    t.join()

    msg1 = q.get()
    msg2 = q.get()
    msg3 = q.get()
    msg4 = q.get()

    bpatterns = [pattern.encode() for pattern in patterns]
    bpatterns.append(channel.encode())
    msg = msg.encode()
    assert msg1["data"] == msg
    assert msg1["channel"] in bpatterns
    assert msg2["data"] == msg
    assert msg2["channel"] in bpatterns
    assert msg3["data"] == msg
    assert msg3["channel"] in bpatterns
    assert msg4["data"] == msg
    assert msg4["channel"] in bpatterns


@pytest.mark.slow
def test_pubsub_binary(r: redis.Redis):
    def _listen(pubsub, q):
        for message in pubsub.listen():
            q.put(message)
            pubsub.close()

    pubsub = r.pubsub(ignore_subscribe_messages=True)
    pubsub.subscribe("channel\r\n\xff")
    sleep(1)

    q = Queue()
    t = threading.Thread(target=_listen, args=(pubsub, q))
    t.start()
    msg = b"\x00hello world\r\n\xff"
    r.publish("channel\r\n\xff", msg)
    t.join()

    received = q.get()
    assert received["data"] == msg


@pytest.mark.slow
def test_pubsub_run_in_thread(r: redis.Redis):
    q = Queue()

    pubsub = r.pubsub()
    pubsub.subscribe(channel=q.put)
    pubsub_thread = pubsub.run_in_thread()

    msg = b"Hello World"
    r.publish("channel", msg)

    retrieved = q.get()
    assert retrieved["data"] == msg

    pubsub_thread.stop()
    # Newer versions of redis wait for an unsubscribe message, which sometimes comes early
    # https://github.com/andymccurdy/redis-py/issues/1150
    if pubsub.channels:
        pubsub.channels = {}
    pubsub_thread.join()
    assert not pubsub_thread.is_alive()

    pubsub.subscribe(channel=None)
    with pytest.raises(redis.exceptions.PubSubError):
        pubsub_thread = pubsub.run_in_thread()

    pubsub.unsubscribe("channel")

    pubsub.psubscribe(channel=None)
    with pytest.raises(redis.exceptions.PubSubError):
        pubsub_thread = pubsub.run_in_thread()


@pytest.mark.slow
@pytest.mark.parametrize(
    "timeout_value",
    [1, pytest.param(None, marks=testtools.run_test_if_redispy_ver("gte", "3.2"))],
)
def test_pubsub_timeout(r, timeout_value):
    def publish():
        sleep(0.1)
        r.publish("channel", "hello")

    p = r.pubsub()
    p.subscribe("channel")
    p.parse_response()  # Drains the subscribe command message
    publish_thread = threading.Thread(target=publish)
    publish_thread.start()
    message = p.get_message(timeout=timeout_value)
    assert message == {
        "type": "message",
        "pattern": None,
        "channel": b"channel",
        "data": b"hello",
    }
    publish_thread.join()

    if timeout_value is not None:
        # For infinite timeout case don't wait for the message that will never appear.
        message = p.get_message(timeout=timeout_value)
        assert message is None


def test_pubsub_channels(r: redis.Redis):
    p = r.pubsub()
    p.subscribe("foo", "bar", "baz", "test")
    expected = {b"foo", b"bar", b"baz", b"test"}
    assert set(r.pubsub_channels()) == expected


def test_pubsub_channels_pattern(r: redis.Redis):
    p = r.pubsub()
    p.subscribe("foo", "bar", "baz", "test")
    assert set(r.pubsub_channels("b*")) == {
        b"bar",
        b"baz",
    }


def test_pubsub_no_subcommands(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "PUBSUB")


@pytest.mark.min_server("7")
@pytest.mark.max_server("7")
def test_pubsub_help_redis7(r: redis.Redis):
    assert testtools.raw_command(r, "PUBSUB HELP") == [
        b"PUBSUB <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        b"CHANNELS [<pattern>]",
        b"    Return the currently active channels matching a <pattern> (default: '*')"
        b".",
        b"NUMPAT",
        b"    Return number of subscriptions to patterns.",
        b"NUMSUB [<channel> ...]",
        b"    Return the number of subscribers for the specified channels, excluding",
        b"    pattern subscriptions(default: no channels).",
        b"SHARDCHANNELS [<pattern>]",
        b"    Return the currently active shard level channels matching a <pattern> (d"
        b"efault: '*').",
        b"SHARDNUMSUB [<shardchannel> ...]",
        b"    Return the number of subscribers for the specified shard level channel(s"
        b")",
        b"HELP",
        b"    Prints this help.",
    ]


@pytest.mark.min_server("7.1")
def test_pubsub_help_redis71(r: redis.Redis):
    assert testtools.raw_command(r, "PUBSUB HELP") == [
        b"PUBSUB <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        b"CHANNELS [<pattern>]",
        b"    Return the currently active channels matching a <pattern> (default: '*')"
        b".",
        b"NUMPAT",
        b"    Return number of subscriptions to patterns.",
        b"NUMSUB [<channel> ...]",
        b"    Return the number of subscribers for the specified channels, excluding",
        b"    pattern subscriptions(default: no channels).",
        b"SHARDCHANNELS [<pattern>]",
        b"    Return the currently active shard level channels matching a <pattern> (d"
        b"efault: '*').",
        b"SHARDNUMSUB [<shardchannel> ...]",
        b"    Return the number of subscribers for the specified shard level channel(s"
        b")",
        b"HELP",
        b"    Print this help.",
    ]


def test_pubsub_numsub(r: redis.Redis):
    a = uuid.uuid4().hex
    b = uuid.uuid4().hex
    c = uuid.uuid4().hex
    p1 = r.pubsub()
    p2 = r.pubsub()

    p1.subscribe(a, b, c)
    p2.subscribe(a, b)

    assert r.pubsub_numsub(a, b, c) == [
        (a.encode(), 2),
        (b.encode(), 2),
        (c.encode(), 1),
    ]
    assert r.pubsub_numsub() == []
    assert r.pubsub_numsub(a, "non-existing") == [(a.encode(), 2), (b"non-existing", 0)]


@pytest.mark.min_server("7")
@testtools.run_test_if_redispy_ver("gte", "5.0.0rc2")
@pytest.mark.unsupported_server_types("dragonfly")
def test_published_message_to_shard_channel(r: redis.Redis):
    p = r.pubsub()
    p.ssubscribe("foo")
    assert wait_for_message(p) == make_message("ssubscribe", "foo", 1)
    assert r.spublish("foo", "test message") == 1

    message = wait_for_message(p)
    assert isinstance(message, dict)
    assert message == make_message("smessage", "foo", "test message")


@pytest.mark.min_server("7")
@testtools.run_test_if_redispy_ver("gte", "5.0.0")
@pytest.mark.unsupported_server_types("dragonfly")
def test_subscribe_property_with_shard_channels_cluster(r: redis.Redis):
    p = r.pubsub()
    keys = ["foo", "bar", "uni" + chr(4456) + "code"]
    assert p.subscribed is False
    p.ssubscribe(keys[0])
    # we're now subscribed even though we haven't processed the reply from the server just yet
    assert p.subscribed is True
    assert wait_for_message(p) == make_message("ssubscribe", keys[0], 1)
    # we're still subscribed
    assert p.subscribed is True

    # unsubscribe from all shard_channels
    p.sunsubscribe()
    # we're still technically subscribed until we process the response messages from the server
    assert p.subscribed is True
    assert wait_for_message(p) == make_message("sunsubscribe", keys[0], 0)
    # now we're no longer subscribed as no more messages can be delivered to any channels we were listening to
    assert p.subscribed is False

    # subscribing again flips the flag back
    p.ssubscribe(keys[0])
    assert p.subscribed is True
    assert wait_for_message(p) == make_message("ssubscribe", keys[0], 1)

    # unsubscribe again
    p.sunsubscribe()
    assert p.subscribed is True
    # subscribe to another shard_channel before reading the unsubscribe response
    p.ssubscribe(keys[1])
    assert p.subscribed is True
    # read the unsubscribe for key1
    assert wait_for_message(p) == make_message("sunsubscribe", keys[0], 0)
    # we're still subscribed to key2, so subscribed should still be True
    assert p.subscribed is True
    # read the key2 subscribe message
    assert wait_for_message(p) == make_message("ssubscribe", keys[1], 1)
    p.sunsubscribe()
    # haven't read the message yet, so we're still subscribed
    assert p.subscribed is True
    assert wait_for_message(p) == make_message("sunsubscribe", keys[1], 0)
    # now we're finally unsubscribed
    assert p.subscribed is False


@pytest.mark.min_server("7")
@testtools.run_test_if_redispy_ver("gte", "5.0.0")
@pytest.mark.unsupported_server_types("dragonfly")
def test_pubsub_shardnumsub(r: redis.Redis):
    channels = {b"foo", b"bar", b"baz"}
    p1 = r.pubsub()
    p1.ssubscribe(*channels)
    for node in channels:
        assert wait_for_message(p1)["type"] == "ssubscribe"
    p2 = r.pubsub()
    p2.ssubscribe("bar", "baz")
    for i in range(2):
        assert wait_for_message(p2)["type"] == "ssubscribe"
    p3 = r.pubsub()
    p3.ssubscribe("baz")
    assert wait_for_message(p3)["type"] == "ssubscribe"

    channels = [(b"foo", 1), (b"bar", 2), (b"baz", 3)]
    assert r.pubsub_shardnumsub("foo", "bar", "baz", target_nodes="all") == channels


@pytest.mark.min_server("7")
@testtools.run_test_if_redispy_ver("gte", "5.0.0rc2")
@pytest.mark.unsupported_server_types("dragonfly")
def test_pubsub_shardchannels(r: redis.Redis):
    p = r.pubsub()
    p.ssubscribe("foo", "bar", "baz", "quux")
    for i in range(4):
        assert wait_for_message(p)["type"] == "ssubscribe"
    expected = [b"bar", b"baz", b"foo", b"quux"]
    assert all([channel in r.pubsub_shardchannels() for channel in expected])


================================================
FILE: tests/fakeredis/test/test_mixins/test_scan.py
================================================
from time import sleep

import pytest
import redis

from test.testtools import key_val_dict


def test_sscan_delete_key_while_scanning_should_not_returns_it_in_scan(r: redis.Redis):
    size = 600
    name = "sscan-test"
    all_keys_set = {f"{i}".encode() for i in range(size)}
    r.sadd(name, *[k for k in all_keys_set])
    assert r.scard(name) == size

    cursor, keys = r.sscan(name, 0)
    assert len(keys) < len(all_keys_set)

    key_to_remove = next(x for x in all_keys_set if x not in keys)
    assert r.srem(name, key_to_remove) == 1
    assert not r.sismember(name, key_to_remove)
    while cursor != 0:
        cursor, data = r.sscan(name, cursor=cursor)
        keys.extend(data)
    assert len(set(keys)) == len(keys)
    assert len(keys) == size - 1
    assert key_to_remove not in keys


def test_hscan_delete_key_while_scanning_should_not_returns_it_in_scan(r: redis.Redis):
    size = 600
    name = "hscan-test"
    all_keys_dict = key_val_dict(size=size)
    r.hset(name, mapping=all_keys_dict)
    assert len(r.hgetall(name)) == size

    cursor, keys = r.hscan(name, 0)
    assert len(keys) < len(all_keys_dict)

    key_to_remove = next(x for x in all_keys_dict if x not in keys)
    assert r.hdel(name, key_to_remove) == 1
    assert r.hget(name, key_to_remove) is None
    while cursor != 0:
        cursor, data = r.hscan(name, cursor=cursor)
        keys.update(data)
    assert len(set(keys)) == len(keys)
    assert len(keys) == size - 1
    assert key_to_remove not in keys


def test_scan_delete_unseen_key_while_scanning_should_not_returns_it_in_scan(
    r: redis.Redis,
):
    size = 30
    all_keys_dict = key_val_dict(size=size)
    assert all(r.set(k, v) for k, v in all_keys_dict.items())
    assert len(r.keys()) == size

    cursor, keys = r.scan()

    key_to_remove = next(x for x in all_keys_dict if x not in keys)
    assert r.delete(key_to_remove) == 1
    assert r.get(key_to_remove) is None
    while cursor != 0:
        cursor, data = r.scan(cursor=cursor)
        keys.extend(data)
    assert len(set(keys)) == len(keys)
    assert len(keys) == size - 1
    assert key_to_remove not in keys


# @pytest.mark.xfail # todo
# def test_scan_delete_seen_key_while_scanning_should_return_all_keys(r: redis.Redis):
#     size = 30
#     all_keys_dict = key_val_dict(size=size)
#     assert all(r.set(k, v) for k, v in all_keys_dict.items())
#     assert len(r.keys()) == size
#
#     cursor, keys = r.scan()
#
#     key_to_remove = keys[0]
#     assert r.delete(keys[0]) == 1
#     assert r.get(key_to_remove) is None
#     while cursor != 0:
#         cursor, data = r.scan(cursor=cursor)
#         keys.extend(data)
#
#     assert len(set(keys)) == len(keys)
#     keys = set(keys)
#     assert len(keys) == size, f"{set(all_keys_dict).difference(keys)} is not empty but should be"
#     assert key_to_remove in keys


def test_scan_add_key_while_scanning_should_return_all_keys(r: redis.Redis):
    size = 30
    all_keys_dict = key_val_dict(size=size)
    assert all(r.set(k, v) for k, v in all_keys_dict.items())
    assert len(r.keys()) == size

    cursor, keys = r.scan()

    r.set("new_key", "new val")
    while cursor != 0:
        cursor, data = r.scan(cursor=cursor)
        keys.extend(data)

    keys = set(keys)
    assert (
        len(keys) >= size
    ), f"{set(all_keys_dict).difference(keys)} is not empty but should be"


def test_scan(r: redis.Redis):
    # Set up the data
    for ix in range(20):
        k = "scan-test:%s" % ix
        v = "result:%s" % ix
        r.set(k, v)
    expected = r.keys()
    assert len(expected) == 20  # Ensure we know what we're testing

    # Test that we page through the results and get everything out
    results = []
    cursor = "0"
    while cursor != 0:
        cursor, data = r.scan(cursor, count=6)
        results.extend(data)
    assert set(expected) == set(results)

    # Now test that the MATCH functionality works
    results = []
    cursor = "0"
    while cursor != 0:
        cursor, data = r.scan(cursor, match="*7", count=100)
        results.extend(data)
    assert b"scan-test:7" in results
    assert b"scan-test:17" in results
    assert len(set(results)) == 2

    # Test the match on iterator
    results = [r for r in r.scan_iter(match="*7")]
    assert b"scan-test:7" in results
    assert b"scan-test:17" in results
    assert len(set(results)) == 2


def test_scan_single(r: redis.Redis):
    r.set("foo1", "bar1")
    assert r.scan(match="foo*") == (0, [b"foo1"])


def test_scan_iter_single_page(r: redis.Redis):
    r.set("foo1", "bar1")
    r.set("foo2", "bar2")
    assert set(r.scan_iter(match="foo*")) == {b"foo1", b"foo2"}
    assert set(r.scan_iter()) == {b"foo1", b"foo2"}
    assert set(r.scan_iter(match="")) == set()
    assert set(r.scan_iter(match="foo1", _type="string")) == {
        b"foo1",
    }


def test_scan_iter_multiple_pages(r: redis.Redis):
    all_keys = key_val_dict(size=100)
    assert all(r.set(k, v) for k, v in all_keys.items())
    assert set(r.scan_iter()) == set(all_keys)


def test_scan_iter_multiple_pages_with_match(r: redis.Redis):
    all_keys = key_val_dict(size=100)
    assert all(r.set(k, v) for k, v in all_keys.items())
    # Now add a few keys that don't match the key:<number> pattern.
    r.set("otherkey", "foo")
    r.set("andanother", "bar")
    actual = set(r.scan_iter(match="key:*"))
    assert actual == set(all_keys)


def test_scan_multiple_pages_with_count_arg(r: redis.Redis):
    all_keys = key_val_dict(size=100)
    assert all(r.set(k, v) for k, v in all_keys.items())
    assert set(r.scan_iter(count=1000)) == set(all_keys)


def test_scan_all_in_single_call(r: redis.Redis):
    all_keys = key_val_dict(size=100)
    assert all(r.set(k, v) for k, v in all_keys.items())
    # Specify way more than the 100 keys we've added.
    actual = r.scan(count=1000)
    assert set(actual[1]) == set(all_keys)
    assert actual[0] == 0


@pytest.mark.slow
def test_scan_expired_key(r: redis.Redis):
    r.set("expiringkey", "value")
    r.pexpire("expiringkey", 1)
    sleep(1)
    assert r.scan()[1] == []


def test_scan_stream(r: redis.Redis):
    r.xadd("mystream", {"test": "value"})
    assert r.type("mystream") == b"stream"  # noqa: E721
    for s in r.scan_iter(_type="STRING"):
        print(s)


================================================
FILE: tests/fakeredis/test/test_mixins/test_scripting.py
================================================
from __future__ import annotations

import pytest
import redis
import redis.client
from redis.exceptions import ResponseError

from test.testtools import raw_command

json_tests = pytest.importorskip("lupa")


@pytest.mark.min_server("7")
def test_script_exists_redis7(r: redis.Redis):
    # test response for no arguments by bypassing the py-redis command
    # as it requires at least one argument
    with pytest.raises(redis.ResponseError):
        raw_command(r, "SCRIPT EXISTS")

    # use single character characters for non-existing scripts, as those
    # will never be equal to an actual sha1 hash digest
    assert r.script_exists("a") == [0]
    assert r.script_exists("a", "b", "c", "d", "e", "f") == [0, 0, 0, 0, 0, 0]

    sha1_one = r.script_load("return 'a'")
    assert r.script_exists(sha1_one) == [1]
    assert r.script_exists(sha1_one, "a") == [1, 0]
    assert r.script_exists("a", "b", "c", sha1_one, "e") == [0, 0, 0, 1, 0]

    sha1_two = r.script_load("return 'b'")
    assert r.script_exists(sha1_one, sha1_two) == [1, 1]
    assert r.script_exists("a", sha1_one, "c", sha1_two, "e", "f") == [0, 1, 0, 1, 0, 0]


@pytest.mark.parametrize("args", [("a",), tuple("abcdefghijklmn")])
@pytest.mark.unsupported_server_types("dragonfly")
def test_script_flush_errors_with_args(r, args):
    with pytest.raises(redis.ResponseError):
        raw_command(r, "SCRIPT FLUSH %s" % " ".join(args))


def test_script_flush(r: redis.Redis):
    # generate/load six unique scripts and store their sha1 hash values
    sha1_values = [r.script_load("return '%s'" % char) for char in "abcdef"]

    # assert the scripts all exist prior to flushing
    assert r.script_exists(*sha1_values) == [1] * len(sha1_values)

    # flush and assert OK response
    assert r.script_flush() is True

    # assert none of the scripts exists after flushing
    assert r.script_exists(*sha1_values) == [0] * len(sha1_values)


def test_script_no_subcommands(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        raw_command(r, "SCRIPT")


@pytest.mark.max_server("7")
def test_script_help(r: redis.Redis):
    assert raw_command(r, "SCRIPT HELP") == [
        b"SCRIPT <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        b"DEBUG (YES|SYNC|NO)",
        b"    Set the debug mode for subsequent scripts executed.",
        b"EXISTS <sha1> [<sha1> ...]",
        b"    Return information about the existence of the scripts in the script cach"
        b"e.",
        b"FLUSH [ASYNC|SYNC]",
        b"    Flush the Lua scripts cache. Very dangerous on replicas.",
        b"    When called without the optional mode argument, the behavior is determin"
        b"ed by the",
        b"    lazyfree-lazy-user-flush configuration directive. Valid modes are:",
        b"    * ASYNC: Asynchronously flush the scripts cache.",
        b"    * SYNC: Synchronously flush the scripts cache.",
        b"KILL",
        b"    Kill the currently executing Lua script.",
        b"LOAD <script>",
        b"    Load a script into the scripts cache without executing it.",
        b"HELP",
        b"    Prints this help.",
    ]


@pytest.mark.min_server("7.1")
def test_script_help71(r: redis.Redis):
    assert raw_command(r, "SCRIPT HELP") == [
        b"SCRIPT <subcommand> [<arg> [value] [opt] ...]. Subcommands are:",
        b"DEBUG (YES|SYNC|NO)",
        b"    Set the debug mode for subsequent scripts executed.",
        b"EXISTS <sha1> [<sha1> ...]",
        b"    Return information about the existence of the scripts in the script cach"
        b"e.",
        b"FLUSH [ASYNC|SYNC]",
        b"    Flush the Lua scripts cache. Very dangerous on replicas.",
        b"    When called without the optional mode argument, the behavior is determin"
        b"ed by the",
        b"    lazyfree-lazy-user-flush configuration directive. Valid modes are:",
        b"    * ASYNC: Asynchronously flush the scripts cache.",
        b"    * SYNC: Synchronously flush the scripts cache.",
        b"KILL",
        b"    Kill the currently executing Lua script.",
        b"LOAD <script>",
        b"    Load a script into the scripts cache without executing it.",
        b"HELP",
        b"    Print this help.",
    ]


@pytest.mark.max_server("7.1")
def test_eval_blpop(r: redis.Redis):
    r.rpush("foo", "bar")
    with pytest.raises(
        redis.ResponseError, match="This Redis command is not allowed from script"
    ):
        r.eval('return redis.pcall("BLPOP", KEYS[1], 1)', 1, "foo")


def test_eval_set_value_to_arg(r: redis.Redis):
    r.eval('redis.call("SET", KEYS[1], ARGV[1])', 1, "foo", "bar")
    val = r.get("foo")
    assert val == b"bar"


def test_eval_conditional(r: redis.Redis):
    lua = """
    local val = redis.call("GET", KEYS[1])
    if val == ARGV[1] then
        redis.call("SET", KEYS[1], ARGV[2])
    else
        redis.call("SET", KEYS[1], ARGV[1])
    end
    """
    r.eval(lua, 1, "foo", "bar", "baz")
    val = r.get("foo")
    assert val == b"bar"
    r.eval(lua, 1, "foo", "bar", "baz")
    val = r.get("foo")
    assert val == b"baz"


def test_eval_table(r: redis.Redis):
    lua = """
    local a = {}
    a[1] = "foo"
    a[2] = "bar"
    a[17] = "baz"
    return a
    """
    val = r.eval(lua, 0)
    assert val == [b"foo", b"bar"]


def test_eval_table_with_nil(r: redis.Redis):
    lua = """
    local a = {}
    a[1] = "foo"
    a[2] = nil
    a[3] = "bar"
    return a
    """
    val = r.eval(lua, 0)
    assert val == [b"foo"]


def test_eval_table_with_numbers(r: redis.Redis):
    lua = """
    local a = {}
    a[1] = 42
    return a
    """
    val = r.eval(lua, 0)
    assert val == [42]


def test_eval_nested_table(r: redis.Redis):
    lua = """
    local a = {}
    a[1] = {}
    a[1][1] = "foo"
    return a
    """
    val = r.eval(lua, 0)
    assert val == [[b"foo"]]


def test_eval_iterate_over_argv(r: redis.Redis):
    lua = """
    for i, v in ipairs(ARGV) do
    end
    return ARGV
    """
    val = r.eval(lua, 0, "a", "b", "c")
    assert val == [b"a", b"b", b"c"]


def test_eval_iterate_over_keys(r: redis.Redis):
    lua = """
    for i, v in ipairs(KEYS) do
    end
    return KEYS
    """
    val = r.eval(lua, 2, "a", "b", "c")
    assert val == [b"a", b"b"]


def test_eval_mget(r: redis.Redis):
    r.set("foo1", "bar1")
    r.set("foo2", "bar2")
    val = r.eval('return redis.call("mget", "foo1", "foo2")', 2, "foo1", "foo2")
    assert val == [b"bar1", b"bar2"]


def test_eval_mget_not_set(r: redis.Redis):
    val = r.eval('return redis.call("mget", "foo1", "foo2")', 2, "foo1", "foo2")
    assert val == [None, None]


def test_eval_hgetall(r: redis.Redis):
    r.hset("foo", "k1", "bar")
    r.hset("foo", "k2", "baz")
    val = r.eval('return redis.call("hgetall", "foo")', 1, "foo")
    sorted_val = sorted([val[:2], val[2:]])
    assert sorted_val == [[b"k1", b"bar"], [b"k2", b"baz"]]


def test_eval_hgetall_iterate(r: redis.Redis):
    r.hset("foo", "k1", "bar")
    r.hset("foo", "k2", "baz")
    lua = """
    local result = redis.call("hgetall", "foo")
    for i, v in ipairs(result) do
    end
    return result
    """
    val = r.eval(lua, 1, "foo")
    sorted_val = sorted([val[:2], val[2:]])
    assert sorted_val == [[b"k1", b"bar"], [b"k2", b"baz"]]


def test_eval_invalid_command(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval('return redis.call("FOO")', 0)


def test_eval_syntax_error(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval('return "', 0)


def test_eval_runtime_error(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval('error("CRASH")', 0)


def test_eval_more_keys_than_args(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval("return 1", 42)


def test_eval_numkeys_float_string(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval("return KEYS[1]", "0.7", "foo")


def test_eval_numkeys_integer_string(r: redis.Redis):
    val = r.eval("return KEYS[1]", "1", "foo")
    assert val == b"foo"


def test_eval_numkeys_negative(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval("return KEYS[1]", -1, "foo")


def test_eval_numkeys_float(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval("return KEYS[1]", 0.7, "foo")


def test_eval_global_variable(r: redis.Redis):
    # Redis doesn't allow script to define global variables
    with pytest.raises(ResponseError):
        r.eval("a=10", 0)


def test_eval_global_and_return_ok(r: redis.Redis):
    # Redis doesn't allow script to define global variables
    with pytest.raises(ResponseError):
        r.eval(
            """
            a=10
            return redis.status_reply("Everything is awesome")
            """,
            0,
        )


# Dragonfly uses lua5.4, so it natively supports doubles.
# To use legacy rounding of doubles to integers run dragonfly with --lua_resp2_legacy_float
def test_eval_convert_number(r: redis.Redis):
    # Redis forces all Lua numbers to integer
    val = r.eval("return 3.2", 0)
    assert val == 3
    val = r.eval("return 3.8", 0)
    assert val == 3
    val = r.eval("return -3.8", 0)
    assert val == -3


def test_eval_convert_bool(r: redis.Redis):
    # Redis converts true to 1 and false to nil (which redis-py converts to None)
    assert r.eval("return false", 0) is None
    val = r.eval("return true", 0)
    assert val == 1
    assert not isinstance(val, bool)


@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")  # dragonfly allows this
def test_eval_call_bool7(r: redis.Redis):
    # Redis doesn't allow Lua bools to be passed to [p]call
    with pytest.raises(
        redis.ResponseError,
        match=r"Lua redis lib command arguments must be strings or integers",
    ):
        r.eval('return redis.call("SET", KEYS[1], true)', 1, "testkey")


def test_eval_return_error(r: redis.Redis):
    with pytest.raises(redis.ResponseError, match="Testing") as exc_info:
        r.eval('return {err="Testing"}', 0)
    assert isinstance(exc_info.value.args[0], str)
    with pytest.raises(redis.ResponseError, match="Testing") as exc_info:
        r.eval('return redis.error_reply("Testing")', 0)
    assert isinstance(exc_info.value.args[0], str)


def test_eval_return_redis_error(r: redis.Redis):
    with pytest.raises(redis.ResponseError) as exc_info:
        r.eval('return redis.pcall("BADCOMMAND")', 0)
    assert isinstance(exc_info.value.args[0], str)


def test_eval_return_ok(r: redis.Redis):
    val = r.eval('return {ok="Testing"}', 0)
    assert val == b"Testing"
    val = r.eval('return redis.status_reply("Testing")', 0)
    assert val == b"Testing"


def test_eval_return_ok_nested(r: redis.Redis):
    val = r.eval(
        """
        local a = {}
        a[1] = {ok="Testing"}
        return a
        """,
        0,
    )
    assert val == [b"Testing"]


def test_eval_return_ok_wrong_type(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.eval("return redis.status_reply(123)", 0)


def test_eval_pcall(r: redis.Redis):
    val = r.eval(
        """
        local a = {}
        a[1] = redis.pcall("foo")
        return a
        """,
        0,
    )
    assert isinstance(val, list)
    assert len(val) == 1
    assert isinstance(val[0], ResponseError)


def test_eval_pcall_return_value(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.eval('return redis.pcall("foo")', 0)


def test_eval_delete(r: redis.Redis):
    r.set("foo", "bar")
    val = r.get("foo")
    assert val == b"bar"
    val = r.eval('redis.call("DEL", KEYS[1])', 1, "foo")
    assert val is None


def test_eval_exists(r: redis.Redis):
    val = r.eval('return redis.call("exists", KEYS[1]) == 0', 1, "foo")
    assert val == 1


@pytest.mark.unsupported_server_types("dragonfly")
def test_eval_flushdb(r: redis.Redis):
    r.set("foo", "bar")
    val = r.eval(
        """
        local value = redis.call("FLUSHDB");
        return type(value) == "table" and value.ok == "OK";
        """,
        0,
    )
    assert val == 1


@pytest.mark.unsupported_server_types("dragonfly")
def test_eval_flushall(r, create_redis):
    r1 = create_redis(db=2)
    r2 = create_redis(db=3)

    r1["r1"] = "r1"
    r2["r2"] = "r2"

    val = r.eval(
        """
        local value = redis.call("FLUSHALL");
        return type(value) == "table" and value.ok == "OK";
        """,
        0,
    )

    assert val == 1
    assert "r1" not in r1
    assert "r2" not in r2


# Dragonfly lua supports doubles
@pytest.mark.unsupported_server_types("dragonfly")
def test_eval_incrbyfloat(r: redis.Redis):
    r.set("foo", 0.5)
    val = r.eval(
        """
        local value = redis.call("INCRBYFLOAT", KEYS[1], 2.0);
        return type(value) == "string" and tonumber(value) == 2.5;
        """,
        1,
        "foo",
    )
    assert val == 1


def test_eval_lrange(r: redis.Redis):
    r.rpush("foo", "a", "b")
    val = r.eval(
        """
        local value = redis.call("LRANGE", KEYS[1], 0, -1);
        return type(value) == "table" and value[1] == "a" and value[2] == "b";
        """,
        1,
        "foo",
    )
    assert val == 1


def test_eval_ltrim(r: redis.Redis):
    r.rpush("foo", "a", "b", "c", "d")
    val = r.eval(
        """
        local value = redis.call("LTRIM", KEYS[1], 1, 2);
        return type(value) == "table" and value.ok == "OK";
        """,
        1,
        "foo",
    )
    assert val == 1
    assert r.lrange("foo", 0, -1) == [b"b", b"c"]


def test_eval_lset(r: redis.Redis):
    r.rpush("foo", "a", "b")
    val = r.eval(
        """
        local value = redis.call("LSET", KEYS[1], 0, "z");
        return type(value) == "table" and value.ok == "OK";
        """,
        1,
        "foo",
    )
    assert val == 1
    assert r.lrange("foo", 0, -1) == [b"z", b"b"]


def test_eval_sdiff(r: redis.Redis):
    r.sadd("foo", "a", "b", "c", "f", "e", "d")
    r.sadd("bar", "b")
    val = r.eval(
        """
        local value = redis.call("SDIFF", KEYS[1], KEYS[2]);
        if type(value) ~= "table" then
            return redis.error_reply(type(value) .. ", should be table");
        else
            return value;
        end
        """,
        2,
        "foo",
        "bar",
    )
    # Note: while fakeredis sorts the result when using Lua, this isn't
    # actually part of the redis contract (see
    # https://github.com/antirez/redis/issues/5538), and for Redis 5 we
    # need to sort val to pass the test.
    assert sorted(val) == [b"a", b"c", b"d", b"e", b"f"]


def test_script(r: redis.Redis):
    script = r.register_script("return ARGV[1]")
    result = script(args=[42])
    assert result == b"42"


def test_lua_log_no_message(r: redis.Redis):
    script = "redis.log(redis.LOG_DEBUG)"
    script = r.register_script(script)
    with pytest.raises(redis.ResponseError):
        script()


@pytest.mark.unsupported_server_types("dragonfly")
def test_lua_log_wrong_level(r: redis.Redis):
    script = "redis.log(10, 'string')"
    script = r.register_script(script)
    with pytest.raises(redis.ResponseError):
        script()


def test_hscan_cursors_are_bytes(r: redis.Redis):
    r.hset("hkey", "foo", 1)

    result = r.eval(
        """
        local results = redis.call("HSCAN", KEYS[1], "0")
        return results[1]
        """,
        1,
        "hkey",
    )

    assert result == b"0"
    assert isinstance(result, bytes)


@pytest.mark.xfail  # TODO
def test_deleting_while_scan(r: redis.Redis):
    for i in range(100):
        r.set(f"key-{i}", i)

    assert len(r.keys()) == 100

    script = """
        local cursor = 0
        local seen = {}
        repeat
            local result = redis.call('SCAN', cursor)
            for _,key in ipairs(result[2]) do
                seen[#seen+1] = key
                redis.call('DEL', key)
            end
            cursor = tonumber(result[1])
        until cursor == 0
        return seen
    """

    assert len(r.register_script(script)()) == 100
    assert len(r.keys()) == 0


================================================
FILE: tests/fakeredis/test/test_mixins/test_server_commands.py
================================================
from datetime import datetime
from time import sleep

import pytest
import redis
from redis.exceptions import ResponseError


@pytest.mark.unsupported_server_types("dragonfly")
def test_swapdb(r, create_redis):
    r1 = create_redis(3)
    r.set("foo", "abc")
    r.set("bar", "xyz")
    r1.set("foo", "foo")
    r1.set("baz", "baz")
    assert r.swapdb(2, 3)
    assert r.get("foo") == b"foo"
    assert r.get("bar") is None
    assert r.get("baz") == b"baz"
    assert r1.get("foo") == b"abc"
    assert r1.get("bar") == b"xyz"
    assert r1.get("baz") is None


@pytest.mark.unsupported_server_types("dragonfly")
def test_swapdb_same_db(r: redis.Redis):
    assert r.swapdb(1, 1)


def test_save(r: redis.Redis):
    assert r.save()


@pytest.mark.unsupported_server_types("dragonfly")
def test_bgsave(r: redis.Redis):
    assert r.bgsave()
    with pytest.raises(ResponseError):
        r.execute_command("BGSAVE", "SCHEDULE", "FOO")
    with pytest.raises(ResponseError):
        r.execute_command("BGSAVE", "FOO")


def test_lastsave(r: redis.Redis):
    assert isinstance(r.lastsave(), datetime)


@pytest.mark.unsupported_server_types("dragonfly")
@pytest.mark.slow
def test_bgsave_timestamp_update(r: redis.Redis):
    early_timestamp = r.lastsave()
    sleep(1)
    assert r.bgsave()
    sleep(1)
    late_timestamp = r.lastsave()
    assert early_timestamp < late_timestamp


@pytest.mark.slow
def test_save_timestamp_update(r: redis.Redis):
    early_timestamp = r.lastsave()
    sleep(1)
    assert r.save()
    late_timestamp = r.lastsave()
    assert early_timestamp < late_timestamp


def test_dbsize(r: redis.Redis):
    assert r.dbsize() == 0
    r.set("foo", "bar")
    r.set("bar", "foo")
    assert r.dbsize() == 2


def test_flushdb_redispy4(r: redis.Redis):
    r.set("foo", "bar")
    assert r.keys() == [b"foo"]
    assert r.flushdb() is True
    assert r.keys() == []


================================================
FILE: tests/fakeredis/test/test_mixins/test_set_commands.py
================================================
from __future__ import annotations

import os
from datetime import timedelta
from time import sleep

import pytest
import redis
import redis.client
from redis.exceptions import ResponseError


def test_sadd(r: redis.Redis):
    assert r.sadd("foo", "member1") == 1
    assert r.sadd("foo", "member1") == 0
    assert set(r.smembers("foo")) == {b"member1"}
    assert r.sadd("foo", "member2", "member3") == 2
    assert set(r.smembers("foo")) == {b"member1", b"member2", b"member3"}
    assert r.sadd("foo", "member3", "member4") == 1
    assert set(r.smembers("foo")) == {b"member1", b"member2", b"member3", b"member4"}


def test_sadd_redispy_5(r: redis.Redis):
    assert r.sadd("foo", "member1") == 1
    assert r.sadd("foo", "member1") == 0
    assert r.smembers("foo") == {b"member1"}
    assert r.sadd("foo", "member2", "member3") == 2
    assert r.smembers("foo") == {b"member1", b"member2", b"member3"}
    assert r.sadd("foo", "member3", "member4") == 1
    assert r.smembers("foo") == {b"member1", b"member2", b"member3", b"member4"}


def test_sadd_as_str_type(r: redis.Redis):
    assert r.sadd("foo", *range(3)) == 3
    assert set(r.smembers("foo")) == {b"0", b"1", b"2"}


def test_sadd_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.sadd("foo", "member2")


def test_scard(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("foo", "member2")
    assert r.scard("foo") == 2


def test_scard_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.scard("foo")


def test_sdiff(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert set(r.sdiff("foo", "bar")) == {b"member1"}
    # Original sets shouldn't be modified.
    assert set(r.smembers("foo")) == {b"member1", b"member2"}
    assert set(r.smembers("bar")) == {b"member2", b"member3"}


def test_sdiff_one_key(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    assert set(r.sdiff("foo")) == {b"member1", b"member2"}


def test_sdiff_empty(r: redis.Redis):
    assert set(r.sdiff("foo")) == set()


def test_sdiff_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.sdiff("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.sdiff("bar", "foo")


def test_sdiffstore(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert r.sdiffstore("baz", "foo", "bar") == 1

    # Catch instances where we store bytes and strings inconsistently
    # and thus baz = {'member1', b'member1'}
    r.sadd("baz", "member1")
    assert r.scard("baz") == 1


def test_sinter(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert set(r.sinter("foo", "bar")) == {b"member2"}
    assert set(r.sinter("foo")) == {b"member1", b"member2"}


def test_sinter_bytes_keys(r: redis.Redis):
    foo = os.urandom(10)
    bar = os.urandom(10)
    r.sadd(foo, "member1")
    r.sadd(foo, "member2")
    r.sadd(bar, "member2")
    r.sadd(bar, "member3")
    assert set(r.sinter(foo, bar)) == {b"member2"}
    assert set(r.sinter(foo)) == {b"member1", b"member2"}


def test_sinter_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.sinter("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.sinter("bar", "foo")


def test_sinterstore(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert r.sinterstore("baz", "foo", "bar") == 1

    # Catch instances where we store bytes and strings inconsistently
    # and thus baz = {'member2', b'member2'}
    r.sadd("baz", "member2")
    assert r.scard("baz") == 1


def test_sismember(r: redis.Redis):
    assert not r.sismember("foo", "member1")
    r.sadd("foo", "member1")
    assert r.sismember("foo", "member1")


def test_smismember(r: redis.Redis):
    assert r.smismember("foo", ["member1", "member2", "member3"]) == [0, 0, 0]
    r.sadd("foo", "member1", "member2", "member3")
    assert r.smismember("foo", ["member1", "member2", "member3"]) == [1, 1, 1]
    assert r.smismember("foo", ["member1", "member2", "member3", "member4"]) == [
        1,
        1,
        1,
        0,
    ]
    assert r.smismember("foo", ["member4", "member2", "member3"]) == [0, 1, 1]
    # should also work if provided values as arguments
    assert r.smismember("foo", "member4", "member2", "member3") == [0, 1, 1]


def test_smismember_wrong_type(r: redis.Redis):
    # verify that command fails when the key itself is not a SET
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.smismember("foo", "member")

    # verify that command fails if the input parameter is of wrong type
    r.sadd("foo2", "member1", "member2", "member3")
    with pytest.raises(redis.DataError, match="Invalid input of type"):
        r.smismember("foo2", [["member1", "member2"]])


def test_sismember_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.sismember("foo", "member")


def test_smembers(r: redis.Redis):
    assert set(r.smembers("foo")) == set()


def test_smembers_copy(r: redis.Redis):
    r.sadd("foo", "member1")
    ret = r.smembers("foo")
    r.sadd("foo", "member2")
    assert r.smembers("foo") != ret


def test_smembers_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.smembers("foo")


def test_smembers_runtime_error(r: redis.Redis):
    r.sadd("foo", "member1", "member2")
    for member in r.smembers("foo"):
        r.srem("foo", member)


def test_smove(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    assert r.smove("foo", "bar", "member1")
    assert set(r.smembers("bar")) == {b"member1"}


def test_smove_non_existent_key(r: redis.Redis):
    assert not r.smove("foo", "bar", "member1")


def test_smove_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.smove("bar", "foo", "member")
    # Must raise the error before removing member from bar
    assert set(r.smembers("bar")) == {b"member"}
    with pytest.raises(redis.ResponseError):
        r.smove("foo", "bar", "member")


def test_spop(r: redis.Redis):
    # This is tricky because it pops a random element.
    r.sadd("foo", "member1")
    assert r.spop("foo") == b"member1"
    assert r.spop("foo") is None


def test_spop_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.spop("foo")


def test_srandmember(r: redis.Redis):
    r.sadd("foo", "member1")
    assert r.srandmember("foo") == b"member1"
    # Shouldn't be removed from the set.
    assert r.srandmember("foo") == b"member1"


def test_srandmember_number(r: redis.Redis):
    """srandmember works with the number argument."""
    assert r.srandmember("foo", 2) == []
    r.sadd("foo", b"member1")
    assert r.srandmember("foo", 2) == [b"member1"]
    r.sadd("foo", b"member2")
    assert set(r.srandmember("foo", 2)) == {b"member1", b"member2"}
    r.sadd("foo", b"member3")
    res = r.srandmember("foo", 2)
    assert len(res) == 2
    for e in res:
        assert e in {b"member1", b"member2", b"member3"}


def test_srandmember_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.srandmember("foo")


def test_srem(r: redis.Redis):
    r.sadd("foo", "member1", "member2", "member3", "member4")
    assert set(r.smembers("foo")) == {b"member1", b"member2", b"member3", b"member4"}
    assert r.srem("foo", "member1") == 1
    assert set(r.smembers("foo")) == {b"member2", b"member3", b"member4"}
    assert r.srem("foo", "member1") == 0
    # Since redis>=2.7.6 returns number of deleted items.
    assert r.srem("foo", "member2", "member3") == 2
    assert set(r.smembers("foo")) == {b"member4"}
    assert r.srem("foo", "member3", "member4") == 1
    assert set(r.smembers("foo")) == set()
    assert r.srem("foo", "member3", "member4") == 0


def test_srem_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    with pytest.raises(redis.ResponseError):
        r.srem("foo", "member")


def test_sunion(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert set(r.sunion("foo", "bar")) == {b"member1", b"member2", b"member3"}


def test_sunion_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.sunion("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.sunion("bar", "foo")


def test_sunionstore(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert r.sunionstore("baz", "foo", "bar") == 3
    assert set(r.smembers("baz")) == {b"member1", b"member2", b"member3"}

    # Catch instances where we store bytes and strings inconsistently
    # and thus baz = {b'member1', b'member2', b'member3', 'member3'}
    r.sadd("baz", "member3")
    assert r.scard("baz") == 3


def test_empty_set(r: redis.Redis):
    r.sadd("foo", "bar")
    r.srem("foo", "bar")
    assert not r.exists("foo")


def test_sscan(r: redis.Redis):
    # Set up the data
    name = "sscan-test"
    for ix in range(20):
        k = "sscan-test:%s" % ix
        r.sadd(name, k)
    expected = r.smembers(name)
    assert len(expected) == 20  # Ensure we know what we're testing

    # Test that we page through the results and get everything out
    results = []
    cursor = "0"
    while cursor != 0:
        cursor, data = r.sscan(name, cursor, count=6)
        results.extend(data)
    assert set(expected) == set(results)

    # Test the iterator version
    results = [r for r in r.sscan_iter(name, count=6)]
    assert set(expected) == set(results)

    # Now test that the MATCH functionality works
    results = []
    cursor = "0"
    while cursor != 0:
        cursor, data = r.sscan(name, cursor, match="*7", count=100)
        results.extend(data)
    assert b"sscan-test:7" in results
    assert b"sscan-test:17" in results
    assert len(results) == 2

    # Test the match on iterator
    results = [r for r in r.sscan_iter(name, match="*7")]
    assert b"sscan-test:7" in results
    assert b"sscan-test:17" in results
    assert len(results) == 2


@pytest.mark.min_server("7")
def test_sintercard(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert r.sintercard(2, ["foo", "bar"]) == 1
    assert r.sintercard(1, ["foo"]) == 2


@pytest.mark.min_server("7")
def test_sintercard_key_doesnt_exist(r: redis.Redis):
    r.sadd("foo", "member1")
    r.sadd("foo", "member2")
    r.sadd("bar", "member2")
    r.sadd("bar", "member3")
    assert r.sintercard(2, ["foo", "bar"]) == 1
    assert r.sintercard(1, ["foo"]) == 2
    assert r.sintercard(1, ["foo"], limit=1) == 1
    assert r.sintercard(3, ["foo", "bar", "ddd"]) == 0


@pytest.mark.min_server("7")
def test_sintercard_bytes_keys(r: redis.Redis):
    foo = os.urandom(10)
    bar = os.urandom(10)
    r.sadd(foo, "member1")
    r.sadd(foo, "member2")
    r.sadd(bar, "member2")
    r.sadd(bar, "member3")
    assert r.sintercard(2, [foo, bar]) == 1
    assert r.sintercard(1, [foo]) == 2
    assert r.sintercard(1, [foo], limit=1) == 1


@pytest.mark.min_server("7")
def test_sintercard_wrong_type(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.sintercard(2, ["foo", "bar"])
    with pytest.raises(redis.ResponseError):
        r.sintercard(2, ["bar", "foo"])


@pytest.mark.min_server("7")
def test_sintercard_syntax_error(r: redis.Redis):
    r.zadd("foo", {"member": 1})
    r.sadd("bar", "member")
    with pytest.raises(redis.ResponseError):
        r.sintercard(3, ["foo", "bar"])
    with pytest.raises(redis.ResponseError):
        r.sintercard(1, ["bar", "foo"])
    with pytest.raises(redis.ResponseError):
        r.sintercard(1, ["bar", "foo"], limit="x")


def test_pfadd(r: redis.Redis):
    key = "hll-pfadd"
    assert r.pfadd(key, "a", "b", "c", "d", "e", "f", "g") == 1
    assert r.pfcount(key) == 7


def test_pfcount(r: redis.Redis):
    key1 = "hll-pfcount01"
    key2 = "hll-pfcount02"
    key3 = "hll-pfcount03"
    assert r.pfadd(key1, "foo", "bar", "zap") == 1
    assert r.pfadd(key1, "zap", "zap", "zap") == 0
    assert r.pfadd(key1, "foo", "bar") == 0
    assert r.pfcount(key1) == 3
    assert r.pfadd(key2, "1", "2", "3") == 1
    assert r.pfcount(key2) == 3
    assert r.pfcount(key1, key2) == 6
    assert r.pfadd(key3, "foo", "bar", "zip") == 1
    assert r.pfcount(key3) == 3
    assert r.pfcount(key1, key3) == 4
    assert r.pfcount(key1, key2, key3) == 7


def test_pfmerge(r: redis.Redis):
    key1 = "hll-pfmerge01"
    key2 = "hll-pfmerge02"
    key3 = "hll-pfmerge03"
    assert r.pfadd(key1, "foo", "bar", "zap", "a") == 1
    assert r.pfadd(key2, "a", "b", "c", "foo") == 1
    assert r.pfmerge(key3, key1, key2)
    assert r.pfcount(key3) == 6


@pytest.mark.slow
def test_set_ex_should_expire_value(r: redis.Redis):
    r.set("foo", "bar")
    assert r.get("foo") == b"bar"
    r.set("foo", "bar", ex=1)
    sleep(2)
    assert r.get("foo") is None


@pytest.mark.slow
def test_set_px_should_expire_value(r: redis.Redis):
    r.set("foo", "bar", px=500)
    sleep(1.5)
    assert r.get("foo") is None


@pytest.mark.slow
def test_psetex_expire_value(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.psetex("foo", 0, "bar")
    r.psetex("foo", 500, "bar")
    sleep(1.5)
    assert r.get("foo") is None


@pytest.mark.slow
def test_psetex_expire_value_using_timedelta(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.psetex("foo", timedelta(seconds=0), "bar")
    r.psetex("foo", timedelta(seconds=0.5), "bar")
    sleep(1.5)
    assert r.get("foo") is None


================================================
FILE: tests/fakeredis/test/test_mixins/test_sortedset_commands.py
================================================
from __future__ import annotations

import math
from collections import OrderedDict
from typing import Tuple, List, Optional

import pytest
import redis
import redis.client

from test import testtools


def round_str(x):
    assert isinstance(x, bytes)
    return round(float(x))


def test_zpopmin(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zpopmin("foo", count=2) == [(b"one", 1.0), (b"two", 2.0)]
    assert r.zpopmin("foo", count=2) == [(b"three", 3.0)]


def test_zpopmin_too_many(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zpopmin("foo", count=5) == [(b"one", 1.0), (b"two", 2.0), (b"three", 3.0)]


def test_zpopmax(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zpopmax("foo", count=2) == [(b"three", 3.0), (b"two", 2.0)]
    assert r.zpopmax("foo", count=2) == [(b"one", 1.0)]


def test_zrange_same_score(r: redis.Redis):
    r.zadd("foo", {"two_a": 2})
    r.zadd("foo", {"two_b": 2})
    r.zadd("foo", {"two_c": 2})
    r.zadd("foo", {"two_d": 2})
    r.zadd("foo", {"two_e": 2})
    assert r.zrange("foo", 2, 3) == [b"two_c", b"two_d"]


def test_zrange_with_bylex_and_byscore(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "zrange", "foo", "(t", "+", "bylex", "byscore")


def test_zrange_with_rev_and_bylex(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrange("foo", b"+", b"(t", desc=True, bylex=True) == [
        b"two_b",
        b"two_a",
        b"three_a",
    ]
    assert r.zrange("foo", b"[two_b", b"(t", desc=True, bylex=True) == [
        b"two_b",
        b"two_a",
        b"three_a",
    ]
    assert r.zrange("foo", b"(two_b", b"(t", desc=True, bylex=True) == [
        b"two_a",
        b"three_a",
    ]
    assert r.zrange("foo", b"[two_b", b"[three_a", desc=True, bylex=True) == [
        b"two_b",
        b"two_a",
        b"three_a",
    ]
    assert r.zrange("foo", b"[two_b", b"(three_a", desc=True, bylex=True) == [
        b"two_b",
        b"two_a",
    ]
    assert r.zrange("foo", b"(two_b", b"-", desc=True, bylex=True) == [
        b"two_a",
        b"three_a",
        b"one_a",
    ]
    assert r.zrange("foo", b"(two_b", b"[two_b", bylex=True) == []
    # reversed max + and min - boundaries
    # these will be always empty, but allowed by redis
    assert r.zrange("foo", b"-", b"+", desc=True, bylex=True) == []
    assert r.zrange("foo", b"[three_a", b"+", desc=True, bylex=True) == []
    assert r.zrange("foo", b"-", b"[o", desc=True, bylex=True) == []


def test_zrange_with_bylex(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrange("foo", b"(t", b"+", bylex=True) == [b"three_a", b"two_a", b"two_b"]
    assert r.zrange("foo", b"(t", b"[two_b", bylex=True) == [
        b"three_a",
        b"two_a",
        b"two_b",
    ]
    assert r.zrange("foo", b"(t", b"(two_b", bylex=True) == [b"three_a", b"two_a"]
    assert r.zrange("foo", b"[three_a", b"[two_b", bylex=True) == [
        b"three_a",
        b"two_a",
        b"two_b",
    ]
    assert r.zrange("foo", b"(three_a", b"[two_b", bylex=True) == [b"two_a", b"two_b"]
    assert r.zrange("foo", b"-", b"(two_b", bylex=True) == [
        b"one_a",
        b"three_a",
        b"two_a",
    ]
    assert r.zrange("foo", b"[two_b", b"(two_b", bylex=True) == []
    # reversed max + and min - boundaries
    # these will be always empty, but allowed by redis
    assert r.zrange("foo", b"+", b"-", bylex=True) == []
    assert r.zrange("foo", b"+", b"[three_a", bylex=True) == []
    assert r.zrange("foo", b"[o", b"-", bylex=True) == []


def test_zrange_with_byscore(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"two_a_also": 2})
    r.zadd("foo", {"two_b_also": 2})
    r.zadd("foo", {"four": 4})
    assert r.zrange("foo", 1, 3, byscore=True) == [b"two", b"two_a_also", b"two_b_also"]
    assert r.zrange("foo", 2, 3, byscore=True) == [b"two", b"two_a_also", b"two_b_also"]
    assert r.zrange("foo", 0, 4, byscore=True) == [
        b"zero",
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]
    assert r.zrange("foo", "-inf", 1, byscore=True) == [b"zero"]
    assert r.zrange("foo", 2, "+inf", byscore=True) == [
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]
    assert r.zrange("foo", "-inf", "+inf", byscore=True) == [
        b"zero",
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]


def test_zcard(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    assert r.zcard("foo") == 2


def test_zcard_non_existent_key(r: redis.Redis):
    assert r.zcard("foo") == 0


def test_zcard_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zcard("foo")


def test_zcount(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"three": 2})
    r.zadd("foo", {"five": 5})
    assert r.zcount("foo", 2, 4) == 1
    assert r.zcount("foo", 1, 4) == 2
    assert r.zcount("foo", 0, 5) == 3
    assert r.zcount("foo", 4, "+inf") == 1
    assert r.zcount("foo", "-inf", 4) == 2
    assert r.zcount("foo", "-inf", "+inf") == 3


def test_zcount_exclusive(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"three": 2})
    r.zadd("foo", {"five": 5})
    assert r.zcount("foo", "-inf", "(2") == 1
    assert r.zcount("foo", "-inf", 2) == 2
    assert r.zcount("foo", "(5", "+inf") == 0
    assert r.zcount("foo", "(1", 5) == 2
    assert r.zcount("foo", "(2", "(5") == 0
    assert r.zcount("foo", "(1", "(5") == 1
    assert r.zcount("foo", 2, "(5") == 1


def test_zcount_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zcount("foo", "-inf", "+inf")


def test_zincrby(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    assert r.zincrby("foo", 10, "one") == 11
    assert r.zrange("foo", 0, -1, withscores=True) == [(b"one", 11)]


def test_zincrby_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zincrby("foo", 10, "one")


def test_zrange_descending(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrange("foo", 0, -1, desc=True) == [b"three", b"two", b"one"]


def test_zrange_descending_with_scores(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrange("foo", 0, -1, desc=True, withscores=True) == [
        (b"three", 3),
        (b"two", 2),
        (b"one", 1),
    ]


def test_zrange_with_positive_indices(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrange("foo", 0, 1) == [b"one", b"two"]


def test_zrange_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrange("foo", 0, -1)


def test_zrange_score_cast(r: redis.Redis):
    r.zadd("foo", {"one": 1.2})
    r.zadd("foo", {"two": 2.2})

    expected_without_cast_round = [(b"one", 1.2), (b"two", 2.2)]
    expected_with_cast_round = [(b"one", 1.0), (b"two", 2.0)]
    assert r.zrange("foo", 0, 2, withscores=True) == expected_without_cast_round
    assert (
        r.zrange("foo", 0, 2, withscores=True, score_cast_func=round_str)
        == expected_with_cast_round
    )


def test_zrank(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrank("foo", "one") == 0
    assert r.zrank("foo", "two") == 1
    assert r.zrank("foo", "three") == 2


def test_zrank_non_existent_member(r: redis.Redis):
    assert r.zrank("foo", "one") is None


def test_zrank_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrank("foo", "one")


def test_zrem(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    r.zadd("foo", {"four": 4})
    assert r.zrem("foo", "one") == 1
    assert r.zrange("foo", 0, -1) == [b"two", b"three", b"four"]
    # Since redis>=2.7.6 returns number of deleted items.
    assert r.zrem("foo", "two", "three") == 2
    assert r.zrange("foo", 0, -1) == [b"four"]
    assert r.zrem("foo", "three", "four") == 1
    assert r.zrange("foo", 0, -1) == []
    assert r.zrem("foo", "three", "four") == 0


def test_zrem_non_existent_member(r: redis.Redis):
    assert not r.zrem("foo", "one")


def test_zrem_numeric_member(r: redis.Redis):
    r.zadd("foo", {"128": 13.0, "129": 12.0})
    assert r.zrem("foo", 128) == 1
    assert r.zrange("foo", 0, -1) == [b"129"]


def test_zrem_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrem("foo", "bar")


def test_zscore(r: redis.Redis):
    r.zadd("foo", {"one": 54})
    assert r.zscore("foo", "one") == 54


def test_zscore_non_existent_member(r: redis.Redis):
    assert r.zscore("foo", "one") is None


def test_zscore_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zscore("foo", "one")


def test_zmscore(r: redis.Redis):
    """When all the requested sorted-set members are in the cache, a valid
    float value should be returned for each requested member.

    The order of the returned scores should always match the order in
    which the set members were supplied.
    """
    cache_key: str = "scored-set-members"
    members: Tuple[str, ...] = ("one", "two", "three", "four", "five", "six")
    scores: Tuple[float, ...] = (1.1, 2.2, 3.3, 4.4, 5.5, 6.6)

    r.zadd(cache_key, dict(zip(members, scores)))
    cached_scores: List[Optional[float]] = r.zmscore(
        cache_key,
        list(members),
    )

    assert all(cached_scores[idx] == score for idx, score in enumerate(scores))


def test_zmscore_missing_members(r: redis.Redis):
    """When none of the requested sorted-set members are in the cache, a value
    of `None` should be returned once for each requested member."""
    cache_key: str = "scored-set-members"
    members: Tuple[str, ...] = ("one", "two", "three", "four", "five", "six")

    r.zadd(cache_key, {"eight": 8.8})
    cached_scores: List[Optional[float]] = r.zmscore(
        cache_key,
        list(members),
    )

    assert all(score is None for score in cached_scores)


def test_zmscore_mixed_membership(r: redis.Redis):
    """When only some requested sorted-set members are in the cache, a
    valid float value should be returned for each present member and `None` for
    each missing member.

    The order of the returned scores should always match the order in
    which the set members were supplied.
    """
    cache_key: str = "scored-set-members"
    members: Tuple[str, ...] = ("one", "two", "three", "four", "five", "six")
    scores: Tuple[float, ...] = (1.1, 2.2, 3.3, 4.4, 5.5, 6.6)

    r.zadd(
        cache_key,
        dict((member, scores[idx]) for (idx, member) in enumerate(members) if idx % 2 != 0),
    )

    cached_scores: List[Optional[float]] = r.zmscore(cache_key, list(members))

    assert all(cached_scores[idx] is None for (idx, score) in enumerate(scores) if idx % 2 == 0)
    assert all(cached_scores[idx] == score for (idx, score) in enumerate(scores) if idx % 2 != 0)


def test_zrevrank(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrevrank("foo", "one") == 2
    assert r.zrevrank("foo", "two") == 1
    assert r.zrevrank("foo", "three") == 0


def test_zrevrank_non_existent_member(r: redis.Redis):
    assert r.zrevrank("foo", "one") is None


def test_zrevrank_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrevrank("foo", "one")


def test_zrevrange(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrevrange("foo", 0, 1) == [b"three", b"two"]
    assert r.zrevrange("foo", 0, -1) == [b"three", b"two", b"one"]


def test_zrevrange_sorted_keys(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"two_b": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrevrange("foo", 0, 2) == [b"three", b"two_b", b"two"]
    assert r.zrevrange("foo", 0, -1) == [b"three", b"two_b", b"two", b"one"]


def test_zrevrange_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrevrange("foo", 0, 2)


def test_zrevrange_score_cast(r: redis.Redis):
    r.zadd("foo", {"one": 1.2})
    r.zadd("foo", {"two": 2.2})

    expected_without_cast_round = [(b"two", 2.2), (b"one", 1.2)]
    expected_with_cast_round = [(b"two", 2.0), (b"one", 1.0)]
    assert r.zrevrange("foo", 0, 2, withscores=True) == expected_without_cast_round
    assert (
        r.zrevrange("foo", 0, 2, withscores=True, score_cast_func=round_str)
        == expected_with_cast_round
    )


def test_zrange_with_large_int(r: redis.Redis):
    with pytest.raises(redis.ResponseError, match="value is not an integer or out of range"):
        r.zrange("", 0, 9223372036854775808)
    with pytest.raises(redis.ResponseError, match="value is not an integer or out of range"):
        r.zrange("", 0, -9223372036854775809)


def test_zrangebyscore(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"two_a_also": 2})
    r.zadd("foo", {"two_b_also": 2})
    r.zadd("foo", {"four": 4})
    assert r.zrangebyscore("foo", 1, 3) == [b"two", b"two_a_also", b"two_b_also"]
    assert r.zrangebyscore("foo", 2, 3) == [b"two", b"two_a_also", b"two_b_also"]
    assert r.zrangebyscore("foo", 0, 4) == [
        b"zero",
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]
    assert r.zrangebyscore("foo", "-inf", 1) == [b"zero"]
    assert r.zrangebyscore("foo", 2, "+inf") == [
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]
    assert r.zrangebyscore("foo", "-inf", "+inf") == [
        b"zero",
        b"two",
        b"two_a_also",
        b"two_b_also",
        b"four",
    ]


def test_zrangebysore_exclusive(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"four": 4})
    r.zadd("foo", {"five": 5})
    assert r.zrangebyscore("foo", "(0", 6) == [b"two", b"four", b"five"]
    assert r.zrangebyscore("foo", "(2", "(5") == [b"four"]
    assert r.zrangebyscore("foo", 0, "(4") == [b"zero", b"two"]


def test_zrangebyscore_raises_error(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    with pytest.raises(redis.ResponseError):
        r.zrangebyscore("foo", "one", 2)
    with pytest.raises(redis.ResponseError):
        r.zrangebyscore("foo", 2, "three")
    with pytest.raises(redis.ResponseError):
        r.zrangebyscore("foo", 2, "3)")
    with pytest.raises(redis.RedisError):
        r.zrangebyscore("foo", 2, "3)", 0, None)


def test_zrangebyscore_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrangebyscore("foo", "(1", "(2")


def test_zrangebyscore_slice(r: redis.Redis):
    r.zadd("foo", {"two_a": 2})
    r.zadd("foo", {"two_b": 2})
    r.zadd("foo", {"two_c": 2})
    r.zadd("foo", {"two_d": 2})
    assert r.zrangebyscore("foo", 0, 4, 0, 2) == [b"two_a", b"two_b"]
    assert r.zrangebyscore("foo", 0, 4, 1, 3) == [b"two_b", b"two_c", b"two_d"]


def test_zrangebyscore_withscores(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrangebyscore("foo", 1, 3, 0, 2, True) == [(b"one", 1), (b"two", 2)]


def test_zrangebyscore_cast_scores(r: redis.Redis):
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"two_a_also": 2.2})

    expected_without_cast_round = [(b"two", 2.0), (b"two_a_also", 2.2)]
    expected_with_cast_round = [(b"two", 2.0), (b"two_a_also", 2.0)]
    assert sorted(r.zrangebyscore("foo", 2, 3, withscores=True)) == sorted(
        expected_without_cast_round
    )
    assert sorted(
        r.zrangebyscore("foo", 2, 3, withscores=True, score_cast_func=round_str)
    ) == sorted(expected_with_cast_round)


def test_zrevrangebyscore(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrevrangebyscore("foo", 3, 1) == [b"three", b"two", b"one"]
    assert r.zrevrangebyscore("foo", 3, 2) == [b"three", b"two"]
    assert r.zrevrangebyscore("foo", 3, 1, 0, 1) == [b"three"]
    assert r.zrevrangebyscore("foo", 3, 1, 1, 2) == [b"two", b"one"]


def test_zrevrangebyscore_exclusive(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zrevrangebyscore("foo", "(3", 1) == [b"two", b"one"]
    assert r.zrevrangebyscore("foo", 3, "(2") == [b"three"]
    assert r.zrevrangebyscore("foo", "(3", "(1") == [b"two"]
    assert r.zrevrangebyscore("foo", "(2", 1, 0, 1) == [b"one"]
    assert r.zrevrangebyscore("foo", "(2", "(1", 0, 1) == []
    assert r.zrevrangebyscore("foo", "(3", "(0", 1, 2) == [b"one"]


def test_zrevrangebyscore_raises_error(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    with pytest.raises(redis.ResponseError):
        r.zrevrangebyscore("foo", "three", 1)
    with pytest.raises(redis.ResponseError):
        r.zrevrangebyscore("foo", 3, "one")
    with pytest.raises(redis.ResponseError):
        r.zrevrangebyscore("foo", 3, "1)")
    with pytest.raises(redis.ResponseError):
        r.zrevrangebyscore("foo", "((3", "1)")


def test_zrevrangebyscore_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrevrangebyscore("foo", "(3", "(1")


def test_zrevrangebyscore_cast_scores(r: redis.Redis):
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"two_a_also": 2.2})

    assert r.zrevrangebyscore("foo", 3, 2, withscores=True) == [
        (b"two_a_also", 2.2),
        (b"two", 2.0),
    ]

    assert r.zrevrangebyscore("foo", 3, 2, withscores=True, score_cast_func=round_str) == [
        (b"two_a_also", 2.0),
        (b"two", 2.0),
    ]


def test_zrangebylex(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrangebylex("foo", b"(t", b"+") == [b"three_a", b"two_a", b"two_b"]
    assert r.zrangebylex("foo", b"(t", b"[two_b") == [b"three_a", b"two_a", b"two_b"]
    assert r.zrangebylex("foo", b"(t", b"(two_b") == [b"three_a", b"two_a"]
    assert r.zrangebylex("foo", b"[three_a", b"[two_b") == [
        b"three_a",
        b"two_a",
        b"two_b",
    ]
    assert r.zrangebylex("foo", b"(three_a", b"[two_b") == [b"two_a", b"two_b"]
    assert r.zrangebylex("foo", b"-", b"(two_b") == [b"one_a", b"three_a", b"two_a"]
    assert r.zrangebylex("foo", b"[two_b", b"(two_b") == []
    # reversed max + and min - boundaries
    # these will be always empty, but allowed by redis
    assert r.zrangebylex("foo", b"+", b"-") == []
    assert r.zrangebylex("foo", b"+", b"[three_a") == []
    assert r.zrangebylex("foo", b"[o", b"-") == []


def test_zrangebylex_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"-", b"+")


def test_zlexcount(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zlexcount("foo", b"(t", b"+") == 3
    assert r.zlexcount("foo", b"(t", b"[two_b") == 3
    assert r.zlexcount("foo", b"(t", b"(two_b") == 2
    assert r.zlexcount("foo", b"[three_a", b"[two_b") == 3
    assert r.zlexcount("foo", b"(three_a", b"[two_b") == 2
    assert r.zlexcount("foo", b"-", b"(two_b") == 3
    assert r.zlexcount("foo", b"[two_b", b"(two_b") == 0
    # reversed max + and min - boundaries
    # these will be always empty, but allowed by redis
    assert r.zlexcount("foo", b"+", b"-") == 0
    assert r.zlexcount("foo", b"+", b"[three_a") == 0
    assert r.zlexcount("foo", b"[o", b"-") == 0


def test_zlexcount_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zlexcount("foo", b"-", b"+")


def test_zrangebylex_with_limit(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrangebylex("foo", b"-", b"+", 1, 2) == [b"three_a", b"two_a"]

    # negative offset no results
    assert r.zrangebylex("foo", b"-", b"+", -1, 3) == []

    # negative limit ignored
    assert r.zrangebylex("foo", b"-", b"+", 0, -2) == [
        b"one_a",
        b"three_a",
        b"two_a",
        b"two_b",
    ]
    assert r.zrangebylex("foo", b"-", b"+", 1, -2) == [b"three_a", b"two_a", b"two_b"]
    assert r.zrangebylex("foo", b"+", b"-", 1, 1) == []


def test_zrangebylex_raises_error(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})

    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"", b"[two_b")

    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"-", b"two_b")

    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"(t", b"two_b")

    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"t", b"+")

    with pytest.raises(redis.ResponseError):
        r.zrangebylex("foo", b"[two_a", b"")

    with pytest.raises(redis.RedisError):
        r.zrangebylex("foo", b"(two_a", b"[two_b", 1)


def test_zrevrangebylex(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrevrangebylex("foo", b"+", b"(t") == [b"two_b", b"two_a", b"three_a"]
    assert r.zrevrangebylex("foo", b"[two_b", b"(t") == [b"two_b", b"two_a", b"three_a"]
    assert r.zrevrangebylex("foo", b"(two_b", b"(t") == [b"two_a", b"three_a"]
    assert r.zrevrangebylex("foo", b"[two_b", b"[three_a") == [
        b"two_b",
        b"two_a",
        b"three_a",
    ]
    assert r.zrevrangebylex("foo", b"[two_b", b"(three_a") == [b"two_b", b"two_a"]
    assert r.zrevrangebylex("foo", b"(two_b", b"-") == [b"two_a", b"three_a", b"one_a"]
    assert r.zrangebylex("foo", b"(two_b", b"[two_b") == []
    # reversed max + and min - boundaries
    # these will be always empty, but allowed by redis
    assert r.zrevrangebylex("foo", b"-", b"+") == []
    assert r.zrevrangebylex("foo", b"[three_a", b"+") == []
    assert r.zrevrangebylex("foo", b"-", b"[o") == []


def test_zrevrangebylex_with_limit(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zrevrangebylex("foo", b"+", b"-", 1, 2) == [b"two_a", b"three_a"]


def test_zrevrangebylex_raises_error(r: redis.Redis):
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"three_a": 0})

    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"[two_b", b"")

    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"two_b", b"-")

    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"two_b", b"(t")

    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"+", b"t")

    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"", b"[two_a")

    with pytest.raises(redis.RedisError):
        r.zrevrangebylex("foo", b"[two_a", b"(two_b", 1)


def test_zrevrangebylex_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zrevrangebylex("foo", b"+", b"-")


def test_zremrangebyrank(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zremrangebyrank("foo", 0, 1) == 2
    assert r.zrange("foo", 0, -1) == [b"three"]


def test_zremrangebyrank_negative_indices(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zremrangebyrank("foo", -2, -1) == 2
    assert r.zrange("foo", 0, -1) == [b"one"]


def test_zremrangebyrank_out_of_bounds(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    assert r.zremrangebyrank("foo", 1, 3) == 0


def test_zremrangebyrank_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zremrangebyrank("foo", 1, 3)


def test_zremrangebyscore(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"four": 4})
    # Outside of range.
    assert r.zremrangebyscore("foo", 5, 10) == 0
    assert r.zrange("foo", 0, -1) == [b"zero", b"two", b"four"]
    # Middle of range.
    assert r.zremrangebyscore("foo", 1, 3) == 1
    assert r.zrange("foo", 0, -1) == [b"zero", b"four"]
    assert r.zremrangebyscore("foo", 1, 3) == 0
    # Entire range.
    assert r.zremrangebyscore("foo", 0, 4) == 2
    assert r.zrange("foo", 0, -1) == []


def test_zremrangebyscore_exclusive(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"four": 4})
    assert r.zremrangebyscore("foo", "(0", 1) == 0
    assert r.zrange("foo", 0, -1) == [b"zero", b"two", b"four"]
    assert r.zremrangebyscore("foo", "-inf", "(0") == 0
    assert r.zrange("foo", 0, -1) == [b"zero", b"two", b"four"]
    assert r.zremrangebyscore("foo", "(2", 5) == 1
    assert r.zrange("foo", 0, -1) == [b"zero", b"two"]
    assert r.zremrangebyscore("foo", 0, "(2") == 1
    assert r.zrange("foo", 0, -1) == [b"two"]
    assert r.zremrangebyscore("foo", "(1", "(3") == 1
    assert r.zrange("foo", 0, -1) == []


def test_zremrangebyscore_raises_error(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"four": 4})
    with pytest.raises(redis.ResponseError):
        r.zremrangebyscore("foo", "three", 1)
    with pytest.raises(redis.ResponseError):
        r.zremrangebyscore("foo", 3, "one")
    with pytest.raises(redis.ResponseError):
        r.zremrangebyscore("foo", 3, "1)")
    with pytest.raises(redis.ResponseError):
        r.zremrangebyscore("foo", "((3", "1)")


def test_zremrangebyscore_badkey(r: redis.Redis):
    assert r.zremrangebyscore("foo", 0, 2) == 0


def test_zremrangebyscore_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zremrangebyscore("foo", 0, 2)


def test_zremrangebylex(r: redis.Redis):
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"three_a": 0})
    assert r.zremrangebylex("foo", b"(three_a", b"[two_b") == 2
    assert r.zremrangebylex("foo", b"(three_a", b"[two_b") == 0
    assert r.zremrangebylex("foo", b"-", b"(o") == 0
    assert r.zremrangebylex("foo", b"-", b"[one_a") == 1
    assert r.zremrangebylex("foo", b"[tw", b"+") == 0
    assert r.zremrangebylex("foo", b"[t", b"+") == 1
    assert r.zremrangebylex("foo", b"[t", b"+") == 0


def test_zremrangebylex_error(r: redis.Redis):
    r.zadd("foo", {"two_a": 0})
    r.zadd("foo", {"two_b": 0})
    r.zadd("foo", {"one_a": 0})
    r.zadd("foo", {"three_a": 0})
    with pytest.raises(redis.ResponseError):
        r.zremrangebylex("foo", b"(t", b"two_b")

    with pytest.raises(redis.ResponseError):
        r.zremrangebylex("foo", b"t", b"+")

    with pytest.raises(redis.ResponseError):
        r.zremrangebylex("foo", b"[two_a", b"")


def test_zremrangebylex_badkey(r: redis.Redis):
    assert r.zremrangebylex("foo", b"(three_a", b"[two_b") == 0


def test_zremrangebylex_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zremrangebylex("foo", b"bar", b"baz")


def test_zunionstore(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zunionstore("baz", ["foo", "bar"])
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 2),
        (b"three", 3),
        (b"two", 4),
    ]


def test_zunionstore_sum(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zunionstore("baz", ["foo", "bar"], aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 2),
        (b"three", 3),
        (b"two", 4),
    ]


def test_zunionstore_max(r: redis.Redis):
    r.zadd("foo", {"one": 0})
    r.zadd("foo", {"two": 0})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zunionstore("baz", ["foo", "bar"], aggregate="MAX")
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 1),
        (b"two", 2),
        (b"three", 3),
    ]


def test_zunionstore_min(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("bar", {"one": 0})
    r.zadd("bar", {"two": 0})
    r.zadd("bar", {"three": 3})
    r.zunionstore("baz", ["foo", "bar"], aggregate="MIN")
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 0),
        (b"two", 0),
        (b"three", 3),
    ]


def test_zunionstore_weights(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"four": 4})
    r.zunionstore("baz", {"foo": 1, "bar": 2}, aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 3),
        (b"two", 6),
        (b"four", 8),
    ]


def test_zunionstore_nan_to_zero(r: redis.Redis):
    r.zadd("foo", {"x": math.inf})
    r.zadd("foo2", {"x": math.inf})
    r.zunionstore("bar", OrderedDict([("foo", 1.0), ("foo2", 0.0)]))
    # This is different to test_zinterstore_nan_to_zero because of a quirk
    # in redis. See https://github.com/antirez/redis/issues/3954.
    assert r.zscore("bar", "x") == math.inf


def test_zunionstore_nan_to_zero2(r: redis.Redis):
    r.zadd("foo", {"zero": 0})
    r.zadd("foo2", {"one": 1})
    r.zadd("foo3", {"one": 1})
    r.zunionstore("bar", {"foo": math.inf}, aggregate="SUM")
    assert r.zrange("bar", 0, -1, withscores=True) == [(b"zero", 0)]
    r.zunionstore("bar", OrderedDict([("foo2", math.inf), ("foo3", -math.inf)]))
    assert r.zrange("bar", 0, -1, withscores=True) == [(b"one", 0)]


@pytest.mark.unsupported_server_types("dragonfly")
def test_zunionstore_nan_to_zero_ordering(r: redis.Redis):
    r.zadd("foo", {"e1": math.inf})
    r.zadd("bar", {"e1": -math.inf, "e2": 0.0})
    r.zunionstore("baz", ["foo", "bar", "foo"])
    assert r.zscore("baz", "e1") == 0.0


def test_zunionstore_mixed_set_types(r: redis.Redis):
    # No score, redis will use 1.0.
    r.sadd("foo", "one")
    r.sadd("foo", "two")
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zunionstore("baz", ["foo", "bar"], aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [
        (b"one", 2),
        (b"three", 3),
        (b"two", 3),
    ]


def test_zunionstore_badkey(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zunionstore("baz", ["foo", "bar"], aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 1), (b"two", 2)]
    r.zunionstore("baz", {"foo": 1, "bar": 2}, aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 1), (b"two", 2)]


def test_zunionstore_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zunionstore("baz", ["foo", "bar"])


def test_zinterstore(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zinterstore("baz", ["foo", "bar"])
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 2), (b"two", 4)]


@pytest.mark.unsupported_server_types("dragonfly")
def test_zinterstore_mixed_set_types(r: redis.Redis):
    r.sadd("foo", "one")
    r.sadd("foo", "two")
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zinterstore("baz", ["foo", "bar"], aggregate="SUM")
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 2), (b"two", 3)]


def test_zinterstore_max(r: redis.Redis):
    r.zadd("foo", {"one": 0})
    r.zadd("foo", {"two": 0})
    r.zadd("bar", {"one": 1})
    r.zadd("bar", {"two": 2})
    r.zadd("bar", {"three": 3})
    r.zinterstore("baz", ["foo", "bar"], aggregate="MAX")
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 1), (b"two", 2)]


def test_zinterstore_onekey(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zinterstore("baz", ["foo"], aggregate="MAX")
    assert r.zrange("baz", 0, -1, withscores=True) == [(b"one", 1)]


def test_zinterstore_nokey(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.zinterstore("baz", [], aggregate="MAX")


@pytest.mark.unsupported_server_types("dragonfly")  # TODO causes a crash!
def test_zinterstore_nan_to_zero(r: redis.Redis):
    r.zadd("foo", {"x": math.inf})
    r.zadd("foo2", {"x": math.inf})
    r.zinterstore("bar", OrderedDict([("foo", 1.0), ("foo2", 0.0)]))
    assert r.zscore("bar", "x") == 0.0


def test_zunionstore_nokey(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.zunionstore("baz", [], aggregate="MAX")


@pytest.mark.unsupported_server_types("dragonfly")  # TODO Hang server
def test_zinterstore_wrong_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zinterstore("baz", ["foo", "bar"])


def test_empty_zset(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zrem("foo", "one")
    assert not r.exists("foo")


def test_zpopmax_too_many(r: redis.Redis):
    r.zadd("foo", {"one": 1})
    r.zadd("foo", {"two": 2})
    r.zadd("foo", {"three": 3})
    assert r.zpopmax("foo", count=5) == [
        (b"three", 3.0),
        (b"two", 2.0),
        (b"one", 1.0),
    ]


def test_bzpopmin(r: redis.Redis):
    r.zadd("foo", {"one": 1, "two": 2, "three": 3})
    r.zadd("bar", {"a": 1.5, "b": 2, "c": 3})
    assert r.bzpopmin(["foo", "bar"], 0) == (b"foo", b"one", 1.0)
    assert r.bzpopmin(["foo", "bar"], 0) == (b"foo", b"two", 2.0)
    assert r.bzpopmin(["foo", "bar"], 0) == (b"foo", b"three", 3.0)
    assert r.bzpopmin(["foo", "bar"], 0) == (b"bar", b"a", 1.5)


def test_bzpopmax(r: redis.Redis):
    r.zadd("foo", {"one": 1, "two": 2, "three": 3})
    r.zadd("bar", {"a": 1.5, "b": 2.5, "c": 3.5})
    assert r.bzpopmax(["foo", "bar"], 0) == (b"foo", b"three", 3.0)
    assert r.bzpopmax(["foo", "bar"], 0) == (b"foo", b"two", 2.0)
    assert r.bzpopmax(["foo", "bar"], 0) == (b"foo", b"one", 1.0)
    assert r.bzpopmax(["foo", "bar"], 0) == (b"bar", b"c", 3.5)


def test_zscan(r: redis.Redis):
    # Set up the data
    name = "zscan-test"
    for ix in range(20):
        r.zadd(name, {"key:%s" % ix: ix})
    expected = dict(r.zrange(name, 0, -1, withscores=True))

    # Test the basic version
    results = {}
    for key, val in r.zscan_iter(name, count=6):
        results[key] = val
    assert results == expected

    # Now test that the MATCH functionality works
    results = {}
    cursor = "0"
    while cursor != 0:
        cursor, data = r.zscan(name, cursor, match="*7", count=6)
        results.update(data)
    assert results == {b"key:7": 7.0, b"key:17": 17.0}


def test_zrandemember(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3, "a4": 4, "a5": 5})
    assert r.zrandmember("a") is not None
    assert len(r.zrandmember("a", 2)) == 2
    # with scores
    assert len(r.zrandmember("a", 2, True)) == 4
    # without duplications
    assert len(r.zrandmember("a", 10)) == 5
    # with duplications
    assert len(r.zrandmember("a", -10)) == 10


@pytest.mark.unsupported_server_types("dragonfly")
def test_zdiffstore(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3})
    r.zadd("b", {"a1": 1, "a2": 2})
    assert r.zdiffstore("out", ["a", "b"])
    assert r.zrange("out", 0, -1) == [b"a3"]
    assert r.zrange("out", 0, -1, withscores=True) == [(b"a3", 3.0)]


def test_zdiff(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3})
    r.zadd("b", {"a1": 1, "a2": 2})
    assert r.zdiff(["a", "b"]) == [b"a3"]
    assert r.zdiff(["a", "b"], withscores=True) == [b"a3", b"3"]


def test_zunion(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 1, "a3": 1})
    r.zadd("b", {"a1": 2, "a2": 2, "a3": 2})
    r.zadd("c", {"a1": 6, "a3": 5, "a4": 4})
    # sum
    assert r.zunion(["a", "b", "c"]) == [b"a2", b"a4", b"a3", b"a1"]
    assert r.zunion(["a", "b", "c"], withscores=True) == [
        (b"a2", 3),
        (b"a4", 4),
        (b"a3", 8),
        (b"a1", 9),
    ]
    # max
    assert r.zunion(["a", "b", "c"], aggregate="MAX", withscores=True) == [
        (b"a2", 2),
        (b"a4", 4),
        (b"a3", 5),
        (b"a1", 6),
    ]
    # min
    assert r.zunion(["a", "b", "c"], aggregate="MIN", withscores=True) == [
        (b"a1", 1),
        (b"a2", 1),
        (b"a3", 1),
        (b"a4", 4),
    ]
    # with weight
    assert r.zunion({"a": 1, "b": 2, "c": 3}, withscores=True) == [
        (b"a2", 5),
        (b"a4", 12),
        (b"a3", 20),
        (b"a1", 23),
    ]


def test_zinter(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 1})
    r.zadd("b", {"a1": 2, "a2": 2, "a3": 2})
    r.zadd("c", {"a1": 6, "a3": 5, "a4": 4})
    assert r.zinter(["a", "b", "c"]) == [b"a3", b"a1"]
    # invalid aggregation
    with pytest.raises(redis.DataError):
        r.zinter(["a", "b", "c"], aggregate="foo", withscores=True)
    # aggregate with SUM
    assert r.zinter(["a", "b", "c"], withscores=True) == [(b"a3", 8), (b"a1", 9)]
    # aggregate with MAX
    assert r.zinter(["a", "b", "c"], aggregate="MAX", withscores=True) == [
        (b"a3", 5),
        (b"a1", 6),
    ]
    # aggregate with MIN
    assert r.zinter(["a", "b", "c"], aggregate="MIN", withscores=True) == [
        (b"a1", 1),
        (b"a3", 1),
    ]
    # with weights
    assert r.zinter({"a": 1, "b": 2, "c": 3}, withscores=True) == [
        (b"a3", 20),
        (b"a1", 23),
    ]


@pytest.mark.min_server("7")
def test_zintercard(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 1})
    r.zadd("b", {"a1": 2, "a2": 2, "a3": 2})
    r.zadd("c", {"a1": 6, "a3": 5, "a4": 4})
    assert r.zintercard(3, ["a", "b", "c"]) == 2
    assert r.zintercard(3, ["a", "b", "c"], limit=1) == 1


@pytest.mark.unsupported_server_types("dragonfly")
def test_zrangestore(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3})
    assert r.zrangestore("b", "a", 0, 1)
    assert r.zrange("b", 0, -1) == [b"a1", b"a2"]
    assert r.zrangestore("b", "a", 1, 2)
    assert r.zrange("b", 0, -1) == [b"a2", b"a3"]
    assert r.zrange("b", 0, -1, withscores=True) == [(b"a2", 2), (b"a3", 3)]
    # reversed order
    assert r.zrangestore("b", "a", 1, 2, desc=True)
    assert r.zrange("b", 0, -1) == [b"a1", b"a2"]
    # by score
    assert r.zrangestore("b", "a", 2, 1, byscore=True, offset=0, num=1, desc=True)
    assert r.zrange("b", 0, -1) == [b"a2"]
    # by lex
    assert r.zrange("a", "[a2", "(a3", bylex=True, offset=0, num=1) == [b"a2"]
    assert r.zrangestore("b", "a", "[a2", "(a3", bylex=True, offset=0, num=1)
    assert r.zrange("b", 0, -1) == [b"a2"]


@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")
def test_zmpop(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3})
    res = [b"a", [[b"a1", b"1"], [b"a2", b"2"]]]
    assert r.zmpop("2", ["b", "a"], min=True, count=2) == res
    with pytest.raises(redis.DataError):
        r.zmpop("2", ["b", "a"], count=2)
    r.zadd("b", {"b1": 10, "ab": 9, "b3": 8})
    assert r.zmpop("2", ["b", "a"], max=True) == [b"b", [[b"b1", b"10"]]]


@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")
def test_bzmpop(r: redis.Redis):
    r.zadd("a", {"a1": 1, "a2": 2, "a3": 3})
    res = [b"a", [[b"a1", b"1"], [b"a2", b"2"]]]
    assert r.bzmpop(1, "2", ["b", "a"], min=True, count=2) == res
    with pytest.raises(redis.DataError):
        r.bzmpop(1, "2", ["b", "a"], count=2)
    r.zadd("b", {"b1": 10, "ab": 9, "b3": 8})
    res = [b"b", [[b"b1", b"10"]]]
    assert r.bzmpop(0, "2", ["b", "a"], max=True) == res
    assert r.bzmpop(1, "2", ["foo", "bar"], max=True) is None


================================================
FILE: tests/fakeredis/test/test_mixins/test_streams_commands.py
================================================
import threading
import time
from typing import List

import pytest
import redis
from fakeredis import _msgs as msgs

from test import testtools


def get_ids(results):
    return [result[0] for result in results]


def add_items(r: redis.Redis, stream: str, n: int):
    id_list = list()
    for i in range(n):
        id_list.append(r.xadd(stream, {"k": i}))
    return id_list


def test_xadd_redis__green(r: redis.Redis):
    stream = "stream"
    before = int(1000 * time.time())
    m1 = r.xadd(stream, {"some": "other"})
    ts1, seq1 = m1.decode().split("-")
    after = int(1000 * time.time()) + 1
    assert before <= int(ts1) <= after
    seq1 = int(seq1)
    m2 = r.xadd(stream, {"add": "more"}, id=f"{ts1}-{seq1 + 1}")
    ts2, seq2 = m2.decode().split("-")
    assert ts1 == ts2
    assert int(seq2) == int(seq1) + 1

    stream = "stream2"
    m1 = r.xadd(stream, {"some": "other"})
    ts1, seq1 = m1.decode().split("-")
    ts1 = int(ts1) - 1
    with pytest.raises(redis.ResponseError):
        r.xadd(stream, {"add": "more"}, id=f"{ts1}-*")
    with pytest.raises(redis.ResponseError):
        r.xadd(stream, {"add": "more"}, id=f"{ts1}-1")


@pytest.mark.min_server("7")
def test_xadd_redis7(r: redis.Redis):  # Using ts-*
    stream = "stream"
    m1 = r.xadd(stream, {"some": "other"})
    ts1, seq1 = m1.decode().split("-")
    m2 = r.xadd(stream, {"add": "more"}, id=f"{ts1}-*")
    ts2, seq2 = m2.decode().split("-")
    ts1, seq1 = int(ts1), int(seq1)
    ts2, seq2 = int(ts2), int(seq2)
    assert ts2 == ts1
    assert seq2 == seq1 + 1


def test_xadd_maxlen(r: redis.Redis):
    stream = "stream"
    id_list = add_items(r, stream, 10)
    maxlen = 5
    id_list.append(r.xadd(stream, {"k": "new"}, maxlen=maxlen, approximate=False))
    assert r.xlen(stream) == maxlen
    results = r.xrange(stream, id_list[0])
    assert get_ids(results) == id_list[len(id_list) - maxlen :]
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(
            r, "xadd", stream, "maxlen", "3", "minid", "sometestvalue", "field", "value"
        )
    assert r.set("non-a-stream", 1) == 1
    with pytest.raises(redis.ResponseError):
        r.xlen("non-a-stream")


def test_xadd_minid(r: redis.Redis):
    stream = "stream"
    id_list = add_items(r, stream, 10)
    minid = id_list[6]
    id_list.append(r.xadd(stream, {"k": "new"}, minid=minid, approximate=False))
    assert r.xlen(stream) == len(id_list) - 6
    results = r.xrange(stream, id_list[0])
    assert get_ids(results) == id_list[6:]


def test_xtrim(r: redis.Redis):
    stream = "stream"

    # trimming an empty key doesn't do anything
    assert r.xtrim(stream, 1000) == 0
    add_items(r, stream, 4)

    # trimming an amount larger than the number of messages doesn't do anything
    assert r.xtrim(stream, 5, approximate=False) == 0

    # 1 message is trimmed
    assert r.xtrim(stream, 3, approximate=False) == 1


@pytest.mark.min_server("6.2.4")
def test_xtrim_minlen_and_length_args(r: redis.Redis):
    stream = "stream"
    add_items(r, stream, 4)

    # Future self: No limits without approximate, according to the api
    # with pytest.raises(redis.ResponseError):
    #     assert r.xtrim(stream, 3, approximate=False, limit=2)

    with pytest.raises(redis.DataError):
        assert r.xtrim(stream, maxlen=3, minid="sometestvalue")

    with pytest.raises(redis.ResponseError):
        testtools.raw_command(
            r, "xtrim", stream, "maxlen", "3", "minid", "sometestvalue"
        )
    # minid with a limit
    stream = "s2"
    m1 = add_items(r, stream, 4)[0]
    assert r.xtrim(stream, minid=m1, limit=3) == 0

    # pure minid
    m4 = add_items(r, stream, 4)[-1]
    assert r.xtrim(stream, approximate=False, minid=m4) == 7

    # minid approximate
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    m3 = r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    assert r.xtrim(stream, approximate=False, minid=m3) == 3


def test_xadd_nomkstream(r: redis.Redis):
    r.xadd("stream2", {"some": "other"}, nomkstream=True)
    assert r.xlen("stream2") == 0
    # nomkstream option
    stream = "stream"
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"some": "other"}, nomkstream=False)
    assert r.xlen(stream) == 2
    r.xadd(stream, {"some": "other"}, nomkstream=True)
    assert r.xlen(stream) == 3


def _add_to_stream(r: redis.Redis, stream_name: str, n: int):
    res = []
    for _ in range(n):
        res.append(r.xadd(stream_name, {"foo": "bar"}))
    return res


def test_xrevrange(r: redis.Redis):
    stream = "stream"
    m1, m2, m3, m4 = _add_to_stream(r, stream, 4)

    results = r.xrevrange(stream, max=m4)
    assert get_ids(results) == [m4, m3, m2, m1]

    results = r.xrevrange(stream, max=m3, min=m2)
    assert get_ids(results) == [m3, m2]

    results = r.xrevrange(stream, min=m3)
    assert get_ids(results) == [m4, m3]

    results = r.xrevrange(stream, min=m2, count=1)
    assert get_ids(results) == [m4]


def test_xrange(r: redis.Redis):
    m = r.xadd("stream1", {"foo": "bar"})
    assert r.xrange("stream1") == [
        (m, {b"foo": b"bar"}),
    ]

    stream = "stream2"
    m = testtools.raw_command(
        r, "xadd", stream, "*", b"field", b"value", b"foo", b"bar"
    )

    assert r.xrevrange(stream) == [
        (m, {b"field": b"value", b"foo": b"bar"}),
    ]

    stream = "stream"
    m1, m2, m3, m4 = _add_to_stream(r, stream, 4)

    results = r.xrange(stream, min=m1)
    assert get_ids(results) == [m1, m2, m3, m4]

    results = r.xrange(stream, min=m2, max=m3)
    assert get_ids(results) == [m2, m3]

    results = r.xrange(stream, max=m3)
    assert get_ids(results) == [m1, m2, m3]

    results = r.xrange(stream, max=m2, count=1)
    assert get_ids(results) == [m1]


def get_stream_message(client, stream, message_id):
    """Fetch a stream message and format it as a (message_id, fields) pair"""
    response = client.xrange(stream, min=message_id, max=message_id)
    assert len(response) == 1
    return response[0]


def test_xread_multiple_streams_blocking(r: redis.Redis):
    stream1 = "stream1"
    stream2 = "stream2"
    m1 = r.xadd(stream1, {"foo": "bar"})
    m2 = r.xadd(stream2, {"bing": "baz"})

    res = r.xread(streams={stream1: 0, stream2: 0}, block=10)
    assert len(res) == 2


def test_xread_blocking_no_count(r: redis.Redis):
    k = "key"
    r.xadd(k, {"value": 1234})
    streams = {k: "0"}
    m1 = r.xread(streams=streams, block=10)
    assert m1[0][1][0][1] == {b"value": b"1234"}


def test_xread(r: redis.Redis):
    stream = "stream"
    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"bing": "baz"})

    expected = [
        [
            stream.encode(),
            [get_stream_message(r, stream, m1), get_stream_message(r, stream, m2)],
        ]
    ]
    # xread starting at 0 returns both messages
    assert r.xread(streams={stream: 0}) == expected

    expected = [[stream.encode(), [get_stream_message(r, stream, m1)]]]
    # xread starting at 0 and count=1 returns only the first message
    assert r.xread(streams={stream: 0}, count=1) == expected

    expected = [[stream.encode(), [get_stream_message(r, stream, m2)]]]
    # xread starting at m1 returns only the second message
    assert r.xread(streams={stream: m1}) == expected

    # xread starting at the last message returns an empty list
    assert r.xread(streams={stream: m2}) == []


def test_xread_count(r: redis.Redis):
    r.xadd("test", {"x": 1})
    result = r.xread(streams={"test": "0"}, count=100, block=10)
    assert result[0][0] == b"test"
    assert result[0][1][0][1] == {b"x": b"1"}


def test_xread_bad_commands(r: redis.Redis):
    with pytest.raises(redis.ResponseError) as exc_info:
        testtools.raw_command(r, "xread", "foo", "11-1")
    print(exc_info)
    with pytest.raises(redis.ResponseError) as ex2:
        testtools.raw_command(
            r,
            "xread",
            "streams",
            "foo",
        )
    print(ex2)


def test_xdel(r: redis.Redis):
    stream = "stream"

    # deleting from an empty stream doesn't do anything
    assert r.xdel(stream, 1) == 0

    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"foo": "bar"})
    m3 = r.xadd(stream, {"foo": "bar"})

    # xdel returns the number of deleted elements
    assert r.xdel(stream, m1) == 1
    assert r.xdel(stream, m2, m3) == 2

    with pytest.raises(redis.ResponseError) as ex:
        testtools.raw_command(r, "XDEL", stream)
    assert ex.value.args[0] == msgs.WRONG_ARGS_MSG6.format("xdel")[4:]
    assert r.xdel("non-existing-key", "1-1") == 0


def test_xgroup_destroy(r: redis.Redis):
    stream = "stream"
    group = "group"
    r.xadd(stream, {"foo": "bar"})

    assert r.xgroup_destroy(stream, group) == 0

    r.xgroup_create(stream, group, 0)
    assert r.xgroup_destroy(stream, group) == 1


@pytest.mark.min_server("7")
def test_xgroup_create_redis7(r: redis.Redis):
    stream, group = "stream", "group"
    message_id = r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, message_id)
    r.xadd(stream, {"foo": "bar"})
    expected = [
        {
            "name": group.encode(),
            "consumers": 0,
            "pending": 0,
            "last-delivered-id": message_id,
            "entries-read": None,
            "lag": 1,
        }
    ]
    assert r.xinfo_groups(stream) == expected


@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")
def test_xgroup_setid_redis7(r: redis.Redis):
    stream, group = "stream", "group"
    message_id = r.xadd(stream, {"foo": "bar"})

    r.xgroup_create(stream, group, 0)
    # advance the last_delivered_id to the message_id
    r.xgroup_setid(stream, group, message_id, entries_read=2)
    expected = [
        {
            "name": group.encode(),
            "consumers": 0,
            "pending": 0,
            "last-delivered-id": message_id,
            "entries-read": 2,
            "lag": -1,
        }
    ]
    assert r.xinfo_groups(stream) == expected


def test_xgroup_delconsumer(r: redis.Redis):
    stream, group, consumer = "stream", "group", "consumer"
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)

    # a consumer that hasn't yet read any messages doesn't do anything
    assert r.xgroup_delconsumer(stream, group, consumer) == 0

    # read all messages from the group
    r.xreadgroup(group, consumer, streams={stream: ">"})

    # deleting the consumer should return 2 pending messages
    assert r.xgroup_delconsumer(stream, group, consumer) == 2


def test_xgroup_createconsumer(r: redis.Redis):
    stream, group, consumer = "stream", "group", "consumer"
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)
    assert r.xgroup_createconsumer(stream, group, consumer) == 1
    # Adding consumer with existing consumer name does nothing
    assert r.xgroup_createconsumer(stream, group, consumer) == 0

    # read all messages from the group
    r.xreadgroup(group, consumer, streams={stream: ">"})

    # deleting the consumer should return 2 pending messages
    assert r.xgroup_delconsumer(stream, group, consumer) == 2


def test_xinfo_consumers(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})

    r.xgroup_create(stream, group, 0)
    r.xreadgroup(group, consumer1, streams={stream: ">"}, count=1)
    r.xreadgroup(group, consumer2, streams={stream: ">"})
    info = r.xinfo_consumers(stream, group)
    assert len(info) == 2
    expected = [
        {"name": consumer1.encode(), "pending": 1},
        {"name": consumer2.encode(), "pending": 2},
    ]

    # we can't determine the idle/inactive time, so just make sure it's an int
    assert isinstance(info[0].pop("idle"), int)
    assert isinstance(info[1].pop("idle"), int)
    assert isinstance(info[0].pop("inactive", 0), int)
    assert isinstance(info[1].pop("inactive", 0), int)
    assert info == expected


def test_xreadgroup(r: redis.Redis):
    stream, group, consumer = "stream", "group", "consumer1"
    with pytest.raises(redis.exceptions.ResponseError):
        r.xreadgroup(group, consumer, streams={stream: ">"})
    c1 = {b"foo": b"bar"}
    c2 = {b"bing": b"baz"}
    m1 = r.xadd(stream, c1)
    m2 = r.xadd(stream, c2)
    with pytest.raises(
        redis.exceptions.ResponseError,
        match=msgs.XREADGROUP_KEY_OR_GROUP_NOT_FOUND_MSG.format(stream, group),
    ):
        r.xreadgroup(group, consumer, streams={stream: ">"})
    r.xgroup_create(stream, group, 0)

    expected = [
        [
            stream.encode(),
            [get_stream_message(r, stream, m1), get_stream_message(r, stream, m2)],
        ]
    ]
    # xread starting at 0 returns both messages
    assert r.xreadgroup(group, consumer, streams={stream: ">"}) == expected

    r.xgroup_destroy(stream, group)
    r.xgroup_create(stream, group, 0)

    expected = [[stream.encode(), [get_stream_message(r, stream, m1)]]]
    # xread with count=1 returns only the first message
    assert r.xreadgroup(group, consumer, streams={stream: ">"}, count=1) == expected

    r.xgroup_destroy(stream, group)

    # create the group using $ as the last id meaning subsequent reads
    # will only find messages added after this
    r.xgroup_create(stream, group, "$")

    expected = []
    # xread starting after the last message returns an empty message list
    assert r.xreadgroup(group, consumer, streams={stream: ">"}) == expected

    # xreadgroup with noack does not have any items in the PEL
    r.xgroup_destroy(stream, group)
    r.xgroup_create(stream, group, "0")
    assert (
        len(r.xreadgroup(group, consumer, streams={stream: ">"}, noack=True)[0][1]) == 2
    )
    # now there should be nothing pending
    res = r.xreadgroup(group, consumer, streams={stream: "0"})
    assert len(res[0][1]) == 0

    r.xgroup_destroy(stream, group)
    r.xgroup_create(stream, group, "0")

    assert r.xreadgroup(group, consumer, streams={stream: ">"}) == [
        [stream.encode(), [(m1, c1), (m2, c2)]]
    ]
    # delete all the messages in the stream
    assert r.xtrim(stream, 0) == 2
    # TODO groups keep ids of deleted messages
    # expected = [[stream.encode(), [(m1, {}), (m2, {})]]]
    # assert r.xreadgroup(group, consumer, streams={stream: "0"}) == expected
    r.xreadgroup(group, consumer, streams={stream: ">"}, count=10, block=500)


def test_xinfo_stream(r: redis.Redis):
    stream = "stream"
    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"foo": "bar"})
    info = r.xinfo_stream(stream)

    assert info["length"] == 2
    assert info["first-entry"] == get_stream_message(r, stream, m1)
    assert info["last-entry"] == get_stream_message(r, stream, m2)


def assert_consumer_info(
    r: redis.Redis, stream: str, group: str, equal_keys: List
) -> List:
    res = r.xinfo_consumers(stream, group)
    assert len(res) == len(equal_keys)
    for i in range(len(equal_keys)):
        for k in res[i]:
            if k in equal_keys[i]:
                assert (
                    res[i][k] == equal_keys[i][k]
                ), f"res[{i}][{k}] mismatch, {res}!={equal_keys}"
            else:
                print(f"res[{i}][{k}]={res[i][k]}")
    return res


def test_xack(r: redis.Redis):
    stream, group, consumer = "stream", "group", "consumer"
    # xack on a stream that doesn't exist
    assert r.xack(stream, group, "0-0") == 0

    m1 = r.xadd(stream, {"one": "one"})
    m2 = r.xadd(stream, {"two": "two"})
    m3 = r.xadd(stream, {"three": "three"})

    # xack on a group that doesn't exist
    assert r.xack(stream, group, m1) == 0

    r.xgroup_create(stream, group, 0)
    r.xreadgroup(group, consumer, streams={stream: ">"})
    assert_consumer_info(r, stream, group, [{"name": b"consumer", "pending": 3}])
    assert r.xack(stream, group, m1) == 1
    time.sleep(0.01)
    res = assert_consumer_info(r, stream, group, [{"name": b"consumer", "pending": 2}])
    assert "idle" in res[0] and res[0]["idle"] > 0
    assert r.xack(stream, group, m2, m3) == 2
    assert_consumer_info(r, stream, group, [{"name": b"consumer", "pending": 0}])


@pytest.mark.min_server("7")
def test_xinfo_stream_redis7(r: redis.Redis):
    stream = "stream"
    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"foo": "bar"})
    info = r.xinfo_stream(stream)

    assert info["length"] == 2
    assert info["first-entry"] == get_stream_message(r, stream, m1)
    assert info["last-entry"] == get_stream_message(r, stream, m2)
    assert info["max-deleted-entry-id"] == b"0-0"
    assert info["entries-added"] == 2
    assert info["recorded-first-entry-id"] == m1

    r.xtrim(stream, 0)
    # Info about empty stream
    info = r.xinfo_stream(stream)

    assert info["length"] == 0
    assert info["first-entry"] is None
    assert info["last-entry"] is None
    assert info["max-deleted-entry-id"] == b"0-0"
    assert info["entries-added"] == 2
    assert info["recorded-first-entry-id"] == b"0-0"

    with pytest.raises(redis.exceptions.ResponseError):
        r.xinfo_stream("non-existing-key")


def test_xinfo_stream_full(r: redis.Redis):
    stream, group = "stream", "group"
    m1 = r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)
    info = r.xinfo_stream(stream, full=True)

    assert info["length"] == 1
    assert m1 in info["entries"]
    assert len(info["groups"]) == 1


def test_xpending(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"
    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)

    # xpending on a group that has no consumers yet
    expected = {"pending": 0, "min": None, "max": None, "consumers": []}
    assert r.xpending(stream, group) == expected

    # read 1 message from the group with each consumer
    r.xreadgroup(group, consumer1, streams={stream: ">"}, count=1)
    r.xreadgroup(group, consumer2, streams={stream: ">"}, count=1)

    expected = {
        "pending": 2,
        "min": m1,
        "max": m2,
        "consumers": [
            {"name": consumer1.encode(), "pending": 1},
            {"name": consumer2.encode(), "pending": 1},
        ],
    }
    assert r.xpending(stream, group) == expected


def test_xpending_range(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"
    m1 = r.xadd(stream, {"foo": "bar"})
    m2 = r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)

    # xpending range on a group that has no consumers yet
    assert r.xpending_range(stream, group, min="-", max="+", count=5) == []

    # read 1 message from the group with each consumer
    r.xreadgroup(group, consumer1, streams={stream: ">"}, count=1)
    r.xreadgroup(group, consumer2, streams={stream: ">"}, count=1)

    response = r.xpending_range(stream, group, min="-", max="+", count=5)
    assert len(response) == 2
    assert response[0]["message_id"] == m1
    assert response[0]["consumer"] == consumer1.encode()
    assert response[1]["message_id"] == m2
    assert response[1]["consumer"] == consumer2.encode()

    # test with consumer name
    response = r.xpending_range(
        stream, group, min="-", max="+", count=5, consumername=consumer1
    )
    assert response[0]["message_id"] == m1
    assert response[0]["consumer"] == consumer1.encode()


def test_xpending_range_idle(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"
    r.xadd(stream, {"foo": "bar"})
    r.xadd(stream, {"foo": "bar"})
    r.xgroup_create(stream, group, 0)

    # read 1 message from the group with each consumer
    r.xreadgroup(group, consumer1, streams={stream: ">"}, count=1)
    r.xreadgroup(group, consumer2, streams={stream: ">"}, count=1)

    response = r.xpending_range(stream, group, min="-", max="+", count=5)
    assert len(response) == 2
    response = r.xpending_range(stream, group, min="-", max="+", count=5, idle=1000)
    assert len(response) == 0


def test_xpending_range_negative(r: redis.Redis):
    stream, group = "stream", "group"
    with pytest.raises(redis.DataError):
        r.xpending_range(stream, group, min="-", max="+", count=None)
    with pytest.raises(ValueError):
        r.xpending_range(stream, group, min="-", max="+", count="one")
    with pytest.raises(redis.DataError):
        r.xpending_range(stream, group, min="-", max="+", count=-1)
    with pytest.raises(ValueError):
        r.xpending_range(stream, group, min="-", max="+", count=5, idle="one")
    with pytest.raises(redis.exceptions.ResponseError):
        r.xpending_range(stream, group, min="-", max="+", count=5, idle=1.5)
    with pytest.raises(redis.DataError):
        r.xpending_range(stream, group, min="-", max="+", count=5, idle=-1)
    with pytest.raises(redis.DataError):
        r.xpending_range(stream, group, min=None, max=None, count=None, idle=0)
    with pytest.raises(redis.DataError):
        r.xpending_range(stream, group, min=None, max=None, count=None, consumername=0)


@pytest.mark.min_server("7")
@testtools.run_test_if_redispy_ver("gte", "4.4")
def test_xautoclaim_redis7(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"

    message_id1 = r.xadd(stream, {"john": "wick"})
    message_id2 = r.xadd(stream, {"johny": "deff"})
    message = get_stream_message(r, stream, message_id1)
    r.xgroup_create(stream, group, 0)

    # trying to claim a message that isn't already pending doesn't
    # do anything
    assert r.xautoclaim(stream, group, consumer2, min_idle_time=0) == [b"0-0", [], []]

    # read the group as consumer1 to initially claim the messages
    r.xreadgroup(group, consumer1, streams={stream: ">"})

    # claim one message as consumer2
    response = r.xautoclaim(stream, group, consumer2, min_idle_time=0, count=1)
    assert response[1] == [message]

    # reclaim the messages as consumer1, but use the justid argument
    # which only returns message ids
    assert r.xautoclaim(
        stream, group, consumer1, min_idle_time=0, start_id=0, justid=True
    ) == [
        message_id1,
        message_id2,
    ]
    assert r.xautoclaim(
        stream, group, consumer1, min_idle_time=0, start_id=message_id2, justid=True
    ) == [message_id2]


@pytest.mark.min_server("7")
def test_xclaim_trimmed_redis7(r: redis.Redis):
    # xclaim should not raise an exception if the item is not there
    stream, group = "stream", "group"

    r.xgroup_create(stream, group, id="$", mkstream=True)

    # add a couple of new items
    sid1 = r.xadd(stream, {"item": 0})
    sid2 = r.xadd(stream, {"item": 0})

    # read them from consumer1
    r.xreadgroup(group, "consumer1", {stream: ">"})

    # add a 3rd and trim the stream down to 2 items
    r.xadd(stream, {"item": 3}, maxlen=2, approximate=False)

    # xclaim them from consumer2
    # the item that is still in the stream should be returned
    item = r.xclaim(stream, group, "consumer2", 0, [sid1, sid2])
    assert len(item) == 1
    assert item[0][0] == sid2


def test_xclaim(r: redis.Redis):
    stream, group, consumer1, consumer2 = "stream", "group", "consumer1", "consumer2"

    message_id = r.xadd(stream, {"john": "wick"})
    message = get_stream_message(r, stream, message_id)
    r.xgroup_create(stream, group, 0)

    # trying to claim a message that isn't already pending doesn't
    # do anything
    assert (
        r.xclaim(stream, group, consumer2, min_idle_time=0, message_ids=(message_id,))
        == []
    )

    # read the group as consumer1 to initially claim the messages
    r.xreadgroup(group, consumer1, streams={stream: ">"})

    # claim the message as consumer2
    assert r.xclaim(
        stream, group, consumer2, min_idle_time=0, message_ids=(message_id,)
    ) == [
        message,
    ]

    # reclaim the message as consumer1, but use the justid argument
    # which only returns message ids
    assert r.xclaim(
        stream,
        group,
        consumer1,
        min_idle_time=0,
        message_ids=(message_id,),
        justid=True,
    ) == [
        message_id,
    ]


def test_xread_blocking(create_redis):
    # thread with xread block 0 should hang
    # putting data in the stream should unblock it
    event = threading.Event()
    event.clear()

    def thread_func():
        while not event.is_set():
            time.sleep(0.1)
        r = create_redis(db=1)
        r.xadd("stream", {"x": "1"})
        time.sleep(0.1)

    t = threading.Thread(target=thread_func)
    t.start()
    r1 = create_redis(db=1)
    event.set()
    result = r1.xread({"stream": "$"}, block=0, count=1)
    event.clear()
    t.join()
    assert result[0][0] == b"stream"
    assert result[0][1][0][1] == {b"x": b"1"}


def test_stream_ttl(r: redis.Redis):
    stream = "stream"

    m1 = r.xadd(stream, {"foo": "bar"})
    expected = [
        [
            stream.encode(),
            [get_stream_message(r, stream, m1)],
        ]
    ]
    assert r.xread(streams={stream: 0}) == expected
    assert r.xtrim(stream, 0) == 1
    assert r.ttl(stream) == -1


================================================
FILE: tests/fakeredis/test/test_mixins/test_string_commands.py
================================================
from __future__ import annotations

import time
from datetime import timedelta

import pytest
import redis
import redis.client
from redis.exceptions import ResponseError

from ..testtools import raw_command


def test_append(r: redis.Redis):
    assert r.set("foo", "bar")
    assert r.append("foo", "baz") == 6
    assert r.get("foo") == b"barbaz"


def test_append_with_no_preexisting_key(r: redis.Redis):
    assert r.append("foo", "bar") == 3
    assert r.get("foo") == b"bar"


def test_append_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.append("foo", b"x")


def test_decr(r: redis.Redis):
    r.set("foo", 10)
    assert r.decr("foo") == 9
    assert r.get("foo") == b"9"


def test_decr_newkey(r: redis.Redis):
    r.decr("foo")
    assert r.get("foo") == b"-1"


def test_decr_expiry(r: redis.Redis):
    r.set("foo", 10, ex=10)
    r.decr("foo", 5)
    assert r.ttl("foo") > 0


def test_decr_badtype(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.decr("foo", 15)
    r.rpush("foo2", 1)
    with pytest.raises(redis.ResponseError):
        r.decr("foo2", 15)


def test_get_does_not_exist(r: redis.Redis):
    assert r.get("foo") is None


def test_get_with_non_str_keys(r: redis.Redis):
    assert r.set("2", "bar") is True
    assert r.get(2) == b"bar"


def test_get_invalid_type(r: redis.Redis):
    assert r.hset("foo", "key", "value") == 1
    with pytest.raises(redis.ResponseError):
        r.get("foo")


def test_getset_exists(r: redis.Redis):
    r.set("foo", "bar")
    val = r.getset("foo", b"baz")
    assert val == b"bar"
    val = r.getset("foo", b"baz2")
    assert val == b"baz"


def test_getset_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.getset("foo", "bar")


def test_getdel(r: redis.Redis):
    r["foo"] = "bar"
    assert r.getdel("foo") == b"bar"
    assert r.get("foo") is None


def test_getdel_doesnt_exist(r: redis.Redis):
    assert r.getdel("foo") is None


def test_incr_with_no_preexisting_key(r: redis.Redis):
    assert r.incr("foo") == 1
    assert r.incr("bar", 2) == 2


def test_incr_by(r: redis.Redis):
    assert r.incrby("foo") == 1
    assert r.incrby("bar", 2) == 2


def test_incr_preexisting_key(r: redis.Redis):
    r.set("foo", 15)
    assert r.incr("foo", 5) == 20
    assert r.get("foo") == b"20"


def test_incr_expiry(r: redis.Redis):
    r.set("foo", 15, ex=10)
    r.incr("foo", 5)
    assert r.ttl("foo") > 0


def test_incr_bad_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.incr("foo", 15)
    r.rpush("foo2", 1)
    with pytest.raises(redis.ResponseError):
        r.incr("foo2", 15)


def test_incr_with_float(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.incr("foo", 2.0)


def test_incr_followed_by_mget(r: redis.Redis):
    r.set("foo", 15)
    assert r.incr("foo", 5) == 20
    assert r.get("foo") == b"20"


def test_incr_followed_by_mget_returns_strings(r: redis.Redis):
    r.incr("foo", 1)
    assert r.mget(["foo"]) == [b"1"]


def test_incrbyfloat(r: redis.Redis):
    r.set("foo", 0)
    assert r.incrbyfloat("foo", 1.0) == 1.0
    assert r.incrbyfloat("foo", 1.0) == 2.0


def test_incrbyfloat_with_noexist(r: redis.Redis):
    assert r.incrbyfloat("foo", 1.0) == 1.0
    assert r.incrbyfloat("foo", 1.0) == 2.0


def test_incrbyfloat_expiry(r: redis.Redis):
    r.set("foo", 1.5, ex=10)
    r.incrbyfloat("foo", 2.5)
    assert r.ttl("foo") > 0


def test_incrbyfloat_bad_type(r: redis.Redis):
    r.set("foo", "bar")
    with pytest.raises(redis.ResponseError, match="not a valid float"):
        r.incrbyfloat("foo", 1.0)
    r.rpush("foo2", 1)
    with pytest.raises(redis.ResponseError):
        r.incrbyfloat("foo2", 1.0)


def test_incrbyfloat_precision(r: redis.Redis):
    x = 1.23456789123456789
    assert r.incrbyfloat("foo", x) == x
    assert float(r.get("foo")) == x


def test_mget(r: redis.Redis):
    r.set("foo", "one")
    r.set("bar", "two")
    assert r.mget(["foo", "bar"]) == [b"one", b"two"]
    assert r.mget(["foo", "bar", "baz"]) == [b"one", b"two", None]
    assert r.mget("foo", "bar") == [b"one", b"two"]


def test_mget_with_no_keys(r: redis.Redis):
    assert r.mget([]) == []


def test_mget_mixed_types(r: redis.Redis):
    r.hset("hash", "bar", "baz")
    r.zadd("zset", {"bar": 1})
    r.sadd("set", "member")
    r.rpush("list", "item1")
    r.set("string", "value")
    assert r.mget(["hash", "zset", "set", "string", "absent"]) == [
        None,
        None,
        None,
        b"value",
        None,
    ]


def test_mset_with_no_keys(r: redis.Redis):
    with pytest.raises(redis.ResponseError):
        r.mset({})


def test_mset(r: redis.Redis):
    assert r.mset({"foo": "one", "bar": "two"}) is True
    assert r.mset({"foo": "one", "bar": "two"}) is True
    assert r.mget("foo", "bar") == [b"one", b"two"]


def test_msetnx(r: redis.Redis):
    assert r.msetnx({"foo": "one", "bar": "two"})
    assert not r.msetnx({"bar": "two", "baz": "three"})
    assert r.mget("foo", "bar", "baz") == [b"one", b"two", None]


def test_setex(r: redis.Redis):
    assert r.setex("foo", 100, "bar") is True
    assert r.get("foo") == b"bar"


def test_setex_using_timedelta(r: redis.Redis):
    assert r.setex("foo", timedelta(seconds=100), "bar") is True
    assert r.get("foo") == b"bar"


def test_setex_using_float(r: redis.Redis):
    with pytest.raises(redis.ResponseError, match="integer"):
        r.setex("foo", 1.2, "bar")


@pytest.mark.min_server("6.2")
def test_setex_overflow(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.setex("foo", 18446744073709561, "bar")  # Overflows longlong in ms


def test_set_ex(r: redis.Redis):
    assert r.set("foo", "bar", ex=100) is True
    assert r.get("foo") == b"bar"


@pytest.mark.min_server("6.2")
def test_set_exat(r: redis.Redis):
    curr_time = int(time.time())
    assert r.set("foo", "bar", exat=curr_time + 100) is True
    assert r.get("foo") == b"bar"


@pytest.mark.min_server("6.2")
def test_set_pxat(r: redis.Redis):
    curr_time = int(time.time() * 1000)
    assert r.set("foo", "bar", pxat=curr_time + 100) is True
    assert r.get("foo") == b"bar"
    time.sleep(0.15)
    assert r.get("foo") is None


def test_set_ex_using_timedelta(r: redis.Redis):
    assert r.set("foo", "bar", ex=timedelta(seconds=100)) is True
    assert r.get("foo") == b"bar"


def test_set_ex_overflow(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=18446744073709561)  # Overflows longlong in ms


def test_set_px_overflow(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", px=2**63 - 2)  # Overflows after adding current time


def test_set_px(r: redis.Redis):
    assert r.set("foo", "bar", px=100) is True
    assert r.get("foo") == b"bar"


def test_set_px_using_timedelta(r: redis.Redis):
    assert r.set("foo", "bar", px=timedelta(milliseconds=100)) is True
    assert r.get("foo") == b"bar"


def test_set_conflicting_expire_options(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=1, px=1)


def test_set_raises_wrong_ex(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=-100)
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=0)
    assert not r.exists("foo")


def test_set_using_timedelta_raises_wrong_ex(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=timedelta(seconds=-100))
    with pytest.raises(ResponseError):
        r.set("foo", "bar", ex=timedelta(seconds=0))
    assert not r.exists("foo")


def test_set_raises_wrong_px(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", px=-100)
    with pytest.raises(ResponseError):
        r.set("foo", "bar", px=0)
    assert not r.exists("foo")


def test_set_using_timedelta_raises_wrong_px(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.set("foo", "bar", px=timedelta(milliseconds=-100))
    with pytest.raises(ResponseError):
        r.set("foo", "bar", px=timedelta(milliseconds=0))
    assert not r.exists("foo")


def test_setex_raises_wrong_ex(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.setex("foo", -100, "bar")
    with pytest.raises(ResponseError):
        r.setex("foo", 0, "bar")
    assert not r.exists("foo")


def test_setex_using_timedelta_raises_wrong_ex(r: redis.Redis):
    with pytest.raises(ResponseError):
        r.setex("foo", timedelta(seconds=-100), "bar")
    with pytest.raises(ResponseError):
        r.setex("foo", timedelta(seconds=-100), "bar")
    assert not r.exists("foo")


def test_setnx(r: redis.Redis):
    assert r.setnx("foo", "bar")
    assert r.get("foo") == b"bar"
    assert not r.setnx("foo", "baz")
    assert r.get("foo") == b"bar"


def test_set_nx(r: redis.Redis):
    assert r.set("foo", "bar", nx=True) is True
    assert r.get("foo") == b"bar"
    assert r.set("foo", "bar", nx=True) is None
    assert r.get("foo") == b"bar"


def test_set_xx(r: redis.Redis):
    assert r.set("foo", "bar", xx=True) is None
    r.set("foo", "bar")
    assert r.set("foo", "bar", xx=True) is True


@pytest.mark.min_server("6.2")
def test_set_get(r: redis.Redis):
    assert raw_command(r, "set", "foo", "bar", "GET") is None
    assert r.get("foo") == b"bar"
    assert raw_command(r, "set", "foo", "baz", "GET") == b"bar"
    assert r.get("foo") == b"baz"


@pytest.mark.min_server("6.2")
def test_set_get_xx(r: redis.Redis):
    assert raw_command(r, "set", "foo", "bar", "XX", "GET") is None
    assert r.get("foo") is None
    r.set("foo", "bar")
    assert raw_command(r, "set", "foo", "baz", "XX", "GET") == b"bar"
    assert r.get("foo") == b"baz"
    assert raw_command(r, "set", "foo", "baz", "GET") == b"baz"


@pytest.mark.min_server("7")
def test_set_get_nx_redis7(r: redis.Redis):
    # Note: this will most likely fail on a 7.0 server, based on the docs for SET
    assert raw_command(r, "set", "foo", "bar", "NX", "GET") is None


@pytest.mark.min_server("6.2")
def set_get_wrongtype(r: redis.Redis):
    r.lpush("foo", "bar")
    with pytest.raises(redis.ResponseError):
        raw_command(r, "set", "foo", "bar", "GET")


def test_substr(r: redis.Redis):
    r["foo"] = "one_two_three"
    assert r.substr("foo", 0) == b"one_two_three"
    assert r.substr("foo", 0, 2) == b"one"
    assert r.substr("foo", 4, 6) == b"two"
    assert r.substr("foo", -5) == b"three"
    assert r.substr("foo", -4, -5) == b""
    assert r.substr("foo", -5, -3) == b"thr"


def test_substr_noexist_key(r: redis.Redis):
    assert r.substr("foo", 0) == b""
    assert r.substr("foo", 10) == b""
    assert r.substr("foo", -5, -1) == b""


def test_substr_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.substr("foo", 0)


def test_strlen(r: redis.Redis):
    r["foo"] = "bar"

    assert r.strlen("foo") == 3
    assert r.strlen("noexists") == 0


def test_strlen_wrong_type(r: redis.Redis):
    r.rpush("foo", b"x")
    with pytest.raises(redis.ResponseError):
        r.strlen("foo")


def test_setrange(r: redis.Redis):
    r.set("foo", "test")
    assert r.setrange("foo", 1, "aste") == 5
    assert r.get("foo") == b"taste"

    r.set("foo", "test")
    assert r.setrange("foo", 1, "a") == 4
    assert r.get("foo") == b"tast"

    assert r.setrange("bar", 2, "test") == 6
    assert r.get("bar") == b"\x00\x00test"


def test_setrange_expiry(r: redis.Redis):
    r.set("foo", "test", ex=10)
    r.setrange("foo", 1, "aste")
    assert r.ttl("foo") > 0


def test_large_command(r: redis.Redis):
    r.set("foo", "bar" * 10000)
    assert r.get("foo") == b"bar" * 10000


def test_saving_non_ascii_chars_as_value(r: redis.Redis):
    assert r.set("foo", "Ñandu") is True
    assert r.get("foo") == "Ñandu".encode()


def test_saving_unicode_type_as_value(r: redis.Redis):
    assert r.set("foo", "Ñandu") is True
    assert r.get("foo") == "Ñandu".encode()


def test_saving_non_ascii_chars_as_key(r: redis.Redis):
    assert r.set("Ñandu", "foo") is True
    assert r.get("Ñandu") == b"foo"


def test_saving_unicode_type_as_key(r: redis.Redis):
    assert r.set("Ñandu", "foo") is True
    assert r.get("Ñandu") == b"foo"


def test_future_newbytes(r: redis.Redis):
    # bytes = pytest.importorskip('builtins', reason='future.types not available').bytes
    r.set(bytes(b"\xc3\x91andu"), "foo")
    assert r.get("Ñandu") == b"foo"


def test_future_newstr(r: redis.Redis):
    # str = pytest.importorskip('builtins', reason='future.types not available').str
    r.set(str("Ñandu"), "foo")
    assert r.get("Ñandu") == b"foo"


def test_setitem_getitem(r: redis.Redis):
    assert r.keys() == []
    r["foo"] = "bar"
    assert r["foo"] == b"bar"


def test_getitem_non_existent_key(r: redis.Redis):
    assert r.keys() == []
    assert "noexists" not in r.keys()


@pytest.mark.slow
def test_getex(r: redis.Redis):
    # Exceptions
    with pytest.raises(redis.ResponseError):
        raw_command(r, "getex", "foo", "px", 1000, "ex", 1)
    with pytest.raises(redis.ResponseError):
        raw_command(r, "getex", "foo", "dsac", 1000, "ex", 1)

    r.set("foo", "val")
    assert r.getex("foo", ex=1) == b"val"
    time.sleep(1.5)
    assert r.get("foo") is None

    r.set("foo2", "val")
    assert r.getex("foo2", px=1000) == b"val"
    time.sleep(1.5)
    assert r.get("foo2") is None

    r.set("foo4", "val")
    r.getex("foo4", exat=int(time.time() + 1))
    time.sleep(1.5)
    assert r.get("foo4") is None

    r.set("foo2", "val")
    r.getex("foo2", pxat=int(time.time() + 1) * 1000)
    time.sleep(1.5)
    assert r.get("foo2") is None

    r.setex("foo5", 1, "val")
    r.getex("foo5", persist=True)
    assert r.ttl("foo5") == -1
    time.sleep(1.5)
    assert r.get("foo5") == b"val"


@pytest.mark.min_server("7")
@pytest.mark.unsupported_server_types("dragonfly")
def test_lcs(r: redis.Redis):
    r.mset({"key1": "ohmytext", "key2": "mynewtext"})
    assert r.lcs("key1", "key2") == b"mytext"
    assert r.lcs("key1", "key2", len=True) == 6

    assert r.lcs("key1", "key2", idx=True, minmatchlen=3, withmatchlen=True) == [
        b"matches",
        [[[4, 7], [5, 8], 4]],
        b"len",
        6,
    ]
    assert r.lcs("key1", "key2", idx=True, minmatchlen=3) == [
        b"matches",
        [[[4, 7], [5, 8]]],
        b"len",
        6,
    ]

    with pytest.raises(redis.ResponseError):
        assert r.lcs("key1", "key2", len=True, idx=True)
    with pytest.raises(redis.ResponseError):
        raw_command(r, "lcs", "key1", "key2", "not_supported_arg")


================================================
FILE: tests/fakeredis/test/test_mixins/test_zadd.py
================================================
import pytest
import redis
import redis.client
from packaging.version import Version

from test.testtools import raw_command

REDIS_VERSION = Version(redis.__version__)


def test_zadd(r: redis.Redis):
    r.zadd("foo", {"four": 4})
    r.zadd("foo", {"three": 3})
    assert r.zadd("foo", {"two": 2, "one": 1, "zero": 0}) == 3
    assert r.zrange("foo", 0, -1) == [b"zero", b"one", b"two", b"three", b"four"]
    assert r.zadd("foo", {"zero": 7, "one": 1, "five": 5}) == 1
    assert r.zrange("foo", 0, -1) == [
        b"one",
        b"two",
        b"three",
        b"four",
        b"five",
        b"zero",
    ]


def test_zadd_empty(r: redis.Redis):
    # Have to add at least one key/value pair
    with pytest.raises(redis.RedisError):
        r.zadd("foo", {})


@pytest.mark.min_server("7")
def test_zadd_minus_zero_redis7(r: redis.Redis):
    r.zadd("foo", {"a": -0.0})
    r.zadd("foo", {"a": 0.0})
    assert raw_command(r, "zscore", "foo", "a") == b"0"


def test_zadd_wrong_type(r: redis.Redis):
    r.sadd("foo", "bar")
    with pytest.raises(redis.ResponseError):
        r.zadd("foo", {"two": 2})


def test_zadd_multiple(r: redis.Redis):
    r.zadd("foo", {"one": 1, "two": 2})
    assert r.zrange("foo", 0, 0) == [b"one"]
    assert r.zrange("foo", 1, 1) == [b"two"]


@pytest.mark.parametrize(
    "param,return_value,state",
    [
        ({"four": 2.0, "three": 1.0}, 0, [(b"three", 3.0), (b"four", 4.0)]),
        (
            {"four": 2.0, "three": 1.0, "zero": 0.0},
            1,
            [(b"zero", 0.0), (b"three", 3.0), (b"four", 4.0)],
        ),
        (
            {"two": 2.0, "one": 1.0},
            2,
            [(b"one", 1.0), (b"two", 2.0), (b"three", 3.0), (b"four", 4.0)],
        ),
    ],
)
@pytest.mark.parametrize("ch", [False, True])
def test_zadd_with_nx(r, param, return_value, state, ch):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", param, nx=True, ch=ch) == return_value
    assert r.zrange("foo", 0, -1, withscores=True) == state


@pytest.mark.parametrize(
    "param,return_value,state",
    [
        ({"four": 2.0, "three": 1.0}, 0, [(b"three", 3.0), (b"four", 4.0)]),
        (
            {"four": 5.0, "three": 1.0, "zero": 0.0},
            2,
            [
                (b"zero", 0.0),
                (b"three", 3.0),
                (b"four", 5.0),
            ],
        ),
        (
            {"two": 2.0, "one": 1.0},
            2,
            [(b"one", 1.0), (b"two", 2.0), (b"three", 3.0), (b"four", 4.0)],
        ),
    ],
)
def test_zadd_with_gt_and_ch(r, param, return_value, state):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", param, gt=True, ch=True) == return_value
    assert r.zrange("foo", 0, -1, withscores=True) == state


@pytest.mark.parametrize(
    "param,return_value,state",
    [
        ({"four": 2.0, "three": 1.0}, 0, [(b"three", 3.0), (b"four", 4.0)]),
        (
            {"four": 5.0, "three": 1.0, "zero": 0.0},
            1,
            [(b"zero", 0.0), (b"three", 3.0), (b"four", 5.0)],
        ),
        (
            {"two": 2.0, "one": 1.0},
            2,
            [(b"one", 1.0), (b"two", 2.0), (b"three", 3.0), (b"four", 4.0)],
        ),
    ],
)
def test_zadd_with_gt(r, param, return_value, state):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", param, gt=True) == return_value
    assert r.zrange("foo", 0, -1, withscores=True) == state


@pytest.mark.parametrize(
    "param,return_value,state",
    [
        ({"four": 4.0, "three": 1.0}, 1, [(b"three", 1.0), (b"four", 4.0)]),
        (
            {"four": 4.0, "three": 1.0, "zero": 0.0},
            2,
            [(b"zero", 0.0), (b"three", 1.0), (b"four", 4.0)],
        ),
        (
            {"two": 2.0, "one": 1.0},
            2,
            [(b"one", 1.0), (b"two", 2.0), (b"three", 3.0), (b"four", 4.0)],
        ),
    ],
)
def test_zadd_with_ch(r, param, return_value, state):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", param, ch=True) == return_value
    assert r.zrange("foo", 0, -1, withscores=True) == state


@pytest.mark.parametrize(
    "param,changed,state",
    [
        ({"four": 2.0, "three": 1.0}, 2, [(b"three", 1.0), (b"four", 2.0)]),
        (
            {"four": 4.0, "three": 3.0, "zero": 0.0},
            0,
            [(b"three", 3.0), (b"four", 4.0)],
        ),
        ({"two": 2.0, "one": 1.0}, 0, [(b"three", 3.0), (b"four", 4.0)]),
    ],
)
@pytest.mark.parametrize("ch", [False, True])
def test_zadd_with_xx(r, param, changed, state, ch):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", param, xx=True, ch=ch) == (changed if ch else 0)
    assert r.zrange("foo", 0, -1, withscores=True) == state


@pytest.mark.parametrize("ch", [False, True])
def test_zadd_with_nx_and_xx(r, ch):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    with pytest.raises(redis.DataError):
        r.zadd("foo", {"four": -4.0, "three": -3.0}, nx=True, xx=True, ch=ch)


@pytest.mark.parametrize("ch", [False, True])
def test_zadd_incr(r, ch):
    r.zadd("foo", {"four": 4.0, "three": 3.0})
    assert r.zadd("foo", {"four": 1.0}, incr=True, ch=ch) == 5.0
    assert r.zadd("foo", {"three": 1.0}, incr=True, nx=True, ch=ch) is None
    assert r.zscore("foo", "three") == 3.0
    assert r.zadd("foo", {"bar": 1.0}, incr=True, xx=True, ch=ch) is None
    assert r.zadd("foo", {"three": 1.0}, incr=True, xx=True, ch=ch) == 4.0


def test_zadd_with_xx_and_gt_and_ch(r: redis.Redis):
    r.zadd("test", {"one": 1})
    assert r.zscore("test", "one") == 1.0
    assert r.zadd("test", {"one": 4}, xx=True, gt=True, ch=True) == 1
    assert r.zscore("test", "one") == 4.0
    assert r.zadd("test", {"one": 0}, xx=True, gt=True, ch=True) == 0
    assert r.zscore("test", "one") == 4.0


def test_zadd_and_zrangebyscore(r: redis.Redis):
    raw_command(r, "zadd", "", 0.0, "")
    assert raw_command(r, "zrangebyscore", "", 0.0, 0.0, "limit", 0, 0) == []
    with pytest.raises(redis.RedisError):
        raw_command(r, "zrangebyscore", "", 0.0, 0.0, "limit", 0)
    with pytest.raises(redis.RedisError):
        raw_command(r, "zadd", "t", 0.0, "xx", "")


================================================
FILE: tests/fakeredis/test/test_stack/__init__.py
================================================


================================================
FILE: tests/fakeredis/test/test_stack/test_bloomfilter.py
================================================
import pytest
import redis
from fakeredis import _msgs as msgs
from redis.commands.bf import BFInfo

bloom_tests = pytest.importorskip("probables")


def intlist(obj):
    return [int(v) for v in obj]


def test_create_bf(r: redis.Redis):
    assert r.bf().create("bloom", 0.01, 1000)
    assert r.bf().create("bloom_e", 0.01, 1000, expansion=1)
    assert r.bf().create("bloom_ns", 0.01, 1000, noScale=True)


@pytest.mark.unsupported_server_types("dragonfly")
def test_bf_reserve(r: redis.Redis):
    assert r.bf().reserve("bloom", 0.01, 1000)
    assert r.bf().reserve("bloom_ns", 0.01, 1000, noScale=True)
    with pytest.raises(
        redis.exceptions.ResponseError, match=msgs.NONSCALING_FILTERS_CANNOT_EXPAND_MSG
    ):
        assert r.bf().reserve("bloom_e", 0.01, 1000, expansion=1, noScale=True)
    with pytest.raises(redis.exceptions.ResponseError, match=msgs.ITEM_EXISTS_MSG):
        assert r.bf().reserve("bloom", 0.01, 1000)


def test_bf_add(r: redis.Redis):
    assert r.bf().add("key", "value") == 1
    assert r.bf().add("key", "value") == 0

    r.set("key1", "value")
    with pytest.raises(redis.exceptions.ResponseError):
        r.bf().add("key1", "v")
    assert r.bf().create("bloom", 0.01, 1000)
    assert 1 == r.bf().add("bloom", "foo")
    assert 0 == r.bf().add("bloom", "foo")
    assert [0] == intlist(r.bf().madd("bloom", "foo"))
    assert [0, 1] == r.bf().madd("bloom", "foo", "bar")
    assert [0, 0, 1] == r.bf().madd("bloom", "foo", "bar", "baz")
    assert 1 == r.bf().exists("bloom", "foo")
    assert 0 == r.bf().exists("bloom", "noexist")
    assert [1, 0] == intlist(r.bf().mexists("bloom", "foo", "noexist"))


def test_bf_madd(r: redis.Redis):
    assert r.bf().madd("key", "v1", "v2", "v2") == [1, 1, 0]
    assert r.bf().madd("key", "v1", "v2", "v4") == [0, 0, 1]

    r.set("key1", "value")
    with pytest.raises(redis.exceptions.ResponseError):
        r.bf().add("key1", "v")


@pytest.mark.unsupported_server_types("dragonfly")
def test_bf_card(r: redis.Redis):
    assert r.bf().madd("key", "v1", "v2", "v3") == [1, 1, 1]
    assert r.bf().card("key") == 3
    assert r.bf().card("key-new") == 0

    r.set("key1", "value")
    with pytest.raises(redis.exceptions.ResponseError):
        r.bf().card("key1")
    # return 0 if the key does not exist
    assert r.bf().card("not_exist") == 0

    # Store a filter
    assert r.bf().add("bf1", "item_foo") == 1
    assert r.bf().card("bf1") == 1

    # Error when key is of a type other than Bloom filter.
    with pytest.raises(redis.ResponseError):
        r.set("setKey", "value")
        r.bf().card("setKey")


def test_bf_exists(r: redis.Redis):
    assert r.bf().madd("key", "v1", "v2", "v3") == [1, 1, 1]
    assert r.bf().exists("key", "v1") == 1
    assert r.bf().exists("key", "v5") == 0
    assert r.bf().exists("key-new", "v5") == 0

    r.set("key1", "value")
    with pytest.raises(redis.exceptions.ResponseError):
        r.bf().add("key1", "v")


def test_bf_mexists(r: redis.Redis):
    assert r.bf().madd("key", "v1", "v2", "v3") == [1, 1, 1]
    assert r.bf().mexists("key", "v1") == [
        1,
    ]
    assert r.bf().mexists("key", "v1", "v5") == [1, 0]
    assert r.bf().mexists("key-new", "v5") == [
        0,
    ]

    r.set("key1", "value")
    with pytest.raises(redis.exceptions.ResponseError):
        r.bf().add("key1", "v")


@pytest.mark.unsupported_server_types("dragonfly")
def test_bf_insert(r: redis.Redis):
    assert r.bf().create("key", 0.01, 1000)
    assert r.bf().insert("key", ["foo"]) == [1]
    assert r.bf().insert("key", ["foo", "bar"]) == [0, 1]
    assert r.bf().insert("captest", ["foo"], capacity=10) == [1]
    assert r.bf().insert("errtest", ["foo"], error=0.01) == [1]
    assert r.bf().exists("key", "foo") == 1
    assert r.bf().exists("key", "noexist") == 0
    assert r.bf().mexists("key", "foo", "noexist") == [1, 0]
    with pytest.raises(redis.exceptions.ResponseError, match=msgs.NOT_FOUND_MSG):
        r.bf().insert("nocreate", [1, 2, 3], noCreate=True)
    # with pytest.raises(redis.exceptions.ResponseError, match=msgs.NONSCALING_FILTERS_CANNOT_EXPAND_MSG):
    #     r.bf().insert("nocreate", [1, 2, 3], expansion=2, noScale=True)
    assert r.bf().create("bloom", 0.01, 1000)
    assert [1] == intlist(r.bf().insert("bloom", ["foo"]))
    assert [0, 1] == intlist(r.bf().insert("bloom", ["foo", "bar"]))
    assert 1 == r.bf().exists("bloom", "foo")
    assert 0 == r.bf().exists("bloom", "noexist")
    assert [1, 0] == intlist(r.bf().mexists("bloom", "foo", "noexist"))
    info = r.bf().info("bloom")
    assert 2 == info.get("insertedNum")
    assert 1000 == info.get("capacity")
    assert 1 == info.get("filterNum")


@pytest.mark.unsupported_server_types("dragonfly")
def test_bf_scandump_and_loadchunk(r: redis.Redis):
    r.bf().create("myBloom", "0.0001", "1000")

    # Test is probabilistic and might fail. It is OK to change variables if
    # certain to not break anything

    res = 0
    for x in range(1000):
        r.bf().add("myBloom", x)
        assert r.bf().exists("myBloom", x)
        rv = r.bf().exists("myBloom", f"nonexist_{x}")
        res += rv == x
    assert res < 5

    cmds = list()
    first = 0
    while first is not None:
        cur = r.bf().scandump("myBloom", first)
        if cur[0] == 0:
            first = None
        else:
            first = cur[0]
            cmds.append(cur)

    # Remove the filter
    r.bf().client.delete("myBloom")

    # Now, load all the commands:
    for cmd in cmds:
        r.bf().loadchunk("myBloom1", *cmd)

    for x in range(1000):
        assert r.bf().exists("myBloom1", x), f"{x} not in filter"


@pytest.mark.unsupported_server_types("dragonfly")
def test_bf_info(r: redis.Redis):
    # Store a filter
    r.bf().create("nonscaling", "0.0001", "1000", noScale=True)
    info: BFInfo = r.bf().info("nonscaling")
    assert info.expansionRate is None

    expansion = 4
    r.bf().create("expanding", "0.0001", "1000", expansion=expansion)
    info = r.bf().info("expanding")
    assert info.expansionRate == 4
    assert info.capacity == 1000
    assert info.insertedNum == 0


================================================
FILE: tests/fakeredis/test/test_stack/test_cms.py
================================================
import pytest
import redis

from test import testtools

json_tests = pytest.importorskip("probables")

pytestmark = []


def test_cms_create(r: redis.Redis):
    assert r.cms().initbydim("cmsDim", 100, 5)
    assert r.cms().initbyprob("cmsProb", 0.01, 0.01)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbydim("cmsDim", 1, 5)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbydim("cmsDim2", 0, 5)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbydim("cmsDim2", 3, 0)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbyprob("cmsProb", 0.01, 0.1)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbyprob("cmsProb2", 2, 0.01)

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().initbyprob("cmsProb2", 0.01, 0)


def test_cms_incrby(r: redis.Redis):
    assert r.cms().initbydim("cmsDim", 100, 5)
    assert r.cms().initbyprob("cmsProb", 0.01, 0.01)

    assert r.cms().incrby("cmsDim", ["foo"], [3]) == [3]
    assert r.cms().incrby("cmsDim", ["foo", "bar"], [4, 1]) == [7, 1]
    assert r.cms().query("cmsDim", "foo") == [7]
    assert r.cms().query("cmsDim", "foo", "bar") == [7, 1]
    assert r.cms().query("cmsDim", "noexist") == [0]

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().query("cmsDim")

    with pytest.raises(redis.exceptions.ResponseError):
        r.cms().query("noexist", "foo")

    with pytest.raises(redis.exceptions.ResponseError):
        testtools.raw_command(r, "CMS.INCRBY", "cmsDim", "foo", 1, "bar")

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: key does not exist"):
        r.cms().incrby("noexist", ["foo", "bar"], [3, 4])

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: Cannot parse number"):
        r.cms().incrby("cmsDim", ["foo", "bar"], [3, "four"])


def test_cms_merge(r: redis.Redis):
    assert r.cms().initbydim("cmsDim", 100, 5)
    assert r.cms().initbydim("cms2", 100, 5)

    assert r.cms().incrby("cmsDim", ["foo"], [3]) == [3]
    assert r.cms().incrby("cms2", ["foo", "bar"], [4, 1]) == [4, 1]
    assert r.cms().merge("cmsDim", 1, ["cms2"])
    assert r.cms().query("cmsDim", "foo", "bar") == [4, 1]

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: key does not exist"):
        r.cms().merge("noexist", 1, ["cms2"])

    # This shared test hard-coded one error string, but the FakeStrictRedis run can raise a different one.
    with pytest.raises(
        redis.exceptions.ResponseError,
        match=r"CMS: (wrong number of keys|Number of keys must be positive)",
    ):
        r.cms().merge("cms2", 0, ["cmsDim"])

    with pytest.raises(
        redis.exceptions.ResponseError,
        match="wrong number of arguments for '.*' command",
    ):
        r.cms().merge("cms2", 1, [])

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: wrong number of keys/weights"):
        r.cms().merge("cmsDim", 1, ["cms2", "cms1"], [4, 3])

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: key does not exist"):
        r.cms().merge("cmsDim", 2, ["cms2", "noexist"], [4, 3])


def test_cms_info(r: redis.Redis):
    assert r.cms().initbydim("A", 1000, 5)
    assert r.cms().initbydim("B", 1000, 5)
    assert r.cms().initbydim("C", 1000, 5)

    assert r.cms().incrby("A", ["foo", "bar", "baz"], [5, 3, 9])
    assert r.cms().incrby("B", ["foo", "bar", "baz"], [2, 3, 1])
    assert r.cms().query("A", "foo", "bar", "baz") == [5, 3, 9]
    assert r.cms().query("B", "foo", "bar", "baz") == [2, 3, 1]
    assert r.cms().merge("C", 2, ["A", "B"])
    assert r.cms().query("C", "foo", "bar", "baz") == [7, 6, 10]

    assert r.cms().merge("C", 2, ["A", "B"], ["1", "2"])
    assert r.cms().query("C", "foo", "bar", "baz") == [9, 9, 11]

    assert r.cms().merge("C", 2, ["A", "B"], ["2", "3"])
    assert r.cms().query("C", "foo", "bar", "baz") == [16, 15, 21]
    info = r.cms().info("A")
    assert info.width == 1000
    assert info.depth == 5
    assert info.count == 17

    with pytest.raises(redis.exceptions.ResponseError, match="CMS: key does not exist"):
        r.cms().info("noexist")


@pytest.mark.xfail(reason="Bug in pyprobables")
def test_cms_merge_fail(r: redis.Redis):
    assert r.cms().initbydim("A", 1000, 5)
    assert r.cms().initbydim("B", 1000, 5)
    assert r.cms().initbydim("C", 1000, 5)

    assert r.cms().incrby("A", ["foo", "bar", "baz"], [5, 3, 9])
    assert r.cms().incrby("B", ["foo", "bar", "baz"], [2, 3, 1])
    assert r.cms().query("A", "foo", "bar", "baz") == [5, 3, 9]
    assert r.cms().query("B", "foo", "bar", "baz") == [2, 3, 1]
    assert r.cms().merge("C", 2, ["A", "B"])
    assert r.cms().query("C", "foo", "bar", "baz") == [7, 6, 10]

    assert r.cms().merge("C", 2, ["A", "B"], ["2", "3"])
    info = r.cms().info("C")
    assert info.width == 1000
    assert info.depth == 5
    assert info.count == 52


================================================
FILE: tests/fakeredis/test/test_stack/test_cuckoofilter.py
================================================
import pytest
import redis

cuckoofilters_tests = pytest.importorskip("probables")

topk_tests = pytest.importorskip("probables")

pytestmark = []
pytestmark.extend(
    [
        pytest.mark.unsupported_server_types("dragonfly"),
    ]
)


def test_cf_add_and_insert(r: redis.Redis):
    assert r.cf().create("cuckoo", 1000)
    assert r.cf().add("cuckoo", "filter")
    assert not r.cf().addnx("cuckoo", "filter")
    assert 1 == r.cf().addnx("cuckoo", "newItem")
    assert [1] == r.cf().insert("captest", ["foo"])
    assert [1] == r.cf().insert("captest", ["foo"], capacity=1000)
    assert [1] == r.cf().insertnx("captest", ["bar"])
    assert [1] == r.cf().insertnx("captest", ["food"], nocreate="1")
    assert [0, 0, 1] == r.cf().insertnx("captest", ["foo", "bar", "baz"])
    assert [0] == r.cf().insertnx("captest", ["bar"], capacity=1000)
    assert [1] == r.cf().insert("empty1", ["foo"], capacity=1000)
    assert [1] == r.cf().insertnx("empty2", ["bar"], capacity=1000)
    info = r.cf().info("captest")
    assert info.get("insertedNum") == 5
    assert info.get("deletedNum") == 0
    assert info.get("filterNum") == 1


def test_create_cf(r: redis.Redis):
    assert r.cf().create("cuckoo", 1000)
    assert r.cf().create("cuckoo_e", 1000, expansion=1)
    assert r.cf().create("cuckoo_bs", 1000, bucket_size=4)
    assert r.cf().create("cuckoo_mi", 1000, max_iterations=10)
    assert r.cms().initbydim("cmsDim", 100, 5)
    assert r.cms().initbyprob("cmsProb", 0.01, 0.01)
    assert r.topk().reserve("topk", 5, 100, 5, 0.9)


def test_cf_exists_and_del(r: redis.Redis):
    assert r.cf().create("cuckoo", 1000)
    assert r.cf().add("cuckoo", "filter")
    assert r.cf().exists("cuckoo", "filter")
    assert not r.cf().exists("cuckoo", "notexist")
    assert [1, 0] == r.cf().mexists("cuckoo", "filter", "notexist")
    assert 1 == r.cf().count("cuckoo", "filter")
    assert 0 == r.cf().count("cuckoo", "notexist")
    assert r.cf().delete("cuckoo", "filter")
    assert 0 == r.cf().count("cuckoo", "filter")


================================================
FILE: tests/fakeredis/test/test_stack/test_tdigest.py
================================================
from math import inf

import pytest
import redis

topk_tests = pytest.importorskip("probables")
pytestmark = []
pytestmark.extend(
    [
        pytest.mark.unsupported_server_types("dragonfly"),
    ]
)


def test_tdigest_reset(r: redis.Redis):
    assert r.tdigest().create("tDigest", 10)
    # reset on empty histogram
    assert r.tdigest().reset("tDigest")
    # insert data-points into sketch
    assert r.tdigest().add("tDigest", list(range(10)))

    assert r.tdigest().reset("tDigest")
    # assert we have 0 unmerged
    info = r.tdigest().info("tDigest")
    assert 0 == info.get("unmerged_weight")


def test_tdigest_merge(r: redis.Redis):
    assert r.tdigest().create("to-tDigest", 10)
    assert r.tdigest().create("from-tDigest", 10)
    # insert data-points into sketch
    assert r.tdigest().add("from-tDigest", [1.0] * 10)
    assert r.tdigest().add("to-tDigest", [2.0] * 10)
    # merge from-tdigest into to-tdigest
    assert r.tdigest().merge("to-tDigest", 1, "from-tDigest")
    # we should now have 110 weight on to-histogram
    info = r.tdigest().info("to-tDigest")
    assert 20 == float(info["merged_weight"]) + float(info["unmerged_weight"])
    # test override
    assert r.tdigest().create("from-override", 10)
    assert r.tdigest().create("from-override-2", 10)
    assert r.tdigest().add("from-override", [3.0] * 10)
    assert r.tdigest().add("from-override-2", [4.0] * 10)
    assert r.tdigest().merge(
        "to-tDigest", 2, "from-override", "from-override-2", override=True
    )
    assert 3.0 == r.tdigest().min("to-tDigest")
    assert 4.0 == r.tdigest().max("to-tDigest")


def test_tdigest_min_and_max(r: redis.Redis):
    assert r.tdigest().create("tDigest", 100)
    # insert data-points into sketch
    assert r.tdigest().add("tDigest", [1, 2, 3])
    # min/max
    assert 3 == r.tdigest().max("tDigest")
    assert 1 == r.tdigest().min("tDigest")


def test_tdigest_quantile(r: redis.Redis):
    assert r.tdigest().create("tDigest", 500)
    # insert data-points into sketch
    assert r.tdigest().add("tDigest", list([x * 0.01 for x in range(1, 10000)]))
    # assert min min/max have same result as quantile 0 and 1
    res = r.tdigest().quantile("tDigest", 1.0)
    assert r.tdigest().max("tDigest") == res[0]
    res = r.tdigest().quantile("tDigest", 0.0)
    assert r.tdigest().min("tDigest") == res[0]

    assert 1.0 == round(r.tdigest().quantile("tDigest", 0.01)[0], 2)
    assert 99.0 == round(r.tdigest().quantile("tDigest", 0.99)[0], 2)

    # test multiple quantiles
    assert r.tdigest().create("t-digest", 100)
    assert r.tdigest().add("t-digest", [1, 2, 3, 4, 5])
    assert [3.0, 5.0] == r.tdigest().quantile("t-digest", 0.5, 0.8)


def test_tdigest_cdf(r: redis.Redis):
    assert r.tdigest().create("tDigest", 100)
    # insert data-points into sketch
    assert r.tdigest().add("tDigest", list(range(1, 10)))
    assert 0.1 == round(r.tdigest().cdf("tDigest", 1.0)[0], 1)
    assert 0.9 == round(r.tdigest().cdf("tDigest", 9.0)[0], 1)
    res = r.tdigest().cdf("tDigest", 1.0, 9.0)
    assert [0.1, 0.9] == [round(x, 1) for x in res]


def test_tdigest_trimmed_mean(r: redis.Redis):
    assert r.tdigest().create("tDigest", 100)
    # insert data-points into sketch
    assert r.tdigest().add("tDigest", list(range(1, 10)))
    assert 5 == r.tdigest().trimmed_mean("tDigest", 0.1, 0.9)
    assert 4.5 == r.tdigest().trimmed_mean("tDigest", 0.4, 0.5)


def test_tdigest_rank(r: redis.Redis):
    assert r.tdigest().create("t-digest", 500)
    assert r.tdigest().add("t-digest", list(range(0, 20)))
    assert -1 == r.tdigest().rank("t-digest", -1)[0]
    assert 0 == r.tdigest().rank("t-digest", 0)[0]
    assert 10 == r.tdigest().rank("t-digest", 10)[0]
    assert [-1, 20, 9] == r.tdigest().rank("t-digest", -20, 20, 9)


def test_tdigest_revrank(r: redis.Redis):
    assert r.tdigest().create("t-digest", 500)
    assert r.tdigest().add("t-digest", list(range(0, 20)))
    assert -1 == r.tdigest().revrank("t-digest", 20)[0]
    assert 19 == r.tdigest().revrank("t-digest", 0)[0]
    assert [-1, 19, 9] == r.tdigest().revrank("t-digest", 21, 0, 10)


def test_tdigest_byrank(r: redis.Redis):
    assert r.tdigest().create("t-digest", 500)
    assert r.tdigest().add("t-digest", list(range(1, 11)))
    assert 1 == r.tdigest().byrank("t-digest", 0)[0]
    assert 10 == r.tdigest().byrank("t-digest", 9)[0]
    assert r.tdigest().byrank("t-digest", 100)[0] == inf
    with pytest.raises(redis.ResponseError):
        r.tdigest().byrank("t-digest", -1)[0]


def test_tdigest_byrevrank(r: redis.Redis):
    assert r.tdigest().create("t-digest", 500)
    assert r.tdigest().add("t-digest", list(range(1, 11)))
    assert 10 == r.tdigest().byrevrank("t-digest", 0)[0]
    assert 1 == r.tdigest().byrevrank("t-digest", 9)[0]
    assert r.tdigest().byrevrank("t-digest", 100)[0] == -inf
    with pytest.raises(redis.ResponseError):
        r.tdigest().byrevrank("t-digest", -1)[0]


================================================
FILE: tests/fakeredis/test/test_stack/test_topk.py
================================================
import pytest
import redis

topk_tests = pytest.importorskip("probables")

pytestmark = []
pytestmark.extend(
    [
        pytest.mark.unsupported_server_types("dragonfly"),
    ]
)


def test_topk_incrby(r: redis.Redis):
    assert r.topk().reserve("topk", 3, 10, 3, 1)
    assert [None, None, None] == r.topk().incrby(
        "topk", ["bar", "baz", "42"], [3, 6, 2]
    )
    assert [None, "bar"] == r.topk().incrby("topk", ["42", "xyzzy"], [8, 4])
    with pytest.deprecated_call():
        assert [3, 6, 10, 4, 0] == r.topk().count(
            "topk", "bar", "baz", "42", "xyzzy", 4
        )


def test_topk(r: redis.Redis):
    # test list with empty buckets
    assert r.topk().reserve("topk", 3, 50, 4, 0.9)
    ret = r.topk().add(
        "topk",
        "A",
        "B",
        "C",
        "D",
        "D",
        "E",
        "A",
        "A",
        "B",
        "C",
        "G",
        "D",
        "B",
        "D",
        "A",
        "E",
        "E",
        1,
    )
    assert len(ret) == 18

    with pytest.deprecated_call():
        assert r.topk().count("topk", "A", "B", "C", "D", "E", "F", "G") == [
            4,
            3,
            2,
            4,
            3,
            0,
            1,
        ]
    ret = r.topk().query("topk", "A", "B", "C", "D", "E", "F", "G")
    assert (ret == [1, 0, 0, 1, 1, 0, 0]) or (ret == [1, 1, 0, 1, 0, 0, 0])
    # test full list
    assert r.topk().reserve("topklist", 3, 50, 3, 0.9)
    assert r.topk().add(
        "topklist",
        "A",
        "B",
        "D",
        "E",
        "A",
        "A",
        "B",
        "C",
        "G",
        "D",
        "B",
        "A",
        "B",
        "E",
        "E",
    )
    with pytest.deprecated_call():
        assert r.topk().count("topklist", "A", "B", "C", "D", "E", "F", "G") == [
            4,
            4,
            1,
            2,
            3,
            0,
            1,
        ]
    assert r.topk().list("topklist") == ["A", "B", "E"]
    assert r.topk().list("topklist", withcount=True) == ["A", 4, "B", 4, "E", 3]
    info = r.topk().info("topklist")
    assert 3 == info["k"]
    assert 50 == info["width"]
    assert 3 == info["depth"]
    assert 0.9 == round(float(info["decay"]), 1)


================================================
FILE: tests/fakeredis/test/test_transactions.py
================================================
from __future__ import annotations

import fakeredis
import pytest
import redis
import redis.client

from . import testtools


def test_multiple_successful_watch_calls(r: redis.Redis):
    p = r.pipeline()
    p.watch("bam")
    p.multi()
    p.set("foo", "bar")
    # Check that the watched keys buffer has been emptied.
    p.execute()

    # bam is no longer being watched, so it's ok to modify
    # it now.
    p.watch("foo")
    r.set("bam", "boo")
    p.multi()
    p.set("foo", "bats")
    assert p.execute() == [True]


def test_watch_state_is_cleared_after_abort(r: redis.Redis):
    # redis-py's pipeline handling and connection pooling interferes with this
    # test, so raw commands are used instead.
    testtools.raw_command(r, "watch", "foo")
    testtools.raw_command(r, "multi")
    with pytest.raises(redis.ResponseError):
        testtools.raw_command(r, "mget")  # Wrong number of arguments
    with pytest.raises(redis.exceptions.ExecAbortError):
        testtools.raw_command(r, "exec")

    testtools.raw_command(
        r, "set", "foo", "bar"
    )  # Should NOT trigger the watch from earlier
    testtools.raw_command(r, "multi")
    testtools.raw_command(r, "set", "abc", "done")
    testtools.raw_command(r, "exec")

    assert r.get("abc") == b"done"


def test_pipeline_transaction_shortcut(r: redis.Redis):
    # This example taken pretty much from the redis-py documentation.
    r.set("OUR-SEQUENCE-KEY", 13)
    calls = []

    def client_side_incr(pipe):
        calls.append((pipe,))
        current_value = pipe.get("OUR-SEQUENCE-KEY")
        next_value = int(current_value) + 1

        if len(calls) < 3:
            # Simulate a change from another thread.
            r.set("OUR-SEQUENCE-KEY", next_value)

        pipe.multi()
        pipe.set("OUR-SEQUENCE-KEY", next_value)

    res = r.transaction(client_side_incr, "OUR-SEQUENCE-KEY")

    assert res == [True]
    assert int(r.get("OUR-SEQUENCE-KEY")) == 16
    assert len(calls) == 3


def test_pipeline_transaction_value_from_callable(r: redis.Redis):
    def callback(pipe):
        # No need to do anything here since we only want the return value
        return "OUR-RETURN-VALUE"

    res = r.transaction(callback, "OUR-SEQUENCE-KEY", value_from_callable=True)
    assert res == "OUR-RETURN-VALUE"


def test_pipeline_empty(r: redis.Redis):
    p = r.pipeline()
    assert len(p) == 0


def test_pipeline_length(r: redis.Redis):
    p = r.pipeline()
    p.set("baz", "quux").get("baz")
    assert len(p) == 2


def test_pipeline_no_commands(r: redis.Redis):
    # Prior to 3.4, redis-py's execute is a nop if there are no commands
    # queued, so it succeeds even if watched keys have been changed.
    r.set("foo", "1")
    p = r.pipeline()
    p.watch("foo")
    r.set("foo", "2")
    with pytest.raises(redis.WatchError):
        p.execute()


def test_pipeline_failed_transaction(r: redis.Redis):
    p = r.pipeline()
    p.multi()
    p.set("foo", "bar")
    # Deliberately induce a syntax error
    p.execute_command("set")
    # It should be an ExecAbortError, but redis-py tries to DISCARD after the
    # failed EXEC, which raises a ResponseError.
    with pytest.raises(redis.ResponseError):
        p.execute()
    assert not r.exists("foo")


def test_pipeline_srem_no_change(r: redis.Redis):
    # A regression test for a case picked up by hypothesis tests.
    p = r.pipeline()
    p.watch("foo")
    r.srem("foo", "bar")
    p.multi()
    p.set("foo", "baz")
    p.execute()
    assert r.get("foo") == b"baz"


# The behaviour changed in redis 6.0 (see https://github.com/redis/redis/issues/6594).
@pytest.mark.min_server("6.0")
def test_pipeline_move(r: redis.Redis):
    # A regression test for a case picked up by hypothesis tests.
    r.set("foo", "bar")
    p = r.pipeline()
    p.watch("foo")
    r.move("foo", 1)
    # Ensure the transaction isn't empty, which had different behaviour in
    # older versions of redis-py.
    p.multi()
    p.set("bar", "baz")
    with pytest.raises(redis.exceptions.WatchError):
        p.execute()


@pytest.mark.min_server("6.0.6")
def test_exec_bad_arguments(r: redis.Redis):
    # Redis 6.0.6 changed the behaviour of exec so that it always fails with
    # EXECABORT, even when it's just bad syntax.
    with pytest.raises(redis.exceptions.ExecAbortError):
        r.execute_command("exec", "blahblah")


@pytest.mark.min_server("6.0.6")
def test_exec_bad_arguments_abort(r: redis.Redis):
    r.execute_command("multi")
    with pytest.raises(redis.exceptions.ExecAbortError):
        r.execute_command("exec", "blahblah")
    # Should have aborted the transaction, so we can run another one
    p = r.pipeline()
    p.multi()
    p.set("bar", "baz")
    p.execute()
    assert r.get("bar") == b"baz"


def test_pipeline(r: redis.Redis):
    # The pipeline method returns an object for
    # issuing multiple commands in a batch.
    p = r.pipeline()
    p.watch("bam")
    p.multi()
    p.set("foo", "bar").get("foo")
    p.lpush("baz", "quux")
    p.lpush("baz", "quux2").lrange("baz", 0, -1)
    res = p.execute()

    # Check return values returned as list.
    assert res == [True, b"bar", 1, 2, [b"quux2", b"quux"]]

    # Check side effects happened as expected.
    assert r.lrange("baz", 0, -1) == [b"quux2", b"quux"]

    # Check that the command buffer has been emptied.
    assert p.execute() == []


def test_pipeline_ignore_errors(r: redis.Redis):
    """Test the pipeline ignoring errors when asked."""
    with r.pipeline() as p:
        p.set("foo", "bar")
        p.rename("baz", "bats")
        with pytest.raises(redis.exceptions.ResponseError):
            p.execute()
        assert [] == p.execute()
    with r.pipeline() as p:
        p.set("foo", "bar")
        p.rename("baz", "bats")
        res = p.execute(raise_on_error=False)

        assert [] == p.execute()

        assert len(res) == 2
        assert isinstance(res[1], redis.exceptions.ResponseError)


def test_pipeline_non_transactional(r: redis.Redis):
    # For our simple-minded model I don't think
    # there is any observable difference.
    p = r.pipeline(transaction=False)
    res = p.set("baz", "quux").get("baz").execute()

    assert res == [True, b"quux"]


def test_pipeline_raises_when_watched_key_changed(r: redis.Redis):
    r.set("foo", "bar")
    r.rpush("greet", "hello")
    p = r.pipeline()
    try:
        p.watch("greet", "foo")
        nextf = bytes(p.get("foo")) + b"baz"
        # Simulate change happening on another thread.
        r.rpush("greet", "world")
        # Begin pipelining.
        p.multi()
        p.set("foo", nextf)

        with pytest.raises(redis.WatchError):
            p.execute()
    finally:
        p.reset()


def test_pipeline_succeeds_despite_unwatched_key_changed(r: redis.Redis):
    # Same setup as before except for the params to the WATCH command.
    r.set("foo", "bar")
    r.rpush("greet", "hello")
    p = r.pipeline()
    try:
        # Only watch one of the 2 keys.
        p.watch("foo")
        nextf = bytes(p.get("foo")) + b"baz"
        # Simulate change happening on another thread.
        r.rpush("greet", "world")
        p.multi()
        p.set("foo", nextf)
        p.execute()

        # Check the commands were executed.
        assert r.get("foo") == b"barbaz"
    finally:
        p.reset()


def test_pipeline_succeeds_when_watching_nonexistent_key(r: redis.Redis):
    r.set("foo", "bar")
    r.rpush("greet", "hello")
    p = r.pipeline()
    try:
        # Also watch a nonexistent key.
        p.watch("foo", "bam")
        nextf = bytes(p.get("foo")) + b"baz"
        # Simulate change happening on another thread.
        r.rpush("greet", "world")
        p.multi()
        p.set("foo", nextf)
        p.execute()

        # Check the commands were executed.
        assert r.get("foo") == b"barbaz"
    finally:
        p.reset()


def test_watch_state_is_cleared_across_multiple_watches(r: redis.Redis):
    r.set("foo", "one")
    r.set("bar", "baz")
    p = r.pipeline()

    try:
        p.watch("foo")
        # Simulate change happening on another thread.
        r.set("foo", "three")
        p.multi()
        p.set("foo", "three")
        with pytest.raises(redis.WatchError):
            p.execute()

        # Now watch another key.  It should be ok to change
        # foo as we're no longer watching it.
        p.watch("bar")
        r.set("foo", "four")
        p.multi()
        p.set("bar", "five")
        assert p.execute() == [True]
    finally:
        p.reset()


@pytest.mark.fake
def test_socket_cleanup_watch(fake_server):
    r1 = fakeredis.FakeStrictRedis(server=fake_server)
    r2 = fakeredis.FakeStrictRedis(server=fake_server)
    pipeline = r1.pipeline(transaction=False)
    # This needs some poking into redis-py internals to ensure that we reach
    # FakeSocket._cleanup. We need to close the socket while there is still
    # a watch in place, but not allow it to be garbage collected (hence we
    # set 'sock' even though it is unused).
    with pipeline:
        pipeline.watch("test")
        sock = pipeline.connection._sock  # noqa: F841
        pipeline.connection.disconnect()
    r2.set("test", "foo")


def test_get_within_pipeline(r: redis.Redis):
    r.set("test", "foo")
    r.set("test2", "foo2")
    expected_keys = set(r.keys())
    with r.pipeline() as p:
        assert set(r.keys()) == expected_keys
        p.watch("test")
        assert set(r.keys()) == expected_keys


@pytest.mark.fake
def test_get_within_pipeline_w_host():
    r = fakeredis.FakeRedis("localhost")
    r.set("test", "foo")
    r.set("test2", "foo2")
    expected_keys = set(r.keys())
    with r.pipeline() as p:
        assert set(r.keys()) == expected_keys
        p.watch("test")
        assert set(r.keys()) == expected_keys


@pytest.mark.fake
def test_get_within_pipeline_no_args():
    r = fakeredis.FakeRedis()
    r.set("test", "foo")
    r.set("test2", "foo2")
    expected_keys = set(r.keys())
    with r.pipeline() as p:
        assert set(r.keys()) == expected_keys
        p.watch("test")
        assert set(r.keys()) == expected_keys


================================================
FILE: tests/fakeredis/test/testtools.py
================================================
import importlib.util

import pytest
import redis
from packaging.version import Version

REDIS_VERSION = Version(redis.__version__)


def key_val_dict(size=100):
    return {f"key:{i}".encode(): f"val:{i}".encode() for i in range(size)}


def raw_command(r: redis.Redis, *args):
    """Like execute_command, but does not do command-specific response parsing"""
    response_callbacks = r.response_callbacks
    try:
        r.response_callbacks = {}
        return r.execute_command(*args)
    finally:
        r.response_callbacks = response_callbacks


ALLOWED_CONDITIONS = {"eq", "gte", "lte", "lt", "gt", "ne"}


def run_test_if_redispy_ver(condition: str, ver: str):
    if condition not in ALLOWED_CONDITIONS:
        raise ValueError(
            f"condition {condition} is not in allowed conditions ({ALLOWED_CONDITIONS})"
        )
    cond = False
    cond = cond or condition == "eq" and REDIS_VERSION == Version(ver)
    cond = cond or condition == "gte" and REDIS_VERSION >= Version(ver)
    cond = cond or condition == "lte" and REDIS_VERSION <= Version(ver)
    cond = cond or condition == "lt" and REDIS_VERSION < Version(ver)
    cond = cond or condition == "gt" and REDIS_VERSION > Version(ver)
    cond = cond or condition == "ne" and REDIS_VERSION != Version(ver)
    return pytest.mark.skipif(
        not cond,
        reason=f"Test is not applicable to redis-py {REDIS_VERSION} ({condition}, {ver})",
    )


_lua_module = importlib.util.find_spec("lupa")
run_test_if_lupa = pytest.mark.skipif(
    _lua_module is None, reason="Test is only applicable if lupa is installed"
)

fake_only = pytest.mark.parametrize(
    "create_redis",
    [pytest.param("FakeStrictRedis", marks=pytest.mark.fake)],
    indirect=True,
)


================================================
FILE: tests/integration/.dockerignore
================================================
Dockerfile
*.Dockerfile
stress_shutdown.sh
get_sets.sh
async.py
generate_sets.py
venv


================================================
FILE: tests/integration/.run_ioredis_valid_test.sh
================================================
#!/usr/bin/env bash

# The following tests are not supported
#"should reconnect if reconnectOnError
# supported in transaction blocks
# rejects when monitor is disabled
# should resend unfulfilled commands to the correct
# should set the name before any subscribe
# should name the connection if options
# scanStream
# should affect the old way
# should support Map
# should support object
# should batch all commands before ready event
# should support key prefixing for sort
# should be sent on the connect event

## Some issues that are still open need to be resolved such as
# https://github.com/dragonflydb/dragonfly/issues/457
# and https://github.com/dragonflydb/dragonfly/issues/458


# The follwing tests would pass once we support script flush command:
# does not fallback to EVAL in manual transaction
# does not fallback to EVAL in regular
# should reload scripts on redis restart (reconnect)"


TS_NODE_TRANSPILE_ONLY=true NODE_ENV=test mocha \
"test/helpers/*.ts" "test/unit/**/*.ts" "test/functional/**/*.ts" \
-g "should reload scripts on redis restart|should reconnect if reconnectOnError|should be supported in transaction blocks|rejects when monitor is disabled|should resend unfulfilled commands to the correct|should set the name before any subscribe|should name the connection if options|scanStream|does not fallback to EVAL|should try to use EVALSHA and fallback to EVAL|should use evalsha when script|should affect the old way|should support Map|should support object|should batch all commands before ready event|should support key prefixing for sort|should be sent on the connect event|spub|ssub|should support parallel script execution|works for moved" \
--invert


================================================
FILE: tests/integration/async.py
================================================
#!/usr/bin/env python3

"""
This is the script that helped to reproduce https://github.com/dragonflydb/dragonfly/issues/150
The outcome - stalled code with all its connections deadlocked.
Reproduced only with dragonfly in release mode on multi-core machine.
"""

import asyncio
import aioredis

from loguru import logger as log
import sys
import random

connection_pool = aioredis.ConnectionPool(
    host="localhost", port=6379, db=1, decode_responses=True, max_connections=16
)


key_index = 1


async def post_to_redis(sem, db_name, index):
    global key_index
    async with sem:
        results = None
        try:
            redis_client = aioredis.Redis(connection_pool=connection_pool)
            async with redis_client.pipeline(transaction=True) as pipe:
                for i in range(1, 15):
                    pipe.hsetnx(name=f"key_{key_index}", key="name", value="bla")
                    key_index += 1
                # log.info(f"after first half {key_index}")
                for i in range(1, 15):
                    pipe.hsetnx(name=f"bla_{key_index}", key="name2", value="bla")
                    key_index += 1
                assert len(pipe.command_stack) > 0
                log.info(f"before pipe.execute {key_index}")
                results = await pipe.execute()
                log.info(f"after pipe.execute {key_index}")
        finally:
            # log.info(f"before close {index}")
            await redis_client.aclose()
            # log.info(f"after close {index} {len(results)}")


async def do_concurrent(db_name):
    tasks = []
    sem = asyncio.Semaphore(10)
    for i in range(1, 3000):
        tasks.append(post_to_redis(sem, db_name, i))
    res = await asyncio.gather(*tasks)


if __name__ == "__main__":
    log.remove()
    log.add(sys.stdout, enqueue=True, level="INFO")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(do_concurrent("my_db"))


================================================
FILE: tests/integration/gen_sets.sh
================================================
#!/bin/bash

memtier_benchmark -p 6379 --command "sadd __key__ __data__"   -n 20 --threads=4 \
    -c 10 --command-key-pattern=R --distinct-client-seed -c 30 --data-size=64 \
    --key-prefix="key:"  --hide-histogram --random-data --key-maximum=10000


================================================
FILE: tests/integration/generate_sets.py
================================================
#!/usr/bin/env python3

import argparse
import random
import string
import redis as rclient
import uuid
import time


def fill_set(args, redis: rclient.Redis):
    for j in range(args.num):
        token = uuid.uuid1().hex
        # print(token)
        key = f"USER_OTP:{token}"
        arr = []
        for i in range(30):
            otp = "".join(random.choices(string.ascii_uppercase + string.digits, k=12))
            arr.append(otp)
        redis.execute_command("sadd", key, *arr)


def fill_hset(args, redis):
    for j in range(args.num):
        token = uuid.uuid1().hex
        key = f"USER_INFO:{token}"
        phone = f"555-999-{j}"
        user_id = "user" * 5 + f"-{j}"
        redis.hset(key, "phone", phone)
        redis.hset(key, "user_id", user_id)
        redis.hset(key, "login_time", time.time())


def main():
    parser = argparse.ArgumentParser(description="fill hset entities")
    parser.add_argument("-p", type=int, help="redis port", dest="port", default=6380)
    parser.add_argument("-n", type=int, help="number of keys", dest="num", default=10000)
    parser.add_argument(
        "--type", type=str, choices=["hset", "set"], help="set type", default="hset"
    )

    args = parser.parse_args()
    redis = rclient.Redis(host="localhost", port=args.port, db=0)
    if args.type == "hset":
        fill_hset(args, redis)
    elif args.type == "set":
        fill_set(args, redis)


if __name__ == "__main__":
    main()


================================================
FILE: tests/integration/ioredis.Dockerfile
================================================
# syntax=docker/dockerfile:1

FROM node:18.7.0
ENV NODE_ENV=development
ENV RUN_IN_DOCKER=1

WORKDIR /app

# Git
RUN apt update -y && apt install -y git

# The latest version from io-redis contain changes that we need to have
# to successfully run the tests
RUN git clone https://github.com/luin/ioredis

WORKDIR /app/ioredis

RUN npm install

# Script to run the tests that curretly pass successfully.
# Note that in DF we still don't have support for cluster and we
# want to skip tests such as elasticache, also we have some issues that
# need to be resolved such as
# https://github.com/dragonflydb/dragonfly/issues/457
# and https://github.com/dragonflydb/dragonfly/issues/458
ADD .run_ioredis_valid_test.sh run_tests.sh

ENTRYPOINT [ "npm", "run", "env", "--", "TS_NODE_TRANSPILE_ONLY=true", "NODE_ENV=test" ]


================================================
FILE: tests/integration/jedis.Dockerfile
================================================
# syntax=docker/dockerfile:1

FROM maven:3.8.6-jdk-11
ENV NODE_ENV=development

WORKDIR /app
# Clone jedis dragonfly fork
RUN git clone -b dragonfly https://github.com/dragonflydb/jedis.git

WORKDIR /app/jedis

# Build the client and tests
RUN mvn test -DskipTests 

# Run selected tests
CMD mvn surefire:test -Dtest="AllKindOfValuesCommandsTest,BitCommandsTest,ControlCommandsTest,ControlCommandsTest,HashesCommandsTest,ListCommandsTest,ScriptingCommandsTest,ScriptingCommandsTest,SetCommandsTest,SetCommandsTest,SetCommandsTest,TransactionCommandsTest,ClientCommandsTest,PublishSubscribeCommandsTest,SortedSetCommandsTest,SortingCommandsTest,StreamsCommandsTest" 


================================================
FILE: tests/integration/node-redis.Dockerfile
================================================
# syntax=docker/dockerfile:1

FROM node:18.7.0
ENV NODE_ENV=development

WORKDIR /app
# Clone node-redis dragonfly fork
RUN git clone -b dragonfly https://github.com/dragonflydb/node-redis.git

WORKDIR /app/node-redis

RUN npm install && npm run build:tests-tools

CMD npm run test -w ./packages/client -- --redis-version=2.8


================================================
FILE: tests/integration/pascaldekloe.Dockerfile
================================================
FROM golang:1.20

RUN git clone https://github.com/pascaldekloe/redis.git
WORKDIR redis

ENV TEST_REDIS_ADDR=localhost

CMD ["go", "test", "-v"]


================================================
FILE: tests/integration/relay.Dockerfile
================================================
# to build the test
# docker build --pull -t relay-test -f ./relay.Dockerfile .
# to run the test, start dragonfly locally with port 6379
# then
# docker run --network=host -t relay-test

FROM linuxmintd/mint21.2-amd64

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update

RUN add-apt-repository -y ppa:ondrej/php

RUN apt-get install -y \
  curl \
  php-dev

# Install Relay dependencies
RUN apt-get install -y \
  php-msgpack \
  php-igbinary

ARG RELAY=v0.6.8

# Download Relay
RUN PHP=$(php -r 'echo substr(PHP_VERSION, 0, 3);') \
  && curl -L "https://builds.r2.relay.so/$RELAY/relay-$RELAY-php$PHP-debian-x86-64+libssl3.tar.gz" | tar xz --strip-components=1 -C /tmp

# Copy relay.{so,ini}
RUN cp "/tmp/relay.ini" $(php-config --ini-dir)/30-relay.ini \
  && cp "/tmp/relay-pkg.so" $(php-config --extension-dir)/relay.so

# Inject UUID
RUN sed -i "s/00000000-0000-0000-0000-000000000000/$(cat /proc/sys/kernel/random/uuid)/" $(php-config --extension-dir)/relay.so

# needed by the Relay benchmark
RUN apt-get install -y composer php-curl

# checkout relay benchmark
RUN git clone https://github.com/cachewerk/relay.git

WORKDIR relay
RUN composer install

WORKDIR benchmarks

CMD ./run --filter '^(Relay)'


================================================
FILE: tests/integration/run_ioredis_on_docker.sh
================================================
#!/usr/bin/env bash
# Running this with --build would build the image as well
if [ "$1" = "--build" ]; then
    docker build -t ioredis-test -f ./ioredis.Dockerfile . || {
        echo "failed to build io redis image"
        exit 1
    }
fi

# run the tests
echo "running ioredis tests"
docker run --rm -i --network=host ioredis-test ./run_tests.sh
if [ $? -ne 0 ];then
	echo "some tests failed - please look at the output from this run"
	exit 1
else
	echo "finish runing tests successfully"
	exit 0
fi


================================================
FILE: tests/integration/stress_shutdown.sh
================================================
#!/bin/bash

while true; do
./dragonfly  --vmodule=accept_server=1,listener_interface=1 --logbuflevel=-1 &
DRAGON_PID=$!
echo "dragonfly pid $DRAGON_PID"

sleep 0.5

memtier_benchmark -p 6379 --ratio 1:0  -n 100000 --threads=2 --expiry-range=15-25  --distinct-client-seed \
                  --hide-histogram 2> /dev/null > /dev/null &
MEMT_ID=$!

echo "memtier pid $MEMT_ID"
echo "Running.............."
sleep 5
echo "killing dragonfly"

kill $DRAGON_PID
wait $DRAGON_PID

done

================================================
FILE: tests/pytest.ini
================================================
[pytest]
log_format = [%(asctime)s.%(msecs)03d %(levelname)s] %(message)s
log_cli_format = [%(asctime)s.%(msecs)03d %(levelname)s] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_file_level=INFO
log_cli = true
asyncio_mode=auto
addopts = -ra --emoji -m "not large"
markers =
# Tests that should only run on release builds and take significant amount of time to run.
# For example stress tests found in replication.
# `opt_only` runs only on regression-test and release workflow (skipped for ci)
  opt_only: marks tests that are only reasonable to run against an opt-built Dragonfly
# Usually tests that are known to fail for iouring and we skip them on epoll workflows
  exclude_epoll: marks tests that should not run on epoll socket
# Tests that should only run in debug mode because release builds are fast enough
# for their assertions to hold. They never run on release build.
  debug_only: mark tests that should run only in debug mode
# Heavy tests that require large runners and significant resources to complete.
# Run only in heavy-tests workflow on CI-LARGE runners.
  large: marks tests as large/heavy (deselect with '-m "not large"')
filterwarnings =
    ignore::DeprecationWarning


================================================
FILE: tools/balls_bins.py
================================================
#!/usr/bin/env python3

"""Simulate throwing balls into bins."""

import numpy as np
import argparse
import matplotlib.pyplot as plt


def simulate_balls_into_bins(balls: int, bins: int, threshold: int, exact, trials=10000):
    """Simulate throwing M balls into N bins for a given number of trials."""
    counts = np.zeros(bins, dtype=int)
    success = 0
    exact_success = 0
    deltas = []

    for _ in range(trials):
        # Reset counts for each trial
        counts.fill(0)

        # Throw M balls into the bins
        bins_seq = np.random.randint(0, bins, balls)
        unique, counts_bins = np.unique(bins_seq, return_counts=True)
        counts[unique] += counts_bins
        deltas.append(counts.max() - counts.min())
        # Check if any bin has K or more balls
        if np.any(counts >= threshold):
            success += 1
        if exact is not None:
            if np.any(counts == exact):
                exact_success += 1

    probability = success / trials
    return deltas, probability, exact_success / trials


def main():
    parser = argparse.ArgumentParser(description="Simulate throwing balls into bins.")
    parser.add_argument("--balls", type=int, default=30, help="Number of balls to throw.")
    parser.add_argument("--bins", type=int, default=3, help="Number of bins.")
    parser.add_argument(
        "--high-threshold",
        type=int,
        default=15,
        help="Minimum number of balls for the success condition",
    )
    parser.add_argument(
        "--exact-num", type=int, help="Exact number of balls for the success condition."
    )
    parser.add_argument(
        "--trials", type=int, default=10000, help="Number of trials. Default is 10,000."
    )

    args = parser.parse_args()

    deltas, atleast_p, exact_p = simulate_balls_into_bins(
        args.balls, args.bins, args.high_threshold, args.exact_num, args.trials
    )

    print(f"Probability that at least one bin has {args.high_threshold} or more balls: {atleast_p}")
    if args.exact_num is not None:
        print(f"Probability that at least one bin has {args.exact_num} balls: {exact_p}")

    print(
        f"Histogram of the difference between the most and least populated bins for {args.trials} trials"
    )
    plt.hist(deltas, bins=30, color="steelblue", edgecolor="none")
    plt.show()


if __name__ == "__main__":
    main()


================================================
FILE: tools/benchmark/k8s-benchmark-job.yaml
================================================
---
apiVersion: batch/v1
kind: Job
metadata:
  name: memtier-benchmark
spec:
  backoffLimit: 0
  template:
    spec:
      containers:
        - name: memtier
          image: redislabs/memtier_benchmark:latest
          args:
            - memtier_benchmark --pipeline=30 --key-maximum=100000 -c 10 -t 2 --test-time=600 --reconnect-interval=10000 --distinct-client-seed --hide-histogram -s dragonfly-sample
          command:
            - sh # This is important! without it memtier cannot DIG the dragonfly SVC domain
            - -c
          resources:
            requests:
              cpu: "2"
              memory: "500Mi"
            limits:
              cpu: "2"
              memory: "500Mi"
      restartPolicy: Never


================================================
FILE: tools/benchmark/post_run_checks.py
================================================
#!/usr/bin/env python3
import redis
import time


def main():
    max_unaccounted = 200 * 1024 * 1024  # 200mb

    client = redis.Redis(decode_responses=True)
    info = client.info("server")
    # Check version upgrade finsihed from last released version to last weekly docker build
    assert info["dragonfly_version"] == "df-HEAD-HASH-NOTFOUND"

    info = client.info("memory")
    print(f'Used memory {info["used_memory"]}, rss {info["used_memory_rss"]}')
    assert info["used_memory_rss"] - info["used_memory"] < max_unaccounted

    info = client.info("replication")
    assert info["role"] == "master"
    replication_state = info["slave0"]
    assert replication_state["state"] == "online"

    def is_zero_lag(replication_state):
        return replication_state["lag"] == 0

    # Wait for 10 seconds for lag to be zero
    for _ in range(10):
        if is_zero_lag(replication_state):
            break
        time.sleep(1)
        replication_state = client.info("replication")["slave0"]

    if replication_state["lag"] != 0:
        print(f"Lag is bad, expected 0, got {replication_state['lag']}")
        info = client.info("all")
        print(f"Info all output: {info}")
        assert False


if __name__ == "__main__":
    main()


================================================
FILE: tools/cache_logs_player.py
================================================
#!/usr/bin/env python3
import argparse
from datetime import datetime
import aioredis
import asyncio
from aiocsv import AsyncReader
import aiofiles

'''
To install: pip install -r requirements.txt
'''


class Command:
    args = None
    sync_id = 0 # Commands with the same sync_id will be executed synchrnously

class TwitterCacheTraceParser:
    """
    https://github.com/twitter/cache-trace
    """
    def parse(self, csv) -> Command:
        operation = csv[5]
        key = csv[1] + "a"
        value_size = int(csv[3])
        synthetic_value = "".zfill(value_size)

        client_id = csv[4]
        ttl = csv[6]

        cmd = Command()
        cmd.sync_id = client_id

        if operation == "get":
            cmd.args = ["GET", key]
        elif operation == 'gets':
            cmd.args = ["GET", key]
        elif operation == 'set':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'add':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'replace':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'cas':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'append':
            cmd.args = ["APPEND", key, synthetic_value]
        elif operation == 'prepend':
            cmd.args = ["SET", key, synthetic_value]
        elif operation == 'delete':
            cmd.args = ["DEL", key]
        elif operation == 'incr':
            cmd.args = ["INCR", key]
        elif operation == 'decr':
            cmd.args = ["DECR", key]

        return cmd

class AsyncWorker:
    QUEUE_SIZE = 100000

    def __init__(self, redis_client) -> None:
        self.queue = asyncio.Queue(self.QUEUE_SIZE)
        self.redis_client = redis_client
        self.working = False

    async def put(self, batch: list) -> None:
        await self.queue.put(batch)

    async def work(self) -> None:
        self.working = True
        while self.working or not self.queue.empty() :
            batch = await self.queue.get()
            await self.execute(batch)

    async def execute(self, batch) -> None:
        async with self.redis_client.pipeline(transaction=False) as pipe:
            for cmd in batch:
                pipe.execute_command(*cmd.args)
            await pipe.execute()

    def start(self) -> asyncio.Task:
        return asyncio.create_task(self.work())

    def stop(self) -> None:
        self.working = False

class AsyncWorkerPool:
    """
    Mangaes worker pool to send commands in parallel
    Maintains synchronous order for commands with the same sync_id
    """
    def __init__(self, redis_client, num_workers) -> None:
        self.redis_client = redis_client
        self.num_workers = num_workers
        self.workers = []
        self.tasks = []
        self.sync_id_to_worker = {}
        self.next_worker_index = -1

    def allocate(self, sync_id) -> AsyncWorker:
        if not sync_id in self.sync_id_to_worker:
            self.next_worker_index = (self.next_worker_index + 1) % self.num_workers

            if len(self.workers) <= self.next_worker_index:
                assert len(self.workers) == self.next_worker_index
                self.workers.append(AsyncWorker(self.redis_client))
                self.tasks.append(self.workers[self.next_worker_index].start())

            self.sync_id_to_worker[sync_id] = self.workers[self.next_worker_index]

        return self.sync_id_to_worker[sync_id]

    async def put(self, batch: list, sync_id: int) -> None:
        worker = self.allocate(sync_id)
        await worker.put(batch)

    async def stop(self):
        for worker in self.workers:
            worker.stop()
        await asyncio.gather(*self.tasks)


class AsyncPlayer:
    READ_BATCH_SIZE = 10 * 1000 * 1000

    def __init__(self, redis_uri, num_workers) -> None:
        self.redis_uri = redis_uri
        self.redis_client = aioredis.from_url(f"redis://{self.redis_uri}", encoding="utf-8", decode_responses=True)
        self.worker_pool = AsyncWorkerPool(self.redis_client, 100)

        self.batch_by_sync_id = {}

    async def dispatch_batches(self):
        for sync_id in self.batch_by_sync_id:
            await self.worker_pool.put(self.batch_by_sync_id[sync_id], sync_id)
        self.batch_by_sync_id.clear()

    async def read_and_dispatch(self, csv_file, parser):
        print(f"dispatching from {csv_file}")

        line_count = 0

        async with aiofiles.open(csv_file, mode="r", encoding="utf-8", newline="") as afp:
            async for row in AsyncReader(afp):
                cmd = parser.parse(row)
                if not self.batch_by_sync_id.get(cmd.sync_id):
                    self.batch_by_sync_id[cmd.sync_id] = []
                batch = self.batch_by_sync_id[cmd.sync_id]
                batch.append(cmd)
                line_count = line_count + 1
                if (line_count >= self.READ_BATCH_SIZE):
                    await self.dispatch_batches()
                    line_count = 0
            # handle the remaining lines
            await self.dispatch_batches()

    async def print_stats(self):
        info = await self.redis_client.execute_command("info", "stats")
        print(f"{datetime.now()}: {info}")

    async def report_stats(self):
        while True:
            self.print_stats()

    async def report_stats(self):
        while True:
            await self.print_stats()
            await asyncio.sleep(10)

    async def play(self, csv_file, parser) -> None:
        print(f"pinging {self.redis_uri} successful?")
        print(await self.redis_client.ping())

        read_dispatch_task = asyncio.create_task(self.read_and_dispatch(csv_file, parser))
        stats_task = asyncio.create_task(self.report_stats())

        await read_dispatch_task
        print(f"finished reading {csv_file}")

        await self.worker_pool.stop()
        stats_task.cancel()
        print("all done")
        await self.print_stats()

def main():
    parser = argparse.ArgumentParser(description='Cache Logs Player')
    parser.add_argument('-u', '--uri', type=str, default='localhost:6379', help='Redis server URI')
    parser.add_argument('-f', '--csv_file', type=str, default='/home/ari/Downloads/cluster017.csv', help='Redis server URI')
    parser.add_argument('--num_workers', type=int, default=100, help='Maximum number of workers sending commands in parllel')

    args = parser.parse_args()

    player = AsyncPlayer(redis_uri=args.uri, num_workers=args.num_workers)
    asyncio.run(player.play(args.csv_file, TwitterCacheTraceParser()))

if __name__ == "__main__":
    main()


================================================
FILE: tools/cache_testing.py
================================================
#!/usr/bin/env python

import redis
import aioredis
import asyncio
import argparse
import numpy as np

'''
Run Cache Testing.
This tool performs cache testing for Dragonfly
by calling the `incrby` function on a constrained set
of items, as defined by the user. Additionally, it
distributes the frequency of `incrby` calls for each
item based on a Zipfian distribution (with alpha values
between 0 and 1 being representative of real-life cache
load scenarios)
'''


def rand_zipf_generator(alpha: float, upper: int, batch: int):
    """
    n: The upper bound of the values to generate a zipfian distribution over
    (n = 30 would generate a distribution of given alpha from values 1 to 30)
    alpha: The alpha parameter to be used while creating the Zipfian distribution
    num_samples: The total number of samples to generate over the Zipfian distribution
    This is a generator that yields up to count values using a generator.
    """

    # Calculate Zeta values from 1 to n:
    tmp = np.power(np.arange(1, upper+1), -alpha)
    zeta = np.r_[0.0, np.cumsum(tmp)]

    # Store the translation map:
    distMap = [x / zeta[-1] for x in zeta]

    while True:
        # Generate an array of uniform 0-1 pseudo-random values:
        u = np.random.random(batch)

        # bisect them with distMap
        v = np.searchsorted(distMap, u)

        samples = [t-1 for t in v]
        yield samples


def update_stats(hits, misses, value_index, total_count):
    """
    A void function that uses terminal control sequences
    to update hit/miss ratio stats for the user
    while the testing tool runs.
    """
    percent_complete = (value_index + 1) / total_count

    # Use the terminal control sequence to move the cursor to the beginning of the line
    print("\r", end="")

    # Print the loading bar and current hit rate
    print("[{}{}] {:.0f}%, current hit rate: {:.6f}%".format("#" * int(percent_complete * 20), " " *
          int(20 - percent_complete * 20), percent_complete * 100, (hits / (hits + misses)) * 100), end="")


async def run_single_conn(redis_client, keys_gen, args) -> None:
    misses = 0
    hits = 0
    val = 'x' * args.length
    items_sent = 0
    last_stat = 0
    for keys in keys_gen:
        if len(keys) == 1:
            result = await redis_client.set(str(keys[0]), val, nx=True)
            responses = [result]
        else:
            p = redis_client.pipeline(transaction=False)
            for key in keys:
                p.set(str(key), val, nx=True)
            responses = await p.execute()

        for resp in responses:
            if resp:
                misses += 1
            else:
                hits += 1
        items_sent += len(keys)
        if items_sent // 100 != last_stat:
            last_stat = items_sent // 100
            update_stats(hits, misses, items_sent, args.count)
        if items_sent >= args.count:
            break
    print()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Cache Benchmark', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', '--count', type=int, default=100000,
                        help='total number of operations')
    parser.add_argument('-u', '--uri', type=str,
                        default='localhost:6379', help='Redis server URI')
    parser.add_argument('-a', '--alpha', type=float, default=1.0,
                        help='alpha value being used for the Zipf distribution')
    parser.add_argument('--upper_bound', type=int, default=1000,
                        help='the number of values to be used in the distribution')
    parser.add_argument('-d', '--length', type=int, default=10,
                        help='the length of the values to be used in the distribution')
    parser.add_argument('-p', '--pipeline', type=int,
                        default=1, help='pipeline size')
    parser.add_argument('-t', '--test', action='store_true')

    args = parser.parse_args()
    if args.test:
        for idx, items in enumerate(rand_zipf_generator(args.alpha, args.upper_bound, 1)):
            assert len(items) == 1
            print(items[0])
            if idx == args.count:
                break
        exit(0)

    r = aioredis.from_url(
        f"redis://{args.uri}", encoding="utf-8", decode_responses=True)

    distribution_keys_generator = rand_zipf_generator(
        args.alpha, args.upper_bound, args.pipeline)

    asyncio.run(run_single_conn(r, distribution_keys_generator, args))


================================================
FILE: tools/cluster_mgr.py
================================================
#!/usr/bin/env python3

import argparse
from argparse import RawTextHelpFormatter
import json
import math
from typing import Iterable, List
import redis
import subprocess
import time

"""
To install: pip install -r requirements.txt
"""


def die_with_err(err):
    print("!!!", err)
    exit(-1)


class Node:
    def __init__(self, host, port):
        self.id = ""
        self.host = host
        self.port = port

    def update_id(node):
        node.id = send_command(node, ["cluster", "myid"])
        print(f"- ID {node.id}")

    def __repr__(self):
        return f"{self.host}:{self.port}/{self.id}"

    def to_dict(self):
        return {"id": self.id, "ip": self.host, "port": self.port}


class Master(Node):
    def __init__(self, host, port):
        Node.__init__(self, host, port)
        self.replicas = []


def start_node(node, dragonfly_bin, threads):
    f = open(f"/tmp/dfly.cluster.node.{node.port}.log", "w")
    print(f"- Log file for node {node.port}: {f.name}")
    subprocess.Popen(
        [
            f"{dragonfly_bin}",
            f"--port={node.port}",
            "--cluster_mode=yes",
            f"--proactor_threads={threads}",
            "--dbfilename=",
            f"--logtostderr",
            "--proactor_affinity_mode=off",
            "--omit_basic_usage",
        ],
        stderr=f,
    )


def send_command(node, command, print_errors=True):
    client = redis.Redis(decode_responses=True, host=node.host, port=node.port)

    for i in range(0, 5):
        try:
            result = client.execute_command(*command)
            return result
        except Exception as e:
            if print_errors:
                print(e)
            time.sleep(0.1 * i)
        finally:
            client.close()

    if print_errors:
        print(f"Unable to run command {command} against {node.host}:{node.port} after 5 attempts!")

    return Exception()


class SlotRange:
    def __init__(self, start, end):
        assert start <= end
        self.start = start
        self.end = end

    def to_dict(self):
        return {"start": self.start, "end": self.end}

    @classmethod
    def from_dict(cls, d):
        return cls(d["start"], d["end"])

    def __repr__(self):
        return f"({self.start}-{self.end})"

    def merge(self, other: "SlotRange"):
        if self.end + 1 == other.start:
            self.end = other.end
            return True
        elif other.end + 1 == self.start:
            self.start = other.start
            return True
        return False

    def contains(self, slot_id):
        return self.start <= slot_id <= self.end

    def remove(self, slot_id):
        assert self.contains(slot_id)

        if self.start < self.end:
            if slot_id == self.start:
                return None, SlotRange(self.start + 1, self.end)
            elif slot_id == self.end:
                return SlotRange(self.start, self.end - 1), None
            elif self.start < slot_id < self.end:
                return SlotRange(self.start, slot_id - 1), SlotRange(slot_id + 1, self.end)
        return None, None


# Custom JSON encoder to handle SlotRange objects
class ClusterConfigEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, SlotRange) or isinstance(obj, Node):
            return obj.to_dict()
        return super().default(obj)


def build_config_from_list(masters: List[Master]):
    total_slots = 16384
    slots_per_node = math.floor(total_slots / len(masters))

    config = []
    for i, master in enumerate(masters):
        slot_range = SlotRange(i * slots_per_node, (i + 1) * slots_per_node - 1)
        c = {
            "slot_ranges": [slot_range],
            "master": master,
            "replicas": master.replicas,
        }
        config.append(c)

    # Adjust the last slot range to include any remaining slots
    config[-1]["slot_ranges"][-1].end += total_slots % len(masters)
    return config


def get_nodes_from_config(config):
    nodes = []
    for shard in config:
        nodes.append(shard["master"])
        for replica in shard["replicas"]:
            nodes.append(replica)

    for node in nodes:
        node.update_id()
    return nodes


def push_config(config):
    def push_to_node(node, config):
        # Use the custom encoder to convert SlotRange objects during serialization
        config_str = json.dumps(config, indent=2, cls=ClusterConfigEncoder)
        response = send_command(node, ["dflycluster", "config", config_str])
        print(f"- Push to {node.port}: {response}")

    for node in get_nodes_from_config(config):
        push_to_node(node, config)


def create_locally(args):
    print(f"Setting up a Dragonfly cluster:")
    print(f"- Master nodes: {args.num_masters}")
    print(f"- Ports: {args.first_port}...{args.first_port + args.num_masters - 1}")
    print(f"- Replicas for each master: {args.replicas_per_master}")
    print()

    next_port = args.first_port
    masters = []
    for i in range(args.num_masters):
        master = Master("127.0.0.1", next_port)
        next_port += 1
        for j in range(args.replicas_per_master):
            replica = Node("127.0.0.1", next_port)
            master.replicas.append(replica)
            next_port += 1
        masters.append(master)

    nodes = []
    for master in masters:
        nodes.append(master)
        for replica in master.replicas:
            nodes.append(replica)

    print("Starting nodes...")
    for node in nodes:
        start_node(node, args.dragonfly_bin, args.threads)
    print()
    time.sleep(0.5)

    if args.replicas_per_master > 0:
        print("Configuring replication...")
        for master in masters:
            for replica in master.replicas:
                response = send_command(replica, ["replicaof", master.host, master.port])
                print(f"- {replica.port} replicating {master.port}: {response}")
        print()

    print(f"Getting IDs...")
    for n in nodes:
        n.update_id()
    print()

    config = build_config_from_list(masters)
    print(f"Pushing config:\n{config}\n")
    push_config(config)
    print()


def config_single_remote(args):
    print(
        f"Configuring remote Dragonfly {args.target_host}:{args.target_port} to be a single-server cluster"
    )

    master = Master(args.target_host, args.target_port)
    master.update_id()

    test = send_command(master, ["get", "x"], print_errors=False)
    if type(test) is not Exception:
        die_with_err("Node either not found or already configured")

    config = build_config_from_list([master])
    print(f"Pushing config:\n{config}\n")
    push_config(config)
    print()


def build_config_from_existing(args):
    def list_to_dict(l):
        return {l[i]: l[i + 1] for i in range(0, len(l), 2)}

    def build_node(node_list):
        d = list_to_dict(node_list)
        node = Node(d["endpoint"], d["port"])
        node.id = d["id"]
        return node

    def build_slots(slot_list):
        slots = []
        for i in range(0, len(slot_list), 2):
            slots.append(SlotRange(slot_list[i], slot_list[i + 1]))
        return slots

    client = redis.Redis(decode_responses=True, host=args.target_host, port=args.target_port)
    existing = client.execute_command("cluster", "shards")
    config = []
    for shard_list in existing:
        shard = list_to_dict(shard_list)
        config.append(
            {
                "slot_ranges": build_slots(shard["slots"]),
                "master": build_node(shard["nodes"][0]),
                "replicas": [build_node(replica) for replica in shard["nodes"][1::]],
            }
        )

    client.close()
    return config


def find_master(config, host, port, die_if_not_found=True):
    new_owner = None
    for shard in config:
        if shard["master"].host == host and shard["master"].port == port:
            new_owner = shard
            break

    if new_owner == None and die_if_not_found:
        die_with_err(f"Can't find master (hint: use flag --target_host / --target_port).")

    return new_owner


def find_replica(config, host, port):
    for shard in config:
        for replica in shard["replicas"]:
            if replica.host == host and replica.port == port:
                return replica, shard
    die_with_err("Can't find target node")


def attach(args):
    print(f"Attaching remote Dragonfly {args.attach_host}:{args.attach_port} to cluster")
    if args.attach_as_replica:
        newcomer = Node(args.attach_host, args.attach_port)
        replica_resp = send_command(newcomer, ["info", "replication"])
        if replica_resp["role"] != "slave":
            die_with_err("Node is not in replica mode")
        if (
            replica_resp["master_host"] != args.target_host
            or replica_resp["master_port"] != args.target_port
        ):
            die_with_err("Node is not a replica of target")

        newcomer.update_id()

        config = build_config_from_existing(args)
        master_node = find_master(config, args.target_host, args.target_port)

        master_node["replicas"].append(newcomer)
        print(f"Pushing config:\n{config}\n")
        push_config(config)
    else:
        newcomer = Master(args.attach_host, args.attach_port)
        replica_resp = send_command(newcomer, ["info", "replication"])
        if replica_resp["role"] != "master":
            die_with_err("Node is not in master mode")
        newcomer.update_id()

        newcomer_config = build_config_from_list([newcomer])
        newcomer_config[0]["slot_ranges"] = []
        config = build_config_from_existing(args)
        print(f"Pushing config:\n{config}\n")
        push_config([*config, newcomer_config[0]])
    print()


def detach(args):
    print(f"Detaching remote Dragonfly {args.target_host}:{args.target_port} from cluster")
    print(
        "Important: detached node will not receive a new config! This means that the detached node will still 'think' that it belongs to the cluster"
    )
    config = build_config_from_existing(args)
    node = find_master(config, args.target_host, args.target_port, die_if_not_found=False)
    if node == None:
        replica, master = find_replica(config, args.target_host, args.target_port)
        master["replicas"].remove(replica)
    else:
        if len(node["slot_ranges"]) != 0:
            die_with_err("Can't detach a master with assigned slots")
        if len(node["replicas"]) != 0:
            die_with_err("Can't detach a master with replicas")
        config = [m for m in config if m != node]
    push_config(config)


def takeover(args):
    print(f"Promoting Dragonfly {args.target_host}:{args.target_port} from replica to master")
    print(
        "Important: do not forget to send command REPLICAOF NO ONE to new master, and update "
        "           additional replicas if such exist"
    )
    print("Important: previous master will be detached from the cluster")

    config = build_config_from_existing(args)
    replica, master = find_replica(config, args.target_host, args.target_port)
    master["replicas"].remove(replica)
    master["master"] = replica

    push_config(config)


def move(args):
    config = build_config_from_existing(args)
    new_owner = find_master(config, args.target_host, args.target_port)

    def remove_slot(slot_id, from_range: SlotRange, slot_ranges: list):
        slot_ranges.remove(from_range)
        left, right = from_range.remove(slot_id)
        if left:
            slot_ranges.append(left)
        if right:
            slot_ranges.append(right)

    def add_slot(slot, to_shard):
        slot_range = SlotRange(slot, slot)
        for existing_range in to_shard["slot_ranges"]:
            if existing_range.merge(slot_range):
                return
        to_shard["slot_ranges"].append(slot_range)

    def find_slot(slot, config):
        for shard in config:
            for slot_range in shard["slot_ranges"]:
                if slot_range.contains(slot):
                    return shard, slot_range
        return None, None

    def pack(slot_ranges):
        slot_objects = sorted(slot_ranges, key=lambda x: x.start)
        packed = []
        for slot_range in slot_objects:
            if packed and packed[-1].merge(slot_range):
                continue
            packed.append(slot_range)
        return packed

    for slot in range(args.slot_start, args.slot_end + 1):
        shard, slot_range = find_slot(slot, config)
        if shard == None or shard == new_owner:
            continue
        remove_slot(slot, slot_range, shard["slot_ranges"])
        add_slot(slot, new_owner)

    for shard in config:
        shard["slot_ranges"] = pack(shard["slot_ranges"])

    # Use the custom encoder for printing the JSON
    print(f"Pushing new config:\n{json.dumps(config, indent=2, cls=ClusterConfigEncoder)}\n")
    push_config(config)


def migrate(args):
    config = build_config_from_existing(args)
    target = find_master(config, args.target_host, args.target_port)
    target_node = target["master"]
    target_node.update_id()

    # Find source node
    source = None
    for node in config:
        slots: Iterable[SlotRange] = node["slot_ranges"]
        for slot in slots:
            if slot.start <= args.slot_start and slot.end >= args.slot_end:
                source = node
                break
    if source == None:
        die_with_err("Unsupported slot range migration (currently only 1-node migration supported)")

    source["migrations"] = [
        {
            "slot_ranges": [{"start": args.slot_start, "end": args.slot_end}],
            "node_id": target_node.id,
            "ip": target_node.host,
            "port": target_node.port,
        }
    ]
    push_config(config)

    # wait for migration finish
    sync_status = []
    while True:
        sync_status = send_command(target_node, ["DFLYCLUSTER", "SLOT-MIGRATION-STATUS"])
        if len(sync_status) == 0:
            # Migration didn't start yet
            continue
        if len(sync_status) != 1:
            die_with_err(f"Unexpected number of migrations {len(sync_status)}: {sync_status}")
        if "FATAL" in sync_status[0]:
            die_with_err(f"Error in migration {len(sync_status)}: {sync_status}")
        if "FINISHED" in sync_status[0]:
            print(f"Migration finished: {sync_status[0]}")
            break

    # Push new config to all nodes
    print("Updating all nodes with new slots state")
    move(args)


def populate(args):
    config = build_config_from_existing(args)
    for shard in config:
        master = shard["master"]
        slot_ranges = shard["slot_ranges"]
        for slot_range in slot_ranges:
            cmd = [
                "debug",
                "populate",
                str(args.size),
                "key",
                str(args.valsize),
                "SLOTS",
                str(slot_range.start),
                str(slot_range.end),
            ]
            send_command(master, cmd)


def print_config(args):
    config = build_config_from_existing(args)
    print(json.dumps(config, indent=2, cls=ClusterConfigEncoder))


def shutdown(args):
    config = build_config_from_existing(args)
    for node in get_nodes_from_config(config):
        send_command(node, ["shutdown"])


def main():
    parser = argparse.ArgumentParser(
        description="""
Dragonfly Manual Cluster Manager

This tool helps managing a Dragonfly cluster manually.
Cluster can either be local or remote:
- Starting Dragonfly instances must be done locally, binary path can be set with `--dragonfly_bin` (default: ../build-opt/dragonfly)
- Remote Dragonflies must already be started, and initialized with `--cluster_mode=yes`

Example usage:

Create a 3 node cluster locally:
  ./cluster_mgr.py --action=create_locally --num_masters=3
This will create 3 Dragonfly processes with ports 7001-7003.
Ports can be overridden with `--first_port`.

Create a 6 node cluster locally, 3 of them masters with 1 replica each:
  ./cluster_mgr.py --action=create_locally --num_masters=3 --replicas_per_master=1

Connect to existing cluster and print current config:
  ./cluster_mgr.py --action=print_config
This will connect to 127.0.0.1:6379 by default. Override with `--target_host` and `--target_port`

Configure an existing Dragonfly server to be a standalone cluster (owning all slots):
  ./cluster_mgr.py --action=config_single_remote
This connects to an *existing* Dragonfly server, and pushes a config telling it to own all slots.
This will connect to 127.0.0.1:6379 by default. Override with `--target_host` and `--target_port`

Attach an existing Dragonfly server to an existing cluster (owning no slots):
  ./cluster_mgr.py --action=attach --attach_host=HOST --attach_port=PORT
This will connect to existing cluster present at 127.0.0.1:6379 by default. Override with
`--target_host` and `--target_port`.
To attach node as a replica - use --attach_as_replica=True. In such case, the node will be a
replica of --target_host/--target_port.

To set up a new cluster - start the servers and then use
  ./cluster_mgr.py --action=config_single_remote ...
  ./cluster_mgr.py --action=attach ...
And repeat `--action=attach` for all servers.
Afterwards, distribute the slots between the servers as desired with `--action=move` or
`--action=migrate`.

To detach (remove) a node from the cluster:
  ./cluster_mgr.py --action=detach --target_host=X --target_port=X
Notes:
- If the node is a master, it must not have any slots assigned to it.
- The node will not be notified that it's no longer in a cluster. It's a good idea to shut it down
  after detaching it from the cluster.

To take over (turn replica to master):
  ./cluster_mgr.py --action=takeover --target_host=X --target_port=X
Notes:
- You'll need to run REPLICAOF NO ONE on the new master
- If previous master had other replicas, you'll need to update them with REPLICAOF as well
- Previous master will be detached from cluster. It's a good idea to shut it down.

Connect to cluster and move slots 10-20 to target:
  ./cluster_mgr.py --action=move --slot_start=10 --slot_end=20 --target_host=X --target_port=X
WARNING: This will NOT migrate existing data, i.e. data in slots 10-20 will be erased.

Migrate slots 10-20 to target:
  ./cluster_mgr.py --action=migrate --slot_start=10 --slot_end=20 --target_host=X --target_port=X
Unlike --action=move above, this will migrate the data to the new owner.

Connect to cluster and shutdown all nodes:
  ./cluster_mgr.py --action=shutdown --target_port=X
WARNING: Be careful! This will close all Dragonfly servers connected to the cluster.
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--action",
        default="",
        help="Which action to take? See `--help`",
    )
    parser.add_argument(
        "--num_masters", type=int, default=3, help="Number of master nodes in cluster"
    )
    parser.add_argument(
        "--replicas_per_master", type=int, default=0, help="How many replicas for each master"
    )
    parser.add_argument("--first_port", type=int, default=7001, help="First master's port")
    parser.add_argument("--threads", type=int, default=2, help="Threads per node")
    parser.add_argument(
        "--slot_start", type=int, default=0, help="First slot to move / migrate (inclusive)"
    )
    parser.add_argument(
        "--slot_end", type=int, default=100, help="Last slot to move / migrate (inclusive)"
    )
    parser.add_argument("--target_host", default="127.0.0.1", help="Master host/ip")
    parser.add_argument("--target_port", type=int, default=6379, help="Master port")
    parser.add_argument(
        "--attach_host", default="127.0.0.1", help="New cluster node master host/ip"
    )
    parser.add_argument(
        "--attach_port", type=int, default=6379, help="New cluster node master port"
    )
    parser.add_argument(
        "--attach_as_replica", type=bool, default=False, help="Is the attached node a replica?"
    )
    parser.add_argument(
        "--dragonfly_bin", default="../build-opt/dragonfly", help="Dragonfly binary path"
    )
    parser.add_argument(
        "--size", type=int, default=1000000, help="Number of keys to populate in each slotrange"
    )
    parser.add_argument(
        "--valsize", type=int, default=16, help="Value size for each key during population"
    )

    args = parser.parse_args()

    actions = dict(
        [
            (f.__name__, f)
            for f in [
                create_locally,
                shutdown,
                config_single_remote,
                attach,
                detach,
                takeover,
                move,
                print_config,
                migrate,
                populate,
            ]
        ]
    )
    action = actions.get(args.action.lower())
    if action:
        action(args)
    else:
        die_with_err(f'Error - unknown action "{args.action}". See --help')


if __name__ == "__main__":
    main()


================================================
FILE: tools/defrag_db.py
================================================
import redis.asyncio as aioredis
import argparse
import asyncio

"""
This script iterates over all keys and "recycles" them.
Recycling is done by DUMPing the key first and then re-creating it with EXPIRE.
This will trigger re-allocation of internal data structures in order to reduce
memory fragmentation.
"""

SCRIPT = """
local recycled = 0
for _, key in ipairs(KEYS) do
    local ttl = redis.call('PTTL', key)
    local dumpedData = redis.call('DUMP', key)

    if dumpedData then
        redis.call('RESTORE', key, 0, dumpedData, 'REPLACE')
        if ttl > 0 then
            redis.call('PEXPIRE', key, ttl)
        end
        recycled = recycled + 1
    end
end
return recycled
"""

total_recycled = 0


async def workerfn(client_supplier, sha, queue):
    global total_recycled

    r = client_supplier()
    while True:
        keys = await queue.get()

        try:
            recycled = await r.evalsha(sha, len(keys), *keys)
        except Exception as e:
            raise SystemExit(e)

        if isinstance(recycled, int):
            total_recycled += recycled
        else:
            print("Error recycling", recycled)

        queue.task_done()


async def infofn():
    while True:
        await asyncio.sleep(0.5)
        print("Keys processed:", total_recycled)


async def main(client_supplier, scan_type, num_workers, queue_size, batch_size):
    r = client_supplier()
    sha = await r.script_load(SCRIPT)
    queue = asyncio.Queue(maxsize=queue_size)

    workers = [
        asyncio.create_task(workerfn(client_supplier, sha, queue)) for _ in range(num_workers)
    ]
    info_worker = asyncio.create_task(infofn())

    keys = []
    async for key in r.scan_iter("*", count=batch_size * 2, _type=scan_type):
        keys.append(key)
        if len(keys) >= batch_size:
            await queue.put(keys)
            keys = []

    await queue.put(keys)
    await queue.join()

    info_worker.cancel()
    for w in workers:
        w.cancel()

    await asyncio.gather(*workers, info_worker, return_exceptions=True)
    print("Recycled in total:", total_recycled)


arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--workers", type=int, default=8)
arg_parser.add_argument("--batch", type=int, default=20)

arg_parser.add_argument(
    "--type", type=str, default=None, help="Process keys only of specified type"
)

arg_parser.add_argument("--db", type=int)
arg_parser.add_argument("--port", type=int, default=6379)
arg_parser.add_argument("--host", type=str, default="localhost")
args = arg_parser.parse_args()


def client_supplier():
    return aioredis.StrictRedis(db=args.db, port=args.port, host=args.host)


asyncio.run(main(client_supplier, args.type, args.workers, args.workers * 2, args.batch))


================================================
FILE: tools/defrag_mem_test.py
================================================
#!/usr/bin/env python3
import asyncio
import aioredis
import async_timeout
import sys
import argparse

"""
To install: pip install -r requirements.txt

Run
dragonfly --mem_defrag_threshold=0.01 --mem_defrag_waste_threshold=0.01
defrag_mem_test.py -k 8000000 -v 645

This program would try to re-create the issue with memory defragmentation.
See issue number 448 for more details.
To run this:
    You can just execute this from the command line without any arguemnts.
    Or you can run with --help to see the options.
    The defaults are:
    number of keys: 800,000
    value size: 64 bytes
    key name pattern: key-for-testing
    host: localhost
    port: default redis port
    Please note that this would create 4 * number of keys entries
    You can see the memory usage/defrag state with the monitoring task that
    prints the current state

NOTE:
    If this seems to get stuck please kill it with ctrl+c
    This can happen in case we don't have "defrag_realloc_total > 0"
"""


class TaskCancel:
    def __init__(self):
        self.run = True

    def dont_stop(self):
        return self.run

    def stop(self):
        self.run = False


async def run_cmd(connection, cmd, sub_val):
    val = await connection.execute_command(cmd, sub_val)
    return val


async def handle_defrag_stats(connection, prev):
    info = await run_cmd(connection, "info", "stats")
    if info is not None:
        if info["defrag_task_invocation_total"] != prev:
            print("--------------------------------------------------------------")
            print(f"defrag_task_invocation_total: {info['defrag_task_invocation_total']:,}")
            print(f"defrag_realloc_total: {info['defrag_realloc_total']:,}")
            print(f"defrag_attempt_total: {info['defrag_attempt_total']:,}")
            print("--------------------------------------------------------------")
            if info["defrag_realloc_total"] > 0:
                return True, None
            return False, info["defrag_task_invocation_total"]
    return False, None


async def memory_stats(connection):
    print("--------------------------------------------------------------")
    info = await run_cmd(connection, "info", "memory")
    # print(f"memory commited: {info['comitted_memory']:,}")
    print(f"memory used: {info['used_memory']:,}")
    # print(f"memory usage ratio: {info['comitted_memory']/info['used_memory']:.2f}")
    print("--------------------------------------------------------------")


async def stats_check(connection, condition):
    try:
        defrag_task_invocation_total = 0
        runs = 0
        while condition.dont_stop():
            await asyncio.sleep(0.3)
            done, d = await handle_defrag_stats(connection, defrag_task_invocation_total)
            if done:
                print("defrag task successfully found memory locations to reallocate")
                condition.stop()
            else:
                if d is not None:
                    defrag_task_invocation_total = d
            runs += 1
            if runs % 3 == 0:
                await memory_stats(connection)
        for i in range(5):
            done, d = await handle_defrag_stats(connection, -1)
            if done:
                print("defrag task successfully found memory locations to reallocate")
                return True
            else:
                await asyncio.sleep(2)
        return True
    except Exception as e:
        print(f"failed to run monitor task: {e}")
    return False


async def delete_keys(connection, keys):
    results = await connection.delete(*keys)
    return results


def generate_keys(pattern: str, count: int, batch_size: int) -> list:
    for i in range(1, count, batch_size):
        batch = [f"{pattern}{j}" for j in range(i, batch_size + i, 3)]
        yield batch


async def mem_cleanup(connection, pattern, num, cond, keys_count):
    counter = 0
    for keys in generate_keys(pattern=pattern, count=keys_count, batch_size=950):
        if cond.dont_stop() == False:
            print(f"task number {num} that deleted keys {pattern} finished")
            return counter
        counter += await delete_keys(connection, keys)
        await asyncio.sleep(0.2)
    print(f"task number {num} that deleted keys {pattern} finished")
    return counter


async def run_tasks(pool, key_name, value_size, keys_count):
    keys = [f"{key_name}-{i}" for i in range(4)]
    stop_cond = TaskCancel()
    try:
        connection = aioredis.Redis(connection_pool=pool)
        for key in keys:
            print(f"creating key {key} with size {value_size} of count {keys_count}")
            await connection.execute_command("DEBUG", "POPULATE", keys_count, key, value_size)
            await asyncio.sleep(2)
        tasks = []
        count = 0
        for key in keys:
            pattern = f"{key}:"
            print(f"deleting keys from {pattern}")
            tasks.append(
                mem_cleanup(
                    connection=connection,
                    pattern=pattern,
                    num=count,
                    cond=stop_cond,
                    keys_count=int(keys_count),
                )
            )
            count += 1
        monitor_task = asyncio.create_task(stats_check(connection, stop_cond))
        total = await asyncio.gather(*tasks, return_exceptions=True)
        print(f"successfully deleted {sum(total)} keys")
        stop_cond.stop()
        await monitor_task
        print("finish executing")
        return True
    except Exception as e:
        print(f"got error {e} while running delete keys")
        return False


def connect_and_run(key_name, value_size, keys_count, host="localhost", port=6379):
    async_pool = aioredis.ConnectionPool(
        host=host, port=port, db=0, decode_responses=True, max_connections=16
    )

    loop = asyncio.new_event_loop()
    success = loop.run_until_complete(
        run_tasks(pool=async_pool, key_name=key_name, value_size=value_size, keys_count=keys_count)
    )
    return success


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="active memory testing", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-k", "--keys", type=int, default=800000, help="total number of keys")
    parser.add_argument("-v", "--value_size", type=int, default=645, help="size of the values")
    parser.add_argument(
        "-n", "--key_name", type=str, default="key-for-testing", help="the base key name"
    )
    parser.add_argument("-s", "--server", type=str, default="localhost", help="server host name")
    parser.add_argument("-p", "--port", type=int, default=6379, help="server port number")
    args = parser.parse_args()
    keys_num = args.keys
    key_name = args.key_name
    value_size = args.value_size
    host = args.server
    port = args.port
    print(
        f"running key deletion on {host}:{port} for keys {key_name} value size of {value_size} and number of keys {keys_num}"
    )
    result = connect_and_run(
        key_name=key_name, value_size=value_size, keys_count=keys_num, host=host, port=port
    )
    if result == True:
        print("finished successfully")
    else:
        print("failed")


================================================
FILE: tools/docker/entrypoint.sh
================================================
#!/bin/sh

# This is important in order to provide enough locked memory to dragonfly
# when running on kernels < 5.12.
# This line should reside before `set -e` so it could fail silently
# in case the container runs in non-privileged mode.
ulimit -l 65000 2> /dev/null

set -e

# first arg is `-some-option`
if [ "${1#-}" != "$1" ]; then
    # override arguments by prepending "dragonfly --logtostderr" to them.
    set -- dragonfly --logtostderr "$@"
fi

# allow the docker container to be started with `--user`
if [ "$1" = 'dragonfly' -a "$(id -u)" = '0' ]; then
    # find all the files in the WORKDIR including the dir itself that do not
    # have dfly user on them and chmod them to dfly.
    find . \! -user dfly -exec chown dfly '{}' +
    # runs this script under user dfly
    exec setpriv --reuid=dfly --regid=dfly --clear-groups -- "$0" "$@"
fi

um="$(umask)"
if [ "$um" = '0022' ]; then
    umask 0077  # restrict access permissions only to the owner
fi

exec "$@"


================================================
FILE: tools/docker/fetch_release.sh
================================================
#!/bin/sh

set -e

PLATFORM=$1

PSHORT=${PLATFORM#"linux/"}
echo "PSHORT ${PSHORT}"


if [ "${PSHORT}" = "amd64" ]; then
  SUFFIX='x86_64'
else
  SUFFIX='aarch64'
fi

mv /tmp/dragonfly-${SUFFIX} /build/dragonfly
ls -l /build/

================================================
FILE: tools/docker/healthcheck.sh
================================================
#!/bin/sh

# Cleanup function to prevent zombie processes (issue #5844)
# This is critical when dragonfly runs as PID 1 without an init system
cleanup() {
  # Wait for all background/child processes to finish
  wait 2>/dev/null || true
}

# Set trap to ensure cleanup runs on exit, regardless of how the script exits
trap cleanup EXIT

HOST="localhost"
PORT=$HEALTHCHECK_PORT

if [ -z "$HEALTHCHECK_PORT" ]; then
  # try unpriveleged version first. This should cover cases when the container is running
  # without root, for example:
  # docker run  --group-add 999  --cap-drop=ALL --user 999 docker.dragonflydb.io/dragonflydb/dragonfly
  DF_NET=$(netstat -tlnp | grep "/dragonfly")
  if [ -z "$DF_NET" ]; then
    # if we failed, then lets try the priveleged version. is triggerred by the regular command:
    # docker run docker.dragonflydb.io/dragonflydb/dragonfly
    DF_NET=$(su dfly -c "netstat -tlnp" | grep "/dragonfly")
  fi

  # check all the TCP ports, and fetch the port.
  # For cases when dragonfly opens multiple ports, we filter with tail to choose one of them.
  PORT=$(echo $DF_NET | grep -oE ':[0-9]+' | cut -c2- | tail -n 1)
fi

_healthcheck="nc -q1 $HOST $PORT"

echo PING | ${_healthcheck}

exit $?


================================================
FILE: tools/eviction/fill_db.py
================================================
#!/usr/bin/env python3

"""
This script implements facilities for assessing cache eviction.
Two major functions have been implemented that allow users to
  1. Populate Dragonfly with a specified key and value length distributions.
  2. Measuring the cache hit rate of Dragonfly with workloads that access keys using Zipfian distribution.

Usage:
To perform database population, simply run:

./fill_db.py -f

This will automatically populate the database to the point where about 2X of maxmemory (specified by Dragonfly)
of KV pairs will be inserted. By default, we always stop at 2X maxmemory, and this can be changed using the -r
option, for instance

./fill_db.py -f -r 0.25  # population stops at 4x maxmemory

To accelerate the population, we can use multiple processes running this script in parallel. A convenient script
has been provided in this directory:
./run_fill_db.sh 10  # use 10 processes to fill in parallel

After database has been populated, we can start measuring cache hit rate using the -m option:
./fill_db.py -m
Note that the measurement must be done after the population as this mode relies on reading back the complete key
space inserted during the population phase. By default, we perform 100000 set operations for calculating cache hit rate.
This number can be changed using the -c option:
./fill_db.py -m -c 2000
"""


import redis
import string
from random import choice
from random import shuffle
import numpy as np

import asyncio
from redis import asyncio as aioredis
import os
import argparse
import re
import glob

all_val_lens = [400, 800, 1600, 25000]
val_lens_probs = [0.003, 0.78, 0.2, 0.017]

all_key_lens = [35, 60, 70]
key_lens_probs = [0.2, 0.06, 0.74]

inserted_keys = []


def random_str(len):
    return "".join(
        choice(string.ascii_letters + string.digits + string.punctuation) for i in range(len)
    )


def random_key():
    global all_key_lens, key_lens_probs
    return random_str(np.random.choice(all_key_lens, p=key_lens_probs))


def random_val():
    global all_val_lens, val_lens_probs
    return random_str(np.random.choice(all_val_lens, p=val_lens_probs))


def flush_keys_to_file(file_name):
    global inserted_keys
    with open(file_name, "a") as f:
        for key in inserted_keys:
            f.write(f"{key}\n")


def read_keys_from_file(file_name):
    global inserted_keys
    with open(file_name) as file:
        for line in file:
            inserted_keys.append(line.rstrip())


def read_keys():
    global inserted_keys
    inserted_keys.clear()
    key_files = glob.glob("./keys_*.txt")
    for key_file in key_files:
        read_keys_from_file(key_file)


def sync_populate_db():
    r = redis.Redis(decode_responses=True)
    n = 0
    while True:
        r.set(random_key(), random_val())
        n += 1
        if n % 1000 == 0:
            print("\r>> Number of key-value pairs inserted: {}".format(n), end="")


def sync_query_db():
    global inserted_keys
    r = redis.Redis(decode_responses=True)
    n = 0
    read_keys()
    misses = 0
    hits = 0
    for key in inserted_keys:
        resp = r.set(key, random_val(), nx=True)
        # print(resp)
        if resp:
            misses += 1
        else:
            hits += 1
        n += 1
        if n % 1000 == 0:
            print(
                "\r>> Number of key-value pairs inserted: {0}, hit: {1}, miss: {2}".format(
                    n, hits, misses
                ),
                end="",
            )


async def populate_db(ratio):
    global inserted_keys
    r = aioredis.Redis(decode_responses=True)
    n = 0
    misses = 0
    hits = 0

    total_key_count = 0
    while True:
        # await r.set(random_key(), random_val())
        pipeline = r.pipeline(False)
        for x in range(200):
            k = random_key()
            inserted_keys.append(k)
            pipeline.set(k, random_val())
            # pipeline.set(k, random_val(), nx=True)
        await pipeline.execute()
        # responses = await pipeline.execute()
        # for resp in responses:
        #    if resp:
        #        misses += 1
        #    else:
        #        hits += 1

        # key file names are in keys_xxxx.txt format
        key_file_name = "keys_" + str(os.getpid()) + ".txt"
        flush_keys_to_file(key_file_name)
        inserted_keys.clear()
        n += 200

        if total_key_count == 0:
            db_info = await r.info()
            used_mem = float(db_info["used_memory"])
            max_mem = float(db_info["maxmemory"])
            redline = 0.9
            # we will know the total number of keys of the whole space
            # only when we approach the maxmemory of the db
            if used_mem >= max_mem * redline:
                total_key_count = int(float(n) / ratio)
                print(
                    "\n>> Determined target key count: {0}, current key count: {1}, ratio: {2}".format(
                        total_key_count, n, ratio
                    ),
                    end="",
                )
        else:
            if n >= total_key_count:
                print("\n>> Target number of keys reached: {}, stopping...".format(n), end="")
                break
        if n % 1000 == 0:
            print("\r>> Number of key-value pairs inserted: {0}".format(n), end="")
            # print("\r>> Number of key-value pairs inserted: {0}, hit: {1}, miss: {2}".format(n, hits, misses), end='')


def rand_zipf_generator(alpha: float, upper: int, batch: int):
    """
    n: The upper bound of the values to generate a zipfian distribution over
    (n = 30 would generate a distribution of given alpha from values 1 to 30)
    alpha: The alpha parameter to be used while creating the Zipfian distribution
    num_samples: The total number of samples to generate over the Zipfian distribution
    This is a generator that yields up to count values using a generator.
    """

    # Calculate Zeta values from 1 to n:
    tmp = np.power(np.arange(1, upper + 1), -alpha)
    zeta = np.r_[0.0, np.cumsum(tmp)]

    # Store the translation map:
    distMap = [x / zeta[-1] for x in zeta]

    while True:
        # Generate an array of uniform 0-1 pseudo-random values:
        u = np.random.random(batch)

        # bisect them with distMap
        v = np.searchsorted(distMap, u)

        samples = [t - 1 for t in v]
        yield samples


def rearrange_keys():
    """
    This function potentially provides the capability for testing different caching workloads.
    for instance, if we rearrange all the keys via sorting based on the k-v memory usage,
    we will generate a zipfian hotspot that prefers to access small kv pairs (or larger kv pairs)
    current implementation just uses a random shuffle.
    """
    global inserted_keys
    shuffle(inserted_keys)


async def query_db_with_locality(count):
    global inserted_keys
    r = aioredis.Redis(decode_responses=True)
    n = 0
    read_keys()
    rearrange_keys()
    misses = 0
    hits = 0
    pipeline_size = 200
    key_index_gen = rand_zipf_generator(1.0, len(inserted_keys), pipeline_size)
    for key_indices in key_index_gen:
        pipeline = r.pipeline(False)
        # print(key_indices)
        for key_index in key_indices:
            k = inserted_keys[key_index]
            pipeline.set(k, random_val(), nx=True)

        responses = await pipeline.execute()
        n += pipeline_size
        for resp in responses:
            if resp:
                misses += 1
            else:
                hits += 1
        print(
            "\r>> Number of ops: {0}, hit: {1}, miss: {2}, hit rate: {3:.4f}".format(
                n, hits, misses, float(hits) / float(hits + misses)
            ),
            end="",
        )
        if n >= count:
            break
    hit_rate = float(hits) / float(hits + misses)
    print("\n>> Cache hit rate: {:.4f}".format(hit_rate))


class Range(object):
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Cache Benchmark", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "-f",
        "--fill",
        action="store_true",
        help="fill database with random key-value pairs with their lengths follow some distributions",
    )

    parser.add_argument(
        "-r",
        "--ratio",
        type=float,
        default=0.5,
        choices=[Range(0.0, 1.0)],
        help="the ratio between in memory data size and total data size",
    )

    parser.add_argument(
        "-m",
        "--measure",
        action="store_true",
        help="measure cache hit rate by visiting the entire key space with a Zipfian distribution",
    )

    parser.add_argument(
        "-c",
        "--count",
        type=int,
        default=100000,
        help="total number of operations to be performed when measuring cache hit rate",
    )

    args = parser.parse_args()

    if args.fill:
        asyncio.run(populate_db(args.ratio))
        exit(0)

    if args.measure:
        asyncio.run(query_db_with_locality(args.count))


================================================
FILE: tools/eviction/run_fill_db.sh
================================================
#!/bin/sh

rm ./keys_*.txt
for i in `seq 1 $1`
do
    echo "launching process $i to fill.."
    ./fill_db.py -f &
done

wait


================================================
FILE: tools/eviction/stop_fill_db.sh
================================================
#!/bin/sh
ps -ef | grep fill_db.py | grep -v grep | awk '{print $2}' | xargs kill -9


================================================
FILE: tools/faulty_io.sh
================================================
#!/bin/bash

# Utility script for creating block devices with fault injection to test tiering
#
if [[ $EUID -ne 0 ]]; then
   echo "This script must be run as root"
   exit 1
fi

function finalize_block_dev {
    mke2fs /dev/mapper/$1
    mkdir -p /mnt/$1
    mount /dev/mapper/$1  /mnt/$1
    chmod o+rw /mnt/$1
}

function remove_block_dev {
     umount /dev/mapper/$1
    rm -rf /mnt/$1
}

if [[ "$1" == "create" ]]
then
    # Create backing file of 256MB
    dd if=/dev/zero of=./tiering_backing bs=1024 count=262144
    # Create loopback device
    DEV=$(losetup --find --show ./tiering_backing)

    # Create first block device with flaky sectors
    dmsetup create tiering_flaky << EOF
    0 20000 linear $DEV 0
    20000 105424 flakey $DEV 0 1 1
EOF
    finalize_block_dev tiering_flaky
elif [[ "$1" == "remove" ]]
then
    remove_block_dev tiering_flaky
    dmsetup remove_all
    losetup -a | grep tiering | awk -F ':' '{print $1}'| xargs losetup --detach
    rm ./tiering_backing
else
    echo """Devices created by this script:
    1. /mnt/tiering_flaky_1 - flaky device with 1:1 second success/error intervals

use with either create/remove arguments"""
fi


================================================
FILE: tools/generate-tls-files.sh
================================================
#!/bin/bash

# This script generates locally-signed TLS files for development usage.
# It's probably a good idea to run in an empty, temporary directory.
#
# Example usage:
#
# mkdir /tmp/dfly-tls
# cd /tmp/dfly-tls
# ~/dragonfly/tools/generate-tls-files.sh
# ~/dragonfly/build-dbg/dragonfly \
#      --dbfilename= \
#      --logtostdout \
#      --tls=true \
#      --tls_key_file=/tmp/dfly-tls/df-key.pem \
#      --tls_cert_file=/tmp/dfly-tls/df-cert.pem \
#      --requirepass=XXX
# redis-cli --tls --cacert /tmp/dfly-tls/ca-cert.pem -a XXX

CA_KEY_PATH=ca-key.pem
CA_CERTIFICATE_PATH=ca-cert.pem
CERTIFICATE_REQUEST_PATH=df-req.pem
PRIVATE_KEY_PATH=df-key.pem
CERTIFICATE_PATH=df-cert.pem

echo "Generating files in local directory (rm *.pem to cleanup)"

openssl req -x509 -newkey rsa:4096 -days 1 -nodes \
  -keyout ${CA_KEY_PATH} \
  -out ${CA_CERTIFICATE_PATH} \
  -subj "/C=GR/ST=SKG/L=Thessaloniki/O=KK/OU=AcmeStudios/CN=Gr/emailAddress=acme@gmail.com"

openssl req -newkey rsa:4096 -nodes \
  -keyout ${PRIVATE_KEY_PATH} \
  -out ${CERTIFICATE_REQUEST_PATH} \
  -subj "/C=GR/ST=SKG/L=Thessaloniki/O=KK/OU=Comp/CN=Gr/emailAddress=does_not_exist@gmail.com"

openssl x509 -req \
  -in ${CERTIFICATE_REQUEST_PATH} \
  -days 1 \
  -CA ${CA_CERTIFICATE_PATH} \
  -CAkey ${CA_KEY_PATH} \
  -CAcreateserial -out ${CERTIFICATE_PATH}

echo "You can now run:"
echo "dragonfly --tls=true --tls_key_file=${PRIVATE_KEY_PATH} --tls_cert_file=${CERTIFICATE_PATH} --requirepass=XXX"
echo "redis-cli --tls --cacert ${CA_CERTIFICATE_PATH} -a XXX"


================================================
FILE: tools/json_benchmark.py
================================================
#!/usr/bin/env python

import multiprocessing
import time
import redis
import sys
import argparse
from urllib.parse import urlparse
import os
from collections import defaultdict
import math

'''
Run JSON benchmark for 3 commands:
    JSON.SET
    JSON.GET
    JSON.TYPE
We want to the overall time it takes
to save and access keys that contains
JSON values with this benchmark.
This also verify that the basic functionalities
for using JSON types work correctly
'''

def ping(r):
    r.ping()

def jsonset(r, i):
    key = "json-{}".format(i)
    r.execute_command('JSON.SET', key, '.', '{"a":123456, "b": "hello", "nested": {"abc": "ffffff", "bfb": null}}')


def jsonget(r, i):
    key = "json-{}".format(i)
    r.execute_command('JSON.GET', key, '$.a', '$..abc')

def jsontype(r, i):
    key = "json-{}".format(i)
    r.execute_command('JSON.TYPE', key, '$.a')

def runWorker(ctx):
    wpid = os.getpid()
    print( '{} '.format(wpid))

    rep = defaultdict(int)
    r = redis.StrictRedis(host=ctx['host'], port=ctx['port'])
    work = ctx['work']
    if ctx['pipeline'] == 0:
        total_count = int(ctx['count'])
        for i in range(0, total_count):
            s0 = time.time()
            jsonset(r, i)
            s1 = time.time() - s0
            bin = int(math.floor(s1 * 1000)) + 1
            rep[bin] += 1
        for i in range(0, total_count):
            s0 = time.time()
            jsonget(r, i)
            s1 = time.time() - s0
            bin = int(math.floor(s1 * 1000)) + 1
            rep[bin] += 1
        for i in range(0, total_count):
            s0 = time.time()
            jsontype(r, i)
            s1 = time.time() - s0
            bin = int(math.floor(s1 * 1000)) + 1
            rep[bin] += 1
    else:
        for i in range(0, ctx['count'], ctx['pipeline']):
            p = r.pipeline()
            s0 = time.time()
            for j in range(0, ctx['pipeline']):
                work(p)
            p.execute()
            s1 = time.time() - s0
            bin = int(math.floor(s1 * 1000)) + 1
            rep[bin] += ctx['pipeline']

    return rep

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ReJSON Benchmark', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', '--count', type=int, default=100000, help='total number of operations')
    parser.add_argument('-p', '--pipeline', type=int, default=0, help='pipeline size')
    parser.add_argument('-w', '--workers', type=int, default=8, help='number of worker processes')
    parser.add_argument('-u', '--uri', type=str, default='redis://localhost:6379', help='Redis server URI')
    args = parser.parse_args()
    uri = urlparse(args.uri)

    r = redis.Redis(host=uri.hostname, port=uri.port)

    pool = multiprocessing.Pool(args.workers)
    s0 = time.time()
    ctx = {
        'count': args.count / args.workers,
        'pipeline': args.pipeline,
        'host': uri.hostname,
        'port': uri.port,
        'work': jsonset,
    }

    print ('Starting workers: ')
    p = multiprocessing.Pool(args.workers)
    results = p.map(runWorker, (ctx, ) * args.workers)
    print("")
    sys.stdout.flush()

    s1 = time.time() - s0
    agg = defaultdict(int)
    for res in results:
        for k, v in res.items():
            agg[k] += v

    print()
    count = args.count * 3
    print (f'Count: {args.count}, Workers: {args.workers}, Pipeline: {args.pipeline}')
    print (f'Using hireds: {redis.utils.HIREDIS_AVAILABLE}')
    print (f'Runtime: {round(s1, 2):,} seconds')
    print (f'Throughput: {round(count/s1, 2):,} requests per second')
    for k, v in sorted(agg.items()):
        perc = 100.0 * v / count
        print (f'{perc:.4f}% <= {k:,} milliseconds')


================================================
FILE: tools/local/gen-test-certs.sh
================================================
#!/bin/bash
set -e

SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$(readlink -f "$SCRIPT_DIR/../..")
GEN_DIR=$ROOT_DIR/genfiles/tls


#   genfiles/tls/ca.{crt,key}          Self signed CA certificate.
#   genfiles/tls/dragonfly.{crt,key}   A certificate with no key usage/policy restrictions.
#   genfiles/tls/client.{crt,key}      A certificate restricted for SSL client usage.
#   genfiles/tls/server.{crt,key}      A certificate restricted for SSL server usage.

: '
To run dragonfly use:
dragonfly --tls --tls_key_file ../genfiles/tls/server.key  --tls_cert_file ../genfiles/tls/server.crt -requirepass pass

Or with CA (does not require password):
dragonfly --tls --tls_key_file ../genfiles/tls/server.key  --tls_cert_file ../genfiles/tls/server.crt \
--tls_ca_cert_file ../genfiles/tls/ca.crt

To connect with client (without ca):
openssl s_client   -state -crlf  -connect 127.0.0.1:6379

With CA:
openssl s_client   -state -crlf -CAfile ../genfiles/tls/ca.crt  -cert ../genfiles/tls/client.crt -key ../genfiles/tls/client.key  -connect 127.0.0.1:6379

Similarly, to connect with redis-cli (no CA):
redis-cli --tls --insecure -a pass

With CA:
redis-cli --tls  --cacert ../genfiles/tls/ca.crt  --cert ../genfiles/tls/client.crt --key ../genfiles/tls/client.key

memtier (without CA):
memtier_benchmark --tls --key ../genfiles/tls/client.key  --cert ../genfiles/tls/client.crt -a pass

memtier (with CA):
memtier_benchmark --tls --key ../genfiles/tls/client.key  --cert ../genfiles/tls/client.crt --cacert ../genfiles/tls/ca.crt
'

generate_cert() {
    local name=$1
    local cn="$2"
    local opts="$3"

    local keyfile=$GEN_DIR/${name}.key
    local certfile=$GEN_DIR/${name}.crt

    [ -f $keyfile ] || openssl genpkey -algorithm ED25519 -out $keyfile
    openssl req -new -sha256 \
        -subj "/O=Dragonfly Test/CN=$cn" \
        -key $keyfile | \
        openssl x509 \
            -req -sha256 \
            -CA $GEN_DIR/ca.crt \
            -CAkey $GEN_DIR/ca.key \
            -CAserial $GEN_DIR/ca.txt \
            -CAcreateserial \
            -days 365 \
            $opts \
            -out $certfile
}

mkdir -p $GEN_DIR
[ -f $GEN_DIR/ca.key ] || openssl genpkey -algorithm ED25519 -out $GEN_DIR/ca.key

# -x509: self-signed certificate, -nodes: no password
openssl req \
    -x509 -new -nodes -sha256 \
    -key $GEN_DIR/ca.key \
    -days 3650 \
    -subj '/O=Dragonfly Test/CN=Certificate Authority' \
    -out $GEN_DIR/ca.crt

cat > $GEN_DIR/openssl.cnf <<_END_
[ server_cert ]
keyUsage = digitalSignature, keyEncipherment
nsCertType = server

[ client_cert ]
keyUsage = digitalSignature, keyEncipherment
nsCertType = client
_END_

generate_cert server "Server-only" "-extfile $GEN_DIR/openssl.cnf -extensions server_cert"
generate_cert client "Client-only" "-extfile $GEN_DIR/openssl.cnf -extensions client_cert"
generate_cert dragonfly "Generic-cert"


================================================
FILE: tools/local/monitoring/docker-compose.yml
================================================
version: '3.8'

# To run redis exporter, run: docker compose --profile redis up
# To run memcached and its exporter: docker compose --profile memcached up
#
# Note you may still need to disable/change scraping job configs
# in prometheus.yml
#
volumes:
  prometheus_data:
  grafana_data:
  memcached_data:

services:
  change_vol_ownership:
    image: alpine
    user: root
    volumes:
      - memcached_data:/memcached
    command: chown -R 11211:11211 /memcached

  prometheus:
    image: prom/prometheus:v3.0.0
    restart: always
    volumes:
      - ./prometheus:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    ports:
      - 9090:9090
    extra_hosts:
      - 'host.docker.internal:host-gateway'
    depends_on:
      node-exporter:
        condition: service_started
      change_vol_ownership:
        condition: service_completed_successfully

  memcached:
    image: memcached
    restart: unless-stopped
    ports:
      - "11211:11211"
    command: "-t 8 -m 10000 -c 10000 --pidfile=/memcached/memcached.pid"
    pid: host
    volumes:
      - memcached_data:/memcached
    profiles: [memcached]


  node-exporter:
    image: prom/node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - --collector.filesystem.ignored-mount-points
      - '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
    ports:
      - 9100:9100
    restart: always
    deploy:
      mode: global

  grafana:
    image: grafana/grafana:10.1.10
    user: '472'
    restart: always
    environment:
      # do not really need it, just to leave it as an example on how to install plugins here
      - GF_INSTALL_PLUGINS=grafana-clock-panel
      - GF_RENDERING_SERVER_URL=http://renderer:8081/render
      - GF_RENDERING_CALLBACK_URL=http://grafana:3000/
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning/:/etc/grafana/provisioning/
    env_file:
      - ./grafana/config.monitoring
    ports:
      - 3000:3000
    depends_on:
      - prometheus

  memcached-exporter:
    image: prom/memcached-exporter
    container_name: memcached-exporter
    restart: unless-stopped
    ports:
      - "9150:9150"
    pid: host
    command:
      - --memcached.address=memcached:11211
      - --memcached.pid-file=/memcached/memcached.pid
    volumes:
      - memcached_data:/memcached
    profiles: [memcached]
    depends_on:
      - memcached

  redis-exporter:
    image: quay.io/oliver006/redis_exporter
    profiles: [redis]
    ports:
      - 9121:9121
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: always
    environment:
      - REDIS_ADDR=host.docker.internal:6379

  renderer:
    image: grafana/grafana-image-renderer:latest
    ports:
      - 8081


================================================
FILE: tools/local/monitoring/grafana/config.monitoring
================================================
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=foobar
GF_USERS_ALLOW_SIGN_UP=false


================================================
FILE: tools/local/monitoring/grafana/provisioning/dashboards/dashboard.yml
================================================
apiVersion: 1

providers:
- name: dashboards
  orgId: 1
  folder: ''
  type: file
  disableDeletion: false
  editable: true
  options:
    path: /etc/grafana/provisioning/dashboards


================================================
FILE: tools/local/monitoring/grafana/provisioning/dashboards/dragonfly.json
================================================
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [],
          "type": "dashboard"
        },
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 17,
      "panels": [],
      "title": "Basic metrics",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 4,
        "x": 0,
        "y": 1
      },
      "id": 9,
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "percentChangeColorMode": "standard",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "max(max_over_time(dragonfly_uptime_in_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__interval]))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "range": true,
          "refId": "A",
          "step": 1800
        }
      ],
      "title": "Uptime",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 2,
        "x": 4,
        "y": 1
      },
      "hideTimeOverride": true,
      "id": 12,
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "percentChangeColorMode": "standard",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "dragonfly_connected_clients{namespace=\"$namespace\",pod=~\"$pod_name\"}",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "range": true,
          "refId": "A",
          "step": 2
        }
      ],
      "timeFrom": "1m",
      "title": "Clients",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 80
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 95
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 6,
        "x": 6,
        "y": 1
      },
      "hideTimeOverride": true,
      "id": 11,
      "maxDataPoints": 100,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto"
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "100 * (dragonfly_memory_used_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"}  / dragonfly_memory_max_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"} )",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "range": true,
          "refId": "A",
          "step": 2
        }
      ],
      "timeFrom": "1m",
      "title": "Memory Usage",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 22,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 1
      },
      "id": 2,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "sum(irate(dragonfly_commands_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])) by (cmd)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{pod}}",
          "metric": "A",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Commands Executed / sec",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "id": 7,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "dragonfly_memory_used_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"} ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "used",
          "metric": "",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "dragonfly_memory_max_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"} ",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "max",
          "range": true,
          "refId": "B",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "dragonfly_used_memory_rss_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"} ",
          "format": "time_series",
          "hide": false,
          "legendFormat": "RSS",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Total Memory Usage",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 22,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 8
      },
      "id": 35,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "max by (cmd) (irate(dragonfly_commands_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval]) / (irate(dragonfly_commands_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "__auto",
          "metric": "A",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Latency per command ",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 15
      },
      "id": 5,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum (dragonfly_db_keys{namespace=\"$namespace\",pod=~\"$pod_name\"}) by (db)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ db }} ",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Total Items per DB",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 22,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 15
      },
      "id": 24,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_keyspace_hits_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "hits",
          "metric": "A",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_keyspace_misses_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "misses",
          "metric": "A",
          "range": true,
          "refId": "B",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_keyspace_mutations_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "mutations",
          "metric": "A",
          "range": true,
          "refId": "C",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Keys Ops / sec",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 22
      },
      "id": 8,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "sum(irate(dragonfly_expired_keys_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])) by (pod)",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "expired",
          "metric": "",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "sum(irate(dragonfly_evicted_keys_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])) by (pod)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "evicted",
          "range": true,
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Expired / Evicted",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 22
      },
      "id": 25,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "dragonfly_db_keys{namespace=\"$namespace\",pod=~\"$pod_name\"}/ on(namespace, pod, db) dragonfly_db_capacity{namespace=\"$namespace\",pod=~\"$pod_name\", type=\"prime\"}",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ db }} ",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Table Load per DB",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 29
      },
      "id": 26,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_reply_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__interval]) / irate(dragonfly_reply_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__interval])",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ pod }} input",
          "range": true,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Reply Latency",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 29
      },
      "id": 10,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_net_input_bytes_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ pod }} input",
          "range": true,
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "irate(dragonfly_net_output_bytes_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ pod }} output",
          "range": true,
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Network I/O",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 36
      },
      "id": 16,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "dragonfly_connected_clients{namespace=\"$namespace\",pod=\"$pod_name\"}",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "{{pod}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Dragonfly connected clients",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 36
      },
      "id": 27,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "rate(dragonfly_pipeline_commands_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/rate(dragonfly_pipeline_commands_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "pipeline",
          "range": true,
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "rate(dragonfly_cmd_squash_hop_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/rate(dragonfly_cmd_squash_hop_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "execute_hop",
          "range": true,
          "refId": "B",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "rate(dragonfly_pipeline_queue_wait_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/rate(dragonfly_pipeline_commands_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "wait_queue",
          "range": true,
          "refId": "C",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "rate(dragonfly_pipeline_dispatch_flush_duration_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/rate(dragonfly_pipeline_dispatch_calls_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "format": "time_series",
          "instant": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "dispatch_flush",
          "range": true,
          "refId": "D",
          "step": 240
        }
      ],
      "title": "Pipeline Latency",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 43
      },
      "id": 22,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "dragonfly_pipeline_queue_length{namespace=\"$namespace\",pod=~\"$pod_name\"}",
          "instant": false,
          "legendFormat": "avr_pipeline_depth",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Pipeline length",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 43
      },
      "id": 13,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr":
              "sum (dragonfly_db_keys{namespace=\"$namespace\",pod=~\"$pod_name\"}) - sum (dragonfly_db_keys_expiring{namespace=\"$namespace\",pod=~\"$pod_name\"}) ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "not expiring",
          "range": true,
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum (dragonfly_db_keys_expiring{namespace=\"$namespace\",pod=~\"$pod_name\"})",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "expiring",
          "metric": "",
          "range": true,
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Expiring vs Not-Expiring Keys",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 50
      },
      "id": 28,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "irate(dragonfly_cmd_squash_commands_total\n{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/irate(dragonfly_cmd_squash_hop_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "squash_len",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "irate(dragonfly_pipeline_dispatch_commands_total\n{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/irate(dragonfly_pipeline_dispatch_calls_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "dispatch_len",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Average Squashing Length",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 51
      },
      "id": 21,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "dragonfly_replication_full_sync_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"}",
          "instant": false,
          "legendFormat": "fullsync",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "dragonfly_replication_streaming_bytes{namespace=\"$namespace\",pod=~\"$pod_name\"}",
          "hide": false,
          "instant": false,
          "legendFormat": "stable_sync",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Master Replication memory",
      "type": "timeseries"
    },
    {
      "collapsed": true,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 59
      },
      "id": 36,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 67
          },
          "id": 33,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${DS_PROMETHEUS}"
              },
              "disableTextWrap": false,
              "editorMode": "builder",
              "expr": "rate(dragonfly_tiered_hits[$__rate_interval])",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "{{type}}",
              "range": true,
              "refId": "A",
              "useBackend": false
            }
          ],
          "title": "Tiered hits",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 67
          },
          "id": 34,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${DS_PROMETHEUS}"
              },
              "disableTextWrap": false,
              "editorMode": "builder",
              "expr": "rate(dragonfly_tiered_overload[$__rate_interval])",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "{{type}}",
              "range": true,
              "refId": "A",
              "useBackend": false
            }
          ],
          "title": "Tiered overload",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "description": "Tiered bytes",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 75
          },
          "id": 31,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${DS_PROMETHEUS}"
              },
              "disableTextWrap": false,
              "editorMode": "builder",
              "expr": "dragonfly_tiered_bytes",
              "format": "time_series",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "interval": "",
              "legendFormat": "{{type}}",
              "range": true,
              "refId": "A",
              "useBackend": false
            }
          ],
          "title": "Tiered bytes",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 75
          },
          "id": 32,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${DS_PROMETHEUS}"
              },
              "disableTextWrap": false,
              "editorMode": "builder",
              "expr": "rate(dragonfly_tiered_events[$__rate_interval])",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "{{type}}",
              "range": true,
              "refId": "A",
              "useBackend": false
            }
          ],
          "title": "Tiered events",
          "type": "timeseries"
        }
      ],
      "title": "Tiered",
      "type": "row"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 60
      },
      "id": 19,
      "panels": [],
      "title": "Advanced metrics",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 61
      },
      "id": 18,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "code",
          "expr":
              "irate(dragonfly_fiber_switch_delay_seconds_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/rate(dragonfly_fiber_switch_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "instant": false,
          "legendFormat": "switch",
          "range": true,
          "refId": "A",
          "useBackend": false
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "irate(dragonfly_fiber_longrun_seconds{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/irate(dragonfly_fiber_longrun_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "longrun",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "FiberSwitchDelay",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percentunit"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 61
      },
      "id": 29,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "1 - irate(dragonfly_pipeline_dispatch_calls_total\n{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])/(irate(dragonfly_pipeline_dispatch_calls_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval]) + irate(dragonfly_cmd_squash_stats_ignored_total{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval]))",
          "hide": false,
          "instant": false,
          "legendFormat": "__auto",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Metrics Select Pipeline Ratio",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 69
      },
      "id": 30,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr":
              "irate(dragonfly_net_read_yields_total\n{namespace=\"$namespace\",pod=~\"$pod_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "__auto",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Read Yields Per second",
      "type": "timeseries"
    }
  ],
  "refresh": "10s",
  "schemaVersion": 38,
  "style": "dark",
  "tags": [
    "prometheus",
    "dragonfly"
  ],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "Prometheus",
          "value": "PBFA97CFB590B2093"
        },
        "hide": 0,
        "includeAll": false,
        "label": "Prometheus",
        "multi": false,
        "name": "DS_PROMETHEUS",
        "options": [],
        "query": "prometheus",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      },
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "uid": "$DS_PROMETHEUS"
        },
        "definition": "label_values(dragonfly_version, namespace)",
        "hide": 0,
        "includeAll": false,
        "label": "Namespace",
        "multi": false,
        "name": "namespace",
        "options": [],
        "query": {
          "query": "label_values(dragonfly_version, namespace)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      },
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "uid": "$DS_PROMETHEUS"
        },
        "definition": "label_values(dragonfly_version{namespace=\"$namespace\"}, pod)",
        "hide": 0,
        "includeAll": false,
        "label": "Pod Name",
        "multi": false,
        "name": "pod_name",
        "options": [],
        "query": {
          "query": "label_values(dragonfly_version{namespace=\"$namespace\"}, pod)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-5m",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "browser",
  "title": "Dragonfly Dashboard",
  "uid": "xDLNRKUWz",
  "version": 1,
  "weekStart": ""
}


================================================
FILE: tools/local/monitoring/grafana/provisioning/dashboards/memcached.json
================================================
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description":
      "Memcached dashboard for grafana 6.5.1 or above.\r\nDataSource: Prometheus\r\nCollector: Memcached official exporter : https://github.com/prometheus/memcached_exporter/releases \r\n",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": 11527,
  "graphTooltip": 0,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "0": {
                  "text": "DOWN"
                },
                "1": {
                  "text": "UP"
                }
              },
              "type": "value"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "#d44a3a",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 0.1
              },
              {
                "color": "#299c46",
                "value": 0.9
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 2,
        "x": 0,
        "y": 0
      },
      "id": 6,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "10.1.10",
      "repeatDirection": "h",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "memcached_up{job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "Instance State",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "#37872D",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 100000
              },
              {
                "color": "#d44a3a",
                "value": 200000
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 2,
        "y": 0
      },
      "id": 7,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "memcached_uptime_seconds{job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "title": "Up Time",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "id": 0,
              "op": "=",
              "text": "N/A",
              "type": 1,
              "value": "null"
            }
          ],
          "max": 1,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 0.75
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 0.9
              }
            ]
          },
          "unit": "percentunit"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 5,
        "y": 0
      },
      "id": 23,
      "links": [],
      "options": {
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "aggregation": "Last",
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "decimals": 2,
          "displayAliasType": "Warning / Critical",
          "displayType": "Regular",
          "displayValueWithAlias": "Never",
          "expr": "memcached_current_bytes{job=\"$job\"}/memcached_limit_bytes{job=\"$job\"}",
          "format": "time_series",
          "instant": false,
          "intervalFactor": 1,
          "legendFormat": "Memory used",
          "refId": "A",
          "step": 20,
          "units": "none",
          "valueHandler": "Number Threshold"
        }
      ],
      "title": "Memory usage",
      "type": "gauge"
    },
    {
      "aliasColors": {
        "ratio": "#6ED0E0"
      },
      "bars": true,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "editable": true,
      "error": false,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 0,
      "fillGradient": 0,
      "grid": {},
      "gridPos": {
        "h": 6,
        "w": 8,
        "x": 8,
        "y": 0
      },
      "hiddenSeries": false,
      "id": 25,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": false,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "connected",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (memcached_commands_total{command=\"get\",job=\"$job\"}) / (sum (memcached_commands_total{command=\"get\",job=\"$job\"}) + sum (memcached_commands_total{command=\"set\",job=\"$job\"}))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Get",
          "refId": "A",
          "step": 5,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (memcached_commands_total{command=\"set\",job=\"$job\"}) / (sum (memcached_commands_total{command=\"get\",job=\"$job\"}) + sum (memcached_commands_total{command=\"set\",job=\"$job\"}))",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "Set",
          "refId": "B",
          "step": 10
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Get & Set ratio",
      "tooltip": {
        "msResolution": false,
        "shared": false,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "series",
        "show": true,
        "values": [
          "current"
        ]
      },
      "yaxes": [
        {
          "format": "percentunit",
          "logBase": 1,
          "max": "1",
          "min": "0",
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": false
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": true,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "editable": true,
      "error": false,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "grid": {},
      "gridPos": {
        "h": 6,
        "w": 8,
        "x": 16,
        "y": 0
      },
      "height": "120px",
      "hiddenSeries": false,
      "id": 29,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": false,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "connected",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [
        {
          "alias": "miss",
          "color": "#E24D42"
        }
      ],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (memcached_commands_total{status=\"hit\",command=\"get\",}) / sum (memcached_commands_total{command=\"get\",job=\"$job\"})",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "hit",
          "refId": "C",
          "step": 5
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (memcached_commands_total{status=\"miss\",command=\"get\",job=\"$job\"}) / sum (memcached_commands_total{command=\"get\",job=\"$job\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "miss",
          "refId": "A",
          "step": 5
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Hit & Miss ratio",
      "tooltip": {
        "msResolution": true,
        "shared": false,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "series",
        "show": true,
        "values": [
          "current"
        ]
      },
      "yaxes": [
        {
          "format": "percentunit",
          "logBase": 1,
          "max": "1",
          "min": 0,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": false
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 0,
        "y": 4
      },
      "id": 35,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (memcached_items_evicted_total{job=\"$job\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A",
          "step": 20
        }
      ],
      "title": "Evicts (total)",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 4,
        "y": 4
      },
      "id": 37,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (memcached_items_reclaimed_total{job=\"$job\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A",
          "step": 20
        }
      ],
      "title": "Reclaims (total)",
      "type": "stat"
    },
    {
      "aliasColors": {
        "evicts": "#890F02",
        "memcached_items_evicted_total{instance=\"172.17.0.1:9150\",job=\"prometheus\"}": "#890F02",
        "reclaims": "#3F6833"
      },
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "editable": true,
      "error": false,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "grid": {},
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 0,
        "y": 6
      },
      "height": "240px",
      "hiddenSeries": false,
      "id": 27,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "connected",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [
        {
          "alias": "reclaims",
          "yaxis": 2
        }
      ],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (irate (memcached_items_evicted_total{job=\"$job\"}[5m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "evicts",
          "refId": "A",
          "step": 5,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (irate (memcached_items_reclaimed_total{job=\"$job\"}[5m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "reclaims",
          "refId": "B",
          "step": 5
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Evicts & Reclaims rate",
      "tooltip": {
        "msResolution": false,
        "shared": true,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 8,
        "y": 6
      },
      "hiddenSeries": false,
      "id": 10,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "memcached_current_connections{job=\"$job\"}",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "Connections",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Connections",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "none",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "editable": true,
      "error": false,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "grid": {},
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 16,
        "y": 6
      },
      "height": "240px",
      "hiddenSeries": false,
      "id": 31,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "connected",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (irate (memcached_commands_total{status=\"hit\",command=\"get\",job=\"$job\"}[5m])) / sum (irate (memcached_commands_total{command=\"get\",job=\"$job\"}[5m]\n))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Hit",
          "refId": "A",
          "step": 5
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Hit rate",
      "tooltip": {
        "msResolution": true,
        "shared": false,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": [
          "total"
        ]
      },
      "yaxes": [
        {
          "format": "percentunit",
          "logBase": 1,
          "min": 0,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": false
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 0,
        "y": 13
      },
      "hiddenSeries": false,
      "id": 19,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "delta(memcached_read_bytes_total{job=\"$job\"}[1m])",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "read",
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "delta(memcached_written_bytes_total{job=\"$job\"}[1m])",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "write",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Read/Write",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "bytes",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 8,
        "y": 13
      },
      "hiddenSeries": false,
      "id": 5,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "memcached_current_bytes{job=\"$job\"}/memcached_limit_bytes{job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Memory",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Memory Used",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 16,
        "y": 13
      },
      "hiddenSeries": false,
      "id": 4,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "memcached_current_items{job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Items",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Items in cache",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 20
      },
      "id": 38,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "editorMode": "code",
          "expr": "memcached_process_resident_memory_bytes",
          "instant": true,
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "RSS Memory",
      "type": "timeseries"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 20
      },
      "hiddenSeries": false,
      "id": 20,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum(delta(memcached_commands_total{job=\"$job\"}[30s]))/30",
          "format": "time_series",
          "interval": "15s",
          "intervalFactor": 1,
          "legendFormat": "QPS",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "QPS",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "editable": true,
      "error": false,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "grid": {},
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 28
      },
      "hiddenSeries": false,
      "id": 33,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "hideEmpty": false,
        "hideZero": false,
        "max": false,
        "min": false,
        "rightSide": true,
        "show": true,
        "sideWidth": 120,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "connected",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (irate (memcached_commands_total{job=\"$job\"}[5m])) by (command)",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{command}}",
          "refId": "A",
          "step": 4,
          "target": ""
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Commands",
      "tooltip": {
        "msResolution": false,
        "shared": true,
        "sort": 0,
        "value_type": "cumulative"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": false
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "decimals": 2,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 12,
        "y": 28
      },
      "hiddenSeries": false,
      "id": 11,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (delta(memcached_commands_total{job=\"$job\", status=\"hit\",command=\"get\"}[1m]))  / sum (delta(memcached_commands_total{job=\"$job\",command=\"get\"}[1m])) * 100",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "get",
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (delta(memcached_commands_total{job=\"$job\", status=\"hit\",command=\"delete\"}[1m]))  / sum (delta(memcached_commands_total{job=\"$job\",command=\"delete\"}[1m])) * 100",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "delete",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Hit Ratio Per Command",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "decimals": 2,
          "format": "percent",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "decimals": 2,
      "fieldConfig": {
        "defaults": {
          "links": []
        },
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 0,
        "y": 36
      },
      "hiddenSeries": false,
      "id": 9,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "10.1.10",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (delta(memcached_commands_total{job=\"$job\", status=\"hit\"}[1m]))  / sum (delta(memcached_commands_total{job=\"$job\"}[1m])) * 100",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "Hit Ratio",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "Hit Ratio",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "decimals": 2,
          "format": "percent",
          "logBase": 1,
          "show": true
        },
        {
          "format": "short",
          "logBase": 1,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    }
  ],
  "refresh": "10s",
  "schemaVersion": 38,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "memcached-exporter",
          "value": "memcached-exporter"
        },
        "datasource": {
          "type": "prometheus",
          "uid": "PBFA97CFB590B2093"
        },
        "definition": "",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "job",
        "options": [],
        "query": "label_values(memcached_up, job)",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "Memcached",
  "uid": "AQxf3X-mk",
  "version": 1,
  "weekStart": ""
}


================================================
FILE: tools/local/monitoring/grafana/provisioning/dashboards/node-exporter.json
================================================
{
  "__inputs": [
    {
      "name": "DS_PROMETHEUS",
      "label": "Prometheus",
      "description": "",
      "type": "datasource",
      "pluginId": "prometheus",
      "pluginName": "Prometheus"
    }
  ],
  "__elements": {},
  "__requires": [
    {
      "type": "panel",
      "id": "bargauge",
      "name": "Bar gauge",
      "version": ""
    },
    {
      "type": "panel",
      "id": "gauge",
      "name": "Gauge",
      "version": ""
    },
    {
      "type": "grafana",
      "id": "grafana",
      "name": "Grafana",
      "version": "9.4.3"
    },
    {
      "type": "datasource",
      "id": "prometheus",
      "name": "Prometheus",
      "version": "1.0.0"
    },
    {
      "type": "panel",
      "id": "stat",
      "name": "Stat",
      "version": ""
    },
    {
      "type": "panel",
      "id": "timeseries",
      "name": "Time series",
      "version": ""
    }
  ],
  "annotations": {
    "list": [
      {
        "$$hashKey": "object:1058",
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [],
          "type": "dashboard"
        },
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": 1860,
  "graphTooltip": 1,
  "id": null,
  "links": [
    {
      "icon": "external link",
      "tags": [],
      "targetBlank": true,
      "title": "GitHub",
      "type": "link",
      "url": "https://github.com/rfmoz/grafana-dashboards"
    },
    {
      "icon": "external link",
      "tags": [],
      "targetBlank": true,
      "title": "Grafana",
      "type": "link",
      "url": "https://grafana.com/grafana/dashboards/1860"
    }
  ],
  "liveNow": false,
  "panels": [
    {
      "collapsed": false,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 261,
      "panels": [],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Quick CPU / Mem / Disk",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Resource pressure via PSI",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "links": [],
          "mappings": [],
          "max": 1,
          "min": 0,
          "thresholds": {
            "mode": "percentage",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "dark-yellow",
                "value": 70
              },
              {
                "color": "dark-red",
                "value": 90
              }
            ]
          },
          "unit": "percentunit"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 0,
        "y": 1
      },
      "id": 323,
      "links": [],
      "options": {
        "displayMode": "basic",
        "minVizHeight": 10,
        "minVizWidth": 0,
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showUnfilled": true,
        "text": {}
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
          "format": "time_series",
          "instant": true,
          "intervalFactor": 1,
          "legendFormat": "CPU",
          "range": false,
          "refId": "CPU some",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "legendFormat": "Mem",
          "range": false,
          "refId": "Memory some",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "legendFormat": "I/O",
          "range": false,
          "refId": "I/O some",
          "step": 240
        }
      ],
      "title": "Pressure",
      "type": "bargauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Busy state of all CPU cores together",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 85
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 95
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 3,
        "y": 1
      },
      "id": 20,
      "links": [],
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "legendFormat": "",
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "CPU Busy",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "System load  over all CPU cores together",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 85
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 95
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 6,
        "y": 1
      },
      "id": 155,
      "links": [],
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))",
          "format": "time_series",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Sys Load",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Non available RAM memory",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 80
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 90
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 9,
        "y": 1
      },
      "hideTimeOverride": false,
      "id": 16,
      "links": [],
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "((node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\", job=\"$job\"}) / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"}) * 100",
          "format": "time_series",
          "hide": true,
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100",
          "format": "time_series",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "B",
          "step": 240
        }
      ],
      "title": "RAM Used",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Used Swap",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 10
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 25
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 12,
        "y": 1
      },
      "id": 21,
      "links": [],
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100",
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "SWAP Used",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Used Root FS",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 80
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 90
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 3,
        "x": 15,
        "y": 1
      },
      "id": 154,
      "links": [],
      "options": {
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})",
          "format": "time_series",
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Root FS Used",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Total number of CPU cores",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 2,
        "x": 18,
        "y": 1
      },
      "id": 14,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))",
          "instant": true,
          "legendFormat": "__auto",
          "range": false,
          "refId": "A"
        }
      ],
      "title": "CPU Cores",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "System uptime",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 1,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 20,
        "y": 1
      },
      "hideTimeOverride": true,
      "id": 15,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}",
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Uptime",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Total RootFS",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 70
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 90
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 2,
        "x": 18,
        "y": 3
      },
      "id": 23,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}",
          "format": "time_series",
          "hide": false,
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "RootFS Total",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Total RAM",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 2,
        "x": 20,
        "y": 3
      },
      "id": 75,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}",
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "RAM Total",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Total SWAP",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 2,
        "w": 2,
        "x": 22,
        "y": 3
      },
      "id": 18,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "none",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "9.4.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}",
          "instant": true,
          "intervalFactor": 1,
          "range": false,
          "refId": "A",
          "step": 240
        }
      ],
      "title": "SWAP Total",
      "type": "stat"
    },
    {
      "collapsed": false,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 5
      },
      "id": 263,
      "panels": [],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Basic CPU / Mem / Net / Disk",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Basic CPU info",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 40,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "smooth",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "percent"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percentunit"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Busy Iowait"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Idle"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#052B51",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Busy Iowait"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Idle"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#7EB26D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Busy System"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#EAB839",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Busy User"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A437C",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Busy Other"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#6D1F62",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 6
      },
      "id": 77,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true,
          "width": 250
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "9.2.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "hide": false,
          "instant": false,
          "intervalFactor": 1,
          "legendFormat": "Busy System",
          "range": true,
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "Busy User",
          "range": true,
          "refId": "B",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Busy Iowait",
          "range": true,
          "refId": "C",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Busy IRQs",
          "range": true,
          "refId": "D",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\",  mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Busy Other",
          "range": true,
          "refId": "E",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr":
              "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Idle",
          "range": true,
          "refId": "F",
          "step": 240
        }
      ],
      "title": "CPU Basic",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Basic memory usage",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 40,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Apps"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#629E51",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Buffers"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#614D93",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Cache"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#6D1F62",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Cached"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#511749",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Committed"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#508642",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Free"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A437C",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options":
                  "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#CFFAFF",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Inactive"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#584477",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "PageTables"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Page_Tables"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "RAM_Free"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#E0F9D7",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "SWAP Used"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Slab"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#806EB7",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Slab_Cache"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#E0752D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Swap"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Swap Used"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Swap_Cache"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#C15C17",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Swap_Free"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#2F575E",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Unused"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#EAB839",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "RAM Total"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#E0F9D7",
                  "mode": "fixed"
                }
              },
              {
                "id": "custom.fillOpacity",
                "value": 0
              },
              {
                "id": "custom.stacking",
                "value": {
                  "group": false,
                  "mode": "normal"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "RAM Cache + Buffer"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#052B51",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "RAM Free"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#7EB26D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Available"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#DEDAF7",
                  "mode": "fixed"
                }
              },
              {
                "id": "custom.fillOpacity",
                "value": 0
              },
              {
                "id": "custom.stacking",
                "value": {
                  "group": false,
                  "mode": "normal"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 6
      },
      "id": 78,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true,
          "width": 350
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "9.2.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "RAM Total",
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "RAM Used",
          "refId": "B",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "RAM Cache + Buffer",
          "refId": "C",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "RAM Free",
          "refId": "D",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "SWAP Used",
          "refId": "E",
          "step": 240
        }
      ],
      "title": "Memory Basic",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Basic network info per interface",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 40,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bps"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_bytes_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#7EB26D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_bytes_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_drop_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#6ED0E0",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_drop_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#E0F9D7",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_errs_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Recv_errs_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#CCA300",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_bytes_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#7EB26D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_bytes_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_drop_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#6ED0E0",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_drop_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#E0F9D7",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_errs_eth2"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Trans_errs_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#CCA300",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "recv_bytes_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "recv_drop_eth0"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#99440A",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "recv_drop_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#967302",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "recv_errs_eth0"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "recv_errs_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_bytes_eth0"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#7EB26D",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_bytes_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#0A50A1",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_drop_eth0"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#99440A",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_drop_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#967302",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_errs_eth0"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "trans_errs_lo"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byRegexp",
              "options": "/.*trans.*/"
            },
            "properties": [
              {
                "id": "custom.transform",
                "value": "negative-Y"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 13
      },
      "id": 74,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "9.2.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "recv {{device}}",
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "trans {{device}} ",
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Network Traffic Basic",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Disk space used of all filesystems mounted",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 40,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 13
      },
      "id": 152,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "9.2.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr":
              "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{mountpoint}}",
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Disk Space Used Basic",
      "type": "timeseries"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 20
      },
      "id": 265,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "percentage",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 70,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "smooth",
                "lineWidth": 2,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "percent"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Idle - Waiting for something to happen"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Iowait - Waiting for I/O to complete"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Irq - Servicing interrupts"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Nice - Niced processes executing in user mode"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Softirq - Servicing softirqs"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Steal - Time spent in other operating systems when running in a virtualized environment"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCE2DE",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "System - Processes executing in kernel mode"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "User - Normal processes executing in user mode"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#5195CE",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 0,
            "y": 21
          },
          "id": 3,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 250
            },
            "tooltip": {
              "mode": "multi",
              "sort": "desc"
            }
          },
          "pluginVersion": "9.2.0",
          "targets":
              [
                {
                  "datasource": {
                    "type": "prometheus",
                    "uid": "${datasource}"
                  },
                  "editorMode": "code",
                  "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "interval": "",
                  "intervalFactor": 1,
                  "legendFormat": "System - Processes executing in kernel mode",
                  "range": true,
                  "refId": "A",
                  "step": 240
                },
                {
                  "datasource":
                      {
                        "type": "prometheus",
                        "uid": "${datasource}"
                      },
                  "editorMode": "code",
                  "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat": "User - Normal processes executing in user mode",
                  "range": true,
                  "refId": "B",
                  "step": 240
                },
                {
                  "datasource":
                      {
                        "type": "prometheus",
                        "uid": "${datasource}"
                      },
                  "editorMode": "code",
                  "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat": "Nice - Niced processes executing in user mode",
                  "range": true,
                  "refId": "C",
                  "step": 240
                },
                {
                  "datasource":
                      {
                        "type": "prometheus",
                        "uid": "${datasource}"
                      },
                  "editorMode": "code",
                  "expr":
                      "sum by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat": "Iowait - Waiting for I/O to complete",
                  "range": true,
                  "refId": "E",
                  "step": 240
                },
                {
                  "datasource": {
                    "type": "prometheus",
                    "uid": "${datasource}"
                  },
                  "editorMode": "code",
                  "expr":
                      "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat": "Irq - Servicing interrupts",
                  "range": true,
                  "refId": "F",
                  "step": 240
                },
                {
                  "datasource": {
                    "type": "prometheus",
                    "uid": "${datasource}"
                  },
                  "editorMode": "code",
                  "expr":
                      "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"softirq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat": "Softirq - Servicing softirqs",
                  "range": true,
                  "refId": "G",
                  "step": 240
                },
                {
                  "datasource": {
                    "type": "prometheus",
                    "uid": "${datasource}"
                  },
                  "editorMode": "code",
                  "expr":
                      "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"steal\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "intervalFactor": 1,
                  "legendFormat":
                      "Steal - Time spent in other operating systems when running in a virtualized environment",
                  "range": true,
                  "refId": "H",
                  "step": 240
                },
                {
                  "datasource": {
                    "type": "prometheus",
                    "uid": "${datasource}"
                  },
                  "editorMode": "code",
                  "expr":
                      "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))",
                  "format": "time_series",
                  "hide": false,
                  "intervalFactor": 1,
                  "legendFormat": "Idle - Waiting for something to happen",
                  "range": true,
                  "refId": "J",
                  "step": 240
                }
              ],
          "title": "CPU",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 40,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap - Swap memory usage"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused - Free memory unassigned"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Hardware Corrupted - *./"
                },
                "properties": [
                  {
                    "id": "custom.stacking",
                    "value": {
                      "group": false,
                      "mode": "normal"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 12,
            "y": 21
          },
          "id": 24,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Apps - Memory used by user-space applications",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "PageTables - Memory used to map between virtual and physical memory addresses",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)",
              "refId": "D",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Cache - Parked file data (file content) cache",
              "refId": "E",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Buffers - Block device (e.g. harddisk) cache",
              "refId": "F",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Unused - Free memory unassigned",
              "refId": "G",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Swap - Swap space used",
              "refId": "H",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working",
              "refId": "I",
              "step": 240
            }
          ],
          "title": "Memory Stack",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bits out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 40,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "receive_packets_eth0"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "receive_packets_lo"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "transmit_packets_eth0"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "transmit_packets_lo"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 0,
            "y": 33
          },
          "id": 84,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 40,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 12,
            "y": 33
          },
          "id": 156,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Disk Space Used",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "IO read (-) / write (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "iops"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Read.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 0,
            "y": 45
          },
          "id": 229,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - Reads completed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Writes completed",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk IOps",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes read (-) / write (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 40,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "Bps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "io time"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*read*./"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byType",
                  "options": "time"
                },
                "properties": [
                  {
                    "id": "custom.axisPlacement",
                    "value": "hidden"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 12,
            "y": 45
          },
          "id": 42,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Successfully read bytes",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Successfully written bytes",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "I/O Usage Read / Write",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "%util",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 40,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "io time"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byType",
                  "options": "time"
                },
                "properties": [
                  {
                    "id": "custom.axisPlacement",
                    "value": "hidden"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 0,
            "y": 57
          },
          "id": 127,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{device}}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "I/O Utilization",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "percentage",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "bars",
                "fillOpacity": 70,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "smooth",
                "lineWidth": 2,
                "pointSize": 3,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "max": 1,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/^Guest - /"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#5195ce",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/^GuestNice - /"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#c15c17",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 12,
            "w": 12,
            "x": 12,
            "y": 57
          },
          "id": 319,
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "desc"
            }
          },
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))",
              "hide": false,
              "legendFormat":
                  "Guest - Time spent running a virtual CPU for a guest operating system",
              "range": true,
              "refId": "A"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))",
              "hide": false,
              "legendFormat":
                  "GuestNice - Time spent running a niced guest  (virtual CPU for guest operating system)",
              "range": true,
              "refId": "B"
            }
          ],
          "title": "CPU spent seconds in guests (VMs)",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "CPU / Memory / Net / Disk",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 21
      },
      "id": 266,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 54
          },
          "id": 136,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "Inactive - Memory which has been less recently used.  It is more eligible to be reclaimed for other purposes",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Active / Inactive",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*CommitLimit - *./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 54
          },
          "id": 135,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Committed_AS - Amount of memory presently allocated on the system",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "CommitLimit - Amount of  memory currently available to be allocated on the system",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Committed",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 64
          },
          "id": 191,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Inactive_file - File-backed memory on inactive LRU list",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Active_file - File-backed memory on active LRU list",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs",
              "refId": "D",
              "step": 240
            }
          ],
          "title": "Memory Active / Inactive Detail",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 64
          },
          "id": 130,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Writeback - Memory which is actively being written back to disk",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Dirty - Memory which is waiting to get written back to the disk",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Memory Writeback and Dirty",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated  with huge pages"
                },
                "properties": [
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated  with huge pages"
                },
                "properties": [
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 74
          },
          "id": 138,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "Mapped - Used memory in mapped pages files which have been mapped, such as libraries",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "Shmem - Used shared memory (shared between several processes, thus including RAM disks)",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated  with huge pages",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages",
              "refId": "D",
              "step": 240
            }
          ],
          "title": "Memory Shared and Mapped",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 74
          },
          "id": 131,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "SReclaimable - Part of Slab, that might be reclaimed, such as caches",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Slab",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 84
          },
          "id": 70,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat":
                  "VmallocChunk - Largest contiguous block of vmalloc area which is free",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "VmallocTotal - Total size of vmalloc memory area",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "VmallocUsed - Amount of vmalloc area which is used",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Memory Vmalloc",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 84
          },
          "id": 159,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Bounce - Memory used for block device bounce buffers",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Memory Bounce",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Inactive *./"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 94
          },
          "id": 129,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "AnonHugePages - Memory in anonymous huge pages",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "AnonPages - Memory in user pages not backed by files",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Anonymous",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 94
          },
          "id": 160,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "KernelStack - Kernel memory stack. This is not reclaimable",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Kernel / CPU",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "pages",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 104
          },
          "id": 140,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Memory HugePages Counter",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 104
          },
          "id": 71,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "HugePages - Total size of the pool of huge pages",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Hugepagesize - Huge Page size",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory HugePages Size",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 114
          },
          "id": 128,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "DirectMap1G - Amount of pages mapped as this size",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "DirectMap2M - Amount of pages mapped as this size",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "DirectMap4K - Amount of pages mapped as this size",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Memory DirectMap",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 114
          },
          "id": 137,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "MLocked - Size of pages locked to memory using the mlock() system call",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Unevictable and MLocked",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 124
          },
          "id": 132,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat":
                  "NFS Unstable - Memory in NFS pages sent to the server, but not yet committed to the storage",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Memory NFS",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Memory Meminfo",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 22
      },
      "id": 267,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "pages out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*out/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 41
          },
          "id": 176,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pagesin - Page in operations",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pagesout - Page out operations",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Pages In / Out",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "pages out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*out/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 41
          },
          "id": 22,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pswpin - Pages swapped in",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pswpout - Pages swapped out",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Memory Pages Swap In / Out",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "faults",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Apps"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#629E51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A437C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options":
                      "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#CFFAFF",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "RAM_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#806EB7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#2F575E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Unused"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Pgfault - Page major and minor fault operations"
                },
                "properties": [
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  },
                  {
                    "id": "custom.stacking",
                    "value": {
                      "group": false,
                      "mode": "normal"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 51
          },
          "id": 175,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 350
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pgfault - Page major and minor fault operations",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pgmajfault - Major page fault operations",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])  - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Pgminfault - Minor page fault operations",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Memory Page Faults",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#99440A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Buffers"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#58140C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6D1F62",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Cached"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Committed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#508642",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Dirty"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Free"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#B7DBAB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Mapped"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "PageTables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Page_Tables"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Slab_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Swap_Cache"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C15C17",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#511749",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total RAM + Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#052B51",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Total Swap"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "VmallocUsed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 51
          },
          "id": 307,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "oom killer invocations ",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "OOM Killer",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Memory Vmstat",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 23
      },
      "id": 293,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "seconds",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Variation*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 24
          },
          "id": 260,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Estimated error in seconds",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Time offset in between local system and reference clock",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Maximum error in seconds",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Time Synchronized Drift",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 24
          },
          "id": 291,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Phase-locked loop time adjust",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Time PLL Adjust",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Variation*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 34
          },
          "id": 168,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Local clock frequency adjustment",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Time Synchronized Status",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "seconds",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 34
          },
          "id": 294,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Seconds between clock ticks",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "International Atomic Time (TAI) offset",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Time Misc",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "System Timesync",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 24
      },
      "id": 312,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 73
          },
          "id": 62,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Processes blocked waiting for I/O to complete",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Processes in runnable state",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Processes Status",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "Enable with --collector.processes argument on node-exporter",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 73
          },
          "id": 315,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ state }}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Processes State",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "forks / sec",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 83
          },
          "id": 148,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Processes forks second",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Processes  Forks",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "decbytes"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Max.*/"
                },
                "properties": [
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 83
          },
          "id": 149,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Processes virtual memory size in bytes",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Maximum amount of virtual memory available in bytes",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Processes virtual memory size in bytes",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Maximum amount of virtual memory available in bytes",
              "refId": "D",
              "step": 240
            }
          ],
          "title": "Processes Memory",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "Enable with --collector.processes argument on node-exporter",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "PIDs limit"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F2495C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 93
          },
          "id": 313,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Number of PIDs",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "PIDs limit",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "PIDs Number and Limit",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "seconds",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*waiting.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 93
          },
          "id": 305,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{ cpu }} - seconds spent running a process",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Process schedule stats Running / Waiting",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "Enable with --collector.processes argument on node-exporter",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Threads limit"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F2495C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 103
          },
          "id": 314,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Allocated threads",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Threads limit",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Threads Number and Limit",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "System Processes",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 25
      },
      "id": 269,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 26
          },
          "id": 8,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Context switches",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Interrupts",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Context Switches / Interrupts",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 26
          },
          "id": 7,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_load1{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 4,
              "legendFormat": "Load 1m",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_load5{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 4,
              "legendFormat": "Load 5m",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_load15{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 4,
              "legendFormat": "Load 15m",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "System Load",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "hertz"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Max"
                },
                "properties": [
                  {
                    "id": "custom.lineStyle",
                    "value": {
                      "dash": [
                        10,
                        10
                      ],
                      "fill": "dash"
                    }
                  },
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "blue",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 10
                  },
                  {
                    "id": "custom.hideFrom",
                    "value": {
                      "legend": true,
                      "tooltip": false,
                      "viz": false
                    }
                  },
                  {
                    "id": "custom.fillBelowTo",
                    "value": "Min"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Min"
                },
                "properties": [
                  {
                    "id": "custom.lineStyle",
                    "value": {
                      "dash": [
                        10,
                        10
                      ],
                      "fill": "dash"
                    }
                  },
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "blue",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.hideFrom",
                    "value": {
                      "legend": true,
                      "tooltip": false,
                      "viz": false
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 36
          },
          "id": 321,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "desc"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{ cpu }}",
              "range": true,
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Max",
              "range": true,
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Min",
              "range": true,
              "refId": "C",
              "step": 240
            }
          ],
          "title": "CPU Frequency Scaling",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "https://docs.kernel.org/accounting/psi.html",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 10,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Memory some"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "dark-red",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Memory full"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "light-red",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "I/O some"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "dark-blue",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "I/O full"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "light-blue",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 36
          },
          "id": 322,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "CPU some",
              "range": true,
              "refId": "CPU some",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Memory some",
              "range": true,
              "refId": "Memory some",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "Memory full",
              "range": true,
              "refId": "Memory full",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "I/O some",
              "range": true,
              "refId": "I/O some",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "I/O full",
              "range": true,
              "refId": "I/O full",
              "step": 240
            }
          ],
          "title": "Pressure Stall Information",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "Enable with --collector.interrupts argument on node-exporter",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Critical*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Max*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 46
          },
          "id": 259,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ type }} - {{ info }}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Interrupts Detail",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 46
          },
          "id": 306,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{ cpu }}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Schedule timeslices executed by each cpu",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 56
          },
          "id": 151,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Entropy available to random number generators",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Entropy",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "seconds",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 56
          },
          "id": 308,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Time spent",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "CPU time spent in user and system contexts",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Max*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 66
          },
          "id": 64,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Maximum open file descriptors",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Open file descriptors",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "File Descriptors",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "System Misc",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 26
      },
      "id": 304,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "temperature",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "celsius"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Critical*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Max*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 59
          },
          "id": 158,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ chip_name }} {{ sensor }} temp",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": true,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ chip_name }} {{ sensor }} Critical Alarm",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ chip_name }} {{ sensor }} Critical",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": true,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ chip_name }} {{ sensor }} Critical Historical",
              "refId": "D",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": true,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ chip_name }} {{ sensor }} Max",
              "refId": "E",
              "step": 240
            }
          ],
          "title": "Hardware temperature monitor",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Max*./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 59
          },
          "id": 300,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Current {{ name }} in {{ type }}",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Max {{ name }} in {{ type }}",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Throttle cooling device",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 69
          },
          "id": 302,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ power_supply }} online",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Power supply",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Hardware Misc",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 27
      },
      "id": 296,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 46
          },
          "id": 297,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{ name }} Connections",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Systemd Sockets",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Failed"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F2495C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Inactive"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FF9830",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Active"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#73BF69",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Deactivating"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FFCB7D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Activating"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#C8F2C2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 46
          },
          "id": 298,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Activating",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Active",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Deactivating",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Failed",
              "refId": "D",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Inactive",
              "refId": "E",
              "step": 240
            }
          ],
          "title": "Systemd Units State",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Systemd",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 28
      },
      "id": 270,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description":
              "The number (after merges) of I/O requests completed per second for the device",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "IO read (-) / write (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "iops"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Read.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 47
          },
          "id": 9,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - Reads completed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Writes completed",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk IOps Completed",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "The number of bytes read from or written to the device per second",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes read (-) / write (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "Bps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Read.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 47
          },
          "id": 33,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - Read bytes",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Written bytes",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk R/W Data",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description":
              "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "time. read (-) / write (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 30,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Read.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 57
          },
          "id": 37,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - Read wait time avg",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Write wait time avg",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk Average Wait Time",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "The average queue length of the requests that were issued to the device",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "aqu-sz",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "none"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 57
          },
          "id": 35,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}}",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Average Queue Size",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description":
              "The number of read and write requests merged per second that were queued to the device",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "I/Os",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "iops"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Read.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 67
          },
          "id": 133,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Read merged",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Write merged",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk R/W Merged",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description":
              "Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially.  But for devices  serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "%util",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 30,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 67
          },
          "id": 36,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - IO",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - discard",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Time Spent Doing I/Os",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description":
              "The number of outstanding requests at the instant the sample was taken. Incremented as requests are given to appropriate struct request_queue and decremented as they finish.",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "Outstanding req.",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "none"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 77
          },
          "id": 34,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}",
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - IO now",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Instantaneous Queue Size",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "IOs",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "iops"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EAB839",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#6ED0E0",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EF843C",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#584477",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda2_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BA43A9",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sda3_.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F4D598",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#0A50A1",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#BF1B00",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdb3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0752D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#962D82",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#614D93",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdc3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#9AC48A",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#65C5DB",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9934E",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#EA6460",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde1.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E0F9D7",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sdd2.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#FCEACA",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*sde3.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F9E2D2",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 77
          },
          "id": 301,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "intervalFactor": 4,
              "legendFormat": "{{device}} - Discards completed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Discards merged",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Disk IOps Discards completed / merged",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Storage Disk",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 29
      },
      "id": 271,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 62
          },
          "id": 43,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - Available",
              "metric": "",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "hide": true,
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - Free",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "hide": true,
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - Size",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Filesystem space available",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "file nodes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 62
          },
          "id": 41,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - Free file nodes",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "File Nodes Free",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "files",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 72
          },
          "id": 28,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "single",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 4,
              "legendFormat": "Max open files",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "Open files",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "File Descriptor",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "file Nodes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 72
          },
          "id": 219,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - File nodes total",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "File Nodes Size",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "max": 1,
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "/ ReadOnly"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 82
          },
          "id": 44,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - ReadOnly",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{mountpoint}} - Device error",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Filesystem in ReadOnly / Error",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Storage Filesystem",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 30
      },
      "id": 272,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "receive_packets_eth0"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "receive_packets_lo"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "transmit_packets_eth0"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#7EB26D",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "transmit_packets_lo"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#E24D42",
                      "mode": "fixed"
                    }
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 47
          },
          "id": 60,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic by Packets",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 47
          },
          "id": 142,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive errors",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit errors",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic Errors",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 57
          },
          "id": 143,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive drop",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit drop",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic Drop",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 57
          },
          "id": 141,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive compressed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit compressed",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic Compressed",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 67
          },
          "id": 146,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive multicast",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Network Traffic Multicast",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 67
          },
          "id": 144,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive fifo",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit fifo",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Network Traffic Fifo",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "pps"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 77
          },
          "id": 145,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Receive frame",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Network Traffic Frame",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 77
          },
          "id": 231,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Statistic transmit_carrier",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Network Traffic Carrier",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Trans.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 87
          },
          "id": 232,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{device}} - Transmit colls",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Network Traffic Colls",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "entries",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "NF conntrack limit"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 87
          },
          "id": 61,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "NF conntrack entries",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "NF conntrack limit",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "NF Conntrack",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "Entries",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 97
          },
          "id": 230,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{ device }} - ARP entries",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "ARP Entries",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "decimals": 0,
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 97
          },
          "id": 288,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{ device }} - Bytes",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "MTU",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "decimals": 0,
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 107
          },
          "id": 280,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{ device }} - Speed",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Speed",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packets",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "decimals": 0,
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "none"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 107
          },
          "id": 289,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{ device }} -   Interface transmit queue length",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Queue Length",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "packetes drop (-) / process (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Dropped.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 117
          },
          "id": 290,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{cpu}} - Processed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{cpu}} - Dropped",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Softnet Packets",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 117
          },
          "id": 310,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "CPU {{cpu}} - Squeezed",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Softnet Out of Quota",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 127
          },
          "id": 309,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "{{interface}} - Operational state UP",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "instant": false,
              "legendFormat": "{{device}} - Physical link state",
              "refId": "B"
            }
          ],
          "title": "Network Operational Status",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Network Traffic",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 31
      },
      "id": 273,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 48
          },
          "id": 63,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "TCP_alloc - Allocated sockets",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "TCP_inuse - Tcp sockets currently in use",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": true,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "TCP_mem - Used memory for tcp",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "TCP_orphan - Orphan sockets",
              "refId": "D",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "TCP_tw - Sockets waiting close",
              "refId": "E",
              "step": 240
            }
          ],
          "title": "Sockstat TCP",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 48
          },
          "id": 124,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "UDP_inuse - Udp sockets currently in use",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "UDP_mem - Used memory for udp",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Sockstat UDP",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 58
          },
          "id": 125,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "FRAG_inuse - Frag sockets currently in use",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "RAW_inuse - Raw sockets currently in use",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "Sockstat FRAG / RAW",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "bytes",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 58
          },
          "id": 220,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "mem_bytes - TCP sockets in that state",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "mem_bytes - UDP sockets in that state",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "FRAG_memory - Used memory for frag",
              "refId": "C"
            }
          ],
          "title": "Sockstat Memory Size",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "sockets",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 68
          },
          "id": 126,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Sockets_used - Sockets currently in use",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Sockstat Used",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Network Sockstat",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 32
      },
      "id": 274,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "octets out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Out.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 33
          },
          "id": 221,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "InOctets - Received octets",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "intervalFactor": 1,
              "legendFormat": "OutOctets - Sent octets",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Netstat IP In / Out Octets",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "datagrams",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 33
          },
          "id": 81,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true,
              "width": 300
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "Forwarding - IP forwarding",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Netstat IP Forwarding",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "messages out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Out.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 43
          },
          "id": 115,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "InMsgs -  Messages which the entity received. Note that this counter includes all those counted by icmpInErrors",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "ICMP In / Out",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "messages out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Out.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 43
          },
          "id": 50,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "ICMP Errors",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "datagrams out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Out.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Snd.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 53
          },
          "id": 55,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "InDatagrams - Datagrams received",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "OutDatagrams - Datagrams sent",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "UDP In / Out",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "datagrams",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 53
          },
          "id": 109,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "InErrors - UDP Datagrams that could not be delivered to an application",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "legendFormat":
                  "InErrors Lite - UDPLite Datagrams that could not be delivered to an application",
              "refId": "C"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "RcvbufErrors - UDP buffer errors received",
              "refId": "D",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "SndbufErrors - UDP buffer errors send",
              "refId": "E",
              "step": 240
            }
          ],
          "title": "UDP Errors",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "datagrams out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Out.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Snd.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 63
          },
          "id": 299,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "instant": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "InSegs - Segments received, including those received in error. This count includes segments received on currently established connections",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "TCP In / Out",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 63
          },
          "id": 104,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits",
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "legendFormat":
                  "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets",
              "refId": "D"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)",
              "refId": "E"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "interval": "",
              "legendFormat": "OutRsts - Segments sent with RST flag",
              "refId": "F"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "legendFormat":
                  "TCPRcvQDrop - Packets meant to be queued in rcv queue but dropped because socket rcvbuf limit hit",
              "range": true,
              "refId": "G"
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "hide": false,
              "interval": "",
              "legendFormat":
                  "TCPOFOQueue - TCP layer receives an out of order packet and has enough memory to queue it",
              "range": true,
              "refId": "H"
            }
          ],
          "title": "TCP Errors",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "connections",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*MaxConn *./"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#890F02",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 73
          },
          "id": 85,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "MaxConn - Limit on the total number of TCP connections the entity can support (Dynamic is \"-1\")",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "TCP Connections",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter out (-) / in (+)",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*Sent.*/"
                },
                "properties": [
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 73
          },
          "id": 91,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "SyncookiesFailed - Invalid SYN cookies received",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "SyncookiesRecv - SYN cookies received",
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "SyncookiesSent - SYN cookies sent",
              "refId": "C",
              "step": 240
            }
          ],
          "title": "TCP SynCookie",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "connections",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 83
          },
          "id": 82,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr":
                  "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat":
                  "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD state from the LISTEN state",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "TCP Direct Transition",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "Enable with --collector.tcpstat argument on node-exporter",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "connections",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 83
          },
          "id": 320,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "established - TCP sockets in established state",
              "range": true,
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "fin_wait2 - TCP sockets in fin_wait2 state",
              "range": true,
              "refId": "B",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "listen - TCP sockets in listen state",
              "range": true,
              "refId": "C",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "editorMode": "code",
              "expr":
                  "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "time_wait - TCP sockets in time_wait state",
              "range": true,
              "refId": "D",
              "step": 240
            }
          ],
          "title": "TCP Stat",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Network Netstat",
      "type": "row"
    },
    {
      "collapsed": true,
      "datasource": {
        "type": "prometheus",
        "uid": "000000001"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 33
      },
      "id": 279,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "seconds",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "normal"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "s"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 0,
            "y": 66
          },
          "id": 40,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{collector}} - Scrape duration",
              "refId": "A",
              "step": 240
            }
          ],
          "title": "Node Exporter Scrape Time",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "description": "",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "counter",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 20,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineStyle": {
                  "fill": "solid"
                },
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "links": [],
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green"
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              },
              "unit": "short"
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byRegexp",
                  "options": "/.*error.*/"
                },
                "properties": [
                  {
                    "id": "color",
                    "value": {
                      "fixedColor": "#F2495C",
                      "mode": "fixed"
                    }
                  },
                  {
                    "id": "custom.transform",
                    "value": "negative-Y"
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 10,
            "w": 12,
            "x": 12,
            "y": 66
          },
          "id": 157,
          "links": [],
          "options": {
            "legend": {
              "calcs": [
                "mean",
                "lastNotNull",
                "max",
                "min"
              ],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": {
              "mode": "multi",
              "sort": "none"
            }
          },
          "pluginVersion": "9.2.0",
          "targets": [
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{collector}} - Scrape success",
              "refId": "A",
              "step": 240
            },
            {
              "datasource": {
                "type": "prometheus",
                "uid": "${datasource}"
              },
              "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
              "legendFormat": "{{collector}} - Scrape textfile error (1 = true)",
              "refId": "B",
              "step": 240
            }
          ],
          "title": "Node Exporter Scrape",
          "type": "timeseries"
        }
      ],
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "000000001"
          },
          "refId": "A"
        }
      ],
      "title": "Node Exporter",
      "type": "row"
    }
  ],
  "refresh": "1m",
  "revision": 1,
  "schemaVersion": 38,
  "style": "dark",
  "tags": [
    "linux"
  ],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "default",
          "value": "default"
        },
        "hide": 0,
        "includeAll": false,
        "label": "Datasource",
        "multi": false,
        "name": "datasource",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      },
      {
        "current": {},
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "definition": "",
        "hide": 0,
        "includeAll": false,
        "label": "Job",
        "multi": false,
        "name": "job",
        "options": [],
        "query": {
          "query": "label_values(node_uname_info, job)",
          "refId": "Prometheus-job-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "current": {},
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "definition": "label_values(node_uname_info{job=\"$job\"}, instance)",
        "hide": 0,
        "includeAll": false,
        "label": "Host",
        "multi": false,
        "name": "node",
        "options": [],
        "query": {
          "query": "label_values(node_uname_info{job=\"$job\"}, instance)",
          "refId": "Prometheus-node-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "current": {
          "selected": false,
          "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+",
          "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+"
        },
        "hide": 2,
        "includeAll": false,
        "multi": false,
        "name": "diskdevices",
        "options": [
          {
            "selected": true,
            "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+",
            "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+"
          }
        ],
        "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+",
        "skipUrlSync": false,
        "type": "custom"
      }
    ]
  },
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "browser",
  "title": "Node Exporter Full",
  "uid": "rYdddlPWk",
  "version": 92,
  "weekStart": ""
}


================================================
FILE: tools/local/monitoring/grafana/provisioning/dashboards/redis.json
================================================
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "Redis Dashboard for Prometheus Redis Exporter 1.x",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": 763,
  "graphTooltip": 1,
  "id": 4,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 3,
        "x": 0,
        "y": 0
      },
      "id": 9,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "max(max_over_time(redis_uptime_in_seconds{instance=~\"$instance\"}[$__interval]))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "refId": "A",
          "step": 1800
        }
      ],
      "title": "Max Uptime",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 2,
        "x": 3,
        "y": 0
      },
      "hideTimeOverride": true,
      "id": 12,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum(redis_connected_clients{instance=~\"$instance\"})",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "refId": "A",
          "step": 2
        }
      ],
      "timeFrom": "1m",
      "title": "Clients",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "decimals": 0,
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(50, 172, 45, 0.97)",
                "value": null
              },
              {
                "color": "rgba(237, 129, 40, 0.89)",
                "value": 80
              },
              {
                "color": "rgba(245, 54, 54, 0.9)",
                "value": 95
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 3,
        "x": 5,
        "y": 0
      },
      "hideTimeOverride": true,
      "id": 11,
      "links": [],
      "maxDataPoints": 100,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto"
      },
      "pluginVersion": "10.1.10",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(100 * (redis_memory_used_bytes{instance=~\"$instance\"}  / redis_memory_max_bytes{instance=~\"$instance\"}))",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "",
          "metric": "",
          "refId": "A",
          "step": 2
        }
      ],
      "timeFrom": "1m",
      "title": "Memory Usage",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 80,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 8,
        "y": 0
      },
      "id": 18,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": false
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(rate(redis_commands_total{instance=~\"$instance\"} [$__rate_interval])) by (cmd)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ cmd }}",
          "metric": "redis_command_calls_total",
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Total Commands / sec",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 16,
        "y": 0
      },
      "id": 1,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": false
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "irate(redis_keyspace_hits_total{instance=~\"$instance\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "hits, {{ instance }}",
          "metric": "",
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "irate(redis_keyspace_misses_total{instance=~\"$instance\"}[$__rate_interval])",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "misses, {{ instance }}",
          "metric": "",
          "refId": "B",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Hits / Misses per Sec",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "max"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#BF1B00",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 7
      },
      "id": 7,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "redis_memory_used_bytes{instance=~\"$instance\"}",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "used, {{ instance }}",
          "metric": "",
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "redis_memory_max_bytes{instance=~\"$instance\"}",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 2,
          "legendFormat": "max, {{ instance }}",
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Total Memory Usage",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 7
      },
      "id": 10,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(rate(redis_net_input_bytes_total{instance=~\"$instance\"}[$__rate_interval]))",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{ input }}",
          "refId": "A",
          "step": 240
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(rate(redis_net_output_bytes_total{instance=~\"$instance\"}[$__rate_interval]))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ output }}",
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Network I/O",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 70,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byValue",
              "options": {
                "op": "gte",
                "reducer": "allIsZero",
                "value": 0
              }
            },
            "properties": [
              {
                "id": "custom.hideFrom",
                "value": {
                  "legend": true,
                  "tooltip": true,
                  "viz": false
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 14
      },
      "id": 5,
      "links": [],
      "options": {
        "legend": {
          "calcs": [
            "lastNotNull"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (redis_db_keys{instance=~\"$instance\"}) by (db, instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ db }}, {{ instance }}",
          "refId": "A",
          "step": 240,
          "target": ""
        }
      ],
      "title": "Total Items per DB",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 70,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 14
      },
      "id": 13,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum (redis_db_keys{instance=~\"$instance\"}) by (instance) - sum (redis_db_keys_expiring{instance=~\"$instance\"}) by (instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "not expiring, {{ instance }}",
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum (redis_db_keys_expiring{instance=~\"$instance\"}) by (instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "expiring, {{ instance }}",
          "metric": "",
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Expiring vs Not-Expiring Keys",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "evicts"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options":
                  "memcached_items_evicted_total{instance=\"172.17.0.1:9150\",job=\"prometheus\"}"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#890F02",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "reclaims"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#3F6833",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 21
      },
      "id": 8,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(rate(redis_expired_keys_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
          "format": "time_series",
          "hide": false,
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "expired, {{ instance }}",
          "metric": "",
          "refId": "A",
          "step": 240,
          "target": ""
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(rate(redis_evicted_keys_total{instance=~\"$instance\"}[$__rate_interval])) by (instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "evicted, {{ instance }}",
          "refId": "B",
          "step": 240
        }
      ],
      "title": "Expired/Evicted Keys",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 21
      },
      "id": 16,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum(redis_connected_clients{instance=~\"$instance\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "connected",
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr": "sum(redis_blocked_clients{instance=~\"$instance\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "blocked",
          "refId": "B"
        }
      ],
      "title": "Connected/Blocked Clients",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s",
          "unitScale": true
        },
        "overrides": [
          {
            "matcher": {
              "id": "byValue",
              "options": {
                "op": "gte",
                "reducer": "allIsZero",
                "value": 0
              }
            },
            "properties": [
              {
                "id": "custom.hideFrom",
                "value": {
                  "legend": true,
                  "tooltip": true,
                  "viz": false
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 28
      },
      "id": 20,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(irate(redis_commands_duration_seconds_total{instance =~ \"$instance\"}[$__rate_interval])) by (cmd)\n  /\nsum(irate(redis_commands_total{instance =~ \"$instance\"}[$__rate_interval])) by (cmd)\n",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ cmd }}",
          "metric": "redis_command_calls_total",
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Average Time Spent by Command / sec",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "PBFA97CFB590B2093"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 80,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "links": [],
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s",
          "unitScale": true
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 28
      },
      "id": 14,
      "links": [],
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "10.3.3",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
          "expr":
              "sum(irate(redis_commands_duration_seconds_total{instance=~\"$instance\"}[$__rate_interval])) by (cmd) != 0",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ cmd }}",
          "metric": "redis_command_calls_total",
          "refId": "A",
          "step": 240
        }
      ],
      "title": "Total Time Spent by Command / sec",
      "type": "timeseries"
    }
  ],
  "refresh": "",
  "schemaVersion": 38,
  "style": "dark",
  "tags": [
    "prometheus",
    "redis"
  ],
  "templating": {
    "list": [
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "type": "prometheus",
          "uid": "PBFA97CFB590B2093"
        },
        "definition": "label_values(redis_up, namespace)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "namespace",
        "options": [],
        "query": "label_values(redis_up, namespace)",
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "current": {
          "selected": true,
          "text": [
            "redis-exporter:9121"
          ],
          "value": [
            "redis-exporter:9121"
          ]
        },
        "datasource": {
          "type": "prometheus",
          "uid": "PBFA97CFB590B2093"
        },
        "definition": "label_values(redis_up{namespace=~\"$namespace\"}, instance)",
        "hide": 0,
        "includeAll": false,
        "multi": true,
        "name": "instance",
        "options": [],
        "query": "label_values(redis_up{namespace=~\"$namespace\"}, instance)",
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "browser",
  "title": "Redis Dashboard for Prometheus",
  "uid": "e008bc3f-81a2-40f9-baf2-a33fd8dec7ec",
  "version": 3,
  "weekStart": ""
}


================================================
FILE: tools/local/monitoring/grafana/provisioning/datasources/datasource.yml
================================================
# config file version
apiVersion: 1

# list of datasources that should be deleted from the database
deleteDatasources:
  - name: Prometheus
    orgId: 1

# list of datasources to insert/update depending
# whats available in the database
datasources:
  # <string, required> name of the datasource. Required
- name: Prometheus
  # <string, required> datasource type. Required
  type: prometheus
  # <string, required> access mode. direct or proxy. Required
  access: proxy
  # <int> org id. will default to orgId 1 if not specified
  orgId: 1
  # <string> url
  url: http://prometheus:9090
  # <string> database password, if used
  password:
  # <string> database user, if used
  user:
  # <string> database name, if used
  database:
  # <bool> enable/disable basic auth
  basicAuth: false
  # <string> basic auth username, if used
  basicAuthUser:
  # <string> basic auth password, if used
  basicAuthPassword:
  # <bool> enable/disable with credentials headers
  withCredentials:
  # <bool> mark as default datasource. Max one per org
  isDefault: true
  # <map> fields that will be converted to json and stored in json_data
  jsonData:
     graphiteVersion: "1.1"
     tlsAuth: false
     tlsAuthWithCACert: false
     timeInterval: 1s  # Based on https://stackoverflow.com/a/66830690
  # <string> json object of data that will be encrypted.
  secureJsonData:
    tlsCACert: "..."
    tlsClientCert: "..."
    tlsClientKey: "..."
  version: 1
  # <bool> allow users to edit datasources from the UI.
  editable: true


================================================
FILE: tools/local/monitoring/prometheus/prometheus.yml
================================================
# my global config
global:
  scrape_interval:     5s
  evaluation_interval: 5s

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
      monitor: 'my-project'

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
# rule_files:
  # - 'alert.rules'
  # - "first.rules"
  # - "second.rules"

# alert
# alerting:
#   alertmanagers:
#   - scheme: http
#     static_configs:
#     - targets:
#       - "alertmanager:9093"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.

  - job_name: dragonfly
    scrape_interval: 1s
    static_configs:
      - targets: ['host.docker.internal:6379']

  - job_name: 'prometheus'
    scrape_interval: 1s
    static_configs:
         - targets: ['localhost:9090']


  - job_name: 'node-exporter'
    scrape_interval: 1s
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          instance: node

  - job_name: 'memcached-exporter'
    scrape_interval: 1s
    static_configs:
      - targets: ['memcached-exporter:9150']

  - job_name: 'redis-exporter'
    scrape_interval: 1s
    static_configs:
      - targets: ['redis-exporter:9121']


================================================
FILE: tools/packaging/Dockerfile.alpine-dev
================================================
# syntax=docker/dockerfile:1
FROM gcr.io/cadvisor/cadvisor:v0.46.0 AS libpfm_donor

FROM alpine:3 AS builder

# "openssl-libs-static" fixes "Could NOT find OpenSSL, try to set the path to OpenSSL root folder in the"
RUN apk add autoconf-archive automake bash bison boost-dev cmake coreutils net-tools \
        curl ccache git gcc gdb g++ libunwind-dev libtool make ninja \
        openssl-dev openssl-libs-static patch zip zstd-static

# This is required to make static linking work
RUN ls -1 /usr/lib/libboost_*.so | while read -r _file; do ln -sfv ${_file} ${_file//.so/.a}; done

# Borrow libpfm from cadvisor, so we don't have to build it ourselves
# https://github.com/google/cadvisor/blob/master/deploy/Dockerfile
COPY --from=libpfm_donor /usr/local/lib/libpfm.so* /usr/local/lib/

WORKDIR /build

COPY ./Makefile ./CMakeLists.txt ./
COPY src ./src

COPY .git ./.git
COPY patches ./patches
COPY helio ./helio

RUN make release

RUN build-release/dragonfly --version

FROM alpine:3

COPY tools/docker/entrypoint.sh /usr/local/bin/entrypoint.sh
COPY tools/docker/healthcheck.sh /usr/local/bin/healthcheck.sh
COPY --from=builder /build/build-release/dragonfly /usr/local/bin/

RUN apk --no-cache add libgcc libstdc++  \
     setpriv netcat-openbsd boost-context tini && ldd /usr/local/bin/dragonfly

RUN addgroup -S -g 1000 dfly && adduser -S -G dfly -u 999 dfly
RUN mkdir /data && chown dfly:dfly /data

VOLUME /data
WORKDIR /data

HEALTHCHECK CMD /usr/local/bin/healthcheck.sh

# Use tini as PID 1 to properly reap zombie processes (issue #5844)
ENTRYPOINT ["/sbin/tini", "--", "entrypoint.sh"]

EXPOSE 6379

CMD ["dragonfly", "--logtostderr"]


================================================
FILE: tools/packaging/Dockerfile.ubuntu-dev
================================================
# syntax=docker/dockerfile:1
FROM ghcr.io/romange/ubuntu-dev:20-gcc14 AS builder

WORKDIR /build

COPY ./Makefile ./CMakeLists.txt ./
COPY src ./src

COPY .git ./.git
COPY patches ./patches
COPY helio ./helio

RUN make release

RUN build-release/dragonfly --version

FROM ubuntu:22.04

RUN --mount=type=tmpfs,target=/var/cache/apt \
    --mount=type=tmpfs,target=/var/lib/apt/lists \
    apt update && \
    apt install -q -y --no-install-recommends netcat-openbsd ca-certificates redis-tools net-tools tini

RUN groupadd -r -g 999 dfly && useradd -r -g dfly -u 999 dfly
RUN mkdir /data && chown dfly:dfly /data

VOLUME /data
WORKDIR /data

COPY tools/docker/entrypoint.sh /usr/local/bin/entrypoint.sh
COPY tools/docker/healthcheck.sh /usr/local/bin/healthcheck.sh
COPY --from=builder /build/build-release/dragonfly /usr/local/bin/

HEALTHCHECK CMD /usr/local/bin/healthcheck.sh

# Use tini as PID 1 to properly reap zombie processes (issue #5844)
ENTRYPOINT ["/usr/bin/tini", "--", "entrypoint.sh"]

# For inter-container communication.
EXPOSE 6379

CMD ["dragonfly", "--logtostderr"]


================================================
FILE: tools/packaging/Dockerfile.ubuntu-prod
================================================
# syntax=docker/dockerfile:1
FROM ghcr.io/romange/ubuntu-dev:20-gcc14 AS builder

ARG TARGETPLATFORM

WORKDIR /build
COPY tools/docker/fetch_release.sh /tmp/
COPY releases/dragonfly-* /tmp/

RUN /tmp/fetch_release.sh ${TARGETPLATFORM}

# Now prod image
FROM ubuntu:22.04

# ARG in fact change the env vars during the build process
# ENV persist the env vars for the built image as well.
ARG QEMU_CPU
ARG DEBIAN_FRONTEND=noninteractive

RUN --mount=type=tmpfs,target=/var/cache/apt \
    --mount=type=tmpfs,target=/var/lib/apt/lists \
    apt -q update && \
    apt install -q -y --no-install-recommends netcat-openbsd ca-certificates redis-tools net-tools tini

RUN groupadd -r -g 999 dfly && useradd -r -g dfly -u 999 dfly
RUN mkdir /data && chown dfly:dfly /data

VOLUME /data
WORKDIR /data

COPY tools/docker/entrypoint.sh /usr/local/bin/entrypoint.sh
COPY tools/docker/healthcheck.sh /usr/local/bin/healthcheck.sh
COPY --from=builder /build/dragonfly /usr/local/bin/

HEALTHCHECK CMD /usr/local/bin/healthcheck.sh

# Use tini as PID 1 to properly reap zombie processes (issue #5844)
ENTRYPOINT ["/usr/bin/tini", "--", "entrypoint.sh"]

# For inter-container communication.
EXPOSE 6379

CMD ["dragonfly", "--logtostderr"]


================================================
FILE: tools/packaging/README.md
================================================
# Installation Packages

## Overview
This directory includes a set of files and scripts to build installation package for various Linux distributions.

## Debian
The file to build the Debian package all located under "debian" directory.
The resulting package will install the binary of Dragonfly as well as generate a new service entry for dragonfly,
that can be controlled with "systemctl" command, to start, stop and check status of.
### Building
To build the package, you have a script called "generate_debian_package.sh". This script accepts the following parameters:
* Optional binary path - the location from which to take the binary for the installation. The default for this is "repo path/build-opt".
The location to which the resulting package is writing is at the location from which the script is executed.
This script is depends on the following packages:
* git
* moreutils
* debhelper
* dpkg-dev

To build:
```
/path/to/dragonfly/tools/packaging/generate_debian_package.sh [/path/to/dragonfly-binary-file]
```

This can only be run on Debian based hosts.
You can use the flowing docker file to generate this package:
```
FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update -y && apt-get install -y gcc dpkg-dev gpg vim wget git moreutils debhelper
```
Build the above docker and then run it with your dragonfly source code path mount as volume for the build:
```
docker build -t ubuntu-package .
docker run --rm -ti -v /path/to/dragonfly-repo:/mydocker-path ubuntu-package bash
```
Again note that you need to be at "main" branch to successfully build this package.
Note: If at the end of the installation you see a message "/usr/bin/deb-systemd-helper: error: systemctl preset failed on dragonfly.service: No such file or directory",
you can ignore it, this seem to be related to [the following issue](https://groups.google.com/g/linux.debian.bugs.dist/c/m6xGZ82TdvM).


================================================
FILE: tools/packaging/debian/compat
================================================
11


================================================
FILE: tools/packaging/debian/control
================================================
Source: dragonfly
Maintainer: DragonflyDB authors  <dragonfly@dragonflydb.io>
Standards-Version:  4.2.1
Priority: optional
Section: database
Vcs-Git: https://github.com/dragonflydb/dragonfly

Package: dragonfly
Architecture: amd64 arm64
Depends: libc6, openssl, adduser, zstd
Homepage: https://dragonflydb.io
Description: A fast in-memory store that is fully compatible with Redis™* and Memcached.


================================================
FILE: tools/packaging/debian/dragonfly.conf
================================================
--pidfile=/var/run/dragonfly/dragonfly.pid
--log_dir=/var/log/dragonfly
--dir=/var/lib/dragonfly
--max_log_size=20
--version_check=true


================================================
FILE: tools/packaging/debian/dragonfly.install
================================================
debian/dragonfly.service /lib/systemd/system
debian/dragonfly.conf	/etc/dragonfly
debian/bin/dragonfly /usr/bin


================================================
FILE: tools/packaging/debian/dragonfly.logrotate
================================================
# installed by debhelper by convention into /etc/logrotate.d/

/var/log/dragonfly/dragonfly*.log {
        daily
        missingok

        compress
        compresscmd zstd
        uncompresscmd unzstd
        compressext .zst
        notifempty

# do not create an empty file after the rotation.
        nocreate
        prerotate
                if lsof -t $1 > /dev/null; then
                # file is open. Skipping rotation."
                exit 0
                fi
        endscript

# Possible hook to upload rotated logs to cloud storage.
        postrotate
                echo "TBD: POSTROTATE"
        endscript
}


================================================
FILE: tools/packaging/debian/dragonfly.postinst
================================================
#!/bin/sh

# Script to run at the end of the installation
set -eu

USER="dfly"
DIR_NAME="dragonfly"
GROUP="$USER"
CONFFILE="/etc/${DIR_NAME}/${DIR_NAME}.conf"

if [ "$1" = "configure" ]
then
	if ! dpkg-statoverride --list ${CONFFILE} >/dev/null 2>&1
	then
		dpkg-statoverride --update --add ${USER} ${GROUP} 640 ${CONFFILE}
	fi
fi

#DEBHELPER#

if [ "$1" = "configure" ]
then
	find /etc/${DIR_NAME} -maxdepth 1 -type d -name '${DIR_NAME}.*.d' -empty -delete
fi

exit 0


================================================
FILE: tools/packaging/debian/dragonfly.postrm
================================================
#!/bin/sh
# Script to run at the end of remove
set -eu
DIR_NAME="dragonfly"
USER_NAME="dfly"
CONFFILE="/etc/${DIR_NAME}/${DIR_NAME}.conf"

# When purging the package, remove all trances
if [ "${1}" = "purge" ]
then
	userdel ${USER_NAME} || true
	rm -rf /var/lib/${DIR_NAME} /var/log/${DIR_NAME} /etc/${DIR_NAME} /var/run/${DIR_NAME}
	dpkg-statoverride --remove ${CONFFILE} || test $? -eq 2
fi

#DEBHELPER#

exit 0


================================================
FILE: tools/packaging/debian/dragonfly.preinst
================================================
#!/bin/sh

set -eu
# Script to run before the installation starts.
# We are creating a user "dragonfly", and the directories that
# would be used by the application

USER="dfly"
DIR_NAME="dragonfly"

setup_dir () {
	DIR="${1}"
	MODE="${2}"
	GROUP="${3}"

	mkdir -p ${DIR} || {
		echo "failed to create dir ${DIR}"
		return 1
	}

	if ! dpkg-statoverride --list ${DIR} >/dev/null 2>&1
	then
		echo "changing owner for ${DIR} to user ${USER}"
		chown ${USER}:${GROUP} ${DIR}
		chmod ${MODE} ${DIR}
	fi
}

if [ "$1" = "install" ]; then
	if ! id ${USER} >/dev/null 2>&1 ; then
		echo "trying to create user ${USER}"
		adduser \
			--system \
			--home /var/lib/${DIR_NAME} \
			--quiet \
			--group \
			${USER} || {
			echo "failed to add user ${USER}"
			exit 1
		}

		setup_dir /var/log/${DIR_NAME} 2755 adm
		setup_dir /var/lib/${DIR_NAME} 755 ${USER}
		setup_dir /var/run/${DIR_NAME} 755 ${USER}
		setup_dir /etc/${DIR_NAME} 2775 ${USER}
	fi
fi
#DEBHELPER#

exit 0


================================================
FILE: tools/packaging/debian/dragonfly.service
================================================
[Unit]
Description=Modern and fast key-value store
After=network.target
Documentation=

[Service]
Type=simple
EnvironmentFile=-/etc/dragonfly/environment
ExecStart=/usr/bin/dragonfly --flagfile=/etc/dragonfly/dragonfly.conf
PIDFile=/var/run/dragonfly/dragonfly.pid
TimeoutStopSec=infinity
Restart=always
User=dfly
Group=dfly
RuntimeDirectory=dragonfly
RuntimeDirectoryMode=2755

UMask=007
PrivateTmp=yes
LimitNOFILE=262144
PrivateDevices=yes
ProtectHome=yes
ProtectSystem=full

ReadWritePaths=-/var/lib/dragonfly
ReadWritePaths=-/var/log/dragonfly
ReadWritePaths=-/var/run/dragonfly

NoNewPrivileges=true
CapabilityBoundingSet=CAP_SETGID CAP_SETUID CAP_SYS_RESOURCE
MemoryDenyWriteExecute=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictNamespaces=true
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX


[Install]
WantedBy=multi-user.target
Alias=dragonfly.service


================================================
FILE: tools/packaging/debian/rules
================================================
#!/usr/bin/make -f

export DEB_BUILD_OPTIONS="noopt nostrip nocheck"

%:
	dh $@


override_dh_auto_build:
	@echo "no build is done here"

override_dh_installchangelogs:
	@echo "no change long installation"

override_dh_auto_test:
	@echo "no testing"

override_dh_auto_clean:
	dh_auto_clean


================================================
FILE: tools/packaging/generate_changelog.sh
================================================
#!/usr/bin/env bash

# This would generate a change log required for build Debian installation package.
# Don't run this script on your local machine, run this inside docker
# you would need to install git client as well as moreutils
# apt install -y git moreutils
# note: This script should run on branch "main".

set -eu

if [ $# -ne 1 ]; then
	echo "usage: <git repo path> <target path>"
	exit 1
fi
SCRIPT_ABS_PATH=$(realpath $0)

THIS_DIR=$(dirname ${SCRIPT_ABS_PATH})
GIT_DIR=$1
PACKGE_DIR=${THIS_DIR}/debian
CHANGE_LOG=${PACKGE_DIR}/changelog
cd ${GIT_DIR}
git config --global --add safe.directory ${GIT_DIR}
has_tags=$(git tag -l v* | wc -l 2>/dev/null)
if [ "$has_tags" = "" -o "$has_tags" = "0" ]; then
	git fetch --all --tags || {
		echo "failed to fetch tags, cannot build changelog file"
		exit 1
	}
fi

>${CHANGE_LOG}
prevtag=v0.2.0
pkgname=`cat ${PACKGE_DIR}/control | grep '^Package: ' | sed 's/^Package: //'`
git tag -l v* | sort -V | while read tag; do
    (echo "$pkgname (${tag#v}) unstable; urgency=low"; git log --pretty=format:'  * %s' $prevtag..$tag; git log --pretty='format:%n%n -- %aN <%aE>  %aD%n%n' $tag^..$tag) | cat - ${CHANGE_LOG} | sponge ${CHANGE_LOG}
        prevtag=$tag
done
if [ -f ${CHANGE_LOG} ]; then
	haslnes=$(wc -l ${CHANGE_LOG} 2>/dev/null | awk '{print $1}')
	if [ "$haslnes" = "" ]; then
		echo "empty file ${CHANGE_LOG}, failed to generate changelog"
		exit 1
	fi
else
	echo "failed to generate ${CHANGE_LOG}"
	exit 1
fi


================================================
FILE: tools/packaging/generate_debian_package.sh
================================================
#!/usr/bin/env bash

# Generate a debian package from a pre-build dragonfly bianry and set of files as well as generating change log from git history.
# The result is debian install package file (.deb file).
# This script accept 2 parameters:
#	1. Optioanl path to the location at which the binary file is located.
# this depends on
# * git
# * moreutils
# * debhelper
# e.g. apt update -y && apt install -y git moreutils debhelper
# Please note that is must run from main branch.
# Best running this from inside a container.
# The result are writing to the location from which you would execute the script (not where the script is located).
# Version number is the tag number.
# Params:
#	* optional location to the binary to place at the package


set -eu


if [ $# -ge 1 ]; then
    VERSION_FILE=$1
    if ! [ -f ${VERSION_FILE} ]; then
        echo "binary file ${VERSION_FILE} does not exist"
        exit 1
    fi

else
    echo "no binary file provided"
    exit 1
fi

SCRIPT_ABS_PATH=$(realpath $0)
SCRIPT_PATH=$(dirname ${SCRIPT_ABS_PATH})
PACKAGES_PATH=${SCRIPT_PATH}/debian
CHANGELOG_SCRIPT=generate_changelog.sh
ROOT_ABS_PATH=$(realpath $SCRIPT_PATH/../..)
TEMP_WORK_DIR=$(mktemp -d)
BASE_DIR=${TEMP_WORK_DIR}/packages
BASE_PATH=${BASE_DIR}/dragonfly
BINARY_TARGET_DIR=${BASE_PATH}/debian/bin

function cleanup {
    echo $@
    rm -rf ${TEMP_WORK_DIR}
    exit 1
}

mkdir -p ${BASE_PATH} || cleanup "failed to create working directory for building the package"

cp -r ${PACKAGES_PATH} ${BASE_PATH} || cleanup "failed to copy required files for the package build from ${PACKAGES_PATH}"

cp ${SCRIPT_PATH}/${CHANGELOG_SCRIPT} ${BASE_PATH} || cleanup "failed to copy changelog script to ${BASE_PATH}"

mkdir -p ${BINARY_TARGET_DIR} || cleanup "failed to create install directory for building the package"

cp ${VERSION_FILE} ${BINARY_TARGET_DIR}/dragonfly || cleanup "failed to copy binary to target dir"

${BASE_PATH}/${CHANGELOG_SCRIPT} ${ROOT_ABS_PATH} || cleanup "failed to generate changelog for package"

MY_DIR=${PWD}
cd ${BASE_PATH}
dpkg-buildpackage --build=binary || cleanup "failed to generate the package"

TEMP_RESULT_FILE=$(ls ../*.deb)
if [ "$TEMP_RESULT_FILE" = "" ]; then
    cleanup "failed to find debian file"
fi

for fl in ${TEMP_RESULT_FILE}; do
    destfile=$(basename ${fl} | sed 's/_\([0-9.]*_\)/_/')
    mv ${fl} ${MY_DIR}/${destfile}
done

cd ${MY_DIR}
RESULT_FILE=$(ls *.deb 2>/dev/null)
echo "successfully built the install package at ${MY_DIR}/${RESULT_FILE}"
rm -rf ${TEMP_WORK_DIR}


================================================
FILE: tools/packaging/osrepos/README.md
================================================
# Package repositories for rpm and debian packages

This directory contains scripts and definitions for setting up YUM and apt repositories for Linux users to install
dragonfly packages.

The repositories are served as static websites. The generate-site workflow is used to set up and deploy the sites using
scripts and definitions included here.

The workflow does the following tasks:

* Download the latest 5 releases from dragonfly releases page, specifically deb and rpm assets
    * for deb files, only the latest package is downloaded and present (see note below)
* Set up a directory structure separating deb and rpm files into version specific paths
* Sign the packages (see note on GPG)
* Deploy the assets prepared, along with the public GPG key and repo definitions for apt and rpm tooling

## Using the YUM repository

Add the repository using:

```shell
sudo dnf config-manager addrepo --from-repofile=https://packages.dragonflydb.io/dragonfly.repo
```

Then install dragonfly as usual, or a specific version:

```shell
sudo dnf -y install dragonfly-0:v1.33.1-1.fc30.x86_64
```

## Using the APT repository

First download the public GPG key to an appropriate location:

```shell
sudo curl -Lo /usr/share/keyrings/dragonfly-keyring.public https://packages.dragonflydb.io/pgp-key.public
```

Then add the sources file:

```shell
sudo curl -Lo /etc/apt/sources.list.d/dragonfly.sources https://packages.dragonflydb.io/dragonfly.sources
```

Finally install dragonfly using apt

```shell
sudo apt update && sudo apt install dragonfly
```

#### Versions in APT repository

Unlike the yum repo, the apt repo only has the latest version. The reason for this is the tool, `reprepro` supplied by
debian to build repositories only supports multiple
versions in version 5.4 onwards, and the github runner using ubuntu-latest does not have this version.

Another option would be to use the components feature of apt repositories in the sources file we ask users to install,
but then the versions would need
to be hardcoded in the sources file and the user would have
to update the file with each new release which makes for a bad user experience. As of now users wanting older packages
should download them directly.

### Signing packages

The packages are signed using the GPG key imported from the secret GPG_PRIVATE_KEY in this repository.

The corresponding public key is served with site assets, so the apt/yum/dnf based tooling can consume the public key to
verify package integrity.

### TODO

- [X] debian packages signing (not required? release file is signed)
- [X] debian repo metadata setup
- [ ] tests asserting that packages are installable?


================================================
FILE: tools/packaging/osrepos/dragonfly.repo
================================================
[dragonfly]
name=Dragonfly Packages
baseurl=https://packages.dragonflydb.io/rpm/
enabled=1
gpgcheck=1
gpgkey=https://packages.dragonflydb.io/pgp-key.public


================================================
FILE: tools/packaging/osrepos/dragonfly.sources
================================================
Types: deb
URIs: https://packages.dragonflydb.io/deb
Suites: noble
Components: main
Signed-By: /usr/share/keyrings/dragonfly-keyring.public


================================================
FILE: tools/packaging/osrepos/pgp-key.public
================================================
-----BEGIN PGP PUBLIC KEY BLOCK-----

mQINBGjkpygBEADuvzXdOXChr/e4Uh2UBne60NPjmuhpjmArfMfqySeRezJ1Nuvd
AvKNuYRyCw+zsh0Zc/sSANpIdAeKPqrfZJgfEIJI0f8WVjfqsCKi+yWB7Bx0GjQ9
y/xoFLKkT7p0P/F4yRlb8kQq2KVP9UvcZBETJY96TpQIJM4N3XoG+8DsELW5HYF2
6sbhgmaNUsxm9oH5UqHcBc7TTgUp10GmZFR4dTeB1IffD/eLMVDMQ8ygzmVxkJPQ
zEKfpFFzseTVyreQlZ5U4GDR8FiB0mY4gZxbCywNqZRycyMM7v4EHuUO0fOgRHdl
5dseF+H1aEG/00JRo6zjiIbgMga0x9wYmVWvTU4wLnGoomukEMCkEQxlil1QjUlK
XI0EltU03DuGki5uhYc9dSS1h74ku2xWePaMsvmxrTphRo1WQBDutzVXSIZ6NBc3
BN+VBHcumVvif9aRrsfsj2CXhnOB61AW+VWk3fk0evW9cceXZDA0NgGdyeTfS7EI
pioaWtmE3Uv3AfHTlNbMytxG7d7k7oAT2xV6z2IygyQZ5LI1tvSJJ+I5kZHKeruj
k2bFp6H9FGi+g4kA+z9QWgkt+0UXYbjKZAs5Es1uGrRk6o1rAyVTKBKz62F0YQbK
j8Q49Z6iSobaKeQG8naCVkALSM49i4Zpw3x1jUpd7k8/KhpJObq3rewqIQARAQAB
tCREcmFnb25mbHkgPHBhY2thZ2luZ0BkcmFnb25mbHlkYi5pbz6JAlIEEwEKADwW
IQRgvYPC7oTdikxvMGcSMEAYvD0qugUCaOSnKAMbLwQFCwkIBwICIgIGFQoJCAsC
BBYCAwECHgcCF4AACgkQEjBAGLw9KrpGbw//VH2zUjaoSh7SnKGdDOA7A95o2EET
ZvChxImyb6xNKfUoMajPnKcJFg514aPFKLuJl4qJmikxdqBF/bYkznCQSJcLQhsT
pvkqanUh/XwBqbJye1QjBq1o0qXLgeY/Ciz2nqupwLQdzvGHO6+2Yk04T89pnZEo
CDSoZKkacu8TpalStqzqDlumryXZzdZ35hAu9OT0fVc2wtcMiY3pznLG1iawNk8I
bzme0ezGA/fk7xEptEbGlb1OtUV5+iG/SFEVvic8GTNf1yLQNCVK3QzD1ciL3MzR
OTH8a04ov2bMxjl8bIefKE/dFBeCSKbvkfTSMAEgqUAuRp7gvoO7uHO05A5AHU2i
y4agskGkgQR9u1yqUXyYIM9kkpuUqqAkwRqg1pw55LG686Xe35QYH4zbpgvr45/Q
JRPFjCbLzR1ZcNyrecHgrq2M9WNlk6dtdWBSJuc7L0M8KJqfrPxQmMpMm/KR43Ey
um0FCgb2J+ceO2W4GrE/DHHoNTt2iio2gMcmRXM7XTmVupsigbYk7AqGncLIQ60B
94jtv16ggXIeA5sPqmyssARXtweTM+EzLLs4K79be4K5j/yyg3CxxvZcq5CZNwoi
fbQgGVNb4SS+nv2r1mVe9XNSonmVVrAqSIFpptH5ahqgaRDUnmy0Lzk7qiHv02OW
PjbSiwQGHDHwq98=
=SOT5
-----END PGP PUBLIC KEY BLOCK-----


================================================
FILE: tools/packaging/osrepos/reprepro-config/distributions
================================================
Codename: noble
Suite: stable
Architectures: amd64 arm64
Components: main
Origin: Dragonfly
Label: Dragonfly
Description: Dragonfly APT repository
SignWith: 60BD83C2EE84DD8A4C6F306712304018BC3D2ABA


================================================
FILE: tools/packaging/osrepos/reprepro-config/options
================================================
verbose


================================================
FILE: tools/packaging/osrepos/requirements.txt
================================================
certifi>=2025.10.5
charset-normalizer>=3.4.3
idna>=3.10
requests>=2.32.5
urllib3>=2.5.0


================================================
FILE: tools/packaging/osrepos/scripts/fetch-releases.py
================================================
import dataclasses
import enum
import os.path
import time

import requests

"""
Fetches the latest five releases for RPM and the single latest release for DEB.
RPM files are placed in the destination folder where the DNF repo will expect them.
DEB files are placed in a temporary location from where they will be copied by the
reprepro tool.
"""

RELEASE_URL = "https://api.github.com/repos/dragonflydb/dragonfly/releases"


class AssetKind(enum.Enum):
    RPM = 1
    DEB = 2


@dataclasses.dataclass
class Package:
    kind: AssetKind
    download_url: str
    version: str
    filename: str
    arch: str

    @staticmethod
    def from_url(url: str) -> "Package":
        tokens = url.split("/")
        filename = tokens[-1]
        kind = AssetKind.RPM if filename.endswith(".rpm") else AssetKind.DEB
        if kind == AssetKind.DEB:
            arch = filename.split(".")[0].split("_")[1]
        else:
            arch = filename.split(".")[1]
        return Package(
            kind=kind, download_url=url, version=tokens[-2], filename=filename, arch=arch
        )

    def storage_path(self, root: str) -> str:
        match self.kind:
            case AssetKind.RPM:
                return os.path.join(root, "rpm", self.version)
            case AssetKind.DEB:
                # Debian packages are stored in a temporary path.
                # The reprepro tool will copy them later to the final path.
                return os.path.join("deb_tmp", self.arch, self.version)


def collect_download_urls() -> list[Package]:
    packages = []
    # TODO retry logic
    response = requests.get(RELEASE_URL)
    releases = response.json()
    for release in releases[:5]:
        for asset in release["assets"]:
            if asset["name"].endswith(".rpm") or asset["name"].endswith(".deb"):
                packages.append(Package.from_url(asset["browser_download_url"]))
    return packages


def download_packages(root: str, packages: list[Package]):
    # The debian repository building tool, reprepo, only supports a single package per version by default.
    # The ability to support multiple versions has been added but is not present in ubuntu-latest on
    # github action runners yet. So we only download one package per architecture, the latest, for ubuntu.
    # The rest of the scripts work on a set of packages, so that when the Limit parameter is supported,
    # we can remove this flag and start hosting more than the latest versions.
    # Another alternative would be to use the components feature of reprepo, but it would involve updating
    # the repository definition itself for each release, which is a bad experience for end users.
    deb_done = 0
    for package in packages:
        # Download the latest arm and amd64 package for .deb format
        if package.kind == AssetKind.DEB and deb_done == 2:
            continue

        print(f"Downloading {package.download_url}")
        path = package.storage_path(root)
        if not os.path.exists(path):
            os.makedirs(path)

        target = os.path.join(path, package.filename)
        # TODO retry logic
        response = requests.get(package.download_url)
        with open(target, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {package.download_url}")
        time.sleep(0.5)
        if package.kind == AssetKind.DEB:
            deb_done += 1


def main(root: str):
    packages = collect_download_urls()
    download_packages(root, packages)


if __name__ == "__main__":
    import sys

    if len(sys.argv) == 1:
        print(f"Usage: {sys.argv[0]} <site folder>")
        sys.exit(1)
    main(sys.argv[1])


================================================
FILE: tools/packaging/osrepos/scripts/generate-apt-repo.sh
================================================
set -e

METADATA_ROOT=_site/deb
mkdir -pv ${METADATA_ROOT}/conf

cp -av reprepro-config/* ${METADATA_ROOT}/conf

reprepro -b ${METADATA_ROOT} createsymlinks
reprepro -b ${METADATA_ROOT} export

for file in $(find deb_tmp -type f -name "*.deb"); do
  reprepro -b ${METADATA_ROOT} includedeb noble "${file}"
done

# reprepro copied files to the destination, the temporary files can be removed now
rm -rf deb_tmp


================================================
FILE: tools/packaging/osrepos/scripts/generate-index.py
================================================
import os.path

"""
Generates index.html files recursively in all directories. Note that this is strictly an optional step.
Both YUM and APT repositories work fine without index listing, but this is useful for debugging a broken
site.
"""

HEADER = """<!DOCTYPE html>
<html>
<body>
"""

FOOTER = """</body>
</html>
"""


def build_index(dirpath):
    print(f"building index.html for {dirpath}")
    target = os.path.join(dirpath, "index.html")
    with open(target, "w") as f:
        f.write(HEADER.format(dir=dirpath))
        for item in sorted(os.listdir(dirpath)):
            if item == "index.html":
                continue
            name = item + "/" if os.path.isdir(os.path.join(dirpath, item)) else item
            f.write(f"""<a href="{item}">{name}</a><br>\n""")
        f.write(FOOTER)


def recurse_dir(root):
    for root, dirs, _ in os.walk(root):
        build_index(root)


if __name__ == "__main__":
    import sys

    if len(sys.argv) == 1:
        print(f"Usage: {sys.argv[0]} <site folder>")
        sys.exit(1)

    recurse_dir(sys.argv[1])


================================================
FILE: tools/packaging/osrepos/scripts/sign-rpms.sh
================================================
set -e

# GPG key must have been imported

echo "Signing RPMs with key id ${1}"

# The script fails in CI without an empty GPG_TTY
GPG_TTY=""
export GPG_TTY

for file in $(find _site/rpm -type f -name "*.rpm"); do
  echo "Signing ${file}"
  rpm --define "__gpg /usr/bin/gpg" --define "%_signature gpg" --define "%_gpg_name ${1}" --addsign "${file}"
done


================================================
FILE: tools/packaging/rpm/build_rpm.sh
================================================
#!/bin/bash

set -e

# Get the full path of the binary
ARCHIVE=$(realpath "$1")
VERSION="$2"
echo "Preparing $ARCHIVE"

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

# Setup RPM build environment in a unique subdirectory under /tmp
RPM_ROOT=$(mktemp -d /tmp/rpmbuild_XXXXXX)
echo "Working dir is $RPM_ROOT"
mkdir -p $RPM_ROOT/{BUILD,RPMS,SOURCES,SPECS}

# Put the archive and configuration files to the SOURCES directory
ln -s "$ARCHIVE" -t "$RPM_ROOT/SOURCES/"
cp $SCRIPT_DIR/dragonfly.service $RPM_ROOT/SOURCES/
cp $SCRIPT_DIR/dragonfly.conf $RPM_ROOT/SOURCES/

cp $SCRIPT_DIR/dragonfly.spec $RPM_ROOT/SPECS/

rpmbuild --define "_topdir $RPM_ROOT" --define "version $VERSION" -bb "$RPM_ROOT/SPECS/dragonfly.spec"
mv $RPM_ROOT/RPMS/*.rpm ./


================================================
FILE: tools/packaging/rpm/dragonfly.service
================================================
[Unit]
Description=Modern and fast key-value store
After=network.target
Documentation=

[Service]
Type=simple
EnvironmentFile=-/etc/dragonfly/environment
ExecStart=/usr/local/bin/dragonfly --flagfile=/etc/dragonfly/dragonfly.conf
PIDFile=/var/run/dragonfly/dragonfly.pid
TimeoutStopSec=infinity
Restart=always
User=dfly
Group=dfly
RuntimeDirectory=dragonfly
RuntimeDirectoryMode=2755

UMask=007
PrivateTmp=yes
LimitNOFILE=262144
PrivateDevices=yes
ProtectHome=yes
ProtectSystem=full

ReadWritePaths=-/var/lib/dragonfly
ReadWritePaths=-/var/log/dragonfly
ReadWritePaths=-/var/run/dragonfly

NoNewPrivileges=true
CapabilityBoundingSet=CAP_SETGID CAP_SETUID CAP_SYS_RESOURCE
MemoryDenyWriteExecute=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictNamespaces=true
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX


[Install]
WantedBy=multi-user.target
Alias=dragonfly.service


================================================
FILE: tools/packaging/rpm/dragonfly.spec
================================================
%define     pkg_name dragonfly
%define     archive dragonfly-%{_arch}.tar.gz

# How the package name looks like
%define     _build_name_fmt  %%{NAME}.%%{ARCH}.rpm

Name:       %{pkg_name}
Version:    %{version}
Release:    1%{?dist}
Summary:    DragonflyDB memory store
License:    BUSL-1.1
URL:        https://www.dragonflydb.io
Source0:    %{archive}
Source1:    dragonfly.service
Source2:    dragonfly.conf
Group:      Applications/System
Provides:   user(dfly)
Provides:   group(dfly)

%description
DragonflyDB is a vertically scalable and memory efficient in-memory store
that is compatible with Redis OSS and Memcached.

%pre

getent group dfly >/dev/null || groupadd -r dfly
getent passwd dfly >/dev/null || useradd -r -g dfly -M -s /sbin/nologin -c "User for DragonflyDB service" dfly

%prep

%build
tar xvfz %{SOURCE0}
mv ./dragonfly-%{_arch} ./dragonfly

%install
mkdir -p %{buildroot}/usr/local/bin
mkdir -p %{buildroot}/etc/dragonfly
mkdir -p %{buildroot}/var/log/dragonfly
mkdir -p %{buildroot}/var/lib/dragonfly

install -m 755 ./dragonfly %{buildroot}/usr/local/bin/
mkdir -p %{buildroot}/usr/lib/systemd/system
cp %{SOURCE1} %{buildroot}/usr/lib/systemd/system/
cp %{SOURCE2} %{buildroot}/etc/dragonfly/

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%attr(-,dfly,dfly) /usr/local/bin/dragonfly
%attr(-,dfly,dfly) /usr/lib/systemd/system/dragonfly.service
%attr(-,dfly,dfly) /etc/dragonfly/dragonfly.conf

%changelog


================================================
FILE: tools/parse_allocator_tracking_logs.py
================================================
#!/usr/bin/env python3

"""
Usage:
1. First run Dragonfly with tracking allocator enabled. Must be a single allocator range with 100% samping rate to catch both allocations and deallocations.
2. Finish tracking.
3. cat /tmp/dragonfly.INFO |  ./parse_allocator_tracking_logs.py
"""
import re
import sys


def parse_log(log_lines):
    memory_map = {}

    allocation_pattern = re.compile(r"Allocating (\d+) bytes \((0x[0-9a-f]+)\)")
    deallocation_pattern = re.compile(r"Deallocating (\d+) bytes \((0x[0-9a-f]+)\)")

    for line in log_lines:
        allocation_match = allocation_pattern.search(line)
        deallocation_match = deallocation_pattern.search(line)

        if allocation_match:
            size = int(allocation_match.group(1))
            address = allocation_match.group(2)
            assert address not in memory_map
            memory_map[address] = (size, line)
        elif deallocation_match:
            size = int(deallocation_match.group(1))
            address = deallocation_match.group(2)
            if address in memory_map:
                assert size == memory_map[address][0]
                del memory_map[address]
            else:
                print(f"Deallocating non existing address: {address} {size}")

    return memory_map


if __name__ == "__main__":
    log_lines = sys.stdin.readlines()
    memory_map = parse_log(log_lines)

    for address, item in memory_map.items():
        print(f"Address: {address}, Size: {item[0]} bytes, original line: `{item[1]}`")


================================================
FILE: tools/plot_memtier_latency.py
================================================
#!/usr/bin/env python3
"""
Script to read memtier_benchmark JSON output and generate interactive latency charts.

The script generates interactive HTML charts (using Plotly) where you can:
- Click on legend items to show/hide time series
- Zoom in/out and pan
- Hover over data points for detailed information

To generate the JSON file, run memtier_benchmark with the --json-out-file option:

    memtier_benchmark --server <host> --port <port> \\
        --json-out-file memtier_out.json \\
        [other options...]

Example:
    memtier_benchmark --json-out-file memtier_out.json \\
        --clients 25 --threads 4 --test-time 120 \\
        --ratio 1:10

Then run this script to visualize the results:
    ./plot_memtier_latency.py memtier_out.json

Requirements:
    pip install plotly matplotlib numpy

Note: If plotly is not available, falls back to static SVG charts.
"""

import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import webbrowser
import tempfile
import os

# Try to import plotly for interactive charts
try:
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    PLOTLY_AVAILABLE = True
except ImportError:
    PLOTLY_AVAILABLE = False
    print("Warning: plotly not available. Install with: pip install plotly")


def load_json_data(filepath):
    """Load JSON data from file."""
    with open(filepath, "r") as f:
        return json.load(f)


def extract_latency_timeseries(data, operation, ignore_last_seconds=3):
    """
    Extract latency time series data from memtier output.

    Args:
        data: Parsed JSON data
        operation: Operation type (e.g., 'Mgets', 'Sets', 'Gets', etc.)
        ignore_last_seconds: Number of seconds to ignore from the end

    Returns:
        Dictionary with time series data
    """
    time_serie = data["ALL STATS"][operation]["Time-Serie"]

    times = []
    avg_latencies = []
    p50_latencies = []
    p99_latencies = []
    p99_9_latencies = []
    min_latencies = []
    max_latencies = []
    ops_per_sec = []

    # Sort time points and determine cutoff
    sorted_times = sorted(time_serie.keys(), key=lambda x: int(x))
    if ignore_last_seconds > 0 and len(sorted_times) > ignore_last_seconds:
        # Remove last N seconds
        sorted_times = sorted_times[:-ignore_last_seconds]

    for time_point in sorted_times:
        interval_data = time_serie[time_point]
        times.append(int(time_point))
        avg_latencies.append(interval_data["Average Latency"])
        p50_latencies.append(interval_data.get("p50.00", 0))
        p99_latencies.append(interval_data.get("p99.00", 0))
        p99_9_latencies.append(interval_data.get("p99.90", 0))
        min_latencies.append(interval_data["Min Latency"])
        max_latencies.append(interval_data["Max Latency"])

        # Calculate ops/sec for this interval (count per second)
        ops_per_sec.append(interval_data["Count"])

    return {
        "times": times,
        "avg": avg_latencies,
        "p50": p50_latencies,
        "p99": p99_latencies,
        "p99.9": p99_9_latencies,
        "min": min_latencies,
        "max": max_latencies,
        "ops_per_sec": ops_per_sec,
    }


def plot_latency_chart_interactive(data, output_file="latency_chart.html", open_browser=True):
    """
    Generate interactive latency chart using Plotly.

    Args:
        data: Parsed JSON data
        output_file: Output filename for the chart
        open_browser: If True, open the chart in the browser
    """
    if not PLOTLY_AVAILABLE:
        print("Plotly not available. Falling back to matplotlib...")
        # Change extension to svg for matplotlib fallback
        svg_file = output_file.replace(".html", ".svg")
        return plot_latency_chart(data, svg_file, open_browser)

    # Get all available operations from ALL STATS (excluding 'Runtime')
    all_stats = data["ALL STATS"]
    operations = [
        key
        for key in all_stats.keys()
        if key != "Runtime" and isinstance(all_stats[key], dict) and "Time-Serie" in all_stats[key]
    ]

    if not operations:
        print("Error: No operation data found in JSON")
        return

    # Extract data for all operations
    ops_data = {}
    for op in operations:
        ops_data[op] = extract_latency_timeseries(data, op, ignore_last_seconds=3)

    # Determine subplot layout
    num_ops = len(operations)
    if num_ops == 1:
        rows, cols = 2, 1
        specs = [[{"secondary_y": False}], [{"secondary_y": False}]]
        subplot_titles = [f"{operations[0]} Latency", "Throughput"]
    elif num_ops == 2:
        rows, cols = 2, 2
        specs = [
            [{"secondary_y": False}, {"secondary_y": False}],
            [{"secondary_y": False}, {"secondary_y": False}],
        ]
        subplot_titles = [
            f"{operations[0]} Latency",
            f"{operations[1]} Latency",
            "Latency Comparison",
            "Throughput",
        ]
    else:
        rows = num_ops + 1
        cols = 1
        specs = [[{"secondary_y": False}] for _ in range(rows)]
        subplot_titles = [f"{op} Latency" for op in operations] + ["Throughput"]

    # Create subplots
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=subplot_titles,
        specs=specs,
        vertical_spacing=0.12,
        horizontal_spacing=0.1,
    )

    # Plot individual operation latencies
    for idx, op in enumerate(operations):
        if num_ops == 2:
            row = (idx // cols) + 1
            col = (idx % cols) + 1
        else:
            row = idx + 1
            col = 1

        op_data = ops_data[op]

        # Add traces with independent visibility toggle (no legendgroup)
        fig.add_trace(
            go.Scatter(
                x=op_data["times"],
                y=op_data["avg"],
                name=f"{op} Avg",
                mode="lines",
                line=dict(width=2),
            ),
            row=row,
            col=col,
        )
        fig.add_trace(
            go.Scatter(
                x=op_data["times"],
                y=op_data["p50"],
                name=f"{op} p50",
                mode="lines",
                line=dict(width=2),
            ),
            row=row,
            col=col,
        )
        fig.add_trace(
            go.Scatter(
                x=op_data["times"],
                y=op_data["p99"],
                name=f"{op} p99",
                mode="lines",
                line=dict(width=2),
            ),
            row=row,
            col=col,
        )
        fig.add_trace(
            go.Scatter(
                x=op_data["times"],
                y=op_data["p99.9"],
                name=f"{op} p99.9",
                mode="lines",
                line=dict(width=2),
            ),
            row=row,
            col=col,
        )

        fig.update_xaxes(title_text="Time (seconds)", row=row, col=col)
        fig.update_yaxes(title_text="Latency (ms)", row=row, col=col)

    # Add comparison plot if multiple operations and layout allows
    if num_ops == 2:
        comp_row, comp_col = 2, 1
        for op in operations:
            op_data = ops_data[op]
            fig.add_trace(
                go.Scatter(
                    x=op_data["times"],
                    y=op_data["p99"],
                    name=f"{op} p99 (comp)",
                    mode="lines",
                    line=dict(width=2, dash="solid"),
                ),
                row=comp_row,
                col=comp_col,
            )
            fig.add_trace(
                go.Scatter(
                    x=op_data["times"],
                    y=op_data["avg"],
                    name=f"{op} Avg (comp)",
                    mode="lines",
                    line=dict(width=2, dash="dash"),
                ),
                row=comp_row,
                col=comp_col,
            )
        fig.update_xaxes(title_text="Time (seconds)", row=comp_row, col=comp_col)
        fig.update_yaxes(title_text="Latency (ms)", row=comp_row, col=comp_col)

    # Add throughput plot
    if num_ops == 2:
        tp_row, tp_col = 2, 2
    else:
        tp_row = rows
        tp_col = 1

    for op in operations:
        op_data = ops_data[op]
        fig.add_trace(
            go.Scatter(
                x=op_data["times"],
                y=op_data["ops_per_sec"],
                name=f"{op} ops/sec",
                mode="lines",
                line=dict(width=2),
            ),
            row=tp_row,
            col=tp_col,
        )

    fig.update_xaxes(title_text="Time (seconds)", row=tp_row, col=tp_col)
    fig.update_yaxes(title_text="Operations per Second", row=tp_row, col=tp_col)

    # Update layout
    fig.update_layout(
        title_text="Memtier Benchmark - Latency Analysis (Interactive - Click legend to toggle)",
        height=300 * rows,
        hovermode="x unified",
        showlegend=True,
        legend=dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.02),
    )

    # Add annotation with statistics
    stats_lines = ["<b>Overall Statistics (last 3 seconds excluded):</b><br>"]
    for op in operations:
        op_stats = all_stats[op]
        stats_lines.append(
            f"{op}: Avg={op_stats['Average Latency']:.3f}ms, "
            f"p99={op_stats['Percentile Latencies']['p99.00']:.3f}ms, "
            f"Ops/sec={op_stats['Ops/sec']:.2f}<br>"
        )
    stats_lines.append(f"Duration: {all_stats['Runtime']['Total duration'] / 1000:.1f}s")

    fig.add_annotation(
        text="".join(stats_lines),
        xref="paper",
        yref="paper",
        x=0.5,
        y=-0.05,
        showarrow=False,
        font=dict(size=10),
        bgcolor="wheat",
        bordercolor="black",
        borderwidth=1,
        xanchor="center",
        yanchor="top",
    )

    # Save to HTML
    fig.write_html(output_file)
    print(f"Interactive chart saved to: {output_file}")

    # Open in browser
    if open_browser:
        abs_path = os.path.abspath(output_file)
        file_url = f"file://{abs_path}"
        print(f"Opening chart in browser: {file_url}")
        webbrowser.open(file_url)


def plot_latency_chart(data, output_file="latency_chart.svg", open_browser=True):
    """
    Generate latency chart from memtier data.

    Args:
        data: Parsed JSON data
        output_file: Output filename for the chart
        open_browser: If True, open the chart in the browser
    """
    # Get all available operations from ALL STATS (excluding 'Runtime')
    all_stats = data["ALL STATS"]
    operations = [
        key
        for key in all_stats.keys()
        if key != "Runtime" and isinstance(all_stats[key], dict) and "Time-Serie" in all_stats[key]
    ]

    if not operations:
        print("Error: No operation data found in JSON")
        return

    # Extract data for all operations
    ops_data = {}
    for op in operations:
        ops_data[op] = extract_latency_timeseries(data, op, ignore_last_seconds=3)

    # Determine number of subplots needed
    num_ops = len(operations)
    if num_ops == 1:
        # Single operation: 2x2 grid with detailed views
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        axes = axes.flatten()
    elif num_ops == 2:
        # Two operations: 2x2 grid
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        axes = axes.flatten()
    else:
        # Multiple operations: dynamic grid
        rows = (num_ops + 1) // 2 + 1
        fig, axes = plt.subplots(rows, 2, figsize=(16, 4 * rows))
        axes = axes.flatten()

    fig.suptitle("Memtier Benchmark - Latency Analysis", fontsize=16, fontweight="bold")

    # Plot each operation's latency percentiles
    for idx, op in enumerate(operations):
        if idx >= len(axes) - 1:  # Save last plot for throughput
            break

        ax = axes[idx]
        op_data = ops_data[op]

        ax.plot(op_data["times"], op_data["avg"], label="Average", linewidth=2)
        ax.plot(op_data["times"], op_data["p50"], label="p50", linewidth=2)
        ax.plot(op_data["times"], op_data["p99"], label="p99", linewidth=2)
        ax.plot(op_data["times"], op_data["p99.9"], label="p99.9", linewidth=2)
        ax.set_xlabel("Time (seconds)", fontsize=12)
        ax.set_ylabel("Latency (ms)", fontsize=12)
        ax.set_title(f"{op} Operations - Latency Percentiles", fontsize=14, fontweight="bold")
        ax.legend(loc="best")
        ax.grid(True, alpha=0.3)

    # Comparison plot (if multiple operations)
    if num_ops > 1:
        comparison_idx = min(num_ops, len(axes) - 2)
        ax_comp = axes[comparison_idx]

        for op in operations:
            op_data = ops_data[op]
            ax_comp.plot(op_data["times"], op_data["p99"], label=f"{op} p99", linewidth=2)
            ax_comp.plot(
                op_data["times"],
                op_data["avg"],
                label=f"{op} Avg",
                linewidth=2,
                linestyle="--",
                alpha=0.7,
            )

        ax_comp.set_xlabel("Time (seconds)", fontsize=12)
        ax_comp.set_ylabel("Latency (ms)", fontsize=12)
        ax_comp.set_title("Operations Comparison - Latency", fontsize=14, fontweight="bold")
        ax_comp.legend(loc="best")
        ax_comp.grid(True, alpha=0.3)

    # Throughput plot
    throughput_idx = min(num_ops + 1, len(axes) - 1) if num_ops > 1 else len(axes) - 1
    ax_throughput = axes[throughput_idx]

    for op in operations:
        op_data = ops_data[op]
        ax_throughput.plot(
            op_data["times"], op_data["ops_per_sec"], label=f"{op} ops/sec", linewidth=2
        )

    ax_throughput.set_xlabel("Time (seconds)", fontsize=12)
    ax_throughput.set_ylabel("Operations per Second", fontsize=12)
    ax_throughput.set_title("Throughput Over Time", fontsize=14, fontweight="bold")
    ax_throughput.legend(loc="best")
    ax_throughput.grid(True, alpha=0.3)

    # Hide any unused subplots
    for idx in range(throughput_idx + 1, len(axes)):
        axes[idx].set_visible(False)

    # Add overall statistics as text
    stats_lines = ["Overall Statistics (last 3 seconds excluded):"]
    for op in operations:
        op_stats = all_stats[op]
        stats_lines.append(
            f"{op}: Avg={op_stats['Average Latency']:.3f}ms, "
            f"p99={op_stats['Percentile Latencies']['p99.00']:.3f}ms, "
            f"Ops/sec={op_stats['Ops/sec']:.2f}"
        )
    stats_lines.append(f"Duration: {all_stats['Runtime']['Total duration'] / 1000:.1f}s")
    stats_text = "\n".join(stats_lines)

    fig.text(
        0.5,
        0.02,
        stats_text,
        ha="center",
        fontsize=10,
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5),
    )

    plt.tight_layout(rect=[0, 0.05, 1, 0.97])
    plt.savefig(output_file, dpi=300, bbox_inches="tight", format="svg")
    print(f"Chart saved to: {output_file}")

    # Open in browser
    if open_browser:
        abs_path = os.path.abspath(output_file)
        file_url = f"file://{abs_path}"
        print(f"Opening chart in browser: {file_url}")
        webbrowser.open(file_url)

    plt.close()


def print_summary(data):
    """Print summary statistics."""
    print("\n" + "=" * 60)
    print("MEMTIER BENCHMARK SUMMARY")
    print("=" * 60)

    config = data["configuration"]
    runtime = data["ALL STATS"]["Runtime"]

    print(f"\nConfiguration:")
    print(f"  Server: {config['server']}:{config['port']}")
    print(f"  Clients: {config['clients']}")
    print(f"  Threads: {config['threads']}")
    print(f"  Duration: {runtime['Total duration'] / 1000:.1f}s")
    print(f"  Pipeline: {config['pipeline']}")
    print(f"  Ratio (SET:GET): {config['ratio']}")

    # Get all operations dynamically
    all_stats = data["ALL STATS"]
    operations = [
        key
        for key in all_stats.keys()
        if key != "Runtime" and isinstance(all_stats[key], dict) and "Count" in all_stats[key]
    ]

    for op in operations:
        op_stats = all_stats[op]
        print(f"\n{op} Operations:")
        print(f"  Total: {op_stats['Count']:,}")
        print(f"  Ops/sec: {op_stats['Ops/sec']:.2f}")
        print(f"  Avg Latency: {op_stats['Average Latency']:.3f} ms")
        print(f"  Min Latency: {op_stats['Min Latency']:.3f} ms")
        print(f"  Max Latency: {op_stats['Max Latency']:.3f} ms")
        print(f"  p50: {op_stats['Percentile Latencies']['p50.00']:.3f} ms")
        print(f"  p99: {op_stats['Percentile Latencies']['p99.00']:.3f} ms")
        print(f"  p99.9: {op_stats['Percentile Latencies']['p99.90']:.3f} ms")

    print("\n" + "=" * 60 + "\n")


def main():
    """Main function."""
    import sys

    # Get input file from command line or use default
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        input_file = "memtier_out.json"

    # Get output file from command line or use default
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        # Use .html for interactive charts by default
        output_file = "latency_chart.html" if PLOTLY_AVAILABLE else "latency_chart.svg"

    # Check if input file exists
    if not Path(input_file).exists():
        print(f"Error: Input file '{input_file}' not found!")
        print(f"\nUsage: {sys.argv[0]} [input_file.json] [output_file.html|.svg]")
        print(f"\nTo generate the JSON file, run memtier_benchmark with --json-out-file:")
        print(f"  memtier_benchmark --server <host> --port <port> \\")
        print(f"      --json-out-file memtier_out.json \\")
        print(f"      [other options...]")
        sys.exit(1)

    # Load and process data
    print(f"Loading data from: {input_file}")
    data = load_json_data(input_file)

    # Print summary
    print_summary(data)

    # Generate chart
    print(f"Generating latency chart...")

    # Use interactive chart if output is .html, otherwise use matplotlib
    if output_file.endswith(".html"):
        plot_latency_chart_interactive(data, output_file)
    else:
        plot_latency_chart(data, output_file)

    print(f"\nDone!")
    if PLOTLY_AVAILABLE:
        print(f"Tip: Use .html extension for interactive charts with toggleable series")


if __name__ == "__main__":
    main()


================================================
FILE: tools/release.sh
================================================
#!/usr/bin/env sh

RELEASE_DIR=build-release
APP_PATH=$RELEASE_DIR/dragonfly

set -e

if ! [ -f "helio/blaze.sh" ]; then
   echo "ERROR"
   echo "Could not find helio. Please only run this script from repo root."
   echo "If you are already on the repo root, you might've cloned without submodules."
   echo "Try running 'git submodule update --init --recursive'"
   exit 1
fi

pwd

make release

if ! [ -f ${APP_PATH} ]; then
   echo "ERROR"
   echo "Failed to generate new dragonfly binary."
   exit 1
fi

echo "Running ${APP_PATH} --version"
${APP_PATH} --version

if readelf -a ${APP_PATH} | grep GLIBC_PRIVATE >/dev/null 2>&1 ; then
   echo "ERROR"
   echo "The generated binary contain invalid GLIBC version entries."
   exit 1
fi

make package
echo "Release package created: "
ls -lh $RELEASE_DIR/


================================================
FILE: tools/replay/go.mod
================================================
module dragonfydb.io/traffic-replay

go 1.18

require (
	github.com/influxdata/tdigest v0.0.1
	github.com/pterm/pterm v0.12.25
	github.com/redis/go-redis/v9 v9.7.3
)

require (
	github.com/atomicgo/cursor v0.0.1 // indirect
	github.com/cespare/xxhash/v2 v2.2.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
	github.com/gookit/color v1.4.2 // indirect
	github.com/mattn/go-runewidth v0.0.13 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/rivo/uniseg v0.2.0 // indirect
	github.com/stretchr/testify v1.7.0 // indirect
	github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778 // indirect
	golang.org/x/sys v0.1.0 // indirect
	golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)


================================================
FILE: tools/replay/go.sum
================================================
github.com/atomicgo/cursor v0.0.1 h1:xdogsqa6YYlLfM+GyClC/Lchf7aiMerFiZQn7soTOoU=
github.com/atomicgo/cursor v0.0.1/go.mod h1:cBON2QmmrysudxNBFthvMtN32r3jxVRIvzkUiF/RuIk=
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/gookit/color v1.4.2 h1:tXy44JFSFkKnELV6WaMo/lLfu/meqITX3iAV52do7lk=
github.com/gookit/color v1.4.2/go.mod h1:fqRyamkC1W8uxl+lxCQxOT09l/vYfZ+QeiX3rKQHCoQ=
github.com/influxdata/tdigest v0.0.1 h1:XpFptwYmnEKUqmkcDjrzffswZ3nvNeevbUSLPP/ZzIY=
github.com/influxdata/tdigest v0.0.1/go.mod h1:Z0kXnxzbTC2qrx4NaIzYkE1k66+6oEDQTvL95hQFh5Y=
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pterm/pterm v0.12.25 h1:l9a8FU4XmJHs6rug8/YV1L2g/LBdMgaTvBBgwxD8avI=
github.com/pterm/pterm v0.12.25/go.mod h1:PhQ89w4i95rhgE+xedAoqous6K9X+r6aSOI2eFF7DZI=
github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM=
github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778 h1:QldyIu/L63oPpyvQmHgvgickp1Yw510KJOqX7H24mg8=
github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778/go.mod h1:2MuV+tbUrU1zIOPMxZ5EncGwgmMJsa+9ucAQZXxsObs=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de h1:xSjD6HQTqT0H/k60N5yYBtnN1OEkVy7WIo/DYyxKRO0=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d h1:SZxvLBoTP5yHO3Frd4z4vrF+DBX9vMVanchswa69toE=
golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca h1:PupagGYwj8+I4ubCxcmcBRk3VlUWtTg5huQpZR9flmE=
gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=


================================================
FILE: tools/replay/main.go
================================================
package main

import (
	"flag"
	"fmt"
	"os"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/pterm/pterm"
)

var fHost = flag.String("host", "127.0.0.1:6379", "Redis host")
var fCompareHost = flag.String("compare-host", "", "Redis host to compare with")
var fClientBuffer = flag.Int("buffer", 100, "How many records to buffer per client")
var fPace = flag.Bool("pace", true, "whether to pace the traffic according to the original timings.false - to pace as fast as possible")
var fSkip = flag.Uint("skip", 0, "skip N records")
var fSkipTimeSec = flag.Int("skip-time-sec", 0, "skip records in the first N seconds of the recording")
var fIgnoreParseErrors = flag.Bool("ignore-parse-errors", false, "ignore parsing errors")
var fTimeLimit = flag.Int("time-limit", 0, "time limit in seconds (0 = no limit)")

func RenderTable(area *pterm.AreaPrinter, files []string, workers []FileWorker) {
	tableData := pterm.TableData{{"file", "parsed", "processed", "delayed", "clients", "avg(us)", "p75(us)", "p90(us)", "p99(us)", "p99.9(us)"}}
	for i := range workers {
		workers[i].latencyMu.Lock()
		avg := 0.0
		if workers[i].latencyCount > 0 {
			avg = workers[i].latencySum / float64(workers[i].latencyCount)
		}
		p75 := workers[i].latencyDigest.Quantile(0.75)
		p90 := workers[i].latencyDigest.Quantile(0.9)
		p99 := workers[i].latencyDigest.Quantile(0.99)
		p999 := workers[i].latencyDigest.Quantile(0.999)
		workers[i].latencyMu.Unlock()
		tableData = append(tableData, []string{
			files[i],
			fmt.Sprint(atomic.LoadUint64(&workers[i].parsed)),
			fmt.Sprint(atomic.LoadUint64(&workers[i].processed)),
			fmt.Sprint(atomic.LoadUint64(&workers[i].delayed)),
			fmt.Sprint(atomic.LoadUint64(&workers[i].clients)),
			fmt.Sprintf("%.0f", avg),
			fmt.Sprintf("%.0f", p75),
			fmt.Sprintf("%.0f", p90),
			fmt.Sprintf("%.0f", p99),
			fmt.Sprintf("%.0f", p999),
		})
	}
	content, _ := pterm.DefaultTable.WithHasHeader().WithBoxed().WithData(tableData).Srender()
	area.Update(content)
}

// RenderPipelineRangesTable renders the latency digests for each pipeline range
func RenderPipelineRangesTable(area *pterm.AreaPrinter, files []string, workers []FileWorker) {
	tableData := pterm.TableData{{"file", "Pipeline Range", "p75(us)", "p90(us)", "p99(us)", "p99.9(us)"}}
	for i := range workers {
		workers[i].latencyMu.Lock()
		for _, rng := range pipelineRanges {
			if digest, ok := workers[i].perRange[rng.label]; ok {
				p75 := digest.Quantile(0.75)
				p90 := digest.Quantile(0.9)
				p99 := digest.Quantile(0.99)
				p999 := digest.Quantile(0.999)
				tableData = append(tableData, []string{
					files[i],
					rng.label,
					fmt.Sprintf("%.0f", p75),
					fmt.Sprintf("%.0f", p90),
					fmt.Sprintf("%.0f", p99),
					fmt.Sprintf("%.0f", p999),
				})
			}
		}
		workers[i].latencyMu.Unlock()
	}
	content, _ := pterm.DefaultTable.WithHasHeader().WithBoxed().WithData(tableData).Srender()
	area.Update(content)
}

func Run(files []string) {
	baseTime := DetermineBaseTime(files)

	var skipUntil uint64
	effectiveBaseTime := baseTime
	if *fSkipTimeSec > 0 {
		skipDuration := time.Duration(*fSkipTimeSec) * time.Second
		skipUntil = uint64(baseTime.Add(skipDuration).UnixNano())
		effectiveBaseTime = baseTime.Add(skipDuration)
	}
	timeOffset := time.Now().Add(500 * time.Millisecond).Sub(effectiveBaseTime)
	fmt.Println("Offset -> ", timeOffset)

	// Calculate stop time based on recording timestamps if time limit is specified
	var stopUntil uint64
	if *fTimeLimit > 0 {
		limitDuration := time.Duration(*fTimeLimit) * time.Second
		stopUntil = uint64(effectiveBaseTime.Add(limitDuration).UnixNano())
		fmt.Printf("Time limit set to %d seconds\n", *fTimeLimit)
	}

	// Start a worker for every file. They take care of spawning client workers.
	var wg sync.WaitGroup
	workers := make([]FileWorker, len(files))
	for i := range workers {
		workers[i] = FileWorker{timeOffset: timeOffset, skipUntil: skipUntil, stopUntil: stopUntil}
		wg.Add(1)
		go workers[i].Run(files[i], &wg)
	}

	wgDone := make(chan bool)
	go func() {
		wg.Wait()
		wgDone <- true
	}()

	// Render table while running
	area, _ := pterm.DefaultArea.WithCenter().Start()
	for running := true; running; {
		select {
		case <-wgDone:
			running = false
		case <-time.After(100 * time.Millisecond):
			RenderTable(area, files, workers)
		}
	}

	RenderTable(area, files, workers) // to show last stats
	areaPipelineRanges, _ := pterm.DefaultArea.WithCenter().Start()
	RenderPipelineRangesTable(areaPipelineRanges, files, workers) // to render per pipeline-range latency digests
}

func Print(files []string) {
	type StreamTop struct {
		record Record
		ch     chan Record
	}

	// Start file reader goroutines
	var wg sync.WaitGroup
	wg.Add(len(files))

	tops := make([]StreamTop, len(files))
	for i, file := range files {
		tops[i].ch = make(chan Record, 100)
		go func(ch chan Record, file string) {
			parseRecords(file, func(r Record) bool {
				ch <- r
				return true
			}, *fIgnoreParseErrors)
			close(ch)
			wg.Done()
		}(tops[i].ch, file)
	}

	// Pick record with minimum time from each channel
	for {
		minTime := ^uint64(0)
		minIndex := -1
		for i := range tops {
			if tops[i].record.Time == 0 {
				if r, ok := <-tops[i].ch; ok {
					tops[i].record = r
				}
			}

			if rt := tops[i].record.Time; rt > 0 && rt < minTime {
				minTime = rt
				minIndex = i
			}
		}

		if minIndex == -1 {
			break
		}

		fmt.Println(tops[minIndex].record.values...)
		tops[minIndex].record = Record{}
	}

	wg.Wait()
}

func Analyze(files []string) {
	total := 0
	chained := 0
	clients := 0
	cmdCounts := make(map[string]uint)

	// count stats
	for _, file := range files {
		fileClients := make(map[uint32]bool)

		parseRecords(file, func(r Record) bool {
			total += 1
			if r.HasMore > 0 {
				chained += 1
			}

			fileClients[r.Client] = true
			cmdCounts[r.values[0].(string)] += 1

			return true
		}, *fIgnoreParseErrors)

		clients += len(fileClients)
	}

	// sort commands by frequencies
	type Freq struct {
		cmd   string
		count uint
	}
	var sortedCmds []Freq
	for cmd, count := range cmdCounts {
		sortedCmds = append(sortedCmds, Freq{cmd, count})
	}
	sort.Slice(sortedCmds, func(i, j int) bool {
		return sortedCmds[i].count > sortedCmds[j].count
	})

	// Print all the info
	fmt.Println("Total commands", total)
	fmt.Println("Has more%", 100*float32(chained)/float32(total))
	fmt.Println("Total clients", clients)

	for _, freq := range sortedCmds {
		fmt.Printf("%8d | %v \n", freq.count, freq.cmd)
	}
}

func main() {
	flag.Usage = func() {
		binaryName := os.Args[0]

		fmt.Fprintf(os.Stderr, "Usage: %s [options] <command> <files...>\n", binaryName)
		fmt.Fprintln(os.Stderr, "\nOptions:")
		flag.PrintDefaults()
		fmt.Fprintln(os.Stderr, "\nCommands:")
		fmt.Fprintln(os.Stderr, "  run  - replays the traffic")
		fmt.Fprintln(os.Stderr, "  print - prints the command")
		fmt.Fprintln(os.Stderr, "  analyze - analyzes the traffic")

		fmt.Fprintln(os.Stderr, "\nExamples:")
		fmt.Fprintf(os.Stderr, "   %s -host 192.168.1.10:6379 -buffer 50 run *.bin\n", binaryName)
		fmt.Fprintf(os.Stderr, "   %s -skip-time-sec 30 run *.bin\n", binaryName)
		fmt.Fprintf(os.Stderr, "   %s -time-limit 60 run *.bin\n", binaryName)
		fmt.Fprintf(os.Stderr, "   %s print *.bin\n", binaryName)
	}

	flag.Parse()
	if flag.NArg() < 2 {
		flag.Usage()
		os.Exit(1)
	}

	cmd := flag.Arg(0)
	files := flag.Args()[1:]

	switch strings.ToLower(cmd) {
	case "run":
		Run(files)
	case "print":
		Print(files)
	case "analyze":
		Analyze(files)
	}
}


================================================
FILE: tools/replay/parsing.go
================================================
package main

import (
	"bufio"
	"encoding/binary"
	"io"
	"log"
	"os"
)

var kBigEmptyBytes = make([]byte, 100_000)

func parseStrings(file io.Reader) (out []interface{}, err error) {
	var num, strLen uint32
	err = binary.Read(file, binary.LittleEndian, &num)
	if err != nil {
		return nil, err
	}

	out = make([]interface{}, num)
	for i := range out {
		err = binary.Read(file, binary.LittleEndian, &strLen)
		if err != nil {
			return nil, err
		}
		out[i] = strLen
	}

	for i := range out {
		strLen = out[i].(uint32)
		buf := make([]byte, strLen)

		_, err := io.ReadFull(file, buf)
		if err != nil {
			return nil, err
		}

		out[i] = string(buf)
	}
	return
}

func parseRecords(filename string, cb func(Record) bool, ignoreErrors bool) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer file.Close()

	reader := bufio.NewReader(file)

	var version uint8
	binary.Read(reader, binary.LittleEndian, &version)
	if version != 2 {
		panic("Requires version two replayer, roll back in commits!")
	}

	recordNum := 0
	for {
		var rec Record
		err := binary.Read(reader, binary.LittleEndian, &rec.RecordHeader)
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		rec.values, err = parseStrings(reader)
		if err != nil {
			log.Printf("Could not parse %vth record", recordNum)
			if !ignoreErrors {
				return err
			}
			log.Printf("Ignoring parse error and continuing")
			recordNum++
			continue
		}

		if !cb(rec) {
			return nil
		}
		recordNum++
	}

	return nil
}


================================================
FILE: tools/replay/workers.go
================================================
package main

import (
	"context"
	"fmt"
	"log"
	"math"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/influxdata/tdigest"
	"github.com/redis/go-redis/v9"
)

type RecordHeader struct {
	Client  uint32
	Time    uint64
	DbIndex uint32
	HasMore uint32
}

type Record struct {
	RecordHeader
	values []interface{} // instead of []string to unwrap into variadic
}

// Determine earliest time
func DetermineBaseTime(files []string) time.Time {
	var minTime uint64 = math.MaxUint64
	for _, file := range files {
		parseRecords(file, func(r Record) bool {
			if r.Time < minTime {
				minTime = r.Time
			}
			return false
		}, *fIgnoreParseErrors)
	}
	return time.Unix(0, int64(minTime))
}

// Handles a single connection/client
type ClientWorker struct {
	redis     *redis.Client
	compare     *redis.Client
	incoming  chan Record
	processed uint
	pipe      redis.Pipeliner
	comparePipe redis.Pipeliner
}

// Pipeline length ranges for summary
var pipelineRanges = []struct {
	label string
	min   int
	max   int // inclusive, except last
}{
	{"0-29", 0, 29},
	{"30-79", 30, 79},
	{"80-199", 80, 199},
	{"200+", 200, 1 << 30},
}

var compareIgnoreCmds = []string{
    "HELLO",
    "AUTH",
    "SELECT",
    "INFO",
    "TIME",
    "CLIENT",
    "CONFIG",
}

// Handles a single file and distributes messages to clients
type FileWorker struct {
	clientGroup sync.WaitGroup
	timeOffset  time.Duration
	skipUntil   uint64
	stopUntil   uint64 // timestamp when to stop processing traffic (0 = no limit)
	// stats for output, updated by clients, read by rendering goroutine
	processed uint64
	delayed   uint64
	parsed    uint64
	clients   uint64

	latencyDigest *tdigest.TDigest
	latencyMu     sync.Mutex

	latencySum   float64 // sum of all batch latencies (microseconds)
	latencyCount uint64  // number of batches

	// per-pipeline-range latency digests
	perRange map[string]*tdigest.TDigest
}

// Helper function to track latency and update digests
func trackLatency(worker *FileWorker, batchLatency float64, size int) {
	worker.latencyMu.Lock()
	defer worker.latencyMu.Unlock()
	worker.latencyDigest.Add(batchLatency, 1)
	worker.latencySum += batchLatency
	worker.latencyCount++
	// Add to per-range digest
	if worker.perRange != nil {
		for _, rng := range pipelineRanges {
			if size >= rng.min && size <= rng.max {
				worker.perRange[rng.label].Add(batchLatency, 1)
				break
			}
		}
	}
}

func ignoreCompareCmd(c redis.Cmder) bool {
    args := c.Args()
    if len(args) == 0 {
        return true
    }
    name := strings.ToUpper(fmt.Sprint(args[0]))
    for _, ign := range compareIgnoreCmds {
        if name == ign {
            return true
        }
    }
    return false
}

func cmdAsString(c redis.Cmder) string {
    args := c.Args()
    if len(args) == 0 {
        return "<no-args>"
    }

    name := strings.ToUpper(fmt.Sprint(args[0]))
    if len(args) == 1 {
        return name
    }

    parts := make([]string, 0, len(args) - 1)
    for _, a := range args[1:] {
        s := fmt.Sprint(a)
        parts = append(parts, s)
    }
    return name + " " + strings.Join(parts, " ")
}

func cmdResultString(cm redis.Cmder) string {
    if err := cm.Err(); err != nil {
        if err == redis.Nil {
            return "(nil)"
        }
        return "ERR: " + err.Error()
    }

    if cmd, ok := cm.(*redis.Cmd); ok {
        v := cmd.Val()
        s := fmt.Sprintf("%v", v)
        return s
    }

    return fmt.Sprintf("<unknown Cmder %T>", cm)
}

func compareCmdResults(a, b []redis.Cmder, lastMsg Record) {
    if len(a) != len(b) {
        log.Fatalf("[COMPARE] mismatch count: primary=%d compare=%d (last client=%d time=%d)", len(a), len(b), lastMsg.Client, lastMsg.Time)
        return
    }

    for i := range a {
		if (ignoreCompareCmd(a[i])) {
			continue
		}
		pa := cmdResultString(a[i])
        pb := cmdResultString(b[i])
        if pa != pb {
			log.Fatalf("[COMPARE] mismatch at idx %d cmd=%s\n  primary=%s\n  compare=%s\n  (client=%d time=%d)", i, cmdAsString(a[i]), pa, pb, lastMsg.Client, lastMsg.Time)
		}
    }
}

func (c *ClientWorker) Run(pace bool, worker *FileWorker) {
	for msg := range c.incoming {
		if c.processed == 0 && msg.DbIndex != 0 {
			// There is no easy way to switch, we rely on connection pool consisting only of one connection
			c.redis.Do(context.Background(), []interface{}{"SELECT", fmt.Sprint(msg.DbIndex)})
			if c.compare != nil {
        		c.compare.Do(context.Background(), []interface{}{"SELECT", fmt.Sprint(msg.DbIndex)})
    		}
		}

		lag := time.Until(worker.HappensAt(time.Unix(0, int64(msg.Time))))
		if lag < 0 {
			atomic.AddUint64(&worker.delayed, 1)
		}

		if pace {
			time.Sleep(lag)
		}

		c.pipe.Do(context.Background(), msg.values...).Result()
		if c.comparePipe != nil {
    		c.comparePipe.Do(context.Background(), msg.values...).Result()
		}

		atomic.AddUint64(&worker.processed, 1)

		if msg.HasMore == 0 {
			size := c.pipe.Len()
			start := time.Now()
			cmds, _ := c.pipe.Exec(context.Background())
			batchLatency := float64(time.Since(start).Microseconds())
			trackLatency(worker, batchLatency, size)
			c.processed += uint(size)

    		if c.comparePipe != nil {
        		ccmds, _ := c.comparePipe.Exec(context.Background())
        		compareCmdResults(cmds, ccmds, msg)
    		}
		}
	}

	if size := c.pipe.Len(); size >= 0 {
		start := time.Now()
		c.pipe.Exec(context.Background())
		batchLatency := float64(time.Since(start).Microseconds())
		trackLatency(worker, batchLatency, size)
		c.processed += uint(size)
	}

	worker.clientGroup.Done()
}

func NewClient(w *FileWorker, pace bool) *ClientWorker {
	client := &ClientWorker{
		redis:    redis.NewClient(&redis.Options{Addr: *fHost, PoolSize: 1, DisableIndentity: true}),
		incoming: make(chan Record, *fClientBuffer),
	}
	client.pipe = client.redis.Pipeline()

	if *fCompareHost != "" {
        client.compare = redis.NewClient(&redis.Options{Addr: *fCompareHost, PoolSize: 1, DisableIndentity: true})
        client.comparePipe = client.compare.Pipeline()
    }

	atomic.AddUint64(&w.clients, 1)
	w.clientGroup.Add(1)
	go client.Run(pace, w)
	return client
}

func (w *FileWorker) Run(file string, wg *sync.WaitGroup) {
	w.latencyDigest = tdigest.NewWithCompression(1000)
	w.perRange = make(map[string]*tdigest.TDigest)
	for _, rng := range pipelineRanges {
		w.perRange[rng.label] = tdigest.NewWithCompression(500)
	}
	clients := make(map[uint32]*ClientWorker, 0)
	recordId := uint64(0)
	err := parseRecords(file, func(r Record) bool {
		client, ok := clients[r.Client]
		if !ok {
			client = NewClient(w, *fPace)
			clients[r.Client] = client
		}
		cmdName := strings.ToLower(r.values[0].(string))
		recordId += 1
		if cmdName != "eval" && recordId < uint64(*fSkip) {
			return true
		}

		if w.skipUntil > 0 && r.Time < w.skipUntil {
			return true
		}

		atomic.AddUint64(&w.parsed, 1)

		if w.stopUntil > 0 && r.Time > w.stopUntil {
			return true
		}

		client.incoming <- r
		return true
	}, *fIgnoreParseErrors)

	if err != nil {
		log.Fatalf("Could not parse records for file %s: %v", file, err)
	}

	for _, client := range clients {
		close(client.incoming)
	}
	w.clientGroup.Wait()
	wg.Done()
}

func (w *FileWorker) HappensAt(recordTime time.Time) time.Time {
	return recordTime.Add(w.timeOffset)
}


================================================
FILE: tools/requirements.txt
================================================
aioredis==2.0.1
async_timeout==4.0.2
pytoml==0.1.21
PyYAML==6.0
railroad==0.5.0
redis==4.4.4
requests>=2.32.0
aiocsv==1.2.3
aiofiles==22.1.0
numpy==2.1.3


================================================
FILE: tools/run_master_replica.sh
================================================
#!/usr/bin/env bash
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
DFLY_BIN="${DFLY_BIN:-${ROOT_DIR}/build-dbg/dragonfly}"
MASTER_PORT="${MASTER_PORT:-6379}"
REPLICA_PORT="${REPLICA_PORT:-6380}"

if [[ ! -x "${DFLY_BIN}" ]]; then
    echo "Dragonfly binary not found. Build it first (e.g., cd build-dbg && ninja dragonfly)." >&2
    exit 1
fi

MASTER_DIR="$(mktemp -d -t dfly-master-XXXXXX)"
REPLICA_DIR="$(mktemp -d -t dfly-replica-XXXXXX)"
MASTER_LOG_DIR="$(mktemp -d -t dfly-master-logs-XXXXXX)"
REPLICA_LOG_DIR="$(mktemp -d -t dfly-replica-logs-XXXXXX)"

cleanup() {
  if [[ -n "${REPLICA_PID:-}" ]]; then
    kill "${REPLICA_PID}" 2>/dev/null || true
  fi
  if [[ -n "${MASTER_PID:-}" ]]; then
    kill "${MASTER_PID}" 2>/dev/null || true
  fi
  rm -rf "${MASTER_DIR}" "${REPLICA_DIR}" "${MASTER_LOG_DIR}" "${REPLICA_LOG_DIR}"
}
trap cleanup EXIT

set -x
echo "Starting master on port ${MASTER_PORT} (threads=4, shards=3)..."
"${DFLY_BIN}" \
  --port "${MASTER_PORT}" --dir "${MASTER_DIR}" --omit_basic_usage \
  --dbfilename="" \
  --proactor_threads 4 --num_shards 3 \
  --log_dir "${MASTER_LOG_DIR}" &
MASTER_PID=$!

echo "Starting replica on port ${REPLICA_PORT} (threads=3, shards=2)..."
"${DFLY_BIN}" \
  --port "${REPLICA_PORT}" --dir "${REPLICA_DIR}" --omit_basic_usage \
  --dbfilename="" \
  --proactor_threads 3 --num_shards 2 \
  --replicaof "127.0.0.1:${MASTER_PORT}" \
  --log_dir "${REPLICA_LOG_DIR}" \
  --replicaof_no_one_start_journal &
REPLICA_PID=$!

set +x
sleep 0.5
echo -e "\n\n\nReplication running. Master PID: ${MASTER_PID}, Replica PID: ${REPLICA_PID}"
echo "Master port: ${MASTER_PORT}, Replica port: ${REPLICA_PORT}"
echo "Press Ctrl+C to stop."

wait "${MASTER_PID}" "${REPLICA_PID}"


================================================
FILE: tools/vector-benches/README.md
================================================
Tool for benchmarking vector search with randomized vectors.

## Logic
The tool connects to the Redis/Dragonfly instance.
1. Checks if the database has enough data (at least 50% of requested `-n`). If not, it **flushes the DB** and generates random vectors.
2. Checks if the index `idx` exists. If not, it creates it.
3. Runs concurrent search queries and reports latency/QPS.

## Arguments

| Flag | Default | Description |
|------|---------|-------------|
| `-n` | 50000 | **Number of vectors** to populate if the DB is empty. |
| `-q` | 1000 | **Total number of queries** to run during the benchmark. |
| `-t` | 8 | **Query threads**. Number of concurrent workers sending queries. |
| `-d` | 100 | **Vector dimension**. Size of the float32 vectors. |
| `-k` | 10 | **Top K**. Number of nearest neighbors to retrieve per query. |
| `-p` | 6379 | **Port** of the server. |
| `-h` | localhost | **Host** of the server. |

Run with `-h` (help) to see these defaults in the tool itself.


================================================
FILE: tools/vector-benches/go.mod
================================================
module dragonfly.vsbench

go 1.24.5

require (
	atomicgo.dev/cursor v0.2.0 // indirect
	atomicgo.dev/keyboard v0.2.9 // indirect
	atomicgo.dev/schedule v0.1.0 // indirect
	github.com/cespare/xxhash/v2 v2.3.0 // indirect
	github.com/containerd/console v1.0.5 // indirect
	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
	github.com/gomodule/redigo v1.8.3 // indirect
	github.com/gookit/color v1.5.4 // indirect
	github.com/lithammer/fuzzysearch v1.1.8 // indirect
	github.com/mattn/go-runewidth v0.0.16 // indirect
	github.com/pterm/pterm v0.12.82 // indirect
	github.com/redis/go-redis/v9 v9.14.1 // indirect
	github.com/rivo/uniseg v0.4.7 // indirect
	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
	golang.org/x/sys v0.33.0 // indirect
	golang.org/x/term v0.32.0 // indirect
	golang.org/x/text v0.26.0 // indirect
)


================================================
FILE: tools/vector-benches/go.sum
================================================
atomicgo.dev/cursor v0.2.0 h1:H6XN5alUJ52FZZUkI7AlJbUc1aW38GWZalpYRPpoPOw=
atomicgo.dev/cursor v0.2.0/go.mod h1:Lr4ZJB3U7DfPPOkbH7/6TOtJ4vFGHlgj1nc+n900IpU=
atomicgo.dev/keyboard v0.2.9 h1:tOsIid3nlPLZ3lwgG8KZMp/SFmr7P0ssEN5JUsm78K8=
atomicgo.dev/keyboard v0.2.9/go.mod h1:BC4w9g00XkxH/f1HXhW2sXmJFOCWbKn9xrOunSFtExQ=
atomicgo.dev/schedule v0.1.0 h1:nTthAbhZS5YZmgYbb2+DH8uQIZcTlIrd4eYr3UQxEjs=
atomicgo.dev/schedule v0.1.0/go.mod h1:xeUa3oAkiuHYh8bKiQBRojqAMq3PXXbJujjb0hw8pEU=
github.com/MarvinJWendt/testza v0.1.0/go.mod h1:7AxNvlfeHP7Z/hDQ5JtE3OKYT3XFUeLCDE2DQninSqs=
github.com/MarvinJWendt/testza v0.2.1/go.mod h1:God7bhG8n6uQxwdScay+gjm9/LnO4D3kkcZX4hv9Rp8=
github.com/MarvinJWendt/testza v0.2.8/go.mod h1:nwIcjmr0Zz+Rcwfh3/4UhBp7ePKVhuBExvZqnKYWlII=
github.com/MarvinJWendt/testza v0.2.10/go.mod h1:pd+VWsoGUiFtq+hRKSU1Bktnn+DMCSrDrXDpX2bG66k=
github.com/MarvinJWendt/testza v0.2.12/go.mod h1:JOIegYyV7rX+7VZ9r77L/eH6CfJHHzXjB69adAhzZkI=
github.com/MarvinJWendt/testza v0.3.0/go.mod h1:eFcL4I0idjtIx8P9C6KkAuLgATNKpX4/2oUqKc6bF2c=
github.com/MarvinJWendt/testza v0.4.2/go.mod h1:mSdhXiKH8sg/gQehJ63bINcCKp7RtYewEjXsvsVUPbE=
github.com/RediSearch/redisearch-go v1.1.1 h1:YElqguUO9lSqCYszrQcoTUoB9zBRyb2gkO4+yh3STMo=
github.com/RediSearch/redisearch-go v1.1.1/go.mod h1:vcSdla+ZmI3B9doZbLoUrwNJfuvJzRt+/FoE38JcMS8=
github.com/atomicgo/cursor v0.0.1/go.mod h1:cBON2QmmrysudxNBFthvMtN32r3jxVRIvzkUiF/RuIk=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/containerd/console v1.0.5 h1:R0ymNeydRqH2DmakFNdmjR2k0t7UPuiOV/N/27/qqsc=
github.com/containerd/console v1.0.5/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg=
github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA=
github.com/gomodule/redigo v1.8.3 h1:HR0kYDX2RJZvAup8CsiJwxB4dTCSC0AaUq6S4SiLwUc=
github.com/gomodule/redigo v1.8.3/go.mod h1:P9dn9mFrCBvWhGE1wpxx6fgq7BAeLBk+UUUzlpkBYO0=
github.com/gookit/color v1.4.2/go.mod h1:fqRyamkC1W8uxl+lxCQxOT09l/vYfZ+QeiX3rKQHCoQ=
github.com/gookit/color v1.5.0/go.mod h1:43aQb+Zerm/BWh2GnrgOQm7ffz7tvQXEKV6BFMl7wAo=
github.com/gookit/color v1.5.4 h1:FZmqs7XOyGgCAxmWyPslpiok1k05wmY3SJTytgvYFs0=
github.com/gookit/color v1.5.4/go.mod h1:pZJOeOS8DM43rXbp4AZo1n9zCU2qjpcRko0b6/QJi9w=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.0.10/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/lithammer/fuzzysearch v1.1.8 h1:/HIuJnjHuXS8bKaiTMeeDlW2/AyIWk2brx1V8LFgLN4=
github.com/lithammer/fuzzysearch v1.1.8/go.mod h1:IdqeyBClc3FFqSzYq/MXESsS4S0FsZ5ajtkr5xPLts4=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pterm/pterm v0.12.27/go.mod h1:PhQ89w4i95rhgE+xedAoqous6K9X+r6aSOI2eFF7DZI=
github.com/pterm/pterm v0.12.29/go.mod h1:WI3qxgvoQFFGKGjGnJR849gU0TsEOvKn5Q8LlY1U7lg=
github.com/pterm/pterm v0.12.30/go.mod h1:MOqLIyMOgmTDz9yorcYbcw+HsgoZo3BQfg2wtl3HEFE=
github.com/pterm/pterm v0.12.31/go.mod h1:32ZAWZVXD7ZfG0s8qqHXePte42kdz8ECtRyEejaWgXU=
github.com/pterm/pterm v0.12.33/go.mod h1:x+h2uL+n7CP/rel9+bImHD5lF3nM9vJj80k9ybiiTTE=
github.com/pterm/pterm v0.12.36/go.mod h1:NjiL09hFhT/vWjQHSj1athJpx6H8cjpHXNAK5bUw8T8=
github.com/pterm/pterm v0.12.40/go.mod h1:ffwPLwlbXxP+rxT0GsgDTzS3y3rmpAO1NMjUkGTYf8s=
github.com/pterm/pterm v0.12.82 h1:+D9wYhCaeaK0FIQoZtqbNQuNpe2lB2tajKKsTd5paVQ=
github.com/pterm/pterm v0.12.82/go.mod h1:TyuyrPjnxfwP+ccJdBTeWHtd/e0ybQHkOS/TakajZCw=
github.com/redis/go-redis/v9 v9.14.1 h1:nDCrEiJmfOWhD76xlaw+HXT0c9hfNWeXgl0vIRYSDvQ=
github.com/redis/go-redis/v9 v9.14.1/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778/go.mod h1:2MuV+tbUrU1zIOPMxZ5EncGwgmMJsa+9ucAQZXxsObs=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210420072515-93ed5bcd2bfe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210421221651-33663a62ff08/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211013075003-97ac67df715c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220319134239-a9b59b0215f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210615171337-6886f2dfbf5b/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg=
golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M=
golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=


================================================
FILE: tools/vector-benches/main.go
================================================
package main

import (
	"context"
	"flag"
	"fmt"
	"math/rand/v2"
	"os"
	"reflect"
	"slices"
	"sort"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/pterm/pterm"
	"github.com/redis/go-redis/v9"
)

var nEntries = flag.Int("n", 50000, "Number of vectors")
var nQueries = flag.Int("q", 1000, "Number of total queries")
var nQueryJobs = flag.Int("t", 8, "Query threads (jobs)")
var nDim = flag.Int("d", 100, "Vector dimension")
var nTop = flag.Int("k", 10, "Top K vectors selected")

var fPort = flag.Int("p", 6379, "Port")
var fHost = flag.String("h", "localhost", "Host")

func formatLargeNumber(n int) string {
	if n < 1000000 {
		return fmt.Sprintf("%.1fK", float64(n)/1000.0)
	} else {
		return fmt.Sprintf("%.1fM", float64(n)/1000000.0)
	}
}

// Convert float slice to byte slice without copies
func VecToSlice(vec []float32) []byte {
	rawPtr := unsafe.Pointer(&vec[0])
	byteLen := len(vec) * int(unsafe.Sizeof(vec[0]))
	var out []byte
	sheader := (*reflect.SliceHeader)(unsafe.Pointer(&out))
	sheader.Data = uintptr(rawPtr)
	sheader.Len = byteLen
	sheader.Cap = byteLen
	return out
}

// Generate random vector
func RandVec(dim uint) []float32 {
	res := make([]float32, dim)
	for i := range res {
		res[i] = rand.Float32()
	}
	return res
}

// Create index
func CreateIndex(ctx context.Context, rdb *redis.Client, dim uint) error {
	hnswOptions := &redis.FTHNSWOptions{Type: "FLOAT32", Dim: int(dim), DistanceMetric: "L2"}
	schema := redis.FieldSchema{FieldName: "v", FieldType: redis.SearchFieldTypeVector, VectorArgs: &redis.FTVectorArgs{HNSWOptions: hnswOptions}}
	_, err := rdb.FTCreate(ctx, "idx", &redis.FTCreateOptions{}, &schema).Result()
	return err
}

func WaitForIndex(ctx context.Context, rdb *redis.Client) {
	info, _ := rdb.Info(ctx).Result()
	if strings.Contains(info, "dragonfly") {
		return
	}

	for {
		idxInfo, err := rdb.FTInfo(ctx, "idx").Result()
		if err != nil {
			panic(err)
		}
		if idxInfo.PercentIndexed >= 1.0 {
			return
		}
		time.Sleep(time.Millisecond * 100)
	}
}

// Fill with random vectors
func Fill(ctx context.Context, rdb *redis.Client, prefix string, entries uint, dim uint) {
	const kBatchSize = uint(100)
	for i := uint(0); i < entries/kBatchSize; i += 1 {
		p := rdb.Pipeline()
		for j := 0; j < int(kBatchSize); j += 1 {
			key := fmt.Sprint(prefix, i, ":", j)
			vec := RandVec(dim)
			p.HSet(ctx, key, "v", VecToSlice(vec))
		}
		_, err := p.Exec(ctx)
		if err != nil {
			panic(err)
		}
	}
}

// Distribute Fill() over workers
func FillParallel(ctx context.Context, rdb *redis.Client, entries uint, dim uint) {
	const kJobs = uint(8)
	wg := sync.WaitGroup{}
	wg.Add(int(kJobs))

	for i := uint(0); i < kJobs; i += 1 {
		li := i
		go func() {
			Fill(ctx, rdb, fmt.Sprint("k", li, ":"), entries/kJobs, dim)
			wg.Done()
		}()
	}
	wg.Wait()
}

// Perform queries and measure latencies
func Query(ctx context.Context, rdb *redis.Client, queries uint, limit uint, dim uint) []time.Duration {
	latencies := make([]time.Duration, queries)
	query := fmt.Sprintf("*=>[KNN %v @v $vec]", limit)

	for i := range latencies {
		searchOptions := &redis.FTSearchOptions{
			DialectVersion: 2,
			NoContent:      true,
			Params:         map[string]interface{}{"vec": VecToSlice(RandVec(dim))},
		}

		start := time.Now()
		res, err := rdb.FTSearchWithArgs(ctx, "idx", query, searchOptions).Result()
		if err != nil {
			panic(err)
		}
		if res.Total != int(limit) {
			panic("Didn't hit limit")
		}
		latencies[i] = time.Since(start)
	}
	return latencies
}

// Call Query() from multiple workers and combine latencies sorted
func RunQueries(ctx context.Context, rdb *redis.Client) (time.Duration, []time.Duration) {
	jobs := uint(*nQueryJobs)
	jobQueries := uint(*nQueries) / jobs
	latencies := make([][]time.Duration, jobs)

	start := time.Now()
	wg := sync.WaitGroup{}

	wg.Add(len(latencies))
	for i := range latencies {
		li := i
		go func() {
			latencies[li] = Query(ctx, rdb, jobQueries, uint(*nTop), uint(*nDim))
			wg.Done()
		}()
	}

	wg.Wait()
	took := time.Since(start)

	// Unify all latencies and sort them
	allLatencies := make([]time.Duration, 0, 1000)
	for _, sub := range latencies {
		allLatencies = append(allLatencies, sub...)
	}
	sort.Slice(allLatencies, func(i, j int) bool {
		return allLatencies[i] < allLatencies[j]
	})
	return took, allLatencies
}

func Print(took time.Duration, latencies []time.Duration) {
	qps := float64(*nQueries) / took.Seconds()
	style := pterm.NewStyle(pterm.Bold)

	pterm.Print("Entries(n): ", formatLargeNumber(*nEntries), " Queries(q): ", formatLargeNumber(*nQueries), " ")
	pterm.Print("Dimension(d): ", *nDim, " Threads(t): ", *nQueryJobs, " Top(k): ", *nTop)
	pterm.Println()
	pterm.Println()

	pterm.Println("Took:", took)
	pterm.Print("QPS: ")
	style.Println(int(qps))

	p50 := float64(latencies[len(latencies)/2]) / float64(time.Millisecond)
	pterm.Printf("P50: %.2f ms", p50)

	pterm.Print(" P95: ")
	p95 := float64(latencies[int(float32(len(latencies))*0.95)]) / float64(time.Millisecond)
	style.Printf("%.2f ms", p95)

	p99 := float64(latencies[int(float32(len(latencies))*0.99)]) / float64(time.Millisecond)
	pterm.Printf(" P99: %.2f ms \n", p99)
}

func CheckData(ctx context.Context, rdb *redis.Client) {
	size, _ := rdb.DBSize(ctx).Result()
	if size*2 < int64(*nEntries) {
		rdb.FlushAll(ctx).Result()
		pterm.Println("Filling database")
		FillParallel(ctx, rdb, uint(*nEntries), uint(*nDim))
	}
}

func CheckIndex(ctx context.Context, rdb *redis.Client) {
	indices, _ := rdb.FT_List(ctx).Result()
	if slices.Contains(indices, "idx") {
		pterm.Println("Index exists")
	} else {
		pterm.Println("Creating index with", formatLargeNumber(*nEntries), "entries")
		start := time.Now()
		err := CreateIndex(ctx, rdb, uint(*nDim))
		if err != nil {
			panic(err)
		}
		WaitForIndex(ctx, rdb)
		pterm.Println("Created index in", time.Since(start))
	}
}

func main() {
	flag.Usage = func() {
		fmt.Fprintln(os.Stderr, "Randomized vector search benchmark")
		flag.PrintDefaults()
	}
	flag.Parse()

	ctx := context.Background()
	rdb := redis.NewClient(&redis.Options{
		Protocol:    2,
		ReadTimeout: -1, // due to possibly long index construction
		Addr:        fmt.Sprint(*fHost, ":", *fPort),
	})

	CheckData(ctx, rdb)
	CheckIndex(ctx, rdb)

	pterm.Println("Running queries")
	took, latencies := RunQueries(ctx, rdb)
	Print(took, latencies)
}